@@ -238,6 +238,7 @@ def _partition_pdf_or_image_local(
238
238
ocr_mode : str = OCRMode .FULL_PAGE .value ,
239
239
model_name : Optional [str ] = None ,
240
240
metadata_last_modified : Optional [str ] = None ,
241
+ pdf_text_extractable : bool = False ,
241
242
extract_images_in_pdf : bool = False ,
242
243
extract_element_types : Optional [List [str ]] = None ,
243
244
image_output_dir_path : Optional [str ] = None ,
@@ -281,12 +282,14 @@ def _partition_pdf_or_image_local(
281
282
pdf_image_dpi = pdf_image_dpi ,
282
283
)
283
284
284
- # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
285
- merged_document_layout = process_file_with_pdfminer (
286
- inferred_document_layout ,
287
- filename ,
288
- is_image ,
289
- )
285
+ if pdf_text_extractable is True :
286
+ # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
287
+ merged_document_layout = process_file_with_pdfminer (
288
+ inferred_document_layout ,
289
+ filename ,
290
+ )
291
+ else :
292
+ merged_document_layout = inferred_document_layout
290
293
291
294
if model_name .startswith ("chipper" ):
292
295
# NOTE(alan): We shouldn't do OCR with chipper
@@ -310,13 +313,14 @@ def _partition_pdf_or_image_local(
310
313
)
311
314
if hasattr (file , "seek" ):
312
315
file .seek (0 )
313
-
314
- # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
315
- merged_document_layout = process_data_with_pdfminer (
316
- inferred_document_layout ,
317
- file ,
318
- is_image ,
319
- )
316
+ if pdf_text_extractable is True :
317
+ # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
318
+ merged_document_layout = process_data_with_pdfminer (
319
+ inferred_document_layout ,
320
+ file ,
321
+ )
322
+ else :
323
+ merged_document_layout = inferred_document_layout
320
324
321
325
if model_name .startswith ("chipper" ):
322
326
# NOTE(alan): We shouldn't do OCR with chipper
@@ -339,6 +343,11 @@ def _partition_pdf_or_image_local(
339
343
kwargs ["sort_mode" ] = SORT_MODE_DONT
340
344
341
345
final_document_layout = clean_pdfminer_inner_elements (final_document_layout )
346
+
347
+ for page in final_document_layout .pages :
348
+ for el in page .elements :
349
+ el .text = el .text or ""
350
+
342
351
elements = document_to_element_list (
343
352
final_document_layout ,
344
353
sortable = True ,
@@ -452,7 +461,7 @@ def partition_pdf_or_image(
452
461
isinstance (el , Text ) and el .text .strip () for el in extracted_elements
453
462
)
454
463
except Exception as e :
455
- logger .error (e , exc_info = True )
464
+ logger .error (e )
456
465
logger .warning ("PDF text extraction failed, skip text extraction..." )
457
466
458
467
strategy = determine_pdf_or_image_strategy (
@@ -476,6 +485,7 @@ def partition_pdf_or_image(
476
485
include_page_breaks = include_page_breaks ,
477
486
languages = languages ,
478
487
metadata_last_modified = metadata_last_modified or last_modification_date ,
488
+ pdf_text_extractable = pdf_text_extractable ,
479
489
extract_images_in_pdf = extract_images_in_pdf ,
480
490
extract_element_types = extract_element_types ,
481
491
image_output_dir_path = image_output_dir_path ,
0 commit comments