pdfminer bug (#2244)

Coniferish · christinestraub · web-flow · commit d3a404cfb541 · 2023-12-13T00:51:38.000Z
Closes #2212. ### Summary This PR implements logic to fall back to the "inferred_layout + OCR" if pdfminer fails in the `hi_res` pipeline (discussed in [this slack channel](https://unstructuredw-kbe4326.slack.com/archives/C057R3F8F7A/p1701807299018929)). ### Testing PDF: [NASA-SNA-8-D-027III-Rev2-CsmLmSpacecraftOperationalDataBook-Volume3-MassProperties-pg856.pdf](https://github.com/Unstructured-IO/unstructured/files/13554149/NASA-SNA-8-D-027III-Rev2-CsmLmSpacecraftOperationalDataBook-Volume3-MassProperties-pg856.pdf) ``` elements = partition_pdf( filename="NASA-SNA-8-D-027III-Rev2-CsmLmSpacecraftOperationalDataBook-Volume3-MassProperties-pg856.pdf", strategy="hi_res", ) ``` --------- Co-authored-by: christinestraub <christinemstraub@gmail.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.11.4-dev8
+## 0.11.4-dev9
 
 ### Enhancements
 
@@ -16,6 +16,8 @@
 
 ### Fixes
 
+* **Fix pdf `hi_res` partitioning failure when pdfminer fails.** Implemented logic to fall back to the "inferred_layout + OCR" if pdfminer fails in the `hi_res` strategy.
+
 ## 0.11.3
 
 ### Enhancements
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1055,7 +1055,6 @@ def test_partition_pdf_with_bad_color_profile():
     [
         ("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
         ("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
-        ("failure-after-repair.pdf", "PDFMiner failed to process PDF page 26 after repairing it."),
     ],
 )
 def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.11.4-dev8"  # pragma: no cover
+__version__ = "0.11.4-dev9"  # pragma: no cover
diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
@@ -198,7 +198,7 @@ def supplement_page_layout_with_ocr(
         )
     elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value:
         for element in page_layout.elements:
-            if element.text == "":
+            if not element.text:
                 padding = env_config.IMAGE_CROP_PAD
                 padded_element = pad_element_bboxes(element, padding=padding)
                 cropped_image = image.crop(
diff --git a/unstructured/partition/pdf_image/pdf.py b/unstructured/partition/pdf_image/pdf.py
@@ -238,6 +238,7 @@ def _partition_pdf_or_image_local(
     ocr_mode: str = OCRMode.FULL_PAGE.value,
     model_name: Optional[str] = None,
     metadata_last_modified: Optional[str] = None,
+    pdf_text_extractable: bool = False,
     extract_images_in_pdf: bool = False,
     extract_element_types: Optional[List[str]] = None,
     image_output_dir_path: Optional[str] = None,
@@ -281,12 +282,14 @@ def _partition_pdf_or_image_local(
             pdf_image_dpi=pdf_image_dpi,
         )
 
-        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-        merged_document_layout = process_file_with_pdfminer(
-            inferred_document_layout,
-            filename,
-            is_image,
-        )
+        if pdf_text_extractable is True:
+            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+            merged_document_layout = process_file_with_pdfminer(
+                inferred_document_layout,
+                filename,
+            )
+        else:
+            merged_document_layout = inferred_document_layout
 
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
@@ -310,13 +313,14 @@ def _partition_pdf_or_image_local(
         )
         if hasattr(file, "seek"):
             file.seek(0)
-
-        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-        merged_document_layout = process_data_with_pdfminer(
-            inferred_document_layout,
-            file,
-            is_image,
-        )
+        if pdf_text_extractable is True:
+            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+            merged_document_layout = process_data_with_pdfminer(
+                inferred_document_layout,
+                file,
+            )
+        else:
+            merged_document_layout = inferred_document_layout
 
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
@@ -339,6 +343,11 @@ def _partition_pdf_or_image_local(
         kwargs["sort_mode"] = SORT_MODE_DONT
 
     final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
+
+    for page in final_document_layout.pages:
+        for el in page.elements:
+            el.text = el.text or ""
+
     elements = document_to_element_list(
         final_document_layout,
         sortable=True,
@@ -452,7 +461,7 @@ def partition_pdf_or_image(
                 isinstance(el, Text) and el.text.strip() for el in extracted_elements
             )
         except Exception as e:
-            logger.error(e, exc_info=True)
+            logger.error(e)
             logger.warning("PDF text extraction failed, skip text extraction...")
 
     strategy = determine_pdf_or_image_strategy(
@@ -476,6 +485,7 @@ def partition_pdf_or_image(
                 include_page_breaks=include_page_breaks,
                 languages=languages,
                 metadata_last_modified=metadata_last_modified or last_modification_date,
+                pdf_text_extractable=pdf_text_extractable,
                 extract_images_in_pdf=extract_images_in_pdf,
                 extract_element_types=extract_element_types,
                 image_output_dir_path=image_output_dir_path,
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -27,28 +27,21 @@
 def process_file_with_pdfminer(
     inferred_document_layout: "DocumentLayout",
     filename: str = "",
-    is_image: bool = False,
 ) -> "DocumentLayout":
     with open_filename(filename, "rb") as fp:
         fp = cast(BinaryIO, fp)
         inferred_document_layout = process_data_with_pdfminer(
             inferred_document_layout=inferred_document_layout,
             file=fp,
-            is_image=is_image,
         )
         return inferred_document_layout
 
 
 def process_data_with_pdfminer(
     inferred_document_layout: "DocumentLayout",
     file: Optional[Union[bytes, BinaryIO]] = None,
-    is_image: bool = False,
 ) -> "DocumentLayout":
-    if is_image:
-        for page in inferred_document_layout.pages:
-            for el in page.elements:
-                el.text = el.text or ""
-        return inferred_document_layout
+    """Process document data using PDFMiner to extract layout information."""
 
     extracted_layouts = get_regions_by_pdfminer(file)
 
diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -104,14 +104,8 @@ def open_pdfminer_pages_generator(
                     with pikepdf.Pdf.open(error_page_data) as pdf:
                         pdf.save(tmp.name)
                     page = next(PDFPage.get_pages(open(tmp.name, "rb")))  # noqa: SIM115
-                    try:
-                        interpreter.process_page(page)
-                        page_layout = device.get_result()
-                    except Exception:
-                        logger.warning(
-                            f"PDFMiner failed to process PDF page {i+1} after repairing it."
-                        )
-                        break
+                    interpreter.process_page(page)
+                    page_layout = device.get_result()
             i += 1
             yield page, page_layout
     except PSSyntaxError:

Original file line number	Diff line number	Diff line change
`@@ -1055,7 +1055,6 @@ def test_partition_pdf_with_bad_color_profile():`
`1055`	`1055`	`[`
`1056`	`1056`	`("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),`
`1057`	`1057`	`("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),`
`1058`		`- ("failure-after-repair.pdf", "PDFMiner failed to process PDF page 26 after repairing it."),`
`1059`	`1058`	`],`
`1060`	`1059`	`)`
`1061`	`1060`	`def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.11.4-dev8" # pragma: no cover`
	`1`	`+__version__ = "0.11.4-dev9" # pragma: no cover`
Original file line number	Diff line number	Diff line change
`@@ -198,7 +198,7 @@ def supplement_page_layout_with_ocr(`
`198`	`198`	`)`
`199`	`199`	`elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value:`
`200`	`200`	`for element in page_layout.elements:`
`201`		`- if element.text == "":`
	`201`	`+ if not element.text:`
`202`	`202`	`padding = env_config.IMAGE_CROP_PAD`
`203`	`203`	`padded_element = pad_element_bboxes(element, padding=padding)`
`204`	`204`	`cropped_image = image.crop(`