Skip to content

Commit d3a404c

Browse files
pdfminer bug (#2244)
Closes #2212. ### Summary This PR implements logic to fall back to the "inferred_layout + OCR" if pdfminer fails in the `hi_res` pipeline (discussed in [this slack channel](https://unstructuredw-kbe4326.slack.com/archives/C057R3F8F7A/p1701807299018929)). ### Testing PDF: [NASA-SNA-8-D-027III-Rev2-CsmLmSpacecraftOperationalDataBook-Volume3-MassProperties-pg856.pdf](https://github.com/Unstructured-IO/unstructured/files/13554149/NASA-SNA-8-D-027III-Rev2-CsmLmSpacecraftOperationalDataBook-Volume3-MassProperties-pg856.pdf) ``` elements = partition_pdf( filename="NASA-SNA-8-D-027III-Rev2-CsmLmSpacecraftOperationalDataBook-Volume3-MassProperties-pg856.pdf", strategy="hi_res", ) ``` --------- Co-authored-by: christinestraub <[email protected]>
1 parent 21bc67f commit d3a404c

File tree

7 files changed

+32
-34
lines changed

7 files changed

+32
-34
lines changed

CHANGELOG.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.11.4-dev8
1+
## 0.11.4-dev9
22

33
### Enhancements
44

@@ -16,6 +16,8 @@
1616

1717
### Fixes
1818

19+
* **Fix pdf `hi_res` partitioning failure when pdfminer fails.** Implemented logic to fall back to the "inferred_layout + OCR" if pdfminer fails in the `hi_res` strategy.
20+
1921
## 0.11.3
2022

2123
### Enhancements

test_unstructured/partition/pdf_image/test_pdf.py

-1
Original file line numberDiff line numberDiff line change
@@ -1055,7 +1055,6 @@ def test_partition_pdf_with_bad_color_profile():
10551055
[
10561056
("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
10571057
("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
1058-
("failure-after-repair.pdf", "PDFMiner failed to process PDF page 26 after repairing it."),
10591058
],
10601059
)
10611060
def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.11.4-dev8" # pragma: no cover
1+
__version__ = "0.11.4-dev9" # pragma: no cover

unstructured/partition/pdf_image/ocr.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def supplement_page_layout_with_ocr(
198198
)
199199
elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value:
200200
for element in page_layout.elements:
201-
if element.text == "":
201+
if not element.text:
202202
padding = env_config.IMAGE_CROP_PAD
203203
padded_element = pad_element_bboxes(element, padding=padding)
204204
cropped_image = image.crop(

unstructured/partition/pdf_image/pdf.py

+24-14
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ def _partition_pdf_or_image_local(
238238
ocr_mode: str = OCRMode.FULL_PAGE.value,
239239
model_name: Optional[str] = None,
240240
metadata_last_modified: Optional[str] = None,
241+
pdf_text_extractable: bool = False,
241242
extract_images_in_pdf: bool = False,
242243
extract_element_types: Optional[List[str]] = None,
243244
image_output_dir_path: Optional[str] = None,
@@ -281,12 +282,14 @@ def _partition_pdf_or_image_local(
281282
pdf_image_dpi=pdf_image_dpi,
282283
)
283284

284-
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
285-
merged_document_layout = process_file_with_pdfminer(
286-
inferred_document_layout,
287-
filename,
288-
is_image,
289-
)
285+
if pdf_text_extractable is True:
286+
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
287+
merged_document_layout = process_file_with_pdfminer(
288+
inferred_document_layout,
289+
filename,
290+
)
291+
else:
292+
merged_document_layout = inferred_document_layout
290293

291294
if model_name.startswith("chipper"):
292295
# NOTE(alan): We shouldn't do OCR with chipper
@@ -310,13 +313,14 @@ def _partition_pdf_or_image_local(
310313
)
311314
if hasattr(file, "seek"):
312315
file.seek(0)
313-
314-
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
315-
merged_document_layout = process_data_with_pdfminer(
316-
inferred_document_layout,
317-
file,
318-
is_image,
319-
)
316+
if pdf_text_extractable is True:
317+
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
318+
merged_document_layout = process_data_with_pdfminer(
319+
inferred_document_layout,
320+
file,
321+
)
322+
else:
323+
merged_document_layout = inferred_document_layout
320324

321325
if model_name.startswith("chipper"):
322326
# NOTE(alan): We shouldn't do OCR with chipper
@@ -339,6 +343,11 @@ def _partition_pdf_or_image_local(
339343
kwargs["sort_mode"] = SORT_MODE_DONT
340344

341345
final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
346+
347+
for page in final_document_layout.pages:
348+
for el in page.elements:
349+
el.text = el.text or ""
350+
342351
elements = document_to_element_list(
343352
final_document_layout,
344353
sortable=True,
@@ -452,7 +461,7 @@ def partition_pdf_or_image(
452461
isinstance(el, Text) and el.text.strip() for el in extracted_elements
453462
)
454463
except Exception as e:
455-
logger.error(e, exc_info=True)
464+
logger.error(e)
456465
logger.warning("PDF text extraction failed, skip text extraction...")
457466

458467
strategy = determine_pdf_or_image_strategy(
@@ -476,6 +485,7 @@ def partition_pdf_or_image(
476485
include_page_breaks=include_page_breaks,
477486
languages=languages,
478487
metadata_last_modified=metadata_last_modified or last_modification_date,
488+
pdf_text_extractable=pdf_text_extractable,
479489
extract_images_in_pdf=extract_images_in_pdf,
480490
extract_element_types=extract_element_types,
481491
image_output_dir_path=image_output_dir_path,

unstructured/partition/pdf_image/pdfminer_processing.py

+1-8
Original file line numberDiff line numberDiff line change
@@ -27,28 +27,21 @@
2727
def process_file_with_pdfminer(
2828
inferred_document_layout: "DocumentLayout",
2929
filename: str = "",
30-
is_image: bool = False,
3130
) -> "DocumentLayout":
3231
with open_filename(filename, "rb") as fp:
3332
fp = cast(BinaryIO, fp)
3433
inferred_document_layout = process_data_with_pdfminer(
3534
inferred_document_layout=inferred_document_layout,
3635
file=fp,
37-
is_image=is_image,
3836
)
3937
return inferred_document_layout
4038

4139

4240
def process_data_with_pdfminer(
4341
inferred_document_layout: "DocumentLayout",
4442
file: Optional[Union[bytes, BinaryIO]] = None,
45-
is_image: bool = False,
4643
) -> "DocumentLayout":
47-
if is_image:
48-
for page in inferred_document_layout.pages:
49-
for el in page.elements:
50-
el.text = el.text or ""
51-
return inferred_document_layout
44+
"""Process document data using PDFMiner to extract layout information."""
5245

5346
extracted_layouts = get_regions_by_pdfminer(file)
5447

unstructured/partition/pdf_image/pdfminer_utils.py

+2-8
Original file line numberDiff line numberDiff line change
@@ -104,14 +104,8 @@ def open_pdfminer_pages_generator(
104104
with pikepdf.Pdf.open(error_page_data) as pdf:
105105
pdf.save(tmp.name)
106106
page = next(PDFPage.get_pages(open(tmp.name, "rb"))) # noqa: SIM115
107-
try:
108-
interpreter.process_page(page)
109-
page_layout = device.get_result()
110-
except Exception:
111-
logger.warning(
112-
f"PDFMiner failed to process PDF page {i+1} after repairing it."
113-
)
114-
break
107+
interpreter.process_page(page)
108+
page_layout = device.get_result()
115109
i += 1
116110
yield page, page_layout
117111
except PSSyntaxError:

0 commit comments

Comments
 (0)