Skip to content

Fix: Error with 'fake' file and assigned name in partition_pdf function #1645

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion test_unstructured/partition/pdf-image/test_pdf.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import io
import os
from tempfile import SpooledTemporaryFile
from unittest import mock
Expand Down Expand Up @@ -112,7 +113,7 @@ def test_partition_pdf_local_raises_with_no_filename():
pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)


@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool", "named_io"])
@pytest.mark.parametrize(
("strategy", "expected"),
# fast: can't capture the "intentionally left blank page" page
Expand All @@ -139,6 +140,12 @@ def _test(result):
with open(filename, "rb") as f:
result = pdf.partition_pdf(file=f, strategy=strategy)
_test(result)
elif file_mode == "named_io":
with open(filename, "rb") as test_file:
file = io.BytesIO(test_file.read())
file.name = "super.pdf"
result = pdf.partition_pdf(file=file, strategy=strategy)
_test(result)
else:
with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile()
Expand Down
3 changes: 3 additions & 0 deletions unstructured/partition/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import subprocess
from datetime import datetime
from io import BufferedReader, BytesIO, TextIOWrapper
from pathlib import Path
from tempfile import SpooledTemporaryFile
from typing import (
IO,
Expand Down Expand Up @@ -81,6 +82,8 @@


def get_last_modified_date(filename: str) -> Union[str, None]:
    """Return the last-modified time of ``filename`` as a formatted string.

    The timestamp comes from ``os.path.getmtime`` and is rendered with the
    format ``"%Y-%m-%dT%H:%M:%S%z"`` from a naive local ``datetime``.

    Returns ``None`` when the path cannot be stat'ed, for example when
    ``filename`` is only a label attached to an in-memory stream (such as
    ``BytesIO().name = "super.pdf"``) rather than a real path on disk, or
    when it is an empty string.
    """
    # Stat once and handle the failure instead of checking existence first:
    # - ``Path("")`` normalises to ``"."`` and therefore "exists", so an
    #   existence pre-check lets an empty filename through and the stat call
    #   then raises.
    # - A separate existence check is racy if the file is removed before the
    #   stat call.
    try:
        modified_timestamp = os.path.getmtime(filename)
    except (OSError, ValueError):
        # OSError: missing or inaccessible path.
        # ValueError: path that cannot be represented (e.g. embedded NUL).
        return None
    modify_date = datetime.fromtimestamp(modified_timestamp)
    return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z")

Expand Down