diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py
index 3634baa405..880dd5e172 100644
--- a/test_unstructured/partition/pdf-image/test_pdf.py
+++ b/test_unstructured/partition/pdf-image/test_pdf.py
@@ -1,3 +1,4 @@
+import io
 import os
 from tempfile import SpooledTemporaryFile
 from unittest import mock
@@ -112,7 +113,7 @@ def test_partition_pdf_local_raises_with_no_filename():
         pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)
 
 
-@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
+@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool", "named_io"])
 @pytest.mark.parametrize(
     ("strategy", "expected"),
     # fast: can't capture the "intentionally left blank page" page
@@ -139,6 +140,12 @@ def _test(result):
         with open(filename, "rb") as f:
             result = pdf.partition_pdf(file=f, strategy=strategy)
             _test(result)
+    elif file_mode == "named_io":
+        with open(filename, "rb") as test_file:
+            file = io.BytesIO(test_file.read())
+            file.name = "super.pdf"
+            result = pdf.partition_pdf(file=file, strategy=strategy)
+            _test(result)
     else:
         with open(filename, "rb") as test_file:
             spooled_temp_file = SpooledTemporaryFile()
diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py
--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@@ -81,6 +81,15 @@
 
 
 def get_last_modified_date(filename: str) -> Union[str, None]:
-    modify_date = datetime.fromtimestamp(os.path.getmtime(filename))
+    """Return the modification time of `filename` formatted as `%Y-%m-%dT%H:%M:%S%z`.
+
+    Returns None when the path cannot be stat-ed, e.g. when `filename` is only
+    the `name` attribute of an in-memory file object and no such file exists.
+    """
+    try:
+        mtime = os.path.getmtime(filename)
+    except OSError:
+        # Covers missing files and the empty string; an exists() pre-check
+        # would accept "" (it resolves to the current directory) and is racy.
+        return None
+    modify_date = datetime.fromtimestamp(mtime)
     return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z")
 