Unstructured-IO · mpolomdeepsense · Jun 18, 2025 · Jun 18, 2025 · Jun 18, 2025 · Jun 18, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,12 @@
+## 0.37.0
+
+### Enhancements
+
+### Features
+
+### Fixes
+* Raises an appropriate error message when the given PDF file is invalid (corrupted or encrypted).
+
 ## 0.30.0
 
 ### Enhancements

diff --git a/_sample_docs/failing-encrypted.pdf b/_sample_docs/failing-encrypted.pdf
diff --git a/_sample_docs/failing-invalid.pdf b/_sample_docs/failing-invalid.pdf
diff --git a/_sample_docs/failing-missing-pages.pdf b/_sample_docs/failing-missing-pages.pdf
diff --git a/_sample_docs/failing-missing-root.pdf b/_sample_docs/failing-missing-root.pdf
diff --git a/_test_unstructured_client/integration/test_integration.py b/_test_unstructured_client/integration/test_integration.py
@@ -348,3 +348,31 @@ def test_partition_strategy_vlm_anthropic(split_pdf, vlm_model, vlm_model_provid
     assert response.status_code == 200
     assert len(response.elements) > 0
     assert response.elements[0]["metadata"]["partitioner_type"] == "vlm_partition"
+
+
+def test_returns_422_for_invalid_pdf(
+    caplog: pytest.LogCaptureFixture,
+    doc_path: Path,
+    client: UnstructuredClient,
+):
+    """Test that partitioning an invalid PDF raises HTTPValidationError.
+
+    The request enables ``split_pdf_page``. The captured log output must
+    contain both the "File does not appear to be a valid PDF" message and
+    the string "422".
+    """
+    pdf_name = "failing-invalid.pdf"
+    with open(doc_path / pdf_name, "rb") as f:
+        files = shared.Files(
+            content=f.read(),
+            file_name=pdf_name,
+        )
+
+    req = operations.PartitionRequest(
+        partition_parameters=shared.PartitionParameters(
+            files=files,
+            strategy="fast",
+            split_pdf_page=True,
+        )
+    )
+
+    with pytest.raises(HTTPValidationError):
+        client.general.partition(request=req)
+
+    assert "File does not appear to be a valid PDF" in caplog.text
+    assert "422" in caplog.text
diff --git a/_test_unstructured_client/unit/test_pdf_utils.py b/_test_unstructured_client/unit/test_pdf_utils.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+import io
+
+import pytest
+from pypdf import PdfReader
+
+from unstructured_client._hooks.custom.pdf_utils import check_pdf, PDFValidationError
+from _test_unstructured_client.unit_utils import sample_docs_path
+
+
+def _open_pdf(pdf_path: str) -> PdfReader:
+    """Read the file at ``pdf_path`` into memory and wrap the bytes in a ``PdfReader``."""
+    with open(pdf_path, "rb") as f:
+        pdf_content = f.read()
+    # The reader is built from an in-memory buffer, after the file handle is closed.
+    return PdfReader(io.BytesIO(pdf_content))
+
+
+def test_check_pdf_with_valid_pdf():
+    """Test that check_pdf returns a PdfReader for the ``list-item-example-1.pdf`` sample."""
+    pdf_path = sample_docs_path("list-item-example-1.pdf")
+    pdf = _open_pdf(pdf_path)
+
+    result = check_pdf(pdf)
+    assert isinstance(result, PdfReader)
+
+
+# Each case pairs a broken sample document with the exact message expected on
+# the resulting PDFValidationError.
+@pytest.mark.parametrize(
+    ("pdf_name", "expected_error_message"),
+    [
+        (
+            "failing-encrypted.pdf",
+            "File is encrypted. Please decrypt it with password.",
+        ),
+        (
+            "failing-missing-root.pdf",
+            "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf",
+        ),
+        (
+            "failing-missing-pages.pdf",
+            "File does not appear to be a valid PDF. Error: Invalid object in /Pages",
+        ),
+    ],
+)
+def test_check_pdf_raises_pdf_validation_error(
+    pdf_name: str, expected_error_message: str
+):
+    """Test that we get a PDFValidationError with the correct error message for invalid PDF files."""
+    pdf_path = sample_docs_path(pdf_name)
+    pdf = _open_pdf(pdf_path)
+
+    with pytest.raises(PDFValidationError) as exc_info:
+        check_pdf(pdf)
+
+    # Exact comparison against the ``message`` attribute, not a substring match.
+    assert exc_info.value.message == expected_error_message
diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py
@@ -8,10 +8,12 @@
 from unittest.mock import MagicMock, patch
 
 import httpx
+from httpx import RequestError
 import pytest
 import requests
 from requests_toolbelt import MultipartDecoder
 
+from _test_unstructured_client.unit_utils import sample_docs_path
 from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
 from unstructured_client._hooks.custom.form_utils import (
     FormData,
@@ -29,6 +31,7 @@
     SplitPdfHook,
     get_optimal_split_size, run_tasks,
 )
+from unstructured_client._hooks.types import BeforeRequestContext
 from unstructured_client.models import shared
 
 
@@ -462,3 +465,64 @@ def test_unit_get_split_pdf_cache_tmp_data_dir_uses_dir_from_form_data(mock_path
     mock_path.assert_called_once_with(mock_dir)
     mock_path_instance.exists.assert_called_once()
     assert result == str(Path(mock_dir).resolve())
+
+
+def test_before_request_raises_request_error_when_pdf_check_fails():
+    """Test that before_request raises RequestError when pdf_utils.check_pdf throws PDFValidationError.
+
+    Multipart parsing, PDF reading, PDF checking and base-URL lookup are all
+    patched, so the test works purely on mocks. It verifies that the
+    validation message and the originating request are carried over onto the
+    raised ``RequestError``.
+    """
+    hook = SplitPdfHook()
+
+    # Initialize the hook with a mock client
+    mock_client = MagicMock()
+    hook.sdk_init(base_url="http://localhost:8888", client=mock_client)
+
+    # Create a mock request context
+    mock_hook_ctx = MagicMock()
+    mock_hook_ctx.operation_id = "partition"
+
+    # Create a mock request with proper headers and content
+    mock_request = MagicMock()
+    mock_request.headers = {"Content-Type": "multipart/form-data"}
+    mock_request.url.host = "localhost"
+
+    # Mock the form data to include the necessary fields for PDF splitting
+    mock_pdf_file = MagicMock()
+    mock_pdf_file.read.return_value = b"mock_pdf_content"
+
+    mock_form_data = {
+        "split_pdf_page": "true",
+        "files": {
+            "filename": "test.pdf",
+            "content_type": "application/pdf",
+            "file": mock_pdf_file
+        }
+    }
+
+    # Mock the PDF reader object
+    mock_pdf_reader = MagicMock()
+
+    # Define the error message that will be raised
+    error_message = "File does not appear to be a valid PDF."
+
+    with patch("unstructured_client._hooks.custom.request_utils.get_multipart_stream_fields") as mock_get_fields, \
+         patch("unstructured_client._hooks.custom.pdf_utils.read_pdf") as mock_read_pdf, \
+         patch("unstructured_client._hooks.custom.pdf_utils.check_pdf") as mock_check_pdf, \
+         patch("unstructured_client._hooks.custom.request_utils.get_base_url") as mock_get_base_url:
+
+        # Set up the mocks
+        mock_get_fields.return_value = mock_form_data
+        mock_read_pdf.return_value = mock_pdf_reader
+        mock_check_pdf.side_effect = pdf_utils.PDFValidationError(error_message)
+        mock_get_base_url.return_value = "http://localhost:8888"
+
+        # Call the method under test and verify it raises RequestError
+        with pytest.raises(RequestError) as exc_info:
+            hook.before_request(mock_hook_ctx, mock_request)
+
+        # Verify the exception has the correct message and request object
+        assert str(exc_info.value) == error_message
+        assert exc_info.value.request == mock_request
+
+        # Verify that the mocked functions were called as expected
+        mock_get_fields.assert_called_once_with(mock_request)
+        mock_read_pdf.assert_called_once_with(mock_pdf_file)
+        mock_check_pdf.assert_called_once_with(mock_pdf_reader)
diff --git a/src/unstructured_client/_hooks/custom/pdf_utils.py b/src/unstructured_client/_hooks/custom/pdf_utils.py
@@ -5,7 +5,7 @@
 from typing import cast, Optional, BinaryIO, Union
 
 from pypdf import PdfReader
-from pypdf.errors import PdfReadError
+from pypdf.errors import FileNotDecryptedError, PdfReadError
 
 from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
 
@@ -16,6 +16,15 @@
 pdf_logger = logging.getLogger("pypdf")
 pdf_logger.setLevel(logging.ERROR)
 
+
+class PDFValidationError(Exception):
+    """Base exception for PDF validation errors.
+
+    The description passed to the constructor is available both through
+    ``str(exc)`` and through the ``message`` attribute.
+    """
+
+    def __init__(self, message: str):
+        # Keep the text on the instance so handlers can read ``exc.message`` directly.
+        self.message = message
+        super().__init__(self.message)
+
+
 def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
     """Reads the given PDF file.
 
@@ -33,3 +42,34 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
         return PdfReader(pdf_file, strict=False)
     except (PdfReadError, UnicodeDecodeError):
         return None
+
+
+def check_pdf(pdf: PdfReader) -> PdfReader:
+    """
+    Validate an already-opened PDF and return it unchanged.
+
+    Accesses the document's metadata, root object and page list so that
+    problems surface here. Per the inline notes below, this detects a PDF that:
+    - Is encrypted
+    - Has corrupted pages
+    - Has a corrupted root object
+
+    Args:
+        pdf: The reader to validate.
+
+    Returns:
+        The same ``pdf`` object that was passed in.
+
+    Raises:
+        PDFValidationError: If a ``FileNotDecryptedError`` is raised (reported
+            as an encrypted file) or any other ``PdfReadError`` is raised
+            (reported as an invalid PDF, with the original error text appended).
+    """
+    try:
+        # This will raise if the file is encrypted
+        pdf.metadata  # pylint: disable=pointless-statement
+
+        # This will raise if the file's root object is corrupted
+        pdf.root_object  # pylint: disable=pointless-statement
+
+        # This will raise if the file's pages are corrupted
+        list(pdf.pages)
+
+        return pdf
+    # NOTE(review): this handler is listed before PdfReadError; presumably
+    # FileNotDecryptedError is a subclass of it, so the order matters — confirm
+    # against pypdf.errors.
+    except FileNotDecryptedError as e:
+        raise PDFValidationError(
+            "File is encrypted. Please decrypt it with password.",
+        ) from e
+    except PdfReadError as e:
+        raise PDFValidationError(
+            f"File does not appear to be a valid PDF. Error: {e}",
+        ) from e
diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py
@@ -16,7 +16,7 @@
 import aiofiles
 import httpx
 import nest_asyncio  # type: ignore
-from httpx import AsyncClient
+from httpx import AsyncClient, RequestError
 from pypdf import PdfReader, PdfWriter
 
 from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
@@ -303,6 +303,14 @@ def before_request(
         if pdf is None:
             return request
 
+        try:
+            pdf = pdf_utils.check_pdf(pdf)
+        except pdf_utils.PDFValidationError as e:
+            raise RequestError(
+                message=e.message,
+                request=request,
+            ) from e
+
         starting_page_number = form_utils.get_starting_page_number(
             form_data,
             key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,