diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2c5b782a..582c5404 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,12 @@
+## 0.37.0
+
+### Enhancements
+
+### Features
+
+### Fixes
+* Raise an appropriate error when the given PDF file is invalid (corrupted or encrypted).
+
 ## 0.30.0
 
 ### Enhancements
diff --git a/_sample_docs/failing-encrypted.pdf b/_sample_docs/failing-encrypted.pdf
new file mode 100644
index 00000000..f207fbaa
Binary files /dev/null and b/_sample_docs/failing-encrypted.pdf differ
diff --git a/_sample_docs/failing-invalid.pdf b/_sample_docs/failing-invalid.pdf
new file mode 100644
index 00000000..a8ca9eae
Binary files /dev/null and b/_sample_docs/failing-invalid.pdf differ
diff --git a/_sample_docs/failing-missing-pages.pdf b/_sample_docs/failing-missing-pages.pdf
new file mode 100644
index 00000000..b36599d1
Binary files /dev/null and b/_sample_docs/failing-missing-pages.pdf differ
diff --git a/_sample_docs/failing-missing-root.pdf b/_sample_docs/failing-missing-root.pdf
new file mode 100644
index 00000000..1cdf801c
Binary files /dev/null and b/_sample_docs/failing-missing-root.pdf differ
diff --git a/_test_unstructured_client/integration/test_integration.py b/_test_unstructured_client/integration/test_integration.py
index 0f6e8d7b..4a275262 100644
--- a/_test_unstructured_client/integration/test_integration.py
+++ b/_test_unstructured_client/integration/test_integration.py
@@ -348,3 +348,31 @@ def test_partition_strategy_vlm_anthropic(split_pdf, vlm_model, vlm_model_provid
     assert response.status_code == 200
     assert len(response.elements) > 0
     assert response.elements[0]["metadata"]["partitioner_type"] == "vlm_partition"
+
+
+def test_returns_422_for_invalid_pdf(
+    caplog: pytest.LogCaptureFixture,
+    doc_path: Path,
+    client: UnstructuredClient,
+):
+    """Test that an invalid PDF raises HTTPValidationError and that the 422 and its message are logged."""
+    pdf_name = "failing-invalid.pdf"
+    with open(doc_path / pdf_name, "rb") as f:
+        files = shared.Files(
+            content=f.read(),
+            file_name=pdf_name,
+        )
+
+    req = operations.PartitionRequest(
+        partition_parameters=shared.PartitionParameters(
+            files=files,
+            strategy="fast",
+            split_pdf_page=True,
+        )
+    )
+
+    with pytest.raises(HTTPValidationError):
+        client.general.partition(request=req)
+
+    assert "File does not appear to be a valid PDF" in caplog.text
+    assert "422" in caplog.text
diff --git a/_test_unstructured_client/unit/test_pdf_utils.py b/_test_unstructured_client/unit/test_pdf_utils.py
new file mode 100644
index 00000000..92b3e79f
--- /dev/null
+++ b/_test_unstructured_client/unit/test_pdf_utils.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+import io
+
+import pytest
+from pypdf import PdfReader
+
+from unstructured_client._hooks.custom.pdf_utils import check_pdf, PDFValidationError
+from _test_unstructured_client.unit_utils import sample_docs_path
+
+
+def _open_pdf(pdf_path: str) -> PdfReader:
+    with open(pdf_path, "rb") as f:
+        pdf_content = f.read()
+    return PdfReader(io.BytesIO(pdf_content))
+
+
+def test_check_pdf_with_valid_pdf():
+    pdf_path = sample_docs_path("list-item-example-1.pdf")
+    pdf = _open_pdf(pdf_path)
+
+    result = check_pdf(pdf)
+    assert isinstance(result, PdfReader)
+
+
+@pytest.mark.parametrize(
+    ("pdf_name", "expected_error_message"),
+    [
+        (
+            "failing-encrypted.pdf",
+            "File is encrypted. Please decrypt it with password.",
+        ),
+        (
+            "failing-missing-root.pdf",
+            "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf",
+        ),
+        (
+            "failing-missing-pages.pdf",
+            "File does not appear to be a valid PDF. Error: Invalid object in /Pages",
+        ),
+    ],
+)
+def test_check_pdf_raises_pdf_validation_error(
+    pdf_name: str, expected_error_message: str
+):
+    """Test that we get a PDFValidationError with the correct error message for invalid PDF files."""
+    pdf_path = sample_docs_path(pdf_name)
+    pdf = _open_pdf(pdf_path)
+
+    with pytest.raises(PDFValidationError) as exc_info:
+        check_pdf(pdf)
+
+    assert exc_info.value.message == expected_error_message
diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py
index 12792c69..75e2c17b 100644
--- a/_test_unstructured_client/unit/test_split_pdf_hook.py
+++ b/_test_unstructured_client/unit/test_split_pdf_hook.py
@@ -8,10 +8,11 @@
 from unittest.mock import MagicMock, patch
 
 import httpx
+from httpx import RequestError
 import pytest
 import requests
 from requests_toolbelt import MultipartDecoder
 
 from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
 from unstructured_client._hooks.custom.form_utils import (
     FormData,
@@ -462,3 +463,64 @@ def test_unit_get_split_pdf_cache_tmp_data_dir_uses_dir_from_form_data(mock_path
     mock_path.assert_called_once_with(mock_dir)
     mock_path_instance.exists.assert_called_once()
     assert result == str(Path(mock_dir).resolve())
+
+
+def test_before_request_raises_request_error_when_pdf_check_fails():
+    """Test that before_request raises RequestError when pdf_utils.check_pdf throws PDFValidationError."""
+    hook = SplitPdfHook()
+
+    # Initialize the hook with a mock client
+    mock_client = MagicMock()
+    hook.sdk_init(base_url="http://localhost:8888", client=mock_client)
+
+    # Create a mock request context
+    mock_hook_ctx = MagicMock()
+    mock_hook_ctx.operation_id = "partition"
+
+    # Create a mock request with proper headers and content
+    mock_request = MagicMock()
+    mock_request.headers = {"Content-Type": "multipart/form-data"}
+    mock_request.url.host = "localhost"
+
+    # Mock the form data to include the necessary fields for PDF splitting
+    mock_pdf_file = MagicMock()
+    mock_pdf_file.read.return_value = b"mock_pdf_content"
+
+    mock_form_data = {
+        "split_pdf_page": "true",
+        "files": {
+            "filename": "test.pdf",
+            "content_type": "application/pdf",
+            "file": mock_pdf_file
+        }
+    }
+
+    # Mock the PDF reader object
+    mock_pdf_reader = MagicMock()
+
+    # Define the error message that will be raised
+    error_message = "File does not appear to be a valid PDF."
+
+    with patch("unstructured_client._hooks.custom.request_utils.get_multipart_stream_fields") as mock_get_fields, \
+         patch("unstructured_client._hooks.custom.pdf_utils.read_pdf") as mock_read_pdf, \
+         patch("unstructured_client._hooks.custom.pdf_utils.check_pdf") as mock_check_pdf, \
+         patch("unstructured_client._hooks.custom.request_utils.get_base_url") as mock_get_base_url:
+
+        # Set up the mocks
+        mock_get_fields.return_value = mock_form_data
+        mock_read_pdf.return_value = mock_pdf_reader
+        mock_check_pdf.side_effect = pdf_utils.PDFValidationError(error_message)
+        mock_get_base_url.return_value = "http://localhost:8888"
+
+        # Call the method under test and verify it raises RequestError
+        with pytest.raises(RequestError) as exc_info:
+            hook.before_request(mock_hook_ctx, mock_request)
+
+        # Verify the exception has the correct message and request object
+        assert str(exc_info.value) == error_message
+        assert exc_info.value.request == mock_request
+
+        # Verify that the mocked functions were called as expected
+        mock_get_fields.assert_called_once_with(mock_request)
+        mock_read_pdf.assert_called_once_with(mock_pdf_file)
+        mock_check_pdf.assert_called_once_with(mock_pdf_reader)
diff --git a/src/unstructured_client/_hooks/custom/pdf_utils.py b/src/unstructured_client/_hooks/custom/pdf_utils.py
index 0a1f9a4f..eb0d0304 100644
--- a/src/unstructured_client/_hooks/custom/pdf_utils.py
+++ b/src/unstructured_client/_hooks/custom/pdf_utils.py
@@ -5,7 +5,7 @@
 from typing import cast, Optional, BinaryIO, Union
 
 from pypdf import PdfReader
-from pypdf.errors import PdfReadError
+from pypdf.errors import FileNotDecryptedError, PdfReadError
 
 from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
 
@@ -16,6 +16,15 @@
 pdf_logger = logging.getLogger("pypdf")
 pdf_logger.setLevel(logging.ERROR)
 
+
+class PDFValidationError(Exception):
+    """Base exception for PDF validation errors."""
+
+    def __init__(self, message: str):
+        self.message = message
+        super().__init__(self.message)
+
+
 def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
     """Reads the given PDF file.
 
@@ -33,3 +42,40 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
         return PdfReader(pdf_file, strict=False)
     except (PdfReadError, UnicodeDecodeError):
         return None
+
+
+def check_pdf(pdf: PdfReader) -> PdfReader:
+    """Validate that the given PDF is readable.
+
+    Touches the metadata, the root object and every page so that an
+    encrypted or corrupted file is detected up front.
+
+    Args:
+        pdf: The reader of the PDF to validate.
+
+    Returns:
+        The same reader that was passed in, if all checks pass.
+
+    Raises:
+        PDFValidationError: If the file is encrypted or corrupted.
+    """
+    try:
+        # This will raise if the file is encrypted
+        pdf.metadata  # pylint: disable=pointless-statement
+
+        # This will raise if the file's root object is corrupted
+        pdf.root_object  # pylint: disable=pointless-statement
+
+        # This will raise if the file's pages are corrupted
+        list(pdf.pages)
+
+        return pdf
+    # FileNotDecryptedError subclasses PdfReadError, so it must be handled first.
+    except FileNotDecryptedError as e:
+        raise PDFValidationError(
+            "File is encrypted. Please decrypt it with password.",
+        ) from e
+    except PdfReadError as e:
+        raise PDFValidationError(
+            f"File does not appear to be a valid PDF. Error: {e}",
+        ) from e
diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py
index 0b6cff20..7e18f09a 100644
--- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py
+++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py
@@ -16,7 +16,7 @@
 import aiofiles
 import httpx
 import nest_asyncio  # type: ignore
-from httpx import AsyncClient
+from httpx import AsyncClient, RequestError
 from pypdf import PdfReader, PdfWriter
 
 from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
@@ -303,6 +303,15 @@ def before_request(
         if pdf is None:
             return request
 
+        # Surface encrypted or corrupted PDFs to the caller as a RequestError.
+        try:
+            pdf = pdf_utils.check_pdf(pdf)
+        except pdf_utils.PDFValidationError as e:
+            raise RequestError(
+                message=e.message,
+                request=request,
+            ) from e
+
         starting_page_number = form_utils.get_starting_page_number(
             form_data,
             key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,