Skip to content

fix: Return error for invalid PDFs #277

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jun 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
## 0.37.0

### Enhancements

### Features

### Fixes
* Raises an appropriate error when the given PDF file is invalid (corrupted or encrypted).

## 0.30.0

### Enhancements
Expand Down
Binary file added _sample_docs/failing-encrypted.pdf
Binary file not shown.
Binary file added _sample_docs/failing-invalid.pdf
Binary file not shown.
Binary file added _sample_docs/failing-missing-pages.pdf
Binary file not shown.
Binary file added _sample_docs/failing-missing-root.pdf
Binary file not shown.
28 changes: 28 additions & 0 deletions _test_unstructured_client/integration/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,3 +348,31 @@ def test_partition_strategy_vlm_anthropic(split_pdf, vlm_model, vlm_model_provid
assert response.status_code == 200
assert len(response.elements) > 0
assert response.elements[0]["metadata"]["partitioner_type"] == "vlm_partition"


def test_returns_422_for_invalid_pdf(
    caplog: pytest.LogCaptureFixture,
    doc_path: Path,
    client: UnstructuredClient,
):
    """Test that partitioning an invalid PDF raises HTTPValidationError and logs a 422.

    The sample file is submitted with ``split_pdf_page=True``. The call is
    expected to fail with ``HTTPValidationError``, and the captured log must
    contain both the "not a valid PDF" message and the 422 status code.
    """
    pdf_name = "failing-invalid.pdf"
    with open(doc_path / pdf_name, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=pdf_name,
        )

    req = operations.PartitionRequest(
        partition_parameters=shared.PartitionParameters(
            files=files,
            strategy="fast",
            split_pdf_page=True,
        )
    )

    with pytest.raises(HTTPValidationError):
        client.general.partition(request=req)

    # The error details are checked through the log output captured by caplog.
    assert "File does not appear to be a valid PDF" in caplog.text
    assert "422" in caplog.text
53 changes: 53 additions & 0 deletions _test_unstructured_client/unit/test_pdf_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from __future__ import annotations

import io

import pytest
from pypdf import PdfReader

from unstructured_client._hooks.custom.pdf_utils import check_pdf, PDFValidationError
from _test_unstructured_client.unit_utils import sample_docs_path


def _open_pdf(pdf_path: str) -> PdfReader:
    """Read the file at *pdf_path* into memory and wrap the bytes in a ``PdfReader``."""
    with open(pdf_path, "rb") as pdf_file:
        buffer = io.BytesIO(pdf_file.read())
    return PdfReader(buffer)


def test_check_pdf_with_valid_pdf():
    """A valid sample PDF passes ``check_pdf`` and comes back as a ``PdfReader``."""
    reader = _open_pdf(sample_docs_path("list-item-example-1.pdf"))

    checked = check_pdf(reader)

    assert isinstance(checked, PdfReader)


@pytest.mark.parametrize(
    ("pdf_name", "expected_error_message"),
    [
        (
            "failing-encrypted.pdf",
            "File is encrypted. Please decrypt it with password.",
        ),
        (
            "failing-missing-root.pdf",
            "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf",
        ),
        (
            "failing-missing-pages.pdf",
            "File does not appear to be a valid PDF. Error: Invalid object in /Pages",
        ),
    ],
)
def test_check_pdf_raises_pdf_validation_error(
    pdf_name: str, expected_error_message: str
):
    """Each failing sample PDF makes ``check_pdf`` raise PDFValidationError with the exact expected message."""
    reader = _open_pdf(sample_docs_path(pdf_name))

    with pytest.raises(PDFValidationError) as raised:
        check_pdf(reader)

    # Compare the full message, not a substring, so wording regressions are caught.
    assert raised.value.message == expected_error_message
64 changes: 64 additions & 0 deletions _test_unstructured_client/unit/test_split_pdf_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
from unittest.mock import MagicMock, patch

import httpx
from httpx import RequestError
import pytest
import requests
from requests_toolbelt import MultipartDecoder

from _test_unstructured_client.unit_utils import sample_docs_path
from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
from unstructured_client._hooks.custom.form_utils import (
FormData,
Expand All @@ -29,6 +31,7 @@
SplitPdfHook,
get_optimal_split_size, run_tasks,
)
from unstructured_client._hooks.types import BeforeRequestContext
from unstructured_client.models import shared


Expand Down Expand Up @@ -462,3 +465,64 @@ def test_unit_get_split_pdf_cache_tmp_data_dir_uses_dir_from_form_data(mock_path
mock_path.assert_called_once_with(mock_dir)
mock_path_instance.exists.assert_called_once()
assert result == str(Path(mock_dir).resolve())


def test_before_request_raises_request_error_when_pdf_check_fails():
    """before_request must re-raise a PDFValidationError from pdf_utils.check_pdf as a RequestError."""
    # Message carried by the simulated validation failure.
    expected_message = "File does not appear to be a valid PDF."

    # Hook under test, initialised against a mocked client.
    hook = SplitPdfHook()
    hook.sdk_init(base_url="http://localhost:8888", client=MagicMock())

    # Hook context identifying the partition operation.
    hook_ctx = MagicMock()
    hook_ctx.operation_id = "partition"

    # Outgoing multipart request aimed at localhost.
    request = MagicMock()
    request.headers = {"Content-Type": "multipart/form-data"}
    request.url.host = "localhost"

    # Form payload with a fake PDF upload and page splitting enabled.
    pdf_file = MagicMock()
    pdf_file.read.return_value = b"mock_pdf_content"
    form_data = {
        "split_pdf_page": "true",
        "files": {
            "filename": "test.pdf",
            "content_type": "application/pdf",
            "file": pdf_file,
        },
    }

    # Stand-in for the reader object returned by read_pdf.
    pdf_reader = MagicMock()

    with patch("unstructured_client._hooks.custom.request_utils.get_multipart_stream_fields") as get_fields_mock, \
         patch("unstructured_client._hooks.custom.pdf_utils.read_pdf") as read_pdf_mock, \
         patch("unstructured_client._hooks.custom.pdf_utils.check_pdf") as check_pdf_mock, \
         patch("unstructured_client._hooks.custom.request_utils.get_base_url") as get_base_url_mock:

        # Wire the patched helpers: parsing succeeds, validation fails.
        get_fields_mock.return_value = form_data
        read_pdf_mock.return_value = pdf_reader
        check_pdf_mock.side_effect = pdf_utils.PDFValidationError(expected_message)
        get_base_url_mock.return_value = "http://localhost:8888"

        # The validation failure must surface as an httpx RequestError.
        with pytest.raises(RequestError) as raised:
            hook.before_request(hook_ctx, request)

        # The error carries the validation message and the original request.
        assert str(raised.value) == expected_message
        assert raised.value.request == request

        # Each patched helper was called exactly once with the expected argument.
        get_fields_mock.assert_called_once_with(request)
        read_pdf_mock.assert_called_once_with(pdf_file)
        check_pdf_mock.assert_called_once_with(pdf_reader)
42 changes: 41 additions & 1 deletion src/unstructured_client/_hooks/custom/pdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import cast, Optional, BinaryIO, Union

from pypdf import PdfReader
from pypdf.errors import PdfReadError
from pypdf.errors import FileNotDecryptedError, PdfReadError

from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME

Expand All @@ -16,6 +16,15 @@
pdf_logger = logging.getLogger("pypdf")
pdf_logger.setLevel(logging.ERROR)


class PDFValidationError(Exception):
    """Base exception for PDF validation errors.

    The human-readable reason is stored on ``message`` and is also passed to
    ``Exception`` so that ``str(error)`` yields the same text.
    """

    def __init__(self, message: str):
        super().__init__(message)
        self.message = message


def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
"""Reads the given PDF file.

Expand All @@ -33,3 +42,34 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
return PdfReader(pdf_file, strict=False)
except (PdfReadError, UnicodeDecodeError):
return None


def check_pdf(pdf: PdfReader) -> PdfReader:
"""
Check if PDF is:
- Encrypted
- Has corrupted pages
- Has corrupted root object

Throws:
- PDFValidationError if file is encrypted or corrupted
"""
try:
# This will raise if the file is encrypted
pdf.metadata # pylint: disable=pointless-statement

# This will raise if the file's root object is corrupted
pdf.root_object # pylint: disable=pointless-statement

# This will raise if the file's pages are corrupted
list(pdf.pages)
Copy link
Contributor

@pawel-kmiecik pawel-kmiecik Jun 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did we profile it for memory/time on large PDFs (like ~1k pages)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did not, but maybe we could just check the first page? I think it will result in the same error. WDYT?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tested. This does not slow down execution and doesn't affect memory usage, even with big files (tested on 200-, 1,000- and 10,000-page PDFs).

Results for 10k pages pdf:
mem_profiler_results.txt


return pdf
except FileNotDecryptedError as e:
raise PDFValidationError(
"File is encrypted. Please decrypt it with password.",
) from e
except PdfReadError as e:
raise PDFValidationError(
f"File does not appear to be a valid PDF. Error: {e}",
) from e
10 changes: 9 additions & 1 deletion src/unstructured_client/_hooks/custom/split_pdf_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import aiofiles
import httpx
import nest_asyncio # type: ignore
from httpx import AsyncClient
from httpx import AsyncClient, RequestError
from pypdf import PdfReader, PdfWriter

from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
Expand Down Expand Up @@ -303,6 +303,14 @@ def before_request(
if pdf is None:
return request

try:
pdf = pdf_utils.check_pdf(pdf)
except pdf_utils.PDFValidationError as e:
raise RequestError(
message=e.message,
request=request,
) from e

starting_page_number = form_utils.get_starting_page_number(
form_data,
key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
Expand Down