From 18719b2b708e3afc921a34aff50d49f8b80157ac Mon Sep 17 00:00:00 2001
From: Sameena-Thabassum
Date: Tue, 22 Apr 2025 22:36:44 +0530
Subject: [PATCH] Add support for 'basic' chunking strategy to match documentation

---
Review notes (ignored by `git am`):
- The new test closes its file handles and sends the same partitioning strategy
  in both requests, so chunking is the only variable being compared.
- The `description` string is kept on a single source line.
- Context-line indentation was reconstructed; verify it against the target
  files before applying. Blob ids on the `index` lines are those of the
  original submission.

 prepline_general/api/models/form_params.py |  6 ++---
 test_general/api/test_app.py               | 32 ++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/prepline_general/api/models/form_params.py b/prepline_general/api/models/form_params.py
index e0f4122e..7d7e00f9 100644
--- a/prepline_general/api/models/form_params.py
+++ b/prepline_general/api/models/form_params.py
@@ -178,11 +178,11 @@ def as_form(
         ] = False,
         # -- chunking options --
         chunking_strategy: Annotated[
-            Optional[Literal["by_title"]],
+            Optional[Literal["by_title", "basic"]],
             Form(
                 title="Chunking Strategy",
-                description="Use one of the supported strategies to chunk the returned elements. Currently supports: by_title",
-                examples=["by_title"],
+                description="Use one of the supported strategies to chunk the returned elements. Currently supports: by_title and basic. Default: None",
+                examples=["by_title", "basic"],
             ),
         ] = None,
         combine_under_n_chars: Annotated[
diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py
index afb743ac..36ca9367 100644
--- a/test_general/api/test_app.py
+++ b/test_general/api/test_app.py
@@ -1153,3 +1153,35 @@ def test_include_slide_notes(monkeypatch, test_default, include_slide_notes, tes
         assert "Here are important notes" == df["text"][0]
     else:
         assert "Here are important notes" != df["text"][0]
+
+
+def test_basic_chunking_strategy():
+    """Verify that the "basic" chunking strategy is accepted and chunks the elements.
+
+    The same document is posted twice with the same partitioning strategy, first without
+    chunking and then with `chunking_strategy="basic"`. Chunking is expected to change the
+    number of returned elements and to produce at least one `CompositeElement`.
+    """
+    client = TestClient(app)
+    test_file = Path("sample-docs") / "layout-parser-paper-fast.pdf"
+
+    def post_test_file(data):
+        # -- the context manager closes the file handle even if the request raises --
+        with open(test_file, "rb") as f:
+            return client.post(
+                MAIN_API_ROUTE,
+                files=[("files", (str(test_file), f))],
+                data=data,
+            )
+
+    response = post_test_file({"strategy": "hi_res"})
+    assert response.status_code == 200
+    elements_without_chunking = response.json()
+
+    # -- same partitioning strategy in both requests so chunking is the only variable --
+    response = post_test_file({"strategy": "hi_res", "chunking_strategy": "basic"})
+    assert response.status_code == 200
+    elements_with_chunking = response.json()
+
+    assert len(elements_with_chunking) != len(elements_without_chunking)
+    assert any(element.get("type") == "CompositeElement" for element in elements_with_chunking)