Skip to content

Commit 6dca3a3

Browse files
committed
Revert "Merge pull request #13 from Unstructured-IO/jj/cleaning"
This reverts commit 85bef70, reversing changes made to fd2a559.
1 parent 85bef70 commit 6dca3a3

16 files changed

+334
-531
lines changed

README.md

+15-22
Original file line numberDiff line numberDiff line change
@@ -34,37 +34,30 @@ from unstructured_client.models.errors import SDKError
3434

3535
s = UnstructuredClient(api_key_auth="YOUR_API_KEY")
3636

37-
filename = "sample-docs/layout-parser-paper-fast.pdf"
38-
39-
with open(filename, "rb") as f:
40-
# Note that this currently only supports a single file
41-
files=shared.Files(
42-
content=f.read(),
43-
file_name=filename,
44-
)
37+
filename = "sample-docs/layout-parser-paper.pdf"
38+
file = open(filename, "rb")
4539

4640
req = shared.PartitionParameters(
47-
files=files,
48-
strategy='ocr_only',
49-
languages=["eng"],
41+
# Note that this currently only supports a single file
42+
files=shared.Files(
43+
content=file.read(),
44+
file_name=filename,
45+
),
46+
# Other partition params
47+
strategy="fast",
5048
)
5149

5250
try:
53-
resp = s.general.partition(req)
54-
print(resp.elements[0])
51+
res = s.general.partition(req)
52+
print(res.elements[0])
5553
except SDKError as e:
5654
print(e)
5755

5856
# {
59-
# 'type': 'UncategorizedText',
60-
# 'element_id': 'fc550084fda1e008e07a0356894f5816',
61-
# 'metadata': {
62-
# 'filename': 'layout-parser-paper-fast.pdf',
63-
# 'filetype': 'application/pdf',
64-
# 'languages': ['eng'],
65-
# 'page_number': 1
66-
# },
67-
# 'text': '2103.15348v2 [cs.CV] 21 Jun 2021'
57+
# 'type': 'Title',
58+
# 'element_id': '015301d4f56aa4b20ec10ac889d2343f',
59+
# 'metadata': {'filename': 'layout-parser-paper.pdf', 'filetype': 'application/pdf', 'page_number': 1},
60+
# 'text': 'LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis'
6861
# }
6962
```
7063

-168 KB
Binary file not shown.

setup.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@
3232
"typing_extensions>=4.7.1",
3333
"urllib3>=2.0.4",
3434
],
35-
extras_require={"dev": ["pylint==2.16.2"]},
36-
package_dir={"": "src"},
37-
python_requires=">=3.8",
35+
extras_require={
36+
"dev":["pylint==2.16.2"]
37+
},
38+
package_dir={'': 'src'},
39+
python_requires='>=3.8'
3840
)

src/unstructured_client/general.py

+31-59
Original file line numberDiff line numberDiff line change
@@ -5,93 +5,65 @@
55
from unstructured_client import utils
66
from unstructured_client.models import errors, operations, shared
77

8-
98
class General:
109
sdk_configuration: SDKConfiguration
1110

1211
def __init__(self, sdk_config: SDKConfiguration) -> None:
1312
self.sdk_configuration = sdk_config
14-
15-
def partition(
16-
self,
17-
request: shared.PartitionParameters,
18-
retries: Optional[utils.RetryConfig] = None,
19-
) -> operations.PartitionResponse:
13+
14+
15+
16+
def partition(self, request: shared.PartitionParameters, retries: Optional[utils.RetryConfig] = None) -> operations.PartitionResponse:
2017
r"""Pipeline 1"""
2118
base_url = utils.template_url(*self.sdk_configuration.get_server_details())
22-
23-
url = base_url + "/general/v0/general"
19+
20+
url = base_url + '/general/v0/general'
2421
headers = {}
25-
req_content_type, data, form = utils.serialize_request_body(
26-
request, "request", False, True, "multipart"
27-
)
28-
if req_content_type not in ("multipart/form-data", "multipart/mixed"):
29-
headers["content-type"] = req_content_type
30-
headers["Accept"] = "application/json"
31-
headers["user-agent"] = self.sdk_configuration.user_agent
32-
22+
req_content_type, data, form = utils.serialize_request_body(request, "request", False, True, 'multipart')
23+
if req_content_type not in ('multipart/form-data', 'multipart/mixed'):
24+
headers['content-type'] = req_content_type
25+
headers['Accept'] = 'application/json'
26+
headers['user-agent'] = self.sdk_configuration.user_agent
27+
3328
if callable(self.sdk_configuration.security):
34-
client = utils.configure_security_client(
35-
self.sdk_configuration.client, self.sdk_configuration.security()
36-
)
29+
client = utils.configure_security_client(self.sdk_configuration.client, self.sdk_configuration.security())
3730
else:
38-
client = utils.configure_security_client(
39-
self.sdk_configuration.client, self.sdk_configuration.security
40-
)
41-
31+
client = utils.configure_security_client(self.sdk_configuration.client, self.sdk_configuration.security)
32+
4233
global_retry_config = self.sdk_configuration.retry_config
4334
retry_config = retries
4435
if retry_config is None:
4536
if global_retry_config:
4637
retry_config = global_retry_config
4738
else:
48-
retry_config = utils.RetryConfig(
49-
"backoff", utils.BackoffStrategy(500, 60000, 1.5, 3600000), True
50-
)
39+
retry_config = utils.RetryConfig('backoff', utils.BackoffStrategy(500, 60000, 1.5, 3600000), True)
5140

5241
def do_request():
53-
return client.request("POST", url, data=data, files=form, headers=headers)
54-
55-
http_res = utils.retry(do_request, utils.Retries(retry_config, ["5xx"]))
56-
content_type = http_res.headers.get("Content-Type")
42+
return client.request('POST', url, data=data, files=form, headers=headers)
5743

58-
res = operations.PartitionResponse(
59-
status_code=http_res.status_code,
60-
content_type=content_type,
61-
raw_response=http_res,
62-
)
44+
http_res = utils.retry(do_request, utils.Retries(retry_config, [
45+
'5xx'
46+
]))
47+
content_type = http_res.headers.get('Content-Type')
6348

49+
res = operations.PartitionResponse(status_code=http_res.status_code, content_type=content_type, raw_response=http_res)
50+
6451
if http_res.status_code == 200:
65-
if utils.match_content_type(content_type, "application/json"):
52+
if utils.match_content_type(content_type, 'application/json'):
6653
out = utils.unmarshal_json(http_res.text, Optional[List[Any]])
6754
res.elements = out
6855
else:
69-
raise errors.SDKError(
70-
f"unknown content-type received: {content_type}",
71-
http_res.status_code,
72-
http_res.text,
73-
http_res,
74-
)
56+
raise errors.SDKError(f'unknown content-type received: {content_type}', http_res.status_code, http_res.text, http_res)
7557
elif http_res.status_code == 422:
76-
if utils.match_content_type(content_type, "application/json"):
58+
if utils.match_content_type(content_type, 'application/json'):
7759
out = utils.unmarshal_json(http_res.text, errors.HTTPValidationError)
7860
out.raw_response = http_res
7961
raise out
8062
else:
81-
raise errors.SDKError(
82-
f"unknown content-type received: {content_type}",
83-
http_res.status_code,
84-
http_res.text,
85-
http_res,
86-
)
87-
elif (
88-
http_res.status_code >= 400
89-
and http_res.status_code < 500
90-
or http_res.status_code >= 500
91-
and http_res.status_code < 600
92-
):
93-
raise errors.SDKError(
94-
"API error occurred", http_res.status_code, http_res.text, http_res
95-
)
63+
raise errors.SDKError(f'unknown content-type received: {content_type}', http_res.status_code, http_res.text, http_res)
64+
elif http_res.status_code >= 400 and http_res.status_code < 500 or http_res.status_code >= 500 and http_res.status_code < 600:
65+
raise errors.SDKError('API error occurred', http_res.status_code, http_res.text, http_res)
9666

9767
return res
68+
69+

src/unstructured_client/models/errors/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@
44
from .sdkerror import *
55
from .validationerror import *
66

7-
__all__ = ["HTTPValidationError", "SDKError", "ValidationError"]
7+
__all__ = ["HTTPValidationError","SDKError","ValidationError"]

src/unstructured_client/models/errors/httpvalidationerror.py

+3-9
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,11 @@
99

1010

1111
@dataclass_json(undefined=Undefined.EXCLUDE)
12+
1213
@dataclasses.dataclass
1314
class HTTPValidationError(Exception):
14-
detail: Optional[List[ValidationError]] = dataclasses.field(
15-
default=None,
16-
metadata={
17-
"dataclasses_json": {
18-
"letter_case": utils.get_field_name("detail"),
19-
"exclude": lambda f: f is None,
20-
}
21-
},
22-
)
15+
detail: Optional[List[ValidationError]] = dataclasses.field(default=None, metadata={'dataclasses_json': { 'letter_case': utils.get_field_name('detail'), 'exclude': lambda f: f is None }})
16+
2317

2418
def __str__(self) -> str:
2519
return utils.marshal_json(self)

src/unstructured_client/models/errors/sdkerror.py

+4-11
Original file line numberDiff line numberDiff line change
@@ -5,27 +5,20 @@
55

66
class SDKError(Exception):
77
"""Represents an error returned by the API."""
8-
98
message: str
109
status_code: int
1110
body: str
1211
raw_response: requests_http.Response
1312

14-
def __init__(
15-
self,
16-
message: str,
17-
status_code: int,
18-
body: str,
19-
raw_response: requests_http.Response,
20-
):
13+
def __init__(self, message: str, status_code: int, body: str, raw_response: requests_http.Response):
2114
self.message = message
2215
self.status_code = status_code
2316
self.body = body
2417
self.raw_response = raw_response
2518

2619
def __str__(self):
27-
body = ""
20+
body = ''
2821
if len(self.body) > 0:
29-
body = f"\n{self.body}"
22+
body = f'\n{self.body}'
3023

31-
return f"{self.message}: Status {self.status_code}{body}"
24+
return f'{self.message}: Status {self.status_code}{body}'

src/unstructured_client/models/errors/validationerror.py

+5-9
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,8 @@
1010
@dataclass_json(undefined=Undefined.EXCLUDE)
1111
@dataclasses.dataclass
1212
class ValidationError:
13-
loc: List[Union[str, int]] = dataclasses.field(
14-
metadata={"dataclasses_json": {"letter_case": utils.get_field_name("loc")}}
15-
)
16-
msg: str = dataclasses.field(
17-
metadata={"dataclasses_json": {"letter_case": utils.get_field_name("msg")}}
18-
)
19-
type: str = dataclasses.field(
20-
metadata={"dataclasses_json": {"letter_case": utils.get_field_name("type")}}
21-
)
13+
loc: List[Union[str, int]] = dataclasses.field(metadata={'dataclasses_json': { 'letter_case': utils.get_field_name('loc') }})
14+
msg: str = dataclasses.field(metadata={'dataclasses_json': { 'letter_case': utils.get_field_name('msg') }})
15+
type: str = dataclasses.field(metadata={'dataclasses_json': { 'letter_case': utils.get_field_name('type') }})
16+
17+

src/unstructured_client/models/operations/partition.py

+2
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,5 @@ class PartitionResponse:
1616
r"""Successful Response"""
1717
raw_response: Optional[requests_http.Response] = dataclasses.field(default=None)
1818
r"""Raw HTTP response; suitable for custom response parsing"""
19+
20+

src/unstructured_client/models/shared/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@
33
from .partition_parameters import *
44
from .security import *
55

6-
__all__ = ["Files", "PartitionParameters", "Security"]
6+
__all__ = ["Files","PartitionParameters","Security"]

src/unstructured_client/models/shared/partition_parameters.py

+23-59
Original file line numberDiff line numberDiff line change
@@ -7,83 +7,47 @@
77

88
@dataclasses.dataclass
99
class Files:
10-
content: bytes = dataclasses.field(metadata={"multipart_form": {"content": True}})
11-
file_name: str = dataclasses.field(
12-
metadata={"multipart_form": {"field_name": "files"}}
13-
)
10+
content: bytes = dataclasses.field(metadata={'multipart_form': { 'content': True }})
11+
file_name: str = dataclasses.field(metadata={'multipart_form': { 'field_name': 'files' }})
12+
13+
1414

1515

1616
@dataclasses.dataclass
1717
class PartitionParameters:
18-
chunking_strategy: Optional[str] = dataclasses.field(
19-
default=None, metadata={"multipart_form": {"field_name": "chunking_strategy"}}
20-
)
18+
chunking_strategy: Optional[str] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'chunking_strategy' }})
2119
r"""Use one of the supported strategies to chunk the returned elements. Currently supports: by_title"""
22-
combine_under_n_chars: Optional[int] = dataclasses.field(
23-
default=None,
24-
metadata={"multipart_form": {"field_name": "combine_under_n_chars"}},
25-
)
20+
combine_under_n_chars: Optional[int] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'combine_under_n_chars' }})
2621
r"""If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: 500"""
27-
coordinates: Optional[bool] = dataclasses.field(
28-
default=None, metadata={"multipart_form": {"field_name": "coordinates"}}
29-
)
22+
coordinates: Optional[bool] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'coordinates' }})
3023
r"""If true, return coordinates for each element. Default: false"""
31-
encoding: Optional[str] = dataclasses.field(
32-
default=None, metadata={"multipart_form": {"field_name": "encoding"}}
33-
)
24+
encoding: Optional[str] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'encoding' }})
3425
r"""The encoding method used to decode the text input. Default: utf-8"""
35-
files: Optional[Files] = dataclasses.field(
36-
default=None, metadata={"multipart_form": {"file": True}}
37-
)
26+
files: Optional[Files] = dataclasses.field(default=None, metadata={'multipart_form': { 'file': True }})
3827
r"""The file to extract"""
39-
gz_uncompressed_content_type: Optional[str] = dataclasses.field(
40-
default=None,
41-
metadata={"multipart_form": {"field_name": "gz_uncompressed_content_type"}},
42-
)
28+
gz_uncompressed_content_type: Optional[str] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'gz_uncompressed_content_type' }})
4329
r"""If file is gzipped, use this content type after unzipping"""
44-
hi_res_model_name: Optional[str] = dataclasses.field(
45-
default=None, metadata={"multipart_form": {"field_name": "hi_res_model_name"}}
46-
)
30+
hi_res_model_name: Optional[str] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'hi_res_model_name' }})
4731
r"""The name of the inference model used when strategy is hi_res"""
48-
include_page_breaks: Optional[bool] = dataclasses.field(
49-
default=None, metadata={"multipart_form": {"field_name": "include_page_breaks"}}
50-
)
32+
include_page_breaks: Optional[bool] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'include_page_breaks' }})
5133
r"""If True, the output will include page breaks if the filetype supports it. Default: false"""
52-
languages: Optional[List[str]] = dataclasses.field(
53-
default=None, metadata={"multipart_form": {"field_name": "languages"}}
54-
)
34+
languages: Optional[List[str]] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'languages' }})
5535
r"""The languages present in the document, for use in partitioning and/or OCR"""
56-
max_characters: Optional[int] = dataclasses.field(
57-
default=None, metadata={"multipart_form": {"field_name": "max_characters"}}
58-
)
36+
max_characters: Optional[int] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'max_characters' }})
5937
r"""If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). Default: 1500"""
60-
multipage_sections: Optional[bool] = dataclasses.field(
61-
default=None, metadata={"multipart_form": {"field_name": "multipage_sections"}}
62-
)
38+
multipage_sections: Optional[bool] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'multipage_sections' }})
6339
r"""If chunking strategy is set, determines if sections can span multiple sections. Default: true"""
64-
new_after_n_chars: Optional[int] = dataclasses.field(
65-
default=None, metadata={"multipart_form": {"field_name": "new_after_n_chars"}}
66-
)
40+
new_after_n_chars: Optional[int] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'new_after_n_chars' }})
6741
r"""If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: 1500"""
68-
output_format: Optional[str] = dataclasses.field(
69-
default=None, metadata={"multipart_form": {"field_name": "output_format"}}
70-
)
42+
output_format: Optional[str] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'output_format' }})
7143
r"""The format of the response. Supported formats are application/json and text/csv. Default: application/json."""
72-
pdf_infer_table_structure: Optional[bool] = dataclasses.field(
73-
default=None,
74-
metadata={"multipart_form": {"field_name": "pdf_infer_table_structure"}},
75-
)
44+
pdf_infer_table_structure: Optional[bool] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'pdf_infer_table_structure' }})
7645
r"""If True and strategy=hi_res, any Table Elements extracted from a PDF will include an additional metadata field, 'text_as_html', where the value (string) is just a transformation of the data into an HTML <table>."""
77-
skip_infer_table_types: Optional[List[str]] = dataclasses.field(
78-
default=None,
79-
metadata={"multipart_form": {"field_name": "skip_infer_table_types"}},
80-
)
46+
skip_infer_table_types: Optional[List[str]] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'skip_infer_table_types' }})
8147
r"""The document types that you want to skip table extraction with. Default: ['pdf', 'jpg', 'png']"""
82-
strategy: Optional[str] = dataclasses.field(
83-
default=None, metadata={"multipart_form": {"field_name": "strategy"}}
84-
)
48+
strategy: Optional[str] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'strategy' }})
8549
r"""The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto"""
86-
xml_keep_tags: Optional[bool] = dataclasses.field(
87-
default=None, metadata={"multipart_form": {"field_name": "xml_keep_tags"}}
88-
)
50+
xml_keep_tags: Optional[bool] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'xml_keep_tags' }})
8951
r"""If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to partition_xml."""
52+
53+

0 commit comments

Comments
 (0)