Skip to content

Commit e1e2a1b

Browse files
authored
reduce info logging (#197)
1 parent b75cb2c commit e1e2a1b

File tree

3 files changed: 8 additions (+8) and 47 deletions (-47).

src/unstructured_client/_hooks/custom/logger_hook.py

+6 -9
Original file line number | Diff line number | Diff line change
@@ -77,13 +77,10 @@ def after_error(
7777
if response and response.status_code == 200:
7878
# NOTE: Even though this is an after_error method, due to split_pdf_hook logic we may get
7979
# a success here when one of the split requests was partitioned successfully
80-
logger.info("Successfully partitioned the document.")
81-
82-
else:
83-
logger.error("Failed to partition the document.")
84-
if response:
85-
logger.error("Server responded with %d - %s", response.status_code, response.text)
86-
if error is not None:
87-
logger.error("Following error occurred - %s", error)
88-
80+
return response, error
81+
logger.error("Failed to partition the document.")
82+
if response:
83+
logger.error("Server responded with %d - %s", response.status_code, response.text)
84+
if error is not None:
85+
logger.error("Following error occurred - %s", error)
8986
return response, error

src/unstructured_client/_hooks/custom/pdf_utils.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ def get_pdf_pages(
5151
new_pdf.write(pdf_buffer)
5252
pdf_buffer.seek(0)
5353

54-
yield pdf_buffer, offset, offset_end
54+
yield pdf_buffer, offset
5555
offset += split_size
5656

5757

src/unstructured_client/_hooks/custom/split_pdf_hook.py

+1 -37
Original file line number | Diff line number | Diff line change
@@ -222,39 +222,32 @@ def before_request(
222222
if split_pdf_page is None or split_pdf_page == "false":
223223
return request
224224

225-
logger.info("Preparing to split document for partition.")
226225
file = form_data.get(PARTITION_FORM_FILES_KEY)
227226
if (
228227
file is None
229228
or not isinstance(file, shared.Files)
230229
or not pdf_utils.is_pdf(file)
231230
):
232-
logger.info("Partitioning without split.")
233231
return request
234232

235233
starting_page_number = form_utils.get_starting_page_number(
236234
form_data,
237235
key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
238236
fallback_value=DEFAULT_STARTING_PAGE_NUMBER,
239237
)
240-
if starting_page_number > 1:
241-
logger.info("Starting page number set to %d", starting_page_number)
242-
logger.info("Starting page number set to %d", starting_page_number)
243238

244239
self.allow_failed = form_utils.get_split_pdf_allow_failed_param(
245240
form_data,
246241
key=PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY,
247242
fallback_value=DEFAULT_ALLOW_FAILED,
248243
)
249-
logger.info("Allow failed set to %d", self.allow_failed)
250244

251245
concurrency_level = form_utils.get_split_pdf_concurrency_level_param(
252246
form_data,
253247
key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
254248
fallback_value=DEFAULT_CONCURRENCY_LEVEL,
255249
max_allowed=MAX_CONCURRENCY_LEVEL,
256250
)
257-
logger.info("Concurrency level set to %d", concurrency_level)
258251
limiter = asyncio.Semaphore(concurrency_level)
259252

260253
content = cast(bytes, file.content)
@@ -267,40 +260,17 @@ def before_request(
267260
)
268261

269262
page_count = page_range_end - page_range_start + 1
270-
logger.info(
271-
"Splitting pages %d to %d (%d total)",
272-
page_range_start,
273-
page_range_end,
274-
page_count,
275-
)
276263

277264
split_size = get_optimal_split_size(
278265
num_pages=page_count, concurrency_level=concurrency_level
279266
)
280-
logger.info("Determined optimal split size of %d pages.", split_size)
281267

282268
# If the doc is small enough, and we aren't slicing it with a page range:
283269
# do not split, just continue with the original request
284270
if split_size >= page_count and page_count == len(pdf.pages):
285-
logger.info(
286-
"Document has too few pages (%d) to be split efficiently. Partitioning without split.",
287-
page_count,
288-
)
289271
return request
290272

291273
pages = pdf_utils.get_pdf_pages(pdf, split_size=split_size, page_start=page_range_start, page_end=page_range_end)
292-
logger.info(
293-
"Partitioning %d files with %d page(s) each.",
294-
math.floor(page_count / split_size),
295-
split_size,
296-
)
297-
298-
# Log the remainder pages if there are any
299-
if page_count % split_size > 0:
300-
logger.info(
301-
"Partitioning 1 file with %d page(s).",
302-
page_count % split_size,
303-
)
304274

305275
# Use a variable to adjust the httpx client timeout, or default to 30 minutes
306276
# When we're able to reuse the SDK to make these calls, we can remove this var
@@ -326,14 +296,8 @@ async def call_api_partial(page):
326296

327297
self.coroutines_to_execute[operation_id] = []
328298
set_index = 1
329-
for page_content, page_index, all_pages_number in pages:
299+
for page_content, page_index in pages:
330300
page_number = page_index + starting_page_number
331-
logger.info(
332-
"Partitioning set #%d (pages %d-%d).",
333-
set_index,
334-
page_number,
335-
min(page_number + split_size - 1, all_pages_number),
336-
)
337301

338302
coroutine = call_api_partial((page_content, page_number))
339303
self.coroutines_to_execute[operation_id].append(coroutine)

0 commit comments

Comments
 (0)