@@ -222,39 +222,32 @@ def before_request(
222
222
if split_pdf_page is None or split_pdf_page == "false" :
223
223
return request
224
224
225
- logger .info ("Preparing to split document for partition." )
226
225
file = form_data .get (PARTITION_FORM_FILES_KEY )
227
226
if (
228
227
file is None
229
228
or not isinstance (file , shared .Files )
230
229
or not pdf_utils .is_pdf (file )
231
230
):
232
- logger .info ("Partitioning without split." )
233
231
return request
234
232
235
233
starting_page_number = form_utils .get_starting_page_number (
236
234
form_data ,
237
235
key = PARTITION_FORM_STARTING_PAGE_NUMBER_KEY ,
238
236
fallback_value = DEFAULT_STARTING_PAGE_NUMBER ,
239
237
)
240
- if starting_page_number > 1 :
241
- logger .info ("Starting page number set to %d" , starting_page_number )
242
- logger .info ("Starting page number set to %d" , starting_page_number )
243
238
244
239
self .allow_failed = form_utils .get_split_pdf_allow_failed_param (
245
240
form_data ,
246
241
key = PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY ,
247
242
fallback_value = DEFAULT_ALLOW_FAILED ,
248
243
)
249
- logger .info ("Allow failed set to %d" , self .allow_failed )
250
244
251
245
concurrency_level = form_utils .get_split_pdf_concurrency_level_param (
252
246
form_data ,
253
247
key = PARTITION_FORM_CONCURRENCY_LEVEL_KEY ,
254
248
fallback_value = DEFAULT_CONCURRENCY_LEVEL ,
255
249
max_allowed = MAX_CONCURRENCY_LEVEL ,
256
250
)
257
- logger .info ("Concurrency level set to %d" , concurrency_level )
258
251
limiter = asyncio .Semaphore (concurrency_level )
259
252
260
253
content = cast (bytes , file .content )
@@ -267,40 +260,17 @@ def before_request(
267
260
)
268
261
269
262
page_count = page_range_end - page_range_start + 1
270
- logger .info (
271
- "Splitting pages %d to %d (%d total)" ,
272
- page_range_start ,
273
- page_range_end ,
274
- page_count ,
275
- )
276
263
277
264
split_size = get_optimal_split_size (
278
265
num_pages = page_count , concurrency_level = concurrency_level
279
266
)
280
- logger .info ("Determined optimal split size of %d pages." , split_size )
281
267
282
268
# If the doc is small enough, and we aren't slicing it with a page range:
283
269
# do not split, just continue with the original request
284
270
if split_size >= page_count and page_count == len (pdf .pages ):
285
- logger .info (
286
- "Document has too few pages (%d) to be split efficiently. Partitioning without split." ,
287
- page_count ,
288
- )
289
271
return request
290
272
291
273
pages = pdf_utils .get_pdf_pages (pdf , split_size = split_size , page_start = page_range_start , page_end = page_range_end )
292
- logger .info (
293
- "Partitioning %d files with %d page(s) each." ,
294
- math .floor (page_count / split_size ),
295
- split_size ,
296
- )
297
-
298
- # Log the remainder pages if there are any
299
- if page_count % split_size > 0 :
300
- logger .info (
301
- "Partitioning 1 file with %d page(s)." ,
302
- page_count % split_size ,
303
- )
304
274
305
275
# Use a variable to adjust the httpx client timeout, or default to 30 minutes
306
276
# When we're able to reuse the SDK to make these calls, we can remove this var
@@ -326,14 +296,8 @@ async def call_api_partial(page):
326
296
327
297
self .coroutines_to_execute [operation_id ] = []
328
298
set_index = 1
329
- for page_content , page_index , all_pages_number in pages :
299
+ for page_content , page_index in pages :
330
300
page_number = page_index + starting_page_number
331
- logger .info (
332
- "Partitioning set #%d (pages %d-%d)." ,
333
- set_index ,
334
- page_number ,
335
- min (page_number + split_size - 1 , all_pages_number ),
336
- )
337
301
338
302
coroutine = call_api_partial ((page_content , page_number ))
339
303
self .coroutines_to_execute [operation_id ].append (coroutine )
0 commit comments