reduce info logging (#197)

jordan-homan · web-flow · commit e1e2a1b168d4 · 2024-10-24T09:50:27.000-04:00
diff --git a/src/unstructured_client/_hooks/custom/logger_hook.py b/src/unstructured_client/_hooks/custom/logger_hook.py
@@ -77,13 +77,10 @@ def after_error(
         if response and response.status_code == 200:
             # NOTE: Even though this is an after_error method, due to split_pdf_hook logic we may get
             # a success here when one of the split requests was partitioned successfully
-            logger.info("Successfully partitioned the document.")
-        
-        else:
-            logger.error("Failed to partition the document.")
-            if response:
-                logger.error("Server responded with %d - %s", response.status_code, response.text)
-            if error is not None:
-                logger.error("Following error occurred - %s", error)
-        
+            return response, error
+        logger.error("Failed to partition the document.")
+        if response:
+            logger.error("Server responded with %d - %s", response.status_code, response.text)
+        if error is not None:
+            logger.error("Following error occurred - %s", error)
         return response, error
diff --git a/src/unstructured_client/_hooks/custom/pdf_utils.py b/src/unstructured_client/_hooks/custom/pdf_utils.py
@@ -51,7 +51,7 @@ def get_pdf_pages(
         new_pdf.write(pdf_buffer)
         pdf_buffer.seek(0)
 
-        yield pdf_buffer, offset, offset_end
+        yield pdf_buffer, offset
         offset += split_size
 
 
diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py
@@ -222,39 +222,32 @@ def before_request(
         if split_pdf_page is None or split_pdf_page == "false":
             return request
 
-        logger.info("Preparing to split document for partition.")
         file = form_data.get(PARTITION_FORM_FILES_KEY)
         if (
                 file is None
                 or not isinstance(file, shared.Files)
                 or not pdf_utils.is_pdf(file)
         ):
-            logger.info("Partitioning without split.")
             return request
 
         starting_page_number = form_utils.get_starting_page_number(
             form_data,
             key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
             fallback_value=DEFAULT_STARTING_PAGE_NUMBER,
         )
-        if starting_page_number > 1:
-            logger.info("Starting page number set to %d", starting_page_number)
-        logger.info("Starting page number set to %d", starting_page_number)
 
         self.allow_failed = form_utils.get_split_pdf_allow_failed_param(
             form_data,
             key=PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY,
             fallback_value=DEFAULT_ALLOW_FAILED,
         )
-        logger.info("Allow failed set to %d", self.allow_failed)
 
         concurrency_level = form_utils.get_split_pdf_concurrency_level_param(
             form_data,
             key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
             fallback_value=DEFAULT_CONCURRENCY_LEVEL,
             max_allowed=MAX_CONCURRENCY_LEVEL,
         )
-        logger.info("Concurrency level set to %d", concurrency_level)
         limiter = asyncio.Semaphore(concurrency_level)
 
         content = cast(bytes, file.content)
@@ -267,40 +260,17 @@ def before_request(
         )
 
         page_count = page_range_end - page_range_start + 1
-        logger.info(
-            "Splitting pages %d to %d (%d total)",
-            page_range_start,
-            page_range_end,
-            page_count,
-        )
 
         split_size = get_optimal_split_size(
             num_pages=page_count, concurrency_level=concurrency_level
         )
-        logger.info("Determined optimal split size of %d pages.", split_size)
 
         # If the doc is small enough, and we aren't slicing it with a page range:
         # do not split, just continue with the original request
         if split_size >= page_count and page_count == len(pdf.pages):
-            logger.info(
-                "Document has too few pages (%d) to be split efficiently. Partitioning without split.",
-                page_count,
-            )
             return request
 
         pages = pdf_utils.get_pdf_pages(pdf, split_size=split_size, page_start=page_range_start, page_end=page_range_end)
-        logger.info(
-            "Partitioning %d files with %d page(s) each.",
-            math.floor(page_count / split_size),
-            split_size,
-        )
-
-        # Log the remainder pages if there are any
-        if page_count % split_size > 0:
-            logger.info(
-                "Partitioning 1 file with %d page(s).",
-                page_count % split_size,
-            )
 
         # Use a variable to adjust the httpx client timeout, or default to 30 minutes
         # When we're able to reuse the SDK to make these calls, we can remove this var
@@ -326,14 +296,8 @@ async def call_api_partial(page):
 
         self.coroutines_to_execute[operation_id] = []
         set_index = 1
-        for page_content, page_index, all_pages_number in pages:
+        for page_content, page_index in pages:
             page_number = page_index + starting_page_number
-            logger.info(
-                "Partitioning set #%d (pages %d-%d).",
-                set_index,
-                page_number,
-                min(page_number + split_size - 1, all_pages_number),
-            )
 
             coroutine = call_api_partial((page_content, page_number))
             self.coroutines_to_execute[operation_id].append(coroutine)