diff --git a/Dockerfile.app b/Dockerfile.app
index 6d4c6433..231eb7e4 100644
--- a/Dockerfile.app
+++ b/Dockerfile.app
@@ -32,7 +32,8 @@ RUN --mount=type=cache,target="/var/cache/apt",sharing=locked \
     set -eux; \
     apt-get update; \
     apt-get upgrade -y; \
-    apt-get install --no-install-recommends -y procps; \
+    apt-get install --no-install-recommends -y \
+        procps tesseract-ocr tesseract-ocr-chi-sim tesseract-ocr-chi-tra tesseract-ocr-deu; \
     apt-get autoremove -y

 RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
diff --git a/adala/skills/__init__.py b/adala/skills/__init__.py
index 6cd7c3fa..65583f39 100644
--- a/adala/skills/__init__.py
+++ b/adala/skills/__init__.py
@@ -4,4 +4,5 @@ from .collection.rag import RAGSkill
 from .collection.ontology_creation import OntologyCreator, OntologyMerger
 from .collection.label_studio import LabelStudioSkill
+from .collection.label_studio_image_ocr import LabelStudioSkillImageOCR
 from ._base import Skill, TransformSkill, AnalysisSkill, SynthesisSkill
diff --git a/adala/skills/collection/label_studio_image_ocr.py b/adala/skills/collection/label_studio_image_ocr.py
new file mode 100644
index 00000000..1bde9aa3
--- /dev/null
+++ b/adala/skills/collection/label_studio_image_ocr.py
@@ -0,0 +1,679 @@
+import re
+import logging
+import pandas as pd
+from typing import List, Optional, Type, Dict, Tuple
+from functools import cached_property
+from copy import deepcopy
+from collections import defaultdict
+import aiohttp
+import base64
+import asyncio
+import io
+from thefuzz import fuzz
+from PIL import Image
+from urllib.parse import urlparse
+import uuid
+from adala.skills._base import TransformSkill
+from adala.runtimes import AsyncLiteLLMVisionRuntime
+from adala.runtimes._litellm import MessageChunkType
+from pydantic import BaseModel, Field, model_validator, computed_field
+from difflib import SequenceMatcher
+import numpy as np
+
+from adala.runtimes import Runtime, AsyncRuntime
+from adala.utils.internal_data import InternalDataFrame
+
+from label_studio_sdk.label_interface import LabelInterface
+from label_studio_sdk.label_interface.control_tags import ControlTag, ObjectTag
+from label_studio_sdk._extensions.label_studio_tools.core.utils.json_schema import (
+    json_schema_to_pydantic,
+)
+from .match_bbox_by_text import find_text_in_image
+
+
+logger = logging.getLogger(__name__)
+
+
+def extract_variable_name(input_string):
+    """Extract variable names specified as $<name> in the input string."""
+    pattern = r"\$([a-zA-Z0-9_]+)"
+    matches = re.findall(pattern, input_string)
+    return matches
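
A quick illustration of what this helper extracts (the tag markup below is a hypothetical example; the function only cares about the `$variable` syntax):

    >>> extract_variable_name("<Image name='image' value='$image_url'/>")
    ['image_url']
    >>> extract_variable_name("no variables here")
    []
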
+
+
+class LabelStudioSkillImageOCR(TransformSkill):
+
+    name: str = "label_studio"
+    input_template: str = "Annotate the input data according to the provided schema."
+    # TODO: remove output_template, fix calling @model_validator(mode='after') in the base class
+    output_template: str = "Output: {field_name}"
+    response_model: Type[BaseModel] = (
+        BaseModel  # why is validate_response_model called in the base class?
+    )
+    # ------------------------------
+    label_config: str = ""
+    allowed_control_tags: Optional[list[str]] = None
+    allowed_object_tags: Optional[list[str]] = None
+
+    # TODO: implement postprocessing to verify Taxonomy
+
+    @cached_property
+    def label_interface(self) -> LabelInterface:
+        return LabelInterface(self.label_config)
+
+    @cached_property
+    def image_tags(self) -> List[ObjectTag]:
+        # check if any image tags are used as input variables
+        object_tag_names = self.allowed_object_tags or list(
+            self.label_interface._objects.keys()
+        )
+        tags = []
+        for tag_name in object_tag_names:
+            tag = self.label_interface.get_object(tag_name)
+            if tag.tag.lower() == "image":
+                tags.append(tag)
+        return tags
+
+    def __getstate__(self):
+        """Exclude cached properties when pickling - otherwise the 'Agent' cannot be serialized in Celery"""
+        state = deepcopy(super().__getstate__())
+        # Remove cached_property values
+        for key in ["label_interface", "ner_tags", "image_tags"]:
+            state["__dict__"].pop(key, None)
+        return state
+
+    @model_validator(mode="after")
+    def validate_response_model(self):
+
+        logger.debug(f"Read labeling config {self.label_config}")
+
+        if self.allowed_control_tags or self.allowed_object_tags:
+            if self.allowed_control_tags:
+                control_tags = {
+                    tag: self.label_interface._controls[tag]
+                    for tag in self.allowed_control_tags
+                }
+            else:
+                control_tags = self.label_interface._controls
+            if self.allowed_object_tags:
+                object_tags = {
+                    tag: self.label_interface._objects[tag]
+                    for tag in self.allowed_object_tags
+                }
+            else:
+                object_tags = self.label_interface._objects
+            interface = LabelInterface.create_instance(
+                tags={**control_tags, **object_tags}
+            )
+            logger.debug(
+                f"Filtered labeling config based on allowed tags {self.allowed_control_tags=} and {self.allowed_object_tags=} to {interface.config}"
+            )
+        else:
+            interface = self.label_interface
+
+        # NOTE: the filtered label config is used for the response model, but the full label config is used for the prompt, so that the model has as much context as possible.
+        self.field_schema = interface.to_json_schema()
+        logger.debug(f"Converted labeling config to json schema: {self.field_schema}")
+
+        return self
+
+    def _create_response_model_from_field_schema(self):
+        pass
+
+    def apply(
+        self,
+        input: InternalDataFrame,
+        runtime: Runtime,
+    ) -> InternalDataFrame:
+
+        with json_schema_to_pydantic(self.field_schema) as ResponseModel:
+            return runtime.batch_to_batch(
+                input,
+                input_template=self.input_template,
+                output_template="",
+                instructions_template=self.instructions,
+                response_model=ResponseModel,
+            )
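
For context on `apply`: `json_schema_to_pydantic` yields a pydantic response model built from the labeling-config JSON schema. A rough standalone sketch of the same idea, using plain `pydantic.create_model` and a hypothetical single-field schema:

    from pydantic import create_model

    # simplified stand-in for the schema produced by interface.to_json_schema()
    ResponseModel = create_model("ResponseModel", sentiment=(str, ...))
    print(ResponseModel(sentiment="positive"))  # sentiment='positive'
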
+
+    @classmethod
+    async def process_images_with_ocr(cls, images: list) -> list:
+        """
+        Process a list of images with OCR by calling the OCR service.
+
+        Args:
+            images: List of image data (URLs or base64 strings)
+
+        Returns:
+            List of OCR results for each image
+        """
+
+        async def process_single_image(image_data):
+            # Check if the image is a URL
+            is_url = False
+            try:
+                parsed = urlparse(image_data)
+                is_url = all([parsed.scheme, parsed.netloc])
+            except Exception:
+                is_url = False
+
+            if not is_url:
+                logger.warning("Image data is not a URL. OCR service requires URLs or base64 data.")
+                return None
+
+            # Download the image and convert to base64
+            async with aiohttp.ClientSession() as session:
+                try:
+                    async with session.get(image_data) as response:
+                        if response.status == 200:
+                            image_bytes = await response.read()
+                            # Get image dimensions
+                            image = Image.open(io.BytesIO(image_bytes))
+                            width, height = image.size
+                            # Convert to base64
+                            base64_data = base64.b64encode(image_bytes).decode('utf-8')
+                        else:
+                            error_text = await response.text()
+                            logger.error(f"Failed to download image: {response.status}, {error_text}")
+                            return None
+                except Exception as e:
+                    logger.error(f"Error downloading image: {str(e)}")
+                    return None
+
+            # Call the OCR service with base64 data
+            ocr_url = "https://llm-ocr-server.appx.humansignal.com/ocr/base64"
+
+            # Prepare multipart form data for the OCR request
+            form_data = aiohttp.FormData()
+            form_data.add_field('image_data', base64_data)
+            form_data.add_field('confidence_threshold', str(0.3))
+            form_data.add_field('languages', 'en,ch_sim')
+
+            async with aiohttp.ClientSession() as session:
+                try:
+                    async with session.post(ocr_url, data=form_data) as response:
+                        if response.status == 200:
+                            json_response = await response.json()
+                            return {
+                                "ocr_data": json_response,
+                                "image_width": width,
+                                "image_height": height
+                            }
+                        else:
+                            error_text = await response.text()
+                            logger.error(f"OCR service returned error: {response.status}, {error_text}")
+                            return None
+                except Exception as e:
+                    logger.error(f"Error calling OCR service: {str(e)}")
+                    return None
+
+        # Process all images concurrently
+        tasks = [process_single_image(image) for image in images]
+        results = await asyncio.gather(*tasks)
+
+        return results
+
+    def _get_normalized_bbox(self, bbox: List[List[int]], original_width: int, original_height: int) -> Dict[str, float]:
+        # Calculate top-left corner (minimum x and y)
+        # bbox format is [[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
+        x_values = [point[0] for point in bbox]
+        y_values = [point[1] for point in bbox]
+
+        min_x = min(x_values)
+        min_y = min(y_values)
+
+        # Calculate width and height
+        max_x = max(x_values)
+        max_y = max(y_values)
+        width = max_x - min_x
+        height = max_y - min_y
+
+        # Convert to percentages
+        x_percent = (min_x / original_width) * 100
+        y_percent = (min_y / original_height) * 100
+        width_percent = (width / original_width) * 100
+        height_percent = (height / original_height) * 100
+
+        return {
+            'x': x_percent,
+            'y': y_percent,
+            'width': width_percent,
+            'height': height_percent
+        }
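
A worked example of the normalization above, assuming a 1000x500 px image:

    bbox = [[100, 50], [300, 50], [300, 100], [100, 100]]
    # min_x=100, min_y=50, width=200, height=50, so the result is
    # {'x': 10.0, 'y': 10.0, 'width': 20.0, 'height': 10.0}  (all percentages)
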
+
+    def _convert_ocr_results_to_label_studio_format(self, results: list) -> Tuple[List, List]:
+
+        # normalize EasyOCR results to the RectangleLabels bounding box format of Label Studio
+        all_bbox_annotations = []
+        all_text_annotations = []
+        for result in results:
+            if not result:
+                continue
+
+            # Extract OCR response data
+            bboxes = result.get('ocr_data', {}).get('bboxes', [])
+            texts = result.get('ocr_data', {}).get('texts', [])
+            scores = result.get('ocr_data', {}).get('scores', [])
+            original_width = result.get('image_width', 1000)
+            original_height = result.get('image_height', 1000)
+
+            # Convert to Label Studio format
+            bbox_annotations = []
+            text_annotations = []
+
+            for i, (bbox, text, score) in enumerate(zip(bboxes, texts, scores)):
+                # EasyOCR bbox format is [[x1,y1],[x2,y2],[x3,y3],[x4,y4]];
+                # convert to Label Studio format with x,y as the top-left corner,
+                # with x, y, width and height expressed as percentages
+
+                bbox_annotation = self._get_normalized_bbox(bbox, original_width, original_height)
+
+                # generate a unique id for the annotation
+                id_gen = str(uuid.uuid4())[:8]
+                # Create Label Studio format annotation
+                bbox_annotation['id'] = id_gen
+                bbox_annotation['rotation'] = 0
+
+                text_annotation = {
+                    'text': [text],
+                    'id': id_gen
+                }
+
+                bbox_annotations.append(bbox_annotation)
+                text_annotations.append(text_annotation)
+
+            # Replace the OCR result with the Label Studio formatted result
+            all_bbox_annotations.append(bbox_annotations)
+            all_text_annotations.append(text_annotations)
+
+        return all_bbox_annotations, all_text_annotations
+
+    def _convert_ocr_results_to_label_studio_format_v2(self, results: list) -> Tuple[List, List]:
+        """
+        Same as _convert_ocr_results_to_label_studio_format, but expects pre-grouped OCR results:
+        result['ocr_data'] is a dictionary mapping each reference text to a list of OCR matches.
+        For each reference text we create one group of `bbox_annotations` and `text_annotations`,
+        using the id of the group's first annotation as the parent_id of the remaining ones.
+        """
+        all_bbox_annotations = []
+        all_text_annotations = []
+
+        for result in results:
+            bbox_annotations = []
+            text_annotations = []
+
+            original_width = result['image_width']
+            original_height = result['image_height']
+
+            # Process each reference text and its associated OCR results
+            for reference_text, ocr_matches in result['ocr_data'].items():
+                # Create a group for this reference text
+                group_id = None
+
+                # Process each OCR match for this reference text
+                for bbox, text, score in zip(ocr_matches['bboxes'], ocr_matches['texts'], ocr_matches['scores']):
+
+                    bbox_annotation = self._get_normalized_bbox(bbox, original_width, original_height)
+
+                    # Generate a unique id for the annotation
+                    id_gen = str(uuid.uuid4())[:8]
+                    if group_id is None:
+                        group_id = id_gen
+
+                    # Create bbox annotation
+                    bbox_annotation['rotation'] = 0
+                    bbox_annotation['id'] = id_gen
+                    bbox_annotation['score'] = score
+                    # Create text annotation
+                    text_annotation = {
+                        'text': [text],
+                        'id': id_gen,
+                    }
+                    if group_id != id_gen:
+                        text_annotation['parent_id'] = group_id
+                        bbox_annotation['parent_id'] = group_id
+
+                    bbox_annotations.append(bbox_annotation)
+                    text_annotations.append(text_annotation)
+
+            # Add annotations for this result to the overall lists
+            all_bbox_annotations.append(bbox_annotations)
+            all_text_annotations.append(text_annotations)
+
+        return all_bbox_annotations, all_text_annotations
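
The grouped output of the v2 conversion looks roughly like this (ids and values are illustrative): the first region of a group owns the group id, and the remaining regions point back to it via parent_id:

    bbox_annotations = [
        {'x': 10.0, 'y': 12.5, 'width': 20.0, 'height': 4.0, 'rotation': 0, 'id': 'a1b2c3d4', 'score': 0.91},
        {'x': 31.0, 'y': 12.4, 'width': 8.0, 'height': 4.1, 'rotation': 0, 'id': 'e5f6a7b8', 'score': 0.88, 'parent_id': 'a1b2c3d4'},
    ]
    text_annotations = [
        {'text': ['Grand total'], 'id': 'a1b2c3d4'},
        {'text': ['total'], 'id': 'e5f6a7b8', 'parent_id': 'a1b2c3d4'},
    ]
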
+
+
+    @classmethod
+    def _calculate_similarity(cls, text: str, reference_texts: List[str]) -> Tuple[float, str]:
+        """
+        Calculate similarity between a text and substrings within reference texts.
+
+        Args:
+            text: The text to compare
+            reference_texts: List of reference texts
+
+        Returns:
+            Similarity score between 0 and 1, and the best matching reference text
+        """
+        # Convert to lowercase for case-insensitive comparison
+        text = text.lower()
+
+        best_score = 0
+        best_match = None
+
+        if reference_texts:
+            for ref_text in reference_texts:
+                ref_text_lower = ref_text.lower()
+                best_window_score = fuzz.partial_ratio(text, ref_text_lower)
+                if best_window_score > best_score:
+                    best_score = best_window_score
+                    best_match = ref_text
+            best_score = float(best_score) / 100
+        logger.debug(f"Best substring similarity between '{text}' and '{best_match}': {best_score}")
+
+        return best_score, best_match
+
+    def _filter_ocr_results(self, ocr_results: Dict, reference_texts: List[str]) -> Dict:
+        """
+        Filter OCR results based on their similarity to the reference texts.
+
+        Args:
+            ocr_results: Dict of OCR results with 'bboxes', 'texts' and 'scores' lists
+            reference_texts: List of reference texts
+
+        Returns:
+            Dict of filtered OCR results in the same format
+        """
+        filtered_results = {
+            'bboxes': [],
+            'texts': [],
+            'scores': []
+        }
+        for bbox, text, score in zip(ocr_results['bboxes'], ocr_results['texts'], ocr_results['scores']):
+            # Simple similarity function - can be replaced with more sophisticated methods
+            similarity, best_match = self._calculate_similarity(text, reference_texts)
+            if similarity >= 0.9:
+                filtered_results['bboxes'].append(bbox)
+                filtered_results['texts'].append(best_match)
+                filtered_results['scores'].append(score)
+
+        return filtered_results
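
How `fuzz.partial_ratio` behaves for the fuzzy-substring checks used here (scores are integers from 0 to 100, hence the `/ 100` above and the `>= 95` threshold below):

    from thefuzz import fuzz

    print(fuzz.partial_ratio("total", "grand total:"))  # 100 - exact substring
    print(fuzz.partial_ratio("tota1", "grand total:"))  # 80  - one character off
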
+
+    def _filter_ocr_results_v2(self, ocr_results: Dict, reference_texts: List[str]) -> Dict[str, List]:
+
+        output = {}
+        for ref_text in reference_texts:
+            ref_text_lower = ref_text.lower()
+            output[ref_text] = {
+                'bboxes': [],
+                'texts': [],
+                'scores': []
+            }
+            for text, score, bbox in zip(ocr_results['texts'], ocr_results['scores'], ocr_results['bboxes']):
+                text_lower = text.lower()
+                # check if text is a fuzzy substring of ref_text
+                similarity = fuzz.partial_ratio(text_lower, ref_text_lower)
+                if similarity >= 95:
+                    output[ref_text]['bboxes'].append(bbox)
+                    output[ref_text]['texts'].append(text)
+                    output[ref_text]['scores'].append(score)
+
+        # Filter to keep only horizontally aligned bounding boxes
+        for ref_text in output:
+            if not output[ref_text]['bboxes']:
+                continue
+
+            # Group bounding boxes by their vertical position (y-coordinate),
+            # using the middle y-coordinate of each box for grouping
+            y_groups = {}
+            for i, bbox in enumerate(output[ref_text]['bboxes']):
+                # Calculate the middle y-coordinate of the bounding box
+                # bbox format is [[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
+                y_values = [point[1] for point in bbox]
+                mid_y = sum(y_values) / len(y_values)
+
+                # Group with a tolerance of 10 pixels
+                group_key = int(mid_y / 10) * 10
+                if group_key not in y_groups:
+                    y_groups[group_key] = []
+                y_groups[group_key].append(i)
+
+            # Find the group with the maximum number of bounding boxes
+            max_group_key = max(y_groups.keys(), key=lambda k: len(y_groups[k]), default=None)
+
+            if max_group_key is not None:
+                # Keep only the bounding boxes in the largest horizontal group
+                indices_to_keep = y_groups[max_group_key]
+
+                # Create new filtered lists
+                filtered_bboxes = [output[ref_text]['bboxes'][i] for i in indices_to_keep]
+                filtered_texts = [output[ref_text]['texts'][i] for i in indices_to_keep]
+                filtered_scores = [output[ref_text]['scores'][i] for i in indices_to_keep]
+
+                # Sort bounding boxes by x-coordinate to maintain reading order
+                sorted_indices = sorted(range(len(filtered_bboxes)),
+                                        key=lambda i: min(point[0] for point in filtered_bboxes[i]))
+
+                filtered_bboxes = [filtered_bboxes[i] for i in sorted_indices]
+                filtered_texts = [filtered_texts[i] for i in sorted_indices]
+                filtered_scores = [filtered_scores[i] for i in sorted_indices]
+
+                # Create a combined bounding box that encompasses all individual boxes
+                if filtered_bboxes:
+                    # Find min and max coordinates across all bounding boxes
+                    all_x = [point[0] for bbox in filtered_bboxes for point in bbox]
+                    all_y = [point[1] for bbox in filtered_bboxes for point in bbox]
+
+                    min_x, max_x = min(all_x), max(all_x)
+                    min_y, max_y = min(all_y), max(all_y)
+
+                    # Create a new bounding box with the min/max coordinates
+                    combined_bbox = [
+                        [min_x, min_y],  # top-left
+                        [max_x, min_y],  # top-right
+                        [max_x, max_y],  # bottom-right
+                        [min_x, max_y]   # bottom-left
+                    ]
+
+                    # Calculate the average score
+                    avg_score = sum(filtered_scores) / len(filtered_scores) if filtered_scores else 0
+
+                    # Add the combined bounding box to the results
+                    filtered_bboxes.insert(0, combined_bbox)
+                    filtered_texts.insert(0, ref_text)  # Use the reference text for the combined box
+                    filtered_scores.insert(0, avg_score)
+
+                # Update the output with filtered results
+                output[ref_text]['bboxes'] = filtered_bboxes
+                output[ref_text]['texts'] = filtered_texts
+                output[ref_text]['scores'] = filtered_scores
+
+        return output
+
+    def _get_labels(self) -> List[str]:
+        # TODO: validate that the labels come from the Labels tag; use the control tag name
+        # format: {'StartDate': LabelTag(attr={'value': 'StartDate', 'background': 'red'}, tag='Label', value='StartDate', parent_name='columns'), 'EndDate': LabelTag(attr={'value': 'EndDate', 'background': 'green'}, tag='Label', value='EndDate', parent_name='columns'), 'Amount': LabelTag(attr={'value': 'Amount'}, tag='Label', value='Amount', parent_name='columns')}
+        return list(self.label_interface.labels)[0]
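
The 10-pixel y-bucketing in `_filter_ocr_results_v2`, shown in isolation: boxes whose mid-y falls into the same 10 px band share a key, so the words of one printed line usually land in one group. Note that boxes straddling a band edge can still split into neighboring buckets; a tolerance-based merge would be stricter.

    mid_ys = [52.0, 55.5, 58.9, 104.0]
    buckets = {}
    for i, mid_y in enumerate(mid_ys):
        buckets.setdefault(int(mid_y / 10) * 10, []).append(i)
    print(buckets)  # {50: [0, 1, 2], 100: [3]}
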
+
+
+    async def aapply(
+        self,
+        input: InternalDataFrame,
+        runtime: AsyncRuntime,
+    ) -> InternalDataFrame:
+
+        labels = self._get_labels()
+        # validate labels
+        from adala.utils.pydantic_generator import field_schema_to_pydantic_class
+        LineItem = field_schema_to_pydantic_class(
+            class_name="LineItem",
+            description="A single line extracted from the document",
+            field_schema={label: {"type": "string"} for label in labels}
+        )
+
+        class ResponseModel(BaseModel):
+            lines: List[LineItem]
+
+        input_field_types = defaultdict(lambda: MessageChunkType.TEXT)
+        image_value_key = None
+        for tag in self.image_tags:
+            # these are the project variable names, NOT the label config tag names. TODO: pass this info from LSE to avoid recomputing it here.
+            variables = extract_variable_name(tag.value)
+            if len(variables) != 1:
+                logger.warning(
+                    f"Image tag {tag.name} does not reference exactly one variable: {variables}. Cannot mark these variables as image inputs."
+                )
+                continue
+            image_value_key = variables[0]
+            input_field_types[image_value_key] = (
+                MessageChunkType.IMAGE_URLS
+                if tag.attr.get("valueList")
+                else MessageChunkType.IMAGE_URL
+            )
+
+        logger.debug(
+            f"Using VisionRuntime with input field types: {input_field_types}"
+        )
+        output = await runtime.batch_to_batch(
+            input,
+            input_template=self.input_template,
+            output_template="",
+            instructions_template=self.instructions,
+            response_model=ResponseModel,
+            input_field_types=input_field_types,
+        )
+        logger.debug(f'Output: {output}')
+
+        images = input[image_value_key].tolist()
+        all_bbox_annotations = []
+        all_text_annotations = []
+        all_label_annotations = []
+        for i, row in output.iterrows():
+            extracted_results = row['lines']
+
+            ocr_results = find_text_in_image(images[i], extracted_results)
+            bbox_annotations = []
+            text_annotations = []
+            label_annotations = []
+            for ocr_result in ocr_results:
+                # Add bbox annotation
+                bbox_id = ocr_result['element']['id']
+                parent_id = ocr_result['element'].get('parent_id')
+
+                bbox_annotation = ocr_result['element']
+                bbox_annotation['score'] = ocr_result['matching_score'] * ocr_result['element']['score']
+
+                bbox_annotations.append(bbox_annotation)
+
+                # Add text annotation
+                text_annotation = {
+                    'text': [ocr_result['reference_text']],
+                    'id': bbox_id
+                }
+                if parent_id:
+                    text_annotation['parent_id'] = parent_id
+                text_annotations.append(text_annotation)
+
+                # Add label annotation
+                label = ocr_result.pop('reference_label', None)
+                if label:
+                    label_annotation = {
+                        'labels': [label],
+                        'id': bbox_id
+                    }
+                    if parent_id:
+                        label_annotation['parent_id'] = parent_id
+                    label_annotations.append(label_annotation)
+
+            all_bbox_annotations.append(bbox_annotations)
+            all_text_annotations.append(text_annotations)
+            all_label_annotations.append(label_annotations)
+        output['bbox'] = all_bbox_annotations
+        output['transcription'] = all_text_annotations
+        output['columns'] = all_label_annotations
+        return output
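
What the dynamically built response model amounts to, sketched with plain pydantic (`field_schema_to_pydantic_class` is the adala helper used above; the label names are the illustrative ones from the `_get_labels` comment):

    from typing import List
    from pydantic import BaseModel, create_model

    LineItem = create_model(
        "LineItem",
        StartDate=(str, ...),
        EndDate=(str, ...),
        Amount=(str, ...),
    )

    class ResponseModel(BaseModel):
        lines: List[LineItem]
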
+
+
+    # with json_schema_to_pydantic(self.field_schema) as ResponseModel:
+    #     # special handling to flag image inputs if they exist
+    #     input_field_types = defaultdict(lambda: MessageChunkType.TEXT)
+    #     image_value_key = None
+    #     for tag in self.image_tags:
+    #         # these are the project variable names, NOT the label config tag names. TODO: pass this info from LSE to avoid recomputing it here.
+    #         variables = extract_variable_name(tag.value)
+    #         if len(variables) != 1:
+    #             logger.warning(
+    #                 f"Image tag {tag.name} has multiple variables: {variables}. Cannot mark these variables as image inputs."
+    #             )
+    #             continue
+    #         image_value_key = variables[0]
+    #         input_field_types[image_value_key] = (
+    #             MessageChunkType.IMAGE_URLS
+    #             if tag.attr.get("valueList")
+    #             else MessageChunkType.IMAGE_URL
+    #         )
+
+    #     logger.debug(
+    #         f"Using VisionRuntime with input field types: {input_field_types}"
+    #     )
+    #     output = await runtime.batch_to_batch(
+    #         input,
+    #         input_template=self.input_template,
+    #         output_template="",
+    #         instructions_template=self.instructions,
+    #         response_model=ResponseModel,
+    #         input_field_types=input_field_types,
+    #     )
+    #     print(f'Output: {output}')
+    #     print(f'Process images with OCR: {input[image_value_key].tolist()}')
+    #     # ocr_results = await self.process_images_with_ocr(input[image_value_key].tolist())
+    #     # filtered_ocr_results = []
+    #     images = input[image_value_key].tolist()
+    #     all_bbox_annotations = []
+    #     all_text_annotations = []
+    #     for i, row in output.iterrows():
+    #         extracted_result = row['output']
+
+    #         ocr_results = find_text_in_image(images[i], extracted_result)
+    #         bbox_annotations = []
+    #         text_annotations = []
+    #         for ocr_result in ocr_results:
+    #             # Convert OCR results to Label Studio format
+    #             parent_id = ocr_result['bbox']['id']
+
+    #             for word in ocr_result['words']:
+    #                 bbox_annotation = word['bbox']
+    #                 bbox_annotation['rotation'] = 0
+    #                 bbox_annotation['parent_id'] = parent_id
+    #                 bbox_annotation['score'] = word['score']
+
+    #                 text_annotation = {
+    #                     'text': [word['text']],
+    #                     'id': bbox_annotation['id'],
+    #                     'parent_id': parent_id
+    #                 }
+
+    #                 bbox_annotations.append(bbox_annotation)
+    #                 text_annotations.append(text_annotation)
+
+    #             bbox_annotation = ocr_result['bbox']
+    #             bbox_annotation['rotation'] = 0
+    #             bbox_annotation['score'] = float(np.sqrt(ocr_result['detection_score'] * ocr_result['matching_score']))
+
+    #             text_annotation = {
+    #                 'text': [ocr_result['reference_text']],
+    #                 'id': parent_id
+    #             }
+
+    #             bbox_annotations.append(bbox_annotation)
+    #             text_annotations.append(text_annotation)
+
+    #         all_bbox_annotations.append(bbox_annotations)
+    #         all_text_annotations.append(text_annotations)
+    #     output['bbox'] = all_bbox_annotations
+    #     output['transcription'] = all_text_annotations
+    #     return output
diff --git a/poetry.lock b/poetry.lock
index 3467cbcb..85b81803 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3696,7 +3696,7 @@ optional = false
 python-versions = ">=3.9,<4"
 groups = ["main"]
 files = [
-    {file = "388257eeea2ed95ae769724bb8b6ad9c41511b96.zip", hash = "sha256:c86f91aebd989d762663aa49f4fff5dc128f168f70e4540b9d899e456a006f01"},
+    {file = "2b0d2cc9403c83cd99dc3e6cc06a580d9e9db4f1.zip", hash = "sha256:118574bddf431edf96d564f55173f6e0f104d233c1ebc3dac8ca68decd4fc6f4"},
 ]

 [package.dependencies]
@@ -3722,7 +3722,7 @@ xmljson = "0.2.1"

 [package.source]
 type = "url"
-url = "https://github.com/HumanSignal/label-studio-sdk/archive/388257eeea2ed95ae769724bb8b6ad9c41511b96.zip"
+url = "https://github.com/HumanSignal/label-studio-sdk/archive/2b0d2cc9403c83cd99dc3e6cc06a580d9e9db4f1.zip"

 [[package]]
 name = "litellm"
@@ -6063,6 +6063,22 @@ files = [
 [package.extras]
 dev = ["build", "flake8", "mypy", "pytest", "twine"]

+[[package]]
+name = "pytesseract"
+version = "0.3.13"
+description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "pytesseract-0.3.13-py3-none-any.whl", hash = "sha256:7a99c6c2ac598360693d83a416e36e0b33a67638bb9d77fdcac094a3589d4b34"},
+    {file = "pytesseract-0.3.13.tar.gz", hash = "sha256:4bf5f880c99406f52a3cfc2633e42d9dc67615e69d8a509d74867d3baddb5db9"},
+]
+
+[package.dependencies]
+packaging = ">=21.3"
+Pillow = ">=8.0.0"
+
 [[package]]
 name = "pytest"
 version = "7.4.4"
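
Minimal usage of the pytesseract dependency added here, against the Tesseract binaries and language packs installed in Dockerfile.app (a sketch; the file path is an example):

    from PIL import Image
    import pytesseract

    img = Image.open("receipt.png")
    print(pytesseract.image_to_string(img, lang="eng+chi_sim"))
    # word-level boxes, if needed:
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
    # data["text"], data["left"], data["top"], data["width"], data["height"]
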
"sha256:4bf5f880c99406f52a3cfc2633e42d9dc67615e69d8a509d74867d3baddb5db9"}, +] + +[package.dependencies] +packaging = ">=21.3" +Pillow = ">=8.0.0" + [[package]] name = "pytest" version = "7.4.4" @@ -6524,6 +6540,113 @@ files = [ [package.dependencies] cffi = {version = "*", markers = "implementation_name == \"pypy\""} +[[package]] +name = "rapidfuzz" +version = "3.12.2" +description = "rapid fuzzy string matching" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "rapidfuzz-3.12.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0b9a75e0385a861178adf59e86d6616cbd0d5adca7228dc9eeabf6f62cf5b0b1"}, + {file = "rapidfuzz-3.12.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6906a7eb458731e3dd2495af1d0410e23a21a2a2b7ced535e6d5cd15cb69afc5"}, + {file = "rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4b3334a8958b689f292d5ce8a928140ac98919b51e084f04bf0c14276e4c6ba"}, + {file = "rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:85a54ce30345cff2c79cbcffa063f270ad1daedd0d0c3ff6e541d3c3ba4288cf"}, + {file = "rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:acb63c5072c08058f8995404201a52fc4e1ecac105548a4d03c6c6934bda45a3"}, + {file = "rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5385398d390c6571f0f2a7837e6ddde0c8b912dac096dc8c87208ce9aaaa7570"}, + {file = "rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5032cbffa245b4beba0067f8ed17392ef2501b346ae3c1f1d14b950edf4b6115"}, + {file = "rapidfuzz-3.12.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:195adbb384d89d6c55e2fd71e7fb262010f3196e459aa2f3f45f31dd7185fe72"}, + {file = "rapidfuzz-3.12.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:f43b773a4d4950606fb25568ecde5f25280daf8f97b87eb323e16ecd8177b328"}, + {file = "rapidfuzz-3.12.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:55a43be0e0fa956a919043c19d19bd988991d15c59f179d413fe5145ed9deb43"}, + {file = "rapidfuzz-3.12.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:71cf1ea16acdebe9e2fb62ee7a77f8f70e877bebcbb33b34e660af2eb6d341d9"}, + {file = "rapidfuzz-3.12.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a3692d4ab36d44685f61326dca539975a4eda49b2a76f0a3df177d8a2c0de9d2"}, + {file = "rapidfuzz-3.12.2-cp310-cp310-win32.whl", hash = "sha256:09227bd402caa4397ba1d6e239deea635703b042dd266a4092548661fb22b9c6"}, + {file = "rapidfuzz-3.12.2-cp310-cp310-win_amd64.whl", hash = "sha256:0f05b7b95f9f87254b53fa92048367a8232c26cee7fc8665e4337268c3919def"}, + {file = "rapidfuzz-3.12.2-cp310-cp310-win_arm64.whl", hash = "sha256:6938738e00d9eb6e04097b3f565097e20b0c398f9c58959a2bc64f7f6be3d9da"}, + {file = "rapidfuzz-3.12.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e9c4d984621ae17404c58f8d06ed8b025e167e52c0e6a511dfec83c37e9220cd"}, + {file = "rapidfuzz-3.12.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9f9132c55d330f0a1d34ce6730a76805323a6250d97468a1ca766a883d6a9a25"}, + {file = "rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39b343b6cb4b2c3dbc8d2d4c5ee915b6088e3b144ddf8305a57eaab16cf9fc74"}, + {file = "rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:24081077b571ec4ee6d5d7ea0e49bc6830bf05b50c1005028523b9cd356209f3"}, + {file = "rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:c988a4fc91856260355773bf9d32bebab2083d4c6df33fafeddf4330e5ae9139"}, + {file = "rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:780b4469ee21cf62b1b2e8ada042941fd2525e45d5fb6a6901a9798a0e41153c"}, + {file = "rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:edd84b0a323885493c893bad16098c5e3b3005d7caa995ae653da07373665d97"}, + {file = "rapidfuzz-3.12.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efa22059c765b3d8778083805b199deaaf643db070f65426f87d274565ddf36a"}, + {file = "rapidfuzz-3.12.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:095776b11bb45daf7c2973dd61cc472d7ea7f2eecfa454aef940b4675659b92f"}, + {file = "rapidfuzz-3.12.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:7e2574cf4aa86065600b664a1ac7b8b8499107d102ecde836aaaa403fc4f1784"}, + {file = "rapidfuzz-3.12.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:d5a3425a6c50fd8fbd991d8f085ddb504791dae6ef9cc3ab299fea2cb5374bef"}, + {file = "rapidfuzz-3.12.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:97fb05e1ddb7b71a054040af588b0634214ee87cea87900d309fafc16fd272a4"}, + {file = "rapidfuzz-3.12.2-cp311-cp311-win32.whl", hash = "sha256:b4c5a0413589aef936892fbfa94b7ff6f7dd09edf19b5a7b83896cc9d4e8c184"}, + {file = "rapidfuzz-3.12.2-cp311-cp311-win_amd64.whl", hash = "sha256:58d9ae5cf9246d102db2a2558b67fe7e73c533e5d769099747921232d88b9be2"}, + {file = "rapidfuzz-3.12.2-cp311-cp311-win_arm64.whl", hash = "sha256:7635fe34246cd241c8e35eb83084e978b01b83d5ef7e5bf72a704c637f270017"}, + {file = "rapidfuzz-3.12.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1d982a651253ffe8434d9934ff0c1089111d60502228464721a2a4587435e159"}, + {file = "rapidfuzz-3.12.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:02e6466caa0222d5233b1f05640873671cd99549a5c5ba4c29151634a1e56080"}, + {file = "rapidfuzz-3.12.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e956b3f053e474abae69ac693a52742109d860ac2375fe88e9387d3277f4c96c"}, + {file = "rapidfuzz-3.12.2-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2dee7d740a2d5418d4f964f39ab8d89923e6b945850db833e798a1969b19542a"}, + {file = "rapidfuzz-3.12.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a057cdb0401e42c84b6516c9b1635f7aedd5e430c6e388bd5f6bcd1d6a0686bb"}, + {file = "rapidfuzz-3.12.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dccf8d4fb5b86d39c581a59463c596b1d09df976da26ff04ae219604223d502f"}, + {file = "rapidfuzz-3.12.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21d5b3793c6f5aecca595cd24164bf9d3c559e315ec684f912146fc4e769e367"}, + {file = "rapidfuzz-3.12.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:46a616c0e13cff2de1761b011e0b14bb73b110182f009223f1453d505c9a975c"}, + {file = "rapidfuzz-3.12.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19fa5bc4301a1ee55400d4a38a8ecf9522b0391fc31e6da5f4d68513fe5c0026"}, + {file = "rapidfuzz-3.12.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:544a47190a0d25971658a9365dba7095397b4ce3e897f7dd0a77ca2cf6fa984e"}, + {file = "rapidfuzz-3.12.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:f21af27c5e001f0ba1b88c36a0936437dfe034c452548d998891c21125eb640f"}, + {file = "rapidfuzz-3.12.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b63170d9db00629b5b3f2862114d8d6ee19127eaba0eee43762d62a25817dbe0"}, + {file = "rapidfuzz-3.12.2-cp312-cp312-win32.whl", hash = 
"sha256:6c7152d77b2eb6bfac7baa11f2a9c45fd5a2d848dbb310acd0953b3b789d95c9"}, + {file = "rapidfuzz-3.12.2-cp312-cp312-win_amd64.whl", hash = "sha256:1a314d170ee272ac87579f25a6cf8d16a031e1f7a7b07663434b41a1473bc501"}, + {file = "rapidfuzz-3.12.2-cp312-cp312-win_arm64.whl", hash = "sha256:d41e8231326e94fd07c4d8f424f6bed08fead6f5e6688d1e6e787f1443ae7631"}, + {file = "rapidfuzz-3.12.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:941f31038dba5d3dedcfcceba81d61570ad457c873a24ceb13f4f44fcb574260"}, + {file = "rapidfuzz-3.12.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fe2dfc454ee51ba168a67b1e92b72aad251e45a074972cef13340bbad2fd9438"}, + {file = "rapidfuzz-3.12.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78fafaf7f5a48ee35ccd7928339080a0136e27cf97396de45259eca1d331b714"}, + {file = "rapidfuzz-3.12.2-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e0c7989ff32c077bb8fd53253fd6ca569d1bfebc80b17557e60750e6909ba4fe"}, + {file = "rapidfuzz-3.12.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:96fa00bc105caa34b6cd93dca14a29243a3a7f0c336e4dcd36348d38511e15ac"}, + {file = "rapidfuzz-3.12.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bccfb30c668620c5bc3490f2dc7d7da1cca0ead5a9da8b755e2e02e2ef0dff14"}, + {file = "rapidfuzz-3.12.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f9b0adc3d894beb51f5022f64717b6114a6fabaca83d77e93ac7675911c8cc5"}, + {file = "rapidfuzz-3.12.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:32691aa59577f42864d5535cb6225d0f47e2c7bff59cf4556e5171e96af68cc1"}, + {file = "rapidfuzz-3.12.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:758b10380ad34c1f51753a070d7bb278001b5e6fcf544121c6df93170952d705"}, + {file = "rapidfuzz-3.12.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:50a9c54c0147b468363119132d514c5024fbad1ed8af12bd8bd411b0119f9208"}, + {file = "rapidfuzz-3.12.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e3ceb87c11d2d0fbe8559bb795b0c0604b84cfc8bb7b8720b5c16e9e31e00f41"}, + {file = "rapidfuzz-3.12.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f7c9a003002434889255ff5676ca0f8934a478065ab5e702f75dc42639505bba"}, + {file = "rapidfuzz-3.12.2-cp313-cp313-win32.whl", hash = "sha256:cf165a76870cd875567941cf861dfd361a0a6e6a56b936c5d30042ddc9def090"}, + {file = "rapidfuzz-3.12.2-cp313-cp313-win_amd64.whl", hash = "sha256:55bcc003541f5f16ec0a73bf6de758161973f9e8d75161954380738dd147f9f2"}, + {file = "rapidfuzz-3.12.2-cp313-cp313-win_arm64.whl", hash = "sha256:69f6ecdf1452139f2b947d0c169a605de578efdb72cbb2373cb0a94edca1fd34"}, + {file = "rapidfuzz-3.12.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c4c852cd8bed1516a64fd6e2d4c6f270d4356196ee03fda2af1e5a9e13c34643"}, + {file = "rapidfuzz-3.12.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:42e7f747b55529a6d0d1588695d71025e884ab48664dca54b840413dea4588d8"}, + {file = "rapidfuzz-3.12.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a749fd2690f24ef256b264a781487746bbb95344364fe8fe356f0eef7ef206ba"}, + {file = "rapidfuzz-3.12.2-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9a11e1d036170bbafa43a9e63d8c309273564ec5bdfc5439062f439d1a16965a"}, + {file = "rapidfuzz-3.12.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dfb337f1832c1231e3d5621bd0ebebb854e46036aedae3e6a49c1fc08f16f249"}, + {file = 
"rapidfuzz-3.12.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e88c6e68fca301722fa3ab7fd3ca46998012c14ada577bc1e2c2fc04f2067ca6"}, + {file = "rapidfuzz-3.12.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17e1a3a8b4b5125cfb63a6990459b25b87ea769bdaf90d05bb143f8febef076a"}, + {file = "rapidfuzz-3.12.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:b9f8177b24ccc0a843e85932b1088c5e467a7dd7a181c13f84c684b796bea815"}, + {file = "rapidfuzz-3.12.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6c506bdc2f304051592c0d3b0e82eed309248ec10cdf802f13220251358375ea"}, + {file = "rapidfuzz-3.12.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:30bf15c1ecec2798b713d551df17f23401a3e3653ad9ed4e83ad1c2b06e86100"}, + {file = "rapidfuzz-3.12.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:bd9a67cfc83e8453ef17ddd1c2c4ce4a74d448a197764efb54c29f29fb41f611"}, + {file = "rapidfuzz-3.12.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7a6eaec2ef658dd650c6eb9b36dff7a361ebd7d8bea990ce9d639b911673b2cb"}, + {file = "rapidfuzz-3.12.2-cp39-cp39-win32.whl", hash = "sha256:d7701769f110332cde45c41759cb2a497de8d2dca55e4c519a46aed5fbb19d1a"}, + {file = "rapidfuzz-3.12.2-cp39-cp39-win_amd64.whl", hash = "sha256:296bf0fd4f678488670e262c87a3e4f91900b942d73ae38caa42a417e53643b1"}, + {file = "rapidfuzz-3.12.2-cp39-cp39-win_arm64.whl", hash = "sha256:7957f5d768de14f6b2715303ccdf224b78416738ee95a028a2965c95f73afbfb"}, + {file = "rapidfuzz-3.12.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:e5fd3ce849b27d063755829cda27a9dab6dbd63be3801f2a40c60ec563a4c90f"}, + {file = "rapidfuzz-3.12.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:54e53662d71ed660c83c5109127c8e30b9e607884b7c45d2aff7929bbbd00589"}, + {file = "rapidfuzz-3.12.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b9e43cf2213e524f3309d329f1ad8dbf658db004ed44f6ae1cd2919aa997da5"}, + {file = "rapidfuzz-3.12.2-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:29ca445e320e5a8df3bd1d75b4fa4ecfa7c681942b9ac65b55168070a1a1960e"}, + {file = "rapidfuzz-3.12.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83eb7ef732c2f8533c6b5fbe69858a722c218acc3e1fc190ab6924a8af7e7e0e"}, + {file = "rapidfuzz-3.12.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:648adc2dd2cf873efc23befcc6e75754e204a409dfa77efd0fea30d08f22ef9d"}, + {file = "rapidfuzz-3.12.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:9b1e6f48e1ffa0749261ee23a1c6462bdd0be5eac83093f4711de17a42ae78ad"}, + {file = "rapidfuzz-3.12.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:1ae9ded463f2ca4ba1eb762913c5f14c23d2e120739a62b7f4cc102eab32dc90"}, + {file = "rapidfuzz-3.12.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dda45f47b559be72ecbce45c7f71dc7c97b9772630ab0f3286d97d2c3025ab71"}, + {file = "rapidfuzz-3.12.2-pp311-pypy311_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3745c6443890265513a3c8777f2de4cb897aeb906a406f97741019be8ad5bcc"}, + {file = "rapidfuzz-3.12.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36d3ef4f047ed1bc96fa29289f9e67a637ddca5e4f4d3dc7cb7f50eb33ec1664"}, + {file = "rapidfuzz-3.12.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:54bb69ebe5ca0bd7527357e348f16a4c0c52fe0c2fcc8a041010467dcb8385f7"}, + {file = "rapidfuzz-3.12.2-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = 
"sha256:3f2ddd5b99b254039a8c82be5749d4d75943f62eb2c2918acf6ffd586852834f"}, + {file = "rapidfuzz-3.12.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:8117dab9b26a1aaffab59b4e30f80ac4d55e61ad4139a637c149365960933bee"}, + {file = "rapidfuzz-3.12.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40c0f16d62d6553527de3dab2fb69709c4383430ea44bce8fb4711ed4cbc6ae3"}, + {file = "rapidfuzz-3.12.2-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f177e1eb6e4f5261a89c475e21bce7a99064a8f217d2336fb897408f46f0ceaf"}, + {file = "rapidfuzz-3.12.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5df0cecc2852fcb078ed1b4482fac4fc2c2e7787f3edda8920d9a4c0f51b1c95"}, + {file = "rapidfuzz-3.12.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3b3c4df0321df6f8f0b61afbaa2ced9622750ee1e619128db57a18533d139820"}, + {file = "rapidfuzz-3.12.2.tar.gz", hash = "sha256:b0ba1ccc22fff782e7152a3d3d0caca44ec4e32dc48ba01c560b8593965b5aa3"}, +] + +[package.extras] +all = ["numpy"] + [[package]] name = "redis" version = "5.0.8" @@ -7195,6 +7318,21 @@ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["pre-commit", "pytest (>=7.0)", "pytest-timeout"] typing = ["mypy (>=1.6,<2.0)", "traitlets (>=5.11.1)"] +[[package]] +name = "thefuzz" +version = "0.22.1" +description = "Fuzzy string matching in python" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "thefuzz-0.22.1-py3-none-any.whl", hash = "sha256:59729b33556850b90e1093c4cf9e618af6f2e4c985df193fdf3c5b5cf02ca481"}, + {file = "thefuzz-0.22.1.tar.gz", hash = "sha256:7138039a7ecf540da323792d8592ef9902b1d79eb78c147d4f20664de79f3680"}, +] + +[package.dependencies] +rapidfuzz = ">=3.0.0,<4.0.0" + [[package]] name = "tiktoken" version = "0.7.0" @@ -8467,4 +8605,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.9,<3.9.7 || >3.9.7,<3.13" -content-hash = "8930e9c676607922cdb1bfece4b688db95871e2d5b80c8bde2518d3311ad2675" +content-hash = "167ce58f336fb164546c14a06fe12ae870266918f4290ab8b41d31e4b72d8307" diff --git a/pyproject.toml b/pyproject.toml index 42a677de..d230b4a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ celery = {version = "^5.3.6", extras = ["redis"]} kombu = ">=5.4.0rc2" # Pin version to fix https://github.com/celery/celery/issues/8030. TODO: remove when this fix will be included in celery uvicorn = "*" pydantic-settings = "^2.2.1" -label-studio-sdk = {url = "https://github.com/HumanSignal/label-studio-sdk/archive/388257eeea2ed95ae769724bb8b6ad9c41511b96.zip"} +label-studio-sdk = {url = "https://github.com/HumanSignal/label-studio-sdk/archive/2b0d2cc9403c83cd99dc3e6cc06a580d9e9db4f1.zip"} kafka-python-ng = "^2.2.3" requests = "^2.32.0" # Using litellm from forked repo until vertex fix is released: https://github.com/BerriAI/litellm/issues/7904 @@ -50,6 +50,8 @@ requests = "^2.32.0" litellm = {url = "https://github.com/HumanSignal/litellm/archive/c0506d5844ef20d0db14144fbcbf99c05637bde3.zip"} pandarallel = "^1.6.5" instructor = "^1.4.3" +thefuzz = "0.22.1" +pytesseract = "^0.3.13" [tool.poetry.group.dev.dependencies] pytest = "^7.4.3"