This repository was archived by the owner on Aug 1, 2024. It is now read-only.

Add check for pandas NaN metadata values #6

Open · wants to merge 16 commits into main
1 change: 0 additions & 1 deletion labelbase/annotate.py
@@ -365,7 +365,6 @@ def flatten_label(client:labelboxClient, label_dict:dict, ontology_index:dict, d
if column_name not in flat_label.keys():
flat_label[column_name] = []
if "bounding_box" in obj.keys():
- print(obj)
annotation_value = [obj["bounding_box"]["top"], obj["bounding_box"]["left"], obj["bounding_box"]["height"], obj["bounding_box"]["width"]]
if "page_number" in obj.keys():
annotation_value.append(obj["page_number"])
2 changes: 2 additions & 0 deletions labelbase/connector.py
@@ -187,6 +187,8 @@ def determine_actions(
attachments_action = True if attachment_index and not create_action else False
# Determine if we're batching data rows
batch_action = False if (project_id == project_id_col == "") else True
+ print(project_id)
+ print(project_id_col)
# Determine the upload_method if we're batching to projects
annotate_action = upload_method if (upload_method in ["mal", "import", "ground-truth"]) and annotation_index and batch_action else ""
# "ground-truth" defaults to "import" if no model informtion is given
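For context on the batching flag touched in this hunk, here is a minimal sketch of how the chained comparison `project_id == project_id_col == ""` behaves (illustrative helper and placeholder values only, not part of the library):

```python
# Illustrative only: the chained comparison is True only when BOTH values
# are empty strings, so batching kicks in as soon as either an explicit
# project ID or a per-row project ID column is supplied.
def is_batching(project_id: str, project_id_col: str) -> bool:
    return not (project_id == project_id_col == "")

print(is_batching("", ""))            # False - nothing to batch to
print(is_batching("proj_123", ""))    # True  - explicit project ID (placeholder)
print(is_batching("", "project_id"))  # True  - per-row project ID column (placeholder)
```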
Binary file not shown.
Binary file not shown.
7 changes: 6 additions & 1 deletion labelbase/downloader.py
@@ -5,7 +5,7 @@
from labelbase.annotate import flatten_label

def export_and_flatten_labels(client:labelboxClient, project, include_metadata:bool=True, include_performance:bool=True,
- include_agreement:bool=False, verbose:bool=False, mask_method:str="png", divider="///", export_filters:dict=None):
+ include_agreement:bool=False, include_label_details:bool=False, verbose:bool=False, mask_method:str="png", divider="///", export_filters:dict=None):
""" Exports and flattens labels from a Labelbox Project
Args:
client: : Required (labelbox.Client) - Labelbox Client object
@@ -82,6 +82,7 @@ def export_and_flatten_labels(client:labelboxClient, project, include_metadata:b
flat_label["seconds_to_create"] = nested_label['performance_details']['seconds_to_create']
flat_label["seconds_to_review"] = nested_label['performance_details']['seconds_to_review']
flat_label["seconds_to_label"] = nested_label['performance_details']['seconds_to_create'] - nested_label['performance_details']['seconds_to_review']
+ if include_metadata:
for metadata in label['metadata_fields']:
try:
if metadata['value'] in metadata_schema_to_name_key.keys():
@@ -115,6 +116,10 @@ def export_and_flatten_labels(client:labelboxClient, project, include_metadata:b
metadata_value = metadata['value']
if field_name != "lb_integration_source":
flat_label[f'metadata{divider}{metadata_type}{divider}{field_name}'] = metadata_value
+ if include_label_details:
+ flat_label["created_by"] = nested_label['label_details']["created_by"]
+ flat_label["updated_at"] = nested_label['label_details']["updated_at"]
+ flat_label["created_at"] = nested_label['label_details']["created_at"]
flattened_labels.append(flat_label)
if verbose:
print(f"Labels flattened")
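A minimal usage sketch of the extended signature, assuming a configured labelbox.Client, a placeholder API key and project ID, and that the function returns the list of flattened label dictionaries built in the loop above:

```python
from labelbox import Client
from labelbase.downloader import export_and_flatten_labels

client = Client(api_key="YOUR_API_KEY")          # placeholder credentials
project = client.get_project("YOUR_PROJECT_ID")  # placeholder project ID

flat_labels = export_and_flatten_labels(
    client=client,
    project=project,
    include_metadata=True,
    include_performance=True,
    include_label_details=True,  # new flag introduced in this PR
    mask_method="png",
    divider="///",
)

# With include_label_details=True each flattened label should also carry
# created_by / created_at / updated_at, per the hunk above.
for label in flat_labels:
    print(label.get("created_by"), label.get("updated_at"))
```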
3 changes: 3 additions & 0 deletions labelbase/metadata.py
@@ -3,6 +3,7 @@
from datetime import datetime
from dateutil import parser
import pytz
+ import pandas

def get_metadata_schema_to_type(client:labelboxClient, lb_mdo=False, invert:bool=False):
""" Creates a dictionary where {key=metadata_schema_id: value=metadata_type}
@@ -121,6 +122,8 @@ def process_metadata_value(metadata_value, metadata_type:str, parent_name:str, m
return_value = None
if str(metadata_value) == "nan": # Catch NaN values
return_value = None
+ if pandas.isna(metadata_value): # Catch pandas df NaN values
+ return_value = None
# By metadata type
if metadata_type == "enum": # For enums, it must be a schema ID - if we can't match it, we have to skip it
name_key = f"{parent_name}{divider}{str(metadata_value)}"
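A small illustration of why the added pandas.isna check matters: values pulled out of a pandas DataFrame can be missing in several flavors (float NaN, None, pandas.NaT, pandas.NA), and only the float flavor stringifies to "nan", so the original str(...) == "nan" guard misses the rest:

```python
import numpy as np
import pandas as pd

candidates = [float("nan"), np.nan, None, pd.NaT, pd.NA, "some value"]

for value in candidates:
    caught_by_str = str(value) == "nan"     # original check
    caught_by_isna = bool(pd.isna(value))   # check added in this PR
    print(f"{value!r:>12}  str=='nan': {caught_by_str}  pd.isna: {caught_by_isna}")

# str(None) == "None", str(pd.NaT) == "NaT", str(pd.NA) == "<NA>",
# so only pd.isna flags every missing-value flavor.
```

One caveat: pandas.isna applied to a list-like value returns an element-wise array rather than a single bool, so list-valued metadata would need separate handling.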
60 changes: 38 additions & 22 deletions labelbase/uploader.py
@@ -2,6 +2,7 @@
from labelbox import Dataset as labelboxDataset
from labelbox import Project as labelboxProject
import uuid
+ from concurrent.futures import ThreadPoolExecutor, as_completed

def create_global_key_to_label_id_dict(client:labelboxClient, project_id:str, global_keys:list):
""" Creates a dictionary where { key=global_key : value=label_id } by exporting labels from a project
@@ -37,7 +38,7 @@ def create_global_key_to_data_row_id_dict(client:labelboxClient, global_keys:lis
global_key_to_data_row_dict[gks[i]] = res['results'][i]
return global_key_to_data_row_dict

- def check_global_keys(client:labelboxClient, global_keys:list, batch_size=1000):
+ def check_global_keys(client:labelboxClient, global_keys:list):
""" Checks if data rows exist for a set of global keys - if data rows exist, returns as dictionary { key=data_row_id : value=global_key }
Args:
client : Required (labelbox.client.Client) - Labelbox Client object
@@ -52,20 +53,18 @@ def check_global_keys(client:labelboxClient, global_keys:list, batch_size=1000):
# Enforce global keys as strings
global_keys_list = [str(x) for x in global_keys]
# Batch global key checks
- for i in range(0, len(global_keys_list), batch_size):
- batch_gks = global_keys_list[i:] if i + batch_size >= len(global_keys_list) else global_keys_list[i:i+batch_size]
- # Get the datarow ids
- res = client.get_data_row_ids_for_global_keys(batch_gks)
- # Check query job results for fetched data rows
- for i in range(0, len(res["results"])):
- data_row_id = res["results"][i]
- if data_row_id:
- existing_drid_to_gk[data_row_id] = batch_gks[i]
+ # Get the datarow ids
+ res = client.get_data_row_ids_for_global_keys(global_keys_list)
+ # Check query job results for fetched data rows
+ for i in range(0, len(res["results"])):
+ data_row_id = res["results"][i]
+ if data_row_id:
+ existing_drid_to_gk[data_row_id] = global_keys_list[i]
return existing_drid_to_gk

def batch_create_data_rows(
client:labelboxClient, upload_dict:dict, skip_duplicates:bool=True,
divider:str="___", batch_size:int=20000, verbose:bool=False):
divider:str="___", batch_size:int=100000, verbose:bool=False):
""" Uploads data rows, skipping duplicate global keys or auto-generating new unique ones.

upload_dict must be in the following format:
@@ -94,7 +93,7 @@ def batch_create_data_rows(

"""
# Default error message
e = "Success"
e = {}
# Vet all global keys
global_keys = list(upload_dict.keys()) # Get all global keys
if verbose:
@@ -103,11 +102,14 @@
gks = global_keys[i:] if i + batch_size >= len(global_keys) else global_keys[i:i+batch_size] # Batch of global keys to vet
existing_data_row_to_global_key = check_global_keys(client, gks) # Returns empty list if there are no duplicates
loop_counter = 0
+ if skip_duplicates:
+ e['skipped_global_keys'] = []
while existing_data_row_to_global_key:
if skip_duplicates: # Drop in-use global keys if we're skipping duplicates
if verbose:
print(f"Warning: Global keys in this upload are in use by active data rows, skipping the upload of data rows affected")
for gk in existing_data_row_to_global_key.values():
+ e['skipped_global_keys'].append(gk)
del upload_dict[gk]
break
else: # Create new suffix for taken global keys if we're not skipping duplicates
@@ -135,7 +137,9 @@ def batch_create_data_rows(
dataset_id_to_upload_list[dataset_id] = []
dataset_id_to_upload_list[dataset_id].append(data_row)
# Perform uploads grouped by dataset ID
+ e['errors'] = []
for dataset_id in dataset_id_to_upload_list:
+ task_list = []
dataset = client.get_dataset(dataset_id)
upload_list = dataset_id_to_upload_list[dataset_id]
if verbose:
@@ -147,16 +151,24 @@
if verbose:
print(f'Batch #{batch_number}: {len(batch)} data rows')
task = dataset.create_data_rows(batch)
- task.wait_till_done()
- errors = task.errors
- if errors:
- if verbose:
- print(f'Error: Upload batch number {batch_number} unsuccessful')
- e = errors
- break
- else:
- if verbose:
- print(f'Success: Upload batch number {batch_number} successful')
+ task_list.append(task)
+ # task.wait_till_done()
+ # errors = task.errors
+ # e['upload_results'].append(task.uid)
+ # if errors:
+ # if verbose:
+ # print(f'Error: Upload batch number {batch_number} unsuccessful')
+ # e['errors'] = errors
+ # break
+ # else:
+ # if verbose:
+ # print(f'Success: Upload batch number {batch_number} successful')
+ with ThreadPoolExecutor() as exc:
+ futures = [exc.submit(get_results_from_task, x) for x in task_list]
+ for future in as_completed(futures):
+ errors = future.result()
+ if errors:
+ e['errors'] += errors
if verbose:
print(f'Upload complete - all data rows uploaded')
return e, upload_dict
@@ -487,3 +499,7 @@ def batch_upload_predictions(
except Exception as error:
e = error
return e

+ def get_results_from_task(task):
+ task.wait_till_done()
+ return task.errors
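The rewritten loop above queues every create_data_rows task and then waits on them concurrently via get_results_from_task. A self-contained sketch of that pattern, using a stand-in task class since the real task objects come from the Labelbox SDK:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

class FakeTask:
    """Stand-in for an SDK task: wait_till_done() blocks, .errors lists failures."""
    def __init__(self, name, errors=None):
        self.name = name
        self.errors = errors or []

    def wait_till_done(self):
        time.sleep(0.1)  # simulate server-side processing

def get_results_from_task(task):
    # Same shape as the helper added in this PR: block, then surface any errors.
    task.wait_till_done()
    return task.errors

task_list = [FakeTask("batch-1"), FakeTask("batch-2", errors=["row 17 failed"])]

all_errors = []
with ThreadPoolExecutor() as exc:
    futures = [exc.submit(get_results_from_task, t) for t in task_list]
    for future in as_completed(futures):
        errors = future.result()
        if errors:
            all_errors += errors

print(all_errors)  # -> ['row 17 failed']
```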
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setuptools.setup(
name='labelbase',
- version='0.1.05',
+ version='0.1.06',
author='Labelbox',
author_email='[email protected]',
description='Labelbox Helper Library',