Skip to content

Commit eabbe27

Browse files
committed
Lint fixes
1 parent 7f82260 commit eabbe27

File tree

2 files changed

+16
-14
lines changed

2 files changed

+16
-14
lines changed

Diff for: scripts/autoscan_dolmadocs.py

+15-13
Original file line number | Diff line number | Diff line change
@@ -279,6 +279,8 @@ def chatgpt_analyze_page(pdf_path: str, page_num: int, pdf_s3_client, openai_api
279279
- Login information (ONLY mark as PII when a username, password, and login location are present together)
280280
281281
If the document is a form, then only consider fields which are filled out with specific values as potential PII.
282+
If this page does not itself contain PII, but references documents (such as curriculum vitae, personal statements) that typically contain PII, then do not mark it as PII.
283+
Only consider actual occurrences of the PII within the document shown.
282284
"""
283285

284286
# Use the chat completions API with the custom schema
@@ -488,19 +490,19 @@ def print_annotation_report(annotation_results: Dict[str, List[Dict[str, Any]]])
488490
print("-" * 80)
489491

490492
# Print links to unreadable pages
491-
if annotation_results["cannot_read"]:
492-
print("\nUnreadable Pages:")
493-
print("-" * 80)
494-
for i, item in enumerate(annotation_results["cannot_read"], 1):
495-
pdf_path = item["pdf_path"]
496-
pdf_page = item["pdf_page"]
497-
presigned_url = item.get("presigned_url")
498-
499-
print(f"{i}. PDF: {pdf_path}")
500-
print(f" Page: {pdf_page}")
501-
if presigned_url:
502-
print(f" Presigned URL: {presigned_url}")
503-
print("-" * 80)
493+
# if annotation_results["cannot_read"]:
494+
# print("\nUnreadable Pages:")
495+
# print("-" * 80)
496+
# for i, item in enumerate(annotation_results["cannot_read"], 1):
497+
# pdf_path = item["pdf_path"]
498+
# pdf_page = item["pdf_page"]
499+
# presigned_url = item.get("presigned_url")
500+
501+
# print(f"{i}. PDF: {pdf_path}")
502+
# print(f" Page: {pdf_page}")
503+
# if presigned_url:
504+
# print(f" Presigned URL: {presigned_url}")
505+
# print("-" * 80)
504506

505507
# Print links to inappropriate content
506508
if annotation_results["report_content"]:

Diff for: scripts/scan_dolmadocs.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1602,7 +1602,7 @@ def read_and_process_results(args):
16021602
for category, items in annotation_results.items():
16031603
for item in items:
16041604
pdf_path = item["pdf_path"]
1605-
1605+
16061606
# Get the actual PDF page number
16071607
pdf_page = item.get("pdf_page")
16081608

0 commit comments

Comments
 (0)