Skip to content

Commit ee41913

Browse files
authored
Merge pull request #34 from fabianofranz/rhelai-4250-confidence-scores
Displays conversion confidence scores and grades
2 parents 162b1b3 + 51e7958 commit ee41913

File tree

1 file changed

+48
-2
lines changed

1 file changed

+48
-2
lines changed

notebooks/instructlab-knowledge/instructlab-knowledge.ipynb

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -360,14 +360,23 @@
360360
"source": [
361361
"import json\n",
362362
"\n",
363+
"confidence_reports = dict()\n",
364+
"\n",
363365
"json_files=[]\n",
366+
"\n",
364367
"for contribution in contributions:\n",
365368
" files = list((contribution[\"dir\"] / SOURCE_DOCUMENT_DIR).glob(\"*.pdf\"))\n",
366369
" \n",
367370
" for file in files:\n",
368-
" doc = doc_converter.convert(source=file).document\n",
371+
" print(f\"Converting {file}...\")\n",
372+
" \n",
373+
" conversion_result = doc_converter.convert(source=file)\n",
374+
"\n",
375+
" doc = conversion_result.document\n",
369376
" doc_dict = doc.export_to_dict()\n",
370377
" \n",
378+
" confidence_reports[file] = conversion_result.confidence\n",
379+
" \n",
371380
" conversion_output_dir = contribution[\"dir\"] / CONVERSION_DIR\n",
372381
" conversion_output_dir.mkdir(parents=True, exist_ok=True)\n",
373382
" \n",
@@ -377,7 +386,44 @@
377386
" print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")\n",
378387
" json_files.append(json_output_path.resolve())\n",
379388
"\n",
380-
" print(f\"\\nSample:\\n ${doc.export_to_text()[:500]}...\")"
389+
" print(\"Document sample:\\n\")\n",
390+
" print(f\"{doc.export_to_text()[:500]}...\")\n",
391+
" print()"
392+
]
393+
},
394+
{
395+
"cell_type": "markdown",
396+
"id": "07a15341-aa4a-4471-85ac-0a49df20fb2e",
397+
"metadata": {},
398+
"source": [
399+
"### Conversion confidence\n",
400+
"\n",
401+
"When converting a document, Docling can calculate how confident it is in the quality of the conversion. This *confidence* is expressed as both a *score* and a *grade*. The score is a numeric value between 0 and 1, and the grade is a label that can be **poor**, **fair**, **good**, or **excellent**. If Docling is unable to calculate a confidence grade, the value will be marked as *unspecified*.\n",
402+
"\n",
403+
"If your document receives a low score (for example, below 0.8) and a grade of *poor* or *fair*, you'll probably benefit from using a different conversion technique. In that case, go back to the *Configure Docling Conversion Pipeline* section, try a different approach (for example, forcing OCR or using a VLM), and compare the results. The next cell prints the average confidence grade and score for each converted document and lists the pages that scored lower than that document's average."
404+
]
405+
},
406+
{
407+
"cell_type": "code",
408+
"execution_count": null,
409+
"id": "f5e4aaa1-4f82-456b-ae37-0a68da10a4c7",
410+
"metadata": {},
411+
"outputs": [],
412+
"source": [
413+
"for file, confidence_report in confidence_reports.items():\n",
414+
" print(f\"Conversion confidence for {file}:\")\n",
415+
" \n",
416+
" print(f\"Average confidence: \\x1b[1m{confidence_report.mean_grade.name}\\033[0m (score {confidence_report.mean_score:.3f})\")\n",
417+
" \n",
418+
" low_score_pages = []\n",
419+
" for page in confidence_report.pages:\n",
420+
" page_confidence_report = confidence_report.pages[page]\n",
421+
" if page_confidence_report.mean_score < confidence_report.mean_score:\n",
422+
" low_score_pages.append(page)\n",
423+
"\n",
424+
" print(f\"Pages that scored lower than average: {', '.join(str(x + 1) for x in low_score_pages)}\")\n",
425+
" \n",
426+
" print()"
381427
]
382428
},
383429
{

0 commit comments

Comments
 (0)