|
360 | 360 | "source": [
|
361 | 361 | "import json\n",
|
362 | 362 | "\n",
|
| 363 | + "confidence_reports = dict()\n", |
| 364 | + "\n", |
363 | 365 | "json_files=[]\n",
|
| 366 | + "\n", |
364 | 367 | "for contribution in contributions:\n",
|
365 | 368 | " files = list((contribution[\"dir\"] / SOURCE_DOCUMENT_DIR).glob(\"*.pdf\"))\n",
|
366 | 369 | " \n",
|
367 | 370 | " for file in files:\n",
|
368 |
| - " doc = doc_converter.convert(source=file).document\n", |
| 371 | + " print(f\"Converting {file}...\")\n", |
| 372 | + " \n", |
| 373 | + " conversion_result = doc_converter.convert(source=file)\n", |
| 374 | + "\n", |
| 375 | + " doc = conversion_result.document\n", |
369 | 376 | " doc_dict = doc.export_to_dict()\n",
|
370 | 377 | " \n",
|
| 378 | + " confidence_reports[file] = conversion_result.confidence\n", |
| 379 | + " \n", |
371 | 380 | " conversion_output_dir = contribution[\"dir\"] / CONVERSION_DIR\n",
|
372 | 381 | " conversion_output_dir.mkdir(parents=True, exist_ok=True)\n",
|
373 | 382 | " \n",
|
|
377 | 386 | " print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")\n",
|
378 | 387 | " json_files.append(json_output_path.resolve())\n",
|
379 | 388 | "\n",
|
380 |
| - " print(f\"\\nSample:\\n ${doc.export_to_text()[:500]}...\")" |
| 389 | + " print(\"Document sample:\\n\")\n", |
| 390 | + " print(f\"{doc.export_to_text()[:500]}...\")\n", |
| 391 | + " print()" |
| 392 | + ] |
| 393 | + }, |
| 394 | + { |
| 395 | + "cell_type": "markdown", |
| 396 | + "id": "07a15341-aa4a-4471-85ac-0a49df20fb2e", |
| 397 | + "metadata": {}, |
| 398 | + "source": [ |
| 399 | + "### Conversion confidence\n", |
| 400 | + "\n", |
| 401 | + "When converting a document, Docling can calculate how confident it is in the quality of the conversion. This *confidence* is expressed as both a *score* and a *grade*. The score is a numeric value between 0 and 1, and the grade is a label that can be **poor**, **fair**, **good**, or **excellent**. If Docling is unable to calculate a confidence grade, the grade is reported as *unspecified*.\n", |
| 402 | + "\n", |
| 403 | + "If your document receives a low score (for example, below 0.8) and a grade of *poor* or *fair*, you'll probably benefit from using a different conversion technique. In that case, go back to the *Configure Docling Conversion Pipeline* section, select a different approach (e.g., forcing OCR or using a vision-language model (VLM)), and then compare the results." |
| 404 | + ] |
| 405 | + }, |
| 406 | + { |
| 407 | + "cell_type": "code", |
| 408 | + "execution_count": null, |
| 409 | + "id": "f5e4aaa1-4f82-456b-ae37-0a68da10a4c7", |
| 410 | + "metadata": {}, |
| 411 | + "outputs": [], |
| 412 | + "source": [ |
| 413 | + "for file, confidence_report in confidence_reports.items():\n", |
| 414 | + " print(f\"Conversion confidence for {file}:\")\n", |
| 415 | + " \n", |
| 416 | + " print(f\"Average confidence: \\x1b[1m{confidence_report.mean_grade.name}\\033[0m (score {confidence_report.mean_score:.3f})\")\n", |
| 417 | + " \n", |
| 418 | + " low_score_pages = []\n", |
| 419 | + " for page in confidence_report.pages:\n", |
| 420 | + " page_confidence_report = confidence_report.pages[page]\n", |
| 421 | + " if page_confidence_report.mean_score < confidence_report.mean_score:\n", |
| 422 | + " low_score_pages.append(page)\n", |
| 423 | + "\n", |
| 424 | + " print(f\"Pages that scored lower than average: {', '.join(str(x + 1) for x in low_score_pages)}\")\n", |
| 425 | + " \n", |
| 426 | + " print()" |
381 | 427 | ]
|
382 | 428 | },
|
383 | 429 | {
|
|
0 commit comments