chore: Migrate from Poetry to uv, add health check endpoint, and improve Docker image #15


Open · wants to merge 8 commits into `main`
65 changes: 65 additions & 0 deletions .dockerignore
@@ -0,0 +1,65 @@
# Version control
.git
.gitignore
.gitattributes

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
.env
.venv/
venv/
ENV/
env/
.pytest_cache/
.coverage
htmlcov/
.tox/
.nox/
.hypothesis/
.mypy_cache/

# IDE
.idea/
.vscode/
*.swp
*.swo
.DS_Store

# Project specific
tests/
docs/
*.md
!README.md
Makefile
docker-compose*.yml
.dockerignore
Dockerfile*

# Cache and temp files
*.log
.cache/
tmp/
temp/

# Distribution / packaging
dist/
build/
*.egg-info/

# Local development
*.local
.env.local
.env.*.local

# Model cache and downloads
model_cache/
downloads/

*.pyc
*.pyo
*.pyd
.env
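
A quick way to sanity-check what these rules exclude is to watch BuildKit's context transfer size (a sketch; the `docling-api:test` tag is illustrative):

```bash
# The "transferring context" line in plain-progress output reflects the .dockerignore rules
DOCKER_BUILDKIT=1 docker build --progress=plain -t docling-api:test . 2>&1 | grep "transferring context"
```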
1 change: 1 addition & 0 deletions .gitignore
@@ -18,3 +18,4 @@
.idea/misc.xml
.env
.DS_Store
__pycache__/
110 changes: 83 additions & 27 deletions Dockerfile
@@ -1,42 +1,98 @@
# Use a base image with CUDA support and the desired Python version
FROM python:3.12-slim-bookworm

FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS builder
ARG CPU_ONLY=false

WORKDIR /app

RUN apt-get update \
&& apt-get install -y redis-server libgl1 libglib2.0-0 curl wget git procps \
&& apt-get clean
# Install build dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends libgl1 libglib2.0-0 && \
rm -rf /var/lib/apt/lists/*

# Install Poetry and configure it
RUN pip install poetry \
&& poetry config virtualenvs.create false
# Enable bytecode compilation and set proper link mode for cache mounting
ENV UV_COMPILE_BYTECODE=1 \
UV_LINK_MODE=copy \
HF_HOME=/app/.cache/huggingface \
TORCH_HOME=/app/.cache/torch \
PYTHONPATH=/app \
OMP_NUM_THREADS=4

COPY pyproject.toml poetry.lock ./
# Copy dependency files and README
COPY pyproject.toml uv.lock README.md ./

# Install dependencies before torch
RUN poetry install --no-interaction --no-root
# Install dependencies but not the project itself
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --no-install-project

# Copy the rest of the project
COPY . .

# Install PyTorch separately based on CPU_ONLY flag
RUN if [ "$CPU_ONLY" = "true" ]; then \
pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cpu; \
# Select the PyTorch build from CPU_ONLY and the architecture (nvidia-smi is
# not available inside `docker build`, so GPU support must come from the build arg)
RUN ARCH=$(uname -m) && \
if [ "$CPU_ONLY" = "true" ] || [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then \
USE_GPU=false; \
else \
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121; \
USE_GPU=true; \
fi && \
echo "Detected GPU availability: $USE_GPU" && \
# For PyTorch installation with architecture detection
uv pip uninstall torch torchvision torchaudio || true && \
if [ "$USE_GPU" = "false" ]; then \
# For CPU or ARM architectures or no NVIDIA
echo "Installing PyTorch for CPU" && \
uv pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cpu; \
else \
# For x86_64 with GPU support
echo "Installing PyTorch with CUDA support" && \
uv pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121; \
fi

ENV HF_HOME=/tmp/ \
TORCH_HOME=/tmp/ \
OMP_NUM_THREADS=4
# Install the project in non-editable mode
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --no-editable

# Download models for the pipeline
RUN uv run python -c "from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True)"

# Pre-download EasyOCR models; mirror the PyTorch CPU/GPU selection above
RUN ARCH=$(uname -m) && \
if [ "$CPU_ONLY" = "true" ] || [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then \
echo "Downloading EasyOCR models for CPU" && \
uv run python -c "import easyocr; reader = easyocr.Reader(['fr', 'de', 'es', 'en', 'it', 'pt'], gpu=False); print('EasyOCR CPU models downloaded successfully')"; \
else \
echo "Downloading EasyOCR models with GPU support" && \
uv run python -c "import easyocr; reader = easyocr.Reader(['fr', 'de', 'es', 'en', 'it', 'pt'], gpu=True); print('EasyOCR GPU models downloaded successfully')"; \
fi

RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
# Production stage
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
WORKDIR /app

# Pre-download EasyOCR models in compatible groups
RUN python -c 'import easyocr; \
reader = easyocr.Reader(["fr", "de", "es", "en", "it", "pt"], gpu=True); \
print("EasyOCR models downloaded successfully")'
# Install runtime dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends redis-server libgl1 libglib2.0-0 curl && \
rm -rf /var/lib/apt/lists/*

COPY . .
# Set environment variables
ENV HF_HOME=/app/.cache/huggingface \
TORCH_HOME=/app/.cache/torch \
PYTHONPATH=/app \
OMP_NUM_THREADS=4 \
UV_COMPILE_BYTECODE=1

EXPOSE 8080
# Create a non-root user
RUN useradd --create-home app && \
mkdir -p /app && \
chown -R app:app /app /tmp

CMD ["poetry", "run", "uvicorn", "--port", "8080", "--host", "0.0.0.0", "main:app"]
# Copy the virtual environment from the builder stage
COPY --from=builder --chown=app:app /app/.venv /app/.venv
ENV PATH="/app/.venv/bin:$PATH"

# Copy necessary files for the application
COPY --chown=app:app . .

# Switch to non-root user
USER app

EXPOSE 8080
CMD ["uvicorn", "main:app", "--port", "8080", "--host", "0.0.0.0"]
125 changes: 125 additions & 0 deletions Makefile
@@ -0,0 +1,125 @@
# Variables
PYTHON = uv run
DOCKER_CPU_COMPOSE = docker-compose -f docker-compose.cpu.yml
DOCKER_GPU_COMPOSE = docker-compose -f docker-compose.gpu.yml
DOCKER_IMAGE = docling-api
PORT = 8080
WORKERS = 4

.PHONY: help install dev-setup start stop clean docker-* test lint format

help:
@echo "Available commands:"
@echo "Development:"
@echo " install - Install project dependencies using Poetry"
@echo " dev-setup - Setup development environment (install Redis, etc.)"
@echo " start - Start all development services locally"
@echo " stop - Stop all development services"
@echo " clean - Clean up temporary files and caches"
@echo ""
@echo "Docker:"
@echo " docker-build-cpu - Build Docker image (CPU version)"
@echo " docker-build-gpu - Build Docker image (GPU version)"
@echo " docker-start - Auto-detect system and start appropriate container (CPU/GPU)"
@echo " docker-start-cpu - Start services in CPU mode"
@echo " docker-start-gpu - Start services in GPU mode"
@echo " docker-stop - Stop all Docker services"
@echo " docker-clean - Clean Docker resources"
@echo ""
@echo "Code Quality:"
@echo " format - Format code using black"
@echo " lint - Run linter"
@echo " test - Run tests"

# Development commands
install:
curl -LsSf https://astral.sh/uv/install.sh | sh
uv sync

dev-setup:
@echo "Setting up development environment..."
@if [ "$(shell uname)" = "Darwin" ]; then \
brew install redis; \
brew services start redis; \
elif [ -f /etc/debian_version ]; then \
sudo apt-get update && sudo apt-get install -y redis-server; \
sudo service redis-server start; \
fi
@echo "Creating .env file..."
@echo "REDIS_HOST=redis://localhost:6379/0" > .env
@echo "ENV=development" >> .env

start:
@echo "Starting FastAPI server..."
$(PYTHON) uvicorn main:app --reload --port $(PORT) & \
echo "Starting Celery worker..." && \
$(PYTHON) celery -A worker.celery_config worker --pool=solo -n worker_primary --loglevel=info & \
echo "Starting Flower dashboard..." && \
$(PYTHON) celery -A worker.celery_config flower --port=5555

stop:
@echo "Stopping services..."
@pkill -f "uvicorn main:app" || true
@pkill -f "celery" || true
@if [ "$(shell uname)" = "Darwin" ]; then \
brew services stop redis; \
elif [ -f /etc/debian_version ]; then \
sudo service redis-server stop; \
fi

# Docker commands
docker-build-cpu:
docker build --build-arg CPU_ONLY=true -t $(DOCKER_IMAGE):cpu .

docker-build-gpu:
docker build -t $(DOCKER_IMAGE):gpu .

docker-start-cpu:
$(DOCKER_CPU_COMPOSE) up --build --scale celery_worker=1

docker-start-gpu:
$(DOCKER_GPU_COMPOSE) up --build --scale celery_worker=3

# Auto-detect architecture and start appropriate container
docker-start:
@echo "Auto-detecting system architecture..."
@if [ "$(shell uname -m)" = "arm64" ] || [ "$(shell uname -m)" = "aarch64" ] || ! command -v nvidia-smi >/dev/null 2>&1; then \
echo "ARM architecture or NVIDIA drivers not detected. Using CPU mode."; \
$(MAKE) docker-start-cpu; \
else \
echo "NVIDIA GPU detected. Using GPU mode."; \
$(MAKE) docker-start-gpu; \
fi

docker-stop:
$(DOCKER_CPU_COMPOSE) down
$(DOCKER_GPU_COMPOSE) down

docker-clean:
docker system prune -f
docker volume prune -f

# Code quality commands
format:
$(PYTHON) black .

lint:
$(PYTHON) flake8 .
$(PYTHON) mypy .

test:
$(PYTHON) pytest

clean:
find . -type d -name "__pycache__" -exec rm -rf {} +
find . -type f -name "*.pyc" -delete
find . -type f -name "*.pyo" -delete
find . -type f -name "*.pyd" -delete
find . -type f -name ".coverage" -delete
find . -type d -name "*.egg-info" -exec rm -rf {} +
find . -type d -name "*.egg" -exec rm -rf {} +
find . -type d -name ".pytest_cache" -exec rm -rf {} +
find . -type d -name ".mypy_cache" -exec rm -rf {} +
find . -type d -name ".tox" -exec rm -rf {} +
find . -type d -name "build" -exec rm -rf {} +
find . -type d -name "dist" -exec rm -rf {} +
69 changes: 49 additions & 20 deletions README.md
@@ -5,16 +5,16 @@
## Comparison to Other Parsing Libraries

| Original PDF |
|--------------|
| Original PDF |
| -------------------------------------------------------------------------------------------------------------------- |
| <img src="https://raw.githubusercontent.com/drmingler/docling-api/refs/heads/main/images/original.png" width="500"/> |

| Docling-API | Marker |
|-------------|--------|
| Docling-API | Marker |
| ------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ |
| <img src="https://raw.githubusercontent.com/drmingler/docling-api/refs/heads/main/images/docling.png" width="500"/> | <img src="https://raw.githubusercontent.com/drmingler/docling-api/refs/heads/main/images/marker.png" width="500"/> |

| PyPDF | PyMuPDF4LLM |
|-------|-------------|
| PyPDF | PyMuPDF4LLM |
| ----------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
| <img src="https://raw.githubusercontent.com/drmingler/docling-api/refs/heads/main/images/pypdf.png" width="500"/> | <img src="https://raw.githubusercontent.com/drmingler/docling-api/refs/heads/main/images/pymupdf.png" width="500"/> |

## Features
@@ -50,20 +50,20 @@
## Environment Setup (Running Locally)

### Prerequisites
- Python 3.8 or higher
- Poetry (Python package manager)
- Python 3.12 or higher
- uv (Python package manager)
- Redis server (for task queue)

### 1. Install Poetry (if not already installed)
### 1. Install uv (if not already installed)
```bash
curl -sSL https://install.python-poetry.org | python3 -
curl -LsSf https://astral.sh/uv/install.sh | sh
```

> **drmingler** (Owner) commented on Mar 8, 2025:
> Can we also add instructions on how to install uv for those using Windows, or add a link to the uv installation docs? @spa5k
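
A sketch of what that addition could look like; both commands come from uv's installation documentation (https://docs.astral.sh/uv/getting-started/installation/):

```bash
# macOS / Linux
curl -LsSf https://astral.sh/uv/install.sh | sh

# Windows (run in PowerShell)
powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
```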

### 2. Clone and Setup Project
```bash
git clone https://github.com/drmingler/docling-api.git
cd docling-api
poetry install
uv sync
```

### 3. Configure Environment
@@ -92,17 +92,17 @@ sudo service redis-server start

1. Start the FastAPI server:
```bash
poetry run uvicorn main:app --reload --port 8080
uv run uvicorn main:app --reload --port 8080
```

2. Start Celery worker (in a new terminal):
```bash
poetry run celery -A worker.celery_config worker --pool=solo -n worker_primary --loglevel=info
uv run celery -A worker.celery_config worker --pool=solo -n worker_primary --loglevel=info
```

3. Start Flower dashboard for monitoring (optional, in a new terminal):
```bash
poetry run celery -A worker.celery_config flower --port=5555
uv run celery -A worker.celery_config flower --port=5555
```

### 6. Verify Installation
@@ -144,16 +144,43 @@ REDIS_HOST=redis://redis:6379/0
ENV=production
```

### CPU Mode
To start the service using CPU-only processing, use the following command. You can adjust the number of Celery workers by specifying the --scale option. In this example, 1 worker will be created:
### Using Makefile Commands

The project includes a Makefile for convenient management of Docker operations:

#### CPU Mode
```bash
# Build and run in CPU mode (starts 1 Celery worker)
make docker-build-cpu
make docker-start-cpu
```

#### GPU Mode (Recommended for production)
```bash
docker-compose -f docker-compose.cpu.yml up --build --scale celery_worker=1
# Build and run in GPU mode (starts 3 Celery workers)
make docker-build-gpu
make docker-start-gpu
```

### GPU Mode (Recommend for production)
For production, it is recommended to enable GPU acceleration, as it significantly improves performance. Use the command below to start the service with GPU support. You can also scale the number of Celery workers using the --scale option; here, 3 workers will be launched:
#### Other Makefile Commands
```bash
docker-compose -f docker-compose.gpu.yml up --build --scale celery_worker=3
# Auto-detect CPU/GPU and start the appropriate stack
make docker-start

# Stop all containers
make docker-stop

# Clean Docker resources
make docker-clean
```

## Service Components
@@ -236,6 +263,7 @@ The service uses a distributed architecture with the following components:
- GPU mode provides significantly faster processing for large documents
- CPU mode is suitable for smaller deployments or when GPU is not available
- Multiple workers can be scaled horizontally for increased throughput
- The uv package manager provides faster dependency installation and better caching

## License
The codebase is under MIT license. See LICENSE for more information
@@ -245,3 +273,4 @@ The codebase is under MIT license. See LICENSE for more information
- [FastAPI](https://fastapi.tiangolo.com/) the web framework
- [Celery](https://docs.celeryq.dev/en/stable/) for distributed task processing
- [Flower](https://flower.readthedocs.io/en/latest/) for monitoring and management
- [uv](https://github.com/astral-sh/uv) for fast, reliable Python package management
76 changes: 76 additions & 0 deletions detect_gpu.sh
@@ -0,0 +1,76 @@
#!/bin/bash

# Script to detect GPU and select the appropriate Docker Compose file

# Check if nvidia-smi exists and can be executed
if command -v nvidia-smi >/dev/null 2>&1; then
# Try to run nvidia-smi to check if drivers are loaded correctly
if nvidia-smi >/dev/null 2>&1; then
echo "NVIDIA GPU detected with working drivers."
GPU_AVAILABLE=true

# Check the driver version (the query returns the driver version, not the CUDA toolkit version)
DRIVER_MAJOR=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n1 | cut -d'.' -f1)
echo "NVIDIA driver major version: $DRIVER_MAJOR"

# CUDA 11 requires driver >= 450, so gate on that
if [ -n "$DRIVER_MAJOR" ] && [ "$DRIVER_MAJOR" -ge 450 ]; then
echo "Using GPU configuration (driver $DRIVER_MAJOR detected)"
COMPOSE_FILE="docker-compose.gpu.yml"
DOCKER_BUILD_ARGS="--build-arg CPU_ONLY=false"
# Pass GPU capabilities to docker build
export DOCKER_BUILDKIT=1
export DOCKER_DEFAULT_PLATFORM=linux/amd64
export DOCKER_CLI_EXPERIMENTAL=enabled
else
echo "NVIDIA GPU detected but CUDA version ($CUDA_VERSION) is too old. Minimum required: 11"
echo "Falling back to CPU configuration."
GPU_AVAILABLE=false
COMPOSE_FILE="docker-compose.cpu.yml"
DOCKER_BUILD_ARGS="--build-arg CPU_ONLY=true"
fi
else
echo "NVIDIA GPU software detected but drivers may not be properly installed."
GPU_AVAILABLE=false
COMPOSE_FILE="docker-compose.cpu.yml"
DOCKER_BUILD_ARGS="--build-arg CPU_ONLY=true"
fi
else
echo "No NVIDIA GPU detected. Using CPU configuration."
GPU_AVAILABLE=false
COMPOSE_FILE="docker-compose.cpu.yml"
DOCKER_BUILD_ARGS="--build-arg CPU_ONLY=true"
fi

# Check architecture
ARCH=$(uname -m)
if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then
echo "ARM architecture detected. Forcing CPU mode regardless of GPU availability."
GPU_AVAILABLE=false
COMPOSE_FILE="docker-compose.cpu.yml"
DOCKER_BUILD_ARGS="--build-arg CPU_ONLY=true"
fi

# Export for other scripts to use
export GPU_AVAILABLE
export COMPOSE_FILE
export DOCKER_BUILD_ARGS

echo "Selected configuration: $COMPOSE_FILE"
echo "Build arguments: $DOCKER_BUILD_ARGS"
echo "GPU_AVAILABLE=$GPU_AVAILABLE"

# If this script is being sourced, don't execute docker-compose
if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then
return 0
fi

# If passed arguments, run docker-compose with them
if [ $# -gt 0 ]; then
echo "Running: docker-compose -f $COMPOSE_FILE $@"
docker-compose -f $COMPOSE_FILE $@
else
echo "Usage: $0 [docker-compose commands]"
echo "or source this script to export the variables"
fi
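
Typical usage, matching the script's own usage message:

```bash
chmod +x detect_gpu.sh

# One-shot: pick the right compose file and pass the remaining arguments through
./detect_gpu.sh up --build

# Or source it and reuse the exported variables
. ./detect_gpu.sh
docker-compose -f "$COMPOSE_FILE" build $DOCKER_BUILD_ARGS
```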
40 changes: 29 additions & 11 deletions docker-compose.cpu.yml
@@ -1,20 +1,23 @@
version: "3.8"

services:
celery_worker:
build:
context: .
args:
CPU_ONLY: "true"
image: converter-cpu-image
command: poetry run celery -A worker.celery_config worker --pool=solo -n worker_primary --loglevel=info
command: uv run celery -A worker.celery_config worker --pool=solo -n worker_primary --loglevel=info
volumes:
- .:/app
- model_cache:/tmp
- ./worker:/app/worker
environment:
- REDIS_HOST=${REDIS_HOST}
- ENV=production
restart: on-failure
healthcheck:
test: [ "CMD", "uv", "run", "celery", "-A", "worker.celery_config", "inspect", "ping", "-d", "celery@worker_primary" ]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
depends_on:
- redis

@@ -24,8 +27,10 @@ services:
context: .
args:
CPU_ONLY: "true"
cache_from:
- converter-cpu-image
image: converter-cpu-image
command: poetry run uvicorn --port 8080 --host 0.0.0.0 main:app
command: uv run uvicorn main:app --port 8080 --host 0.0.0.0 --workers 4 --proxy-headers
environment:
- REDIS_HOST=${REDIS_HOST}
- ENV=production
@@ -35,9 +40,14 @@ services:
ports:
- "8080:8080"
volumes:
- .:/app
- model_cache:/tmp
- ./main.py:/app/main.py
restart: on-failure
healthcheck:
test: [ "CMD", "curl", "-f", "http://localhost:8080/health" ]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
depends_on:
- redis

@@ -46,20 +56,27 @@ services:
image: redis:7.2.4-alpine
ports:
- "6379:6379"
healthcheck:
test: [ "CMD", "redis-cli", "ping" ]
interval: 30s
timeout: 5s
retries: 3
start_period: 10s

flower:
container_name: flower_cpu
build:
context: .
args:
CPU_ONLY: "true"
cache_from:
- converter-cpu-image
image: converter-cpu-image
command: poetry run celery -A worker.celery_config flower --port=5555
command: uv run celery -A worker.celery_config flower --port=5555
ports:
- "5556:5555"
volumes:
- .:/app
- model_cache:/tmp
- ./worker:/app/worker
environment:
- REDIS_HOST=${REDIS_HOST}
- ENV=production
@@ -70,3 +87,4 @@ services:

volumes:
model_cache:
name: docling_model_cache
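
Once the stack is up, the new healthchecks can be verified from the host; a quick sketch:

```bash
docker-compose -f docker-compose.cpu.yml up -d --build
docker-compose -f docker-compose.cpu.yml ps   # each service should report (healthy)
curl -f http://localhost:8080/health          # the endpoint the API healthcheck polls
```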
70 changes: 59 additions & 11 deletions docker-compose.gpu.yml
@@ -1,25 +1,37 @@
version: "3.8"

services:
celery_worker:
build:
context: .
args:
CPU_ONLY: "false"
x-bake:
platforms:
- linux/amd64
output: type=docker
contexts:
- default
x-pass-nvidia:
- capabilities=compute,utility,graphics,video
image: converter-gpu-image
command: poetry run celery -A worker.celery_config worker --pool=solo -n worker_primary --loglevel=info
command: uv run celery -A worker.celery_config worker --pool=solo -n worker_primary --loglevel=info
volumes:
- .:/app
- ./worker:/app/worker
environment:
- REDIS_HOST=${REDIS_HOST}
- ENV=production
healthcheck:
test: [ "CMD", "uv", "run", "celery", "-A", "worker.celery_config", "inspect", "ping", "-d", "celery@worker_primary" ]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
capabilities: [ gpu ]
depends_on:
- redis

@@ -29,23 +41,39 @@ services:
context: .
args:
CPU_ONLY: "false"
cache_from:
- converter-gpu-image
x-bake:
platforms:
- linux/amd64
output: type=docker
contexts:
- default
x-pass-nvidia:
- capabilities=compute,utility,graphics,video
image: converter-gpu-image
command: poetry run uvicorn --port 8080 --host 0.0.0.0 main:app
command: uv run uvicorn main:app --port 8080 --host 0.0.0.0 --workers 4 --proxy-headers
environment:
- REDIS_HOST=${REDIS_HOST}
- ENV=production
- NVIDIA_VISIBLE_DEVICES=all
ports:
- "8080:8080"
volumes:
- .:/app
- ./main.py:/app/main.py
healthcheck:
test: [ "CMD", "curl", "-f", "http://localhost:8080/health" ]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
capabilities: [ gpu ]
depends_on:
- redis
- celery_worker
@@ -55,19 +83,35 @@ services:
image: redis:7.2.4-alpine
ports:
- "6379:6379"
healthcheck:
test: [ "CMD", "redis-cli", "ping" ]
interval: 30s
timeout: 5s
retries: 3
start_period: 10s

flower:
container_name: flower_gpu
build:
context: .
args:
CPU_ONLY: "false"
cache_from:
- converter-gpu-image
x-bake:
platforms:
- linux/amd64
output: type=docker
contexts:
- default
x-pass-nvidia:
- capabilities=compute,utility,graphics,video
image: converter-gpu-image
command: poetry run celery -A worker.celery_config flower --port=5555
command: uv run celery -A worker.celery_config flower --port=5555
ports:
- "5556:5555"
volumes:
- .:/app
- ./worker:/app/worker
environment:
- REDIS_HOST=${REDIS_HOST}
- ENV=production
@@ -81,4 +125,8 @@ services:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
capabilities: [ gpu ]

volumes:
model_cache:
name: docling_model_cache
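
The `deploy.resources` reservation only requests a GPU; it is worth confirming the containers can actually see it. A quick check using the `celery_worker` service name from this file (assumes the NVIDIA container runtime is installed on the host):

```bash
docker-compose -f docker-compose.gpu.yml exec celery_worker nvidia-smi
docker-compose -f docker-compose.gpu.yml exec celery_worker \
  uv run python -c "import torch; print(torch.cuda.is_available())"
```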
248 changes: 214 additions & 34 deletions document_converter/route.py
@@ -1,11 +1,16 @@
from io import BytesIO
from typing import List
from fastapi import APIRouter, File, HTTPException, UploadFile, Query
from fastapi import APIRouter, File, HTTPException, UploadFile, Query, status
from fastapi.responses import JSONResponse

from document_converter.schema import BatchConversionJobResult, ConversationJobResult, ConversionResult
from document_converter.schema import (
BatchConversionJobResult,
ConversionJobResult,
ConversionResult
)
from document_converter.service import DocumentConverterService, DoclingDocumentConversion
from document_converter.utils import is_file_format_supported
from worker.tasks import convert_document_task, convert_documents_task
from worker.tasks import convert_document_task, convert_documents_task, ping

router = APIRouter()

@@ -19,16 +24,33 @@
'/documents/convert',
response_model=ConversionResult,
response_model_exclude_unset=True,
status_code=status.HTTP_200_OK,
responses={
200: {"description": "Document successfully converted"},
400: {"description": "Invalid request or unsupported file format"},
500: {"description": "Internal server error during conversion"}
},
description="Convert a single document synchronously",
)
async def convert_single_document(
document: UploadFile = File(...),
extract_tables_as_images: bool = False,
image_resolution_scale: int = Query(4, ge=1, le=4),
document: UploadFile = File(..., description="The document file to convert"),
extract_tables_as_images: bool = Query(
False,
description="Whether to extract tables as images"
),
image_resolution_scale: int = Query(
4,
ge=1,
le=4,
description="Scale factor for image resolution (1-4)"
),
):
file_bytes = await document.read()
if not is_file_format_supported(file_bytes, document.filename):
raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Unsupported file format: {document.filename}"
)

return document_converter_service.convert_document(
(document.filename, BytesIO(file_bytes)),
@@ -41,18 +63,35 @@ async def convert_single_document(
'/documents/batch-convert',
response_model=List[ConversionResult],
response_model_exclude_unset=True,
status_code=status.HTTP_200_OK,
responses={
200: {"description": "All documents successfully converted"},
400: {"description": "Invalid request or unsupported file format"},
500: {"description": "Internal server error during conversion"}
},
description="Convert multiple documents synchronously",
)
async def convert_multiple_documents(
documents: List[UploadFile] = File(...),
extract_tables_as_images: bool = False,
image_resolution_scale: int = Query(4, ge=1, le=4),
documents: List[UploadFile] = File(..., description="List of document files to convert"),
extract_tables_as_images: bool = Query(
False,
description="Whether to extract tables as images"
),
image_resolution_scale: int = Query(
4,
ge=1,
le=4,
description="Scale factor for image resolution (1-4)"
),
):
doc_streams = []
for document in documents:
file_bytes = await document.read()
if not is_file_format_supported(file_bytes, document.filename):
raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Unsupported file format: {document.filename}"
)
doc_streams.append((document.filename, BytesIO(file_bytes)))

return document_converter_service.convert_documents(
@@ -65,54 +104,120 @@ async def convert_multiple_documents(
# Asynchronous conversion jobs endpoints
@router.post(
'/conversion-jobs',
response_model=ConversationJobResult,
description="Create a conversion job for a single document",
response_model=ConversionJobResult,
status_code=status.HTTP_202_ACCEPTED,
responses={
202: {"description": "Conversion job accepted and queued"},
400: {"description": "Invalid request or unsupported file format"},
500: {"description": "Failed to queue conversion job"}
},
description="Create an asynchronous conversion job for a single document",
)
async def create_single_document_conversion_job(
document: UploadFile = File(...),
extract_tables_as_images: bool = False,
image_resolution_scale: int = Query(4, ge=1, le=4),
document: UploadFile = File(..., description="The document file to convert"),
extract_tables_as_images: bool = Query(
False,
description="Whether to extract tables as images"
),
image_resolution_scale: int = Query(
4,
ge=1,
le=4,
description="Scale factor for image resolution (1-4)"
),
):
file_bytes = await document.read()
if not is_file_format_supported(file_bytes, document.filename):
raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Unsupported file format: {document.filename}"
)

task = convert_document_task.delay(
(document.filename, file_bytes),
extract_tables=extract_tables_as_images,
image_resolution_scale=image_resolution_scale,
)

return ConversationJobResult(job_id=task.id, status="IN_PROGRESS")
return ConversionJobResult(
job_id=task.id,
status="IN_PROGRESS"
)


@router.get(
'/conversion-jobs/{job_id}',
response_model=ConversationJobResult,
description="Get the status of a single document conversion job",
response_model_exclude_unset=True,
response_model=ConversionJobResult,
responses={
200: {"description": "Conversion job completed successfully"},
202: {"description": "Conversion job is still in progress"},
404: {"description": "Job not found"},
422: {"description": "Conversion job failed"}
},
description="Get the status and result of a single document conversion job",
)
async def get_conversion_job_status(job_id: str):
return document_converter_service.get_single_document_task_result(job_id)
try:
result = document_converter_service.get_single_document_task_result(job_id)

# Return 202 Accepted if job is still in progress
if result.status == "IN_PROGRESS":
return JSONResponse(
status_code=status.HTTP_202_ACCEPTED,
content=result.model_dump(exclude_none=True)
)

# Return 422 for failed jobs
if result.status == "FAILURE":
return JSONResponse(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
content=result.model_dump(exclude_none=True)
)

# Return 200 OK for successful jobs
return JSONResponse(
status_code=status.HTTP_200_OK,
content=result.model_dump(exclude_none=True)
)
except KeyError:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Job not found: {job_id}"
)
> **drmingler** (Owner) commented on lines +160 to +186:
> Please take this off. During job processing we do not raise errors; we return a status of FAILURE if the job failed, with the reason for the failure in the error field. It was intentionally done that way.

@router.post(
'/batch-conversion-jobs',
response_model=BatchConversionJobResult,
response_model_exclude_unset=True,
description="Create a conversion job for multiple documents",
status_code=status.HTTP_202_ACCEPTED,
responses={
202: {"description": "Batch conversion job accepted and queued"},
400: {"description": "Invalid request or unsupported file format"},
500: {"description": "Failed to queue batch conversion job"}
},
description="Create an asynchronous conversion job for multiple documents",
)
async def create_batch_conversion_job(
documents: List[UploadFile] = File(...),
extract_tables_as_images: bool = False,
image_resolution_scale: int = Query(4, ge=1, le=4),
documents: List[UploadFile] = File(..., description="List of document files to convert"),
extract_tables_as_images: bool = Query(
False,
description="Whether to extract tables as images"
),
image_resolution_scale: int = Query(
4,
ge=1,
le=4,
description="Scale factor for image resolution (1-4)"
),
):
"""Create a batch conversion job for multiple documents."""
doc_data = []
for document in documents:
file_bytes = await document.read()
if not is_file_format_supported(file_bytes, document.filename):
raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Unsupported file format: {document.filename}"
)
doc_data.append((document.filename, file_bytes))

task = convert_documents_task.delay(
@@ -121,15 +226,90 @@ async def create_batch_conversion_job(
image_resolution_scale=image_resolution_scale,
)

return BatchConversionJobResult(job_id=task.id, status="IN_PROGRESS")
return BatchConversionJobResult(
job_id=task.id,
status="IN_PROGRESS"
)


@router.get(
'/batch-conversion-jobs/{job_id}',
response_model=BatchConversionJobResult,
response_model_exclude_unset=True,
description="Get the status of a batch conversion job",
responses={
200: {"description": "All conversion jobs completed successfully"},
202: {"description": "Batch job is still in progress"},
404: {"description": "Batch job not found"},
422: {"description": "Batch job failed"}
},
description="Get the status and results of a batch conversion job",
)
async def get_batch_conversion_job_status(job_id: str):
"""Get the status and results of a batch conversion job."""
return document_converter_service.get_batch_conversion_task_result(job_id)
try:
result = document_converter_service.get_batch_conversion_task_result(job_id)

# Return 202 Accepted if the batch job or any sub-job is still in progress
if result.status == "IN_PROGRESS" or any(
job.status == "IN_PROGRESS"
for job in result.conversion_results
):
return JSONResponse(
status_code=status.HTTP_202_ACCEPTED,
content=result.model_dump(exclude_none=True)
)

# Return 422 for failed batch jobs
if result.status == "FAILURE" or any(
job.status == "FAILURE"
for job in result.conversion_results
):
return JSONResponse(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
content=result.model_dump(exclude_none=True)
)

# Return 200 OK for successful batch jobs (all success)
return JSONResponse(
status_code=status.HTTP_200_OK,
content=result.model_dump(exclude_none=True)
)
except KeyError:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Batch job not found: {job_id}"
)
> **drmingler** (Owner) commented on lines +248 to +279:
> Same here.

@router.get(
"/health",
responses={
200: {"description": "All services are healthy"},
500: {"description": "One or more services are unhealthy"}
},
description="Check the health status of all dependent services"
)
async def health_check():
try:
# Check Celery/Redis connection by sending a ping task
result = ping.delay()
response = result.get(timeout=3) # Wait up to 3 seconds for response

if response != "pong":
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Celery/Redis connection test failed"
)

return {
"status": "healthy",
"services": {
"celery": "connected",
"redis": "connected",
"docling": "connected",
"document_converter": "connected",
}
}
except HTTPException:
# Propagate the explicit 500 raised above instead of re-wrapping it
raise
except Exception as e:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e)
)
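
A shell sketch of the request flow these endpoints produce; `sample.pdf` is a placeholder file and `jq` is assumed for JSON parsing:

```bash
# Create an async conversion job (returns 202 with a job_id)
JOB_ID=$(curl -s -X POST http://localhost:8080/conversion-jobs \
  -F "document=@sample.pdf" | jq -r .job_id)

# Poll: 202 while IN_PROGRESS, 200 on SUCCESS, 422 on FAILURE
curl -s -o /dev/null -w "%{http_code}\n" "http://localhost:8080/conversion-jobs/$JOB_ID"

# The health endpoint used by the Docker healthchecks
curl -f http://localhost:8080/health
```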
8 changes: 4 additions & 4 deletions document_converter/schema.py
@@ -1,5 +1,5 @@
from pydantic import BaseModel, Field
from typing import List, Literal, Optional
from typing import List, Literal, Optional, Dict, Any


class ImageData(BaseModel):
@@ -21,7 +21,7 @@ class BatchConversionResult(BaseModel):
)


class ConversationJobResult(BaseModel):
class ConversionJobResult(BaseModel):
job_id: Optional[str] = Field(None, description="The id of the conversion job")
result: Optional[ConversionResult] = Field(None, description="The result of the conversion job")
error: Optional[str] = Field(None, description="The error that occurred during the conversion job")
@@ -30,10 +30,10 @@ class ConversationJobResult(BaseModel):

class BatchConversionJobResult(BaseModel):
job_id: str = Field(..., description="The id of the conversion job")
conversion_results: List[ConversationJobResult] = Field(
conversion_results: List[ConversionJobResult] = Field(
default_factory=list, description="The results of the conversion job"
)
status: Literal["IN_PROGRESS", "SUCCESS", "FAILURE"] = Field(
None, description="The status of the entire conversion jobs in the batch"
)
error: Optional[str] = Field(None, description="If the entire batch failed, this will be the error message")
error: Optional[str] = Field(None, description="If the entire batch failed, this will be the error message")
16 changes: 8 additions & 8 deletions document_converter/service.py
@@ -11,7 +11,7 @@
from docling_core.types.doc import ImageRefMode, TableItem, PictureItem
from fastapi import HTTPException

from document_converter.schema import BatchConversionJobResult, ConversationJobResult, ConversionResult, ImageData
from document_converter.schema import BatchConversionJobResult, ConversionJobResult, ConversionResult, ImageData
from document_converter.utils import handle_csv_file

logging.basicConfig(level=logging.INFO)
@@ -180,7 +180,7 @@ def convert_documents_task(
documents = [(filename, BytesIO(file)) for filename, file in documents]
return self.document_converter.convert_batch(documents, **kwargs)

def get_single_document_task_result(self, job_id: str) -> ConversationJobResult:
def get_single_document_task_result(self, job_id: str) -> ConversionJobResult:
"""Get the status and result of a document conversion job.
Returns:
@@ -191,18 +191,18 @@ def get_single_document_task_result(self, job_id: str) -> ConversationJobResult:

task = AsyncResult(job_id)
if task.state == 'PENDING':
return ConversationJobResult(job_id=job_id, status="IN_PROGRESS")
return ConversionJobResult(job_id=job_id, status="IN_PROGRESS")

elif task.state == 'SUCCESS':
result = task.get()
# Check if the conversion result contains an error
if result.get('error'):
return ConversationJobResult(job_id=job_id, status="FAILURE", error=result['error'])
return ConversionJobResult(job_id=job_id, status="FAILURE", error=result['error'])

return ConversationJobResult(job_id=job_id, status="SUCCESS", result=ConversionResult(**result))
return ConversionJobResult(job_id=job_id, status="SUCCESS", result=ConversionResult(**result))

else:
return ConversationJobResult(job_id=job_id, status="FAILURE", error=str(task.result))
return ConversionJobResult(job_id=job_id, status="FAILURE", error=str(task.result))

def get_batch_conversion_task_result(self, job_id: str) -> BatchConversionJobResult:
"""Get the status and results of a batch conversion job.
@@ -224,9 +224,9 @@ def get_batch_conversion_task_result(self, job_id: str) -> BatchConversionJobRes

for result in conversion_results:
if result.get('error'):
job_result = ConversationJobResult(status="FAILURE", error=result['error'])
job_result = ConversionJobResult(status="FAILURE", error=result['error'])
else:
job_result = ConversationJobResult(
job_result = ConversionJobResult(
status="SUCCESS", result=ConversionResult(**result).model_dump(exclude_unset=True)
)
job_results.append(job_result)
6 changes: 6 additions & 0 deletions entrypoint.sh
@@ -0,0 +1,6 @@
#!/bin/sh
set -e


echo "Starting application..."
exec /app/.venv/bin/python -m uvicorn --port 8080 --host 0.0.0.0 main:app
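
Note that the final Dockerfile stage starts uvicorn directly via CMD, so this script is not wired in as the image ENTRYPOINT. It can still be exercised by overriding the command (a sketch; assumes the `docling-api:cpu` tag from the Makefile and a reachable Redis):

```bash
docker run --rm -p 8080:8080 \
  -e REDIS_HOST=redis://localhost:6379/0 \
  docling-api:cpu sh /app/entrypoint.sh
```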
3,571 changes: 0 additions & 3,571 deletions poetry.lock

This file was deleted.

36 changes: 20 additions & 16 deletions pyproject.toml
@@ -1,22 +1,26 @@
[tool.poetry]
[project]
name = "document-to-markdown"
version = "0.1.0"
description = ""
authors = ["drmingler <davidemmanuel75@gmail.com>"]
authors = [
{name = "davidemmanuel75@gmail.com", email = "davidemmanuel75@gmail.com"},
]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.12"
fastapi = "^0.115.4"
uvicorn = "^0.32.0"
docling = "^2.25.1"
python-multipart = "^0.0.17"
celery = "^5.4.0"
flower = "^2.0.1"
redis = "^5.2.0"
gunicorn = "^23.0.0"

requires-python = ">=3.12"
dependencies = [
"fastapi>=0.115.4",
"uvicorn>=0.32.0",
"docling>=2.25.1",
"python-multipart>=0.0.17",
"celery>=5.4.0",
"flower>=2.0.1",
"redis>=5.2.0",
"gunicorn>=23.0.0",
]

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["."]
1,902 changes: 1,902 additions & 0 deletions uv.lock

Large diffs are not rendered by default.