chore: Migrate from Poetry to uv, add health check endpoint, and improve Docker image #15
.dockerignore (new file)
@@ -0,0 +1,65 @@
# Version control
.git
.gitignore
.gitattributes

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
.env
.venv/
venv/
ENV/
env/
.pytest_cache/
.coverage
htmlcov/
.tox/
.nox/
.hypothesis/
.mypy_cache/

# IDE
.idea/
.vscode/
*.swp
*.swo
.DS_Store

# Project specific
tests/
docs/
*.md
!README.md
Makefile
docker-compose*.yml
.dockerignore
Dockerfile*

# Cache and temp files
*.log
.cache/
tmp/
temp/

# Distribution / packaging
dist/
build/
*.egg-info/

# Local development
*.local
.env.local
.env.*.local

# Model cache and downloads
model_cache/
downloads/

*.pyc
*.pyo
*.pyd
.env
.gitignore
@@ -18,3 +18,4 @@
 .idea/misc.xml
 .env
 .DS_Store
+__pycache__/
Dockerfile
@@ -1,42 +1,98 @@
-# Use a base image with CUDA support and the desired Python version
-FROM python:3.12-slim-bookworm
+FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS builder
 ARG CPU_ONLY=false

 WORKDIR /app

-RUN apt-get update \
-    && apt-get install -y redis-server libgl1 libglib2.0-0 curl wget git procps \
-    && apt-get clean
+# Install build dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends libgl1 libglib2.0-0 && \
+    rm -rf /var/lib/apt/lists/*

-# Install Poetry and configure it
-RUN pip install poetry \
-    && poetry config virtualenvs.create false
+# Enable bytecode compilation and set proper link mode for cache mounting
+ENV UV_COMPILE_BYTECODE=1 \
+    UV_LINK_MODE=copy \
+    HF_HOME=/app/.cache/huggingface \
+    TORCH_HOME=/app/.cache/torch \
+    PYTHONPATH=/app \
+    OMP_NUM_THREADS=4

-COPY pyproject.toml poetry.lock ./
+# Copy dependency files and README
+COPY pyproject.toml uv.lock README.md ./

-# Install dependencies before torch
-RUN poetry install --no-interaction --no-root
+# Install dependencies but not the project itself
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv sync --frozen --no-install-project
+
+# Copy the rest of the project
+COPY . .

-# Install PyTorch separately based on CPU_ONLY flag
-RUN if [ "$CPU_ONLY" = "true" ]; then \
-    pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cpu; \
-    else \
-    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121; \
-    fi
+# Better GPU detection: check both architecture and whether NVIDIA is available
+RUN ARCH=$(uname -m) && \
+    if [ "$CPU_ONLY" = "true" ] || [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ] || ! command -v nvidia-smi >/dev/null 2>&1; then \
+        USE_GPU=false; \
+    else \
+        USE_GPU=true; \
+    fi && \
+    echo "Detected GPU availability: $USE_GPU" && \
+    # For PyTorch installation with architecture detection
+    uv pip uninstall -y torch torchvision torchaudio || true && \
+    if [ "$USE_GPU" = "false" ]; then \
+        # For CPU or ARM architectures or no NVIDIA
+        echo "Installing PyTorch for CPU" && \
+        uv pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cpu; \
+    else \
+        # For x86_64 with GPU support
+        echo "Installing PyTorch with CUDA support" && \
+        uv pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121; \
+    fi

-ENV HF_HOME=/tmp/ \
-    TORCH_HOME=/tmp/ \
-    OMP_NUM_THREADS=4
+# Install the project in non-editable mode
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv sync --frozen --no-editable

-RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
+# Download models for the pipeline
+RUN uv run python -c "from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True)"

-# Pre-download EasyOCR models in compatible groups
-RUN python -c 'import easyocr; \
-    reader = easyocr.Reader(["fr", "de", "es", "en", "it", "pt"], gpu=True); \
-    print("EasyOCR models downloaded successfully")'
+# Pre-download EasyOCR models with better GPU detection
+RUN ARCH=$(uname -m) && \
+    if [ "$CPU_ONLY" = "true" ] || [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ] || ! command -v nvidia-smi >/dev/null 2>&1; then \
+        echo "Downloading EasyOCR models for CPU" && \
+        uv run python -c "import easyocr; reader = easyocr.Reader(['fr', 'de', 'es', 'en', 'it', 'pt'], gpu=False); print('EasyOCR CPU models downloaded successfully')"; \
+    else \
+        echo "Downloading EasyOCR models with GPU support" && \
+        uv run python -c "import easyocr; reader = easyocr.Reader(['fr', 'de', 'es', 'en', 'it', 'pt'], gpu=True); print('EasyOCR GPU models downloaded successfully')"; \
+    fi

-COPY . .
+# Production stage
+FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
+WORKDIR /app

-EXPOSE 8080
+# Install runtime dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends redis-server libgl1 libglib2.0-0 curl && \
+    rm -rf /var/lib/apt/lists/*

-CMD ["poetry", "run", "uvicorn", "--port", "8080", "--host", "0.0.0.0", "main:app"]
+# Set environment variables
+ENV HF_HOME=/app/.cache/huggingface \
+    TORCH_HOME=/app/.cache/torch \
+    PYTHONPATH=/app \
+    OMP_NUM_THREADS=4 \
+    UV_COMPILE_BYTECODE=1
+
+# Create a non-root user
+RUN useradd --create-home app && \
+    mkdir -p /app && \
+    chown -R app:app /app /tmp
+
+# Copy the virtual environment from the builder stage
+COPY --from=builder --chown=app:app /app/.venv /app/.venv
+ENV PATH="/app/.venv/bin:$PATH"
+
+# Copy necessary files for the application
+COPY --chown=app:app . .
+
+# Switch to non-root user
+USER app
+
+EXPOSE 8080
+CMD ["uvicorn", "main:app", "--port", "8080", "--host", "0.0.0.0"]
Makefile (new file)
@@ -0,0 +1,125 @@
# Variables
PYTHON = poetry run
DOCKER_CPU_COMPOSE = docker-compose -f docker-compose.cpu.yml
DOCKER_GPU_COMPOSE = docker-compose -f docker-compose.gpu.yml
DOCKER_IMAGE = docling-api
PORT = 8080
WORKERS = 4

.PHONY: help install dev-setup start stop clean docker-* test lint format

help:
	@echo "Available commands:"
	@echo "Development:"
	@echo "  install          - Install project dependencies using Poetry"
	@echo "  dev-setup        - Setup development environment (install Redis, etc.)"
	@echo "  start            - Start all development services locally"
	@echo "  stop             - Stop all development services"
	@echo "  clean            - Clean up temporary files and caches"
	@echo ""
	@echo "Docker:"
	@echo "  docker-build-cpu - Build Docker image (CPU version)"
	@echo "  docker-build-gpu - Build Docker image (GPU version)"
	@echo "  docker-start     - Auto-detect system and start appropriate container (CPU/GPU)"
	@echo "  docker-start-cpu - Start services in CPU mode"
	@echo "  docker-start-gpu - Start services in GPU mode"
	@echo "  docker-stop      - Stop all Docker services"
	@echo "  docker-clean     - Clean Docker resources"
	@echo ""
	@echo "Code Quality:"
	@echo "  format           - Format code using black"
	@echo "  lint             - Run linter"
	@echo "  test             - Run tests"

# Development commands
install:
	curl -sSL https://install.python-poetry.org | python3 -
	poetry install

dev-setup:
	@echo "Setting up development environment..."
	@if [ "$(shell uname)" = "Darwin" ]; then \
		brew install redis; \
		brew services start redis; \
	elif [ -f /etc/debian_version ]; then \
		sudo apt-get update && sudo apt-get install -y redis-server; \
		sudo service redis-server start; \
	fi
	@echo "Creating .env file..."
	@echo "REDIS_HOST=redis://localhost:6379/0" > .env
	@echo "ENV=development" >> .env

start:
	@echo "Starting FastAPI server..."
	$(PYTHON) uvicorn main:app --reload --port $(PORT) & \
	echo "Starting Celery worker..." && \
	$(PYTHON) celery -A worker.celery_config worker --pool=solo -n worker_primary --loglevel=info & \
	echo "Starting Flower dashboard..." && \
	$(PYTHON) celery -A worker.celery_config flower --port=5555

stop:
	@echo "Stopping services..."
	@pkill -f "uvicorn main:app" || true
	@pkill -f "celery" || true
	@if [ "$(shell uname)" = "Darwin" ]; then \
		brew services stop redis; \
	elif [ -f /etc/debian_version ]; then \
		sudo service redis-server stop; \
	fi

# Docker commands
docker-build-cpu:
	docker build --build-arg CPU_ONLY=true -t $(DOCKER_IMAGE):cpu .

docker-build-gpu:
	docker build -t $(DOCKER_IMAGE):gpu .

docker-start-cpu:
	$(DOCKER_CPU_COMPOSE) up --build --scale celery_worker=1

docker-start-gpu:
	$(DOCKER_GPU_COMPOSE) up --build --scale celery_worker=3

# Auto-detect architecture and start appropriate container
docker-start:
	@echo "Auto-detecting system architecture..."
	@if [ "$(shell uname -m)" = "arm64" ] || [ "$(shell uname -m)" = "aarch64" ] || ! command -v nvidia-smi >/dev/null 2>&1; then \
		echo "ARM architecture or NVIDIA drivers not detected. Using CPU mode."; \
		$(MAKE) docker-start-cpu; \
	else \
		echo "NVIDIA GPU detected. Using GPU mode."; \
		$(MAKE) docker-start-gpu; \
	fi

docker-stop:
	$(DOCKER_CPU_COMPOSE) down
	$(DOCKER_GPU_COMPOSE) down

docker-clean:
	docker system prune -f
	docker volume prune -f

# Code quality commands
format:
	$(PYTHON) black .

lint:
	$(PYTHON) flake8 .
	$(PYTHON) mypy .

test:
	$(PYTHON) pytest

clean:
	find . -type d -name "__pycache__" -exec rm -rf {} +
	find . -type f -name "*.pyc" -delete
	find . -type f -name "*.pyo" -delete
	find . -type f -name "*.pyd" -delete
	find . -type f -name ".coverage" -delete
	find . -type d -name "*.egg-info" -exec rm -rf {} +
	find . -type d -name "*.egg" -exec rm -rf {} +
	find . -type d -name ".pytest_cache" -exec rm -rf {} +
	find . -type d -name ".mypy_cache" -exec rm -rf {} +
	find . -type d -name ".tox" -exec rm -rf {} +
	find . -type d -name "build" -exec rm -rf {} +
	find . -type d -name "dist" -exec rm -rf {} +
GPU detection script (new file)
@@ -0,0 +1,76 @@
#!/bin/bash

# Script to detect GPU and select the appropriate Docker Compose file

# Check if nvidia-smi exists and can be executed
if command -v nvidia-smi >/dev/null 2>&1; then
    # Try to run nvidia-smi to check if drivers are loaded correctly
    if nvidia-smi >/dev/null 2>&1; then
        echo "NVIDIA GPU detected with working drivers."
        GPU_AVAILABLE=true

        # Check CUDA version
        CUDA_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | cut -d'.' -f1)
        echo "CUDA compatible driver version: $CUDA_VERSION"

        # Check if the detected CUDA version is compatible with our requirements (CUDA 11+)
        if [ -n "$CUDA_VERSION" ] && [ "$CUDA_VERSION" -ge 11 ]; then
            echo "Using GPU configuration (CUDA $CUDA_VERSION detected)"
            COMPOSE_FILE="docker-compose.gpu.yml"
            DOCKER_BUILDKIT=1
            DOCKER_BUILD_ARGS="--build-arg CPU_ONLY=false"
            # Pass GPU capabilities to docker build
            export DOCKER_BUILDKIT=1
            export DOCKER_DEFAULT_PLATFORM=linux/amd64
            export DOCKER_CLI_EXPERIMENTAL=enabled
        else
            echo "NVIDIA GPU detected but CUDA version ($CUDA_VERSION) is too old. Minimum required: 11"
            echo "Falling back to CPU configuration."
            GPU_AVAILABLE=false
            COMPOSE_FILE="docker-compose.cpu.yml"
            DOCKER_BUILD_ARGS="--build-arg CPU_ONLY=true"
        fi
    else
        echo "NVIDIA GPU software detected but drivers may not be properly installed."
        GPU_AVAILABLE=false
        COMPOSE_FILE="docker-compose.cpu.yml"
        DOCKER_BUILD_ARGS="--build-arg CPU_ONLY=true"
    fi
else
    echo "No NVIDIA GPU detected. Using CPU configuration."
    GPU_AVAILABLE=false
    COMPOSE_FILE="docker-compose.cpu.yml"
    DOCKER_BUILD_ARGS="--build-arg CPU_ONLY=true"
fi

# Check architecture
ARCH=$(uname -m)
if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then
    echo "ARM architecture detected. Forcing CPU mode regardless of GPU availability."
    GPU_AVAILABLE=false
    COMPOSE_FILE="docker-compose.cpu.yml"
    DOCKER_BUILD_ARGS="--build-arg CPU_ONLY=true"
fi

# Export for other scripts to use
export GPU_AVAILABLE
export COMPOSE_FILE
export DOCKER_BUILD_ARGS

echo "Selected configuration: $COMPOSE_FILE"
echo "Build arguments: $DOCKER_BUILD_ARGS"
echo "GPU_AVAILABLE=$GPU_AVAILABLE"

# If this script is being sourced, don't execute docker-compose
if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then
    return 0
fi

# If passed arguments, run docker-compose with them
if [ $# -gt 0 ]; then
    echo "Running: docker-compose -f $COMPOSE_FILE $@"
    docker-compose -f $COMPOSE_FILE $@
else
    echo "Usage: $0 [docker-compose commands]"
    echo "or source this script to export the variables"
fi
document_converter routes
@@ -1,11 +1,16 @@
 from io import BytesIO
 from typing import List
-from fastapi import APIRouter, File, HTTPException, UploadFile, Query
+from fastapi import APIRouter, File, HTTPException, UploadFile, Query, status
+from fastapi.responses import JSONResponse

-from document_converter.schema import BatchConversionJobResult, ConversationJobResult, ConversionResult
+from document_converter.schema import (
+    BatchConversionJobResult,
+    ConversionJobResult,
+    ConversionResult
+)
 from document_converter.service import DocumentConverterService, DoclingDocumentConversion
 from document_converter.utils import is_file_format_supported
-from worker.tasks import convert_document_task, convert_documents_task
+from worker.tasks import convert_document_task, convert_documents_task, ping

 router = APIRouter()

@@ -19,16 +24,33 @@
     '/documents/convert',
     response_model=ConversionResult,
     response_model_exclude_unset=True,
+    status_code=status.HTTP_200_OK,
+    responses={
+        200: {"description": "Document successfully converted"},
+        400: {"description": "Invalid request or unsupported file format"},
+        500: {"description": "Internal server error during conversion"}
+    },
     description="Convert a single document synchronously",
 )
 async def convert_single_document(
-    document: UploadFile = File(...),
-    extract_tables_as_images: bool = False,
-    image_resolution_scale: int = Query(4, ge=1, le=4),
+    document: UploadFile = File(..., description="The document file to convert"),
+    extract_tables_as_images: bool = Query(
+        False,
+        description="Whether to extract tables as images"
+    ),
+    image_resolution_scale: int = Query(
+        4,
+        ge=1,
+        le=4,
+        description="Scale factor for image resolution (1-4)"
+    ),
 ):
     file_bytes = await document.read()
     if not is_file_format_supported(file_bytes, document.filename):
-        raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=f"Unsupported file format: {document.filename}"
+        )

     return document_converter_service.convert_document(
         (document.filename, BytesIO(file_bytes)),

@@ -41,18 +63,35 @@ async def convert_single_document
     '/documents/batch-convert',
     response_model=List[ConversionResult],
     response_model_exclude_unset=True,
+    status_code=status.HTTP_200_OK,
+    responses={
+        200: {"description": "All documents successfully converted"},
+        400: {"description": "Invalid request or unsupported file format"},
+        500: {"description": "Internal server error during conversion"}
+    },
     description="Convert multiple documents synchronously",
 )
 async def convert_multiple_documents(
-    documents: List[UploadFile] = File(...),
-    extract_tables_as_images: bool = False,
-    image_resolution_scale: int = Query(4, ge=1, le=4),
+    documents: List[UploadFile] = File(..., description="List of document files to convert"),
+    extract_tables_as_images: bool = Query(
+        False,
+        description="Whether to extract tables as images"
+    ),
+    image_resolution_scale: int = Query(
+        4,
+        ge=1,
+        le=4,
+        description="Scale factor for image resolution (1-4)"
+    ),
 ):
     doc_streams = []
     for document in documents:
         file_bytes = await document.read()
         if not is_file_format_supported(file_bytes, document.filename):
-            raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail=f"Unsupported file format: {document.filename}"
+            )
         doc_streams.append((document.filename, BytesIO(file_bytes)))

     return document_converter_service.convert_documents(

@@ -65,54 +104,120 @@ async def convert_multiple_documents
 # Asynchronous conversion jobs endpoints
 @router.post(
     '/conversion-jobs',
-    response_model=ConversationJobResult,
-    description="Create a conversion job for a single document",
+    response_model=ConversionJobResult,
+    status_code=status.HTTP_202_ACCEPTED,
+    responses={
+        202: {"description": "Conversion job accepted and queued"},
+        400: {"description": "Invalid request or unsupported file format"},
+        500: {"description": "Failed to queue conversion job"}
+    },
+    description="Create an asynchronous conversion job for a single document",
 )
 async def create_single_document_conversion_job(
-    document: UploadFile = File(...),
-    extract_tables_as_images: bool = False,
-    image_resolution_scale: int = Query(4, ge=1, le=4),
+    document: UploadFile = File(..., description="The document file to convert"),
+    extract_tables_as_images: bool = Query(
+        False,
+        description="Whether to extract tables as images"
+    ),
+    image_resolution_scale: int = Query(
+        4,
+        ge=1,
+        le=4,
+        description="Scale factor for image resolution (1-4)"
+    ),
 ):
     file_bytes = await document.read()
     if not is_file_format_supported(file_bytes, document.filename):
-        raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=f"Unsupported file format: {document.filename}"
+        )

     task = convert_document_task.delay(
         (document.filename, file_bytes),
         extract_tables=extract_tables_as_images,
         image_resolution_scale=image_resolution_scale,
     )

-    return ConversationJobResult(job_id=task.id, status="IN_PROGRESS")
+    return ConversionJobResult(
+        job_id=task.id,
+        status="IN_PROGRESS"
+    )


 @router.get(
     '/conversion-jobs/{job_id}',
-    response_model=ConversationJobResult,
-    description="Get the status of a single document conversion job",
     response_model_exclude_unset=True,
+    response_model=ConversionJobResult,
+    responses={
+        200: {"description": "Conversion job completed successfully"},
+        202: {"description": "Conversion job is still in progress"},
+        404: {"description": "Job not found"},
+        422: {"description": "Conversion job failed"}
+    },
+    description="Get the status and result of a single document conversion job",
 )
 async def get_conversion_job_status(job_id: str):
-    return document_converter_service.get_single_document_task_result(job_id)
+    try:
+        result = document_converter_service.get_single_document_task_result(job_id)
+
+        # Return 202 Accepted if job is still in progress
+        if result.status in ["IN_PROGRESS"]:
+            return JSONResponse(
+                status_code=status.HTTP_202_ACCEPTED,
+                content=result.dict(exclude_none=True)
+            )
+
+        # Return 422 for failed jobs
+        if result.status == "FAILURE":
+            return JSONResponse(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                content=result.dict(exclude_none=True)
+            )
+
+        # Return 200 OK for successful jobs
+        return JSONResponse(
+            status_code=status.HTTP_200_OK,
+            content=result.dict(exclude_none=True)
+        )
+    except KeyError:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail=f"Job not found: {job_id}"
+        )
Review comment on lines +160 to +186: Please take this off. During job processing we do not raise errors; we return a status of FAILURE if the job failed, with the reason for the failure in the error field. It was intentionally done that way.
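For context, a client written against the convention this comment describes would poll the job endpoint and branch on the status field of the JSON body rather than on HTTP status codes. A minimal sketch, assuming the API runs on localhost:8080, jq is installed, and the job id is a placeholder:

JOB_ID="<id returned by POST /conversion-jobs>"
while :; do
  BODY=$(curl -s "http://localhost:8080/conversion-jobs/$JOB_ID")
  STATUS=$(echo "$BODY" | jq -r '.status')
  [ "$STATUS" != "IN_PROGRESS" ] && break   # SUCCESS or FAILURE ends the loop
  sleep 2
done
echo "$BODY" | jq '.'   # on FAILURE, the reason is in the error field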

 @router.post(
     '/batch-conversion-jobs',
     response_model=BatchConversionJobResult,
     response_model_exclude_unset=True,
-    description="Create a conversion job for multiple documents",
+    status_code=status.HTTP_202_ACCEPTED,
+    responses={
+        202: {"description": "Batch conversion job accepted and queued"},
+        400: {"description": "Invalid request or unsupported file format"},
+        500: {"description": "Failed to queue batch conversion job"}
+    },
+    description="Create an asynchronous conversion job for multiple documents",
 )
 async def create_batch_conversion_job(
-    documents: List[UploadFile] = File(...),
-    extract_tables_as_images: bool = False,
-    image_resolution_scale: int = Query(4, ge=1, le=4),
+    documents: List[UploadFile] = File(..., description="List of document files to convert"),
+    extract_tables_as_images: bool = Query(
+        False,
+        description="Whether to extract tables as images"
+    ),
+    image_resolution_scale: int = Query(
+        4,
+        ge=1,
+        le=4,
+        description="Scale factor for image resolution (1-4)"
+    ),
 ):
     """Create a batch conversion job for multiple documents."""
     doc_data = []
     for document in documents:
         file_bytes = await document.read()
         if not is_file_format_supported(file_bytes, document.filename):
-            raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail=f"Unsupported file format: {document.filename}"
+            )
         doc_data.append((document.filename, file_bytes))

     task = convert_documents_task.delay(

@@ -121,15 +226,90 @@ async def create_batch_conversion_job
         image_resolution_scale=image_resolution_scale,
     )

-    return BatchConversionJobResult(job_id=task.id, status="IN_PROGRESS")
+    return BatchConversionJobResult(
+        job_id=task.id,
+        status="IN_PROGRESS"
+    )


 @router.get(
     '/batch-conversion-jobs/{job_id}',
     response_model=BatchConversionJobResult,
     response_model_exclude_unset=True,
-    description="Get the status of a batch conversion job",
+    responses={
+        200: {"description": "All conversion jobs completed successfully"},
+        202: {"description": "Batch job is still in progress"},
+        404: {"description": "Batch job not found"},
+        422: {"description": "Batch job failed"}
+    },
+    description="Get the status and results of a batch conversion job",
 )
 async def get_batch_conversion_job_status(job_id: str):
     """Get the status and results of a batch conversion job."""
-    return document_converter_service.get_batch_conversion_task_result(job_id)
+    try:
+        result = document_converter_service.get_batch_conversion_task_result(job_id)
+
+        # Return 202 Accepted if the batch job or any sub-job is still in progress
+        if result.status in ["IN_PROGRESS"] or any(
+            job.status in ["IN_PROGRESS"]
+            for job in result.conversion_results
+        ):
+            return JSONResponse(
+                status_code=status.HTTP_202_ACCEPTED,
+                content=result.dict(exclude_none=True)
+            )
+
+        # Return 422 for failed batch jobs
+        if result.status == "FAILURE" or any(
+            job.status == "FAILURE"
+            for job in result.conversion_results
+        ):
+            return JSONResponse(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                content=result.dict(exclude_none=True)
+            )
+
+        # Return 200 OK for successful batch jobs (all success)
+        return JSONResponse(
+            status_code=status.HTTP_200_OK,
+            content=result.dict(exclude_none=True)
+        )
+    except KeyError:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail=f"Batch job not found: {job_id}"
+        )
Review comment on lines +248 to +279: Same here.

+@router.get(
+    "/health",
+    responses={
+        200: {"description": "All services are healthy"},
+        500: {"description": "One or more services are unhealthy"}
+    },
+    description="Check the health status of all dependent services"
+)
+async def health_check():
+    try:
+        # Check Celery/Redis connection by sending a ping task
+        result = ping.delay()
+        response = result.get(timeout=3)  # Wait up to 3 seconds for response
+
+        if response != "pong":
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail="Celery/Redis connection test failed"
+            )
+
+        return {
+            "status": "healthy",
+            "services": {
+                "celery": "connected",
+                "redis": "connected",
+                "docling": "connected",
+                "document_converter": "connected",
+            }
+        }
+    except Exception as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=str(e)
+        )
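A quick smoke test of the new endpoint once the stack is up (assuming the default port mapping of 8080):

# Expect HTTP 200 with each service reported as "connected";
# HTTP 500 with an error detail means the Celery/Redis ping failed.
curl -i http://localhost:8080/health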
Entrypoint script (new file)
@@ -0,0 +1,6 @@
#!/bin/sh
set -e


echo "Starting application..."
exec /app/.venv/bin/python -m uvicorn --port 8080 --host 0.0.0.0 main:app
This file was deleted.
pyproject.toml
@@ -1,22 +1,26 @@
-[tool.poetry]
+[project]
 name = "document-to-markdown"
 version = "0.1.0"
 description = ""
-authors = ["drmingler <davidemmanuel75@gmail.com>"]
+authors = [
+    {name = "davidemmanuel75@gmail.com", email = "davidemmanuel75@gmail.com"},
+]
 readme = "README.md"

-[tool.poetry.dependencies]
-python = "^3.12"
-fastapi = "^0.115.4"
-uvicorn = "^0.32.0"
-docling = "^2.25.1"
-python-multipart = "^0.0.17"
-celery = "^5.4.0"
-flower = "^2.0.1"
-redis = "^5.2.0"
-gunicorn = "^23.0.0"
+requires-python = ">=3.12"
+dependencies = [
+    "fastapi>=0.115.4",
+    "uvicorn>=0.32.0",
+    "docling>=2.25.1",
+    "python-multipart>=0.0.17",
+    "celery>=5.4.0",
+    "flower>=2.0.1",
+    "redis>=5.2.0",
+    "gunicorn>=23.0.0",
+]

 [build-system]
-requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["."]
Review comment: Can we also add instructions on how to install uv for those using Windows, or add a link to the uv installation doc? @spa5k
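For reference, the standalone installers documented by Astral (https://docs.astral.sh/uv/getting-started/installation/) are, at the time of writing:

# macOS and Linux
curl -LsSf https://astral.sh/uv/install.sh | sh

# Windows (PowerShell)
powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"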