diff --git a/docs/docs.json b/docs/docs.json index 86b73cf527..cf7d0a3949 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -312,6 +312,7 @@ "group": "Python guides", "pages": [ "guides/python/python-image-processing", + "guides/python/python-doc-to-markdown", "guides/python/python-crawl4ai", "guides/python/python-pdf-form-extractor" ] diff --git a/docs/guides/introduction.mdx b/docs/guides/introduction.mdx index add0c353c1..9a7e7761eb 100644 --- a/docs/guides/introduction.mdx +++ b/docs/guides/introduction.mdx @@ -29,6 +29,7 @@ Get set up fast using our detailed walk-through guides. | [Cursor rules](/guides/cursor-rules) | Use Cursor rules to help write Trigger.dev tasks | | [Prisma](/guides/frameworks/prisma) | How to setup Prisma with Trigger.dev | | [Python image processing](/guides/python/python-image-processing) | Use Python and Pillow to process images | +| [Python document to markdown](/guides/python/python-doc-to-markdown) | Use Python and MarkItDown to convert documents to markdown | | [Python PDF form extractor](/guides/python/python-pdf-form-extractor) | Use Python, PyMuPDF and Trigger.dev to extract data from a PDF form | | [Python web crawler](/guides/python/python-crawl4ai) | Use Python, Crawl4AI and Playwright to create a headless web crawler | | [Sequin database triggers](/guides/frameworks/sequin) | Trigger tasks from database changes using Sequin | diff --git a/docs/guides/python/python-crawl4ai.mdx b/docs/guides/python/python-crawl4ai.mdx index c821bf6c98..526d0eaf58 100644 --- a/docs/guides/python/python-crawl4ai.mdx +++ b/docs/guides/python/python-crawl4ai.mdx @@ -1,6 +1,6 @@ --- title: "Python headless browser web crawler example" -sidebarTitle: "Python headless web crawler" +sidebarTitle: "Headless web crawler" description: "Learn how to use Python, Crawl4AI and Playwright to create a headless browser web crawler with Trigger.dev." --- diff --git a/docs/guides/python/python-doc-to-markdown.mdx b/docs/guides/python/python-doc-to-markdown.mdx new file mode 100644 index 0000000000..a36a240cec --- /dev/null +++ b/docs/guides/python/python-doc-to-markdown.mdx @@ -0,0 +1,224 @@ +--- +title: "Convert documents to markdown using Python and MarkItDown" +sidebarTitle: "Convert docs to markdown" +description: "Learn how to use Trigger.dev with Python to convert documents to markdown using MarkItDown." +--- + +import PythonLearnMore from "/snippets/python-learn-more.mdx"; + + + This project uses Trigger.dev v4 (which is currently in beta as of 28 April 2025). If you want to + run this project you will need to [upgrade to v4](/upgrade-to-v4). + + +## Overview + +Convert documents to markdown using Microsoft's [MarkItDown](https://github.com/microsoft/markitdown) library. This can be especially useful for preparing documents in a structured format for AI applications. + +## Prerequisites + +- A project with [Trigger.dev initialized](/quick-start) +- [Python](https://www.python.org/) installed on your local machine. _This example requires Python 3.10 or higher._ + +## Features + +- A Trigger.dev task which downloads a document from a URL and runs the Python script which converts it to markdown +- A Python script to convert documents to markdown using Microsoft's [MarkItDown](https://github.com/microsoft/markitdown) library +- Uses our [Python build extension](/config/extensions/pythonExtension) to install dependencies and run Python scripts + +## GitHub repo + + + Click here to view the full code for this project in our examples repository on GitHub. You can + fork it and use it as a starting point for your own project. + + +## The code + +### Build configuration + +After you've initialized your project with Trigger.dev, add these build settings to your `trigger.config.ts` file: + +```ts trigger.config.ts +import { pythonExtension } from "@trigger.dev/python/extension"; +import { defineConfig } from "@trigger.dev/sdk/v3"; + +export default defineConfig({ + runtime: "node", + project: "", + // Your other config settings... + build: { + extensions: [ + pythonExtension({ + // The path to your requirements.txt file + requirementsFile: "./requirements.txt", + // The path to your Python binary + devPythonBinaryPath: `venv/bin/python`, + // The paths to your Python scripts to run + scripts: ["src/python/**/*.py"], + }), + ], + }, +}); +``` + + + Learn more about executing scripts in your Trigger.dev project using our Python build extension + [here](/config/extensions/pythonExtension). + + +### Task code + +This task uses the `python.runScript` method to run the `markdown-converter.py` script with the given document URL as an argument. + +```ts src/trigger/convertToMarkdown.ts +import { task } from "@trigger.dev/sdk/v3"; +import { python } from "@trigger.dev/python"; +import * as fs from "fs"; +import * as path from "path"; +import * as os from "os"; + +export const convertToMarkdown = task({ + id: "convert-to-markdown", + run: async (payload: { url: string }) => { + const { url } = payload; + + // STEP 1: Create temporary file with unique name + const tempDir = os.tmpdir(); + const fileName = `doc-${Date.now()}-${Math.random().toString(36).substring(2, 7)}`; + const urlPath = new URL(url).pathname; + const extension = path.extname(urlPath) || ".docx"; + const tempFilePath = path.join(tempDir, `${fileName}${extension}`); + + // STEP 2: Download file from URL + const response = await fetch(url); + const buffer = await response.arrayBuffer(); + await fs.promises.writeFile(tempFilePath, Buffer.from(buffer)); + + // STEP 3: Run Python script to convert document to markdown + const pythonResult = await python.runScript("./src/python/markdown-converter.py", [ + JSON.stringify({ file_path: tempFilePath }), + ]); + + // STEP 4: Clean up temporary file + fs.unlink(tempFilePath, () => {}); + + // STEP 5: Process result + if (pythonResult.stdout) { + const result = JSON.parse(pythonResult.stdout); + return { + url, + markdown: result.status === "success" ? result.markdown : null, + error: result.status === "error" ? result.error : null, + success: result.status === "success", + }; + } + + return { + url, + markdown: null, + error: "No output from Python script", + success: false, + }; + }, +}); +``` + +### Add a requirements.txt file + +Add the following to your `requirements.txt` file. This is required in Python projects to install the dependencies. + +```txt requirements.txt +markitdown[all] +``` + +### The Python script + +The Python script uses MarkItDown to convert documents to Markdown format. + +```python src/python/markdown-converter.py +import json +import sys +import os +from markitdown import MarkItDown + +def convert_to_markdown(file_path): + """Convert a file to markdown format using MarkItDown""" + # Check if file exists + if not os.path.exists(file_path): + raise FileNotFoundError(f"File not found: {file_path}") + + # Initialize MarkItDown + md = MarkItDown() + + # Convert the file + try: + result = md.convert(file_path) + return result.text_content + except Exception as e: + raise Exception(f"Error converting file: {str(e)}") + +def process_trigger_task(file_path): + """Process a file and convert to markdown""" + try: + markdown_result = convert_to_markdown(file_path) + return { + "status": "success", + "markdown": markdown_result + } + except Exception as e: + return { + "status": "error", + "error": str(e) + } + +if __name__ == "__main__": + # Get the file path from command line arguments + if len(sys.argv) < 2: + print(json.dumps({"status": "error", "error": "No file path provided"})) + sys.exit(1) + + try: + config = json.loads(sys.argv[1]) + file_path = config.get("file_path") + + if not file_path: + print(json.dumps({"status": "error", "error": "No file path specified in config"})) + sys.exit(1) + + result = process_trigger_task(file_path) + print(json.dumps(result)) + except Exception as e: + print(json.dumps({"status": "error", "error": str(e)})) + sys.exit(1) +``` + +## Testing your task + +1. Create a virtual environment `python -m venv venv` +2. Activate the virtual environment, depending on your OS: On Mac/Linux: `source venv/bin/activate`, on Windows: `venv\Scripts\activate` +3. Install the Python dependencies `pip install -r requirements.txt`. _Make sure you have Python 3.10 or higher installed._ +4. Copy the project ref from your [Trigger.dev dashboard](https://cloud.trigger.dev) and add it to the `trigger.config.ts` file. +5. Run the Trigger.dev CLI `dev` command (it may ask you to authorize the CLI if you haven't already). +6. Test the task in the dashboard by providing a valid document URL. +7. Deploy the task to production using the Trigger.dev CLI `deploy` command. + +## MarkItDown Conversion Capabilities + +- Convert various file formats to Markdown: + - Office formats (Word, PowerPoint, Excel) + - PDFs + - Images (with optional LLM-generated descriptions) + - HTML, CSV, JSON, XML + - Audio files (with optional transcription) + - ZIP archives + - And more +- Preserve document structure (headings, lists, tables, etc.) +- Handle multiple input methods (file paths, URLs, base64 data) +- Optional Azure Document Intelligence integration for better PDF and image conversion + + diff --git a/docs/guides/python/python-image-processing.mdx b/docs/guides/python/python-image-processing.mdx index 64e73ecdae..0f81d2b54f 100644 --- a/docs/guides/python/python-image-processing.mdx +++ b/docs/guides/python/python-image-processing.mdx @@ -1,6 +1,6 @@ --- title: "Python image processing example" -sidebarTitle: "Python image processing" +sidebarTitle: "Process images" description: "Learn how to use Trigger.dev with Python to process images from URLs and upload them to S3." --- diff --git a/docs/guides/python/python-pdf-form-extractor.mdx b/docs/guides/python/python-pdf-form-extractor.mdx index a62f0e8dc1..3367ea9baf 100644 --- a/docs/guides/python/python-pdf-form-extractor.mdx +++ b/docs/guides/python/python-pdf-form-extractor.mdx @@ -1,6 +1,6 @@ --- title: "Python PDF form extractor example" -sidebarTitle: "Python PDF form extractor" +sidebarTitle: "Extract form data from PDFs" description: "Learn how to use Trigger.dev with Python to extract form data from PDF files." ---