From a48abdfb24cfe74092d5e9225e754d06a098f202 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adem=C3=ADlson=20F=2E=20Tonato?=
Date: Mon, 24 Feb 2025 17:04:10 +0000
Subject: [PATCH 1/4] chore: update Firecrawl version and add FirecrawlExtractTool

---
 .../documentloaders/FireCrawl/FireCrawl.ts    | 329 ++++++++++++++----
 packages/components/package.json              |   2 +-
 2 files changed, 257 insertions(+), 74 deletions(-)

diff --git a/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts b/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts
index a2707d13b07..b1968a343b5 100644
--- a/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts
+++ b/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts
@@ -17,25 +17,24 @@ interface FirecrawlDocumentMetadata {
     title?: string
     description?: string
     language?: string
-    // ... (other metadata fields)
+    sourceURL?: string
+    statusCode?: number
+    error?: string
     [key: string]: any
 }
 
 interface FirecrawlDocument {
-    id?: string
-    url?: string
-    content: string
     markdown?: string
     html?: string
-    llm_extraction?: Record<string, any>
-    createdAt?: Date
-    updatedAt?: Date
-    type?: string
+    rawHtml?: string
+    screenshot?: string
+    links?: string[]
+    actions?: {
+        screenshots?: string[]
+    }
     metadata: FirecrawlDocumentMetadata
-    childrenLinks?: string[]
-    provider?: string
+    llm_extraction?: Record<string, any>
     warning?: string
-    index?: number
 }
 
 interface ScrapeResponse {
@@ -46,9 +45,25 @@
 
 interface CrawlResponse {
     success: boolean
-    jobId?: string
+    id: string
+    url: string
+}
+
+interface CrawlStatusResponse {
+    status: string
+    total: number
+    completed: number
+    creditsUsed: number
+    expiresAt: string
+    next?: string
     data?: FirecrawlDocument[]
-    error?: string
+}
+
+interface ExtractResponse {
+    success: boolean
+    id: string
+    url: string
+    data?: Record<string, any>
 }
 
 interface Params {
@@ -60,6 +75,35 @@
     }
 }
 
+type Format = 'markdown' | 'html' | 'rawHtml' | 'links' | 'screenshot' | 'screenshot@fullPage' | 'json'
+
+interface ExtractRequest {
+    urls: string[]
+    prompt?: string
+    schema?: Record<string, any>
+    enableWebSearch?: boolean
+    ignoreSitemap?: boolean
+    includeSubdomains?: boolean
+    showSources?: boolean
+    scrapeOptions?: {
+        formats: Format[]
+        onlyMainContent?: boolean
+        includeTags?: string[]
+        excludeTags?: string[]
+        mobile?: boolean
+        skipTlsVerification?: boolean
+        timeout?: number
+        [key: string]: any
+    }
+}
+
+interface ExtractStatusResponse {
+    success: boolean
+    data: any
+    status: 'completed' | 'pending' | 'processing' | 'failed' | 'cancelled'
+    expiresAt: string
+}
+
 // FirecrawlApp class (not exported)
 class FirecrawlApp {
     private apiKey: string
@@ -91,7 +135,7 @@
             }
         }
         try {
-            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v0/scrape', jsonData, headers)
+            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/scrape', jsonData, headers)
             if (response.status === 200) {
                 const responseData = response.data
                 if (responseData.success) {
@@ -114,17 +158,17 @@
         waitUntilDone: boolean = true,
         pollInterval: number = 2,
         idempotencyKey?: string
-    ): Promise<CrawlResponse | any> {
+    ): Promise<CrawlResponse | CrawlStatusResponse> {
         const headers = this.prepareHeaders(idempotencyKey)
         let jsonData: Params = { url, ...params }
         try {
-            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v0/crawl', jsonData, headers)
+            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/crawl', jsonData, headers)
             if (response.status === 200) {
-                const jobId: string = response.data.jobId
+                const crawlResponse = response.data as CrawlResponse
                 if (waitUntilDone) {
-                    return this.monitorJobStatus(jobId, headers, pollInterval)
+                    return this.monitorJobStatus(crawlResponse.id, headers, pollInterval)
                 } else {
-                    return { success: true, jobId }
+                    return crawlResponse
                 }
             } else {
                 this.handleError(response, 'start crawl job')
@@ -132,7 +176,46 @@
         } catch (error: any) {
             throw new Error(error.message)
         }
-        return { success: false, error: 'Internal server error.' }
+        return { success: false, id: '', url: '' }
+    }
+
+    async extract(
+        request: ExtractRequest,
+        waitUntilDone: boolean = true,
+        pollInterval: number = 2
+    ): Promise<ExtractResponse | ExtractStatusResponse> {
+        const headers = this.prepareHeaders()
+        try {
+            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/extract', request, headers)
+            if (response.status === 200) {
+                const extractResponse = response.data as ExtractResponse
+                if (waitUntilDone) {
+                    return this.monitorExtractStatus(extractResponse.id, headers, pollInterval)
+                } else {
+                    return extractResponse
+                }
+            } else {
+                this.handleError(response, 'start extract job')
+            }
+        } catch (error: any) {
+            throw new Error(error.message)
+        }
+        return { success: false, id: '', url: '' }
+    }
+
+    async getExtractStatus(jobId: string): Promise<ExtractStatusResponse> {
+        const headers = this.prepareHeaders()
+        try {
+            const response: AxiosResponse = await this.getRequest(this.apiUrl + `/v1/extract/${jobId}`, headers)
+            if (response.status === 200) {
+                return response.data as ExtractStatusResponse
+            } else {
+                this.handleError(response, 'get extract status')
+            }
+        } catch (error: any) {
+            throw new Error(error.message)
+        }
+        return { success: false, data: null, status: 'failed', expiresAt: '' }
     }
 
     private prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
@@ -151,33 +234,58 @@
         return axios.get(url, { headers })
     }
 
-    private async monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise<any> {
+    private async monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise<CrawlStatusResponse> {
         let isJobCompleted = false
         while (!isJobCompleted) {
-            const statusResponse: AxiosResponse = await this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers)
+            const statusResponse: AxiosResponse = await this.getRequest(this.apiUrl + `/v1/crawl/${jobId}`, headers)
             if (statusResponse.status === 200) {
-                const statusData = statusResponse.data
+                const statusData = statusResponse.data as CrawlStatusResponse
                 switch (statusData.status) {
                     case 'completed':
                         isJobCompleted = true
-                        if ('data' in statusData) {
-                            return statusData.data
-                        } else {
-                            throw new Error('Crawl job completed but no data was returned')
+                        return statusData
+                    case 'scraping':
+                    case 'failed':
+                        if (statusData.status === 'failed') {
+                            throw new Error('Crawl job failed')
                         }
-                    case 'active':
-                    case 'paused':
-                    case 'pending':
-                    case 'queued':
                         await new Promise((resolve) => setTimeout(resolve, Math.max(checkInterval, 2) * 1000))
                         break
                     default:
-                        throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`)
+                        throw new Error(`Unknown crawl status: ${statusData.status}`)
                 }
             } else {
                 this.handleError(statusResponse, 'check crawl status')
             }
         }
+        throw new Error('Failed to monitor job status')
+    }
+
+    private async monitorExtractStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise<ExtractStatusResponse> {
+        let isJobCompleted = false
+        while (!isJobCompleted) {
+            const statusResponse: AxiosResponse = await this.getRequest(this.apiUrl + `/v1/extract/${jobId}`, headers)
+            if (statusResponse.status === 200) {
+                const statusData = statusResponse.data as ExtractStatusResponse
+                switch (statusData.status) {
+                    case 'completed':
+                        isJobCompleted = true
+                        return statusData
+                    case 'processing':
+                    case 'failed':
+                        if (statusData.status === 'failed') {
+                            throw new Error('Extract job failed')
+                        }
+                        await new Promise((resolve) => setTimeout(resolve, Math.max(checkInterval, 2) * 1000))
+                        break
+                    default:
+                        throw new Error(`Unknown extract status: ${statusData.status}`)
+                }
+            } else {
+                this.handleError(statusResponse, 'check extract status')
+            }
+        }
+        throw new Error('Failed to monitor extract status')
     }
 
     private handleError(response: AxiosResponse, action: string): void {
@@ -195,7 +303,7 @@
 interface FirecrawlLoaderParameters {
     url: string
     apiKey?: string
     apiUrl?: string
-    mode?: 'crawl' | 'scrape'
+    mode?: 'crawl' | 'scrape' | 'extract' | 'getExtractStatus'
     params?: Record<string, unknown>
 }
@@ -203,7 +311,7 @@ class FireCrawlLoader extends BaseDocumentLoader {
     private apiKey: string
     private apiUrl: string
     private url: string
-    private mode: 'crawl' | 'scrape'
+    private mode: 'crawl' | 'scrape' | 'extract' | 'getExtractStatus'
     private params?: Record<string, unknown>
 
     constructor(loaderParams: FirecrawlLoaderParameters) {
@@ -232,9 +340,26 @@
             }
             firecrawlDocs = [response.data as FirecrawlDocument]
         } else if (this.mode === 'crawl') {
             const response = await app.crawlUrl(this.url, this.params, true)
-            firecrawlDocs = response as FirecrawlDocument[]
+            if ('data' in response) {
+                firecrawlDocs = response.data || []
+            } else {
+                throw new Error('Crawl completed but no data was returned')
+            }
+        } else if (this.mode === 'extract') {
+            this.params!.urls = [this.url]
+            const response = await app.extract(this.params as any as ExtractRequest)
+            if (!response.success) {
+                throw new Error(`Firecrawl: Failed to extract URL.`)
+            }
+            firecrawlDocs = [response.data as FirecrawlDocument]
+        } else if (this.mode === 'getExtractStatus') {
+            const response = await app.getExtractStatus(this.params as any as string)
+            if (!response.success) {
+                throw new Error(`Firecrawl: Failed to get extract status.`)
+            }
+            return response.data
         } else {
-            throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`)
+            throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape', 'extract'.`)
         }
 
         return firecrawlDocs.map(
@@ -287,7 +412,7 @@
                 label: 'URLs',
                 name: 'url',
                 type: 'string',
-                description: 'URL to be crawled/scraped',
+                description: 'URL to be crawled/scraped/extracted',
                 placeholder: 'https://docs.flowiseai.com'
             },
             {
@@ -304,22 +429,60 @@
                         label: 'Scrape',
                         name: 'scrape',
                         description: 'Scrape a URL and get its content'
+                    },
+                    {
+                        label: 'Extract',
+                        name: 'extract',
+                        description: 'Extract data from a URL'
+                    },
+                    {
+                        label: 'Get extract status (DATA)',
+                        name: 'getExtractStatus',
+                        description: 'Get the status of an extract job'
                     }
                 ],
                 default: 'crawl'
             },
             {
-                // maxCrawlPages
-                label: 'Max Crawl Pages',
-                name: 'maxCrawlPages',
+                // includeTags
+                label: '[Scrape] Include Tags',
+                name: 'includeTags',
                 type: 'string',
-                description: 'Maximum number of pages to crawl',
+                description: 'Tags to include in the output',
                 optional: true,
                 additionalParams: true
+            },
+            {
+                // excludeTags
+                label: '[Scrape] Exclude Tags',
+                name: 'excludeTags',
+                type: 'string',
+                description: 'Tags to exclude from the output',
+                optional: true,
+                additionalParams: true
+            },
+            {
+                // onlyMainContent
+                label: '[Scrape] Only Main Content',
+                name: 'onlyMainContent',
+                type: 'boolean',
+                description: 'Extract only the main content of the page',
+                optional: true,
+                additionalParams: true
+            },
+            {
+                // limit
+                label: '[Crawl] Limit',
+                name: 'limit',
+                type: 'string',
+                description: 'Maximum number of pages to crawl',
+                optional: true,
+                additionalParams: true,
+                default: '10000'
             },
             {
                 // generateImgAltText
-                label: 'Generate Image Alt Text',
+                label: '[Crawl] Generate Image Alt Text',
                 name: 'generateImgAltText',
                 type: 'boolean',
                 description: 'Generate alt text for images',
@@ -327,37 +490,54 @@
                 optional: true,
                 additionalParams: true
             },
             {
-                // returnOnlyUrls
-                label: 'Return Only URLs',
-                name: 'returnOnlyUrls',
-                type: 'boolean',
-                description: 'Return only URLs of the crawled pages',
+                label: '[Extract] Schema',
+                name: 'extractSchema',
+                type: 'json',
+                description: 'JSON schema for data extraction',
                 optional: true,
                 additionalParams: true
             },
             {
-                // onlyMainContent
-                label: 'Only Main Content',
-                name: 'onlyMainContent',
-                type: 'boolean',
-                description: 'Extract only the main content of the page',
+                label: '[Extract] Prompt',
+                name: 'extractPrompt',
+                type: 'string',
+                description: 'Prompt for data extraction',
+                optional: true,
+                additionalParams: true
+            },
+            {
+                label: '[Extract] Job ID',
+                name: 'extractJobId',
+                type: 'string',
+                description: 'ID of the extract job',
                 optional: true,
                 additionalParams: true
             }
-            // ... (other input parameters)
         ]
         this.outputs = [
             {
-                label: 'Document',
-                name: 'document',
-                description: 'Array of document objects containing metadata and pageContent',
-                baseClasses: [...this.baseClasses, 'json']
+                label: 'Scrape',
+                name: 'scrape',
+                description: 'Scraped data',
+                baseClasses: ['json']
+            },
+            {
+                label: 'Crawl',
+                name: 'crawl',
+                description: 'Crawled data',
+                baseClasses: ['json']
             },
             {
-                label: 'Text',
-                name: 'text',
-                description: 'Concatenated string from pageContent of documents',
-                baseClasses: ['string', 'json']
+                label: 'Extract',
+                name: 'extract',
+                description: 'ID of the extract job',
+                baseClasses: ['json']
+            },
+            {
+                label: 'Extract Status',
+                name: 'extractStatus',
+                description: 'Status of the extract job',
+                baseClasses: ['json']
             }
         ]
     }
@@ -367,9 +547,8 @@
         const metadata = nodeData.inputs?.metadata
         const url = nodeData.inputs?.url as string
         const crawlerType = nodeData.inputs?.crawlerType as string
-        const maxCrawlPages = nodeData.inputs?.maxCrawlPages as string
+        const limit = nodeData.inputs?.limit as string
         const generateImgAltText = nodeData.inputs?.generateImgAltText as boolean
-        const returnOnlyUrls = nodeData.inputs?.returnOnlyUrls as boolean
         const onlyMainContent = nodeData.inputs?.onlyMainContent as boolean
         const credentialData = await getCredentialData(nodeData.credential ?? '', options)
         const firecrawlApiToken = getCredentialParam('firecrawlApiToken', credentialData, nodeData)
@@ -383,21 +562,25 @@
             ? (nodeData.inputs.urlPatternsIncludes.split(',') as string[])
             : undefined
 
+        const extractSchema = nodeData.inputs?.extractSchema
+        const extractPrompt = nodeData.inputs?.extractPrompt as string
+
         const input: FirecrawlLoaderParameters = {
             url,
-            mode: crawlerType as 'crawl' | 'scrape',
+            mode: crawlerType as 'crawl' | 'scrape' | 'extract' | 'getExtractStatus',
             apiKey: firecrawlApiToken,
             apiUrl: firecrawlApiUrl,
             params: {
-                crawlerOptions: {
-                    includes: urlPatternsIncludes,
-                    excludes: urlPatternsExcludes,
-                    generateImgAltText,
-                    returnOnlyUrls,
-                    limit: maxCrawlPages ? parseFloat(maxCrawlPages) : undefined
-                },
-                pageOptions: {
-                    onlyMainContent
+                includePaths: urlPatternsIncludes,
+                excludePaths: urlPatternsExcludes,
+                generateImgAltText,
+                limit: limit ? parseFloat(limit) : 1000,
+                onlyMainContent,
+                includeTags: nodeData.inputs?.includeTags,
+                excludeTags: nodeData.inputs?.excludeTags,
+                extractOptions: {
+                    schema: extractSchema ?? undefined,
+                    prompt: extractPrompt ?? undefined
                 }
             }
         }

diff --git a/packages/components/package.json b/packages/components/package.json
index 5d83568c6c4..ef923a39341 100644
--- a/packages/components/package.json
+++ b/packages/components/package.json
@@ -57,7 +57,7 @@
         "@langchain/qdrant": "^0.0.5",
         "@langchain/weaviate": "^0.0.1",
         "@langchain/xai": "^0.0.1",
-        "@mendable/firecrawl-js": "^0.0.28",
+        "@mendable/firecrawl-js": "^1.18.2",
         "@mistralai/mistralai": "0.1.3",
         "@notionhq/client": "^2.2.8",
         "@opensearch-project/opensearch": "^1.2.0",
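Note for reviewers: the extract support added in PATCH 1/4 is a start-then-poll round trip against the v1 API. The sketch below shows that flow standalone, using only the `/v1/extract` endpoints and response fields that appear in the diff above; the API key, target URL, and prompt are placeholders, not values from this PR.

```typescript
import axios from 'axios'

// Placeholders for illustration only — substitute a real key and target site.
const apiUrl = 'https://api.firecrawl.dev'
const headers = { 'Content-Type': 'application/json', Authorization: 'Bearer fc-YOUR-KEY' }

async function runExtract(): Promise<any> {
    // Start the job (mirrors FirecrawlApp.extract in the patch).
    const start = await axios.post(
        `${apiUrl}/v1/extract`,
        { urls: ['https://firecrawl.dev/*'], prompt: 'Extract the company name.' },
        { headers }
    )
    const jobId: string = start.data.id

    // Poll until the job settles (mirrors monitorExtractStatus, 2s interval).
    for (;;) {
        const status = (await axios.get(`${apiUrl}/v1/extract/${jobId}`, { headers })).data
        if (status.status === 'completed') return status.data
        if (status.status === 'failed' || status.status === 'cancelled') {
            throw new Error(`Extract job ${status.status}`)
        }
        await new Promise((resolve) => setTimeout(resolve, 2000))
    }
}
```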
From 637fc5f9e8444574b5ad500ea3713ce2e6c80e16 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adem=C3=ADlson=20F=2E=20Tonato?=
Date: Thu, 27 Feb 2025 23:46:10 +0000
Subject: [PATCH 2/4] refactor: update outputs format

---
 .../documentloaders/FireCrawl/FireCrawl.ts | 28 ++++++-------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts b/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts
index b1968a343b5..6a516e98e6c 100644
--- a/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts
+++ b/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts
@@ -516,28 +516,16 @@
         ]
         this.outputs = [
             {
-                label: 'Scrape',
-                name: 'scrape',
-                description: 'Scraped data',
-                baseClasses: ['json']
+                label: 'Document',
+                name: 'document',
+                description: 'Array of document objects containing metadata and pageContent',
+                baseClasses: [...this.baseClasses, 'json']
             },
             {
-                label: 'Crawl',
-                name: 'crawl',
-                description: 'Crawled data',
-                baseClasses: ['json']
-            },
-            {
-                label: 'Extract',
-                name: 'extract',
-                description: 'ID of the extract job',
-                baseClasses: ['json']
-            },
-            {
-                label: 'Extract Status',
-                name: 'extractStatus',
-                description: 'Status of the extract job',
-                baseClasses: ['json']
+                label: 'Text',
+                name: 'text',
+                description: 'Concatenated string from pageContent of documents',
+                baseClasses: ['string', 'json']
             }
         ]
     }

From f2ab76a95ba47eafd21094de229e291447ea469b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adem=C3=ADlson=20F=2E=20Tonato?=
Date: Fri, 21 Mar 2025 13:24:12 +0000
Subject: [PATCH 3/4] chore: update Firecrawl request headers to include X-Origin and X-Origin-Type

---
 .../components/nodes/documentloaders/FireCrawl/FireCrawl.ts | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts b/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts
index 6a516e98e6c..544523b8a19 100644
--- a/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts
+++ b/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts
@@ -222,8 +222,10 @@
         return {
             'Content-Type': 'application/json',
             Authorization: `Bearer ${this.apiKey}`,
+            'X-Origin': 'flowise',
+            'X-Origin-Type': 'integration',
             ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {})
-        } as AxiosRequestHeaders & { 'x-idempotency-key'?: string }
+        } as AxiosRequestHeaders & { 'X-Origin': string; 'X-Origin-Type': string; 'x-idempotency-key'?: string }
     }
 
     private postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
From 638f52e8b0e7cfd63800a7062eee530740771be4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adem=C3=ADlson=20F=2E=20Tonato?=
Date: Mon, 24 Mar 2025 21:48:32 +0000
Subject: [PATCH 4/4] feat: add FireCrawl testing suite for scraping, crawling, and data extraction

- Introduced FireCrawl-TEST.ts to validate FireCrawlLoader functionality.
- Implemented tests for basic scraping, crawling with text splitting, data extraction, and extract status retrieval.
- Enhanced error handling in FireCrawlLoader for better debugging.
---
 .../FireCrawl/FireCrawl-TEST.ts               | 147 ++++++++++++++++++
 .../documentloaders/FireCrawl/FireCrawl.ts    |  59 ++++---
 2 files changed, 182 insertions(+), 24 deletions(-)
 create mode 100644 packages/components/nodes/documentloaders/FireCrawl/FireCrawl-TEST.ts

diff --git a/packages/components/nodes/documentloaders/FireCrawl/FireCrawl-TEST.ts b/packages/components/nodes/documentloaders/FireCrawl/FireCrawl-TEST.ts
new file mode 100644
index 00000000000..651728223ab
--- /dev/null
+++ b/packages/components/nodes/documentloaders/FireCrawl/FireCrawl-TEST.ts
@@ -0,0 +1,147 @@
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'
+import { FireCrawlLoader } from './FireCrawl'
+
+async function testFireCrawl() {
+    const apiKey = process.env.FIRECRAWL_API_KEY || 'FIRECRAWL_API_KEY'
+    const apiUrl = 'https://api.firecrawl.dev'
+
+    // Test URLs
+    const testUrl = 'https://firecrawl.dev/'
+    const testUrlForExtract = 'https://firecrawl.dev/*'
+
+    // Test 1: Basic Scraping
+    console.log('\n=== Testing Basic Scraping ===')
+    try {
+        const scrapeLoader = new FireCrawlLoader({
+            url: testUrl,
+            apiKey,
+            apiUrl,
+            mode: 'scrape',
+            params: {
+                onlyMainContent: true,
+                includeTags: ['article', 'main', 'section'],
+                excludeTags: ['header', 'footer', 'nav']
+            }
+        })
+        const scrapeDocs = await scrapeLoader.load()
+        console.log('Scrape Results:', {
+            numDocs: scrapeDocs.length,
+            firstDocMetadata: scrapeDocs[0]?.metadata,
+            firstDocContent: scrapeDocs[0]?.pageContent.substring(0, 100) + '...'
+        })
+    } catch (error) {
+        console.error('Scraping Error:', error)
+    }
+
+    // Test 2: Crawling with Text Splitter
+    console.log('\n=== Testing Crawling with Text Splitter ===')
+    try {
+        const textSplitter = new RecursiveCharacterTextSplitter({
+            chunkSize: 1000,
+            chunkOverlap: 200
+        })
+        const crawlLoader = new FireCrawlLoader({
+            url: testUrl,
+            apiKey,
+            apiUrl,
+            mode: 'crawl',
+            params: {
+                limit: 5, // Limit to 5 pages for testing
+                includePaths: ['/docs', '/blog'],
+                excludePaths: ['/api', '/admin']
+            }
+        })
+
+        console.log('Starting crawl with params:', {
+            url: testUrl,
+            apiKey: apiKey.substring(0, 8) + '...',
+            apiUrl,
+            mode: 'crawl'
+        })
+
+        const crawlDocs = await crawlLoader.load()
+
+        if (!crawlDocs || crawlDocs.length === 0) {
+            console.warn('No documents were returned from the crawl')
+            return
+        }
+
+        console.log('Crawl Results:', {
+            numDocs: crawlDocs.length,
+            firstDocMetadata: crawlDocs[0]?.metadata,
+            firstDocContent: crawlDocs[0]?.pageContent.substring(0, 100) + '...'
+        })
+    } catch (error: any) {
+        console.error('Crawling Error Details:', {
+            message: error.message,
+            stack: error.stack,
+            response: error.response?.data,
+            status: error.response?.status
+        })
+    }
+
+    // Test 3: Data Extraction
+    console.log('\n=== Testing Data Extraction ===')
+    try {
+        const extractLoader = new FireCrawlLoader({
+            url: testUrlForExtract,
+            apiKey,
+            apiUrl,
+            mode: 'extract',
+            params: {
+                schema: {
+                    type: 'object',
+                    properties: {
+                        company: {
+                            type: 'object',
+                            properties: {
+                                name: {
+                                    type: 'string'
+                                },
+                                mission: {
+                                    type: 'string'
+                                },
+                                is_open_source: {
+                                    type: 'boolean'
+                                }
+                            },
+                            required: ['name']
+                        }
+                    },
+                    required: ['company']
+                },
+                prompt: 'Extract the company name, mission, and determine if the company is open source.'
+            }
+        })
+        const extractDocs = await extractLoader.load()
+        console.log('Extract Results:', {
+            numDocs: extractDocs.length,
+            firstDocMetadata: extractDocs[0]?.metadata,
+            firstDocContent: extractDocs[0]?.pageContent
+        })
+    } catch (error) {
+        console.error('Extraction Error:', error)
+    }
+
+    // Test 4: Get Extract Status
+    console.log('\n=== Testing Get Extract Status ===')
+    try {
+        const statusLoader = new FireCrawlLoader({
+            url: testUrl,
+            apiKey,
+            apiUrl,
+            mode: 'getExtractStatus',
+            params: { jobId: 'EXTRACT_JOB_ID' } // Replace with an actual job ID
+        })
+        const statusResult = await statusLoader.load()
+        console.log('Status Results:', statusResult)
+    } catch (error) {
+        console.error('Status Check Error:', error)
+    }
+}
+
+// Run the tests
+testFireCrawl().catch((error) => {
+    console.error('Fatal error:', error)
+    process.exit(1)
+})
diff --git a/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts b/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts
index 544523b8a19..a72e6017e9a 100644
--- a/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts
+++ b/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts
@@ -47,6 +47,7 @@ interface CrawlResponse {
     success: boolean
     id: string
     url: string
+    error?: string
 }
 
 interface CrawlStatusResponse {
@@ -165,6 +166,10 @@
             const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/crawl', jsonData, headers)
             if (response.status === 200) {
                 const crawlResponse = response.data as CrawlResponse
+                if (!crawlResponse.success) {
+                    throw new Error(`Crawl request failed: ${crawlResponse.error || 'Unknown error'}`)
+                }
+
                 if (waitUntilDone) {
                     return this.monitorJobStatus(crawlResponse.id, headers, pollInterval)
                 } else {
@@ -174,8 +179,13 @@
                 this.handleError(response, 'start crawl job')
             }
         } catch (error: any) {
-            throw new Error(error.message)
+            if (error.response?.data?.error) {
+                throw new Error(`Crawl failed: ${error.response.data.error}`)
+            }
+
+            throw new Error(`Crawl failed: ${error.message}`)
         }
+
         return { success: false, id: '', url: '' }
     }
@@ -309,7 +319,7 @@ interface FirecrawlLoaderParameters {
     params?: Record<string, unknown>
 }
 
-class FireCrawlLoader extends BaseDocumentLoader {
+export class FireCrawlLoader extends BaseDocumentLoader {
     private apiKey: string
     private apiUrl: string
     private url: string
@@ -355,7 +365,8 @@
             }
             firecrawlDocs = [response.data as FirecrawlDocument]
         } else if (this.mode === 'getExtractStatus') {
-            const response = await app.getExtractStatus(this.params as any as string)
+            const jobId = this.params?.jobId as string
+            const response = await app.getExtractStatus(jobId)
             if (!response.success) {
                 throw new Error(`Firecrawl: Failed to get extract status.`)
             }
@@ -364,6 +375,14 @@
             return response.data
         } else {
             throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape', 'extract'.`)
         }
 
+        if (this.mode === 'extract') {
+            const newDoc = new Document({
+                pageContent: JSON.stringify(firecrawlDocs),
+                metadata: {}
+            })
+            return [newDoc]
+        }
+
         return firecrawlDocs.map(
@@ -482,15 +501,6 @@
                 additionalParams: true,
                 default: '10000'
             },
-            {
-                // generateImgAltText
-                label: '[Crawl] Generate Image Alt Text',
-                name: 'generateImgAltText',
-                type: 'boolean',
-                description: 'Generate alt text for images',
-                optional: true,
-                additionalParams: true
-            },
             {
                 label: '[Extract] Schema',
                 name: 'extractSchema',
@@ -538,7 +548,6 @@
         const url = nodeData.inputs?.url as string
         const crawlerType = nodeData.inputs?.crawlerType as string
         const limit = nodeData.inputs?.limit as string
-        const generateImgAltText = nodeData.inputs?.generateImgAltText as boolean
         const onlyMainContent = nodeData.inputs?.onlyMainContent as boolean
         const credentialData = await getCredentialData(nodeData.credential ?? '', options)
         const firecrawlApiToken = getCredentialParam('firecrawlApiToken', credentialData, nodeData)
@@ -561,17 +570,16 @@
             apiKey: firecrawlApiToken,
             apiUrl: firecrawlApiUrl,
             params: {
-                includePaths: urlPatternsIncludes,
-                excludePaths: urlPatternsExcludes,
-                generateImgAltText,
-                limit: limit ? parseFloat(limit) : 1000,
-                onlyMainContent,
-                includeTags: nodeData.inputs?.includeTags,
-                excludeTags: nodeData.inputs?.excludeTags,
-                extractOptions: {
-                    schema: extractSchema ?? undefined,
-                    prompt: extractPrompt ?? undefined
-                }
+                scrapeOptions: {
+                    includePaths: urlPatternsIncludes,
+                    excludePaths: urlPatternsExcludes,
+                    limit: limit ? parseFloat(limit) : 1000,
+                    onlyMainContent,
+                    includeTags: nodeData.inputs?.includeTags,
+                    excludeTags: nodeData.inputs?.excludeTags
+                },
+                schema: extractSchema ?? undefined,
+                prompt: extractPrompt ?? undefined
             }
         }
         const loader = new FireCrawlLoader(input)
@@ -613,3 +621,6 @@
 }
 
 module.exports = { nodeClass: FireCrawl_DocumentLoaders }
+
+// FOR TESTING PURPOSES
+// export { FireCrawl_DocumentLoaders }
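Note for reviewers: since PATCH 4/4 exports `FireCrawlLoader` directly from FireCrawl.ts, the loader can be exercised outside a Flowise flow, much as FireCrawl-TEST.ts does. A minimal sketch of scrape mode, assuming a valid key in the `FIRECRAWL_API_KEY` environment variable (the URL and params below are illustrative, not part of this PR):

```typescript
import { FireCrawlLoader } from './FireCrawl'

async function main() {
    // Scrape a single page; params mirror the node's [Scrape] inputs.
    const loader = new FireCrawlLoader({
        url: 'https://firecrawl.dev/',
        apiKey: process.env.FIRECRAWL_API_KEY,
        apiUrl: 'https://api.firecrawl.dev',
        mode: 'scrape',
        params: { onlyMainContent: true }
    })

    const docs = await loader.load()
    console.log(`Loaded ${docs.length} document(s)`)
    console.log(docs[0]?.pageContent.substring(0, 200))
}

main().catch(console.error)
```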