diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/index.js index b945b88..6e0f367 100644 --- a/apps/js-sdk/firecrawl/build/index.js +++ b/apps/js-sdk/firecrawl/build/index.js @@ -8,6 +8,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge }); }; import axios from "axios"; +import { z } from "zod"; import { zodToJsonSchema } from "zod-to-json-schema"; /** * Main class for interacting with the Firecrawl API. @@ -38,7 +39,11 @@ export default class FirecrawlApp { }; let jsonData = Object.assign({ url }, params); if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { - const schema = zodToJsonSchema(params.extractorOptions.extractionSchema); + let schema = params.extractorOptions.extractionSchema; + // Check if schema is an instance of ZodSchema to correctly identify Zod schemas + if (schema instanceof z.ZodSchema) { + schema = zodToJsonSchema(schema); + } jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); } try { diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 3634730..a9359cf 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.17", + "version": "0.0.19", "description": "JavaScript SDK for Firecrawl API", "main": "build/index.js", "types": "types/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 85253d8..0319c74 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -91,9 +91,11 @@ export default class FirecrawlApp { } as AxiosRequestHeaders; let jsonData: Params = { url, ...params }; if (params?.extractorOptions?.extractionSchema) { - const schema = zodToJsonSchema( - params.extractorOptions.extractionSchema as z.ZodSchema - ); + let schema = params.extractorOptions.extractionSchema; + // Check if schema is an instance of ZodSchema to correctly identify Zod schemas + if (schema instanceof z.ZodSchema) { + schema = zodToJsonSchema(schema); + } jsonData = { ...jsonData, extractorOptions: { diff --git a/apps/js-sdk/package-lock.json b/apps/js-sdk/package-lock.json index 337972f..4d26319 100644 --- a/apps/js-sdk/package-lock.json +++ b/apps/js-sdk/package-lock.json @@ -9,7 +9,7 @@ "version": "1.0.0", "license": "ISC", "dependencies": { - "@mendable/firecrawl-js": "^0.0.17-beta.8", + "@mendable/firecrawl-js": "^0.0.19", "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", @@ -421,11 +421,10 @@ } }, "node_modules/@mendable/firecrawl-js": { - "version": "0.0.17-beta.8", - "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.17-beta.8.tgz", - "integrity": "sha512-d65AW+y4YUQ9oU4Jy8dqiuKBPr+QkAyOKYEwFev/GOpGbNfU6lBUGJlAujVXaVY6fDbUGkHoaEzUbuTsqZV+Ng==", + "version": "0.0.19", + "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.19.tgz", + "integrity": "sha512-u9BDVIN/bftDztxLlE2cf02Nz0si3+Vmy9cANDFHj/iriT3guzI8ITBk4uC81CyRmPzNyXrW6hSAG90g9ol4cA==", "dependencies": { - "@mendable/firecrawl-js": "^0.0.17-beta.5", "axios": "^1.6.8", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index 9492e07..0e93fe3 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -11,7 +11,7 @@ "author": "", "license": "ISC", "dependencies": { - "@mendable/firecrawl-js": "^0.0.17-beta.8", + "@mendable/firecrawl-js": "^0.0.19", "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", diff --git a/apps/js-sdk/test.ts b/apps/js-sdk/test.ts index a35c369..5419c2d 100644 --- a/apps/js-sdk/test.ts +++ b/apps/js-sdk/test.ts @@ -3,7 +3,7 @@ import { z } from "zod"; async function a() { const app = new FirecrawlApp({ - apiKey: "fc-YOUR_FIRECRAWL_API_KEY", + apiKey: "fc-YOUR_API_KEY", }); // Define schema to extract contents into @@ -20,7 +20,7 @@ async function a() { .length(5) .describe("Top 5 stories on Hacker News"), }); - const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", { + const scrapeResult = await app.scrapeUrl("https://firecrawl.dev", { extractorOptions: { extractionSchema: schema }, }); console.log(scrapeResult.data["llm_extraction"]); diff --git a/apps/python-sdk/example.py b/apps/python-sdk/example.py index b178400..3ca84af 100644 --- a/apps/python-sdk/example.py +++ b/apps/python-sdk/example.py @@ -1,13 +1,36 @@ from firecrawl import FirecrawlApp -app = FirecrawlApp(api_key="YOUR_API_KEY") +app = FirecrawlApp(api_key="fc-YOUR_API_KEY") -crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}) -print(crawl_result[0]['markdown']) +# crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}) -job_id = crawl_result['jobId'] -print(job_id) +# print(crawl_result[0]['markdown']) + +# job_id = crawl_result['jobId'] +# print(job_id) + +# status = app.check_crawl_status(job_id) +# print(status) +from pydantic import BaseModel, Field +from typing import List, Optional + +class ArticleSchema(BaseModel): + title: str + points: int + by: str + commentsURL: str + +class TopArticlesSchema(BaseModel): + top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories") + +a = app.scrape_url('https://news.ycombinator.com', { + 'extractorOptions': { + 'extractionSchema': TopArticlesSchema.model_json_schema(), + 'mode': 'llm-extraction' + }, + 'pageOptions':{ + 'onlyMainContent': True + } +}) -status = app.check_crawl_status(job_id) -print(status) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 441b940..e955ffe 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -1,4 +1,5 @@ import os +from typing import Any, Dict, Optional import requests import time @@ -8,26 +9,51 @@ class FirecrawlApp: if self.api_key is None: raise ValueError('No API key provided') - def scrape_url(self, url, params=None): + from pydantic import BaseModel + from typing import Optional, Dict, Any + + class ScrapeParams(BaseModel): + url: str + extractorOptions: Optional[Dict[str, Any]] = None + + def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}' } - json_data = {'url': url} + # Prepare the base scrape parameters with the URL + scrape_params = {'url': url} + + # If there are additional params, process them if params: - json_data.update(params) + # Initialize extractorOptions if present + extractor_options = params.get('extractorOptions', {}) + # Check and convert the extractionSchema if it's a Pydantic model + if 'extractionSchema' in extractor_options: + if hasattr(extractor_options['extractionSchema'], 'schema'): + extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema() + # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided + extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction') + # Update the scrape_params with the processed extractorOptions + scrape_params['extractorOptions'] = extractor_options + + # Include any other params directly at the top level of scrape_params + for key, value in params.items(): + if key != 'extractorOptions': + scrape_params[key] = value + print(scrape_params) + # Make the POST request with the prepared headers and JSON data response = requests.post( 'https://api.firecrawl.dev/v0/scrape', headers=headers, - json=json_data + json=scrape_params ) if response.status_code == 200: response = response.json() - if response['success'] == True: + if response['success']: return response['data'] else: raise Exception(f'Failed to scrape URL. Error: {response["error"]}') - elif response.status_code in [402, 409, 500]: error_message = response.json().get('error', 'Unknown error occurred') raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') diff --git a/apps/python-sdk/setup.py b/apps/python-sdk/setup.py index a3589e3..b870da6 100644 --- a/apps/python-sdk/setup.py +++ b/apps/python-sdk/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages setup( name='firecrawl-py', - version='0.0.6', + version='0.0.7', url='https://github.com/mendableai/firecrawl', author='Mendable.ai', author_email='nick@mendable.ai',