0

Nick: fix JS SDK and Pydantic schema handling

This commit is contained in:
Nicolas 2024-05-08 17:16:59 -07:00
parent c89964b230
commit e6dbbf1bab
9 changed files with 82 additions and 27 deletions

View File

@ -8,6 +8,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
}); });
}; };
import axios from "axios"; import axios from "axios";
import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema"; import { zodToJsonSchema } from "zod-to-json-schema";
/** /**
* Main class for interacting with the Firecrawl API. * Main class for interacting with the Firecrawl API.
@ -38,7 +39,11 @@ export default class FirecrawlApp {
}; };
let jsonData = Object.assign({ url }, params); let jsonData = Object.assign({ url }, params);
if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) {
const schema = zodToJsonSchema(params.extractorOptions.extractionSchema); let schema = params.extractorOptions.extractionSchema;
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
if (schema instanceof z.ZodSchema) {
schema = zodToJsonSchema(schema);
}
jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) });
} }
try { try {

View File

@ -1,6 +1,6 @@
{ {
"name": "@mendable/firecrawl-js", "name": "@mendable/firecrawl-js",
"version": "0.0.17", "version": "0.0.19",
"description": "JavaScript SDK for Firecrawl API", "description": "JavaScript SDK for Firecrawl API",
"main": "build/index.js", "main": "build/index.js",
"types": "types/index.d.ts", "types": "types/index.d.ts",

View File

@ -91,9 +91,11 @@ export default class FirecrawlApp {
} as AxiosRequestHeaders; } as AxiosRequestHeaders;
let jsonData: Params = { url, ...params }; let jsonData: Params = { url, ...params };
if (params?.extractorOptions?.extractionSchema) { if (params?.extractorOptions?.extractionSchema) {
const schema = zodToJsonSchema( let schema = params.extractorOptions.extractionSchema;
params.extractorOptions.extractionSchema as z.ZodSchema // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
); if (schema instanceof z.ZodSchema) {
schema = zodToJsonSchema(schema);
}
jsonData = { jsonData = {
...jsonData, ...jsonData,
extractorOptions: { extractorOptions: {

View File

@ -9,7 +9,7 @@
"version": "1.0.0", "version": "1.0.0",
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"@mendable/firecrawl-js": "^0.0.17-beta.8", "@mendable/firecrawl-js": "^0.0.19",
"axios": "^1.6.8", "axios": "^1.6.8",
"ts-node": "^10.9.2", "ts-node": "^10.9.2",
"typescript": "^5.4.5", "typescript": "^5.4.5",
@ -421,11 +421,10 @@
} }
}, },
"node_modules/@mendable/firecrawl-js": { "node_modules/@mendable/firecrawl-js": {
"version": "0.0.17-beta.8", "version": "0.0.19",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.17-beta.8.tgz", "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.19.tgz",
"integrity": "sha512-d65AW+y4YUQ9oU4Jy8dqiuKBPr+QkAyOKYEwFev/GOpGbNfU6lBUGJlAujVXaVY6fDbUGkHoaEzUbuTsqZV+Ng==", "integrity": "sha512-u9BDVIN/bftDztxLlE2cf02Nz0si3+Vmy9cANDFHj/iriT3guzI8ITBk4uC81CyRmPzNyXrW6hSAG90g9ol4cA==",
"dependencies": { "dependencies": {
"@mendable/firecrawl-js": "^0.0.17-beta.5",
"axios": "^1.6.8", "axios": "^1.6.8",
"zod": "^3.23.8", "zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0" "zod-to-json-schema": "^3.23.0"

View File

@ -11,7 +11,7 @@
"author": "", "author": "",
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"@mendable/firecrawl-js": "^0.0.17-beta.8", "@mendable/firecrawl-js": "^0.0.19",
"axios": "^1.6.8", "axios": "^1.6.8",
"ts-node": "^10.9.2", "ts-node": "^10.9.2",
"typescript": "^5.4.5", "typescript": "^5.4.5",

View File

@ -3,7 +3,7 @@ import { z } from "zod";
async function a() { async function a() {
const app = new FirecrawlApp({ const app = new FirecrawlApp({
apiKey: "fc-YOUR_FIRECRAWL_API_KEY", apiKey: "fc-YOUR_API_KEY",
}); });
// Define schema to extract contents into // Define schema to extract contents into
@ -20,7 +20,7 @@ async function a() {
.length(5) .length(5)
.describe("Top 5 stories on Hacker News"), .describe("Top 5 stories on Hacker News"),
}); });
const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", { const scrapeResult = await app.scrapeUrl("https://firecrawl.dev", {
extractorOptions: { extractionSchema: schema }, extractorOptions: { extractionSchema: schema },
}); });
console.log(scrapeResult.data["llm_extraction"]); console.log(scrapeResult.data["llm_extraction"]);

View File

@ -1,13 +1,36 @@
from firecrawl import FirecrawlApp from firecrawl import FirecrawlApp
app = FirecrawlApp(api_key="YOUR_API_KEY") app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}) # crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
print(crawl_result[0]['markdown'])
job_id = crawl_result['jobId'] # print(crawl_result[0]['markdown'])
print(job_id)
# job_id = crawl_result['jobId']
# print(job_id)
# status = app.check_crawl_status(job_id)
# print(status)
from pydantic import BaseModel, Field
from typing import List, Optional
class ArticleSchema(BaseModel):
title: str
points: int
by: str
commentsURL: str
class TopArticlesSchema(BaseModel):
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
a = app.scrape_url('https://news.ycombinator.com', {
'extractorOptions': {
'extractionSchema': TopArticlesSchema.model_json_schema(),
'mode': 'llm-extraction'
},
'pageOptions':{
'onlyMainContent': True
}
})
status = app.check_crawl_status(job_id)
print(status)

View File

@ -1,4 +1,5 @@
import os import os
from typing import Any, Dict, Optional
import requests import requests
import time import time
@ -8,26 +9,51 @@ class FirecrawlApp:
if self.api_key is None: if self.api_key is None:
raise ValueError('No API key provided') raise ValueError('No API key provided')
def scrape_url(self, url, params=None): from pydantic import BaseModel
from typing import Optional, Dict, Any
class ScrapeParams(BaseModel):
url: str
extractorOptions: Optional[Dict[str, Any]] = None
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
headers = { headers = {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}' 'Authorization': f'Bearer {self.api_key}'
} }
json_data = {'url': url} # Prepare the base scrape parameters with the URL
scrape_params = {'url': url}
# If there are additional params, process them
if params: if params:
json_data.update(params) # Initialize extractorOptions if present
extractor_options = params.get('extractorOptions', {})
# Check and convert the extractionSchema if it's a Pydantic model
if 'extractionSchema' in extractor_options:
if hasattr(extractor_options['extractionSchema'], 'schema'):
extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
# Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
# Update the scrape_params with the processed extractorOptions
scrape_params['extractorOptions'] = extractor_options
# Include any other params directly at the top level of scrape_params
for key, value in params.items():
if key != 'extractorOptions':
scrape_params[key] = value
print(scrape_params)
# Make the POST request with the prepared headers and JSON data
response = requests.post( response = requests.post(
'https://api.firecrawl.dev/v0/scrape', 'https://api.firecrawl.dev/v0/scrape',
headers=headers, headers=headers,
json=json_data json=scrape_params
) )
if response.status_code == 200: if response.status_code == 200:
response = response.json() response = response.json()
if response['success'] == True: if response['success']:
return response['data'] return response['data']
else: else:
raise Exception(f'Failed to scrape URL. Error: {response["error"]}') raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
elif response.status_code in [402, 409, 500]: elif response.status_code in [402, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred') error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')

View File

@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup( setup(
name='firecrawl-py', name='firecrawl-py',
version='0.0.6', version='0.0.7',
url='https://github.com/mendableai/firecrawl', url='https://github.com/mendableai/firecrawl',
author='Mendable.ai', author='Mendable.ai',
author_email='nick@mendable.ai', author_email='nick@mendable.ai',