Nick: fixes js and pydantic implementation
commit e6dbbf1bab (parent c89964b230)
JS SDK, compiled build output (build/index.js):

@@ -8,6 +8,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
     });
 };
 import axios from "axios";
+import { z } from "zod";
 import { zodToJsonSchema } from "zod-to-json-schema";
 /**
  * Main class for interacting with the Firecrawl API.
@@ -38,7 +39,11 @@ export default class FirecrawlApp {
         };
         let jsonData = Object.assign({ url }, params);
         if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) {
-            const schema = zodToJsonSchema(params.extractorOptions.extractionSchema);
+            let schema = params.extractorOptions.extractionSchema;
+            // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
+            if (schema instanceof z.ZodSchema) {
+                schema = zodToJsonSchema(schema);
+            }
             jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) });
         }
         try {
JS SDK, firecrawl package.json (version bump):

@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "0.0.17",
+  "version": "0.0.19",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "build/index.js",
   "types": "types/index.d.ts",
JS SDK, TypeScript source (the src counterpart of the build change above):

@@ -91,9 +91,11 @@ export default class FirecrawlApp {
     } as AxiosRequestHeaders;
     let jsonData: Params = { url, ...params };
     if (params?.extractorOptions?.extractionSchema) {
-      const schema = zodToJsonSchema(
-        params.extractorOptions.extractionSchema as z.ZodSchema
-      );
+      let schema = params.extractorOptions.extractionSchema;
+      // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
+      if (schema instanceof z.ZodSchema) {
+        schema = zodToJsonSchema(schema);
+      }
       jsonData = {
         ...jsonData,
         extractorOptions: {
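Both the compiled build and the TypeScript source make the same fix: the schema is run through zodToJsonSchema only when it is actually a z.ZodSchema instance, which presumably lets callers pass an already-serialized JSON Schema object through unchanged instead of having it mangled by an unconditional conversion.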
apps/js-sdk/package-lock.json (generated, 9 changes):

@@ -9,7 +9,7 @@
       "version": "1.0.0",
       "license": "ISC",
       "dependencies": {
-        "@mendable/firecrawl-js": "^0.0.17-beta.8",
+        "@mendable/firecrawl-js": "^0.0.19",
         "axios": "^1.6.8",
         "ts-node": "^10.9.2",
         "typescript": "^5.4.5",
@@ -421,11 +421,10 @@
       }
     },
     "node_modules/@mendable/firecrawl-js": {
-      "version": "0.0.17-beta.8",
-      "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.17-beta.8.tgz",
-      "integrity": "sha512-d65AW+y4YUQ9oU4Jy8dqiuKBPr+QkAyOKYEwFev/GOpGbNfU6lBUGJlAujVXaVY6fDbUGkHoaEzUbuTsqZV+Ng==",
+      "version": "0.0.19",
+      "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.19.tgz",
+      "integrity": "sha512-u9BDVIN/bftDztxLlE2cf02Nz0si3+Vmy9cANDFHj/iriT3guzI8ITBk4uC81CyRmPzNyXrW6hSAG90g9ol4cA==",
       "dependencies": {
-        "@mendable/firecrawl-js": "^0.0.17-beta.5",
         "axios": "^1.6.8",
         "zod": "^3.23.8",
         "zod-to-json-schema": "^3.23.0"
apps/js-sdk/package.json (dependency bump to match):

@@ -11,7 +11,7 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
-    "@mendable/firecrawl-js": "^0.0.17-beta.8",
+    "@mendable/firecrawl-js": "^0.0.19",
     "axios": "^1.6.8",
     "ts-node": "^10.9.2",
     "typescript": "^5.4.5",
JS SDK usage example (exercises the Zod schema path):

@@ -3,7 +3,7 @@ import { z } from "zod";

 async function a() {
   const app = new FirecrawlApp({
-    apiKey: "fc-YOUR_FIRECRAWL_API_KEY",
+    apiKey: "fc-YOUR_API_KEY",
   });

   // Define schema to extract contents into
@@ -20,7 +20,7 @@ async function a() {
       .length(5)
       .describe("Top 5 stories on Hacker News"),
   });
-  const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  const scrapeResult = await app.scrapeUrl("https://firecrawl.dev", {
     extractorOptions: { extractionSchema: schema },
   });
   console.log(scrapeResult.data["llm_extraction"]);
Python SDK usage example (old crawl demo commented out, Pydantic extraction added):

@@ -1,13 +1,36 @@
 from firecrawl import FirecrawlApp


-app = FirecrawlApp(api_key="YOUR_API_KEY")
+app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

-crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
-print(crawl_result[0]['markdown'])
+# crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
+
+# print(crawl_result[0]['markdown'])

-job_id = crawl_result['jobId']
-print(job_id)
+# job_id = crawl_result['jobId']
+# print(job_id)

-status = app.check_crawl_status(job_id)
-print(status)
+# status = app.check_crawl_status(job_id)
+# print(status)
+from pydantic import BaseModel, Field
+from typing import List, Optional
+
+class ArticleSchema(BaseModel):
+    title: str
+    points: int
+    by: str
+    commentsURL: str
+
+class TopArticlesSchema(BaseModel):
+    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
+
+a = app.scrape_url('https://news.ycombinator.com', {
+    'extractorOptions': {
+        'extractionSchema': TopArticlesSchema.model_json_schema(),
+        'mode': 'llm-extraction'
+    },
+    'pageOptions':{
+        'onlyMainContent': True
+    }
+})
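The example serializes the model by hand with model_json_schema(), which is the Pydantic v2 spelling; on v1 the same JSON Schema comes from .schema(). A small version-agnostic helper, purely illustrative (the function name is ours, not part of the SDK):

from pydantic import BaseModel

def to_json_schema(model: type[BaseModel]) -> dict:
    # Pydantic v2 exposes model_json_schema(); v1 exposes schema()
    # (which also survives in v2 as a deprecated alias).
    if hasattr(model, "model_json_schema"):
        return model.model_json_schema()
    return model.schema()

With the SDK change below, serializing by hand becomes optional: scrape_url can also accept the model class itself.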
Python SDK, FirecrawlApp.scrape_url:

@@ -1,4 +1,5 @@
 import os
+from typing import Any, Dict, Optional
 import requests
 import time

@@ -8,26 +9,51 @@ class FirecrawlApp:
         if self.api_key is None:
             raise ValueError('No API key provided')

-    def scrape_url(self, url, params=None):
+    from pydantic import BaseModel
+    from typing import Optional, Dict, Any
+
+    class ScrapeParams(BaseModel):
+        url: str
+        extractorOptions: Optional[Dict[str, Any]] = None
+
+    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
         }
-        json_data = {'url': url}
+        # Prepare the base scrape parameters with the URL
+        scrape_params = {'url': url}
+
+        # If there are additional params, process them
         if params:
-            json_data.update(params)
+            # Initialize extractorOptions if present
+            extractor_options = params.get('extractorOptions', {})
+            # Check and convert the extractionSchema if it's a Pydantic model
+            if 'extractionSchema' in extractor_options:
+                if hasattr(extractor_options['extractionSchema'], 'schema'):
+                    extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
+                # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
+                extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
+                # Update the scrape_params with the processed extractorOptions
+                scrape_params['extractorOptions'] = extractor_options
+
+            # Include any other params directly at the top level of scrape_params
+            for key, value in params.items():
+                if key != 'extractorOptions':
+                    scrape_params[key] = value
+        print(scrape_params)
+        # Make the POST request with the prepared headers and JSON data
         response = requests.post(
             'https://api.firecrawl.dev/v0/scrape',
             headers=headers,
-            json=json_data
+            json=scrape_params
         )
         if response.status_code == 200:
             response = response.json()
-            if response['success'] == True:
+            if response['success']:
                 return response['data']
             else:
                 raise Exception(f'Failed to scrape URL. Error: {response["error"]}')

         elif response.status_code in [402, 409, 500]:
             error_message = response.json().get('error', 'Unknown error occurred')
             raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
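Because the conversion duck-types on a .schema() attribute rather than importing Pydantic at the call site, scrape_url can now be handed either a plain JSON Schema dict (as in the example above) or a Pydantic model class directly. A minimal sketch of the direct style, assuming firecrawl-py 0.0.7 with this change installed (the model and URL are illustrative, not from the SDK):

from pydantic import BaseModel
from firecrawl import FirecrawlApp

class PageSummary(BaseModel):
    # Illustrative schema; any BaseModel subclass works because the
    # class itself carries the v1-style .schema() classmethod.
    title: str
    summary: str

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

# scrape_url sees that PageSummary has .schema(), serializes it to
# JSON Schema, and defaults 'mode' to 'llm-extraction' when omitted.
data = app.scrape_url('https://firecrawl.dev', {
    'extractorOptions': {'extractionSchema': PageSummary}
})
print(data['llm_extraction'])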
Python SDK, setup.py (version bump):

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages

 setup(
     name='firecrawl-py',
-    version='0.0.6',
+    version='0.0.7',
     url='https://github.com/mendableai/firecrawl',
     author='Mendable.ai',
     author_email='nick@mendable.ai',