0

Merge pull request #135 from mendableai/nsc/llm-extraction-zod-integration

Adds Zod Integration for LLM Extraction in the Firecrawl JS SDK
This commit is contained in:
Nicolas 2024-05-08 18:15:12 -07:00 committed by GitHub
commit c02d7aeebd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
20 changed files with 1231 additions and 119 deletions

151
README.md
View File

@ -215,8 +215,6 @@ curl -X POST https://api.firecrawl.dev/v0/scrape \
``` ```
Coming soon to the Langchain and LLama Index integrations.
## Using Python SDK ## Using Python SDK
### Installing Python SDK ### Installing Python SDK
@ -248,6 +246,32 @@ url = 'https://example.com'
scraped_data = app.scrape_url(url) scraped_data = app.scrape_url(url)
``` ```
### Extracting structured data from a URL
With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it easier for you too. Here is how to use it:
```python
class ArticleSchema(BaseModel):
title: str
points: int
by: str
commentsURL: str
class TopArticlesSchema(BaseModel):
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
data = app.scrape_url('https://news.ycombinator.com', {
'extractorOptions': {
'extractionSchema': TopArticlesSchema.model_json_schema(),
'mode': 'llm-extraction'
},
'pageOptions':{
'onlyMainContent': True
}
})
print(data["llm_extraction"])
```
### Search for a query ### Search for a query
Performs a web search, retrieves the top results, extracts data from each page, and returns their markdown. Performs a web search, retrieves the top results, extracts data from each page, and returns their markdown.
@ -257,6 +281,129 @@ query = 'What is Mendable?'
search_result = app.search(query) search_result = app.search(query)
``` ```
## Using the Node SDK
### Installation
To install the Firecrawl Node SDK, you can use npm:
```bash
npm install @mendable/firecrawl-js
```
### Usage
1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
### Scraping a URL
To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as an object.
```js
try {
const url = 'https://example.com';
const scrapedData = await app.scrapeUrl(url);
console.log(scrapedData);
} catch (error) {
console.error(
'Error occurred while scraping:',
error.message
);
}
```
### Crawling a Website
To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
```js
const crawlUrl = 'https://example.com';
const params = {
crawlerOptions: {
excludes: ['blog/'],
includes: [], // leave empty for all pages
limit: 1000,
},
pageOptions: {
onlyMainContent: true
}
};
const waitUntilDone = true;
const timeout = 5;
const crawlResult = await app.crawlUrl(
crawlUrl,
params,
waitUntilDone,
timeout
);
```
### Checking Crawl Status
To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
```js
const status = await app.checkCrawlStatus(jobId);
console.log(status);
```
### Extracting structured data from a URL
With LLM extraction, you can easily extract structured data from any URL. We support Zod schemas to make it easier for you too. Here is how to use it:
```js
import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";
const app = new FirecrawlApp({
apiKey: "fc-YOUR_API_KEY",
});
// Define schema to extract contents into
const schema = z.object({
top: z
.array(
z.object({
title: z.string(),
points: z.number(),
by: z.string(),
commentsURL: z.string(),
})
)
.length(5)
.describe("Top 5 stories on Hacker News"),
});
const scrapeResult = await app.scrapeUrl("https://firecrawl.dev", {
extractorOptions: { extractionSchema: schema },
});
console.log(scrapeResult.data["llm_extraction"]);
```
### Search for a query
With the `search` method, you can search for a query in a search engine and get the top results along with the page content for each result. The method takes the query as a parameter and returns the search results.
```js
const query = 'what is mendable?';
const searchResults = await app.search(query, {
pageOptions: {
fetchPageContent: true // Fetch the page content for each search result
}
});
```
## Contributing ## Contributing
We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request. We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.

View File

@ -7,9 +7,9 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
step((generator = generator.apply(thisArg, _arguments || [])).next()); step((generator = generator.apply(thisArg, _arguments || [])).next());
}); });
}; };
import axios from 'axios'; import axios from "axios";
import dotenv from 'dotenv'; import { z } from "zod";
dotenv.config(); import { zodToJsonSchema } from "zod-to-json-schema";
/** /**
* Main class for interacting with the Firecrawl API. * Main class for interacting with the Firecrawl API.
*/ */
@ -19,9 +19,9 @@ export default class FirecrawlApp {
* @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
*/ */
constructor({ apiKey = null }) { constructor({ apiKey = null }) {
this.apiKey = apiKey || process.env.FIRECRAWL_API_KEY || ''; this.apiKey = apiKey || "";
if (!this.apiKey) { if (!this.apiKey) {
throw new Error('No API key provided'); throw new Error("No API key provided");
} }
} }
/** /**
@ -32,16 +32,22 @@ export default class FirecrawlApp {
*/ */
scrapeUrl(url_1) { scrapeUrl(url_1) {
return __awaiter(this, arguments, void 0, function* (url, params = null) { return __awaiter(this, arguments, void 0, function* (url, params = null) {
var _a;
const headers = { const headers = {
'Content-Type': 'application/json', "Content-Type": "application/json",
'Authorization': `Bearer ${this.apiKey}`, Authorization: `Bearer ${this.apiKey}`,
}; };
let jsonData = { url }; let jsonData = Object.assign({ url }, params);
if (params) { if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) {
jsonData = Object.assign(Object.assign({}, jsonData), params); let schema = params.extractorOptions.extractionSchema;
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
if (schema instanceof z.ZodSchema) {
schema = zodToJsonSchema(schema);
}
jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) });
} }
try { try {
const response = yield axios.post('https://api.firecrawl.dev/v0/scrape', jsonData, { headers }); const response = yield axios.post("https://api.firecrawl.dev/v0/scrape", jsonData, { headers });
if (response.status === 200) { if (response.status === 200) {
const responseData = response.data; const responseData = response.data;
if (responseData.success) { if (responseData.success) {
@ -52,13 +58,13 @@ export default class FirecrawlApp {
} }
} }
else { else {
this.handleError(response, 'scrape URL'); this.handleError(response, "scrape URL");
} }
} }
catch (error) { catch (error) {
throw new Error(error.message); throw new Error(error.message);
} }
return { success: false, error: 'Internal server error.' }; return { success: false, error: "Internal server error." };
}); });
} }
/** /**
@ -70,15 +76,15 @@ export default class FirecrawlApp {
search(query_1) { search(query_1) {
return __awaiter(this, arguments, void 0, function* (query, params = null) { return __awaiter(this, arguments, void 0, function* (query, params = null) {
const headers = { const headers = {
'Content-Type': 'application/json', "Content-Type": "application/json",
'Authorization': `Bearer ${this.apiKey}`, Authorization: `Bearer ${this.apiKey}`,
}; };
let jsonData = { query }; let jsonData = { query };
if (params) { if (params) {
jsonData = Object.assign(Object.assign({}, jsonData), params); jsonData = Object.assign(Object.assign({}, jsonData), params);
} }
try { try {
const response = yield axios.post('https://api.firecrawl.dev/v0/search', jsonData, { headers }); const response = yield axios.post("https://api.firecrawl.dev/v0/search", jsonData, { headers });
if (response.status === 200) { if (response.status === 200) {
const responseData = response.data; const responseData = response.data;
if (responseData.success) { if (responseData.success) {
@ -89,13 +95,13 @@ export default class FirecrawlApp {
} }
} }
else { else {
this.handleError(response, 'search'); this.handleError(response, "search");
} }
} }
catch (error) { catch (error) {
throw new Error(error.message); throw new Error(error.message);
} }
return { success: false, error: 'Internal server error.' }; return { success: false, error: "Internal server error." };
}); });
} }
/** /**
@ -114,7 +120,7 @@ export default class FirecrawlApp {
jsonData = Object.assign(Object.assign({}, jsonData), params); jsonData = Object.assign(Object.assign({}, jsonData), params);
} }
try { try {
const response = yield this.postRequest('https://api.firecrawl.dev/v0/crawl', jsonData, headers); const response = yield this.postRequest("https://api.firecrawl.dev/v0/crawl", jsonData, headers);
if (response.status === 200) { if (response.status === 200) {
const jobId = response.data.jobId; const jobId = response.data.jobId;
if (waitUntilDone) { if (waitUntilDone) {
@ -125,14 +131,14 @@ export default class FirecrawlApp {
} }
} }
else { else {
this.handleError(response, 'start crawl job'); this.handleError(response, "start crawl job");
} }
} }
catch (error) { catch (error) {
console.log(error); console.log(error);
throw new Error(error.message); throw new Error(error.message);
} }
return { success: false, error: 'Internal server error.' }; return { success: false, error: "Internal server error." };
}); });
} }
/** /**
@ -149,13 +155,17 @@ export default class FirecrawlApp {
return response.data; return response.data;
} }
else { else {
this.handleError(response, 'check crawl status'); this.handleError(response, "check crawl status");
} }
} }
catch (error) { catch (error) {
throw new Error(error.message); throw new Error(error.message);
} }
return { success: false, status: 'unknown', error: 'Internal server error.' }; return {
success: false,
status: "unknown",
error: "Internal server error.",
};
}); });
} }
/** /**
@ -164,8 +174,8 @@ export default class FirecrawlApp {
*/ */
prepareHeaders() { prepareHeaders() {
return { return {
'Content-Type': 'application/json', "Content-Type": "application/json",
'Authorization': `Bearer ${this.apiKey}`, Authorization: `Bearer ${this.apiKey}`,
}; };
} }
/** /**
@ -200,26 +210,26 @@ export default class FirecrawlApp {
const statusResponse = yield this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers); const statusResponse = yield this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers);
if (statusResponse.status === 200) { if (statusResponse.status === 200) {
const statusData = statusResponse.data; const statusData = statusResponse.data;
if (statusData.status === 'completed') { if (statusData.status === "completed") {
if ('data' in statusData) { if ("data" in statusData) {
return statusData.data; return statusData.data;
} }
else { else {
throw new Error('Crawl job completed but no data was returned'); throw new Error("Crawl job completed but no data was returned");
} }
} }
else if (['active', 'paused', 'pending', 'queued'].includes(statusData.status)) { else if (["active", "paused", "pending", "queued"].includes(statusData.status)) {
if (timeout < 2) { if (timeout < 2) {
timeout = 2; timeout = 2;
} }
yield new Promise(resolve => setTimeout(resolve, timeout * 1000)); // Wait for the specified timeout before checking again yield new Promise((resolve) => setTimeout(resolve, timeout * 1000)); // Wait for the specified timeout before checking again
} }
else { else {
throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`);
} }
} }
else { else {
this.handleError(statusResponse, 'check crawl status'); this.handleError(statusResponse, "check crawl status");
} }
} }
}); });
@ -231,7 +241,7 @@ export default class FirecrawlApp {
*/ */
handleError(response, action) { handleError(response, action) {
if ([402, 409, 500].includes(response.status)) { if ([402, 409, 500].includes(response.status)) {
const errorMessage = response.data.error || 'Unknown error occurred'; const errorMessage = response.data.error || "Unknown error occurred";
throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`);
} }
else { else {

View File

@ -1,15 +1,17 @@
{ {
"name": "@mendable/firecrawl-js", "name": "@mendable/firecrawl-js",
"version": "0.0.13", "version": "0.0.17-beta.8",
"lockfileVersion": 3, "lockfileVersion": 3,
"requires": true, "requires": true,
"packages": { "packages": {
"": { "": {
"name": "@mendable/firecrawl-js", "name": "@mendable/firecrawl-js",
"version": "0.0.13", "version": "0.0.17-beta.8",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"axios": "^1.6.8" "axios": "^1.6.8",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
}, },
"devDependencies": { "devDependencies": {
"@jest/globals": "^29.7.0", "@jest/globals": "^29.7.0",
@ -3766,6 +3768,22 @@
"funding": { "funding": {
"url": "https://github.com/sponsors/sindresorhus" "url": "https://github.com/sponsors/sindresorhus"
} }
},
"node_modules/zod": {
"version": "3.23.8",
"resolved": "https://registry.npmjs.org/zod/-/zod-3.23.8.tgz",
"integrity": "sha512-XBx9AXhXktjUqnepgTiE5flcKIYWi/rme0Eaj+5Y0lftuGBq+jyRu/md4WnuxqgP1ubdpNCsYEYPxrzVHD8d6g==",
"funding": {
"url": "https://github.com/sponsors/colinhacks"
}
},
"node_modules/zod-to-json-schema": {
"version": "3.23.0",
"resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz",
"integrity": "sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==",
"peerDependencies": {
"zod": "^3.23.3"
}
} }
} }
} }

View File

@ -1,6 +1,6 @@
{ {
"name": "@mendable/firecrawl-js", "name": "@mendable/firecrawl-js",
"version": "0.0.16", "version": "0.0.19",
"description": "JavaScript SDK for Firecrawl API", "description": "JavaScript SDK for Firecrawl API",
"main": "build/index.js", "main": "build/index.js",
"types": "types/index.d.ts", "types": "types/index.d.ts",
@ -8,6 +8,7 @@
"scripts": { "scripts": {
"build": "tsc", "build": "tsc",
"publish": "npm run build && npm publish --access public", "publish": "npm run build && npm publish --access public",
"publish-beta": "npm run build && npm publish --access public --tag beta",
"test": "jest src/**/*.test.ts" "test": "jest src/**/*.test.ts"
}, },
"repository": { "repository": {
@ -17,7 +18,9 @@
"author": "Mendable.ai", "author": "Mendable.ai",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"axios": "^1.6.8" "axios": "^1.6.8",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
}, },
"bugs": { "bugs": {
"url": "https://github.com/mendableai/firecrawl/issues" "url": "https://github.com/mendableai/firecrawl/issues"

View File

@ -1,5 +1,6 @@
import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios'; import axios, { AxiosResponse, AxiosRequestHeaders } from "axios";
import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";
/** /**
* Configuration interface for FirecrawlApp. * Configuration interface for FirecrawlApp.
*/ */
@ -12,6 +13,11 @@ export interface FirecrawlAppConfig {
*/ */
export interface Params { export interface Params {
[key: string]: any; [key: string]: any;
extractorOptions?: {
extractionSchema: z.ZodSchema | any;
mode?: "llm-extraction";
extractionPrompt?: string;
};
} }
/** /**
@ -63,9 +69,9 @@ export default class FirecrawlApp {
* @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
*/ */
constructor({ apiKey = null }: FirecrawlAppConfig) { constructor({ apiKey = null }: FirecrawlAppConfig) {
this.apiKey = apiKey || ''; this.apiKey = apiKey || "";
if (!this.apiKey) { if (!this.apiKey) {
throw new Error('No API key provided'); throw new Error("No API key provided");
} }
} }
@ -75,31 +81,50 @@ export default class FirecrawlApp {
* @param {Params | null} params - Additional parameters for the scrape request. * @param {Params | null} params - Additional parameters for the scrape request.
* @returns {Promise<ScrapeResponse>} The response from the scrape operation. * @returns {Promise<ScrapeResponse>} The response from the scrape operation.
*/ */
async scrapeUrl(url: string, params: Params | null = null): Promise<ScrapeResponse> { async scrapeUrl(
url: string,
params: Params | null = null
): Promise<ScrapeResponse> {
const headers: AxiosRequestHeaders = { const headers: AxiosRequestHeaders = {
'Content-Type': 'application/json', "Content-Type": "application/json",
'Authorization': `Bearer ${this.apiKey}`, Authorization: `Bearer ${this.apiKey}`,
} as AxiosRequestHeaders; } as AxiosRequestHeaders;
let jsonData: Params = { url }; let jsonData: Params = { url, ...params };
if (params) { if (params?.extractorOptions?.extractionSchema) {
jsonData = { ...jsonData, ...params }; let schema = params.extractorOptions.extractionSchema;
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
if (schema instanceof z.ZodSchema) {
schema = zodToJsonSchema(schema);
}
jsonData = {
...jsonData,
extractorOptions: {
...params.extractorOptions,
extractionSchema: schema,
mode: params.extractorOptions.mode || "llm-extraction",
},
};
} }
try { try {
const response: AxiosResponse = await axios.post('https://api.firecrawl.dev/v0/scrape', jsonData, { headers }); const response: AxiosResponse = await axios.post(
"https://api.firecrawl.dev/v0/scrape",
jsonData,
{ headers }
);
if (response.status === 200) { if (response.status === 200) {
const responseData = response.data; const responseData = response.data;
if (responseData.success) { if (responseData.success) {
return responseData; return responseData;
} else { } else {
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
} }
} else { } else {
this.handleError(response, 'scrape URL'); this.handleError(response, "scrape URL");
} }
} catch (error: any) { } catch (error: any) {
throw new Error(error.message); throw new Error(error.message);
} }
return { success: false, error: 'Internal server error.' }; return { success: false, error: "Internal server error." };
} }
/** /**
@ -108,31 +133,38 @@ export default class FirecrawlApp {
* @param {Params | null} params - Additional parameters for the search request. * @param {Params | null} params - Additional parameters for the search request.
* @returns {Promise<SearchResponse>} The response from the search operation. * @returns {Promise<SearchResponse>} The response from the search operation.
*/ */
async search(query: string, params: Params | null = null): Promise<SearchResponse> { async search(
query: string,
params: Params | null = null
): Promise<SearchResponse> {
const headers: AxiosRequestHeaders = { const headers: AxiosRequestHeaders = {
'Content-Type': 'application/json', "Content-Type": "application/json",
'Authorization': `Bearer ${this.apiKey}`, Authorization: `Bearer ${this.apiKey}`,
} as AxiosRequestHeaders; } as AxiosRequestHeaders;
let jsonData: Params = { query }; let jsonData: Params = { query };
if (params) { if (params) {
jsonData = { ...jsonData, ...params }; jsonData = { ...jsonData, ...params };
} }
try { try {
const response: AxiosResponse = await axios.post('https://api.firecrawl.dev/v0/search', jsonData, { headers }); const response: AxiosResponse = await axios.post(
"https://api.firecrawl.dev/v0/search",
jsonData,
{ headers }
);
if (response.status === 200) { if (response.status === 200) {
const responseData = response.data; const responseData = response.data;
if (responseData.success) { if (responseData.success) {
return responseData; return responseData;
} else { } else {
throw new Error(`Failed to search. Error: ${responseData.error}`); throw new Error(`Failed to search. Error: ${responseData.error}`);
} }
} else { } else {
this.handleError(response, 'search'); this.handleError(response, "search");
} }
} catch (error: any) { } catch (error: any) {
throw new Error(error.message); throw new Error(error.message);
} }
return { success: false, error: 'Internal server error.' }; return { success: false, error: "Internal server error." };
} }
/** /**
@ -143,14 +175,23 @@ export default class FirecrawlApp {
* @param {number} timeout - Timeout in seconds for job status checks. * @param {number} timeout - Timeout in seconds for job status checks.
* @returns {Promise<CrawlResponse | any>} The response from the crawl operation. * @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
*/ */
async crawlUrl(url: string, params: Params | null = null, waitUntilDone: boolean = true, timeout: number = 2): Promise<CrawlResponse | any> { async crawlUrl(
url: string,
params: Params | null = null,
waitUntilDone: boolean = true,
timeout: number = 2
): Promise<CrawlResponse | any> {
const headers = this.prepareHeaders(); const headers = this.prepareHeaders();
let jsonData: Params = { url }; let jsonData: Params = { url };
if (params) { if (params) {
jsonData = { ...jsonData, ...params }; jsonData = { ...jsonData, ...params };
} }
try { try {
const response: AxiosResponse = await this.postRequest('https://api.firecrawl.dev/v0/crawl', jsonData, headers); const response: AxiosResponse = await this.postRequest(
"https://api.firecrawl.dev/v0/crawl",
jsonData,
headers
);
if (response.status === 200) { if (response.status === 200) {
const jobId: string = response.data.jobId; const jobId: string = response.data.jobId;
if (waitUntilDone) { if (waitUntilDone) {
@ -159,13 +200,13 @@ export default class FirecrawlApp {
return { success: true, jobId }; return { success: true, jobId };
} }
} else { } else {
this.handleError(response, 'start crawl job'); this.handleError(response, "start crawl job");
} }
} catch (error: any) { } catch (error: any) {
console.log(error) console.log(error);
throw new Error(error.message); throw new Error(error.message);
} }
return { success: false, error: 'Internal server error.' }; return { success: false, error: "Internal server error." };
} }
/** /**
@ -176,16 +217,23 @@ export default class FirecrawlApp {
async checkCrawlStatus(jobId: string): Promise<JobStatusResponse> { async checkCrawlStatus(jobId: string): Promise<JobStatusResponse> {
const headers: AxiosRequestHeaders = this.prepareHeaders(); const headers: AxiosRequestHeaders = this.prepareHeaders();
try { try {
const response: AxiosResponse = await this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers); const response: AxiosResponse = await this.getRequest(
`https://api.firecrawl.dev/v0/crawl/status/${jobId}`,
headers
);
if (response.status === 200) { if (response.status === 200) {
return response.data; return response.data;
} else { } else {
this.handleError(response, 'check crawl status'); this.handleError(response, "check crawl status");
} }
} catch (error: any) { } catch (error: any) {
throw new Error(error.message); throw new Error(error.message);
} }
return { success: false, status: 'unknown', error: 'Internal server error.' }; return {
success: false,
status: "unknown",
error: "Internal server error.",
};
} }
/** /**
@ -194,8 +242,8 @@ export default class FirecrawlApp {
*/ */
prepareHeaders(): AxiosRequestHeaders { prepareHeaders(): AxiosRequestHeaders {
return { return {
'Content-Type': 'application/json', "Content-Type": "application/json",
'Authorization': `Bearer ${this.apiKey}`, Authorization: `Bearer ${this.apiKey}`,
} as AxiosRequestHeaders; } as AxiosRequestHeaders;
} }
@ -206,7 +254,11 @@ export default class FirecrawlApp {
* @param {AxiosRequestHeaders} headers - The headers for the request. * @param {AxiosRequestHeaders} headers - The headers for the request.
* @returns {Promise<AxiosResponse>} The response from the POST request. * @returns {Promise<AxiosResponse>} The response from the POST request.
*/ */
postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> { postRequest(
url: string,
data: Params,
headers: AxiosRequestHeaders
): Promise<AxiosResponse> {
return axios.post(url, data, { headers }); return axios.post(url, data, { headers });
} }
@ -216,7 +268,10 @@ export default class FirecrawlApp {
* @param {AxiosRequestHeaders} headers - The headers for the request. * @param {AxiosRequestHeaders} headers - The headers for the request.
* @returns {Promise<AxiosResponse>} The response from the GET request. * @returns {Promise<AxiosResponse>} The response from the GET request.
*/ */
getRequest(url: string, headers: AxiosRequestHeaders): Promise<AxiosResponse> { getRequest(
url: string,
headers: AxiosRequestHeaders
): Promise<AxiosResponse> {
return axios.get(url, { headers }); return axios.get(url, { headers });
} }
@ -227,27 +282,38 @@ export default class FirecrawlApp {
* @param {number} timeout - Timeout in seconds for job status checks. * @param {number} timeout - Timeout in seconds for job status checks.
* @returns {Promise<any>} The final job status or data. * @returns {Promise<any>} The final job status or data.
*/ */
async monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, timeout: number): Promise<any> { async monitorJobStatus(
jobId: string,
headers: AxiosRequestHeaders,
timeout: number
): Promise<any> {
while (true) { while (true) {
const statusResponse: AxiosResponse = await this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers); const statusResponse: AxiosResponse = await this.getRequest(
`https://api.firecrawl.dev/v0/crawl/status/${jobId}`,
headers
);
if (statusResponse.status === 200) { if (statusResponse.status === 200) {
const statusData = statusResponse.data; const statusData = statusResponse.data;
if (statusData.status === 'completed') { if (statusData.status === "completed") {
if ('data' in statusData) { if ("data" in statusData) {
return statusData.data; return statusData.data;
} else { } else {
throw new Error('Crawl job completed but no data was returned'); throw new Error("Crawl job completed but no data was returned");
} }
} else if (['active', 'paused', 'pending', 'queued'].includes(statusData.status)) { } else if (
["active", "paused", "pending", "queued"].includes(statusData.status)
) {
if (timeout < 2) { if (timeout < 2) {
timeout = 2; timeout = 2;
} }
await new Promise(resolve => setTimeout(resolve, timeout * 1000)); // Wait for the specified timeout before checking again await new Promise((resolve) => setTimeout(resolve, timeout * 1000)); // Wait for the specified timeout before checking again
} else { } else {
throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); throw new Error(
`Crawl job failed or was stopped. Status: ${statusData.status}`
);
} }
} else { } else {
this.handleError(statusResponse, 'check crawl status'); this.handleError(statusResponse, "check crawl status");
} }
} }
} }
@ -259,10 +325,15 @@ export default class FirecrawlApp {
*/ */
handleError(response: AxiosResponse, action: string): void { handleError(response: AxiosResponse, action: string): void {
if ([402, 409, 500].includes(response.status)) { if ([402, 409, 500].includes(response.status)) {
const errorMessage: string = response.data.error || 'Unknown error occurred'; const errorMessage: string =
throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); response.data.error || "Unknown error occurred";
throw new Error(
`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`
);
} else { } else {
throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); throw new Error(
`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`
);
} }
} }
} }

View File

@ -1,4 +1,5 @@
import { AxiosResponse, AxiosRequestHeaders } from 'axios'; import { AxiosResponse, AxiosRequestHeaders } from "axios";
import { z } from "zod";
/** /**
* Configuration interface for FirecrawlApp. * Configuration interface for FirecrawlApp.
*/ */
@ -10,6 +11,11 @@ export interface FirecrawlAppConfig {
*/ */
export interface Params { export interface Params {
[key: string]: any; [key: string]: any;
extractorOptions?: {
extractionSchema: z.ZodSchema | any;
mode?: "llm-extraction";
extractionPrompt?: string;
};
} }
/** /**
* Response interface for scraping operations. * Response interface for scraping operations.

View File

@ -9,19 +9,480 @@
"version": "1.0.0", "version": "1.0.0",
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"@mendable/firecrawl-js": "^0.0.15", "@mendable/firecrawl-js": "^0.0.19",
"axios": "^1.6.8" "axios": "^1.6.8",
"ts-node": "^10.9.2",
"typescript": "^5.4.5",
"zod": "^3.23.8"
},
"devDependencies": {
"tsx": "^4.9.3"
}
},
"node_modules/@cspotcode/source-map-support": {
"version": "0.8.1",
"resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz",
"integrity": "sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==",
"dependencies": {
"@jridgewell/trace-mapping": "0.3.9"
},
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/aix-ppc64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.20.2.tgz",
"integrity": "sha512-D+EBOJHXdNZcLJRBkhENNG8Wji2kgc9AZ9KiPr1JuZjsNtyHzrsfLRrY0tk2H2aoFu6RANO1y1iPPUCDYWkb5g==",
"cpu": [
"ppc64"
],
"dev": true,
"optional": true,
"os": [
"aix"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/android-arm": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.20.2.tgz",
"integrity": "sha512-t98Ra6pw2VaDhqNWO2Oph2LXbz/EJcnLmKLGBJwEwXX/JAN83Fym1rU8l0JUWK6HkIbWONCSSatf4sf2NBRx/w==",
"cpu": [
"arm"
],
"dev": true,
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/android-arm64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.20.2.tgz",
"integrity": "sha512-mRzjLacRtl/tWU0SvD8lUEwb61yP9cqQo6noDZP/O8VkwafSYwZ4yWy24kan8jE/IMERpYncRt2dw438LP3Xmg==",
"cpu": [
"arm64"
],
"dev": true,
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/android-x64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.20.2.tgz",
"integrity": "sha512-btzExgV+/lMGDDa194CcUQm53ncxzeBrWJcncOBxuC6ndBkKxnHdFJn86mCIgTELsooUmwUm9FkhSp5HYu00Rg==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/darwin-arm64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.20.2.tgz",
"integrity": "sha512-4J6IRT+10J3aJH3l1yzEg9y3wkTDgDk7TSDFX+wKFiWjqWp/iCfLIYzGyasx9l0SAFPT1HwSCR+0w/h1ES/MjA==",
"cpu": [
"arm64"
],
"dev": true,
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/darwin-x64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.20.2.tgz",
"integrity": "sha512-tBcXp9KNphnNH0dfhv8KYkZhjc+H3XBkF5DKtswJblV7KlT9EI2+jeA8DgBjp908WEuYll6pF+UStUCfEpdysA==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/freebsd-arm64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.20.2.tgz",
"integrity": "sha512-d3qI41G4SuLiCGCFGUrKsSeTXyWG6yem1KcGZVS+3FYlYhtNoNgYrWcvkOoaqMhwXSMrZRl69ArHsGJ9mYdbbw==",
"cpu": [
"arm64"
],
"dev": true,
"optional": true,
"os": [
"freebsd"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/freebsd-x64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.20.2.tgz",
"integrity": "sha512-d+DipyvHRuqEeM5zDivKV1KuXn9WeRX6vqSqIDgwIfPQtwMP4jaDsQsDncjTDDsExT4lR/91OLjRo8bmC1e+Cw==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"freebsd"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-arm": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.20.2.tgz",
"integrity": "sha512-VhLPeR8HTMPccbuWWcEUD1Az68TqaTYyj6nfE4QByZIQEQVWBB8vup8PpR7y1QHL3CpcF6xd5WVBU/+SBEvGTg==",
"cpu": [
"arm"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-arm64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.20.2.tgz",
"integrity": "sha512-9pb6rBjGvTFNira2FLIWqDk/uaf42sSyLE8j1rnUpuzsODBq7FvpwHYZxQ/It/8b+QOS1RYfqgGFNLRI+qlq2A==",
"cpu": [
"arm64"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-ia32": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.20.2.tgz",
"integrity": "sha512-o10utieEkNPFDZFQm9CoP7Tvb33UutoJqg3qKf1PWVeeJhJw0Q347PxMvBgVVFgouYLGIhFYG0UGdBumROyiig==",
"cpu": [
"ia32"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-loong64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.20.2.tgz",
"integrity": "sha512-PR7sp6R/UC4CFVomVINKJ80pMFlfDfMQMYynX7t1tNTeivQ6XdX5r2XovMmha/VjR1YN/HgHWsVcTRIMkymrgQ==",
"cpu": [
"loong64"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-mips64el": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.20.2.tgz",
"integrity": "sha512-4BlTqeutE/KnOiTG5Y6Sb/Hw6hsBOZapOVF6njAESHInhlQAghVVZL1ZpIctBOoTFbQyGW+LsVYZ8lSSB3wkjA==",
"cpu": [
"mips64el"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-ppc64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.20.2.tgz",
"integrity": "sha512-rD3KsaDprDcfajSKdn25ooz5J5/fWBylaaXkuotBDGnMnDP1Uv5DLAN/45qfnf3JDYyJv/ytGHQaziHUdyzaAg==",
"cpu": [
"ppc64"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-riscv64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.20.2.tgz",
"integrity": "sha512-snwmBKacKmwTMmhLlz/3aH1Q9T8v45bKYGE3j26TsaOVtjIag4wLfWSiZykXzXuE1kbCE+zJRmwp+ZbIHinnVg==",
"cpu": [
"riscv64"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-s390x": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.20.2.tgz",
"integrity": "sha512-wcWISOobRWNm3cezm5HOZcYz1sKoHLd8VL1dl309DiixxVFoFe/o8HnwuIwn6sXre88Nwj+VwZUvJf4AFxkyrQ==",
"cpu": [
"s390x"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-x64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.20.2.tgz",
"integrity": "sha512-1MdwI6OOTsfQfek8sLwgyjOXAu+wKhLEoaOLTjbijk6E2WONYpH9ZU2mNtR+lZ2B4uwr+usqGuVfFT9tMtGvGw==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/netbsd-x64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.20.2.tgz",
"integrity": "sha512-K8/DhBxcVQkzYc43yJXDSyjlFeHQJBiowJ0uVL6Tor3jGQfSGHNNJcWxNbOI8v5k82prYqzPuwkzHt3J1T1iZQ==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"netbsd"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/openbsd-x64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.20.2.tgz",
"integrity": "sha512-eMpKlV0SThJmmJgiVyN9jTPJ2VBPquf6Kt/nAoo6DgHAoN57K15ZghiHaMvqjCye/uU4X5u3YSMgVBI1h3vKrQ==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"openbsd"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/sunos-x64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.20.2.tgz",
"integrity": "sha512-2UyFtRC6cXLyejf/YEld4Hajo7UHILetzE1vsRcGL3earZEW77JxrFjH4Ez2qaTiEfMgAXxfAZCm1fvM/G/o8w==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"sunos"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/win32-arm64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.20.2.tgz",
"integrity": "sha512-GRibxoawM9ZCnDxnP3usoUDO9vUkpAxIIZ6GQI+IlVmr5kP3zUq+l17xELTHMWTWzjxa2guPNyrpq1GWmPvcGQ==",
"cpu": [
"arm64"
],
"dev": true,
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/win32-ia32": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.20.2.tgz",
"integrity": "sha512-HfLOfn9YWmkSKRQqovpnITazdtquEW8/SoHW7pWpuEeguaZI4QnCRW6b+oZTztdBnZOS2hqJ6im/D5cPzBTTlQ==",
"cpu": [
"ia32"
],
"dev": true,
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/win32-x64": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.20.2.tgz",
"integrity": "sha512-N49X4lJX27+l9jbLKSqZ6bKNjzQvHaT8IIFUy+YIqmXQdjYCToGWwOItDrfby14c78aDd5NHQl29xingXfCdLQ==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@jridgewell/resolve-uri": {
"version": "3.1.2",
"resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz",
"integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==",
"engines": {
"node": ">=6.0.0"
}
},
"node_modules/@jridgewell/sourcemap-codec": {
"version": "1.4.15",
"resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.15.tgz",
"integrity": "sha512-eF2rxCRulEKXHTRiDrDy6erMYWqNw4LPdQ8UQA4huuxaQsVeRPFl2oM8oDGxMFhJUWZf9McpLtJasDDZb/Bpeg=="
},
"node_modules/@jridgewell/trace-mapping": {
"version": "0.3.9",
"resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.9.tgz",
"integrity": "sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==",
"dependencies": {
"@jridgewell/resolve-uri": "^3.0.3",
"@jridgewell/sourcemap-codec": "^1.4.10"
} }
}, },
"node_modules/@mendable/firecrawl-js": { "node_modules/@mendable/firecrawl-js": {
"version": "0.0.15", "version": "0.0.19",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.15.tgz", "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.19.tgz",
"integrity": "sha512-e3iCCrLIiEh+jEDerGV9Uhdkn8ymo+sG+k3osCwPg51xW1xUdAnmlcHrcJoR43RvKXdvD/lqoxg8odUEsqyH+w==", "integrity": "sha512-u9BDVIN/bftDztxLlE2cf02Nz0si3+Vmy9cANDFHj/iriT3guzI8ITBk4uC81CyRmPzNyXrW6hSAG90g9ol4cA==",
"dependencies": { "dependencies": {
"axios": "^1.6.8", "axios": "^1.6.8",
"dotenv": "^16.4.5" "zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
} }
}, },
"node_modules/@tsconfig/node10": {
"version": "1.0.11",
"resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz",
"integrity": "sha512-DcRjDCujK/kCk/cUe8Xz8ZSpm8mS3mNNpta+jGCA6USEDfktlNvm1+IuZ9eTcDbNk41BHwpHHeW+N1lKCz4zOw=="
},
"node_modules/@tsconfig/node12": {
"version": "1.0.11",
"resolved": "https://registry.npmjs.org/@tsconfig/node12/-/node12-1.0.11.tgz",
"integrity": "sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag=="
},
"node_modules/@tsconfig/node14": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/@tsconfig/node14/-/node14-1.0.3.tgz",
"integrity": "sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow=="
},
"node_modules/@tsconfig/node16": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz",
"integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA=="
},
"node_modules/@types/node": {
"version": "20.12.11",
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.11.tgz",
"integrity": "sha512-vDg9PZ/zi+Nqp6boSOT7plNuthRugEKixDv5sFTIpkE89MmNtEArAShI4mxuX2+UrLEe9pxC1vm2cjm9YlWbJw==",
"peer": true,
"dependencies": {
"undici-types": "~5.26.4"
}
},
"node_modules/acorn": {
"version": "8.11.3",
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz",
"integrity": "sha512-Y9rRfJG5jcKOE0CLisYbojUjIrIEE7AGMzA/Sm4BslANhbS+cDMpgBdcPT91oJ7OuJ9hYJBx59RjbhxVnrF8Xg==",
"bin": {
"acorn": "bin/acorn"
},
"engines": {
"node": ">=0.4.0"
}
},
"node_modules/acorn-walk": {
"version": "8.3.2",
"resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.3.2.tgz",
"integrity": "sha512-cjkyv4OtNCIeqhHrfS81QWXoCBPExR/J62oyEqepVw8WaQeSqpW2uhuLPh1m9eWhDuOo/jUXVTlifvesOWp/4A==",
"engines": {
"node": ">=0.4.0"
}
},
"node_modules/arg": {
"version": "4.1.3",
"resolved": "https://registry.npmjs.org/arg/-/arg-4.1.3.tgz",
"integrity": "sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA=="
},
"node_modules/asynckit": { "node_modules/asynckit": {
"version": "0.4.0", "version": "0.4.0",
"resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
@ -48,6 +509,11 @@
"node": ">= 0.8" "node": ">= 0.8"
} }
}, },
"node_modules/create-require": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/create-require/-/create-require-1.1.1.tgz",
"integrity": "sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ=="
},
"node_modules/delayed-stream": { "node_modules/delayed-stream": {
"version": "1.0.0", "version": "1.0.0",
"resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
@ -56,15 +522,50 @@
"node": ">=0.4.0" "node": ">=0.4.0"
} }
}, },
"node_modules/dotenv": { "node_modules/diff": {
"version": "16.4.5", "version": "4.0.2",
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz", "resolved": "https://registry.npmjs.org/diff/-/diff-4.0.2.tgz",
"integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==", "integrity": "sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==",
"engines": {
"node": ">=0.3.1"
}
},
"node_modules/esbuild": {
"version": "0.20.2",
"resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.20.2.tgz",
"integrity": "sha512-WdOOppmUNU+IbZ0PaDiTst80zjnrOkyJNHoKupIcVyU8Lvla3Ugx94VzkQ32Ijqd7UhHJy75gNWDMUekcrSJ6g==",
"dev": true,
"hasInstallScript": true,
"bin": {
"esbuild": "bin/esbuild"
},
"engines": { "engines": {
"node": ">=12" "node": ">=12"
}, },
"funding": { "optionalDependencies": {
"url": "https://dotenvx.com" "@esbuild/aix-ppc64": "0.20.2",
"@esbuild/android-arm": "0.20.2",
"@esbuild/android-arm64": "0.20.2",
"@esbuild/android-x64": "0.20.2",
"@esbuild/darwin-arm64": "0.20.2",
"@esbuild/darwin-x64": "0.20.2",
"@esbuild/freebsd-arm64": "0.20.2",
"@esbuild/freebsd-x64": "0.20.2",
"@esbuild/linux-arm": "0.20.2",
"@esbuild/linux-arm64": "0.20.2",
"@esbuild/linux-ia32": "0.20.2",
"@esbuild/linux-loong64": "0.20.2",
"@esbuild/linux-mips64el": "0.20.2",
"@esbuild/linux-ppc64": "0.20.2",
"@esbuild/linux-riscv64": "0.20.2",
"@esbuild/linux-s390x": "0.20.2",
"@esbuild/linux-x64": "0.20.2",
"@esbuild/netbsd-x64": "0.20.2",
"@esbuild/openbsd-x64": "0.20.2",
"@esbuild/sunos-x64": "0.20.2",
"@esbuild/win32-arm64": "0.20.2",
"@esbuild/win32-ia32": "0.20.2",
"@esbuild/win32-x64": "0.20.2"
} }
}, },
"node_modules/follow-redirects": { "node_modules/follow-redirects": {
@ -99,6 +600,37 @@
"node": ">= 6" "node": ">= 6"
} }
}, },
"node_modules/fsevents": {
"version": "2.3.3",
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
"integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==",
"dev": true,
"hasInstallScript": true,
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
}
},
"node_modules/get-tsconfig": {
"version": "4.7.4",
"resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.7.4.tgz",
"integrity": "sha512-ofbkKj+0pjXjhejr007J/fLf+sW+8H7K5GCm+msC8q3IpvgjobpyPqSRFemNyIMxklC0zeJpi7VDFna19FacvQ==",
"dev": true,
"dependencies": {
"resolve-pkg-maps": "^1.0.0"
},
"funding": {
"url": "https://github.com/privatenumber/get-tsconfig?sponsor=1"
}
},
"node_modules/make-error": {
"version": "1.3.6",
"resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz",
"integrity": "sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw=="
},
"node_modules/mime-db": { "node_modules/mime-db": {
"version": "1.52.0", "version": "1.52.0",
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
@ -122,6 +654,123 @@
"version": "1.1.0", "version": "1.1.0",
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz",
"integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg=="
},
"node_modules/resolve-pkg-maps": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz",
"integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==",
"dev": true,
"funding": {
"url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1"
}
},
"node_modules/ts-node": {
"version": "10.9.2",
"resolved": "https://registry.npmjs.org/ts-node/-/ts-node-10.9.2.tgz",
"integrity": "sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==",
"dependencies": {
"@cspotcode/source-map-support": "^0.8.0",
"@tsconfig/node10": "^1.0.7",
"@tsconfig/node12": "^1.0.7",
"@tsconfig/node14": "^1.0.0",
"@tsconfig/node16": "^1.0.2",
"acorn": "^8.4.1",
"acorn-walk": "^8.1.1",
"arg": "^4.1.0",
"create-require": "^1.1.0",
"diff": "^4.0.1",
"make-error": "^1.1.1",
"v8-compile-cache-lib": "^3.0.1",
"yn": "3.1.1"
},
"bin": {
"ts-node": "dist/bin.js",
"ts-node-cwd": "dist/bin-cwd.js",
"ts-node-esm": "dist/bin-esm.js",
"ts-node-script": "dist/bin-script.js",
"ts-node-transpile-only": "dist/bin-transpile.js",
"ts-script": "dist/bin-script-deprecated.js"
},
"peerDependencies": {
"@swc/core": ">=1.2.50",
"@swc/wasm": ">=1.2.50",
"@types/node": "*",
"typescript": ">=2.7"
},
"peerDependenciesMeta": {
"@swc/core": {
"optional": true
},
"@swc/wasm": {
"optional": true
}
}
},
"node_modules/tsx": {
"version": "4.9.3",
"resolved": "https://registry.npmjs.org/tsx/-/tsx-4.9.3.tgz",
"integrity": "sha512-czVbetlILiyJZI5zGlj2kw9vFiSeyra9liPD4nG+Thh4pKTi0AmMEQ8zdV/L2xbIVKrIqif4sUNrsMAOksx9Zg==",
"dev": true,
"dependencies": {
"esbuild": "~0.20.2",
"get-tsconfig": "^4.7.3"
},
"bin": {
"tsx": "dist/cli.mjs"
},
"engines": {
"node": ">=18.0.0"
},
"optionalDependencies": {
"fsevents": "~2.3.3"
}
},
"node_modules/typescript": {
"version": "5.4.5",
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.4.5.tgz",
"integrity": "sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==",
"bin": {
"tsc": "bin/tsc",
"tsserver": "bin/tsserver"
},
"engines": {
"node": ">=14.17"
}
},
"node_modules/undici-types": {
"version": "5.26.5",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
"integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==",
"peer": true
},
"node_modules/v8-compile-cache-lib": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",
"integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg=="
},
"node_modules/yn": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz",
"integrity": "sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==",
"engines": {
"node": ">=6"
}
},
"node_modules/zod": {
"version": "3.23.8",
"resolved": "https://registry.npmjs.org/zod/-/zod-3.23.8.tgz",
"integrity": "sha512-XBx9AXhXktjUqnepgTiE5flcKIYWi/rme0Eaj+5Y0lftuGBq+jyRu/md4WnuxqgP1ubdpNCsYEYPxrzVHD8d6g==",
"funding": {
"url": "https://github.com/sponsors/colinhacks"
}
},
"node_modules/zod-to-json-schema": {
"version": "3.23.0",
"resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz",
"integrity": "sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==",
"peerDependencies": {
"zod": "^3.23.3"
}
} }
} }
} }

View File

@ -11,7 +11,13 @@
"author": "", "author": "",
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"@mendable/firecrawl-js": "^0.0.15", "@mendable/firecrawl-js": "^0.0.19",
"axios": "^1.6.8" "axios": "^1.6.8",
"ts-node": "^10.9.2",
"typescript": "^5.4.5",
"zod": "^3.23.8"
},
"devDependencies": {
"tsx": "^4.9.3"
} }
} }

28
apps/js-sdk/test.ts Normal file
View File

@ -0,0 +1,28 @@
import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";
// Example: scrape a page and print the data extracted into a Zod schema.
async function a() {
  const app = new FirecrawlApp({
    apiKey: "fc-YOUR_API_KEY",
  });

  // Define schema to extract contents into
  // NOTE(review): the schema describes Hacker News stories, but the URL scraped
  // below is firecrawl.dev - confirm which of the two is intended.
  const schema = z.object({
    top: z
      .array(
        z.object({
          title: z.string(),
          points: z.number(),
          by: z.string(),
          commentsURL: z.string(),
        })
      )
      .length(5)
      .describe("Top 5 stories on Hacker News"),
  });

  const scrapeResult = await app.scrapeUrl("https://firecrawl.dev", {
    extractorOptions: { extractionSchema: schema },
  });

  console.log(scrapeResult.data["llm_extraction"]);
}

// Do not leave the promise floating: report the failure and exit non-zero.
a().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

72
apps/js-sdk/tsconfig.json Normal file
View File

@ -0,0 +1,72 @@
{
"compilerOptions": {
/* Visit https://aka.ms/tsconfig.json to read more about this file */
/* Basic Options */
// "incremental": true, /* Enable incremental compilation */
"target": "es6" /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019', 'ES2020', or 'ESNEXT'. */,
"module": "commonjs" /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', 'es2020', or 'ESNext'. */,
// "lib": [], /* Specify library files to be included in the compilation. */
// "allowJs": true, /* Allow javascript files to be compiled. */
// "checkJs": true, /* Report errors in .js files. */
// "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */
"declaration": true /* Generates corresponding '.d.ts' file. */,
// "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */
// "sourceMap": true, /* Generates corresponding '.map' file. */
// "outFile": "./", /* Concatenate and emit output to single file. */
"outDir": "./build" /* Redirect output structure to the directory. */,
// "rootDir": "./", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */
// "composite": true, /* Enable project compilation */
// "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */
// "removeComments": true, /* Do not emit comments to output. */
// "noEmit": true, /* Do not emit outputs. */
// "importHelpers": true, /* Import emit helpers from 'tslib'. */
// "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */
// "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */
/* Strict Type-Checking Options */
"strict": false /* Enable all strict type-checking options. */,
// "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */
// "strictNullChecks": true, /* Enable strict null checks. */
// "strictFunctionTypes": true, /* Enable strict checking of function types. */
// "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */
// "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */
// "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */
// "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */
/* Additional Checks */
// "noUnusedLocals": true, /* Report errors on unused locals. */
// "noUnusedParameters": true, /* Report errors on unused parameters. */
// "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */
// "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */
/* Module Resolution Options */
// "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */
// "baseUrl": "./", /* Base directory to resolve non-absolute module names. */
// "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */
// "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */
// "typeRoots": [], /* List of folders to include type definitions from. */
// "types": [], /* Type declaration files to be included in compilation. */
// "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */
"resolveJsonModule": true,
"esModuleInterop": true /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */,
// "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
/* Source Map Options */
// "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
// "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */
// "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */
/* Experimental Options */
// "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */
// "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */
/* Advanced Options */
"skipLibCheck": true /* Skip type checking of declaration files. */,
"forceConsistentCasingInFileNames": true /* Disallow inconsistently-cased references to the same file. */
},
"include": ["src", "test.ts"],
"exclude": ["node_modules", "**/__tests__/*"]
}

View File

@ -46,6 +46,31 @@ To scrape a single URL, use the `scrape_url` method. It takes the URL as a param
url = 'https://example.com' url = 'https://example.com'
scraped_data = app.scrape_url(url) scraped_data = app.scrape_url(url)
``` ```
### Extracting structured data from a URL
With LLM extraction, you can easily extract structured data from any URL. We also support Pydantic schemas to make this easier. Here is how to use it:
```python
class ArticleSchema(BaseModel):
title: str
points: int
by: str
commentsURL: str
class TopArticlesSchema(BaseModel):
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
data = app.scrape_url('https://news.ycombinator.com', {
'extractorOptions': {
'extractionSchema': TopArticlesSchema.model_json_schema(),
'mode': 'llm-extraction'
},
'pageOptions':{
'onlyMainContent': True
}
})
print(data["llm_extraction"])
```
### Search for a query ### Search for a query

View File

@ -1,5 +1,7 @@
import os import os
from typing import Any, Dict, Optional
import requests import requests
import time
class FirecrawlApp: class FirecrawlApp:
def __init__(self, api_key=None): def __init__(self, api_key=None):
@ -7,26 +9,45 @@ class FirecrawlApp:
if self.api_key is None: if self.api_key is None:
raise ValueError('No API key provided') raise ValueError('No API key provided')
def scrape_url(self, url, params=None):
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
headers = { headers = {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}' 'Authorization': f'Bearer {self.api_key}'
} }
json_data = {'url': url} # Prepare the base scrape parameters with the URL
scrape_params = {'url': url}
# If there are additional params, process them
if params: if params:
json_data.update(params) # Initialize extractorOptions if present
extractor_options = params.get('extractorOptions', {})
# Check and convert the extractionSchema if it's a Pydantic model
if 'extractionSchema' in extractor_options:
if hasattr(extractor_options['extractionSchema'], 'schema'):
extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
# Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
# Update the scrape_params with the processed extractorOptions
scrape_params['extractorOptions'] = extractor_options
# Include any other params directly at the top level of scrape_params
for key, value in params.items():
if key != 'extractorOptions':
scrape_params[key] = value
# Make the POST request with the prepared headers and JSON data
response = requests.post( response = requests.post(
'https://api.firecrawl.dev/v0/scrape', 'https://api.firecrawl.dev/v0/scrape',
headers=headers, headers=headers,
json=json_data json=scrape_params
) )
if response.status_code == 200: if response.status_code == 200:
response = response.json() response = response.json()
if response['success'] == True: if response['success']:
return response['data'] return response['data']
else: else:
raise Exception(f'Failed to scrape URL. Error: {response["error"]}') raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
elif response.status_code in [402, 409, 500]: elif response.status_code in [402, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred') error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
@ -88,11 +109,23 @@ class FirecrawlApp:
'Authorization': f'Bearer {self.api_key}' 'Authorization': f'Bearer {self.api_key}'
} }
def _post_request(self, url, data, headers): def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
return requests.post(url, headers=headers, json=data) for attempt in range(retries):
response = requests.post(url, headers=headers, json=data)
if response.status_code == 502:
time.sleep(backoff_factor * (2 ** attempt))
else:
return response
return response
def _get_request(self, url, headers): def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
return requests.get(url, headers=headers) for attempt in range(retries):
response = requests.get(url, headers=headers)
if response.status_code == 502:
time.sleep(backoff_factor * (2 ** attempt))
else:
return response
return response
def _monitor_job_status(self, job_id, headers, timeout): def _monitor_job_status(self, job_id, headers, timeout):
import time import time

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,9 +1,10 @@
from firecrawl import FirecrawlApp from firecrawl import FirecrawlApp
app = FirecrawlApp(api_key="YOUR_API_KEY") app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}) crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
print(crawl_result[0]['markdown']) print(crawl_result[0]['markdown'])
job_id = crawl_result['jobId'] job_id = crawl_result['jobId']
@ -11,3 +12,26 @@ print(job_id)
status = app.check_crawl_status(job_id) status = app.check_crawl_status(job_id)
print(status) print(status)
from pydantic import BaseModel, Field
from typing import List, Optional
class ArticleSchema(BaseModel):
title: str
points: int
by: str
commentsURL: str
class TopArticlesSchema(BaseModel):
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
a = app.scrape_url('https://news.ycombinator.com', {
'extractorOptions': {
'extractionSchema': TopArticlesSchema.model_json_schema(),
'mode': 'llm-extraction'
},
'pageOptions':{
'onlyMainContent': True
}
})

View File

@ -1,4 +1,5 @@
import os import os
from typing import Any, Dict, Optional
import requests import requests
import time import time
@ -8,26 +9,45 @@ class FirecrawlApp:
if self.api_key is None: if self.api_key is None:
raise ValueError('No API key provided') raise ValueError('No API key provided')
def scrape_url(self, url, params=None):
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
headers = { headers = {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}' 'Authorization': f'Bearer {self.api_key}'
} }
json_data = {'url': url} # Prepare the base scrape parameters with the URL
scrape_params = {'url': url}
# If there are additional params, process them
if params: if params:
json_data.update(params) # Initialize extractorOptions if present
extractor_options = params.get('extractorOptions', {})
# Check and convert the extractionSchema if it's a Pydantic model
if 'extractionSchema' in extractor_options:
if hasattr(extractor_options['extractionSchema'], 'schema'):
extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
# Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
# Update the scrape_params with the processed extractorOptions
scrape_params['extractorOptions'] = extractor_options
# Include any other params directly at the top level of scrape_params
for key, value in params.items():
if key != 'extractorOptions':
scrape_params[key] = value
# Make the POST request with the prepared headers and JSON data
response = requests.post( response = requests.post(
'https://api.firecrawl.dev/v0/scrape', 'https://api.firecrawl.dev/v0/scrape',
headers=headers, headers=headers,
json=json_data json=scrape_params
) )
if response.status_code == 200: if response.status_code == 200:
response = response.json() response = response.json()
if response['success'] == True: if response['success']:
return response['data'] return response['data']
else: else:
raise Exception(f'Failed to scrape URL. Error: {response["error"]}') raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
elif response.status_code in [402, 409, 500]: elif response.status_code in [402, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred') error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')

View File

@ -1,6 +1,6 @@
Metadata-Version: 2.1 Metadata-Version: 2.1
Name: firecrawl-py Name: firecrawl-py
Version: 0.0.6 Version: 0.0.8
Summary: Python SDK for Firecrawl API Summary: Python SDK for Firecrawl API
Home-page: https://github.com/mendableai/firecrawl Home-page: https://github.com/mendableai/firecrawl
Author: Mendable.ai Author: Mendable.ai

View File

@ -2,12 +2,12 @@ from setuptools import setup, find_packages
setup( setup(
name='firecrawl-py', name='firecrawl-py',
version='0.0.6', version='0.0.8',
url='https://github.com/mendableai/firecrawl', url='https://github.com/mendableai/firecrawl',
author='Mendable.ai', author='Mendable.ai',
author_email='nick@mendable.ai', author_email='nick@mendable.ai',
description='Python SDK for Firecrawl API', description='Python SDK for Firecrawl API',
packages=find_packages(), packages=find_packages(),
install_requires=[ install_requires=[
'requests', 'requests',
], ],