Merge branch 'main' into feat/maxDepthRelative
Commit 2c5f5c0ea2
@@ -61,6 +61,13 @@
 "description": "Wait x amount of milliseconds for the page to load to fetch content",
 "default": 0
 },
+"removeTags": {
+"type": "array",
+"items": {
+"type": "string"
+},
+"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+},
 "headers": {
 "type": "object",
 "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
@@ -194,6 +201,11 @@
 "type": "integer",
 "description": "Maximum number of pages to crawl",
 "default": 10000
+},
+"allowBackwardCrawling": {
+"type": "boolean",
+"description": "Allow backward crawling (crawl from the base URL to the previous URLs)",
+"default": false
 }
 }
 },
@@ -219,6 +231,13 @@
 "type": "object",
 "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
 },
+"removeTags": {
+"type": "array",
+"items": {
+"type": "string"
+},
+"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+},
 "replaceAllPathsWithAbsolutePaths": {
 "type": "boolean",
 "description": "Replace all relative paths with absolute paths for images and links",
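A minimal usage sketch of the new page options above. The /v0/scrape path matches the tests later in this diff; the API host and the FIRECRAWL_API_KEY variable are assumptions for illustration only:

// Hedged sketch: sends the new removeTags and headers page options to the scrape endpoint.
const res = await fetch("https://api.firecrawl.dev/v0/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`, // assumed env var
  },
  body: JSON.stringify({
    url: "https://example.com",
    pageOptions: {
      removeTags: [".nav", "#footer", ".ad"],     // selectors stripped before conversion
      headers: { "User-Agent": "MyCrawler/1.0" }, // forwarded with the page request
    },
  }),
});
const { data } = await res.json();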
@@ -492,7 +511,7 @@
 "html": {
 "type": "string",
 "nullable": true,
 "description": "Raw HTML content of the page if `includeHtml` is true"
 },
 "metadata": {
 "type": "object",
@@ -507,9 +526,126 @@
 "type": "string",
 "nullable": true
 },
+"keywords": {
+"type": "string",
+"nullable": true
+},
+"robots": {
+"type": "string",
+"nullable": true
+},
+"ogTitle": {
+"type": "string",
+"nullable": true
+},
+"ogDescription": {
+"type": "string",
+"nullable": true
+},
+"ogUrl": {
+"type": "string",
+"format": "uri",
+"nullable": true
+},
+"ogImage": {
+"type": "string",
+"nullable": true
+},
+"ogAudio": {
+"type": "string",
+"nullable": true
+},
+"ogDeterminer": {
+"type": "string",
+"nullable": true
+},
+"ogLocale": {
+"type": "string",
+"nullable": true
+},
+"ogLocaleAlternate": {
+"type": "array",
+"items": {
+"type": "string"
+},
+"nullable": true
+},
+"ogSiteName": {
+"type": "string",
+"nullable": true
+},
+"ogVideo": {
+"type": "string",
+"nullable": true
+},
+"dctermsCreated": {
+"type": "string",
+"nullable": true
+},
+"dcDateCreated": {
+"type": "string",
+"nullable": true
+},
+"dcDate": {
+"type": "string",
+"nullable": true
+},
+"dctermsType": {
+"type": "string",
+"nullable": true
+},
+"dcType": {
+"type": "string",
+"nullable": true
+},
+"dctermsAudience": {
+"type": "string",
+"nullable": true
+},
+"dctermsSubject": {
+"type": "string",
+"nullable": true
+},
+"dcSubject": {
+"type": "string",
+"nullable": true
+},
+"dcDescription": {
+"type": "string",
+"nullable": true
+},
+"dctermsKeywords": {
+"type": "string",
+"nullable": true
+},
+"modifiedTime": {
+"type": "string",
+"nullable": true
+},
+"publishedTime": {
+"type": "string",
+"nullable": true
+},
+"articleTag": {
+"type": "string",
+"nullable": true
+},
+"articleSection": {
+"type": "string",
+"nullable": true
+},
 "sourceURL": {
 "type": "string",
 "format": "uri"
+},
+"pageStatusCode": {
+"type": "integer",
+"description": "The status code of the page"
+},
+"pageError": {
+"type": "string",
+"nullable": true,
+"description": "The error message of the page"
 }
 }
 },
@@ -558,9 +694,126 @@
 "type": "string",
 "nullable": true
 },
+"keywords": {
+"type": "string",
+"nullable": true
+},
+"robots": {
+"type": "string",
+"nullable": true
+},
+"ogTitle": {
+"type": "string",
+"nullable": true
+},
+"ogDescription": {
+"type": "string",
+"nullable": true
+},
+"ogUrl": {
+"type": "string",
+"format": "uri",
+"nullable": true
+},
+"ogImage": {
+"type": "string",
+"nullable": true
+},
+"ogAudio": {
+"type": "string",
+"nullable": true
+},
+"ogDeterminer": {
+"type": "string",
+"nullable": true
+},
+"ogLocale": {
+"type": "string",
+"nullable": true
+},
+"ogLocaleAlternate": {
+"type": "array",
+"items": {
+"type": "string"
+},
+"nullable": true
+},
+"ogSiteName": {
+"type": "string",
+"nullable": true
+},
+"ogVideo": {
+"type": "string",
+"nullable": true
+},
+"dctermsCreated": {
+"type": "string",
+"nullable": true
+},
+"dcDateCreated": {
+"type": "string",
+"nullable": true
+},
+"dcDate": {
+"type": "string",
+"nullable": true
+},
+"dctermsType": {
+"type": "string",
+"nullable": true
+},
+"dcType": {
+"type": "string",
+"nullable": true
+},
+"dctermsAudience": {
+"type": "string",
+"nullable": true
+},
+"dctermsSubject": {
+"type": "string",
+"nullable": true
+},
+"dcSubject": {
+"type": "string",
+"nullable": true
+},
+"dcDescription": {
+"type": "string",
+"nullable": true
+},
+"dctermsKeywords": {
+"type": "string",
+"nullable": true
+},
+"modifiedTime": {
+"type": "string",
+"nullable": true
+},
+"publishedTime": {
+"type": "string",
+"nullable": true
+},
+"articleTag": {
+"type": "string",
+"nullable": true
+},
+"articleSection": {
+"type": "string",
+"nullable": true
+},
 "sourceURL": {
 "type": "string",
 "format": "uri"
+},
+"pageStatusCode": {
+"type": "integer",
+"description": "The status code of the page"
+},
+"pageError": {
+"type": "string",
+"nullable": true,
+"description": "The error message of the page"
 }
 }
 }
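The two response-schema blocks above describe the same page metadata object for scrape and crawl results. A hedged TypeScript sketch of the shape they imply (sketch only, not a type exported by the repository):

// Field names mirror the OpenAPI additions above; the omitted og*/dc*/article* fields follow the same string-or-null pattern.
interface PageMetadataSketch {
  keywords?: string | null;
  robots?: string | null;
  ogTitle?: string | null;
  ogLocaleAlternate?: string[] | null;
  sourceURL: string;          // format: uri
  pageStatusCode?: number;    // HTTP status code of the scraped page
  pageError?: string | null;  // error message when the page failed
}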
@@ -83,6 +83,8 @@ describe("E2E Tests for API Routes", () => {
 expect(response.body.data).toHaveProperty("metadata");
 expect(response.body.data).not.toHaveProperty("html");
 expect(response.body.data.content).toContain("_Roast_");
+expect(response.body.data.metadata.pageStatusCode).toBe(200);
+expect(response.body.data.metadata.pageError).toBeUndefined();
 }, 30000); // 30 seconds timeout

 it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => {
@@ -103,6 +105,8 @@ describe("E2E Tests for API Routes", () => {
 expect(response.body.data.content).toContain("_Roast_");
 expect(response.body.data.markdown).toContain("_Roast_");
 expect(response.body.data.html).toContain("<h1");
+expect(response.body.data.metadata.pageStatusCode).toBe(200);
+expect(response.body.data.metadata.pageError).toBeUndefined();
 }, 30000); // 30 seconds timeout

 it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
@@ -118,6 +122,8 @@ describe("E2E Tests for API Routes", () => {
 expect(response.body.data).toHaveProperty('content');
 expect(response.body.data).toHaveProperty('metadata');
 expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+expect(response.body.data.metadata.pageStatusCode).toBe(200);
+expect(response.body.data.metadata.pageError).toBeUndefined();
 }, 60000); // 60 seconds

 it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
@@ -133,8 +139,59 @@ describe("E2E Tests for API Routes", () => {
 expect(response.body.data).toHaveProperty('content');
 expect(response.body.data).toHaveProperty('metadata');
 expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+expect(response.body.data.metadata.pageStatusCode).toBe(200);
+expect(response.body.data.metadata.pageError).toBeUndefined();
 }, 60000); // 60 seconds

+it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+const response = await request(TEST_URL)
+.post('/v0/scrape')
+.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+.set('Content-Type', 'application/json')
+.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+await new Promise((r) => setTimeout(r, 6000));
+
+expect(response.statusCode).toBe(200);
+expect(response.body).toHaveProperty('data');
+expect(response.body.data).toHaveProperty('content');
+expect(response.body.data).toHaveProperty('metadata');
+expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
+}, 60000); // 60 seconds
+
+it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
+const responseWithoutRemoveTags = await request(TEST_URL)
+.post("/v0/scrape")
+.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+.set("Content-Type", "application/json")
+.send({ url: "https://www.scrapethissite.com/" });
+expect(responseWithoutRemoveTags.statusCode).toBe(200);
+expect(responseWithoutRemoveTags.body).toHaveProperty("data");
+expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
+expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
+expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
+expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
+expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
+expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
+expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
+expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
+
+const response = await request(TEST_URL)
+.post("/v0/scrape")
+.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+.set("Content-Type", "application/json")
+.send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
+expect(response.statusCode).toBe(200);
+expect(response.body).toHaveProperty("data");
+expect(response.body.data).toHaveProperty("content");
+expect(response.body.data).toHaveProperty("markdown");
+expect(response.body.data).toHaveProperty("metadata");
+expect(response.body.data).not.toHaveProperty("html");
+expect(response.body.data.content).toContain("Scrape This Site");
+expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
+expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
+expect(response.body.data.content).not.toContain("web scraping"); // strong
+}, 30000); // 30 seconds timeout
+
 // TODO: add this test back once we nail the waitFor option to be more deterministic
 // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
 // const startTime = Date.now();
@@ -155,6 +212,102 @@ describe("E2E Tests for API Routes", () => {
 // expect(response.body.data.content).toContain("🔥 Firecrawl");
 // expect(duration).toBeGreaterThanOrEqual(7000);
 // }, 12000); // 12 seconds timeout
+
+it.concurrent('should return a successful response for a scrape with 400 page', async () => {
+const response = await request(TEST_URL)
+.post('/v0/scrape')
+.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+.set('Content-Type', 'application/json')
+.send({ url: 'https://httpstat.us/400' });
+await new Promise((r) => setTimeout(r, 5000));
+
+expect(response.statusCode).toBe(200);
+expect(response.body).toHaveProperty('data');
+expect(response.body.data).toHaveProperty('content');
+expect(response.body.data).toHaveProperty('metadata');
+expect(response.body.data.metadata.pageStatusCode).toBe(400);
+expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request");
+}, 60000); // 60 seconds
+
+it.concurrent('should return a successful response for a scrape with 401 page', async () => {
+const response = await request(TEST_URL)
+.post('/v0/scrape')
+.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+.set('Content-Type', 'application/json')
+.send({ url: 'https://httpstat.us/401' });
+await new Promise((r) => setTimeout(r, 5000));
+
+expect(response.statusCode).toBe(200);
+expect(response.body).toHaveProperty('data');
+expect(response.body.data).toHaveProperty('content');
+expect(response.body.data).toHaveProperty('metadata');
+expect(response.body.data.metadata.pageStatusCode).toBe(401);
+expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized");
+}, 60000); // 60 seconds
+
+it.concurrent("should return a successful response for a scrape with 403 page", async () => {
+const response = await request(TEST_URL)
+.post('/v0/scrape')
+.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+.set('Content-Type', 'application/json')
+.send({ url: 'https://httpstat.us/403' });
+
+await new Promise((r) => setTimeout(r, 5000));
+expect(response.statusCode).toBe(200);
+expect(response.body).toHaveProperty('data');
+expect(response.body.data).toHaveProperty('content');
+expect(response.body.data).toHaveProperty('metadata');
+expect(response.body.data.metadata.pageStatusCode).toBe(403);
+expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden");
+}, 60000); // 60 seconds
+
+it.concurrent('should return a successful response for a scrape with 404 page', async () => {
+const response = await request(TEST_URL)
+.post('/v0/scrape')
+.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+.set('Content-Type', 'application/json')
+.send({ url: 'https://httpstat.us/404' });
+await new Promise((r) => setTimeout(r, 5000));
+
+expect(response.statusCode).toBe(200);
+expect(response.body).toHaveProperty('data');
+expect(response.body.data).toHaveProperty('content');
+expect(response.body.data).toHaveProperty('metadata');
+expect(response.body.data.metadata.pageStatusCode).toBe(404);
+expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found");
+}, 60000); // 60 seconds
+
+it.concurrent('should return a successful response for a scrape with 405 page', async () => {
+const response = await request(TEST_URL)
+.post('/v0/scrape')
+.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+.set('Content-Type', 'application/json')
+.send({ url: 'https://httpstat.us/405' });
+await new Promise((r) => setTimeout(r, 5000));
+
+expect(response.statusCode).toBe(200);
+expect(response.body).toHaveProperty('data');
+expect(response.body.data).toHaveProperty('content');
+expect(response.body.data).toHaveProperty('metadata');
+expect(response.body.data.metadata.pageStatusCode).toBe(405);
+expect(response.body.data.metadata.pageError.toLowerCase()).toContain("method not allowed");
+}, 60000); // 60 seconds
+
+it.concurrent('should return a successful response for a scrape with 500 page', async () => {
+const response = await request(TEST_URL)
+.post('/v0/scrape')
+.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+.set('Content-Type', 'application/json')
+.send({ url: 'https://httpstat.us/500' });
+await new Promise((r) => setTimeout(r, 5000));
+
+expect(response.statusCode).toBe(200);
+expect(response.body).toHaveProperty('data');
+expect(response.body.data).toHaveProperty('content');
+expect(response.body.data).toHaveProperty('metadata');
+expect(response.body.data.metadata.pageStatusCode).toBe(500);
+expect(response.body.data.metadata.pageError.toLowerCase()).toContain("internal server error");
+}, 60000); // 60 seconds
 });

 describe("POST /v0/crawl", () => {
@@ -270,6 +423,8 @@ describe("E2E Tests for API Routes", () => {
 expect(completedResponse.body.data[0]).toHaveProperty("markdown");
 expect(completedResponse.body.data[0]).toHaveProperty("metadata");
 expect(completedResponse.body.data[0].content).toContain("Mendable");
+expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
 }, 60000); // 60 seconds

 it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => {
@@ -351,6 +506,8 @@ describe("E2E Tests for API Routes", () => {
 expect(completedResponse.body.data[0]).toHaveProperty("markdown");
 expect(completedResponse.body.data[0]).toHaveProperty("metadata");
 expect(completedResponse.body.data[0].content).toContain("Mendable");
+expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
 }, 60000); // 60 seconds

 it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
@@ -393,6 +550,8 @@ describe("E2E Tests for API Routes", () => {
 expect(completedResponse.body.data[0]).toHaveProperty("content");
 expect(completedResponse.body.data[0]).toHaveProperty("markdown");
 expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
 const urls = completedResponse.body.data.map(
 (item: any) => item.metadata?.sourceURL
 );
@@ -651,6 +810,8 @@ describe("E2E Tests for API Routes", () => {
 expect(completedResponse.body.data[0]).toHaveProperty("content");
 expect(completedResponse.body.data[0]).toHaveProperty("markdown");
 expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();

 // 120 seconds
 expect(completedResponse.body.data[0]).toHaveProperty("html");
@@ -658,7 +819,11 @@ describe("E2E Tests for API Routes", () => {
 expect(completedResponse.body.data[0].content).toContain("_Roast_");
 expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
 expect(completedResponse.body.data[0].html).toContain("<h1");
+
+expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
 }, 180000);

 });

 describe("POST /v0/crawlWebsitePreview", () => {
@@ -792,6 +957,8 @@ describe("E2E Tests for API Routes", () => {
 expect(completedResponse.body.data[0]).toHaveProperty("markdown");
 expect(completedResponse.body.data[0]).toHaveProperty("metadata");
 expect(completedResponse.body.data[0].content).toContain("Mendable");
+expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();

 const childrenLinks = completedResponse.body.data.filter(doc =>
 doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
@@ -835,8 +1002,13 @@ describe("E2E Tests for API Routes", () => {
 })
 ])
 );
+
+expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
 }, 180000); // 120 seconds


 it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
 const crawlResponse = await request(TEST_URL)
 .post("/v0/crawl")
@@ -870,6 +1042,9 @@ describe("E2E Tests for API Routes", () => {
 expect(completedResponse.body.data[0]).toHaveProperty("content");
 expect(completedResponse.body.data[0]).toHaveProperty("markdown");
 expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
+
 const urls = completedResponse.body.data.map(
 (item: any) => item.metadata?.sourceURL
 );
@@ -930,6 +1105,8 @@ describe("E2E Tests for API Routes", () => {
 expect(completedResponse.body.data[0].content).toContain("_Roast_");
 expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
 expect(completedResponse.body.data[0].html).toContain("<h1");
+expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
 }, 60000);
 }); // 60 seconds

@@ -973,6 +1150,8 @@ describe("E2E Tests for API Routes", () => {
 expect(completedResponse.body.data[0]).toHaveProperty("html");
 expect(completedResponse.body.data[0].content).toContain("Mendable");
 expect(completedResponse.body.data[0].markdown).toContain("Mendable");
+expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();

 const onlyChildrenLinks = completedResponse.body.data.filter(doc => {
 return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
@@ -1013,7 +1192,8 @@ describe("E2E Tests for API Routes", () => {
 expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
 expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
 expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata");
+expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200);
+expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined();
 }, 60000); // 60 seconds

 describe("POST /v0/scrape with LLM Extraction", () => {
@@ -1168,6 +1348,10 @@ describe("E2E Tests for API Routes", () => {
 expect(statusResponse.body).toHaveProperty("data");
 expect(statusResponse.body.data[0]).toHaveProperty("content");
 expect(statusResponse.body.data[0]).toHaveProperty("markdown");
+expect(statusResponse.body.data[0]).toHaveProperty("metadata");
+expect(statusResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+expect(statusResponse.body.data[0].metadata.pageError).toBeUndefined();
+
 const results = statusResponse.body.data;
 // results.forEach((result, i) => {
 // console.log(result.metadata.sourceURL);
@@ -55,8 +55,16 @@ export async function crawlController(req: Request, res: Response) {
 }

 const mode = req.body.mode ?? "crawl";
-const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
-const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+const crawlerOptions = req.body.crawlerOptions ?? {
+allowBackwardCrawling: false
+};
+const pageOptions = req.body.pageOptions ?? {
+onlyMainContent: false,
+includeHtml: false,
+removeTags: [],
+parsePDF: true
+};

 if (mode === "single_urls" && !url.includes(",")) {
 try {
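Because `??` only applies when `req.body.pageOptions` is absent altogether, the new removeTags/parsePDF defaults are not merged into a partially supplied object. A small behaviour sketch (illustrative, not repository code; resolvePageOptions is a hypothetical helper name):

type PageOpts = { onlyMainContent?: boolean; includeHtml?: boolean; removeTags?: string[]; parsePDF?: boolean };
function resolvePageOptions(fromBody?: PageOpts): PageOpts {
  // Same nullish-coalescing pattern as the controller above.
  return fromBody ?? { onlyMainContent: false, includeHtml: false, removeTags: [], parsePDF: true };
}
resolvePageOptions();                      // no pageOptions in the body -> full defaults, parsePDF === true
resolvePageOptions({ includeHtml: true }); // partial object wins as-is -> parsePDF stays undefined downstream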
@@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {

 const mode = req.body.mode ?? "crawl";
 const crawlerOptions = req.body.crawlerOptions ?? {};
-const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };

 const job = await addWebScraperJob({
 url: url,
@@ -61,7 +61,7 @@ export async function scrapeHelper(
 (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
 );
 if (filteredDocs.length === 0) {
-return { success: true, error: "No page found", returnCode: 200 };
+return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
 }

 let creditsToBeBilled = filteredDocs.length;
@@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
 return res.status(status).json({ error });
 }
 const crawlerOptions = req.body.crawlerOptions ?? {};
-const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
+const pageOptions = req.body.pageOptions ?? {
+onlyMainContent: false,
+includeHtml: false,
+waitFor: 0,
+screenshot: false,
+parsePDF: true
+};
 const extractorOptions = req.body.extractorOptions ?? {
 mode: "markdown"
 }
@@ -85,6 +85,7 @@ export async function searchHelper(
 onlyMainContent: pageOptions?.onlyMainContent ?? true,
 fetchPageContent: pageOptions?.fetchPageContent ?? true,
 includeHtml: pageOptions?.includeHtml ?? false,
+removeTags: pageOptions?.removeTags ?? [],
 fallback: false,
 },
 });
@@ -100,7 +101,7 @@ export async function searchHelper(
 );

 if (filteredDocs.length === 0) {
-return { success: true, error: "No page found", returnCode: 200 };
+return { success: true, error: "No page found", returnCode: 200, data: docs };
 }

 const billingResult = await billTeam(
@@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) {
 includeHtml: false,
 onlyMainContent: true,
 fetchPageContent: true,
+removeTags: [],
 fallback: false,
 };
 const origin = req.body.origin ?? "api";
@@ -19,6 +19,8 @@ export type PageOptions = {
 screenshot?: boolean;
 headers?: Record<string, string>;
 replaceAllPathsWithAbsolutePaths?: boolean;
+parsePDF?: boolean;
+removeTags?: string | string[];
 };

 export type ExtractorOptions = {
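An illustrative PageOptions value exercising the two new fields (a sketch; onlyMainContent and includeHtml are pre-existing members of the same type):

// removeTags accepts either a comma-separated string or an array of selectors.
const examplePageOptions: PageOptions = {
  onlyMainContent: true,
  includeHtml: false,
  parsePDF: false,                          // keep the raw PDF body instead of extracted text
  removeTags: ["script", ".ad", "#footer"], // equivalently: "script, .ad, #footer"
};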
@ -119,4 +121,7 @@ export class SearchResult {
|
|||||||
export interface FireEngineResponse {
|
export interface FireEngineResponse {
|
||||||
html: string;
|
html: string;
|
||||||
screenshot: string;
|
screenshot: string;
|
||||||
|
pageStatusCode?: number;
|
||||||
|
pageError?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -223,7 +223,7 @@ export class WebCrawler {
|
|||||||
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
|
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
|
||||||
}
|
}
|
||||||
|
|
||||||
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
|
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
|
||||||
const normalizedUrl = this.normalizeCrawlUrl(url);
|
const normalizedUrl = this.normalizeCrawlUrl(url);
|
||||||
if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
|
if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
|
||||||
return [];
|
return [];
|
||||||
@ -243,20 +243,27 @@ export class WebCrawler {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
let content: string = "";
|
let content: string = "";
|
||||||
|
let pageStatusCode: number;
|
||||||
|
let pageError: string | undefined = undefined;
|
||||||
|
|
||||||
// If it is the first link, fetch with single url
|
// If it is the first link, fetch with single url
|
||||||
if (this.visited.size === 1) {
|
if (this.visited.size === 1) {
|
||||||
const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
|
const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
|
||||||
content = page.html ?? "";
|
content = page.html ?? "";
|
||||||
|
pageStatusCode = page.metadata?.pageStatusCode;
|
||||||
|
pageError = page.metadata?.pageError || undefined;
|
||||||
} else {
|
} else {
|
||||||
const response = await axios.get(url);
|
const response = await axios.get(url);
|
||||||
content = response.data ?? "";
|
content = response.data ?? "";
|
||||||
|
pageStatusCode = response.status;
|
||||||
|
pageError = response.statusText != "OK" ? response.statusText : undefined;
|
||||||
}
|
}
|
||||||
const $ = load(content);
|
const $ = load(content);
|
||||||
let links: { url: string, html: string }[] = [];
|
let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];
|
||||||
|
|
||||||
// Add the initial URL to the list of links
|
// Add the initial URL to the list of links
|
||||||
if (this.visited.size === 1) {
|
if (this.visited.size === 1) {
|
||||||
links.push({ url, html: content });
|
links.push({ url, html: content, pageStatusCode, pageError });
|
||||||
}
|
}
|
||||||
|
|
||||||
$("a").each((_, element) => {
|
$("a").each((_, element) => {
|
||||||
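With this change each crawled link carries its fetch status alongside the raw HTML. A sketch of the element type crawl() now returns (values illustrative, not a named type in the repository):

type CrawledLink = { url: string; html: string; pageStatusCode?: number; pageError?: string };
// A successfully fetched first page might look like:
const example: CrawledLink = { url: "https://example.com", html: "<!doctype html>...", pageStatusCode: 200 };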
@@ -278,7 +285,7 @@ export class WebCrawler {
 !this.matchesExcludes(path) &&
 this.robots.isAllowed(fullUrl, "FireCrawlAgent")
 ) {
-links.push({ url: fullUrl, html: content });
+links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
 }
 }
 });
@@ -1,5 +1,3 @@
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-
 export async function handleCustomScraping(
 text: string,
 url: string
@@ -245,7 +245,7 @@ export class WebScraperDataProvider {
 content: "",
 html: this.pageOptions?.includeHtml ? "" : undefined,
 markdown: "",
-metadata: { sourceURL: url },
+metadata: { sourceURL: url, pageStatusCode: 200 },
 }));
 }

@@ -284,10 +284,10 @@ export class WebScraperDataProvider {
 private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
 return Promise.all(
 pdfLinks.map(async (pdfLink) => {
-const pdfContent = await fetchAndProcessPdf(pdfLink);
+const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
 return {
-content: pdfContent,
-metadata: { sourceURL: pdfLink },
+content: content,
+metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
 provider: "web-scraper",
 };
 })
@@ -296,10 +296,10 @@ export class WebScraperDataProvider {
 private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
 return Promise.all(
 docxLinks.map(async (p) => {
-const docXDocument = await fetchAndProcessDocx(p);
+const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(p);
 return {
-content: docXDocument,
-metadata: { sourceURL: p },
+content,
+metadata: { sourceURL: p, pageStatusCode, pageError },
 provider: "web-scraper",
 };
 })
@@ -479,7 +479,13 @@ export class WebScraperDataProvider {
 this.limit = options.crawlerOptions?.limit ?? 10000;
 this.generateImgAltText =
 options.crawlerOptions?.generateImgAltText ?? false;
-this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
+this.pageOptions = options.pageOptions ?? {
+onlyMainContent: false,
+includeHtml: false,
+replaceAllPathsWithAbsolutePaths: false,
+parsePDF: true,
+removeTags: []
+};
 this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
 this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
 //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
@@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
 url: string,
 waitFor: number = 0,
 screenshot: boolean = false,
-pageOptions: { scrollXPaths?: string[] } = {},
+pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
 headers?: Record<string, string>,
 options?: any
 ): Promise<FireEngineResponse> {
@@ -83,17 +83,18 @@ export async function scrapWithFireEngine(
 console.error(
 `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
 );
-return { html: "", screenshot: "" };
+return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
 }

 const contentType = response.headers["content-type"];
 if (contentType && contentType.includes("application/pdf")) {
-return { html: await fetchAndProcessPdf(url), screenshot: "" };
+const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+return { html: content, screenshot: "", pageStatusCode, pageError };
 } else {
 const data = response.data;
 const html = data.content;
 const screenshot = data.screenshot;
-return { html: html ?? "", screenshot: screenshot ?? "" };
+return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
 }
 } catch (error) {
 if (error.code === 'ECONNABORTED') {
@@ -108,44 +109,51 @@ export async function scrapWithFireEngine(
 export async function scrapWithScrapingBee(
 url: string,
 wait_browser: string = "domcontentloaded",
-timeout: number = universalTimeout
-): Promise<string> {
+timeout: number = universalTimeout,
+pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
 try {
 const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
 const clientParams = await generateRequestParams(
 url,
 wait_browser,
-timeout
+timeout,
 );

-const response = await client.get(clientParams);
-
-if (response.status !== 200 && response.status !== 404) {
-console.error(
-`[ScrapingBee] Error fetching url: ${url} with status code ${response.status}`
-);
-return "";
-}
+const response = await client.get({
+...clientParams,
+params: {
+...clientParams.params,
+'transparent_status_code': 'True'
+}
+});

 const contentType = response.headers["content-type"];
 if (contentType && contentType.includes("application/pdf")) {
-return fetchAndProcessPdf(url);
+return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+
 } else {
-const decoder = new TextDecoder();
-const text = decoder.decode(response.data);
-return text;
+let text = "";
+try {
+const decoder = new TextDecoder();
+text = decoder.decode(response.data);
+} catch (decodeError) {
+console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`);
+}
+return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined };
 }
 } catch (error) {
 console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
-return "";
+return { content: "", pageStatusCode: error.response.status, pageError: error.response.statusText };
 }
 }

 export async function scrapWithPlaywright(
 url: string,
 waitFor: number = 0,
-headers?: Record<string, string>
-): Promise<string> {
+headers?: Record<string, string>,
+pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
 try {
 const reqParams = await generateRequestParams(url);
 // If the user has passed a wait parameter in the request, use that
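transparent_status_code is, as far as I can tell, the ScrapingBee request parameter that passes the target page's own HTTP status through instead of masking failures, which is what lets response.status be reported as pageStatusCode here. A hedged sketch of consuming the new return shape (the URL is illustrative):

// Sketch only: how a caller might use the { content, pageStatusCode, pageError } result.
const { content, pageStatusCode, pageError } = await scrapWithScrapingBee("https://example.com", "domcontentloaded", 15000);
if (pageStatusCode && pageStatusCode >= 400) {
  console.warn(`[ScrapingBee] request answered ${pageStatusCode}: ${pageError ?? "no status text"}`);
}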
@@ -167,21 +175,21 @@ export async function scrapWithPlaywright(
 console.error(
 `[Playwright] Error fetching url: ${url} with status: ${response.status}`
 );
-return "";
+return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
 }

 const contentType = response.headers["content-type"];
 if (contentType && contentType.includes("application/pdf")) {
-return fetchAndProcessPdf(url);
+return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
 } else {
 const textData = response.data;
 try {
 const data = JSON.parse(textData);
 const html = data.content;
-return html ?? "";
+return { content: html ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
 } catch (jsonError) {
 console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`);
-return "";
+return { content: "" };
 }
 }
 } catch (error) {
@@ -190,11 +198,14 @@ export async function scrapWithPlaywright(
 } else {
 console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
 }
-return "";
+return { content: "" };
 }
 }

-export async function scrapWithFetch(url: string): Promise<string> {
+export async function scrapWithFetch(
+url: string,
+pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
 try {
 const response = await axios.get(url, {
 headers: {
@@ -208,15 +219,15 @@ export async function scrapWithFetch(url: string): Promise<string> {
 console.error(
 `[Axios] Error fetching url: ${url} with status: ${response.status}`
 );
-return "";
+return { content: "", pageStatusCode: response.status, pageError: response.statusText };
 }

 const contentType = response.headers["content-type"];
 if (contentType && contentType.includes("application/pdf")) {
-return fetchAndProcessPdf(url);
+return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
 } else {
 const text = response.data;
-return text;
+return { content: text, pageStatusCode: 200 };
 }
 } catch (error) {
 if (error.code === 'ECONNABORTED') {
@@ -224,7 +235,7 @@ export async function scrapWithFetch(url: string): Promise<string> {
 } else {
 console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
 }
-return "";
+return { content: "" };
 }
 }

@@ -304,6 +315,19 @@ export async function scrapSingleUrl(
 const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
 const soup = cheerio.load(html);
 soup("script, style, iframe, noscript, meta, head").remove();
+
+if (pageOptions.removeTags) {
+if (typeof pageOptions.removeTags === 'string') {
+pageOptions.removeTags.split(',').forEach((tag) => {
+soup(tag.trim()).remove();
+});
+} else if (Array.isArray(pageOptions.removeTags)) {
+pageOptions.removeTags.forEach((tag) => {
+soup(tag).remove();
+});
+}
+}

 if (pageOptions.onlyMainContent) {
 // remove any other tags that are not in the main content
 excludeNonMainTags.forEach((tag) => {
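Both accepted forms of removeTags end up removing the same cheerio selections; a small equivalence sketch (selectors illustrative):

// These two page-option values are handled by the two branches above and strip the same nodes.
const asString = { removeTags: "script, .ad, #footer" };
const asArray = { removeTags: ["script", ".ad", "#footer"] };
// In both cases soup("script"), soup(".ad") and soup("#footer") are removed before markdown conversion.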
@@ -317,7 +341,7 @@ export async function scrapSingleUrl(
     url: string,
     method: (typeof baseScrapers)[number]
   ) => {
-    let text = "";
+    let scraperResponse: { text: string, screenshot: string, metadata: { pageStatusCode?: number, pageError?: string | null } } = { text: "", screenshot: "", metadata: {} };
     let screenshot = "";
     switch (method) {
       case "fire-engine":
@@ -329,38 +353,52 @@ export async function scrapSingleUrl(
            pageOptions.screenshot,
            pageOptions.headers
          );
-          text = response.html;
+          scraperResponse.text = response.html;
-          screenshot = response.screenshot;
+          scraperResponse.screenshot = response.screenshot;
+          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+          scraperResponse.metadata.pageError = response.pageError;
         }
         break;
       case "scrapingBee":
         if (process.env.SCRAPING_BEE_API_KEY) {
-          text = await scrapWithScrapingBee(
+          const response = await scrapWithScrapingBee(
             url,
             "domcontentloaded",
             pageOptions.fallback === false ? 7000 : 15000
           );
+          scraperResponse.text = response.content;
+          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+          scraperResponse.metadata.pageError = response.pageError;
         }
         break;
       case "playwright":
         if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
-          text = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
+          const response = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
+          scraperResponse.text = response.content;
+          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+          scraperResponse.metadata.pageError = response.pageError;
         }
         break;
       case "scrapingBeeLoad":
         if (process.env.SCRAPING_BEE_API_KEY) {
-          text = await scrapWithScrapingBee(url, "networkidle2");
+          const response = await scrapWithScrapingBee(url, "networkidle2");
+          scraperResponse.text = response.content;
+          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+          scraperResponse.metadata.pageError = response.pageError;
         }
         break;
       case "fetch":
-        text = await scrapWithFetch(url);
+        const response = await scrapWithFetch(url);
+        scraperResponse.text = response.content;
+        scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+        scraperResponse.metadata.pageError = response.pageError;
         break;
     }

     let customScrapedContent : FireEngineResponse | null = null;

     // Check for custom scraping conditions
-    const customScraperResult = await handleCustomScraping(text, url);
+    const customScraperResult = await handleCustomScraping(scraperResponse.text, url);

     if (customScraperResult){
       switch (customScraperResult.scraper) {
@@ -371,23 +409,30 @@ export async function scrapSingleUrl(
           }
           break;
         case "pdf":
-          customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
+          const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF);
+          customScrapedContent = { html: content, screenshot, pageStatusCode, pageError }
           break;
       }
     }

     if (customScrapedContent) {
-      text = customScrapedContent.html;
+      scraperResponse.text = customScrapedContent.html;
       screenshot = customScrapedContent.screenshot;
     }

     //* TODO: add an optional to return markdown or structured/extracted content
-    let cleanedHtml = removeUnwantedElements(text, pageOptions);
+    let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);

-    return [await parseMarkdown(cleanedHtml), text, screenshot];
+    return {
+      text: await parseMarkdown(cleanedHtml),
+      html: scraperResponse.text,
+      screenshot: scraperResponse.screenshot,
+      pageStatusCode: scraperResponse.metadata.pageStatusCode,
+      pageError: scraperResponse.metadata.pageError || undefined
+    };
   };
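
Each scraper branch now repeats the same three assignments into scraperResponse. A sketch of a helper that could fold them together; ScraperResponse mirrors the shape introduced above, while applyScrapeResult is an assumed name that does not appear in the diff:

type ScraperResponse = {
  text: string;
  screenshot: string;
  metadata: { pageStatusCode?: number; pageError?: string | null };
};

function applyScrapeResult(
  target: ScraperResponse,
  result: { content: string; pageStatusCode?: number; pageError?: string }
): void {
  // Copy the scraped body plus the propagated status information in one place.
  target.text = result.content;
  target.metadata.pageStatusCode = result.pageStatusCode;
  target.metadata.pageError = result.pageError;
}
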
+  let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
   try {
-    let [text, html, screenshot] = ["", "", ""];
     let urlKey = urlToScrap;
     try {
       urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
@@ -410,8 +455,21 @@ export async function scrapSingleUrl(
         html = existingHtml;
         break;
       }
-      [text, html, screenshot] = await attemptScraping(urlToScrap, scraper);
+      const attempt = await attemptScraping(urlToScrap, scraper);
+      text = attempt.text ?? '';
+      html = attempt.html ?? '';
+      screenshot = attempt.screenshot ?? '';
+      if (attempt.pageStatusCode) {
+        pageStatusCode = attempt.pageStatusCode;
+      }
+      if (attempt.pageError) {
+        pageError = attempt.pageError;
+      }

       if (text && text.trim().length >= 100) break;
+      if (pageStatusCode && pageStatusCode == 404) break;
       const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
       if (nextScraperIndex < scrapersInOrder.length) {
         console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
@@ -435,6 +493,8 @@ export async function scrapSingleUrl(
           ...metadata,
           screenshot: screenshot,
           sourceURL: urlToScrap,
+          pageStatusCode: pageStatusCode,
+          pageError: pageError
         },
       };
     } else {
@@ -442,7 +502,12 @@ export async function scrapSingleUrl(
         content: text,
         markdown: text,
         html: pageOptions.includeHtml ? html : undefined,
-        metadata: { ...metadata, sourceURL: urlToScrap },
+        metadata: {
+          ...metadata,
+          sourceURL: urlToScrap,
+          pageStatusCode: pageStatusCode,
+          pageError: pageError
+        },
       };
     }

@@ -453,7 +518,11 @@ export async function scrapSingleUrl(
       content: "",
       markdown: "",
       html: "",
-      metadata: { sourceURL: urlToScrap },
+      metadata: {
+        sourceURL: urlToScrap,
+        pageStatusCode: pageStatusCode,
+        pageError: pageError
+      },
     } as Document;
   }
 }
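
The fallback loop above now stops for two reasons: enough content, or a hard 404. A condensed TypeScript sketch of that policy (Attempt and shouldStopFallback are illustrative names, not from the diff):

type Attempt = { text: string; pageStatusCode?: number };

function shouldStopFallback(attempt: Attempt): boolean {
  // A 404 is treated as final: no other scraper in the chain will recover the page.
  if (attempt.pageStatusCode === 404) return true;
  // Otherwise keep falling back until a scraper returns a reasonable amount of content.
  return attempt.text.trim().length >= 100;
}
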
@@ -3,11 +3,13 @@ import * as docxProcessor from "../docxProcessor";
 describe("DOCX Processing Module - Integration Test", () => {
   it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
     delete process.env.LLAMAPARSE_API_KEY;
-    const docxContent = await docxProcessor.fetchAndProcessDocx(
+    const { content, pageStatusCode, pageError } = await docxProcessor.fetchAndProcessDocx(
       "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
     );
-    expect(docxContent.trim()).toContain(
+    expect(content.trim()).toContain(
       "SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
     );
+    expect(pageStatusCode).toBe(200);
+    expect(pageError).toBeUndefined();
   });
 });
@@ -3,8 +3,10 @@ import * as pdfProcessor from '../pdfProcessor';
 describe('PDF Processing Module - Integration Test', () => {
   it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
     delete process.env.LLAMAPARSE_API_KEY;
-    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
-    expect(pdfContent.trim()).toEqual("Dummy PDF file");
+    const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
+    expect(content.trim()).toEqual("Dummy PDF file");
+    expect(pageStatusCode).toEqual(200);
+    expect(pageError).toBeUndefined();
   });

   // We're hitting the LLAMAPARSE rate limit 🫠
@@ -5,14 +5,14 @@ import path from "path";
 import os from "os";
 import mammoth from "mammoth";

-export async function fetchAndProcessDocx(url: string): Promise<string> {
+export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> {
-  const tempFilePath = await downloadDocx(url);
+  const { tempFilePath, pageStatusCode, pageError } = await downloadDocx(url);
   const content = await processDocxToText(tempFilePath);
   fs.unlinkSync(tempFilePath); // Clean up the temporary file
-  return content;
+  return { content, pageStatusCode, pageError };
 }

-async function downloadDocx(url: string): Promise<string> {
+async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
   const response = await axios({
     url,
     method: "GET",
@@ -25,7 +25,7 @@ async function downloadDocx(url: string): Promise<string> {
   response.data.pipe(writer);

   return new Promise((resolve, reject) => {
-    writer.on("finish", () => resolve(tempFilePath));
+    writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
     writer.on("error", reject);
   });
 }
@@ -29,6 +29,9 @@ interface Metadata {
   publishedTime?: string;
   articleTag?: string;
   articleSection?: string;
+  sourceURL?: string;
+  pageStatusCode?: number;
+  pageError?: string;
 }

 export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
@@ -61,6 +64,9 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
   let publishedTime: string | null = null;
   let articleTag: string | null = null;
   let articleSection: string | null = null;
+  let sourceURL: string | null = null;
+  let pageStatusCode: number | null = null;
+  let pageError: string | null = null;

   try {
     title = soup("title").text() || null;
@@ -132,5 +138,8 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
     ...(publishedTime ? { publishedTime } : {}),
     ...(articleTag ? { articleTag } : {}),
     ...(articleSection ? { articleSection } : {}),
+    ...(sourceURL ? { sourceURL } : {}),
+    ...(pageStatusCode ? { pageStatusCode } : {}),
+    ...(pageError ? { pageError } : {}),
   };
 }
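
The conditional spreads above only add a key when its value is truthy, so absent fields are omitted rather than serialized as null. A standalone TypeScript illustration with made-up values:

const pageStatusCode: number | null = 200;
const pageError: string | null = null;

const metadata = {
  ...(pageStatusCode ? { pageStatusCode } : {}),
  ...(pageError ? { pageError } : {}),
};
// metadata is { pageStatusCode: 200 }; pageError is left out entirely.
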
@@ -9,14 +9,14 @@ import os from "os";

 dotenv.config();

-export async function fetchAndProcessPdf(url: string): Promise<string> {
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
-  const tempFilePath = await downloadPdf(url);
+  const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
-  const content = await processPdfToText(tempFilePath);
+  const content = await processPdfToText(tempFilePath, parsePDF);
   fs.unlinkSync(tempFilePath); // Clean up the temporary file
-  return content;
+  return { content, pageStatusCode, pageError };
 }

-async function downloadPdf(url: string): Promise<string> {
+async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> {
   const response = await axios({
     url,
     method: "GET",
@@ -29,15 +29,15 @@ async function downloadPdf(url: string): Promise<string> {
   response.data.pipe(writer);

   return new Promise((resolve, reject) => {
-    writer.on("finish", () => resolve(tempFilePath));
+    writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
     writer.on("error", reject);
   });
 }

-export async function processPdfToText(filePath: string): Promise<string> {
+export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
   let content = "";

-  if (process.env.LLAMAPARSE_API_KEY) {
+  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
     const apiKey = process.env.LLAMAPARSE_API_KEY;
     const headers = {
       Authorization: `Bearer ${apiKey}`,
@@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
       console.error("Error processing pdf document w/ LlamaIndex(2)");
       content = await processPdf(filePath);
     }
-  } else {
+  } else if (parsePDF) {
     content = await processPdf(filePath);
+  } else {
+    content = fs.readFileSync(filePath, "utf-8");
   }
   return content;
 }
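
The parsePDF flag gives processPdfToText three paths. A sketch of the decision, assuming an illustrative choosePdfPath helper that is not part of the diff:

function choosePdfPath(parsePDF: boolean, hasLlamaParseKey: boolean): "llamaparse" | "local" | "raw" {
  if (!parsePDF) return "raw";                       // skip parsing, return the file contents as-is
  return hasLlamaParseKey ? "llamaparse" : "local";  // LlamaParse when a key is set, local parser otherwise
}

// Note that, per the hunk above, the LlamaParse path still falls back to the local parser if the API call fails.
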
apps/playwright-service/get_error.py (new file, 63 lines)
@@ -0,0 +1,63 @@
+def get_error(status_code: int) -> str:
+    error_messages = {
+        300: "Multiple Choices",
+        301: "Moved Permanently",
+        302: "Found",
+        303: "See Other",
+        304: "Not Modified",
+        305: "Use Proxy",
+        307: "Temporary Redirect",
+        308: "Permanent Redirect",
+        309: "Resume Incomplete",
+        310: "Too Many Redirects",
+        311: "Unavailable For Legal Reasons",
+        312: "Previously Used",
+        313: "I'm Used",
+        314: "Switch Proxy",
+        315: "Temporary Redirect",
+        316: "Resume Incomplete",
+        317: "Too Many Redirects",
+        400: "Bad Request",
+        401: "Unauthorized",
+        403: "Forbidden",
+        404: "Not Found",
+        405: "Method Not Allowed",
+        406: "Not Acceptable",
+        407: "Proxy Authentication Required",
+        408: "Request Timeout",
+        409: "Conflict",
+        410: "Gone",
+        411: "Length Required",
+        412: "Precondition Failed",
+        413: "Payload Too Large",
+        414: "URI Too Long",
+        415: "Unsupported Media Type",
+        416: "Range Not Satisfiable",
+        417: "Expectation Failed",
+        418: "I'm a teapot",
+        421: "Misdirected Request",
+        422: "Unprocessable Entity",
+        423: "Locked",
+        424: "Failed Dependency",
+        425: "Too Early",
+        426: "Upgrade Required",
+        428: "Precondition Required",
+        429: "Too Many Requests",
+        431: "Request Header Fields Too Large",
+        451: "Unavailable For Legal Reasons",
+        500: "Internal Server Error",
+        501: "Not Implemented",
+        502: "Bad Gateway",
+        503: "Service Unavailable",
+        504: "Gateway Timeout",
+        505: "HTTP Version Not Supported",
+        506: "Variant Also Negotiates",
+        507: "Insufficient Storage",
+        508: "Loop Detected",
+        510: "Not Extended",
+        511: "Network Authentication Required",
+        599: "Network Connect Timeout Error"
+    }
+    if status_code < 300:
+        return None
+    return error_messages.get(status_code, "Unknown Error")
@@ -9,6 +9,7 @@ from fastapi import FastAPI
 from fastapi.responses import JSONResponse
 from playwright.async_api import Browser, async_playwright
 from pydantic import BaseModel
+from get_error import get_error

 PROXY_SERVER = environ.get("PROXY_SERVER", None)
 PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
@@ -73,16 +74,22 @@ async def root(body: UrlModel):
     if body.headers:
         await page.set_extra_http_headers(body.headers)

-    await page.goto(
+    response = await page.goto(
         body.url,
         wait_until="load",
         timeout=body.timeout,
     )
+    page_status_code = response.status
+    page_error = get_error(page_status_code)
     # Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases were "load" / "networkidle" is not enough
     if body.wait_after_load > 0:
         await page.wait_for_timeout(body.wait_after_load)

     page_content = await page.content()
     await context.close()
-    json_compatible_item_data = {"content": page_content}
+    json_compatible_item_data = {
+        "content": page_content,
+        "pageStatusCode": page_status_code,
+        "pageError": page_error
+    }
     return JSONResponse(content=json_compatible_item_data)
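
On the TypeScript side, the playwright microservice response appears to be consumed as a { content, pageStatusCode, pageError } payload (see the scrapWithPlaywright branch earlier in this diff). A sketch of that shape; the PlaywrightPayload name is an assumption, the field names come from the diff:

interface PlaywrightPayload {
  content: string;
  pageStatusCode: number;
  pageError: string | null; // null for statuses below 300, per get_error()
}

const example: PlaywrightPayload = {
  content: "<html>...</html>",
  pageStatusCode: 404,
  pageError: "Not Found",
};
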
@@ -1,3 +1,57 @@
+"""
+This is the Firecrawl package.
+
+This package provides a Python SDK for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs.
+
+For more information visit https://github.com/firecrawl/
+"""
+
+import logging
+import os
+
 from .firecrawl import FirecrawlApp

-__version__ = "0.0.14"
+__version__ = "0.0.16"

+# Define the logger for the Firecrawl project
+logger: logging.Logger = logging.getLogger("firecrawl")
+
+
+def _basic_config() -> None:
+    """Set up basic configuration for logging with a specific format and date format."""
+    try:
+        logging.basicConfig(
+            format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+        )
+    except Exception as e:
+        logger.error("Failed to configure logging: %s", e)
+
+
+def setup_logging() -> None:
+    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
+    env = os.environ.get(
+        "FIRECRAWL_LOGGING_LEVEL", "INFO"
+    ).upper()  # Default to 'INFO' level
+    _basic_config()
+
+    if env == "DEBUG":
+        logger.setLevel(logging.DEBUG)
+    elif env == "INFO":
+        logger.setLevel(logging.INFO)
+    elif env == "WARNING":
+        logger.setLevel(logging.WARNING)
+    elif env == "ERROR":
+        logger.setLevel(logging.ERROR)
+    elif env == "CRITICAL":
+        logger.setLevel(logging.CRITICAL)
+    else:
+        logger.setLevel(logging.INFO)
+        logger.warning("Unknown logging level: %s, defaulting to INFO", env)
+
+
+# Initialize logging configuration when the module is imported
+setup_logging()
+logger.debug("Debugging logger setup")
@@ -27,14 +27,14 @@ def test_scrape_url_invalid_api_key():
     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
     with pytest.raises(Exception) as excinfo:
         invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Failed to scrape URL. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

 def test_blocklisted_url():
     blocklisted_url = "https://facebook.com/fake-test"
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
     with pytest.raises(Exception) as excinfo:
         app.scrape_url(blocklisted_url)
-    assert "Failed to scrape URL. Status code: 403" in str(excinfo.value)
+    assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)

 def test_successful_response_with_valid_preview_token():
     app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
@@ -86,14 +86,14 @@ def test_crawl_url_invalid_api_key():
     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
     with pytest.raises(Exception) as excinfo:
         invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unexpected error occurred while trying to start crawl job. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

 def test_should_return_error_for_blocklisted_url():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
     blocklisted_url = "https://twitter.com/fake-test"
     with pytest.raises(Exception) as excinfo:
         app.crawl_url(blocklisted_url)
-    assert "Unexpected error occurred while trying to start crawl job. Status code: 403" in str(excinfo.value)
+    assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)

 def test_crawl_url_wait_for_completion_e2e():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -114,7 +114,7 @@ def test_crawl_url_with_idempotency_key_e2e():

     with pytest.raises(Exception) as excinfo:
         app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert "Failed to start crawl job. Status code: 409. Error: Idempotency key already used" in str(excinfo.value)
+    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)

 def test_check_crawl_status_e2e():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -141,7 +141,7 @@ def test_search_invalid_api_key():
     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
     with pytest.raises(Exception) as excinfo:
         invalid_app.search("test query")
-    assert "Failed to search. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

 def test_llm_extraction():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -9,13 +9,14 @@ and handles retries for certain HTTP status codes.
 Classes:
     - FirecrawlApp: Main class for interacting with the Firecrawl API.
 """
+import logging
 import os
 import time
 from typing import Any, Dict, Optional

 import requests

+logger : logging.Logger = logging.getLogger("firecrawl")

 class FirecrawlApp:
     """
@@ -28,8 +29,15 @@ class FirecrawlApp:
     def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
         if self.api_key is None:
+            logger.warning("No API key provided")
             raise ValueError('No API key provided')
+        else:
+            logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)

         self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
+        if self.api_url != 'https://api.firecrawl.dev':
+            logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)

     def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
         """
         Scrape the specified URL using the Firecrawl API.
@@ -45,10 +53,8 @@ class FirecrawlApp:
             Exception: If the scrape request fails.
         """

-        headers = {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+        headers = self._prepare_headers()
         # Prepare the base scrape parameters with the URL
         scrape_params = {'url': url}

@@ -81,13 +87,10 @@ class FirecrawlApp:
                 return response['data']
             else:
                 raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
-        elif response.status_code in [402, 408, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
         else:
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
+            self._handle_error(response, 'scrape URL')

-    def search(self, query, params=None):
+    def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
         """
         Perform a search using the Firecrawl API.

@@ -101,10 +104,7 @@ class FirecrawlApp:
         Raises:
             Exception: If the search request fails.
         """
-        headers = {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+        headers = self._prepare_headers()
         json_data = {'query': query}
         if params:
             json_data.update(params)
@@ -121,13 +121,14 @@ class FirecrawlApp:
             else:
                 raise Exception(f'Failed to search. Error: {response["error"]}')

-        elif response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
         else:
-            raise Exception(f'Failed to search. Status code: {response.status_code}')
+            self._handle_error(response, 'search')

-    def crawl_url(self, url, params=None, wait_until_done=True, poll_interval=2, idempotency_key=None):
+    def crawl_url(self, url: str,
+                  params: Optional[Dict[str, Any]] = None,
+                  wait_until_done: bool = True,
+                  poll_interval: int = 2,
+                  idempotency_key: Optional[str] = None) -> Any:
         """
         Initiate a crawl job for the specified URL using the Firecrawl API.

@@ -158,7 +159,7 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'start crawl job')

-    def check_crawl_status(self, job_id):
+    def check_crawl_status(self, job_id: str) -> Any:
         """
         Check the status of a crawl job using the Firecrawl API.

@@ -178,7 +179,7 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'check crawl status')

-    def _prepare_headers(self, idempotency_key=None):
+    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
         """
         Prepare the headers for API requests.

@@ -200,7 +201,11 @@ class FirecrawlApp:
             'Authorization': f'Bearer {self.api_key}',
         }

-    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+    def _post_request(self, url: str,
+                      data: Dict[str, Any],
+                      headers: Dict[str, str],
+                      retries: int = 3,
+                      backoff_factor: float = 0.5) -> requests.Response:
         """
         Make a POST request with retries.

@@ -225,7 +230,10 @@ class FirecrawlApp:
                 return response
         return response

-    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+    def _get_request(self, url: str,
+                     headers: Dict[str, str],
+                     retries: int = 3,
+                     backoff_factor: float = 0.5) -> requests.Response:
         """
         Make a GET request with retries.

@@ -249,7 +257,7 @@ class FirecrawlApp:
                 return response
         return response

-    def _monitor_job_status(self, job_id, headers, poll_interval):
+    def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any:
         """
         Monitor the status of a crawl job until completion.

@@ -281,7 +289,7 @@ class FirecrawlApp:
         else:
             self._handle_error(status_response, 'check crawl status')

-    def _handle_error(self, response, action):
+    def _handle_error(self, response: requests.Response, action: str) -> None:
         """
         Handle errors from API responses.

@@ -292,8 +300,19 @@ class FirecrawlApp:
         Raises:
             Exception: An exception with a message containing the status code and error details from the response.
         """
-        if response.status_code in [402, 408, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
+        error_message = response.json().get('error', 'No additional error details provided.')
+
+        if response.status_code == 402:
+            message = f"Payment Required: Failed to {action}. {error_message}"
+        elif response.status_code == 408:
+            message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}"
+        elif response.status_code == 409:
+            message = f"Conflict: Failed to {action} due to a conflict. {error_message}"
+        elif response.status_code == 500:
+            message = f"Internal Server Error: Failed to {action}. {error_message}"
         else:
-            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
+            message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}"
+
+        # Raise an HTTPError with the custom message and attach the response
+        raise requests.exceptions.HTTPError(message, response=response)