diff --git a/apps/api/openapi.json b/apps/api/openapi.json
index a755e37..17b3677 100644
--- a/apps/api/openapi.json
+++ b/apps/api/openapi.json
@@ -61,6 +61,13 @@
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
},
+ "removeTags": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+ },
"headers": {
"type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
@@ -194,6 +201,11 @@
"type": "integer",
"description": "Maximum number of pages to crawl",
"default": 10000
+ },
+ "allowBackwardCrawling": {
+ "type": "boolean",
+ "description": "Allow backward crawling (crawl from the base URL to the previous URLs)",
+ "default": false
}
}
},
@@ -219,6 +231,13 @@
"type": "object",
"description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
},
+ "removeTags": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+ },
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
@@ -492,7 +511,7 @@
"html": {
"type": "string",
"nullable": true,
- "description": "Raw HTML content of the page if `includeHtml` is true"
+ "description": "Raw HTML content of the page if `includeHtml` is true"
},
"metadata": {
"type": "object",
@@ -507,9 +526,126 @@
"type": "string",
"nullable": true
},
+ "keywords": {
+ "type": "string",
+ "nullable": true
+ },
+ "robots": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogTitle": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogDescription": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogUrl": {
+ "type": "string",
+ "format": "uri",
+ "nullable": true
+ },
+ "ogImage": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogAudio": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogDeterminer": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogLocale": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogLocaleAlternate": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "nullable": true
+ },
+ "ogSiteName": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogVideo": {
+ "type": "string",
+ "nullable": true
+ },
+ "dctermsCreated": {
+ "type": "string",
+ "nullable": true
+ },
+ "dcDateCreated": {
+ "type": "string",
+ "nullable": true
+ },
+ "dcDate": {
+ "type": "string",
+ "nullable": true
+ },
+ "dctermsType": {
+ "type": "string",
+ "nullable": true
+ },
+ "dcType": {
+ "type": "string",
+ "nullable": true
+ },
+ "dctermsAudience": {
+ "type": "string",
+ "nullable": true
+ },
+ "dctermsSubject": {
+ "type": "string",
+ "nullable": true
+ },
+ "dcSubject": {
+ "type": "string",
+ "nullable": true
+ },
+ "dcDescription": {
+ "type": "string",
+ "nullable": true
+ },
+ "dctermsKeywords": {
+ "type": "string",
+ "nullable": true
+ },
+ "modifiedTime": {
+ "type": "string",
+ "nullable": true
+ },
+ "publishedTime": {
+ "type": "string",
+ "nullable": true
+ },
+ "articleTag": {
+ "type": "string",
+ "nullable": true
+ },
+ "articleSection": {
+ "type": "string",
+ "nullable": true
+ },
"sourceURL": {
"type": "string",
"format": "uri"
+ },
+ "pageStatusCode": {
+ "type": "integer",
+ "description": "The status code of the page"
+ },
+ "pageError": {
+ "type": "string",
+ "nullable": true,
+ "description": "The error message of the page"
}
}
},
@@ -558,9 +694,126 @@
"type": "string",
"nullable": true
},
+ "keywords": {
+ "type": "string",
+ "nullable": true
+ },
+ "robots": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogTitle": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogDescription": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogUrl": {
+ "type": "string",
+ "format": "uri",
+ "nullable": true
+ },
+ "ogImage": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogAudio": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogDeterminer": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogLocale": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogLocaleAlternate": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "nullable": true
+ },
+ "ogSiteName": {
+ "type": "string",
+ "nullable": true
+ },
+ "ogVideo": {
+ "type": "string",
+ "nullable": true
+ },
+ "dctermsCreated": {
+ "type": "string",
+ "nullable": true
+ },
+ "dcDateCreated": {
+ "type": "string",
+ "nullable": true
+ },
+ "dcDate": {
+ "type": "string",
+ "nullable": true
+ },
+ "dctermsType": {
+ "type": "string",
+ "nullable": true
+ },
+ "dcType": {
+ "type": "string",
+ "nullable": true
+ },
+ "dctermsAudience": {
+ "type": "string",
+ "nullable": true
+ },
+ "dctermsSubject": {
+ "type": "string",
+ "nullable": true
+ },
+ "dcSubject": {
+ "type": "string",
+ "nullable": true
+ },
+ "dcDescription": {
+ "type": "string",
+ "nullable": true
+ },
+ "dctermsKeywords": {
+ "type": "string",
+ "nullable": true
+ },
+ "modifiedTime": {
+ "type": "string",
+ "nullable": true
+ },
+ "publishedTime": {
+ "type": "string",
+ "nullable": true
+ },
+ "articleTag": {
+ "type": "string",
+ "nullable": true
+ },
+ "articleSection": {
+ "type": "string",
+ "nullable": true
+ },
"sourceURL": {
"type": "string",
"format": "uri"
+ },
+ "pageStatusCode": {
+ "type": "integer",
+ "description": "The status code of the page"
+ },
+ "pageError": {
+ "type": "string",
+ "nullable": true,
+ "description": "The error message of the page"
}
}
}
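For reference, a minimal TypeScript sketch of a client call that exercises the new removeTags page option and reads the new pageStatusCode/pageError metadata fields documented above. It assumes the hosted endpoint at https://api.firecrawl.dev, Node 18+ (global fetch), and a FIRECRAWL_API_KEY environment variable; the function name is illustrative and not part of this patch.

async function scrapeWithRemoveTags(url: string) {
  // POST /v0/scrape with selectors to strip before markdown conversion
  const res = await fetch("https://api.firecrawl.dev/v0/scrape", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    },
    body: JSON.stringify({
      url,
      pageOptions: { removeTags: [".nav", "#footer", "script"] },
    }),
  });
  const { data } = await res.json();
  // pageStatusCode and pageError are the metadata fields added by this change
  console.log(data.metadata.pageStatusCode, data.metadata.pageError);
  return data;
}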
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index a225a80..adce022 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -83,6 +83,8 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data).not.toHaveProperty("html");
expect(response.body.data.content).toContain("_Roast_");
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
+ expect(response.body.data.metadata.pageError).toBeUndefined();
}, 30000); // 30 seconds timeout
it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => {
@@ -103,6 +105,8 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data.content).toContain("_Roast_");
expect(response.body.data.markdown).toContain("_Roast_");
expect(response.body.data.html).toContain("<h1");
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
+ expect(response.body.data.metadata.pageError).toBeUndefined();
}, 30000); // 30 seconds timeout
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
@@ -118,6 +122,8 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
+ expect(response.body.data.metadata.pageError).toBeUndefined();
}, 60000); // 60 seconds
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
@@ -133,8 +139,59 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
+ expect(response.body.data.metadata.pageError).toBeUndefined();
}, 60000); // 60 seconds
+ it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+ const response = await request(TEST_URL)
+ .post('/v0/scrape')
+ .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+ .set('Content-Type', 'application/json')
+ .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+ await new Promise((r) => setTimeout(r, 6000));
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty('data');
+ expect(response.body.data).toHaveProperty('content');
+ expect(response.body.data).toHaveProperty('metadata');
+ expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
+ }, 60000); // 60 seconds
+
+ it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
+ const responseWithoutRemoveTags = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://www.scrapethissite.com/" });
+ expect(responseWithoutRemoveTags.statusCode).toBe(200);
+ expect(responseWithoutRemoveTags.body).toHaveProperty("data");
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
+ expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
+ expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
+ expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
+ expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
+ expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
+
+ const response = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ expect(response.body.data).toHaveProperty("content");
+ expect(response.body.data).toHaveProperty("markdown");
+ expect(response.body.data).toHaveProperty("metadata");
+ expect(response.body.data).not.toHaveProperty("html");
+ expect(response.body.data.content).toContain("Scrape This Site");
+ expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
+ expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
+ expect(response.body.data.content).not.toContain("web scraping"); // strong
+ }, 30000); // 30 seconds timeout
+
// TODO: add this test back once we nail the waitFor option to be more deterministic
// it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
// const startTime = Date.now();
@@ -155,6 +212,102 @@ describe("E2E Tests for API Routes", () => {
// expect(response.body.data.content).toContain("🔥 Firecrawl");
// expect(duration).toBeGreaterThanOrEqual(7000);
// }, 12000); // 12 seconds timeout
+
+ it.concurrent('should return a successful response for a scrape with 400 page', async () => {
+ const response = await request(TEST_URL)
+ .post('/v0/scrape')
+ .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+ .set('Content-Type', 'application/json')
+ .send({ url: 'https://httpstat.us/400' });
+ await new Promise((r) => setTimeout(r, 5000));
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty('data');
+ expect(response.body.data).toHaveProperty('content');
+ expect(response.body.data).toHaveProperty('metadata');
+ expect(response.body.data.metadata.pageStatusCode).toBe(400);
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request");
+ }, 60000); // 60 seconds
+
+ it.concurrent('should return a successful response for a scrape with 401 page', async () => {
+ const response = await request(TEST_URL)
+ .post('/v0/scrape')
+ .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+ .set('Content-Type', 'application/json')
+ .send({ url: 'https://httpstat.us/401' });
+ await new Promise((r) => setTimeout(r, 5000));
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty('data');
+ expect(response.body.data).toHaveProperty('content');
+ expect(response.body.data).toHaveProperty('metadata');
+ expect(response.body.data.metadata.pageStatusCode).toBe(401);
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized");
+ }, 60000); // 60 seconds
+
+ it.concurrent("should return a successful response for a scrape with 403 page", async () => {
+ const response = await request(TEST_URL)
+ .post('/v0/scrape')
+ .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+ .set('Content-Type', 'application/json')
+ .send({ url: 'https://httpstat.us/403' });
+
+ await new Promise((r) => setTimeout(r, 5000));
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty('data');
+ expect(response.body.data).toHaveProperty('content');
+ expect(response.body.data).toHaveProperty('metadata');
+ expect(response.body.data.metadata.pageStatusCode).toBe(403);
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden");
+ }, 60000); // 60 seconds
+
+ it.concurrent('should return a successful response for a scrape with 404 page', async () => {
+ const response = await request(TEST_URL)
+ .post('/v0/scrape')
+ .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+ .set('Content-Type', 'application/json')
+ .send({ url: 'https://httpstat.us/404' });
+ await new Promise((r) => setTimeout(r, 5000));
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty('data');
+ expect(response.body.data).toHaveProperty('content');
+ expect(response.body.data).toHaveProperty('metadata');
+ expect(response.body.data.metadata.pageStatusCode).toBe(404);
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found");
+ }, 60000); // 60 seconds
+
+ it.concurrent('should return a successful response for a scrape with 405 page', async () => {
+ const response = await request(TEST_URL)
+ .post('/v0/scrape')
+ .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+ .set('Content-Type', 'application/json')
+ .send({ url: 'https://httpstat.us/405' });
+ await new Promise((r) => setTimeout(r, 5000));
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty('data');
+ expect(response.body.data).toHaveProperty('content');
+ expect(response.body.data).toHaveProperty('metadata');
+ expect(response.body.data.metadata.pageStatusCode).toBe(405);
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain("method not allowed");
+ }, 60000); // 60 seconds
+
+ it.concurrent('should return a successful response for a scrape with 500 page', async () => {
+ const response = await request(TEST_URL)
+ .post('/v0/scrape')
+ .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+ .set('Content-Type', 'application/json')
+ .send({ url: 'https://httpstat.us/500' });
+ await new Promise((r) => setTimeout(r, 5000));
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty('data');
+ expect(response.body.data).toHaveProperty('content');
+ expect(response.body.data).toHaveProperty('metadata');
+ expect(response.body.data.metadata.pageStatusCode).toBe(500);
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain("internal server error");
+ }, 60000); // 60 seconds
});
describe("POST /v0/crawl", () => {
@@ -270,6 +423,8 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Mendable");
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+ expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
}, 60000); // 60 seconds
it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => {
@@ -351,6 +506,8 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Mendable");
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+ expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
}, 60000); // 60 seconds
it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
@@ -393,6 +550,8 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+ expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
@@ -651,6 +810,8 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+ expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
// 120 seconds
expect(completedResponse.body.data[0]).toHaveProperty("html");
@@ -658,7 +819,11 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0].content).toContain("_Roast_");
expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
expect(completedResponse.body.data[0].html).toContain("<h1");
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+ expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
@@ -792,6 +957,8 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Mendable");
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+ expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
const childrenLinks = completedResponse.body.data.filter(doc =>
doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
@@ -835,8 +1002,13 @@ describe("E2E Tests for API Routes", () => {
})
])
);
+
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+ expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
}, 180000); // 120 seconds
+
it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
const crawlResponse = await request(TEST_URL)
.post("/v0/crawl")
@@ -870,6 +1042,9 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+ expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
+
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
@@ -930,6 +1105,8 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0].content).toContain("_Roast_");
expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
expect(completedResponse.body.data[0].html).toContain("<h1");
expect(completedResponse.body.data[0]).toHaveProperty("html");
expect(completedResponse.body.data[0].content).toContain("Mendable");
expect(completedResponse.body.data[0].markdown).toContain("Mendable");
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+ expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
const onlyChildrenLinks = completedResponse.body.data.filter(doc => {
return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
@@ -1013,7 +1192,8 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata");
-
+ expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200);
+ expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined();
}, 60000); // 60 seconds
describe("POST /v0/scrape with LLM Extraction", () => {
@@ -1168,6 +1348,10 @@ describe("E2E Tests for API Routes", () => {
expect(statusResponse.body).toHaveProperty("data");
expect(statusResponse.body.data[0]).toHaveProperty("content");
expect(statusResponse.body.data[0]).toHaveProperty("markdown");
+ expect(statusResponse.body.data[0]).toHaveProperty("metadata");
+ expect(statusResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+ expect(statusResponse.body.data[0].metadata.pageError).toBeUndefined();
+
const results = statusResponse.body.data;
// results.forEach((result, i) => {
// console.log(result.metadata.sourceURL);
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 58d01e2..8fd876d 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -55,8 +55,16 @@ export async function crawlController(req: Request, res: Response) {
}
const mode = req.body.mode ?? "crawl";
- const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
- const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+
+ const crawlerOptions = req.body.crawlerOptions ?? {
+ allowBackwardCrawling: false
+ };
+ const pageOptions = req.body.pageOptions ?? {
+ onlyMainContent: false,
+ includeHtml: false,
+ removeTags: [],
+ parsePDF: true
+ };
if (mode === "single_urls" && !url.includes(",")) {
try {
diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts
index d3e9afe..2c3dc4e 100644
--- a/apps/api/src/controllers/crawlPreview.ts
+++ b/apps/api/src/controllers/crawlPreview.ts
@@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? {};
- const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+ const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
const job = await addWebScraperJob({
url: url,
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index d5ab1de..1537c07 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -61,7 +61,7 @@ export async function scrapeHelper(
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
);
if (filteredDocs.length === 0) {
- return { success: true, error: "No page found", returnCode: 200 };
+ return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
}
let creditsToBeBilled = filteredDocs.length;
@@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(status).json({ error });
}
const crawlerOptions = req.body.crawlerOptions ?? {};
- const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
+ const pageOptions = req.body.pageOptions ?? {
+ onlyMainContent: false,
+ includeHtml: false,
+ waitFor: 0,
+ screenshot: false,
+ parsePDF: true
+ };
const extractorOptions = req.body.extractorOptions ?? {
mode: "markdown"
}
diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts
index 7474aae..b555197 100644
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@@ -85,6 +85,7 @@ export async function searchHelper(
onlyMainContent: pageOptions?.onlyMainContent ?? true,
fetchPageContent: pageOptions?.fetchPageContent ?? true,
includeHtml: pageOptions?.includeHtml ?? false,
+ removeTags: pageOptions?.removeTags ?? [],
fallback: false,
},
});
@@ -100,7 +101,7 @@ export async function searchHelper(
);
if (filteredDocs.length === 0) {
- return { success: true, error: "No page found", returnCode: 200 };
+ return { success: true, error: "No page found", returnCode: 200, data: docs };
}
const billingResult = await billTeam(
@@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) {
includeHtml: false,
onlyMainContent: true,
fetchPageContent: true,
+ removeTags: [],
fallback: false,
};
const origin = req.body.origin ?? "api";
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 81bf12c..12d8c36 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -19,6 +19,8 @@ export type PageOptions = {
screenshot?: boolean;
headers?: Record<string, string>;
replaceAllPathsWithAbsolutePaths?: boolean;
+ parsePDF?: boolean;
+ removeTags?: string | string[];
};
export type ExtractorOptions = {
@@ -119,4 +121,7 @@ export class SearchResult {
export interface FireEngineResponse {
html: string;
screenshot: string;
-}
\ No newline at end of file
+ pageStatusCode?: number;
+ pageError?: string;
+}
+
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 33a643b..ba5e003 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -223,7 +223,7 @@ export class WebCrawler {
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
}
- async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
+ async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
const normalizedUrl = this.normalizeCrawlUrl(url);
if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
return [];
@@ -243,20 +243,27 @@ export class WebCrawler {
try {
let content: string = "";
+ let pageStatusCode: number;
+ let pageError: string | undefined = undefined;
+
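+ // Record the HTTP status of this fetch so it can be attached to every link discovered on the page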
// If it is the first link, fetch with single url
if (this.visited.size === 1) {
const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
content = page.html ?? "";
+ pageStatusCode = page.metadata?.pageStatusCode;
+ pageError = page.metadata?.pageError || undefined;
} else {
const response = await axios.get(url);
content = response.data ?? "";
+ pageStatusCode = response.status;
+ pageError = response.statusText != "OK" ? response.statusText : undefined;
}
const $ = load(content);
- let links: { url: string, html: string }[] = [];
+ let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];
// Add the initial URL to the list of links
if (this.visited.size === 1) {
- links.push({ url, html: content });
+ links.push({ url, html: content, pageStatusCode, pageError });
}
$("a").each((_, element) => {
@@ -278,7 +285,7 @@ export class WebCrawler {
!this.matchesExcludes(path) &&
this.robots.isAllowed(fullUrl, "FireCrawlAgent")
) {
- links.push({ url: fullUrl, html: content });
+ links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
}
}
});
diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
index 8108a9e..081150b 100644
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@@ -1,5 +1,3 @@
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-
export async function handleCustomScraping(
text: string,
url: string
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 250931b..030f795 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -245,7 +245,7 @@ export class WebScraperDataProvider {
content: "",
html: this.pageOptions?.includeHtml ? "" : undefined,
markdown: "",
- metadata: { sourceURL: url },
+ metadata: { sourceURL: url, pageStatusCode: 200 },
}));
}
@@ -284,10 +284,10 @@ export class WebScraperDataProvider {
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
return Promise.all(
pdfLinks.map(async (pdfLink) => {
- const pdfContent = await fetchAndProcessPdf(pdfLink);
+ const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
return {
- content: pdfContent,
- metadata: { sourceURL: pdfLink },
+ content: content,
+ metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
provider: "web-scraper",
};
})
@@ -296,10 +296,10 @@ export class WebScraperDataProvider {
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
return Promise.all(
docxLinks.map(async (p) => {
- const docXDocument = await fetchAndProcessDocx(p);
+ const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(p);
return {
- content: docXDocument,
- metadata: { sourceURL: p },
+ content,
+ metadata: { sourceURL: p, pageStatusCode, pageError },
provider: "web-scraper",
};
})
@@ -479,7 +479,13 @@ export class WebScraperDataProvider {
this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
- this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
+ this.pageOptions = options.pageOptions ?? {
+ onlyMainContent: false,
+ includeHtml: false,
+ replaceAllPathsWithAbsolutePaths: false,
+ parsePDF: true,
+ removeTags: []
+ };
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index c2dcea1..1ba2832 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
url: string,
waitFor: number = 0,
screenshot: boolean = false,
- pageOptions: { scrollXPaths?: string[] } = {},
+ pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
headers?: Record<string, string>,
options?: any
): Promise<FireEngineResponse> {
@@ -83,17 +83,18 @@ export async function scrapWithFireEngine(
console.error(
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
);
- return { html: "", screenshot: "" };
+ return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
}
const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) {
- return { html: await fetchAndProcessPdf(url), screenshot: "" };
+ const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+ return { html: content, screenshot: "", pageStatusCode, pageError };
} else {
const data = response.data;
const html = data.content;
const screenshot = data.screenshot;
- return { html: html ?? "", screenshot: screenshot ?? "" };
+ return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
}
} catch (error) {
if (error.code === 'ECONNABORTED') {
@@ -108,44 +109,51 @@ export async function scrapWithFireEngine(
export async function scrapWithScrapingBee(
url: string,
wait_browser: string = "domcontentloaded",
- timeout: number = universalTimeout
-): Promise<string> {
+ timeout: number = universalTimeout,
+ pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
try {
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
const clientParams = await generateRequestParams(
url,
wait_browser,
- timeout
+ timeout,
);
- const response = await client.get(clientParams);
-
- if (response.status !== 200 && response.status !== 404) {
- console.error(
- `[ScrapingBee] Error fetching url: ${url} with status code ${response.status}`
- );
- return "";
- }
+ const response = await client.get({
+ ...clientParams,
+ params: {
+ ...clientParams.params,
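+ // transparent_status_code makes ScrapingBee pass through the target page's own status code instead of masking non-2xx responses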
+ 'transparent_status_code': 'True'
+ }
+ });
const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) {
- return fetchAndProcessPdf(url);
+ return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+
} else {
- const decoder = new TextDecoder();
- const text = decoder.decode(response.data);
- return text;
+ let text = "";
+ try {
+ const decoder = new TextDecoder();
+ text = decoder.decode(response.data);
+ } catch (decodeError) {
+ console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`);
+ }
+ return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined };
}
} catch (error) {
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
- return "";
+ return { content: "", pageStatusCode: error.response.status, pageError: error.response.statusText };
}
}
export async function scrapWithPlaywright(
url: string,
waitFor: number = 0,
- headers?: Record<string, string>
-): Promise<string> {
+ headers?: Record<string, string>,
+ pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
try {
const reqParams = await generateRequestParams(url);
// If the user has passed a wait parameter in the request, use that
@@ -167,21 +175,21 @@ export async function scrapWithPlaywright(
console.error(
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
);
- return "";
+ return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
}
const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) {
- return fetchAndProcessPdf(url);
+ return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
} else {
const textData = response.data;
try {
const data = JSON.parse(textData);
const html = data.content;
- return html ?? "";
+ return { content: html ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
} catch (jsonError) {
console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`);
- return "";
+ return { content: "" };
}
}
} catch (error) {
@@ -190,11 +198,14 @@ export async function scrapWithPlaywright(
} else {
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
}
- return "";
+ return { content: "" };
}
}
-export async function scrapWithFetch(url: string): Promise<string> {
+export async function scrapWithFetch(
+ url: string,
+ pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
try {
const response = await axios.get(url, {
headers: {
@@ -208,15 +219,15 @@ export async function scrapWithFetch(url: string): Promise<string> {
console.error(
`[Axios] Error fetching url: ${url} with status: ${response.status}`
);
- return "";
+ return { content: "", pageStatusCode: response.status, pageError: response.statusText };
}
const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) {
- return fetchAndProcessPdf(url);
+ return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
} else {
const text = response.data;
- return text;
+ return { content: text, pageStatusCode: 200 };
}
} catch (error) {
if (error.code === 'ECONNABORTED') {
@@ -224,7 +235,7 @@ export async function scrapWithFetch(url: string): Promise<string> {
} else {
console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
}
- return "";
+ return { content: "" };
}
}
@@ -304,6 +315,19 @@ export async function scrapSingleUrl(
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
const soup = cheerio.load(html);
soup("script, style, iframe, noscript, meta, head").remove();
+
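+ // removeTags may be a comma-separated selector string or an array of selectors (tags, .classes, #ids)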
+ if (pageOptions.removeTags) {
+ if (typeof pageOptions.removeTags === 'string') {
+ pageOptions.removeTags.split(',').forEach((tag) => {
+ soup(tag.trim()).remove();
+ });
+ } else if (Array.isArray(pageOptions.removeTags)) {
+ pageOptions.removeTags.forEach((tag) => {
+ soup(tag).remove();
+ });
+ }
+ }
+
if (pageOptions.onlyMainContent) {
// remove any other tags that are not in the main content
excludeNonMainTags.forEach((tag) => {
@@ -317,7 +341,7 @@ export async function scrapSingleUrl(
url: string,
method: (typeof baseScrapers)[number]
) => {
- let text = "";
+ let scraperResponse: { text: string, screenshot: string, metadata: { pageStatusCode?: number, pageError?: string | null } } = { text: "", screenshot: "", metadata: {} };
let screenshot = "";
switch (method) {
case "fire-engine":
@@ -329,38 +353,52 @@ export async function scrapSingleUrl(
pageOptions.screenshot,
pageOptions.headers
);
- text = response.html;
- screenshot = response.screenshot;
+ scraperResponse.text = response.html;
+ scraperResponse.screenshot = response.screenshot;
+ scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+ scraperResponse.metadata.pageError = response.pageError;
}
break;
case "scrapingBee":
if (process.env.SCRAPING_BEE_API_KEY) {
- text = await scrapWithScrapingBee(
+ const response = await scrapWithScrapingBee(
url,
"domcontentloaded",
pageOptions.fallback === false ? 7000 : 15000
);
+ scraperResponse.text = response.content;
+ scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+ scraperResponse.metadata.pageError = response.pageError;
}
break;
case "playwright":
if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
- text = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
+ const response = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
+ scraperResponse.text = response.content;
+ scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+ scraperResponse.metadata.pageError = response.pageError;
}
break;
case "scrapingBeeLoad":
if (process.env.SCRAPING_BEE_API_KEY) {
- text = await scrapWithScrapingBee(url, "networkidle2");
+ const response = await scrapWithScrapingBee(url, "networkidle2");
+ scraperResponse.text = response.content;
+ scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+ scraperResponse.metadata.pageError = response.pageError;
}
break;
case "fetch":
- text = await scrapWithFetch(url);
+ const response = await scrapWithFetch(url);
+ scraperResponse.text = response.content;
+ scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+ scraperResponse.metadata.pageError = response.pageError;
break;
}
let customScrapedContent : FireEngineResponse | null = null;
// Check for custom scraping conditions
- const customScraperResult = await handleCustomScraping(text, url);
+ const customScraperResult = await handleCustomScraping(scraperResponse.text, url);
if (customScraperResult){
switch (customScraperResult.scraper) {
@@ -371,23 +409,30 @@ export async function scrapSingleUrl(
}
break;
case "pdf":
- customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
+ const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF);
+ customScrapedContent = { html: content, screenshot, pageStatusCode, pageError }
break;
}
}
if (customScrapedContent) {
- text = customScrapedContent.html;
+ scraperResponse.text = customScrapedContent.html;
screenshot = customScrapedContent.screenshot;
}
//* TODO: add an optional to return markdown or structured/extracted content
- let cleanedHtml = removeUnwantedElements(text, pageOptions);
+ let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
- return [await parseMarkdown(cleanedHtml), text, screenshot];
+ return {
+ text: await parseMarkdown(cleanedHtml),
+ html: scraperResponse.text,
+ screenshot: scraperResponse.screenshot,
+ pageStatusCode: scraperResponse.metadata.pageStatusCode,
+ pageError: scraperResponse.metadata.pageError || undefined
+ };
};
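+ // Defaults for the final document; overwritten by whichever scraper attempt returns usable data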
+ let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
try {
- let [text, html, screenshot] = ["", "", ""];
let urlKey = urlToScrap;
try {
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
@@ -410,8 +455,21 @@ export async function scrapSingleUrl(
html = existingHtml;
break;
}
- [text, html, screenshot] = await attemptScraping(urlToScrap, scraper);
+
+ const attempt = await attemptScraping(urlToScrap, scraper);
+ text = attempt.text ?? '';
+ html = attempt.html ?? '';
+ screenshot = attempt.screenshot ?? '';
+ if (attempt.pageStatusCode) {
+ pageStatusCode = attempt.pageStatusCode;
+ }
+ if (attempt.pageError) {
+ pageError = attempt.pageError;
+ }
+
+
if (text && text.trim().length >= 100) break;
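+ // Do not fall through to the remaining scrapers once the page is a definitive 404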
+ if (pageStatusCode && pageStatusCode == 404) break;
const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
if (nextScraperIndex < scrapersInOrder.length) {
console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
@@ -435,6 +493,8 @@ export async function scrapSingleUrl(
...metadata,
screenshot: screenshot,
sourceURL: urlToScrap,
+ pageStatusCode: pageStatusCode,
+ pageError: pageError
},
};
} else {
@@ -442,7 +502,12 @@ export async function scrapSingleUrl(
content: text,
markdown: text,
html: pageOptions.includeHtml ? html : undefined,
- metadata: { ...metadata, sourceURL: urlToScrap },
+ metadata: {
+ ...metadata,
+ sourceURL: urlToScrap,
+ pageStatusCode: pageStatusCode,
+ pageError: pageError
+ },
};
}
@@ -453,7 +518,11 @@ export async function scrapSingleUrl(
content: "",
markdown: "",
html: "",
- metadata: { sourceURL: urlToScrap },
+ metadata: {
+ sourceURL: urlToScrap,
+ pageStatusCode: pageStatusCode,
+ pageError: pageError
+ },
} as Document;
}
}
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts
index e018ffa..53237ef 100644
--- a/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts
@@ -3,11 +3,13 @@ import * as docxProcessor from "../docxProcessor";
describe("DOCX Processing Module - Integration Test", () => {
it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
delete process.env.LLAMAPARSE_API_KEY;
- const docxContent = await docxProcessor.fetchAndProcessDocx(
+ const { content, pageStatusCode, pageError } = await docxProcessor.fetchAndProcessDocx(
"https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
);
- expect(docxContent.trim()).toContain(
+ expect(content.trim()).toContain(
"SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
);
+ expect(pageStatusCode).toBe(200);
+ expect(pageError).toBeUndefined();
});
});
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
index f14c8d4..55930f2 100644
--- a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
@@ -3,8 +3,10 @@ import * as pdfProcessor from '../pdfProcessor';
describe('PDF Processing Module - Integration Test', () => {
it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
delete process.env.LLAMAPARSE_API_KEY;
- const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
- expect(pdfContent.trim()).toEqual("Dummy PDF file");
+ const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
+ expect(content.trim()).toEqual("Dummy PDF file");
+ expect(pageStatusCode).toEqual(200);
+ expect(pageError).toBeUndefined();
});
// We're hitting the LLAMAPARSE rate limit 🫠
diff --git a/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
index 38759f8..a01b8a2 100644
--- a/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
@@ -5,14 +5,14 @@ import path from "path";
import os from "os";
import mammoth from "mammoth";
-export async function fetchAndProcessDocx(url: string): Promise<string> {
- const tempFilePath = await downloadDocx(url);
+export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> {
+ const { tempFilePath, pageStatusCode, pageError } = await downloadDocx(url);
const content = await processDocxToText(tempFilePath);
fs.unlinkSync(tempFilePath); // Clean up the temporary file
- return content;
+ return { content, pageStatusCode, pageError };
}
-async function downloadDocx(url: string): Promise<string> {
+async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
const response = await axios({
url,
method: "GET",
@@ -25,7 +25,7 @@ async function downloadDocx(url: string): Promise<string> {
response.data.pipe(writer);
return new Promise((resolve, reject) => {
- writer.on("finish", () => resolve(tempFilePath));
+ writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
writer.on("error", reject);
});
}
diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts
index ddaf1e8..3f2052c 100644
--- a/apps/api/src/scraper/WebScraper/utils/metadata.ts
+++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts
@@ -29,6 +29,9 @@ interface Metadata {
publishedTime?: string;
articleTag?: string;
articleSection?: string;
+ sourceURL?: string;
+ pageStatusCode?: number;
+ pageError?: string;
}
export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
@@ -61,6 +64,9 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
let publishedTime: string | null = null;
let articleTag: string | null = null;
let articleSection: string | null = null;
+ let sourceURL: string | null = null;
+ let pageStatusCode: number | null = null;
+ let pageError: string | null = null;
try {
title = soup("title").text() || null;
@@ -132,5 +138,8 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
...(publishedTime ? { publishedTime } : {}),
...(articleTag ? { articleTag } : {}),
...(articleSection ? { articleSection } : {}),
+ ...(sourceURL ? { sourceURL } : {}),
+ ...(pageStatusCode ? { pageStatusCode } : {}),
+ ...(pageError ? { pageError } : {}),
};
}
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index 71984f2..1a67d60 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -9,14 +9,14 @@ import os from "os";
dotenv.config();
-export async function fetchAndProcessPdf(url: string): Promise<string> {
- const tempFilePath = await downloadPdf(url);
- const content = await processPdfToText(tempFilePath);
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+ const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
+ const content = await processPdfToText(tempFilePath, parsePDF);
fs.unlinkSync(tempFilePath); // Clean up the temporary file
- return content;
+ return { content, pageStatusCode, pageError };
}
-async function downloadPdf(url: string): Promise<string> {
+async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> {
const response = await axios({
url,
method: "GET",
@@ -29,15 +29,15 @@ async function downloadPdf(url: string): Promise<string> {
response.data.pipe(writer);
return new Promise((resolve, reject) => {
- writer.on("finish", () => resolve(tempFilePath));
+ writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
writer.on("error", reject);
});
}
-export async function processPdfToText(filePath: string): Promise<string> {
+export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
let content = "";
- if (process.env.LLAMAPARSE_API_KEY) {
+ if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
const apiKey = process.env.LLAMAPARSE_API_KEY;
const headers = {
Authorization: `Bearer ${apiKey}`,
@@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
console.error("Error processing pdf document w/ LlamaIndex(2)");
content = await processPdf(filePath);
}
- } else {
+ } else if (parsePDF) {
content = await processPdf(filePath);
+ } else {
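+ // parsePDF is false: return the raw file contents without running PDF text extraction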
+ content = fs.readFileSync(filePath, "utf-8");
}
return content;
}
diff --git a/apps/playwright-service/get_error.py b/apps/playwright-service/get_error.py
new file mode 100644
index 0000000..a33de5e
--- /dev/null
+++ b/apps/playwright-service/get_error.py
@@ -0,0 +1,63 @@
+def get_error(status_code: int) -> str:
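+ """Return a human-readable message for HTTP status codes >= 300; returns None for success codes."""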
+ error_messages = {
+ 300: "Multiple Choices",
+ 301: "Moved Permanently",
+ 302: "Found",
+ 303: "See Other",
+ 304: "Not Modified",
+ 305: "Use Proxy",
+ 307: "Temporary Redirect",
+ 308: "Permanent Redirect",
+ 309: "Resume Incomplete",
+ 310: "Too Many Redirects",
+ 311: "Unavailable For Legal Reasons",
+ 312: "Previously Used",
+ 313: "I'm Used",
+ 314: "Switch Proxy",
+ 315: "Temporary Redirect",
+ 316: "Resume Incomplete",
+ 317: "Too Many Redirects",
+ 400: "Bad Request",
+ 401: "Unauthorized",
+ 403: "Forbidden",
+ 404: "Not Found",
+ 405: "Method Not Allowed",
+ 406: "Not Acceptable",
+ 407: "Proxy Authentication Required",
+ 408: "Request Timeout",
+ 409: "Conflict",
+ 410: "Gone",
+ 411: "Length Required",
+ 412: "Precondition Failed",
+ 413: "Payload Too Large",
+ 414: "URI Too Long",
+ 415: "Unsupported Media Type",
+ 416: "Range Not Satisfiable",
+ 417: "Expectation Failed",
+ 418: "I'm a teapot",
+ 421: "Misdirected Request",
+ 422: "Unprocessable Entity",
+ 423: "Locked",
+ 424: "Failed Dependency",
+ 425: "Too Early",
+ 426: "Upgrade Required",
+ 428: "Precondition Required",
+ 429: "Too Many Requests",
+ 431: "Request Header Fields Too Large",
+ 451: "Unavailable For Legal Reasons",
+ 500: "Internal Server Error",
+ 501: "Not Implemented",
+ 502: "Bad Gateway",
+ 503: "Service Unavailable",
+ 504: "Gateway Timeout",
+ 505: "HTTP Version Not Supported",
+ 506: "Variant Also Negotiates",
+ 507: "Insufficient Storage",
+ 508: "Loop Detected",
+ 510: "Not Extended",
+ 511: "Network Authentication Required",
+ 599: "Network Connect Timeout Error"
+ }
+ if status_code < 300:
+ return None
+ return error_messages.get(status_code, "Unknown Error")
diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py
index 8ef7418..bd6b14e 100644
--- a/apps/playwright-service/main.py
+++ b/apps/playwright-service/main.py
@@ -9,6 +9,7 @@ from fastapi import FastAPI
from fastapi.responses import JSONResponse
from playwright.async_api import Browser, async_playwright
from pydantic import BaseModel
+from get_error import get_error
PROXY_SERVER = environ.get("PROXY_SERVER", None)
PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
@@ -73,16 +74,22 @@ async def root(body: UrlModel):
if body.headers:
await page.set_extra_http_headers(body.headers)
- await page.goto(
+ response = await page.goto(
body.url,
wait_until="load",
timeout=body.timeout,
)
+ page_status_code = response.status
+ page_error = get_error(page_status_code)
# Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases were "load" / "networkidle" is not enough
if body.wait_after_load > 0:
await page.wait_for_timeout(body.wait_after_load)
page_content = await page.content()
await context.close()
- json_compatible_item_data = {"content": page_content}
- return JSONResponse(content=json_compatible_item_data)
+ json_compatible_item_data = {
+ "content": page_content,
+ "pageStatusCode": page_status_code,
+ "pageError": page_error
+ }
+ return JSONResponse(content=json_compatible_item_data)
\ No newline at end of file
diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index ecb017f..fbb2bdb 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -1,3 +1,57 @@
+"""
+This is the Firecrawl package.
+
+This package provides a Python SDK for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs.
+
+For more information visit https://github.com/firecrawl/
+"""
+
+import logging
+import os
+
from .firecrawl import FirecrawlApp
-__version__ = "0.0.14"
+__version__ = "0.0.16"
+
+# Define the logger for the Firecrawl project
+logger: logging.Logger = logging.getLogger("firecrawl")
+
+
+def _basic_config() -> None:
+ """Set up basic configuration for logging with a specific format and date format."""
+ try:
+ logging.basicConfig(
+ format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ )
+ except Exception as e:
+ logger.error("Failed to configure logging: %s", e)
+
+
+def setup_logging() -> None:
+ """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
+ env = os.environ.get(
+ "FIRECRAWL_LOGGING_LEVEL", "INFO"
+ ).upper() # Default to 'INFO' level
+ _basic_config()
+
+ if env == "DEBUG":
+ logger.setLevel(logging.DEBUG)
+ elif env == "INFO":
+ logger.setLevel(logging.INFO)
+ elif env == "WARNING":
+ logger.setLevel(logging.WARNING)
+ elif env == "ERROR":
+ logger.setLevel(logging.ERROR)
+ elif env == "CRITICAL":
+ logger.setLevel(logging.CRITICAL)
+ else:
+ logger.setLevel(logging.INFO)
+ logger.warning("Unknown logging level: %s, defaulting to INFO", env)
+
+
+# Initialize logging configuration when the module is imported
+setup_logging()
+logger.debug("Debugging logger setup")
diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc
new file mode 100644
index 0000000..5ba1f13
Binary files /dev/null and b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc differ
diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
index 90a6498..452d498 100644
--- a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
+++ b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
@@ -27,14 +27,14 @@ def test_scrape_url_invalid_api_key():
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
with pytest.raises(Exception) as excinfo:
invalid_app.scrape_url('https://firecrawl.dev')
- assert "Failed to scrape URL. Status code: 401" in str(excinfo.value)
+ assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
def test_blocklisted_url():
blocklisted_url = "https://facebook.com/fake-test"
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
with pytest.raises(Exception) as excinfo:
app.scrape_url(blocklisted_url)
- assert "Failed to scrape URL. Status code: 403" in str(excinfo.value)
+ assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
def test_successful_response_with_valid_preview_token():
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
@@ -86,14 +86,14 @@ def test_crawl_url_invalid_api_key():
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
with pytest.raises(Exception) as excinfo:
invalid_app.crawl_url('https://firecrawl.dev')
- assert "Unexpected error occurred while trying to start crawl job. Status code: 401" in str(excinfo.value)
+ assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
def test_should_return_error_for_blocklisted_url():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
blocklisted_url = "https://twitter.com/fake-test"
with pytest.raises(Exception) as excinfo:
app.crawl_url(blocklisted_url)
- assert "Unexpected error occurred while trying to start crawl job. Status code: 403" in str(excinfo.value)
+ assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
def test_crawl_url_wait_for_completion_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -114,7 +114,7 @@ def test_crawl_url_with_idempotency_key_e2e():
with pytest.raises(Exception) as excinfo:
app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
- assert "Failed to start crawl job. Status code: 409. Error: Idempotency key already used" in str(excinfo.value)
+ assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
def test_check_crawl_status_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -141,7 +141,7 @@ def test_search_invalid_api_key():
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
with pytest.raises(Exception) as excinfo:
invalid_app.search("test query")
- assert "Failed to search. Status code: 401" in str(excinfo.value)
+ assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
def test_llm_extraction():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index b9a823f..7ec0d33 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -9,13 +9,14 @@ and handles retries for certain HTTP status codes.
Classes:
- FirecrawlApp: Main class for interacting with the Firecrawl API.
"""
-
+import logging
import os
import time
from typing import Any, Dict, Optional
import requests
+logger: logging.Logger = logging.getLogger("firecrawl")
class FirecrawlApp:
"""
@@ -28,8 +29,15 @@ class FirecrawlApp:
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
if self.api_key is None:
+ logger.warning("No API key provided")
raise ValueError('No API key provided')
+ else:
+ logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)
+
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
+ if self.api_url != 'https://api.firecrawl.dev':
+ logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)
+
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
"""
Scrape the specified URL using the Firecrawl API.
@@ -45,10 +53,8 @@ class FirecrawlApp:
Exception: If the scrape request fails.
"""
- headers = {
- 'Content-Type': 'application/json',
- 'Authorization': f'Bearer {self.api_key}'
- }
+ headers = self._prepare_headers()
+
# Prepare the base scrape parameters with the URL
scrape_params = {'url': url}
@@ -81,13 +87,10 @@ class FirecrawlApp:
return response['data']
else:
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
- elif response.status_code in [402, 408, 409, 500]:
- error_message = response.json().get('error', 'Unknown error occurred')
- raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
else:
- raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
+ self._handle_error(response, 'scrape URL')
- def search(self, query, params=None):
+ def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
"""
Perform a search using the Firecrawl API.
@@ -101,10 +104,7 @@ class FirecrawlApp:
Raises:
Exception: If the search request fails.
"""
- headers = {
- 'Content-Type': 'application/json',
- 'Authorization': f'Bearer {self.api_key}'
- }
+ headers = self._prepare_headers()
json_data = {'query': query}
if params:
json_data.update(params)
@@ -121,13 +121,14 @@ class FirecrawlApp:
else:
raise Exception(f'Failed to search. Error: {response["error"]}')
- elif response.status_code in [402, 409, 500]:
- error_message = response.json().get('error', 'Unknown error occurred')
- raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
else:
- raise Exception(f'Failed to search. Status code: {response.status_code}')
+ self._handle_error(response, 'search')
- def crawl_url(self, url, params=None, wait_until_done=True, poll_interval=2, idempotency_key=None):
+ def crawl_url(self, url: str,
+ params: Optional[Dict[str, Any]] = None,
+ wait_until_done: bool = True,
+ poll_interval: int = 2,
+ idempotency_key: Optional[str] = None) -> Any:
"""
Initiate a crawl job for the specified URL using the Firecrawl API.
@@ -158,7 +159,7 @@ class FirecrawlApp:
else:
self._handle_error(response, 'start crawl job')
- def check_crawl_status(self, job_id):
+ def check_crawl_status(self, job_id: str) -> Any:
"""
Check the status of a crawl job using the Firecrawl API.
@@ -178,7 +179,7 @@ class FirecrawlApp:
else:
self._handle_error(response, 'check crawl status')
- def _prepare_headers(self, idempotency_key=None):
+ def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
"""
Prepare the headers for API requests.
@@ -200,7 +201,11 @@ class FirecrawlApp:
'Authorization': f'Bearer {self.api_key}',
}
- def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+ def _post_request(self, url: str,
+ data: Dict[str, Any],
+ headers: Dict[str, str],
+ retries: int = 3,
+ backoff_factor: float = 0.5) -> requests.Response:
"""
Make a POST request with retries.
@@ -225,7 +230,10 @@ class FirecrawlApp:
return response
return response
- def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+ def _get_request(self, url: str,
+ headers: Dict[str, str],
+ retries: int = 3,
+ backoff_factor: float = 0.5) -> requests.Response:
"""
Make a GET request with retries.
@@ -249,7 +257,7 @@ class FirecrawlApp:
return response
return response
- def _monitor_job_status(self, job_id, headers, poll_interval):
+ def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any:
"""
Monitor the status of a crawl job until completion.
@@ -281,7 +289,7 @@ class FirecrawlApp:
else:
self._handle_error(status_response, 'check crawl status')
- def _handle_error(self, response, action):
+ def _handle_error(self, response: requests.Response, action: str) -> None:
"""
Handle errors from API responses.
@@ -292,8 +300,19 @@ class FirecrawlApp:
Raises:
Exception: An exception with a message containing the status code and error details from the response.
"""
- if response.status_code in [402, 408, 409, 500]:
- error_message = response.json().get('error', 'Unknown error occurred')
- raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
+ error_message = response.json().get('error', 'No additional error details provided.')
+
+ if response.status_code == 402:
+ message = f"Payment Required: Failed to {action}. {error_message}"
+ elif response.status_code == 408:
+ message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}"
+ elif response.status_code == 409:
+ message = f"Conflict: Failed to {action} due to a conflict. {error_message}"
+ elif response.status_code == 500:
+ message = f"Internal Server Error: Failed to {action}. {error_message}"
else:
- raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
+ message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}"
+
+ # Raise an HTTPError with the custom message and attach the response
+ raise requests.exceptions.HTTPError(message, response=response)
+
\ No newline at end of file
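
With `_handle_error` now raising `requests.exceptions.HTTPError` (with the original response attached) instead of a bare `Exception`, callers can branch on the status code as well as the message. A hedged sketch of calling code under that assumption; the URL and key are placeholders:

```python
import requests
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

try:
    data = app.scrape_url("https://example.com")
except requests.exceptions.HTTPError as err:
    # The message carries the mapped prefix ("Payment Required: ...",
    # "Request Timeout: ...", "Conflict: ...", etc.) plus the API's error field.
    print(err)
    # _handle_error attaches the original response, so the raw status is available.
    if err.response is not None:
        print("status:", err.response.status_code)
```

Because `HTTPError` subclasses `Exception`, existing callers that catch `Exception` (including the e2e tests updated above) keep working unchanged.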