
Merge pull request #276 from mendableai/feat/issue-266

[Feat] Added metadata.pageStatusCode and metadata.pageError properties
Authored by Rafael Miller on 2024-06-14 12:19:24 -03:00, committed by GitHub.
commit 52d6201c42
15 changed files with 587 additions and 78 deletions
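In short: every document returned by scrape, crawl, and search now carries the HTTP status of the page it came from (metadata.pageStatusCode) and, for non-success statuses, a human-readable label (metadata.pageError). A minimal consumer sketch follows; the endpoint and field names come from the spec and tests below, while the base URL and key handling are placeholder assumptions, not part of this PR:

// Hypothetical caller of the v0 scrape endpoint after this change.
async function scrapeWithStatus(url: string) {
  const res = await fetch("https://api.firecrawl.dev/v0/scrape", { // base URL assumed
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "Authorization": `Bearer ${process.env.FIRECRAWL_API_KEY}`, // placeholder env var
    },
    body: JSON.stringify({ url }),
  });
  const { data } = await res.json();
  // New fields added by this PR: upstream status plus an error label for non-2xx pages.
  console.log(data.metadata.pageStatusCode); // e.g. 200, or 404 for a missing page
  console.log(data.metadata.pageError);      // e.g. "Not Found"; undefined on success
}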

View File

@@ -511,7 +511,7 @@
         "html": {
           "type": "string",
           "nullable": true,
           "description": "Raw HTML content of the page if `includeHtml` is true"
         },
         "metadata": {
           "type": "object",
@@ -526,9 +526,126 @@
             "type": "string",
             "nullable": true
           },
+          "keywords": {
+            "type": "string",
+            "nullable": true
+          },
+          "robots": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogTitle": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogDescription": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogUrl": {
+            "type": "string",
+            "format": "uri",
+            "nullable": true
+          },
+          "ogImage": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogAudio": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogDeterminer": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogLocale": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogLocaleAlternate": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            },
+            "nullable": true
+          },
+          "ogSiteName": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogVideo": {
+            "type": "string",
+            "nullable": true
+          },
+          "dctermsCreated": {
+            "type": "string",
+            "nullable": true
+          },
+          "dcDateCreated": {
+            "type": "string",
+            "nullable": true
+          },
+          "dcDate": {
+            "type": "string",
+            "nullable": true
+          },
+          "dctermsType": {
+            "type": "string",
+            "nullable": true
+          },
+          "dcType": {
+            "type": "string",
+            "nullable": true
+          },
+          "dctermsAudience": {
+            "type": "string",
+            "nullable": true
+          },
+          "dctermsSubject": {
+            "type": "string",
+            "nullable": true
+          },
+          "dcSubject": {
+            "type": "string",
+            "nullable": true
+          },
+          "dcDescription": {
+            "type": "string",
+            "nullable": true
+          },
+          "dctermsKeywords": {
+            "type": "string",
+            "nullable": true
+          },
+          "modifiedTime": {
+            "type": "string",
+            "nullable": true
+          },
+          "publishedTime": {
+            "type": "string",
+            "nullable": true
+          },
+          "articleTag": {
+            "type": "string",
+            "nullable": true
+          },
+          "articleSection": {
+            "type": "string",
+            "nullable": true
+          },
           "sourceURL": {
             "type": "string",
             "format": "uri"
+          },
+          "pageStatusCode": {
+            "type": "integer",
+            "description": "The status code of the page"
+          },
+          "pageError": {
+            "type": "string",
+            "nullable": true,
+            "description": "The error message of the page"
           }
         }
       },
@@ -577,9 +694,126 @@
             "type": "string",
             "nullable": true
           },
+          "keywords": {
+            "type": "string",
+            "nullable": true
+          },
+          "robots": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogTitle": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogDescription": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogUrl": {
+            "type": "string",
+            "format": "uri",
+            "nullable": true
+          },
+          "ogImage": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogAudio": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogDeterminer": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogLocale": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogLocaleAlternate": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            },
+            "nullable": true
+          },
+          "ogSiteName": {
+            "type": "string",
+            "nullable": true
+          },
+          "ogVideo": {
+            "type": "string",
+            "nullable": true
+          },
+          "dctermsCreated": {
+            "type": "string",
+            "nullable": true
+          },
+          "dcDateCreated": {
+            "type": "string",
+            "nullable": true
+          },
+          "dcDate": {
+            "type": "string",
+            "nullable": true
+          },
+          "dctermsType": {
+            "type": "string",
+            "nullable": true
+          },
+          "dcType": {
+            "type": "string",
+            "nullable": true
+          },
+          "dctermsAudience": {
+            "type": "string",
+            "nullable": true
+          },
+          "dctermsSubject": {
+            "type": "string",
+            "nullable": true
+          },
+          "dcSubject": {
+            "type": "string",
+            "nullable": true
+          },
+          "dcDescription": {
+            "type": "string",
+            "nullable": true
+          },
+          "dctermsKeywords": {
+            "type": "string",
+            "nullable": true
+          },
+          "modifiedTime": {
+            "type": "string",
+            "nullable": true
+          },
+          "publishedTime": {
+            "type": "string",
+            "nullable": true
+          },
+          "articleTag": {
+            "type": "string",
+            "nullable": true
+          },
+          "articleSection": {
+            "type": "string",
+            "nullable": true
+          },
           "sourceURL": {
             "type": "string",
             "format": "uri"
+          },
+          "pageStatusCode": {
+            "type": "integer",
+            "description": "The status code of the page"
+          },
+          "pageError": {
+            "type": "string",
+            "nullable": true,
+            "description": "The error message of the page"
           }
         }
       }
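Both response schemas (scrape and crawl) gain the same fields. An illustrative TypeScript view of the documented metadata object — this type is a reading aid derived from the schema additions above, not code from the PR:

// Abridged sketch of the metadata shape the spec now documents.
type DocumentMetadata = {
  title?: string;
  description?: string;
  keywords?: string;
  // ...plus the og*, dc*, dcterms*, and article* fields listed above
  sourceURL?: string;
  pageStatusCode?: number; // HTTP status of the scraped page
  pageError?: string;      // error label for non-success statuses; absent on success
};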

View File

@@ -83,6 +83,8 @@ describe("E2E Tests for API Routes", () => {
      expect(response.body.data).toHaveProperty("metadata");
      expect(response.body.data).not.toHaveProperty("html");
      expect(response.body.data.content).toContain("_Roast_");
+      expect(response.body.data.metadata.pageStatusCode).toBe(200);
+      expect(response.body.data.metadata.pageError).toBeUndefined();
    }, 30000); // 30 seconds timeout

    it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => {
@@ -103,6 +105,8 @@
      expect(response.body.data.content).toContain("_Roast_");
      expect(response.body.data.markdown).toContain("_Roast_");
      expect(response.body.data.html).toContain("<h1");
+      expect(response.body.data.metadata.pageStatusCode).toBe(200);
+      expect(response.body.data.metadata.pageError).toBeUndefined();
    }, 30000); // 30 seconds timeout

    it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
@@ -118,6 +122,8 @@
      expect(response.body.data).toHaveProperty('content');
      expect(response.body.data).toHaveProperty('metadata');
      expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+      expect(response.body.data.metadata.pageStatusCode).toBe(200);
+      expect(response.body.data.metadata.pageError).toBeUndefined();
    }, 60000); // 60 seconds

    it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
@@ -133,6 +139,8 @@
      expect(response.body.data).toHaveProperty('content');
      expect(response.body.data).toHaveProperty('metadata');
      expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+      expect(response.body.data.metadata.pageStatusCode).toBe(200);
+      expect(response.body.data.metadata.pageError).toBeUndefined();
    }, 60000); // 60 seconds

    it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
@@ -204,6 +212,102 @@ describe("E2E Tests for API Routes", () => {
    //   expect(response.body.data.content).toContain("🔥 Firecrawl");
    //   expect(duration).toBeGreaterThanOrEqual(7000);
    // }, 12000); // 12 seconds timeout

+    it.concurrent('should return a successful response for a scrape with 400 page', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/400' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(400);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request");
+    }, 60000); // 60 seconds
+
+    it.concurrent('should return a successful response for a scrape with 401 page', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/401' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(401);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized");
+    }, 60000); // 60 seconds
+
+    it.concurrent("should return a successful response for a scrape with 403 page", async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/403' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(403);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden");
+    }, 60000); // 60 seconds
+
+    it.concurrent('should return a successful response for a scrape with 404 page', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/404' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(404);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found");
+    }, 60000); // 60 seconds
+
+    it.concurrent('should return a successful response for a scrape with 405 page', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/405' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(405);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("method not allowed");
+    }, 60000); // 60 seconds
+
+    it.concurrent('should return a successful response for a scrape with 500 page', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/500' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(500);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("internal server error");
+    }, 60000); // 60 seconds
  });

  describe("POST /v0/crawl", () => {
@@ -319,6 +423,8 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
      expect(completedResponse.body.data[0].content).toContain("Mendable");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
    }, 60000); // 60 seconds

    it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => {
@@ -400,6 +506,8 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
      expect(completedResponse.body.data[0].content).toContain("Mendable");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
    }, 60000); // 60 seconds

    it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
@@ -432,6 +540,8 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("content");
      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
      const urls = completedResponse.body.data.map(
        (item: any) => item.metadata?.sourceURL
      );
@@ -530,6 +640,8 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("content");
      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();

      // 120 seconds
      expect(completedResponse.body.data[0]).toHaveProperty("html");
@@ -537,6 +649,8 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0].content).toContain("_Roast_");
      expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
      expect(completedResponse.body.data[0].html).toContain("<h1");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
    }, 60000);
  });
@@ -671,6 +785,8 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
      expect(completedResponse.body.data[0].content).toContain("Mendable");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();

      const childrenLinks = completedResponse.body.data.filter(doc =>
        doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
@@ -714,6 +830,9 @@ describe("E2E Tests for API Routes", () => {
        })
      ])
    );
+    expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+    expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+    expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
  }, 120000); // 120 seconds

  it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
@@ -749,6 +868,9 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("content");
      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
+
      const urls = completedResponse.body.data.map(
        (item: any) => item.metadata?.sourceURL
      );
@@ -808,6 +930,8 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0].content).toContain("_Roast_");
      expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
      expect(completedResponse.body.data[0].html).toContain("<h1");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
    }, 60000);
  }); // 60 seconds
@@ -851,6 +975,8 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("html");
      expect(completedResponse.body.data[0].content).toContain("Mendable");
      expect(completedResponse.body.data[0].markdown).toContain("Mendable");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();

      const onlyChildrenLinks = completedResponse.body.data.filter(doc => {
        return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
@@ -891,7 +1017,8 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
      expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
      expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined();
    }, 60000); // 60 seconds

  describe("POST /v0/scrape with LLM Extraction", () => {
@@ -1046,6 +1173,10 @@ describe("E2E Tests for API Routes", () => {
      expect(statusResponse.body).toHaveProperty("data");
      expect(statusResponse.body.data[0]).toHaveProperty("content");
      expect(statusResponse.body.data[0]).toHaveProperty("markdown");
+      expect(statusResponse.body.data[0]).toHaveProperty("metadata");
+      expect(statusResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(statusResponse.body.data[0].metadata.pageError).toBeUndefined();
+
      const results = statusResponse.body.data;
      // results.forEach((result, i) => {
      //   console.log(result.metadata.sourceURL);

View File

@@ -61,7 +61,7 @@ export async function scrapeHelper(
    (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
  );

  if (filteredDocs.length === 0) {
-    return { success: true, error: "No page found", returnCode: 200 };
+    return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
  }

  let creditsToBeBilled = filteredDocs.length;

View File

@@ -101,7 +101,7 @@ export async function searchHelper(
  );

  if (filteredDocs.length === 0) {
-    return { success: true, error: "No page found", returnCode: 200 };
+    return { success: true, error: "No page found", returnCode: 200, data: docs };
  }

  const billingResult = await billTeam(

View File

@@ -121,4 +121,7 @@ export class SearchResult {
export interface FireEngineResponse {
  html: string;
  screenshot: string;
-}
+  pageStatusCode?: number;
+  pageError?: string;
+}

View File

@@ -224,7 +224,7 @@ export class WebCrawler {
    return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
  }

-  async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
+  async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
    const normalizedUrl = this.normalizeCrawlUrl(url);
    if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
      return [];
@@ -244,20 +244,27 @@ export class WebCrawler {
    try {
      let content: string = "";
+      let pageStatusCode: number;
+      let pageError: string | undefined = undefined;

      // If it is the first link, fetch with single url
      if (this.visited.size === 1) {
        const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
        content = page.html ?? "";
+        pageStatusCode = page.metadata?.pageStatusCode;
+        pageError = page.metadata?.pageError || undefined;
      } else {
        const response = await axios.get(url);
        content = response.data ?? "";
+        pageStatusCode = response.status;
+        pageError = response.statusText != "OK" ? response.statusText : undefined;
      }

      const $ = load(content);
-      let links: { url: string, html: string }[] = [];
+      let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];

      // Add the initial URL to the list of links
      if (this.visited.size === 1) {
-        links.push({ url, html: content });
+        links.push({ url, html: content, pageStatusCode, pageError });
      }

      $("a").each((_, element) => {
@@ -279,7 +286,7 @@ export class WebCrawler {
            !this.matchesExcludes(path) &&
            this.robots.isAllowed(fullUrl, "FireCrawlAgent")
          ) {
-            links.push({ url: fullUrl, html: content });
+            links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
          }
        }
      });
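One subtlety worth noting in crawl(): the pageStatusCode and pageError pushed for each discovered link are those of the page the link was found on, not of the link's own target; a target gets its own status once the crawler visits and fetches it.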

View File

@@ -241,7 +241,7 @@ export class WebScraperDataProvider {
      content: "",
      html: this.pageOptions?.includeHtml ? "" : undefined,
      markdown: "",
-      metadata: { sourceURL: url },
+      metadata: { sourceURL: url, pageStatusCode: 200 },
    }));
  }

@@ -280,10 +280,10 @@ export class WebScraperDataProvider {
  private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
    return Promise.all(
      pdfLinks.map(async (pdfLink) => {
-        const pdfContent = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
+        const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
        return {
-          content: pdfContent,
-          metadata: { sourceURL: pdfLink },
+          content: content,
+          metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
          provider: "web-scraper",
        };
      })
@@ -292,10 +292,10 @@ export class WebScraperDataProvider {
  private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
    return Promise.all(
      docxLinks.map(async (p) => {
-        const docXDocument = await fetchAndProcessDocx(p);
+        const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(p);
        return {
-          content: docXDocument,
-          metadata: { sourceURL: p },
+          content,
+          metadata: { sourceURL: p, pageStatusCode, pageError },
          provider: "web-scraper",
        };
      })

View File

@@ -83,17 +83,18 @@ export async function scrapWithFireEngine(
      console.error(
        `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
      );
-      return { html: "", screenshot: "" };
+      return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
    }

    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return { html: await fetchAndProcessPdf(url, pageOptions?.parsePDF), screenshot: "" };
+      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      return { html: content, screenshot: "", pageStatusCode, pageError };
    } else {
      const data = response.data;
      const html = data.content;
      const screenshot = data.screenshot;
-      return { html: html ?? "", screenshot: screenshot ?? "" };
+      return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
    }
  } catch (error) {
    if (error.code === 'ECONNABORTED') {
@@ -110,35 +111,40 @@ export async function scrapWithScrapingBee(
  wait_browser: string = "domcontentloaded",
  timeout: number = universalTimeout,
  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<string> {
+): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
  try {
    const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
    const clientParams = await generateRequestParams(
      url,
      wait_browser,
-      timeout
+      timeout,
    );
-    const response = await client.get(clientParams);
-
-    if (response.status !== 200 && response.status !== 404) {
-      console.error(
-        `[ScrapingBee] Error fetching url: ${url} with status code ${response.status}`
-      );
-      return "";
-    }
+    const response = await client.get({
+      ...clientParams,
+      params: {
+        ...clientParams.params,
+        'transparent_status_code': 'True'
+      }
+    });

    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
    } else {
-      const decoder = new TextDecoder();
-      const text = decoder.decode(response.data);
-      return text;
+      let text = "";
+      try {
+        const decoder = new TextDecoder();
+        text = decoder.decode(response.data);
+      } catch (decodeError) {
+        console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`);
+      }
+      return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined };
    }
  } catch (error) {
    console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
-    return "";
+    return { content: "", pageStatusCode: error.response.status, pageError: error.response.statusText };
  }
}
@@ -147,7 +153,7 @@ export async function scrapWithPlaywright(
  waitFor: number = 0,
  headers?: Record<string, string>,
  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<string> {
+): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
  try {
    const reqParams = await generateRequestParams(url);
    // If the user has passed a wait parameter in the request, use that
@@ -169,21 +175,21 @@
      console.error(
        `[Playwright] Error fetching url: ${url} with status: ${response.status}`
      );
-      return "";
+      return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
    }

    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
    } else {
      const textData = response.data;
      try {
        const data = JSON.parse(textData);
        const html = data.content;
-        return html ?? "";
+        return { content: html ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
      } catch (jsonError) {
        console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`);
-        return "";
+        return { content: "" };
      }
    }
  } catch (error) {
@@ -192,14 +198,14 @@
    } else {
      console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
    }
-    return "";
+    return { content: "" };
  }
}

export async function scrapWithFetch(
  url: string,
  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<string> {
+): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
  try {
    const response = await axios.get(url, {
      headers: {
@@ -213,15 +219,15 @@ export async function scrapWithFetch(
      console.error(
        `[Axios] Error fetching url: ${url} with status: ${response.status}`
      );
-      return "";
+      return { content: "", pageStatusCode: response.status, pageError: response.statusText };
    }

    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
    } else {
      const text = response.data;
-      return text;
+      return { content: text, pageStatusCode: 200 };
    }
  } catch (error) {
    if (error.code === 'ECONNABORTED') {
@@ -229,7 +235,7 @@
    } else {
      console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
    }
-    return "";
+    return { content: "" };
  }
}
@@ -335,7 +341,7 @@
    url: string,
    method: (typeof baseScrapers)[number]
  ) => {
-    let text = "";
+    let scraperResponse: { text: string, screenshot: string, metadata: { pageStatusCode?: number, pageError?: string | null } } = { text: "", screenshot: "", metadata: {} };
    let screenshot = "";
    switch (method) {
      case "fire-engine":
@@ -347,38 +353,52 @@
            pageOptions.screenshot,
            pageOptions.headers
          );
-          text = response.html;
-          screenshot = response.screenshot;
+          scraperResponse.text = response.html;
+          scraperResponse.screenshot = response.screenshot;
+          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      case "scrapingBee":
        if (process.env.SCRAPING_BEE_API_KEY) {
-          text = await scrapWithScrapingBee(
+          const response = await scrapWithScrapingBee(
            url,
            "domcontentloaded",
            pageOptions.fallback === false ? 7000 : 15000
          );
+          scraperResponse.text = response.content;
+          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      case "playwright":
        if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
-          text = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
+          const response = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
+          scraperResponse.text = response.content;
+          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      case "scrapingBeeLoad":
        if (process.env.SCRAPING_BEE_API_KEY) {
-          text = await scrapWithScrapingBee(url, "networkidle2");
+          const response = await scrapWithScrapingBee(url, "networkidle2");
+          scraperResponse.text = response.content;
+          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      case "fetch":
-        text = await scrapWithFetch(url);
+        const response = await scrapWithFetch(url);
+        scraperResponse.text = response.content;
+        scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+        scraperResponse.metadata.pageError = response.pageError;
        break;
    }

    let customScrapedContent : FireEngineResponse | null = null;

    // Check for custom scraping conditions
-    const customScraperResult = await handleCustomScraping(text, url);
+    const customScraperResult = await handleCustomScraping(scraperResponse.text, url);

    if (customScraperResult){
      switch (customScraperResult.scraper) {
@@ -389,23 +409,30 @@
          }
          break;
        case "pdf":
-          customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF), screenshot }
+          const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF);
+          customScrapedContent = { html: content, screenshot, pageStatusCode, pageError }
          break;
      }
    }

    if (customScrapedContent) {
-      text = customScrapedContent.html;
+      scraperResponse.text = customScrapedContent.html;
      screenshot = customScrapedContent.screenshot;
    }

    //* TODO: add an optional to return markdown or structured/extracted content
-    let cleanedHtml = removeUnwantedElements(text, pageOptions);
-    return [await parseMarkdown(cleanedHtml), text, screenshot];
+    let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
+
+    return {
+      text: await parseMarkdown(cleanedHtml),
+      html: scraperResponse.text,
+      screenshot: scraperResponse.screenshot,
+      pageStatusCode: scraperResponse.metadata.pageStatusCode,
+      pageError: scraperResponse.metadata.pageError || undefined
+    };
  };

+  let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
  try {
-    let [text, html, screenshot] = ["", "", ""];
    let urlKey = urlToScrap;
    try {
      urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
@@ -428,8 +455,21 @@
        html = existingHtml;
        break;
      }
-      [text, html, screenshot] = await attemptScraping(urlToScrap, scraper);
+
+      const attempt = await attemptScraping(urlToScrap, scraper);
+      text = attempt.text ?? '';
+      html = attempt.html ?? '';
+      screenshot = attempt.screenshot ?? '';
+
+      if (attempt.pageStatusCode) {
+        pageStatusCode = attempt.pageStatusCode;
+      }
+      if (attempt.pageError) {
+        pageError = attempt.pageError;
+      }
+
      if (text && text.trim().length >= 100) break;
+      if (pageStatusCode && pageStatusCode == 404) break;
      const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
      if (nextScraperIndex < scrapersInOrder.length) {
        console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
@@ -453,6 +493,8 @@
        ...metadata,
        screenshot: screenshot,
        sourceURL: urlToScrap,
+        pageStatusCode: pageStatusCode,
+        pageError: pageError
      },
    };
  } else {
@@ -460,7 +502,12 @@
      content: text,
      markdown: text,
      html: pageOptions.includeHtml ? html : undefined,
-      metadata: { ...metadata, sourceURL: urlToScrap },
+      metadata: {
+        ...metadata,
+        sourceURL: urlToScrap,
+        pageStatusCode: pageStatusCode,
+        pageError: pageError
+      },
    };
  }
@@ -471,7 +518,11 @@
      content: "",
      markdown: "",
      html: "",
-      metadata: { sourceURL: urlToScrap },
+      metadata: {
+        sourceURL: urlToScrap,
+        pageStatusCode: pageStatusCode,
+        pageError: pageError
+      },
    } as Document;
  }
}
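The net effect in scrapSingleUrl is that each fallback attempt can update the recorded status, and a hard 404 now short-circuits the scraper chain, since retrying a genuinely missing page with another backend won't help. A distilled sketch of that loop, simplified from the diff above rather than the exact code:

// Simplified view of the new fallback behavior in scrapSingleUrl.
let text = "", pageStatusCode = 200;
let pageError: string | undefined = undefined;
for (const scraper of scrapersInOrder) {
  const attempt = await attemptScraping(urlToScrap, scraper);
  text = attempt.text ?? "";
  if (attempt.pageStatusCode) pageStatusCode = attempt.pageStatusCode;
  if (attempt.pageError) pageError = attempt.pageError;
  if (text.trim().length >= 100) break; // usable content found: stop falling back
  if (pageStatusCode === 404) break;    // page truly missing: another scraper won't help
}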

View File

@@ -3,11 +3,13 @@ import * as docxProcessor from "../docxProcessor";
describe("DOCX Processing Module - Integration Test", () => {
  it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
    delete process.env.LLAMAPARSE_API_KEY;
-    const docxContent = await docxProcessor.fetchAndProcessDocx(
+    const { content, pageStatusCode, pageError } = await docxProcessor.fetchAndProcessDocx(
      "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
    );
-    expect(docxContent.trim()).toContain(
+    expect(content.trim()).toContain(
      "SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
    );
+    expect(pageStatusCode).toBe(200);
+    expect(pageError).toBeUndefined();
  });
});

View File

@@ -3,8 +3,10 @@ import * as pdfProcessor from '../pdfProcessor';
describe('PDF Processing Module - Integration Test', () => {
  it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
    delete process.env.LLAMAPARSE_API_KEY;
-    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
-    expect(pdfContent.trim()).toEqual("Dummy PDF file");
+    const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
+    expect(content.trim()).toEqual("Dummy PDF file");
+    expect(pageStatusCode).toEqual(200);
+    expect(pageError).toBeUndefined();
  });

  // We're hitting the LLAMAPARSE rate limit 🫠

View File

@@ -5,14 +5,14 @@ import path from "path";
import os from "os";
import mammoth from "mammoth";

-export async function fetchAndProcessDocx(url: string): Promise<string> {
-  const tempFilePath = await downloadDocx(url);
+export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> {
+  const { tempFilePath, pageStatusCode, pageError } = await downloadDocx(url);
  const content = await processDocxToText(tempFilePath);
  fs.unlinkSync(tempFilePath); // Clean up the temporary file
-  return content;
+  return { content, pageStatusCode, pageError };
}

-async function downloadDocx(url: string): Promise<string> {
+async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
  const response = await axios({
    url,
    method: "GET",
@@ -25,7 +25,7 @@ async function downloadDocx(url: string): Promise<string> {
  response.data.pipe(writer);

  return new Promise((resolve, reject) => {
-    writer.on("finish", () => resolve(tempFilePath));
+    writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
    writer.on("error", reject);
  });
}

View File

@@ -29,6 +29,9 @@ interface Metadata {
  publishedTime?: string;
  articleTag?: string;
  articleSection?: string;
+  sourceURL?: string;
+  pageStatusCode?: number;
+  pageError?: string;
}

export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
@@ -61,6 +64,9 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
  let publishedTime: string | null = null;
  let articleTag: string | null = null;
  let articleSection: string | null = null;
+  let sourceURL: string | null = null;
+  let pageStatusCode: number | null = null;
+  let pageError: string | null = null;

  try {
    title = soup("title").text() || null;
@@ -132,5 +138,8 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
    ...(publishedTime ? { publishedTime } : {}),
    ...(articleTag ? { articleTag } : {}),
    ...(articleSection ? { articleSection } : {}),
+    ...(sourceURL ? { sourceURL } : {}),
+    ...(pageStatusCode ? { pageStatusCode } : {}),
+    ...(pageError ? { pageError } : {}),
  };
}

View File

@@ -9,14 +9,14 @@ import os from "os";

dotenv.config();

-export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<string> {
-  const tempFilePath = await downloadPdf(url);
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+  const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
  const content = await processPdfToText(tempFilePath, parsePDF);
  fs.unlinkSync(tempFilePath); // Clean up the temporary file
-  return content;
+  return { content, pageStatusCode, pageError };
}

-async function downloadPdf(url: string): Promise<string> {
+async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> {
  const response = await axios({
    url,
    method: "GET",
@@ -29,7 +29,7 @@ async function downloadPdf(url: string): Promise<string> {
  response.data.pipe(writer);

  return new Promise((resolve, reject) => {
-    writer.on("finish", () => resolve(tempFilePath));
+    writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
    writer.on("error", reject);
  });
}

View File

@@ -0,0 +1,63 @@
+def get_error(status_code: int) -> str:
+    error_messages = {
+        300: "Multiple Choices",
+        301: "Moved Permanently",
+        302: "Found",
+        303: "See Other",
+        304: "Not Modified",
+        305: "Use Proxy",
+        307: "Temporary Redirect",
+        308: "Permanent Redirect",
+        309: "Resume Incomplete",
+        310: "Too Many Redirects",
+        311: "Unavailable For Legal Reasons",
+        312: "Previously Used",
+        313: "I'm Used",
+        314: "Switch Proxy",
+        315: "Temporary Redirect",
+        316: "Resume Incomplete",
+        317: "Too Many Redirects",
+        400: "Bad Request",
+        401: "Unauthorized",
+        403: "Forbidden",
+        404: "Not Found",
+        405: "Method Not Allowed",
+        406: "Not Acceptable",
+        407: "Proxy Authentication Required",
+        408: "Request Timeout",
+        409: "Conflict",
+        410: "Gone",
+        411: "Length Required",
+        412: "Precondition Failed",
+        413: "Payload Too Large",
+        414: "URI Too Long",
+        415: "Unsupported Media Type",
+        416: "Range Not Satisfiable",
+        417: "Expectation Failed",
+        418: "I'm a teapot",
+        421: "Misdirected Request",
+        422: "Unprocessable Entity",
+        423: "Locked",
+        424: "Failed Dependency",
+        425: "Too Early",
+        426: "Upgrade Required",
+        428: "Precondition Required",
+        429: "Too Many Requests",
+        431: "Request Header Fields Too Large",
+        451: "Unavailable For Legal Reasons",
+        500: "Internal Server Error",
+        501: "Not Implemented",
+        502: "Bad Gateway",
+        503: "Service Unavailable",
+        504: "Gateway Timeout",
+        505: "HTTP Version Not Supported",
+        506: "Variant Also Negotiates",
+        507: "Insufficient Storage",
+        508: "Loop Detected",
+        510: "Not Extended",
+        511: "Network Authentication Required",
+        599: "Network Connect Timeout Error"
+    }
+    if status_code < 300:
+        return None
+    return error_messages.get(status_code, "Unknown Error")

View File

@@ -9,6 +9,7 @@ from fastapi import FastAPI
from fastapi.responses import JSONResponse
from playwright.async_api import Browser, async_playwright
from pydantic import BaseModel
+from get_error import get_error

PROXY_SERVER = environ.get("PROXY_SERVER", None)
PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
@@ -73,16 +74,22 @@ async def root(body: UrlModel):
        if body.headers:
            await page.set_extra_http_headers(body.headers)

-        await page.goto(
+        response = await page.goto(
            body.url,
            wait_until="load",
            timeout=body.timeout,
        )
+        page_status_code = response.status
+        page_error = get_error(page_status_code)

        # Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases were "load" / "networkidle" is not enough
        if body.wait_after_load > 0:
            await page.wait_for_timeout(body.wait_after_load)

        page_content = await page.content()
        await context.close()
-        json_compatible_item_data = {"content": page_content}
-        return JSONResponse(content=json_compatible_item_data)
+        json_compatible_item_data = {
+            "content": page_content,
+            "pageStatusCode": page_status_code,
+            "pageError": page_error
+        }
+        return JSONResponse(content=json_compatible_item_data)
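On the TypeScript side, scrapWithPlaywright (above) reads exactly these keys from the microservice's JSON. An illustrative type for the payload — again a reading aid, not code from the PR:

// Shape of the Playwright service response consumed by scrapWithPlaywright.
interface PlaywrightServiceResponse {
  content: string;          // page HTML from page.content()
  pageStatusCode: number;   // status of the page.goto() response
  pageError: string | null; // get_error() label; null for statuses below 300
}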