diff --git a/apps/api/openapi.json b/apps/api/openapi.json index b07e43f..17b3677 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -511,7 +511,7 @@ "html": { "type": "string", "nullable": true, - "description": "Raw HTML content of the page if `includeHtml` is true" + "description": "Raw HTML content of the page if `includeHtml` is true" }, "metadata": { "type": "object", @@ -526,9 +526,126 @@ "type": "string", "nullable": true }, + "keywords": { + "type": "string", + "nullable": true + }, + "robots": { + "type": "string", + "nullable": true + }, + "ogTitle": { + "type": "string", + "nullable": true + }, + "ogDescription": { + "type": "string", + "nullable": true + }, + "ogUrl": { + "type": "string", + "format": "uri", + "nullable": true + }, + "ogImage": { + "type": "string", + "nullable": true + }, + "ogAudio": { + "type": "string", + "nullable": true + }, + "ogDeterminer": { + "type": "string", + "nullable": true + }, + "ogLocale": { + "type": "string", + "nullable": true + }, + "ogLocaleAlternate": { + "type": "array", + "items": { + "type": "string" + }, + "nullable": true + }, + "ogSiteName": { + "type": "string", + "nullable": true + }, + "ogVideo": { + "type": "string", + "nullable": true + }, + "dctermsCreated": { + "type": "string", + "nullable": true + }, + "dcDateCreated": { + "type": "string", + "nullable": true + }, + "dcDate": { + "type": "string", + "nullable": true + }, + "dctermsType": { + "type": "string", + "nullable": true + }, + "dcType": { + "type": "string", + "nullable": true + }, + "dctermsAudience": { + "type": "string", + "nullable": true + }, + "dctermsSubject": { + "type": "string", + "nullable": true + }, + "dcSubject": { + "type": "string", + "nullable": true + }, + "dcDescription": { + "type": "string", + "nullable": true + }, + "dctermsKeywords": { + "type": "string", + "nullable": true + }, + "modifiedTime": { + "type": "string", + "nullable": true + }, + "publishedTime": { + "type": "string", + "nullable": true + }, + "articleTag": { + "type": "string", + "nullable": true + }, + "articleSection": { + "type": "string", + "nullable": true + }, "sourceURL": { "type": "string", "format": "uri" + }, + "pageStatusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "pageError": { + "type": "string", + "nullable": true, + "description": "The error message of the page" } } }, @@ -577,9 +694,126 @@ "type": "string", "nullable": true }, + "keywords": { + "type": "string", + "nullable": true + }, + "robots": { + "type": "string", + "nullable": true + }, + "ogTitle": { + "type": "string", + "nullable": true + }, + "ogDescription": { + "type": "string", + "nullable": true + }, + "ogUrl": { + "type": "string", + "format": "uri", + "nullable": true + }, + "ogImage": { + "type": "string", + "nullable": true + }, + "ogAudio": { + "type": "string", + "nullable": true + }, + "ogDeterminer": { + "type": "string", + "nullable": true + }, + "ogLocale": { + "type": "string", + "nullable": true + }, + "ogLocaleAlternate": { + "type": "array", + "items": { + "type": "string" + }, + "nullable": true + }, + "ogSiteName": { + "type": "string", + "nullable": true + }, + "ogVideo": { + "type": "string", + "nullable": true + }, + "dctermsCreated": { + "type": "string", + "nullable": true + }, + "dcDateCreated": { + "type": "string", + "nullable": true + }, + "dcDate": { + "type": "string", + "nullable": true + }, + "dctermsType": { + "type": "string", + "nullable": true + }, + "dcType": { + "type": "string", + "nullable": true + }, + 
"dctermsAudience": { + "type": "string", + "nullable": true + }, + "dctermsSubject": { + "type": "string", + "nullable": true + }, + "dcSubject": { + "type": "string", + "nullable": true + }, + "dcDescription": { + "type": "string", + "nullable": true + }, + "dctermsKeywords": { + "type": "string", + "nullable": true + }, + "modifiedTime": { + "type": "string", + "nullable": true + }, + "publishedTime": { + "type": "string", + "nullable": true + }, + "articleTag": { + "type": "string", + "nullable": true + }, + "articleSection": { + "type": "string", + "nullable": true + }, "sourceURL": { "type": "string", "format": "uri" + }, + "pageStatusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "pageError": { + "type": "string", + "nullable": true, + "description": "The error message of the page" } } } diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 780ad39..9149c01 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -83,6 +83,8 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data).not.toHaveProperty("html"); expect(response.body.data.content).toContain("_Roast_"); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + expect(response.body.data.metadata.pageError).toBeUndefined(); }, 30000); // 30 seconds timeout it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => { @@ -103,6 +105,8 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.content).toContain("_Roast_"); expect(response.body.data.markdown).toContain("_Roast_"); expect(response.body.data.html).toContain(" { @@ -118,6 +122,8 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + expect(response.body.data.metadata.pageError).toBeUndefined(); }, 60000); // 60 seconds it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { @@ -133,6 +139,8 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + expect(response.body.data.metadata.pageError).toBeUndefined(); }, 60000); // 60 seconds it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => { @@ -204,6 +212,102 @@ describe("E2E Tests for API Routes", () => { // expect(response.body.data.content).toContain("🔥 Firecrawl"); // expect(duration).toBeGreaterThanOrEqual(7000); // }, 12000); // 12 seconds timeout + + it.concurrent('should return a successful response for a scrape with 400 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/400' }); + await new Promise((r) => 
setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(400); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 401 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/401' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(401); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized"); + }, 60000); // 60 seconds + + it.concurrent("should return a successful response for a scrape with 403 page", async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/403' }); + + await new Promise((r) => setTimeout(r, 5000)); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(403); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 404 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/404' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(404); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 405 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/405' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(405); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("method not allowed"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 500 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + 
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/500' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(500); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("internal server error"); + }, 60000); // 60 seconds }); describe("POST /v0/crawl", () => { @@ -319,6 +423,8 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); }, 60000); // 60 seconds it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => { @@ -400,6 +506,8 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); }, 60000); // 60 seconds it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { @@ -432,6 +540,8 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); const urls = completedResponse.body.data.map( (item: any) => item.metadata?.sourceURL ); @@ -530,6 +640,8 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); // 120 seconds expect(completedResponse.body.data[0]).toHaveProperty("html"); @@ -537,6 +649,8 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0].content).toContain("_Roast_"); expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); expect(completedResponse.body.data[0].html).toContain(" { expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); const childrenLinks = completedResponse.body.data.filter(doc => doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") @@ -714,6 +830,9 @@ 
describe("E2E Tests for API Routes", () => { }) ]) ); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); }, 120000); // 120 seconds it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { @@ -749,6 +868,9 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); + const urls = completedResponse.body.data.map( (item: any) => item.metadata?.sourceURL ); @@ -808,6 +930,8 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0].content).toContain("_Roast_"); expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); expect(completedResponse.body.data[0].html).toContain(" { expect(completedResponse.body.data[0]).toHaveProperty("html"); expect(completedResponse.body.data[0].content).toContain("Mendable"); expect(completedResponse.body.data[0].markdown).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); const onlyChildrenLinks = completedResponse.body.data.filter(doc => { return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") @@ -891,7 +1017,8 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); - + expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined(); }, 60000); // 60 seconds describe("POST /v0/scrape with LLM Extraction", () => { @@ -1046,6 +1173,10 @@ describe("E2E Tests for API Routes", () => { expect(statusResponse.body).toHaveProperty("data"); expect(statusResponse.body.data[0]).toHaveProperty("content"); expect(statusResponse.body.data[0]).toHaveProperty("markdown"); + expect(statusResponse.body.data[0]).toHaveProperty("metadata"); + expect(statusResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(statusResponse.body.data[0].metadata.pageError).toBeUndefined(); + const results = statusResponse.body.data; // results.forEach((result, i) => { // console.log(result.metadata.sourceURL); diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index ed28639..1537c07 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -61,7 +61,7 @@ export async function scrapeHelper( (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 ); if (filteredDocs.length === 0) { - return { success: true, error: "No page found", returnCode: 200 }; + return { success: true, error: "No page found", returnCode: 200, data: docs[0] }; } let creditsToBeBilled = filteredDocs.length; diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index abbc357..b555197 100644 --- 
a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -101,7 +101,7 @@ export async function searchHelper( ); if (filteredDocs.length === 0) { - return { success: true, error: "No page found", returnCode: 200 }; + return { success: true, error: "No page found", returnCode: 200, data: docs }; } const billingResult = await billTeam( diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 0dae9ba..12d8c36 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -121,4 +121,7 @@ export class SearchResult { export interface FireEngineResponse { html: string; screenshot: string; -} \ No newline at end of file + pageStatusCode?: number; + pageError?: string; +} + diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 7720991..8087591 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -224,7 +224,7 @@ export class WebCrawler { return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); } - async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> { + async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> { const normalizedUrl = this.normalizeCrawlUrl(url); if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) { return []; @@ -244,20 +244,27 @@ export class WebCrawler { try { let content: string = ""; + let pageStatusCode: number; + let pageError: string | undefined = undefined; + // If it is the first link, fetch with single url if (this.visited.size === 1) { const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true }); content = page.html ?? ""; + pageStatusCode = page.metadata?.pageStatusCode; + pageError = page.metadata?.pageError || undefined; } else { const response = await axios.get(url); content = response.data ?? ""; + pageStatusCode = response.status; + pageError = response.statusText != "OK" ? response.statusText : undefined; } const $ = load(content); - let links: { url: string, html: string }[] = []; + let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = []; // Add the initial URL to the list of links if (this.visited.size === 1) { - links.push({ url, html: content }); + links.push({ url, html: content, pageStatusCode, pageError }); } $("a").each((_, element) => { @@ -279,7 +286,7 @@ export class WebCrawler { !this.matchesExcludes(path) && this.robots.isAllowed(fullUrl, "FireCrawlAgent") ) { - links.push({ url: fullUrl, html: content }); + links.push({ url: fullUrl, html: content, pageStatusCode, pageError }); } } }); diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 36af58a..3fbc6d1 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -241,7 +241,7 @@ export class WebScraperDataProvider { content: "", html: this.pageOptions?.includeHtml ? 
"" : undefined, markdown: "", - metadata: { sourceURL: url }, + metadata: { sourceURL: url, pageStatusCode: 200 }, })); } @@ -280,10 +280,10 @@ export class WebScraperDataProvider { private async fetchPdfDocuments(pdfLinks: string[]): Promise { return Promise.all( pdfLinks.map(async (pdfLink) => { - const pdfContent = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF); + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF); return { - content: pdfContent, - metadata: { sourceURL: pdfLink }, + content: content, + metadata: { sourceURL: pdfLink, pageStatusCode, pageError }, provider: "web-scraper", }; }) @@ -292,10 +292,10 @@ export class WebScraperDataProvider { private async fetchDocxDocuments(docxLinks: string[]): Promise { return Promise.all( docxLinks.map(async (p) => { - const docXDocument = await fetchAndProcessDocx(p); + const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(p); return { - content: docXDocument, - metadata: { sourceURL: p }, + content, + metadata: { sourceURL: p, pageStatusCode, pageError }, provider: "web-scraper", }; }) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 4723a56..1ba2832 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -83,17 +83,18 @@ export async function scrapWithFireEngine( console.error( `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` ); - return { html: "", screenshot: "" }; + return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError }; } const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { - return { html: await fetchAndProcessPdf(url, pageOptions?.parsePDF), screenshot: "" }; + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); + return { html: content, screenshot: "", pageStatusCode, pageError }; } else { const data = response.data; const html = data.content; const screenshot = data.screenshot; - return { html: html ?? "", screenshot: screenshot ?? "" }; + return { html: html ?? "", screenshot: screenshot ?? 
"", pageStatusCode: data.pageStatusCode, pageError: data.pageError }; } } catch (error) { if (error.code === 'ECONNABORTED') { @@ -110,35 +111,40 @@ export async function scrapWithScrapingBee( wait_browser: string = "domcontentloaded", timeout: number = universalTimeout, pageOptions: { parsePDF?: boolean } = { parsePDF: true } -): Promise { +): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> { try { const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); const clientParams = await generateRequestParams( url, wait_browser, - timeout + timeout, ); - const response = await client.get(clientParams); - - if (response.status !== 200 && response.status !== 404) { - console.error( - `[ScrapingBee] Error fetching url: ${url} with status code ${response.status}` - ); - return ""; - } + const response = await client.get({ + ...clientParams, + params: { + ...clientParams.params, + 'transparent_status_code': 'True' + } + }); const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { - return fetchAndProcessPdf(url, pageOptions?.parsePDF); + return await fetchAndProcessPdf(url, pageOptions?.parsePDF); + } else { - const decoder = new TextDecoder(); - const text = decoder.decode(response.data); - return text; + let text = ""; + try { + const decoder = new TextDecoder(); + text = decoder.decode(response.data); + } catch (decodeError) { + console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`); + } + return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }; } } catch (error) { console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); - return ""; + return { content: "", pageStatusCode: error.response.status, pageError: error.response.statusText }; } } @@ -147,7 +153,7 @@ export async function scrapWithPlaywright( waitFor: number = 0, headers?: Record, pageOptions: { parsePDF?: boolean } = { parsePDF: true } -): Promise { +): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> { try { const reqParams = await generateRequestParams(url); // If the user has passed a wait parameter in the request, use that @@ -169,21 +175,21 @@ export async function scrapWithPlaywright( console.error( `[Playwright] Error fetching url: ${url} with status: ${response.status}` ); - return ""; + return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError }; } const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { - return fetchAndProcessPdf(url, pageOptions?.parsePDF); + return await fetchAndProcessPdf(url, pageOptions?.parsePDF); } else { const textData = response.data; try { const data = JSON.parse(textData); const html = data.content; - return html ?? ""; + return { content: html ?? 
"", pageStatusCode: data.pageStatusCode, pageError: data.pageError }; } catch (jsonError) { console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`); - return ""; + return { content: "" }; } } } catch (error) { @@ -192,14 +198,14 @@ export async function scrapWithPlaywright( } else { console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); } - return ""; + return { content: "" }; } } export async function scrapWithFetch( url: string, pageOptions: { parsePDF?: boolean } = { parsePDF: true } -): Promise { +): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> { try { const response = await axios.get(url, { headers: { @@ -213,15 +219,15 @@ export async function scrapWithFetch( console.error( `[Axios] Error fetching url: ${url} with status: ${response.status}` ); - return ""; + return { content: "", pageStatusCode: response.status, pageError: response.statusText }; } const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { - return fetchAndProcessPdf(url, pageOptions?.parsePDF); + return await fetchAndProcessPdf(url, pageOptions?.parsePDF); } else { const text = response.data; - return text; + return { content: text, pageStatusCode: 200 }; } } catch (error) { if (error.code === 'ECONNABORTED') { @@ -229,7 +235,7 @@ export async function scrapWithFetch( } else { console.error(`[Axios] Error fetching url: ${url} -> ${error}`); } - return ""; + return { content: "" }; } } @@ -335,7 +341,7 @@ export async function scrapSingleUrl( url: string, method: (typeof baseScrapers)[number] ) => { - let text = ""; + let scraperResponse: { text: string, screenshot: string, metadata: { pageStatusCode?: number, pageError?: string | null } } = { text: "", screenshot: "", metadata: {} }; let screenshot = ""; switch (method) { case "fire-engine": @@ -347,38 +353,52 @@ export async function scrapSingleUrl( pageOptions.screenshot, pageOptions.headers ); - text = response.html; - screenshot = response.screenshot; + scraperResponse.text = response.html; + scraperResponse.screenshot = response.screenshot; + scraperResponse.metadata.pageStatusCode = response.pageStatusCode; + scraperResponse.metadata.pageError = response.pageError; } break; case "scrapingBee": if (process.env.SCRAPING_BEE_API_KEY) { - text = await scrapWithScrapingBee( + const response = await scrapWithScrapingBee( url, "domcontentloaded", pageOptions.fallback === false ? 
7000 : 15000 ); + scraperResponse.text = response.content; + scraperResponse.metadata.pageStatusCode = response.pageStatusCode; + scraperResponse.metadata.pageError = response.pageError; } break; case "playwright": if (process.env.PLAYWRIGHT_MICROSERVICE_URL) { - text = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers); + const response = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers); + scraperResponse.text = response.content; + scraperResponse.metadata.pageStatusCode = response.pageStatusCode; + scraperResponse.metadata.pageError = response.pageError; } break; case "scrapingBeeLoad": if (process.env.SCRAPING_BEE_API_KEY) { - text = await scrapWithScrapingBee(url, "networkidle2"); + const response = await scrapWithScrapingBee(url, "networkidle2"); + scraperResponse.text = response.content; + scraperResponse.metadata.pageStatusCode = response.pageStatusCode; + scraperResponse.metadata.pageError = response.pageError; } break; case "fetch": - text = await scrapWithFetch(url); + const response = await scrapWithFetch(url); + scraperResponse.text = response.content; + scraperResponse.metadata.pageStatusCode = response.pageStatusCode; + scraperResponse.metadata.pageError = response.pageError; break; } let customScrapedContent : FireEngineResponse | null = null; // Check for custom scraping conditions - const customScraperResult = await handleCustomScraping(text, url); + const customScraperResult = await handleCustomScraping(scraperResponse.text, url); if (customScraperResult){ switch (customScraperResult.scraper) { @@ -389,23 +409,30 @@ export async function scrapSingleUrl( } break; case "pdf": - customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF), screenshot } + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF); + customScrapedContent = { html: content, screenshot, pageStatusCode, pageError } break; } } if (customScrapedContent) { - text = customScrapedContent.html; + scraperResponse.text = customScrapedContent.html; screenshot = customScrapedContent.screenshot; } //* TODO: add an optional to return markdown or structured/extracted content - let cleanedHtml = removeUnwantedElements(text, pageOptions); + let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); - return [await parseMarkdown(cleanedHtml), text, screenshot]; + return { + text: await parseMarkdown(cleanedHtml), + html: scraperResponse.text, + screenshot: scraperResponse.screenshot, + pageStatusCode: scraperResponse.metadata.pageStatusCode, + pageError: scraperResponse.metadata.pageError || undefined + }; }; + let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined }; try { - let [text, html, screenshot] = ["", "", ""]; let urlKey = urlToScrap; try { urlKey = new URL(urlToScrap).hostname.replace(/^www\./, ""); @@ -428,8 +455,21 @@ export async function scrapSingleUrl( html = existingHtml; break; } - [text, html, screenshot] = await attemptScraping(urlToScrap, scraper); + + const attempt = await attemptScraping(urlToScrap, scraper); + text = attempt.text ?? ''; + html = attempt.html ?? ''; + screenshot = attempt.screenshot ?? 
'';
+      if (attempt.pageStatusCode) {
+        pageStatusCode = attempt.pageStatusCode;
+      }
+      if (attempt.pageError) {
+        pageError = attempt.pageError;
+      }
+
+      if (text && text.trim().length >= 100) break;
+      if (pageStatusCode && pageStatusCode == 404) break;
       const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
       if (nextScraperIndex < scrapersInOrder.length) {
         console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
       }
@@ -453,6 +493,8 @@
         ...metadata,
         screenshot: screenshot,
         sourceURL: urlToScrap,
+        pageStatusCode: pageStatusCode,
+        pageError: pageError
       },
     };
   } else {
@@ -460,7 +502,12 @@
       content: text,
       markdown: text,
       html: pageOptions.includeHtml ? html : undefined,
-      metadata: { ...metadata, sourceURL: urlToScrap },
+      metadata: {
+        ...metadata,
+        sourceURL: urlToScrap,
+        pageStatusCode: pageStatusCode,
+        pageError: pageError
+      },
     };
   }

@@ -471,7 +518,11 @@
       content: "",
       markdown: "",
       html: "",
-      metadata: { sourceURL: urlToScrap },
+      metadata: {
+        sourceURL: urlToScrap,
+        pageStatusCode: pageStatusCode,
+        pageError: pageError
+      },
     } as Document;
   }
 }
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts
index e018ffa..53237ef 100644
--- a/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts
@@ -3,11 +3,13 @@ import * as docxProcessor from "../docxProcessor";
 describe("DOCX Processing Module - Integration Test", () => {
   it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
     delete process.env.LLAMAPARSE_API_KEY;
-    const docxContent = await docxProcessor.fetchAndProcessDocx(
+    const { content, pageStatusCode, pageError } = await docxProcessor.fetchAndProcessDocx(
       "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
     );
-    expect(docxContent.trim()).toContain(
+    expect(content.trim()).toContain(
       "SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
     );
+    expect(pageStatusCode).toBe(200);
+    expect(pageError).toBeUndefined();
   });
 });
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
index f4ed3c6..55930f2 100644
--- a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
@@ -3,8 +3,10 @@ import * as pdfProcessor from '../pdfProcessor';
 describe('PDF Processing Module - Integration Test', () => {
   it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
     delete process.env.LLAMAPARSE_API_KEY;
-    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
-    expect(pdfContent.trim()).toEqual("Dummy PDF file");
+    const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
+    expect(content.trim()).toEqual("Dummy PDF file");
+    expect(pageStatusCode).toEqual(200);
+    expect(pageError).toBeUndefined();
   });

   // We're hitting the LLAMAPARSE rate limit 🫠
diff --git a/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
index 38759f8..a01b8a2 100644
--- a/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
@@ -5,14 +5,14 @@ import path from "path";
 import os from "os";
 import mammoth from "mammoth";

-export async function fetchAndProcessDocx(url: string): Promise<string> {
-  const tempFilePath = await downloadDocx(url);
+export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> {
+  const { tempFilePath, pageStatusCode, pageError } = await downloadDocx(url);
   const content = await processDocxToText(tempFilePath);
   fs.unlinkSync(tempFilePath); // Clean up the temporary file
-  return content;
+  return { content, pageStatusCode, pageError };
 }

-async function downloadDocx(url: string): Promise<string> {
+async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
   const response = await axios({
     url,
     method: "GET",
@@ -25,7 +25,7 @@ async function downloadDocx(url: string): Promise<string> {
   response.data.pipe(writer);

   return new Promise((resolve, reject) => {
-    writer.on("finish", () => resolve(tempFilePath));
+    writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
     writer.on("error", reject);
   });
 }
diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts
index ddaf1e8..3f2052c 100644
--- a/apps/api/src/scraper/WebScraper/utils/metadata.ts
+++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts
@@ -29,6 +29,9 @@ interface Metadata {
   publishedTime?: string;
   articleTag?: string;
   articleSection?: string;
+  sourceURL?: string;
+  pageStatusCode?: number;
+  pageError?: string;
 }

 export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
@@ -61,6 +64,9 @@
   let publishedTime: string | null = null;
   let articleTag: string | null = null;
   let articleSection: string | null = null;
+  let sourceURL: string | null = null;
+  let pageStatusCode: number | null = null;
+  let pageError: string | null = null;

   try {
     title = soup("title").text() || null;
@@ -132,5 +138,8 @@
     ...(publishedTime ? { publishedTime } : {}),
     ...(articleTag ? { articleTag } : {}),
     ...(articleSection ? { articleSection } : {}),
+    ...(sourceURL ? { sourceURL } : {}),
+    ...(pageStatusCode ? { pageStatusCode } : {}),
+    ...(pageError ?
{ pageError } : {}),
   };
 }
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index 1f0d6e8..1a67d60 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -9,14 +9,14 @@ import os from "os";

 dotenv.config();

-export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<string> {
-  const tempFilePath = await downloadPdf(url);
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+  const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
   const content = await processPdfToText(tempFilePath, parsePDF);
   fs.unlinkSync(tempFilePath); // Clean up the temporary file
-  return content;
+  return { content, pageStatusCode, pageError };
 }

-async function downloadPdf(url: string): Promise<string> {
+async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> {
   const response = await axios({
     url,
     method: "GET",
@@ -29,7 +29,7 @@ async function downloadPdf(url: string): Promise<string> {
   response.data.pipe(writer);

   return new Promise((resolve, reject) => {
-    writer.on("finish", () => resolve(tempFilePath));
+    writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
     writer.on("error", reject);
   });
 }
diff --git a/apps/playwright-service/get_error.py b/apps/playwright-service/get_error.py
new file mode 100644
index 0000000..a33de5e
--- /dev/null
+++ b/apps/playwright-service/get_error.py
@@ -0,0 +1,63 @@
+def get_error(status_code: int) -> str:
+    error_messages = {
+        300: "Multiple Choices",
+        301: "Moved Permanently",
+        302: "Found",
+        303: "See Other",
+        304: "Not Modified",
+        305: "Use Proxy",
+        307: "Temporary Redirect",
+        308: "Permanent Redirect",
+        309: "Resume Incomplete",
+        310: "Too Many Redirects",
+        311: "Unavailable For Legal Reasons",
+        312: "Previously Used",
+        313: "I'm Used",
+        314: "Switch Proxy",
+        315: "Temporary Redirect",
+        316: "Resume Incomplete",
+        317: "Too Many Redirects",
+        400: "Bad Request",
+        401: "Unauthorized",
+        403: "Forbidden",
+        404: "Not Found",
+        405: "Method Not Allowed",
+        406: "Not Acceptable",
+        407: "Proxy Authentication Required",
+        408: "Request Timeout",
+        409: "Conflict",
+        410: "Gone",
+        411: "Length Required",
+        412: "Precondition Failed",
+        413: "Payload Too Large",
+        414: "URI Too Long",
+        415: "Unsupported Media Type",
+        416: "Range Not Satisfiable",
+        417: "Expectation Failed",
+        418: "I'm a teapot",
+        421: "Misdirected Request",
+        422: "Unprocessable Entity",
+        423: "Locked",
+        424: "Failed Dependency",
+        425: "Too Early",
+        426: "Upgrade Required",
+        428: "Precondition Required",
+        429: "Too Many Requests",
+        431: "Request Header Fields Too Large",
+        451: "Unavailable For Legal Reasons",
+        500: "Internal Server Error",
+        501: "Not Implemented",
+        502: "Bad Gateway",
+        503: "Service Unavailable",
+        504: "Gateway Timeout",
+        505: "HTTP Version Not Supported",
+        506: "Variant Also Negotiates",
+        507: "Insufficient Storage",
+        508: "Loop Detected",
+        510: "Not Extended",
+        511: "Network Authentication Required",
+        599: "Network Connect Timeout Error"
+    }
+    if status_code < 300:
+        return None
+    return error_messages.get(status_code, "Unknown Error")
diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py
index 8ef7418..bd6b14e 100644
--- a/apps/playwright-service/main.py
+++ b/apps/playwright-service/main.py
@@ -9,6 +9,7 @@ from fastapi import FastAPI
 from fastapi.responses import JSONResponse
 from playwright.async_api import Browser, async_playwright
 from pydantic import BaseModel
+from get_error import get_error

 PROXY_SERVER = environ.get("PROXY_SERVER", None)
 PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
@@ -73,16 +74,22 @@ async def root(body: UrlModel):
     if body.headers:
         await page.set_extra_http_headers(body.headers)

-    await page.goto(
+    response = await page.goto(
         body.url,
         wait_until="load",
         timeout=body.timeout,
     )
+    page_status_code = response.status
+    page_error = get_error(page_status_code)
     # Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases where "load" / "networkidle" is not enough
     if body.wait_after_load > 0:
         await page.wait_for_timeout(body.wait_after_load)

     page_content = await page.content()
     await context.close()
-    json_compatible_item_data = {"content": page_content}
-    return JSONResponse(content=json_compatible_item_data)
+    json_compatible_item_data = {
+        "content": page_content,
+        "pageStatusCode": page_status_code,
+        "pageError": page_error
+    }
+    return JSONResponse(content=json_compatible_item_data)
\ No newline at end of file
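Taken together, this patch threads pageStatusCode and pageError from every scraper path (Fire-Engine, ScrapingBee, Playwright, plain fetch, and the PDF/DOCX processors) into document.metadata, while the API call itself still returns HTTP 200 even when the target page is a 4xx/5xx (see the new httpstat.us e2e tests). A minimal client-side sketch of reading the new fields from a /v0/scrape response is shown below; the base URL default and the FIRECRAWL_BASE_URL / FIRECRAWL_API_KEY environment variables are illustrative assumptions, not values defined in this diff.

// Minimal sketch: reads the pageStatusCode / pageError metadata added by this patch.
// BASE_URL and FIRECRAWL_API_KEY are hypothetical; requires Node 18+ for built-in fetch.
const BASE_URL = process.env.FIRECRAWL_BASE_URL ?? "http://localhost:3002";

async function checkPage(url: string): Promise<void> {
  const res = await fetch(`${BASE_URL}/v0/scrape`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    },
    body: JSON.stringify({ url }),
  });

  // The API responds 200 even when the scraped page itself failed;
  // the page's own status is surfaced in data.metadata.pageStatusCode.
  const { data } = await res.json();
  const status = data?.metadata?.pageStatusCode;
  const error = data?.metadata?.pageError;

  if (typeof status === "number" && status >= 400) {
    console.warn(`${url} -> ${status}: ${error ?? "unknown error"}`);
  } else {
    console.log(`${url} scraped OK (status ${status ?? "unknown"})`);
  }
}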