From 5dd18ca79b9f033ea65a6a5ead02bc15cdb9ed4a Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 14 Jun 2024 09:46:55 -0300 Subject: [PATCH] fixed edge cases --- .../src/__tests__/e2e_withAuth/index.test.ts | 89 +++++++++++++++++-- apps/api/src/controllers/scrape.ts | 2 +- apps/api/src/controllers/search.ts | 2 +- apps/api/src/scraper/WebScraper/single_url.ts | 26 ++++-- 4 files changed, 104 insertions(+), 15 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 4fd35a8..1e1d5e3 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -164,21 +164,100 @@ describe("E2E Tests for API Routes", () => { // expect(duration).toBeGreaterThanOrEqual(7000); // }, 12000); // 12 seconds timeout - it.concurrent('should return a successful response for a scrape with 404 page', async () => { + it.concurrent('should return a successful response for a scrape with 400 page', async () => { const response = await request(TEST_URL) .post('/v0/scrape') .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set('Content-Type', 'application/json') - .send({ url: 'https://mendable.ai/alshdiasuhdasd' }); - await new Promise((r) => setTimeout(r, 6000)); + .send({ url: 'https://httpstat.us/400' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(400); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 401 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + 
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/401' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(401); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized"); + }, 60000); // 60 seconds + + it.concurrent("should return a successful response for a scrape with 403 page", async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/403' }); + + await new Promise((r) => setTimeout(r, 5000)); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(403); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 404 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/404' }); + await new Promise((r) => setTimeout(r, 5000)); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty('data'); expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.content).toContain('Mendable'); 
expect(response.body.data.metadata.pageStatusCode).toBe(404); - expect(response.body.data.metadata.pageError).toBe("Not Found"); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 405 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/405' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(405); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("method not allowed"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 500 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/500' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(500); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("internal server error"); }, 60000); // 60 seconds }); diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index d5ab1de..ae0de6a 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -61,7 +61,7 @@ export async function 
scrapeHelper( (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 ); if (filteredDocs.length === 0) { - return { success: true, error: "No page found", returnCode: 200 }; + return { success: true, error: "No page found", returnCode: 200, data: docs[0] }; } let creditsToBeBilled = filteredDocs.length; diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 7474aae..5427d49 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -100,7 +100,7 @@ export async function searchHelper( ); if (filteredDocs.length === 0) { - return { success: true, error: "No page found", returnCode: 200 }; + return { success: true, error: "No page found", returnCode: 200, data: docs }; } const billingResult = await billTeam( diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index f83771e..3f7e789 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -83,7 +83,7 @@ export async function scrapWithFireEngine( console.error( `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` ); - return { html: "", screenshot: "" }; + return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError }; } const contentType = response.headers["content-type"]; @@ -94,7 +94,7 @@ export async function scrapWithFireEngine( const data = response.data; const html = data.content; const screenshot = data.screenshot; - return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.error }; + return { html: html ?? "", screenshot: screenshot ?? 
"", pageStatusCode: data.pageStatusCode, pageError: data.pageError }; } } catch (error) { if (error.code === 'ECONNABORTED') { @@ -142,7 +142,7 @@ export async function scrapWithScrapingBee( } } catch (error) { console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); - return { content: "" }; + return { content: "", pageStatusCode: error.response?.status, pageError: error.response?.statusText }; /* error.response may be absent when the request fails before any HTTP response is received */ } } @@ -172,7 +172,7 @@ export async function scrapWithPlaywright( console.error( `[Playwright] Error fetching url: ${url} with status: ${response.status}` ); - return { content: "" }; + return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError }; } const contentType = response.headers["content-type"]; @@ -412,8 +412,8 @@ export async function scrapSingleUrl( pageError: scraperResponse.metadata.pageError || undefined }; }; + let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined }; try { - let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined }; let urlKey = urlToScrap; try { urlKey = new URL(urlToScrap).hostname.replace(/^www\./, ""); @@ -441,10 +441,16 @@ export async function scrapSingleUrl( text = attempt.text ?? ''; html = attempt.html ?? ''; screenshot = attempt.screenshot ?? 
''; - pageStatusCode = attempt.pageStatusCode; - pageError = attempt.pageError; + if (attempt.pageStatusCode) { + pageStatusCode = attempt.pageStatusCode; + } + if (attempt.pageError) { + pageError = attempt.pageError; + } + if (text && text.trim().length >= 100) break; + if (pageStatusCode && pageStatusCode == 404) break; const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; if (nextScraperIndex < scrapersInOrder.length) { console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`); @@ -493,7 +499,11 @@ export async function scrapSingleUrl( content: "", markdown: "", html: "", - metadata: { sourceURL: urlToScrap }, + metadata: { + sourceURL: urlToScrap, + pageStatusCode: pageStatusCode, + pageError: pageError + }, } as Document; } }