0

fixed edge cases

This commit is contained in:
rafaelsideguide 2024-06-14 09:46:55 -03:00
parent bb859ae9a7
commit 5dd18ca79b
4 changed files with 104 additions and 15 deletions

View File

@ -164,21 +164,100 @@ describe("E2E Tests for API Routes", () => {
// expect(duration).toBeGreaterThanOrEqual(7000); // expect(duration).toBeGreaterThanOrEqual(7000);
// }, 12000); // 12 seconds timeout // }, 12000); // 12 seconds timeout
it.concurrent('should return a successful response for a scrape with 404 page', async () => { it.concurrent('should return a successful response for a scrape with 400 page', async () => {
const response = await request(TEST_URL) const response = await request(TEST_URL)
.post('/v0/scrape') .post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json') .set('Content-Type', 'application/json')
.send({ url: 'https://mendable.ai/alshdiasuhdasd' }); .send({ url: 'https://httpstat.us/400' });
await new Promise((r) => setTimeout(r, 6000)); await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.metadata.pageStatusCode).toBe(400);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request");
}, 60000); // 60 seconds
it.concurrent('should return a successful response for a scrape with 401 page', async () => {
const response = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/401' });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.metadata.pageStatusCode).toBe(401);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized");
}, 60000); // 60 seconds
it.concurrent("should return a successful response for a scrape with 403 page", async () => {
const response = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/403' });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.metadata.pageStatusCode).toBe(403);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden");
}, 60000); // 60 seconds
it.concurrent('should return a successful response for a scrape with 404 page', async () => {
const response = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/404' });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200); expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data'); expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.content).toContain('Mendable');
expect(response.body.data.metadata.pageStatusCode).toBe(404); expect(response.body.data.metadata.pageStatusCode).toBe(404);
expect(response.body.data.metadata.pageError).toBe("Not Found"); expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found");
}, 60000); // 60 seconds
it.concurrent('should return a successful response for a scrape with 405 page', async () => {
const response = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/405' });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.metadata.pageStatusCode).toBe(405);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("method not allowed");
}, 60000); // 60 seconds
it.concurrent('should return a successful response for a scrape with 500 page', async () => {
const response = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/500' });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.metadata.pageStatusCode).toBe(500);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("internal server error");
}, 60000); // 60 seconds }, 60000); // 60 seconds
}); });

View File

@ -61,7 +61,7 @@ export async function scrapeHelper(
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0 (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
); );
if (filteredDocs.length === 0) { if (filteredDocs.length === 0) {
return { success: true, error: "No page found", returnCode: 200 }; return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
} }
let creditsToBeBilled = filteredDocs.length; let creditsToBeBilled = filteredDocs.length;

View File

@ -100,7 +100,7 @@ export async function searchHelper(
); );
if (filteredDocs.length === 0) { if (filteredDocs.length === 0) {
return { success: true, error: "No page found", returnCode: 200 }; return { success: true, error: "No page found", returnCode: 200, data: docs };
} }
const billingResult = await billTeam( const billingResult = await billTeam(

View File

@ -83,7 +83,7 @@ export async function scrapWithFireEngine(
console.error( console.error(
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
); );
return { html: "", screenshot: "" }; return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
} }
const contentType = response.headers["content-type"]; const contentType = response.headers["content-type"];
@ -94,7 +94,7 @@ export async function scrapWithFireEngine(
const data = response.data; const data = response.data;
const html = data.content; const html = data.content;
const screenshot = data.screenshot; const screenshot = data.screenshot;
return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.error }; return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
} }
} catch (error) { } catch (error) {
if (error.code === 'ECONNABORTED') { if (error.code === 'ECONNABORTED') {
@ -142,7 +142,7 @@ export async function scrapWithScrapingBee(
} }
} catch (error) { } catch (error) {
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
return { content: "" }; return { content: "", pageStatusCode: error.response.status, pageError: error.response.statusText };
} }
} }
@ -172,7 +172,7 @@ export async function scrapWithPlaywright(
console.error( console.error(
`[Playwright] Error fetching url: ${url} with status: ${response.status}` `[Playwright] Error fetching url: ${url} with status: ${response.status}`
); );
return { content: "" }; return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
} }
const contentType = response.headers["content-type"]; const contentType = response.headers["content-type"];
@ -412,8 +412,8 @@ export async function scrapSingleUrl(
pageError: scraperResponse.metadata.pageError || undefined pageError: scraperResponse.metadata.pageError || undefined
}; };
}; };
try {
let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined }; let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
try {
let urlKey = urlToScrap; let urlKey = urlToScrap;
try { try {
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, ""); urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
@ -441,10 +441,16 @@ export async function scrapSingleUrl(
text = attempt.text ?? ''; text = attempt.text ?? '';
html = attempt.html ?? ''; html = attempt.html ?? '';
screenshot = attempt.screenshot ?? ''; screenshot = attempt.screenshot ?? '';
if (attempt.pageStatusCode) {
pageStatusCode = attempt.pageStatusCode; pageStatusCode = attempt.pageStatusCode;
}
if (attempt.pageError) {
pageError = attempt.pageError; pageError = attempt.pageError;
}
if (text && text.trim().length >= 100) break; if (text && text.trim().length >= 100) break;
if (pageStatusCode && pageStatusCode == 404) break;
const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
if (nextScraperIndex < scrapersInOrder.length) { if (nextScraperIndex < scrapersInOrder.length) {
console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`); console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
@ -493,7 +499,11 @@ export async function scrapSingleUrl(
content: "", content: "",
markdown: "", markdown: "",
html: "", html: "",
metadata: { sourceURL: urlToScrap }, metadata: {
sourceURL: urlToScrap,
pageStatusCode: pageStatusCode,
pageError: pageError
},
} as Document; } as Document;
} }
} }