0

fixed edge cases

This commit is contained in:
rafaelsideguide 2024-06-14 09:46:55 -03:00
parent bb859ae9a7
commit 5dd18ca79b
4 changed files with 104 additions and 15 deletions

View File

@ -164,21 +164,100 @@ describe("E2E Tests for API Routes", () => {
// expect(duration).toBeGreaterThanOrEqual(7000);
// }, 12000); // 12 seconds timeout
it.concurrent('should return a successful response for a scrape with 404 page', async () => {
it.concurrent('should return a successful response for a scrape with 400 page', async () => {
const response = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://mendable.ai/alshdiasuhdasd' });
await new Promise((r) => setTimeout(r, 6000));
.send({ url: 'https://httpstat.us/400' });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.metadata.pageStatusCode).toBe(400);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request");
}, 60000); // 60 seconds
// E2E: scraping a URL that answers HTTP 401 must still yield an API-level 200,
// with the upstream status and error text surfaced in `data.metadata`.
it.concurrent('should return a successful response for a scrape with 401 page', async () => {
  const res = await request(TEST_URL)
    .post('/v0/scrape')
    .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
    .set('Content-Type', 'application/json')
    .send({ url: 'https://httpstat.us/401' });

  // Fixed 5-second pause that runs after the response has already arrived.
  // NOTE(review): its purpose is not visible here — presumably it spaces out
  // the concurrent tests; confirm before removing.
  await new Promise((resolve) => setTimeout(resolve, 5000));

  // The scrape endpoint itself reports success.
  const { statusCode, body } = res;
  expect(statusCode).toBe(200);
  expect(body).toHaveProperty('data');

  // The document payload is present even though the page failed.
  const { data } = body;
  expect(data).toHaveProperty('content');
  expect(data).toHaveProperty('metadata');

  // Upstream status code and a case-insensitive match on its reason phrase.
  const { pageStatusCode, pageError } = data.metadata;
  expect(pageStatusCode).toBe(401);
  expect(pageError.toLowerCase()).toContain("unauthorized");
}, 60000); // 60 seconds
// E2E: scraping a URL that answers HTTP 403 must still yield an API-level 200,
// with the upstream status and error text surfaced in `data.metadata`.
it.concurrent("should return a successful response for a scrape with 403 page", async () => {
  const res = await request(TEST_URL)
    .post('/v0/scrape')
    .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
    .set('Content-Type', 'application/json')
    .send({ url: 'https://httpstat.us/403' });

  // Fixed 5-second pause that runs after the response has already arrived.
  // NOTE(review): its purpose is not visible here — presumably it spaces out
  // the concurrent tests; confirm before removing.
  await new Promise((resolve) => setTimeout(resolve, 5000));

  // The scrape endpoint itself reports success.
  const { statusCode, body } = res;
  expect(statusCode).toBe(200);
  expect(body).toHaveProperty('data');

  // The document payload is present even though the page failed.
  const { data } = body;
  expect(data).toHaveProperty('content');
  expect(data).toHaveProperty('metadata');

  // Upstream status code and a case-insensitive match on its reason phrase.
  const { pageStatusCode, pageError } = data.metadata;
  expect(pageStatusCode).toBe(403);
  expect(pageError.toLowerCase()).toContain("forbidden");
}, 60000); // 60 seconds
it.concurrent('should return a successful response for a scrape with 404 page', async () => {
const response = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/404' });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.content).toContain('Mendable');
expect(response.body.data.metadata.pageStatusCode).toBe(404);
expect(response.body.data.metadata.pageError).toBe("Not Found");
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found");
}, 60000); // 60 seconds
// E2E: scraping a URL that answers HTTP 405 must still yield an API-level 200,
// with the upstream status and error text surfaced in `data.metadata`.
it.concurrent('should return a successful response for a scrape with 405 page', async () => {
  const res = await request(TEST_URL)
    .post('/v0/scrape')
    .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
    .set('Content-Type', 'application/json')
    .send({ url: 'https://httpstat.us/405' });

  // Fixed 5-second pause that runs after the response has already arrived.
  // NOTE(review): its purpose is not visible here — presumably it spaces out
  // the concurrent tests; confirm before removing.
  await new Promise((resolve) => setTimeout(resolve, 5000));

  // The scrape endpoint itself reports success.
  const { statusCode, body } = res;
  expect(statusCode).toBe(200);
  expect(body).toHaveProperty('data');

  // The document payload is present even though the page failed.
  const { data } = body;
  expect(data).toHaveProperty('content');
  expect(data).toHaveProperty('metadata');

  // Upstream status code and a case-insensitive match on its reason phrase.
  const { pageStatusCode, pageError } = data.metadata;
  expect(pageStatusCode).toBe(405);
  expect(pageError.toLowerCase()).toContain("method not allowed");
}, 60000); // 60 seconds
// E2E: scraping a URL that answers HTTP 500 must still yield an API-level 200,
// with the upstream status and error text surfaced in `data.metadata`.
it.concurrent('should return a successful response for a scrape with 500 page', async () => {
  const res = await request(TEST_URL)
    .post('/v0/scrape')
    .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
    .set('Content-Type', 'application/json')
    .send({ url: 'https://httpstat.us/500' });

  // Fixed 5-second pause that runs after the response has already arrived.
  // NOTE(review): its purpose is not visible here — presumably it spaces out
  // the concurrent tests; confirm before removing.
  await new Promise((resolve) => setTimeout(resolve, 5000));

  // The scrape endpoint itself reports success.
  const { statusCode, body } = res;
  expect(statusCode).toBe(200);
  expect(body).toHaveProperty('data');

  // The document payload is present even though the page failed.
  const { data } = body;
  expect(data).toHaveProperty('content');
  expect(data).toHaveProperty('metadata');

  // Upstream status code and a case-insensitive match on its reason phrase.
  const { pageStatusCode, pageError } = data.metadata;
  expect(pageStatusCode).toBe(500);
  expect(pageError.toLowerCase()).toContain("internal server error");
}, 60000); // 60 seconds
});

View File

@ -61,7 +61,7 @@ export async function scrapeHelper(
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
);
if (filteredDocs.length === 0) {
return { success: true, error: "No page found", returnCode: 200 };
return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
}
let creditsToBeBilled = filteredDocs.length;

View File

@ -100,7 +100,7 @@ export async function searchHelper(
);
if (filteredDocs.length === 0) {
return { success: true, error: "No page found", returnCode: 200 };
return { success: true, error: "No page found", returnCode: 200, data: docs };
}
const billingResult = await billTeam(

View File

@ -83,7 +83,7 @@ export async function scrapWithFireEngine(
console.error(
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
);
return { html: "", screenshot: "" };
return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
}
const contentType = response.headers["content-type"];
@ -94,7 +94,7 @@ export async function scrapWithFireEngine(
const data = response.data;
const html = data.content;
const screenshot = data.screenshot;
return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.error };
return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
}
} catch (error) {
if (error.code === 'ECONNABORTED') {
@ -142,7 +142,7 @@ export async function scrapWithScrapingBee(
}
} catch (error) {
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
return { content: "" };
return { content: "", pageStatusCode: error.response.status, pageError: error.response.statusText };
}
}
@ -172,7 +172,7 @@ export async function scrapWithPlaywright(
console.error(
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
);
return { content: "" };
return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
}
const contentType = response.headers["content-type"];
@ -412,8 +412,8 @@ export async function scrapSingleUrl(
pageError: scraperResponse.metadata.pageError || undefined
};
};
try {
let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
try {
let urlKey = urlToScrap;
try {
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
@ -441,10 +441,16 @@ export async function scrapSingleUrl(
text = attempt.text ?? '';
html = attempt.html ?? '';
screenshot = attempt.screenshot ?? '';
if (attempt.pageStatusCode) {
pageStatusCode = attempt.pageStatusCode;
}
if (attempt.pageError) {
pageError = attempt.pageError;
}
if (text && text.trim().length >= 100) break;
if (pageStatusCode && pageStatusCode == 404) break;
const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
if (nextScraperIndex < scrapersInOrder.length) {
console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
@ -493,7 +499,11 @@ export async function scrapSingleUrl(
content: "",
markdown: "",
html: "",
metadata: { sourceURL: urlToScrap },
metadata: {
sourceURL: urlToScrap,
pageStatusCode: pageStatusCode,
pageError: pageError
},
} as Document;
}
}