fixed edge cases
This commit is contained in:
parent
bb859ae9a7
commit
5dd18ca79b
@ -164,21 +164,100 @@ describe("E2E Tests for API Routes", () => {
|
||||
// expect(duration).toBeGreaterThanOrEqual(7000);
|
||||
// }, 12000); // 12 seconds timeout
|
||||
|
||||
// NOTE(review): this span is a rendered diff hunk with no +/- markers. The
// '404 page' header, the mendable.ai `.send` and the 6000 ms delay look like the
// replaced versions of the lines that directly follow them — confirm against the commit.
it.concurrent('should return a successful response for a scrape with 404 page', async () => {
|
||||
// 400-page case: posts https://httpstat.us/400 to /v0/scrape and expects an
// API-level 200 whose `data.metadata` carries pageStatusCode 400 and an error
// text containing "bad request".
it.concurrent('should return a successful response for a scrape with 400 page', async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://mendable.ai/alshdiasuhdasd' });
|
||||
await new Promise((r) => setTimeout(r, 6000));
|
||||
.send({ url: 'https://httpstat.us/400' });
|
||||
// NOTE(review): the request above is already awaited, so this 5 s pause does
// not change `response` — confirm it is still needed.
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(400);
|
||||
// Case-insensitive substring match on the reported error text.
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request");
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
// 401-page case: posts https://httpstat.us/401 to /v0/scrape and expects an
// API-level 200 whose `data.metadata` carries pageStatusCode 401 and an error
// text containing "unauthorized".
it.concurrent('should return a successful response for a scrape with 401 page', async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/401' });
|
||||
// NOTE(review): the request above is already awaited, so this 5 s pause does
// not change `response` — confirm it is still needed.
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(401);
|
||||
// Case-insensitive substring match on the reported error text.
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized");
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
// 403-page case: posts https://httpstat.us/403 to /v0/scrape and expects an
// API-level 200 whose `data.metadata` carries pageStatusCode 403 and an error
// text containing "forbidden".
it.concurrent("should return a successful response for a scrape with 403 page", async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/403' });
|
||||
|
||||
// NOTE(review): the request above is already awaited, so this 5 s pause does
// not change `response` — confirm it is still needed.
await new Promise((r) => setTimeout(r, 5000));
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(403);
|
||||
// Case-insensitive substring match on the reported error text.
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden");
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
// 404-page case: posts https://httpstat.us/404 to /v0/scrape and expects an
// API-level 200 whose `data.metadata` carries pageStatusCode 404.
// NOTE(review): this span is a rendered diff hunk with no +/- markers. The
// 'Mendable' content check and the exact "Not Found" match look like pre-change
// lines superseded by the case-insensitive check at the end — confirm against the commit.
it.concurrent('should return a successful response for a scrape with 404 page', async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/404' });
|
||||
// NOTE(review): the request above is already awaited, so this 5 s pause does
// not change `response` — confirm it is still needed.
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.content).toContain('Mendable');
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(404);
|
||||
expect(response.body.data.metadata.pageError).toBe("Not Found");
|
||||
// Case-insensitive substring match on the reported error text.
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found");
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
// 405-page case: posts https://httpstat.us/405 to /v0/scrape and expects an
// API-level 200 whose `data.metadata` carries pageStatusCode 405 and an error
// text containing "method not allowed".
it.concurrent('should return a successful response for a scrape with 405 page', async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/405' });
|
||||
// NOTE(review): the request above is already awaited, so this 5 s pause does
// not change `response` — confirm it is still needed.
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(405);
|
||||
// Case-insensitive substring match on the reported error text.
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("method not allowed");
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
// 500-page case: posts https://httpstat.us/500 to /v0/scrape and expects an
// API-level 200 whose `data.metadata` carries pageStatusCode 500 and an error
// text containing "internal server error".
it.concurrent('should return a successful response for a scrape with 500 page', async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/500' });
|
||||
// NOTE(review): the request above is already awaited, so this 5 s pause does
// not change `response` — confirm it is still needed.
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(500);
|
||||
// Case-insensitive substring match on the reported error text.
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("internal server error");
|
||||
}, 60000); // 60 seconds
|
||||
});
|
||||
|
||||
|
@ -61,7 +61,7 @@ export async function scrapeHelper(
|
||||
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
|
||||
);
|
||||
if (filteredDocs.length === 0) {
|
||||
return { success: true, error: "No page found", returnCode: 200 };
|
||||
return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
|
||||
}
|
||||
|
||||
let creditsToBeBilled = filteredDocs.length;
|
||||
|
@ -100,7 +100,7 @@ export async function searchHelper(
|
||||
);
|
||||
|
||||
if (filteredDocs.length === 0) {
|
||||
return { success: true, error: "No page found", returnCode: 200 };
|
||||
return { success: true, error: "No page found", returnCode: 200, data: docs };
|
||||
}
|
||||
|
||||
const billingResult = await billTeam(
|
||||
|
@ -83,7 +83,7 @@ export async function scrapWithFireEngine(
|
||||
console.error(
|
||||
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
|
||||
);
|
||||
return { html: "", screenshot: "" };
|
||||
return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
|
||||
}
|
||||
|
||||
const contentType = response.headers["content-type"];
|
||||
@ -94,7 +94,7 @@ export async function scrapWithFireEngine(
|
||||
const data = response.data;
|
||||
const html = data.content;
|
||||
const screenshot = data.screenshot;
|
||||
return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.error };
|
||||
return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
|
||||
}
|
||||
} catch (error) {
|
||||
if (error.code === 'ECONNABORTED') {
|
||||
@ -142,7 +142,7 @@ export async function scrapWithScrapingBee(
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
|
||||
return { content: "" };
|
||||
return { content: "", pageStatusCode: error.response.status, pageError: error.response.statusText };
|
||||
}
|
||||
}
|
||||
|
||||
@ -172,7 +172,7 @@ export async function scrapWithPlaywright(
|
||||
console.error(
|
||||
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
|
||||
);
|
||||
return { content: "" };
|
||||
return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
|
||||
}
|
||||
|
||||
const contentType = response.headers["content-type"];
|
||||
@ -412,8 +412,8 @@ export async function scrapSingleUrl(
|
||||
pageError: scraperResponse.metadata.pageError || undefined
|
||||
};
|
||||
};
|
||||
let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
|
||||
try {
|
||||
let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
|
||||
let urlKey = urlToScrap;
|
||||
try {
|
||||
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
|
||||
@ -441,10 +441,16 @@ export async function scrapSingleUrl(
|
||||
text = attempt.text ?? '';
|
||||
html = attempt.html ?? '';
|
||||
screenshot = attempt.screenshot ?? '';
|
||||
pageStatusCode = attempt.pageStatusCode;
|
||||
pageError = attempt.pageError;
|
||||
if (attempt.pageStatusCode) {
|
||||
pageStatusCode = attempt.pageStatusCode;
|
||||
}
|
||||
if (attempt.pageError) {
|
||||
pageError = attempt.pageError;
|
||||
}
|
||||
|
||||
|
||||
if (text && text.trim().length >= 100) break;
|
||||
if (pageStatusCode && pageStatusCode == 404) break;
|
||||
const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
|
||||
if (nextScraperIndex < scrapersInOrder.length) {
|
||||
console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
|
||||
@ -493,7 +499,11 @@ export async function scrapSingleUrl(
|
||||
content: "",
|
||||
markdown: "",
|
||||
html: "",
|
||||
metadata: { sourceURL: urlToScrap },
|
||||
metadata: {
|
||||
sourceURL: urlToScrap,
|
||||
pageStatusCode: pageStatusCode,
|
||||
pageError: pageError
|
||||
},
|
||||
} as Document;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user