fixed edge cases
This commit is contained in:
parent
bb859ae9a7
commit
5dd18ca79b
@ -164,21 +164,100 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
// expect(duration).toBeGreaterThanOrEqual(7000);
|
// expect(duration).toBeGreaterThanOrEqual(7000);
|
||||||
// }, 12000); // 12 seconds timeout
|
// }, 12000); // 12 seconds timeout
|
||||||
|
|
||||||
it.concurrent('should return a successful response for a scrape with 404 page', async () => {
|
it.concurrent('should return a successful response for a scrape with 400 page', async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post('/v0/scrape')
|
.post('/v0/scrape')
|
||||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set('Content-Type', 'application/json')
|
.set('Content-Type', 'application/json')
|
||||||
.send({ url: 'https://mendable.ai/alshdiasuhdasd' });
|
.send({ url: 'https://httpstat.us/400' });
|
||||||
await new Promise((r) => setTimeout(r, 6000));
|
await new Promise((r) => setTimeout(r, 5000));
|
||||||
|
|
||||||
|
expect(response.statusCode).toBe(200);
|
||||||
|
expect(response.body).toHaveProperty('data');
|
||||||
|
expect(response.body.data).toHaveProperty('content');
|
||||||
|
expect(response.body.data).toHaveProperty('metadata');
|
||||||
|
expect(response.body.data.metadata.pageStatusCode).toBe(400);
|
||||||
|
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request");
|
||||||
|
}, 60000); // 60 seconds
|
||||||
|
|
||||||
|
it.concurrent('should return a successful response for a scrape with 401 page', async () => {
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.post('/v0/scrape')
|
||||||
|
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set('Content-Type', 'application/json')
|
||||||
|
.send({ url: 'https://httpstat.us/401' });
|
||||||
|
await new Promise((r) => setTimeout(r, 5000));
|
||||||
|
|
||||||
|
expect(response.statusCode).toBe(200);
|
||||||
|
expect(response.body).toHaveProperty('data');
|
||||||
|
expect(response.body.data).toHaveProperty('content');
|
||||||
|
expect(response.body.data).toHaveProperty('metadata');
|
||||||
|
expect(response.body.data.metadata.pageStatusCode).toBe(401);
|
||||||
|
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized");
|
||||||
|
}, 60000); // 60 seconds
|
||||||
|
|
||||||
|
it.concurrent("should return a successful response for a scrape with 403 page", async () => {
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.post('/v0/scrape')
|
||||||
|
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set('Content-Type', 'application/json')
|
||||||
|
.send({ url: 'https://httpstat.us/403' });
|
||||||
|
|
||||||
|
await new Promise((r) => setTimeout(r, 5000));
|
||||||
|
expect(response.statusCode).toBe(200);
|
||||||
|
expect(response.body).toHaveProperty('data');
|
||||||
|
expect(response.body.data).toHaveProperty('content');
|
||||||
|
expect(response.body.data).toHaveProperty('metadata');
|
||||||
|
expect(response.body.data.metadata.pageStatusCode).toBe(403);
|
||||||
|
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden");
|
||||||
|
}, 60000); // 60 seconds
|
||||||
|
|
||||||
|
it.concurrent('should return a successful response for a scrape with 404 page', async () => {
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.post('/v0/scrape')
|
||||||
|
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set('Content-Type', 'application/json')
|
||||||
|
.send({ url: 'https://httpstat.us/404' });
|
||||||
|
await new Promise((r) => setTimeout(r, 5000));
|
||||||
|
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty('data');
|
expect(response.body).toHaveProperty('data');
|
||||||
expect(response.body.data).toHaveProperty('content');
|
expect(response.body.data).toHaveProperty('content');
|
||||||
expect(response.body.data).toHaveProperty('metadata');
|
expect(response.body.data).toHaveProperty('metadata');
|
||||||
expect(response.body.data.content).toContain('Mendable');
|
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(404);
|
expect(response.body.data.metadata.pageStatusCode).toBe(404);
|
||||||
expect(response.body.data.metadata.pageError).toBe("Not Found");
|
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found");
|
||||||
|
}, 60000); // 60 seconds
|
||||||
|
|
||||||
|
it.concurrent('should return a successful response for a scrape with 405 page', async () => {
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.post('/v0/scrape')
|
||||||
|
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set('Content-Type', 'application/json')
|
||||||
|
.send({ url: 'https://httpstat.us/405' });
|
||||||
|
await new Promise((r) => setTimeout(r, 5000));
|
||||||
|
|
||||||
|
expect(response.statusCode).toBe(200);
|
||||||
|
expect(response.body).toHaveProperty('data');
|
||||||
|
expect(response.body.data).toHaveProperty('content');
|
||||||
|
expect(response.body.data).toHaveProperty('metadata');
|
||||||
|
expect(response.body.data.metadata.pageStatusCode).toBe(405);
|
||||||
|
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("method not allowed");
|
||||||
|
}, 60000); // 60 seconds
|
||||||
|
|
||||||
|
it.concurrent('should return a successful response for a scrape with 500 page', async () => {
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.post('/v0/scrape')
|
||||||
|
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set('Content-Type', 'application/json')
|
||||||
|
.send({ url: 'https://httpstat.us/500' });
|
||||||
|
await new Promise((r) => setTimeout(r, 5000));
|
||||||
|
|
||||||
|
expect(response.statusCode).toBe(200);
|
||||||
|
expect(response.body).toHaveProperty('data');
|
||||||
|
expect(response.body.data).toHaveProperty('content');
|
||||||
|
expect(response.body.data).toHaveProperty('metadata');
|
||||||
|
expect(response.body.data.metadata.pageStatusCode).toBe(500);
|
||||||
|
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("internal server error");
|
||||||
}, 60000); // 60 seconds
|
}, 60000); // 60 seconds
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -61,7 +61,7 @@ export async function scrapeHelper(
|
|||||||
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
|
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
|
||||||
);
|
);
|
||||||
if (filteredDocs.length === 0) {
|
if (filteredDocs.length === 0) {
|
||||||
return { success: true, error: "No page found", returnCode: 200 };
|
return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
|
||||||
}
|
}
|
||||||
|
|
||||||
let creditsToBeBilled = filteredDocs.length;
|
let creditsToBeBilled = filteredDocs.length;
|
||||||
|
@ -100,7 +100,7 @@ export async function searchHelper(
|
|||||||
);
|
);
|
||||||
|
|
||||||
if (filteredDocs.length === 0) {
|
if (filteredDocs.length === 0) {
|
||||||
return { success: true, error: "No page found", returnCode: 200 };
|
return { success: true, error: "No page found", returnCode: 200, data: docs };
|
||||||
}
|
}
|
||||||
|
|
||||||
const billingResult = await billTeam(
|
const billingResult = await billTeam(
|
||||||
|
@ -83,7 +83,7 @@ export async function scrapWithFireEngine(
|
|||||||
console.error(
|
console.error(
|
||||||
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
|
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
|
||||||
);
|
);
|
||||||
return { html: "", screenshot: "" };
|
return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
|
||||||
}
|
}
|
||||||
|
|
||||||
const contentType = response.headers["content-type"];
|
const contentType = response.headers["content-type"];
|
||||||
@ -94,7 +94,7 @@ export async function scrapWithFireEngine(
|
|||||||
const data = response.data;
|
const data = response.data;
|
||||||
const html = data.content;
|
const html = data.content;
|
||||||
const screenshot = data.screenshot;
|
const screenshot = data.screenshot;
|
||||||
return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.error };
|
return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error.code === 'ECONNABORTED') {
|
if (error.code === 'ECONNABORTED') {
|
||||||
@ -142,7 +142,7 @@ export async function scrapWithScrapingBee(
|
|||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
|
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
|
||||||
return { content: "" };
|
return { content: "", pageStatusCode: error.response.status, pageError: error.response.statusText };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -172,7 +172,7 @@ export async function scrapWithPlaywright(
|
|||||||
console.error(
|
console.error(
|
||||||
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
|
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
|
||||||
);
|
);
|
||||||
return { content: "" };
|
return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
|
||||||
}
|
}
|
||||||
|
|
||||||
const contentType = response.headers["content-type"];
|
const contentType = response.headers["content-type"];
|
||||||
@ -412,8 +412,8 @@ export async function scrapSingleUrl(
|
|||||||
pageError: scraperResponse.metadata.pageError || undefined
|
pageError: scraperResponse.metadata.pageError || undefined
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
|
||||||
try {
|
try {
|
||||||
let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
|
|
||||||
let urlKey = urlToScrap;
|
let urlKey = urlToScrap;
|
||||||
try {
|
try {
|
||||||
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
|
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
|
||||||
@ -441,10 +441,16 @@ export async function scrapSingleUrl(
|
|||||||
text = attempt.text ?? '';
|
text = attempt.text ?? '';
|
||||||
html = attempt.html ?? '';
|
html = attempt.html ?? '';
|
||||||
screenshot = attempt.screenshot ?? '';
|
screenshot = attempt.screenshot ?? '';
|
||||||
pageStatusCode = attempt.pageStatusCode;
|
if (attempt.pageStatusCode) {
|
||||||
pageError = attempt.pageError;
|
pageStatusCode = attempt.pageStatusCode;
|
||||||
|
}
|
||||||
|
if (attempt.pageError) {
|
||||||
|
pageError = attempt.pageError;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
if (text && text.trim().length >= 100) break;
|
if (text && text.trim().length >= 100) break;
|
||||||
|
if (pageStatusCode && pageStatusCode == 404) break;
|
||||||
const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
|
const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
|
||||||
if (nextScraperIndex < scrapersInOrder.length) {
|
if (nextScraperIndex < scrapersInOrder.length) {
|
||||||
console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
|
console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
|
||||||
@ -493,7 +499,11 @@ export async function scrapSingleUrl(
|
|||||||
content: "",
|
content: "",
|
||||||
markdown: "",
|
markdown: "",
|
||||||
html: "",
|
html: "",
|
||||||
metadata: { sourceURL: urlToScrap },
|
metadata: {
|
||||||
|
sourceURL: urlToScrap,
|
||||||
|
pageStatusCode: pageStatusCode,
|
||||||
|
pageError: pageError
|
||||||
|
},
|
||||||
} as Document;
|
} as Document;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user