Merge pull request #153 from mendableai/test/crawl-options
[Tests] Added crawl test suite -> crawl improvements
This commit is contained in commit a31459092e
@@ -146,7 +146,274 @@ describe("E2E Tests for API Routes", () => {
      );
    });

+    // Additional tests for insufficient credits?
+
+    it("should return a successful response with a valid API key and valid includes option", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai",
+          limit: 10,
+          crawlerOptions: {
+            includes: ["blog/*"],
+          },
+        });
+
+      let response;
+      let isFinished = false;
+
+      while (!isFinished) {
+        response = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+        expect(response.statusCode).toBe(200);
+        expect(response.body).toHaveProperty("status");
+        isFinished = response.body.status === "completed";
+
+        if (!isFinished) {
+          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+        }
+      }
+
+      const completedResponse = response;
+
+      const urls = completedResponse.body.data.map(
+        (item: any) => item.metadata?.sourceURL
+      );
+      expect(urls.length).toBeGreaterThan(5);
+      urls.forEach((url: string) => {
+        console.log({url})
+        expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy();
+      });
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].content).toContain("Mendable");
+    }, 60000); // 60 seconds
+
+    it("should return a successful response with a valid API key and valid excludes option", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai",
+          limit: 10,
+          crawlerOptions: {
+            excludes: ["blog/*"],
+          },
+        });
+
+      let isFinished = false;
+      let response;
+
+      while (!isFinished) {
+        response = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+        expect(response.statusCode).toBe(200);
+        expect(response.body).toHaveProperty("status");
+        isFinished = response.body.status === "completed";
+
+        if (!isFinished) {
+          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+        }
+      }
+
+      const completedResponse = response;
+
+      const urls = completedResponse.body.data.map(
+        (item: any) => item.metadata?.sourceURL
+      );
+      expect(urls.length).toBeGreaterThan(5);
+      urls.forEach((url: string) => {
expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
+      });
+    }, 60000); // 60 seconds
+
+    it("should return a successful response with a valid API key and limit to 3", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai",
+          crawlerOptions: { limit: 3 },
+        });
+
+      let isFinished = false;
+      let response;
+
+      while (!isFinished) {
+        response = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+        expect(response.statusCode).toBe(200);
+        expect(response.body).toHaveProperty("status");
+        isFinished = response.body.status === "completed";
+
+        if (!isFinished) {
+          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+        }
+      }
+
+      const completedResponse = response;
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data.length).toBe(3);
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].content).toContain("Mendable");
+    }, 60000); // 60 seconds
+
+    it("should return a successful response with max depth option for a valid crawl job", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://www.scrapethissite.com",
+          crawlerOptions: { maxDepth: 2 },
+        });
+      expect(crawlResponse.statusCode).toBe(200);
+
+      const response = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("status");
+      expect(response.body.status).toBe("active");
+      // wait for 60 seconds
+      await new Promise((r) => setTimeout(r, 60000));
+      const completedResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      const urls = completedResponse.body.data.map(
+        (item: any) => item.metadata?.sourceURL
+      );
+      expect(urls.length).toBeGreaterThan(1);
+
+      // Check if all URLs have a maximum depth of 1
+      urls.forEach((url: string) => {
+        const depth = new URL(url).pathname.split("/").filter(Boolean).length;
+        expect(depth).toBeLessThanOrEqual(1);
+      });
+    }, 120000);
+
+    // it("should return a successful response with a valid API key and valid limit option", async () => {
+    //   const crawlResponse = await request(TEST_URL)
+    //     .post("/v0/crawl")
+    //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    //     .set("Content-Type", "application/json")
+    //     .send({
+    //       url: "https://mendable.ai",
+    //       crawlerOptions: { limit: 10 },
+    //     });
+
+    //   const response = await request(TEST_URL)
+    //     .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+    //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+    //   expect(response.statusCode).toBe(200);
+    //   expect(response.body).toHaveProperty("status");
+    //   expect(response.body.status).toBe("active");
+
+    //   let isCompleted = false;
+    //   while (!isCompleted) {
+    //     const statusCheckResponse = await request(TEST_URL)
+    //       .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+    //       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+    //     expect(statusCheckResponse.statusCode).toBe(200);
+    //     isCompleted = statusCheckResponse.body.status === "completed";
+    //     if (!isCompleted) {
+    //       await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+    //     }
+    //   }
+
+    //   const completedResponse = await request(TEST_URL)
+    //     .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+    //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+    //   expect(completedResponse.statusCode).toBe(200);
+    //   expect(completedResponse.body).toHaveProperty("status");
+    //   expect(completedResponse.body.status).toBe("completed");
+    //   expect(completedResponse.body).toHaveProperty("data");
+    //   expect(completedResponse.body.data.length).toBe(10);
+    //   expect(completedResponse.body.data[0]).toHaveProperty("content");
+    //   expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+    //   expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+    //   expect(completedResponse.body.data[0].content).toContain("Mendable");
+    //   expect(completedResponse.body.data[0].content).not.toContain("main menu");
+    // }, 60000); // 60 seconds
+
+    it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://firecrawl.dev",
+          pageOptions: { includeHtml: true },
+        });
+      expect(crawlResponse.statusCode).toBe(200);
+
+      const response = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("status");
+      expect(response.body.status).toBe("active");
+
+      let isCompleted = false;
+      while (!isCompleted) {
+        const statusCheckResponse = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+        expect(statusCheckResponse.statusCode).toBe(200);
+        isCompleted = statusCheckResponse.body.status === "completed";
+        if (!isCompleted) {
+          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+        }
+      }
+
+      const completedResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+
+      // 120 seconds
+      expect(completedResponse.body.data[0]).toHaveProperty("html");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
+      expect(completedResponse.body.data[0].markdown).toContain("FireCrawl");
+      expect(completedResponse.body.data[0].html).toContain("<h1");
+    }, 60000);
  });

  describe("POST /v0/crawlWebsitePreview", () => {
@@ -248,7 +515,7 @@ describe("E2E Tests for API Routes", () => {
      expect(response.statusCode).toBe(404);
    });

-    it("should return a successful response for a valid crawl job", async () => {
+    it("should return a successful crawl status response for a valid crawl job", async () => {
      const crawlResponse = await request(TEST_URL)
        .post("/v0/crawl")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@@ -256,20 +523,23 @@ describe("E2E Tests for API Routes", () => {
        .send({ url: "https://firecrawl.dev" });
      expect(crawlResponse.statusCode).toBe(200);

+      let isCompleted = false;
+      let completedResponse;
+
+      while (!isCompleted) {
        const response = await request(TEST_URL)
          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
        expect(response.statusCode).toBe(200);
        expect(response.body).toHaveProperty("status");
-      expect(response.body.status).toBe("active");
-
-      // wait for 30 seconds
-      await new Promise((r) => setTimeout(r, 30000));
-
-      const completedResponse = await request(TEST_URL)
-        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-      expect(completedResponse.statusCode).toBe(200);
+
+        if (response.body.status === "completed") {
+          isCompleted = true;
+          completedResponse = response;
+        } else {
+          await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
+        }
+      }
      expect(completedResponse.body).toHaveProperty("status");
      expect(completedResponse.body.status).toBe("completed");
      expect(completedResponse.body).toHaveProperty("data");
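The status-polling loop introduced in this hunk is repeated, with small variations, in each of the new crawl tests above. A minimal shared helper is sketched below; it relies only on supertest's request, TEST_URL, and the /v0/crawl/status route already used in this file, and the helper name itself is hypothetical (it is not part of this PR).

    // Sketch only: poll /v0/crawl/status until the job reports "completed".
    async function waitForCrawlCompletion(jobId: string, intervalMs = 1000) {
      while (true) {
        const statusResponse = await request(TEST_URL)
          .get(`/v0/crawl/status/${jobId}`)
          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
        expect(statusResponse.statusCode).toBe(200);
        if (statusResponse.body.status === "completed") {
          return statusResponse;
        }
        await new Promise((resolve) => setTimeout(resolve, intervalMs));
      }
    }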
@@ -278,90 +548,7 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
      expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
    }, 60000); // 60 seconds
-
-    it("should return a successful response with max depth option for a valid crawl job", async () => {
-      const crawlResponse = await request(TEST_URL)
-        .post("/v0/crawl")
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-        .set("Content-Type", "application/json")
-        .send({
-          url: "https://www.scrapethissite.com",
-          crawlerOptions: { maxDepth: 2 },
        });
-      expect(crawlResponse.statusCode).toBe(200);
-
-      const response = await request(TEST_URL)
-        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-      expect(response.statusCode).toBe(200);
-      expect(response.body).toHaveProperty("status");
-      expect(response.body.status).toBe("active");
-      // wait for 60 seconds
-      await new Promise((r) => setTimeout(r, 60000));
-      const completedResponse = await request(TEST_URL)
-        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-
-      expect(completedResponse.statusCode).toBe(200);
-      expect(completedResponse.body).toHaveProperty("status");
-      expect(completedResponse.body.status).toBe("completed");
-      expect(completedResponse.body).toHaveProperty("data");
-      expect(completedResponse.body.data[0]).toHaveProperty("content");
-      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
-      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-      const urls = completedResponse.body.data.map(
-        (item: any) => item.metadata?.sourceURL
-      );
-      expect(urls.length).toBeGreaterThan(1);
-
-      // Check if all URLs have a maximum depth of 1
-      urls.forEach((url) => {
-        const depth = new URL(url).pathname.split("/").filter(Boolean).length;
-        expect(depth).toBeLessThanOrEqual(1);
-      });
-    }, 120000);
-
-    it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
-      const crawlResponse = await request(TEST_URL)
-        .post("/v0/crawl")
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-        .set("Content-Type", "application/json")
-        .send({
-          url: "https://firecrawl.dev",
-          pageOptions: { includeHtml: true },
-        });
-      expect(crawlResponse.statusCode).toBe(200);
-
-      const response = await request(TEST_URL)
-        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-      expect(response.statusCode).toBe(200);
-      expect(response.body).toHaveProperty("status");
-      expect(response.body.status).toBe("active");
-
-      // wait for 30 seconds
-      await new Promise((r) => setTimeout(r, 30000));
-
-      const completedResponse = await request(TEST_URL)
-        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-
-      expect(completedResponse.statusCode).toBe(200);
-      expect(completedResponse.body).toHaveProperty("status");
-      expect(completedResponse.body.status).toBe("completed");
-      expect(completedResponse.body).toHaveProperty("data");
-      expect(completedResponse.body.data[0]).toHaveProperty("content");
-      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
-      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-
-      // 120 seconds
-      expect(completedResponse.body.data[0]).toHaveProperty("html");
-      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-      expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
-      expect(completedResponse.body.data[0].markdown).toContain("FireCrawl");
-      expect(completedResponse.body.data[0].html).toContain("<h1");
-    }, 60000);
-  }); // 60 seconds

    it("If someone cancels a crawl job, it should turn into failed status", async () => {
      const crawlResponse = await request(TEST_URL)
@@ -371,8 +558,6 @@ describe("E2E Tests for API Routes", () => {
        .send({ url: "https://jestjs.io" });
      expect(crawlResponse.statusCode).toBe(200);
-
-
      // wait for 30 seconds
      await new Promise((r) => setTimeout(r, 10000));
@@ -76,9 +76,22 @@ export class WebCrawler {

    // Check if the link matches the include patterns, if any are specified
    if (this.includes.length > 0 && this.includes[0] !== "") {
-      return this.includes.some((includePattern) =>
+      if (!this.includes.some((includePattern) =>
        new RegExp(includePattern).test(path)
-      );
+      )) {
+        return false;
+      }
+    }
+
+    // Normalize the initial URL and the link to account for www and non-www versions
+    const normalizedInitialUrl = new URL(this.initialUrl);
+    const normalizedLink = new URL(link);
+    const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, '');
+    const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
+
+    // Ensure the protocol and hostname match, and the path starts with the initial URL's path
+    if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
+      return false;
    }

    const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
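Worth noting about the include/exclude handling above: each pattern is passed directly to new RegExp(...), so a glob-looking value such as "blog/*" (the pattern used by the new e2e tests) behaves as an unanchored regular expression rather than a shell glob. A small sketch, with a hypothetical path value:

    // Sketch only: "blog/*" as a RegExp matches "blog" followed by zero or more "/".
    const includes = ["blog/*"];
    const path = "/blog/launch-week"; // hypothetical link path
    const included = includes.some((pattern) => new RegExp(pattern).test(path));
    // included === true; any path containing "blog" would match as well.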
@@ -105,11 +118,13 @@ export class WebCrawler {
      this.robots = robotsParser(this.robotsTxtUrl, response.data);
    } catch (error) {
      console.error(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
    }

    const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
    if (sitemapLinks.length > 0) {
-      const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
+      let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
      return filteredLinks.map(link => ({ url: link, html: "" }));
    }

@@ -125,6 +140,7 @@ export class WebCrawler {
      return [{ url: this.initialUrl, html: "" }];
    }
+
    // make sure to run include exclude here again
    const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
    return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
@@ -133,7 +149,7 @@ export class WebCrawler {
  private async crawlUrls(
    urls: string[],
    concurrencyLimit: number,
-    inProgress?: (progress: Progress) => void
+    inProgress?: (progress: Progress) => void,
  ): Promise<{ url: string, html: string }[]> {
    const queue = async.queue(async (task: string, callback) => {
      if (this.crawledUrls.size >= this.maxCrawledLinks) {
@@ -143,7 +159,20 @@ export class WebCrawler {
        return;
      }
      const newUrls = await this.crawl(task);
+      // add the initial url if not already added
+      // if (this.visited.size === 1) {
+      //   let normalizedInitial = this.initialUrl;
+      //   if (!normalizedInitial.endsWith("/")) {
+      //     normalizedInitial = normalizedInitial + "/";
+      //   }
+      //   if (!newUrls.some(page => page.url === this.initialUrl)) {
+      //     newUrls.push({ url: this.initialUrl, html: "" });
+      //   }
+      // }
+
+
      newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));

      if (inProgress && newUrls.length > 0) {
        inProgress({
          current: this.crawledUrls.size,
|
|||||||
}
|
}
|
||||||
|
|
||||||
async crawl(url: string): Promise<{url: string, html: string}[]> {
|
async crawl(url: string): Promise<{url: string, html: string}[]> {
|
||||||
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent"))
|
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
|
||||||
return [];
|
return [];
|
||||||
|
}
|
||||||
this.visited.add(url);
|
this.visited.add(url);
|
||||||
|
|
||||||
|
|
||||||
if (!url.startsWith("http")) {
|
if (!url.startsWith("http")) {
|
||||||
url = "https://" + url;
|
url = "https://" + url;
|
||||||
|
|
||||||
}
|
}
|
||||||
if (url.endsWith("/")) {
|
if (url.endsWith("/")) {
|
||||||
url = url.slice(0, -1);
|
url = url.slice(0, -1);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
|
if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
@@ -205,6 +240,13 @@ export class WebCrawler {
      const $ = load(content);
      let links: {url: string, html: string}[] = [];

+      // Add the initial URL to the list of links
+      if(this.visited.size === 1)
+      {
+        links.push({url, html: content});
+      }
+
+
      $("a").each((_, element) => {
        const href = $(element).attr("href");
        if (href) {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
if(this.visited.size === 1){
|
||||||
|
return links;
|
||||||
|
}
|
||||||
// Create a new list to return to avoid modifying the visited list
|
// Create a new list to return to avoid modifying the visited list
|
||||||
return links.filter((link) => !this.visited.has(link.url));
|
return links.filter((link) => !this.visited.has(link.url));
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -295,18 +340,57 @@ export class WebCrawler {
    return socialMediaOrEmail.some((ext) => url.includes(ext));
  }

+  //
  private async tryFetchSitemapLinks(url: string): Promise<string[]> {
+    const normalizeUrl = (url: string) => {
+      url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
+      if (url.endsWith("/")) {
+        url = url.slice(0, -1);
+      }
+      return url;
+    };
+
    const sitemapUrl = url.endsWith("/sitemap.xml")
      ? url
      : `${url}/sitemap.xml`;
+
+    let sitemapLinks: string[] = [];
+
    try {
      const response = await axios.get(sitemapUrl);
      if (response.status === 200) {
-        return await getLinksFromSitemap(sitemapUrl);
+        sitemapLinks = await getLinksFromSitemap(sitemapUrl);
      }
    } catch (error) {
      // Error handling for failed sitemap fetch
+      // console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
    }
-    return [];
+
+    if (sitemapLinks.length === 0) {
+      // If the first one doesn't work, try the base URL
+      const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
+      try {
+        const response = await axios.get(baseUrlSitemap);
+        if (response.status === 200) {
+          sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
+        }
+      } catch (error) {
+        // Error handling for failed base URL sitemap fetch
+        // console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+      }
+    }
+
+    // Normalize and check if the URL is present in any of the sitemaps
+    const normalizedUrl = normalizeUrl(url);
+
+    const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
+
+    // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
+    if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {
+      // do not push the normalized url
+      sitemapLinks.push(url);
+    }
+
+    return sitemapLinks;
  }
}
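For reference, the normalizeUrl helper added inside tryFetchSitemapLinks strips the scheme, a leading "www.", and a trailing slash before the membership check. Illustrative inputs and outputs (sketch only; the helper is local to that method, so these calls are not executable elsewhere):

    // normalizeUrl("https://www.mendable.ai/blog/")  -> "mendable.ai/blog"
    // normalizeUrl("http://mendable.ai")             -> "mendable.ai"
    // normalizeUrl("https://firecrawl.dev/pricing")  -> "firecrawl.dev/pricing"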
@@ -59,7 +59,11 @@ export class WebScraperDataProvider {
    await Promise.all(
      batchUrls.map(async (url, index) => {
        const existingHTML = allHtmls ? allHtmls[i + index] : "";
-        const result = await scrapSingleUrl(url, this.pageOptions, existingHTML);
+        const result = await scrapSingleUrl(
+          url,
+          this.pageOptions,
+          existingHTML
+        );
        processedUrls++;
        if (inProgress) {
          inProgress({
@@ -130,9 +134,30 @@ export class WebScraperDataProvider {
      }
    }
  }

+  private async cleanIrrelevantPath(links: string[]) {
+    return links.filter((link) => {
+      const normalizedInitialUrl = new URL(this.urls[0]);
+      const normalizedLink = new URL(link);
+
+      // Normalize the hostname to account for www and non-www versions
+      const initialHostname = normalizedInitialUrl.hostname.replace(
+        /^www\./,
+        ""
+      );
+      const linkHostname = normalizedLink.hostname.replace(/^www\./, "");
+
+      // Ensure the protocol and hostname match, and the path starts with the initial URL's path
+      return (
+        linkHostname === initialHostname &&
+        normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
+      );
+    });
+  }
+
  private async handleCrawlMode(
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {

    const crawler = new WebCrawler({
      initialUrl: this.urls[0],
      includes: this.includes,
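The effect of the new cleanIrrelevantPath filter, assuming this.urls[0] were "https://mendable.ai/blog" (hypothetical values, not taken from this PR):

    // kept:    "https://www.mendable.ai/blog/launch-week"  ("www." is normalized away, path starts with /blog)
    // dropped: "https://mendable.ai/pricing"               (path does not start with /blog)
    // dropped: "https://docs.mendable.ai/blog"             (different hostname)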
@@ -143,21 +168,25 @@ export class WebScraperDataProvider {
      generateImgAltText: this.generateImgAltText,
    });

-    let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
+    let links = await crawler.start(
+      inProgress,
+      5,
+      this.limit,
+      this.maxCrawledDepth
+    );
+
-    const allLinks = links.map((e) => e.url);
-    const allHtmls = links.map((e)=> e.html);
+    let allLinks = links.map((e) => e.url);
+    const allHtmls = links.map((e) => e.html);

    if (this.returnOnlyUrls) {
-      return this.returnOnlyUrlsResponse(allLinks , inProgress);
+      return this.returnOnlyUrlsResponse(allLinks, inProgress);
    }

    let documents = [];
    // check if fast mode is enabled and there is html inside the links
    if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
-      console.log("Fast mode enabled");
      documents = await this.processLinks(allLinks, inProgress, allHtmls);
-    }else{
+    } else {
      documents = await this.processLinks(allLinks, inProgress);
    }

@@ -175,6 +204,8 @@ export class WebScraperDataProvider {
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
    let links = await getLinksFromSitemap(this.urls[0]);
+    links = await this.cleanIrrelevantPath(links);
+
    if (this.returnOnlyUrls) {
      return this.returnOnlyUrlsResponse(links, inProgress);
    }
@@ -210,10 +241,13 @@ export class WebScraperDataProvider {
    let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
    links = links.filter((link) => !link.endsWith(".pdf"));

-    let documents = await this.convertUrlsToDocuments(links, inProgress, allHtmls);
+    let documents = await this.convertUrlsToDocuments(
+      links,
+      inProgress,
+      allHtmls
+    );
    documents = await this.getSitemapData(this.urls[0], documents);

    documents = this.applyPathReplacements(documents);
    // documents = await this.applyImgAltText(documents);

@@ -43,7 +43,7 @@ export const crawlStatusRateLimiter = new RateLimiterRedis({
export const testSuiteRateLimiter = new RateLimiterRedis({
  storeClient: redisClient,
  keyPrefix: "middleware",
-  points: 1000,
+  points: 100000,
  duration: 60, // Duration in seconds
});

@@ -48,7 +48,7 @@ class FirecrawlApp:
                return response['data']
            else:
                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
-        elif response.status_code in [402, 409, 500]:
+        elif response.status_code in [402, 408, 409, 500]:
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
        else:
@@ -148,7 +148,7 @@ class FirecrawlApp:
                self._handle_error(status_response, 'check crawl status')

    def _handle_error(self, response, action):
-        if response.status_code in [402, 409, 500]:
+        if response.status_code in [402, 408, 409, 500]:
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
        else:
apps/test-suite/data/crawl.json (new file, 178 lines)
@@ -0,0 +1,178 @@
[
  {
    "website": "https://www.vellum.ai/llm-leaderboard",
    "expected_min_num_of_pages": 1,
    "expected_crawled_pages": ["https://www.vellum.ai/llm-leaderboard"]
  },
  {
    "website": "https://openai.com/news",
    "expected_min_num_of_pages": 4,
    "expected_crawled_pages": [
      "https://openai.com/news/company/",
      "https://openai.com/news/research/",
      "https://openai.com/news/safety-and-alignment/",
      "https://openai.com/news/stories/"
    ]
  },
  {
    "website": "https://www.framer.com/pricing",
    "expected_min_num_of_pages": 1,
    "expected_not_crawled_pages": [
      "https://www.framer.com/features/navigation/",
      "https://www.framer.com/contact/",
      "https://www.framer.com/add-ons/",
      "https://www.framer.com/free-saas-ui-kit/",
      "https://www.framer.com/help/",
      "https://www.framer.com/features/effects/",
      "https://www.framer.com/enterprise/",
      "https://www.framer.com/templates/"
    ]
  },
  {
    "website": "https://mendable.ai/pricing",
    "expected_min_num_of_pages": 1,
    "expected_not_crawled_pages": [
      "https://mendable.ai/",
      "https://mendable.ai/blog",
      "https://mendable.ai/signin",
      "https://mendable.ai/signup",
      "https://mendable.ai",
      "https://mendable.ai/usecases/sales-enablement",
      "https://mendable.ai/usecases/documentation",
      "https://mendable.ai/usecases/cs-enablement",
      "https://mendable.ai/usecases/productcopilot",
      "https://mendable.ai/security"
    ],
    "notes": "This one should not go backwards, but it does!"
  },
  {
    "website": "https://agentops.ai/blog",
    "expected_min_num_of_pages": 6,
    "expected_crawled_pages": [
      "https://www.agentops.ai/blog/effortless-hr-management-with-saas",
      "https://www.agentops.ai/blog/streamlining-hr-with-saas",
      "https://www.agentops.ai/blog/simplify-hr-with-modern-saas-solutions",
      "https://www.agentops.ai/blog/efficient-hr-operations-with-saas",
      "https://www.agentops.ai/blog/hr-made-simple-with-saas",
      "https://agentops.ai/blog"
    ],
    "expected_not_crawled_pages": [
      "https://agentops.ai/about-us",
      "https://agentops.ai/contact-us"
    ]
  },
  {
    "website": "https://en.wikipedia.org/wiki/T._N._Seshan",
    "expected_min_num_of_pages": 1,
    "expected_not_crawled_pages": [
      "https://en.wikipedia.org/wiki/Wikipedia:Contents",
      "https://en.wikipedia.org/wiki/Wikipedia:Contact_us",
      "https://en.wikipedia.org/wiki/V._S._Ramadevi",
      "https://en.wikipedia.org/wiki/Wikipedia:About",
      "https://en.wikipedia.org/wiki/Help:Introduction",
      "https://en.wikipedia.org/wiki/H._D._Deve_Gowda",
      "https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg"
    ]
  },
  {
    "website": "https://ycombinator.com/companies",
    "expected_min_num_of_pages": 20,
    "expected_crawled_pages": [
      "https://www.ycombinator.com/companies/industry/elearning",
      "https://www.ycombinator.com/companies/industry/computer-vision",
      "https://www.ycombinator.com/companies/industry/health-tech",
      "https://www.ycombinator.com/companies/industry/education",
      "https://www.ycombinator.com/companies/industry/robotics",
      "https://www.ycombinator.com/companies/industry/hardware",
      "https://www.ycombinator.com/companies/industry/saas",
      "https://www.ycombinator.com/companies/industry/hard-tech",
      "https://www.ycombinator.com/companies/industry/developer-tools",
      "https://www.ycombinator.com/companies/industry/entertainment",
      "https://www.ycombinator.com/companies/industry/finance",
      "https://www.ycombinator.com/companies/industry/generative-ai",
      "https://www.ycombinator.com/companies/industry/machine-learning"
    ]
  },
  {
    "website": "https://firecrawl.dev",
    "expected_min_num_of_pages": 2,
    "expected_crawled_pages": [
      "https://firecrawl.dev/",
      "https://firecrawl.dev/pricing"
    ]
  },
  {
    "website": "https://fly.io/docs/gpus/gpu-quickstart",
    "expected_min_num_of_pages": 1,
    "expected_not_crawled_pages": [
      "https://fly.io/docs/getting-started/",
      "https://fly.io/docs/hands-on/",
      "https://fly.io/docs/about/support/",
      "https://fly.io/docs/blueprints/going-to-production-with-healthcare-apps/",
      "https://fly.io/docs/machines/flyctl/fly-machine-update/",
      "https://fly.io/docs/blueprints/review-apps-guide/",
      "https://fly.io/docs/blueprints/supercronic/"
    ],
    "notes": "This one should not go backwards, but it does!"
  },
  {
    "website": "https://www.instructables.com/circuits",
    "expected_min_num_of_pages": 12,
    "expected_crawled_pages": [
      "https://www.instructables.com/circuits/",
      "https://www.instructables.com/circuits/apple/projects/",
      "https://www.instructables.com/circuits/art/projects/",
      "https://www.instructables.com/circuits/electronics/projects/",
      "https://www.instructables.com/circuits/microsoft/projects/",
      "https://www.instructables.com/circuits/microcontrollers/projects/",
      "https://www.instructables.com/circuits/community/",
      "https://www.instructables.com/circuits/leds/projects/",
      "https://www.instructables.com/circuits/gadgets/projects/",
      "https://www.instructables.com/circuits/arduino/projects/",
      "https://www.instructables.com/circuits/lasers/projects/",
      "https://www.instructables.com/circuits/clocks/projects/"
    ]
  },
  {
    "website": "https://richmondconfidential.org",
    "expected_min_num_of_pages": 20,
    "expected_crawled_pages": [
      "https://richmondconfidential.org/2009/10/13/salesians-star-guard-has-a-big-impact/",
      "https://richmondconfidential.org/2009/10/13/on-team-of-beginners-oilers-old-hand-stands-out/",
      "https://richmondconfidential.org/2009/10/19/point-richmond-clockmaker-turns-clutter-into-crafts/",
      "https://richmondconfidential.org/2009/10/13/profile-maurice-cathy/",
      "https://richmondconfidential.org/2009/10/13/soul-food-rescue-mission-rebuilds-diets-and-lives/",
      "https://richmondconfidential.org/2009/10/21/in-tough-economy-pain-trickles-to-the-bottom/",
      "https://richmondconfidential.org/2009/10/19/richmond-homicide-map/",
      "https://richmondconfidential.org/2009/10/13/rough-roads-for-richmonds-cab-drivers/",
      "https://richmondconfidential.org/2009/10/13/before-napa-there-was-winehaven/",
      "https://richmondconfidential.org/2009/10/13/family-calls-for-end-to-violence-at-memorial-for-slain-woman-friend/"
    ]
  },
  {
    "website": "https://www.boardgamegeek.com",
    "expected_min_num_of_pages": 15,
    "expected_crawled_pages": [
      "https://www.boardgamegeek.com/browse/boardgameartist",
      "https://www.boardgamegeek.com/browse/boardgamehonor",
      "https://www.boardgamegeek.com/browse/boardgamepublisher",
      "https://www.boardgamegeek.com/browse/boardgamepodcast",
      "https://www.boardgamegeek.com/wiki/page/Index",
      "https://www.boardgamegeek.com/browse/boardgamecategory",
      "https://www.boardgamegeek.com/boardgame/random",
      "https://www.boardgamegeek.com/browse/boardgamemechanic",
      "https://www.boardgamegeek.com/forums",
      "https://www.boardgamegeek.com/gonecardboard",
      "https://www.boardgamegeek.com/browse/boardgameaccessory",
      "https://www.boardgamegeek.com/browse/boardgamedesigner",
      "https://www.boardgamegeek.com/",
      "https://www.boardgamegeek.com/previews",
      "https://www.boardgamegeek.com/browse/boardgame"
    ]
  }
]
@@ -3,7 +3,9 @@
  "version": "1.0.0",
  "description": "",
  "scripts": {
-    "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false"
+    "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false",
+    "test:scrape": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/scrape.test.ts",
+    "test:crawl": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/crawl.test.ts"
  },
  "author": "",
  "license": "ISC",
apps/test-suite/tests/crawl.test.ts (new file, 150 lines)
@@ -0,0 +1,150 @@
import request from "supertest";
import dotenv from "dotenv";
import { WebsiteScrapeError } from "../utils/types";
import { logErrors } from "../utils/log";

import websitesData from "../data/crawl.json";
import "dotenv/config";

import fs from 'fs';
dotenv.config();

interface WebsiteData {
  website: string;
  expected_min_num_of_pages: number;
  expected_crawled_pages: string[];
}

const TEST_URL = "http://127.0.0.1:3002";

describe("Crawling Checkup (E2E)", () => {
  beforeAll(() => {
    if (!process.env.TEST_API_KEY) {
      throw new Error("TEST_API_KEY is not set");
    }
  });

  describe("Crawling website tests with a dataset", () => {
    it("Should crawl the website and verify the response", async () => {
      let passedTests = 0;
      const startTime = new Date().getTime();
      const date = new Date();
      const logsDir = `logs/${date.getMonth() + 1}-${date.getDate()}-${date.getFullYear()}`;

      let errorLogFileName = `${logsDir}/run.log_${new Date().toTimeString().split(' ')[0]}`;
      const errorLog: WebsiteScrapeError[] = [];

      for (const websiteData of websitesData) {
        try {
          const crawlResponse = await request(TEST_URL || "")
            .post("/v0/crawl")
            .set("Content-Type", "application/json")
            .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
            .send({ url: websiteData.website, pageOptions: { onlyMainContent: true }, crawlerOptions: { limit: 100, returnOnlyUrls: true }});

          const jobId = crawlResponse.body.jobId;
          let completedResponse: any;
          let isFinished = false;

          while (!isFinished) {
            completedResponse = await request(TEST_URL)
              .get(`/v0/crawl/status/${jobId}`)
              .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

            isFinished = completedResponse.body.status === "completed";

            if (!isFinished) {
              await new Promise(resolve => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
            }
          }

          if(!completedResponse) {
            // fail the test
            console.log('No response');
            continue;
            // continue;
          }

          if (!completedResponse.body || completedResponse.body.status !== "completed") {
            errorLog.push({
              website: websiteData.website,
              prompt: 'CRAWL',
              expected_output: 'SUCCESS',
              actual_output: 'FAILURE',
              error: `Crawl job did not complete successfully.`
            });
            continue;
          }

          // check how many webpages were crawled successfully
          // compares with expected_num_of_pages
          if (completedResponse.body.data.length < websiteData.expected_min_num_of_pages) {
            errorLog.push({
              website: websiteData.website,
              prompt: 'CRAWL',
              expected_output: `SUCCESS: ${websiteData.expected_min_num_of_pages}`,
              actual_output: `FAILURE: ${completedResponse.body.data.length}`,
              error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}`
            });
            console.log('Error: ', errorLog);
            continue;
          }

          // checks if crawled pages contain expected_crawled_pages
          if (websiteData.expected_crawled_pages && websiteData.expected_crawled_pages.length > 0 && websiteData.expected_crawled_pages.some(page => !completedResponse.body.data?.some((d: { url: string }) => d.url === page))) {
            errorLog.push({
              website: websiteData.website,
              prompt: 'CRAWL',
              expected_output: `SUCCESS: ${websiteData.expected_crawled_pages}`,
              actual_output: `FAILURE: ${completedResponse.body.data}`,
              error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data}`
            });
            console.log('Error: ', errorLog);
            continue;
          }

          // checks if crawled pages not contain expected_not_crawled_pages
          if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && completedResponse.body.data && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) {
            errorLog.push({
              website: websiteData.website,
              prompt: 'CRAWL',
              expected_output: `SUCCESS: ${websiteData.expected_not_crawled_pages}`,
              actual_output: `FAILURE: ${completedResponse.body.data}`,
              error: `Expected crawled pages to not contain ${websiteData.expected_not_crawled_pages}, but got ${completedResponse.body.data}`
            });
            console.log('Error: ', errorLog);
            continue;
          }

          passedTests++;
        } catch (error) {
          console.error(`Error processing ${websiteData.website}: ${error}`);
          errorLog.push({
            website: websiteData.website,
            prompt: 'CRAWL',
            expected_output: 'SUCCESS',
            actual_output: 'FAILURE',
            error: `Error processing ${websiteData.website}: ${error}`
          });
          continue;
        }
      }

      const score = (passedTests / websitesData.length) * 100;
      const endTime = new Date().getTime();
      const timeTaken = (endTime - startTime) / 1000;
      console.log(`Score: ${score}%`);

      await logErrors(errorLog, timeTaken, 0, score, websitesData.length);

      if (process.env.ENV === "local" && errorLog.length > 0) {
        if (!fs.existsSync(logsDir)){
          fs.mkdirSync(logsDir, { recursive: true });
        }
        fs.writeFileSync(errorLogFileName, JSON.stringify(errorLog, null, 2));
      }

      expect(score).toBeGreaterThanOrEqual(90);
    }, 350000); // 350 second timeout
  });
});
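One detail of the new suite: entries in crawl.json also carry expected_not_crawled_pages and notes fields that the WebsiteData interface declared above does not list (the loop reads them straight off the imported JSON). A fuller interface, as a sketch only and not part of this PR, could look like:

    interface WebsiteData {
      website: string;
      expected_min_num_of_pages: number;
      expected_crawled_pages?: string[];
      expected_not_crawled_pages?: string[];
      notes?: string;
    }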
@@ -1,16 +1,14 @@
import request from "supertest";
import dotenv from "dotenv";
-import Anthropic from "@anthropic-ai/sdk";
-import { numTokensFromString } from "./utils/tokens";
+import { numTokensFromString } from "../utils/tokens";
import OpenAI from "openai";
-import { WebsiteScrapeError } from "./utils/types";
-import { logErrors } from "./utils/log";
+import { WebsiteScrapeError } from "../utils/types";
+import { logErrors } from "../utils/log";

-const websitesData = require("./data/websites.json");
+import websitesData from "../data/scrape.json";
import "dotenv/config";

-const fs = require('fs');
+import fs from 'fs';

dotenv.config();

interface WebsiteData {
|
|||||||
|
|
||||||
const TEST_URL = "http://127.0.0.1:3002";
|
const TEST_URL = "http://127.0.0.1:3002";
|
||||||
|
|
||||||
|
describe("Scraping Checkup (E2E)", () => {
|
||||||
describe("Scraping/Crawling Checkup (E2E)", () => {
|
|
||||||
beforeAll(() => {
|
beforeAll(() => {
|
||||||
if (!process.env.TEST_API_KEY) {
|
if (!process.env.TEST_API_KEY) {
|
||||||
throw new Error("TEST_API_KEY is not set");
|
throw new Error("TEST_API_KEY is not set");
|
||||||
@ -72,10 +69,6 @@ describe("Scraping/Crawling Checkup (E2E)", () => {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
const anthropic = new Anthropic({
|
|
||||||
apiKey: process.env.ANTHROPIC_API_KEY,
|
|
||||||
});
|
|
||||||
|
|
||||||
const openai = new OpenAI({
|
const openai = new OpenAI({
|
||||||
apiKey: process.env.OPENAI_API_KEY,
|
apiKey: process.env.OPENAI_API_KEY,
|
||||||
});
|
});
|
@@ -39,7 +39,7 @@
    // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
    // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
    // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
-    // "resolveJsonModule": true, /* Enable importing .json files. */
+    "resolveJsonModule": true, /* Enable importing .json files. */
    // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
    // "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */