added crawl test suite
This commit is contained in:
parent
e26008a833
commit
4925ee59f6
@ -146,7 +146,241 @@ describe("E2E Tests for API Routes", () => {
|
||||
);
|
||||
});
|
||||
|
||||
// Additional tests for insufficient credits?
|
||||
// Verifies that `crawlerOptions.includes` restricts a crawl to matching paths:
// starts a crawl of mendable.ai restricted to "/blog/*", checks the job is
// reported as active, waits a fixed 30 s, then checks that every returned page
// has a sourceURL under https://mendable.ai/blog/.
it("should return a successful response with a valid API key and valid includes option", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://mendable.ai",
      limit: 10,
      crawlerOptions: {
        includes: ["/blog/*"],
      },
    });
  // Fail here with a readable assertion instead of later polling
  // `/v0/crawl/status/undefined`.
  expect(crawlResponse.statusCode).toBe(200);
  expect(crawlResponse.body).toHaveProperty("jobId");

  const response = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("status");
  expect(response.body.status).toBe("active");

  // wait for 30 seconds
  await new Promise((r) => setTimeout(r, 30000));

  const completedResponse = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

  // Assert completion before touching `data`, so an unfinished or failed crawl
  // is reported as a status mismatch rather than a TypeError on `data.map`.
  expect(completedResponse.statusCode).toBe(200);
  expect(completedResponse.body).toHaveProperty("status");
  expect(completedResponse.body.status).toBe("completed");
  expect(completedResponse.body).toHaveProperty("data");
  expect(completedResponse.body.data[0]).toHaveProperty("content");
  expect(completedResponse.body.data[0]).toHaveProperty("markdown");
  expect(completedResponse.body.data[0]).toHaveProperty("metadata");
  expect(completedResponse.body.data[0].content).toContain("Mendable");

  const urls = completedResponse.body.data.map(
    (item: any) => item.metadata?.sourceURL
  );
  expect(urls.length).toBeGreaterThan(5);
  // Every crawled page must match the include pattern.
  urls.forEach((url: string) => {
    expect(url.startsWith("https://mendable.ai/blog/")).toBeTruthy();
  });
}, 60000); // 60 seconds
|
||||
|
||||
// Verifies that `crawlerOptions.excludes` keeps matching paths out of a crawl:
// starts a crawl of mendable.ai excluding "/blog/*", checks the job is reported
// as active, waits a fixed 30 s, then checks that no returned page has a
// sourceURL under https://mendable.ai/blog/.
it("should return a successful response with a valid API key and valid excludes option", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://mendable.ai",
      limit: 10,
      crawlerOptions: {
        excludes: ["/blog/*"],
      },
    });
  // Fail here with a readable assertion instead of later polling
  // `/v0/crawl/status/undefined`.
  expect(crawlResponse.statusCode).toBe(200);
  expect(crawlResponse.body).toHaveProperty("jobId");

  const response = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("status");
  expect(response.body.status).toBe("active");

  // wait for 30 seconds
  await new Promise((r) => setTimeout(r, 30000));

  const completedResponse = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

  // Assert completion before touching `data`, so an unfinished or failed crawl
  // is reported as a status mismatch rather than a TypeError on `data.map`.
  expect(completedResponse.statusCode).toBe(200);
  expect(completedResponse.body).toHaveProperty("status");
  expect(completedResponse.body.status).toBe("completed");
  expect(completedResponse.body).toHaveProperty("data");

  const urls = completedResponse.body.data.map(
    (item: any) => item.metadata?.sourceURL
  );
  expect(urls.length).toBeGreaterThan(5);
  // No crawled page may match the exclude pattern.
  urls.forEach((url: string) => {
    expect(url.startsWith("https://mendable.ai/blog/")).toBeFalsy();
  });
}, 60000); // 60 seconds
|
||||
|
||||
// Verifies that the `limit` option caps the number of crawled pages: starts a
// crawl of mendable.ai with `limit: 3`, checks the job is reported as active,
// waits a fixed 30 s, then expects exactly 3 pages in the completed result.
// The title previously duplicated the excludes test's title, which made the two
// indistinguishable in Jest reports and `-t` filters.
it("should return a successful response with a valid API key and valid limit option", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://mendable.ai",
      // NOTE(review): `limit` is sent at the top level of the body here —
      // confirm the API does not expect it under `crawlerOptions`.
      limit: 3,
    });
  // Fail here with a readable assertion instead of later polling
  // `/v0/crawl/status/undefined`.
  expect(crawlResponse.statusCode).toBe(200);
  expect(crawlResponse.body).toHaveProperty("jobId");

  const response = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("status");
  expect(response.body.status).toBe("active");

  // wait for 30 seconds
  await new Promise((r) => setTimeout(r, 30000));

  const completedResponse = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

  expect(completedResponse.statusCode).toBe(200);
  expect(completedResponse.body).toHaveProperty("status");
  expect(completedResponse.body.status).toBe("completed");
  expect(completedResponse.body).toHaveProperty("data");
  // The crawl must stop at exactly the requested number of pages.
  expect(completedResponse.body.data.length).toBe(3);
  expect(completedResponse.body.data[0]).toHaveProperty("content");
  expect(completedResponse.body.data[0]).toHaveProperty("markdown");
  expect(completedResponse.body.data[0]).toHaveProperty("metadata");
  expect(completedResponse.body.data[0].content).toContain("Mendable");
}, 60000); // 60 seconds
|
||||
|
||||
// Starts a crawl of scrapethissite.com with `crawlerOptions.maxDepth: 2`,
// checks the job is reported as active, waits a fixed 60 s, then checks the
// completed payload and the path depth of every returned sourceURL.
it("should return a successful response with max depth option for a valid crawl job", async () => {
  const bearer = `Bearer ${process.env.TEST_API_KEY}`;
  const pause = (ms: number) => new Promise((done) => setTimeout(done, ms));

  const started = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", bearer)
    .set("Content-Type", "application/json")
    .send({
      url: "https://www.scrapethissite.com",
      crawlerOptions: { maxDepth: 2 },
    });
  expect(started.statusCode).toBe(200);

  const statusPath = `/v0/crawl/status/${started.body.jobId}`;

  // Immediately after submission the job should still be running.
  const early = await request(TEST_URL)
    .get(statusPath)
    .set("Authorization", bearer);
  expect(early.statusCode).toBe(200);
  expect(early.body).toHaveProperty("status");
  expect(early.body.status).toBe("active");

  // Give the crawler 60 seconds to finish.
  await pause(60000);

  const finished = await request(TEST_URL)
    .get(statusPath)
    .set("Authorization", bearer);

  expect(finished.statusCode).toBe(200);
  expect(finished.body).toHaveProperty("status");
  expect(finished.body.status).toBe("completed");
  expect(finished.body).toHaveProperty("data");
  const firstPage = finished.body.data[0];
  for (const field of ["content", "markdown", "metadata"]) {
    expect(firstPage).toHaveProperty(field);
  }

  const crawledUrls = finished.body.data.map(
    (page: any) => page.metadata?.sourceURL
  );
  expect(crawledUrls.length).toBeGreaterThan(1);

  // Every crawled URL may have at most one non-empty path segment.
  // NOTE(review): the request asks for maxDepth: 2 while this asserts at most
  // 1 segment — confirm how the crawler counts depth.
  for (const crawledUrl of crawledUrls) {
    const segments = new URL(crawledUrl).pathname.split("/").filter(Boolean);
    expect(segments.length).toBeLessThanOrEqual(1);
  }
}, 120000);
|
||||
|
||||
// Verifies the `onlyMainContent` page option on a crawl: starts a crawl of
// mendable.ai with `pageOptions.onlyMainContent: true`, checks the job is
// reported as active, waits a fixed 30 s, then checks the first page's content
// contains "Mendable" but not the "main menu" navigation text.
it("should return a successful response with a valid API key and valid onlyMainContent option", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://mendable.ai",
      limit: 10,
      // The option under test; the request previously never sent it, so the
      // test only exercised the server's default behaviour.
      pageOptions: { onlyMainContent: true },
    });
  // Fail here with a readable assertion instead of later polling
  // `/v0/crawl/status/undefined`.
  expect(crawlResponse.statusCode).toBe(200);
  expect(crawlResponse.body).toHaveProperty("jobId");

  const response = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("status");
  expect(response.body.status).toBe("active");

  // wait for 30 seconds
  await new Promise((r) => setTimeout(r, 30000));

  const completedResponse = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

  expect(completedResponse.statusCode).toBe(200);
  expect(completedResponse.body).toHaveProperty("status");
  expect(completedResponse.body.status).toBe("completed");
  expect(completedResponse.body).toHaveProperty("data");
  // The exact-count check (`toBe(3)`) was copied from the `limit: 3` test and
  // contradicted the `limit: 10` sent above; only require a non-empty result.
  expect(completedResponse.body.data.length).toBeGreaterThan(0);
  expect(completedResponse.body.data[0]).toHaveProperty("content");
  expect(completedResponse.body.data[0]).toHaveProperty("markdown");
  expect(completedResponse.body.data[0]).toHaveProperty("metadata");
  expect(completedResponse.body.data[0].content).toContain("Mendable");
  // Navigation chrome must be stripped when only the main content is kept.
  expect(completedResponse.body.data[0].content).not.toContain("main menu");
}, 60000); // 60 seconds
|
||||
|
||||
// Verifies the `includeHtml` page option on a crawl: starts a crawl of
// firecrawl.dev with `pageOptions.includeHtml: true`, checks the job is
// reported as active, waits a fixed 30 s, then checks the first page carries
// `content`, `markdown`, `metadata` and an `html` field containing an <h1 tag.
it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://firecrawl.dev",
      pageOptions: { includeHtml: true },
    });
  expect(crawlResponse.statusCode).toBe(200);
  // Guard against polling `/v0/crawl/status/undefined`.
  expect(crawlResponse.body).toHaveProperty("jobId");

  const response = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("status");
  expect(response.body.status).toBe("active");

  // wait for 30 seconds
  await new Promise((r) => setTimeout(r, 30000));

  const completedResponse = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

  expect(completedResponse.statusCode).toBe(200);
  expect(completedResponse.body).toHaveProperty("status");
  expect(completedResponse.body.status).toBe("completed");
  expect(completedResponse.body).toHaveProperty("data");
  expect(completedResponse.body.data[0]).toHaveProperty("content");
  expect(completedResponse.body.data[0]).toHaveProperty("markdown");
  expect(completedResponse.body.data[0]).toHaveProperty("metadata");

  // The raw HTML must be returned alongside the extracted text.
  expect(completedResponse.body.data[0]).toHaveProperty("html");
  expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
  expect(completedResponse.body.data[0].markdown).toContain("FireCrawl");
  expect(completedResponse.body.data[0].html).toContain("<h1");
}, 60000); // 60 seconds
|
||||
});
|
||||
|
||||
describe("POST /v0/crawlWebsitePreview", () => {
|
||||
@ -248,7 +482,7 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(response.statusCode).toBe(404);
|
||||
});
|
||||
|
||||
it("should return a successful response for a valid crawl job", async () => {
|
||||
it("should return a successful crawl status response for a valid crawl job", async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
@ -278,90 +512,7 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
it("should return a successful response with max depth option for a valid crawl job", async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://www.scrapethissite.com",
|
||||
crawlerOptions: { maxDepth: 2 },
|
||||
});
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
expect(response.body.status).toBe("active");
|
||||
// wait for 60 seconds
|
||||
await new Promise((r) => setTimeout(r, 60000));
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
expect(urls.length).toBeGreaterThan(1);
|
||||
|
||||
// Check if all URLs have a maximum depth of 1
|
||||
urls.forEach((url) => {
|
||||
const depth = new URL(url).pathname.split("/").filter(Boolean).length;
|
||||
expect(depth).toBeLessThanOrEqual(1);
|
||||
});
|
||||
}, 120000);
|
||||
|
||||
it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://firecrawl.dev",
|
||||
pageOptions: { includeHtml: true },
|
||||
});
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
expect(response.body.status).toBe("active");
|
||||
|
||||
// wait for 30 seconds
|
||||
await new Promise((r) => setTimeout(r, 30000));
|
||||
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
|
||||
// 120 seconds
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("html");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
|
||||
expect(completedResponse.body.data[0].markdown).toContain("FireCrawl");
|
||||
expect(completedResponse.body.data[0].html).toContain("<h1");
|
||||
}, 60000);
|
||||
}); // 60 seconds
|
||||
});
|
||||
|
||||
it("If someone cancels a crawl job, it should turn into failed status", async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
@ -371,8 +522,6 @@ describe("E2E Tests for API Routes", () => {
|
||||
.send({ url: "https://jestjs.io" });
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
|
||||
|
||||
// wait for 10 seconds
|
||||
await new Promise((r) => setTimeout(r, 10000));
|
||||
|
||||
|
226
apps/test-suite/data/crawl.json
Normal file
226
apps/test-suite/data/crawl.json
Normal file
@ -0,0 +1,226 @@
|
||||
[
|
||||
{
|
||||
"website": "https://www.anthropic.com/claude",
|
||||
"expected_min_num_of_pages": 0,
|
||||
"expected_crawled_pages": [""]
|
||||
},
|
||||
{
|
||||
"website": "https://mendable.ai/pricing",
|
||||
"expected_min_num_of_pages": 29,
|
||||
"expected_crawled_pages": [
|
||||
"https://mendable.ai/",
|
||||
"https://mendable.ai/blog",
|
||||
"https://mendable.ai/signin",
|
||||
"https://mendable.ai/signup",
|
||||
"https://mendable.ai",
|
||||
"https://mendable.ai/usecases/sales-enablement",
|
||||
"https://mendable.ai/usecases/documentation",
|
||||
"https://mendable.ai/usecases/cs-enablement",
|
||||
"https://mendable.ai/usecases/productcopilot",
|
||||
"https://mendable.ai/security"
|
||||
],
|
||||
"notes": "This one should not go backwards, but it does!"
|
||||
},
|
||||
{
|
||||
"website": "https://openai.com/news",
|
||||
"expected_min_num_of_pages": 59,
|
||||
"expected_crawled_pages": [
|
||||
"https://openai.com/news/company/",
|
||||
"https://openai.com/news/research/",
|
||||
"https://openai.com/news/safety-and-alignment/",
|
||||
"https://openai.com/news/stories/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"website": "https://agentops.ai",
|
||||
"expected_min_num_of_pages": 7,
|
||||
"expected_crawled_pages": [
|
||||
"https://www.agentops.ai/blog/effortless-hr-management-with-saas",
|
||||
"https://www.agentops.ai/blog/streamlining-hr-with-saas",
|
||||
"https://www.agentops.ai/blog/simplify-hr-with-modern-saas-solutions",
|
||||
"https://www.agentops.ai/blog/efficient-hr-operations-with-saas",
|
||||
"https://www.agentops.ai/blog/hr-made-simple-with-saas",
|
||||
"https://www.agentops.ai/about-us",
|
||||
"https://www.agentops.ai/contact-us"
|
||||
]
|
||||
},
|
||||
{
|
||||
"website": "https://ycombinator.com/companies",
|
||||
"expected_min_num_of_pages": 45,
|
||||
"expected_crawled_pages": [
|
||||
"https://www.ycombinator.com/companies/industry/elearning",
|
||||
"https://www.ycombinator.com/companies/industry/computer-vision",
|
||||
"https://www.ycombinator.com/companies/industry/health-tech",
|
||||
"https://www.ycombinator.com/companies/industry/education",
|
||||
"https://www.ycombinator.com/companies/industry/robotics",
|
||||
"https://www.ycombinator.com/companies/industry/hardware",
|
||||
"https://www.ycombinator.com/companies/industry/saas",
|
||||
"https://www.ycombinator.com/companies/industry/hard-tech",
|
||||
"https://www.ycombinator.com/companies/industry/developer-tools",
|
||||
"https://www.ycombinator.com/companies/industry/entertainment",
|
||||
"https://www.ycombinator.com/companies/industry/finance",
|
||||
"https://www.ycombinator.com/companies/industry/generative-ai",
|
||||
"https://www.ycombinator.com/companies/industry/machine-learning"
|
||||
]
|
||||
},
|
||||
{
|
||||
"website": "https://firecrawl.dev",
|
||||
"expected_min_num_of_pages": 2,
|
||||
"expected_crawled_pages": [
|
||||
"https://firecrawl.dev/",
|
||||
"https://firecrawl.dev/pricing"
|
||||
]
|
||||
},
|
||||
{
|
||||
"website": "https://en.wikipedia.org/wiki/T._N._Seshan",
|
||||
"expected_min_num_of_pages": 100,
|
||||
"expected_crawled_pages": [
|
||||
"https://en.wikipedia.org/wiki/Wikipedia:Contents",
|
||||
"https://en.wikipedia.org/wiki/Wikipedia:Contact_us",
|
||||
"https://en.wikipedia.org/wiki/V._S._Ramadevi",
|
||||
"https://en.wikipedia.org/wiki/Wikipedia:About",
|
||||
"https://en.wikipedia.org/wiki/Help:Introduction",
|
||||
"https://en.wikipedia.org/wiki/H._D._Deve_Gowda",
|
||||
"https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg"
|
||||
]
|
||||
},
|
||||
{
|
||||
"website": "https://mendable.ai/blog",
|
||||
"expected_min_num_of_pages": 0,
|
||||
"expected_crawled_pages": [""]
|
||||
},
|
||||
{
|
||||
"website": "https://www.framer.com/pricing",
|
||||
"expected_min_num_of_pages": 58,
|
||||
"expected_crawled_pages": [
|
||||
"https://www.framer.com/features/navigation/",
|
||||
"https://www.framer.com/contact/",
|
||||
"https://www.framer.com/add-ons/",
|
||||
"https://www.framer.com/free-saas-ui-kit/",
|
||||
"https://www.framer.com/help/",
|
||||
"https://www.framer.com/features/effects/",
|
||||
"https://www.framer.com/enterprise/",
|
||||
"https://www.framer.com/templates/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"website": "https://fly.io/docs/gpus/gpu-quickstart",
|
||||
"expected_min_num_of_pages": 39,
|
||||
"expected_crawled_pages": [
|
||||
"https://fly.io/docs/getting-started/",
|
||||
"https://fly.io/docs/hands-on/",
|
||||
"https://fly.io/docs/about/support/",
|
||||
"https://fly.io/docs/blueprints/going-to-production-with-healthcare-apps/",
|
||||
"https://fly.io/docs/machines/flyctl/fly-machine-update/",
|
||||
"https://fly.io/docs/blueprints/review-apps-guide/",
|
||||
"https://fly.io/docs/blueprints/supercronic/"
|
||||
],
|
||||
"notes": "This one should not go backwards, but it does!"
|
||||
},
|
||||
{
|
||||
"website": "https://news.ycombinator.com/",
|
||||
"expected_min_num_of_pages": 0,
|
||||
"expected_crawled_pages": [""]
|
||||
},
|
||||
{
|
||||
"website": "https://www.vellum.ai/llm-leaderboard",
|
||||
"expected_min_num_of_pages": 0,
|
||||
"expected_crawled_pages": [""]
|
||||
},
|
||||
{
|
||||
"website": "https://www.bigbadtoystore.com",
|
||||
"expected_min_num_of_pages": 0,
|
||||
"expected_crawled_pages": [""]
|
||||
},
|
||||
{
|
||||
"website": "https://www.instructables.com",
|
||||
"expected_min_num_of_pages": 78,
|
||||
"expected_crawled_pages": [
|
||||
"https://www.instructables.com/circuits/",
|
||||
"https://www.instructables.com/circuits/apple/projects/",
|
||||
"https://www.instructables.com/circuits/art/projects/",
|
||||
"https://www.instructables.com/circuits/electronics/projects/",
|
||||
"https://www.instructables.com/circuits/microsoft/projects/",
|
||||
"https://www.instructables.com/circuits/microcontrollers/projects/",
|
||||
"https://www.instructables.com/circuits/community/",
|
||||
"https://www.instructables.com/circuits/leds/projects/",
|
||||
"https://www.instructables.com/circuits/gadgets/projects/",
|
||||
"https://www.instructables.com/circuits/arduino/projects/",
|
||||
"https://www.instructables.com/circuits/lasers/projects/",
|
||||
"https://www.instructables.com/circuits/clocks/projects/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"website": "https://www.powells.com",
|
||||
"expected_min_num_of_pages": 0,
|
||||
"expected_crawled_pages": [""]
|
||||
},
|
||||
{
|
||||
"website": "https://www.royalacademy.org.uk",
|
||||
"expected_min_num_of_pages": 0,
|
||||
"expected_crawled_pages": [""]
|
||||
},
|
||||
{
|
||||
"website": "https://www.eastbaytimes.com",
|
||||
"expected_min_num_of_pages": 0,
|
||||
"expected_crawled_pages": [""]
|
||||
},
|
||||
{
|
||||
"website": "https://www.manchestereveningnews.co.uk",
|
||||
"expected_min_num_of_pages": 0,
|
||||
"expected_crawled_pages": [""]
|
||||
},
|
||||
{
|
||||
"website": "https://physicsworld.com",
|
||||
"expected_min_num_of_pages": 0,
|
||||
"expected_crawled_pages": [""]
|
||||
},
|
||||
{
|
||||
"website": "https://richmondconfidential.org",
|
||||
"expected_min_num_of_pages": 50,
|
||||
"expected_crawled_pages": [
|
||||
"https://richmondconfidential.org/2009/10/13/salesians-star-guard-has-a-big-impact/",
|
||||
"https://richmondconfidential.org/2009/10/13/on-team-of-beginners-oilers-old-hand-stands-out/",
|
||||
"https://richmondconfidential.org/2009/10/19/point-richmond-clockmaker-turns-clutter-into-crafts/",
|
||||
"https://richmondconfidential.org/2009/10/13/profile-maurice-cathy/",
|
||||
"https://richmondconfidential.org/2009/10/13/soul-food-rescue-mission-rebuilds-diets-and-lives/",
|
||||
"https://richmondconfidential.org/2009/10/21/in-tough-economy-pain-trickles-to-the-bottom/",
|
||||
"https://richmondconfidential.org/2009/10/19/richmond-homicide-map/",
|
||||
"https://richmondconfidential.org/2009/10/13/rough-roads-for-richmonds-cab-drivers/",
|
||||
"https://richmondconfidential.org/2009/10/13/before-napa-there-was-winehaven/",
|
||||
"https://richmondconfidential.org/2009/10/13/family-calls-for-end-to-violence-at-memorial-for-slain-woman-friend/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"website": "https://www.techinasia.com",
|
||||
"expected_min_num_of_pages": 0,
|
||||
"expected_crawled_pages": [""],
|
||||
"notes": "The website has a paywall and bot detectors."
|
||||
},
|
||||
{
|
||||
"website": "https://www.boardgamegeek.com",
|
||||
"expected_min_num_of_pages": 15,
|
||||
"expected_crawled_pages": [
|
||||
"https://www.boardgamegeek.com/browse/boardgameartist",
|
||||
"https://www.boardgamegeek.com/browse/boardgamehonor",
|
||||
"https://www.boardgamegeek.com/browse/boardgamepublisher",
|
||||
"https://www.boardgamegeek.com/browse/boardgamepodcast",
|
||||
"https://www.boardgamegeek.com/wiki/page/Index",
|
||||
"https://www.boardgamegeek.com/browse/boardgamecategory",
|
||||
"https://www.boardgamegeek.com/boardgame/random",
|
||||
"https://www.boardgamegeek.com/browse/boardgamemechanic",
|
||||
"https://www.boardgamegeek.com/forums",
|
||||
"https://www.boardgamegeek.com/gonecardboard",
|
||||
"https://www.boardgamegeek.com/browse/boardgameaccessory",
|
||||
"https://www.boardgamegeek.com/browse/boardgamedesigner",
|
||||
"https://www.boardgamegeek.com/",
|
||||
"https://www.boardgamegeek.com/previews",
|
||||
"https://www.boardgamegeek.com/browse/boardgame"
|
||||
]
|
||||
},
|
||||
{
|
||||
"website": "https://www.mountainproject.com",
|
||||
"expected_min_num_of_pages": 0,
|
||||
"expected_crawled_pages": [""]
|
||||
}
|
||||
]
|
@ -3,7 +3,9 @@
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"scripts": {
|
||||
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false"
|
||||
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false",
|
||||
"test:scrape": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/scrape.test.ts",
|
||||
"test:crawl": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/crawl.test.ts"
|
||||
},
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
|
148
apps/test-suite/tests/crawl.test.ts
Normal file
148
apps/test-suite/tests/crawl.test.ts
Normal file
@ -0,0 +1,148 @@
|
||||
import request from "supertest";
|
||||
import dotenv from "dotenv";
|
||||
import { WebsiteScrapeError } from "../utils/types";
|
||||
import { logErrors } from "../utils/log";
|
||||
|
||||
import websitesData from "../data/crawl.json";
|
||||
import "dotenv/config";
|
||||
|
||||
import fs from 'fs';
|
||||
dotenv.config();
|
||||
|
||||
/**
 * One entry of the crawl dataset imported from `../data/crawl.json`.
 */
interface WebsiteData {
  /** URL the crawl is started from. */
  website: string;
  /** Minimum number of pages the crawl is expected to return. */
  expected_min_num_of_pages: number;
  /** URLs expected among the crawled pages; the dataset uses `[""]` when none are listed. */
  expected_crawled_pages: string[];
  /** Optional free-form remark that some dataset entries carry. */
  notes?: string;
}
|
||||
|
||||
const TEST_URL = "http://127.0.0.1:3002";
|
||||
|
||||
/**
 * Dataset-driven crawl checkup.
 *
 * Starts a crawl for every entry of `websitesData`, in batches of `batchSize`
 * started 10 s apart, waits a fixed 300 s per site, then fetches the crawl
 * status and logs the crawled URLs. A site counts as passed when this flow
 * finishes without an error; the stricter dataset checks are currently
 * commented out. The test requires a score of at least 95%.
 */
describe("Crawling Checkup (E2E)", () => {
  beforeAll(() => {
    if (!process.env.TEST_API_KEY) {
      throw new Error("TEST_API_KEY is not set");
    }
  });

  describe("Crawling website tests with a dataset", () => {
    it("Should crawl the website and verify the response", async () => {
      let passedTests = 0;
      const batchSize = 15;
      // One promise per batch; the per-site results are not consumed afterwards.
      const batchPromises: Promise<unknown>[] = [];
      const startTime = new Date().getTime();
      const date = new Date();
      const logsDir = `logs/${date.getMonth() + 1}-${date.getDate()}-${date.getFullYear()}`;

      const errorLogFileName = `${logsDir}/run.log_${new Date().toTimeString().split(' ')[0]}`;
      const errorLog: WebsiteScrapeError[] = [];

      for (let i = 0; i < websitesData.length; i += batchSize) {
        // Stagger batch start-up by 10 s. Batches are not awaited inside the
        // loop, so they run concurrently once started.
        await new Promise(resolve => setTimeout(resolve, 10000));

        const batch = websitesData.slice(i, i + batchSize);
        const batchPromise = Promise.all(
          batch.map(async (websiteData: WebsiteData) => {
            try {
              const crawlResponse = await request(TEST_URL)
                .post("/v0/crawl")
                .set("Content-Type", "application/json")
                .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
                .send({ url: websiteData.website, pageOptions: { onlyMainContent: true }, crawlerOptions: { limit: 100 }});

              // Without a job id there is nothing to poll: record the failure
              // now instead of sleeping 300 s and querying `status/undefined`.
              if (!crawlResponse.body?.jobId) {
                errorLog.push({
                  website: websiteData.website,
                  prompt: 'CRAWL',
                  expected_output: 'SUCCESS',
                  actual_output: 'FAILURE',
                  error: `Crawl job could not be started (status code ${crawlResponse.statusCode}).`
                });
                return null;
              }

              await new Promise(resolve => setTimeout(resolve, 300000)); // wait for 300 seconds

              const completedResponse = await request(TEST_URL)
                .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
                .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

              // Prefer the final `data`; fall back to `partial_data` when the
              // response has no `data`.
              const pages = completedResponse.body.data || completedResponse.body.partial_data;
              if (!Array.isArray(pages)) {
                // Previously this case surfaced as a TypeError caught below;
                // report it explicitly with the same failing outcome.
                errorLog.push({
                  website: websiteData.website,
                  prompt: 'CRAWL',
                  expected_output: 'SUCCESS',
                  actual_output: 'FAILURE',
                  error: `Crawl status response contained neither data nor partial_data.`
                });
                return null;
              }

              console.log('-------------------')
              console.log(websiteData.website);
              console.log(pages.length);
              console.log(pages.map((page: any) => page.metadata?.sourceURL));
              console.log('-------------------')

              // TODO: re-enable the dataset checks below.
              // NOTE(review): `data.includes(page)` compares page objects with
              // URL strings; compare against `metadata.sourceURL` when re-enabling.
              // if (!completedResponse.body || completedResponse.body.status !== "completed") {
              //   errorLog.push({
              //     website: websiteData.website,
              //     prompt: 'CRAWL',
              //     expected_output: 'SUCCESS',
              //     actual_output: 'FAILURE',
              //     error: `Crawl job did not complete successfully.`
              //   });
              //   return null;
              // }

              // // check how many webpages were crawled successfully
              // // compares with expected_num_of_pages
              // if (completedResponse.body.data.length < websiteData.expected_min_num_of_pages) {
              //   errorLog.push({
              //     website: websiteData.website,
              //     prompt: 'CRAWL',
              //     expected_output: `SUCCESS: ${websiteData.expected_min_num_of_pages}`,
              //     actual_output: `FAILURE: ${completedResponse.body.data.length}`,
              //     error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}`
              //   });
              //   return null;
              // }

              // // checks if crawled pages contain expected_crawled_pages
              // if (websiteData.expected_crawled_pages.some(page => !completedResponse.body.data.includes(page))) {
              //   errorLog.push({
              //     website: websiteData.website,
              //     prompt: 'CRAWL',
              //     expected_output: `SUCCESS: ${websiteData.expected_crawled_pages}`,
              //     actual_output: `FAILURE: ${completedResponse.body.data}`,
              //     error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data}`
              //   });
              //   return null;
              // }

              passedTests++;
              return {
                website: websiteData.website,
                statusCode: completedResponse.statusCode,
              };
            } catch (error) {
              console.error(`Error processing ${websiteData.website}: ${error}`);
              errorLog.push({
                website: websiteData.website,
                prompt: 'CRAWL',
                expected_output: 'SUCCESS',
                actual_output: 'FAILURE',
                error: `Error processing ${websiteData.website}: ${error}`
              });
              return null;
            }
          })
        );
        batchPromises.push(batchPromise);
      }

      // Wait for every batch; only `passedTests` and `errorLog` are used below.
      await Promise.all(batchPromises);
      const score = (passedTests / websitesData.length) * 100;
      const endTime = new Date().getTime();
      const timeTaken = (endTime - startTime) / 1000;
      console.log(`Score: ${score}%`);

      await logErrors(errorLog, timeTaken, 0, score, websitesData.length);

      // Local runs additionally dump the failures to a dated log file.
      if (process.env.ENV === "local" && errorLog.length > 0) {
        if (!fs.existsSync(logsDir)){
          fs.mkdirSync(logsDir, { recursive: true });
        }
        fs.writeFileSync(errorLogFileName, JSON.stringify(errorLog, null, 2));
      }

      expect(score).toBeGreaterThanOrEqual(95);
    }, 350000); // 350 seconds timeout
  });
});
|
@ -1,16 +1,14 @@
|
||||
import request from "supertest";
|
||||
import dotenv from "dotenv";
|
||||
import Anthropic from "@anthropic-ai/sdk";
|
||||
import { numTokensFromString } from "./utils/tokens";
|
||||
import { numTokensFromString } from "../utils/tokens";
|
||||
import OpenAI from "openai";
|
||||
import { WebsiteScrapeError } from "./utils/types";
|
||||
import { logErrors } from "./utils/log";
|
||||
import { WebsiteScrapeError } from "../utils/types";
|
||||
import { logErrors } from "../utils/log";
|
||||
|
||||
const websitesData = require("./data/websites.json");
|
||||
import websitesData from "../data/scrape.json";
|
||||
import "dotenv/config";
|
||||
|
||||
const fs = require('fs');
|
||||
|
||||
import fs from 'fs';
|
||||
dotenv.config();
|
||||
|
||||
interface WebsiteData {
|
||||
@ -21,8 +19,7 @@ interface WebsiteData {
|
||||
|
||||
const TEST_URL = "http://127.0.0.1:3002";
|
||||
|
||||
|
||||
describe("Scraping/Crawling Checkup (E2E)", () => {
|
||||
describe("Scraping Checkup (E2E)", () => {
|
||||
beforeAll(() => {
|
||||
if (!process.env.TEST_API_KEY) {
|
||||
throw new Error("TEST_API_KEY is not set");
|
||||
@ -72,10 +69,6 @@ describe("Scraping/Crawling Checkup (E2E)", () => {
|
||||
return null;
|
||||
}
|
||||
|
||||
const anthropic = new Anthropic({
|
||||
apiKey: process.env.ANTHROPIC_API_KEY,
|
||||
});
|
||||
|
||||
const openai = new OpenAI({
|
||||
apiKey: process.env.OPENAI_API_KEY,
|
||||
});
|
@ -39,7 +39,7 @@
|
||||
// "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
|
||||
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
|
||||
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
|
||||
// "resolveJsonModule": true, /* Enable importing .json files. */
|
||||
"resolveJsonModule": true, /* Enable importing .json files. */
|
||||
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
|
||||
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user