From 59451754f52743b1ba598d51ce3db3c2cd595bdb Mon Sep 17 00:00:00 2001
From: Eric Ciarla
Date: Fri, 14 Jun 2024 10:14:07 -0400
Subject: [PATCH] Add tests

---
 .../src/__tests__/e2e_withAuth/index.test.ts | 43 +++++++++++++++++++
 apps/api/src/scraper/WebScraper/index.ts     |  3 +-
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index a4a427c..38ea631 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -482,6 +482,49 @@ describe("E2E Tests for API Routes", () => {
     });
   }, 120000);
 
+  it.concurrent("should return a successful response with relative max depth option for a valid crawl job with different depths (2)", async () => {
+    const crawlResponse = await request(TEST_URL)
+      .post("/v0/crawl")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send({
+        url: "https://www.scrapethissite.com",
+        crawlerOptions: { maxDepth: 2 },
+      });
+    expect(crawlResponse.statusCode).toBe(200);
+
+    const response = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty("status");
+    expect(response.body.status).toBe("active");
+    // wait for 60 seconds
+    await new Promise((r) => setTimeout(r, 60000));
+    const completedResponse = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+    expect(completedResponse.statusCode).toBe(200);
+    expect(completedResponse.body).toHaveProperty("status");
+    expect(completedResponse.body.status).toBe("completed");
+    expect(completedResponse.body).toHaveProperty("data");
+    expect(completedResponse.body.data[0]).toHaveProperty("content");
+    expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+    expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+    const urls = completedResponse.body.data.map(
+      (item: any) => item.metadata?.sourceURL
+    );
+    expect(urls.length).toBeGreaterThanOrEqual(1);
+
+    // Check that every crawled URL stays within an absolute depth of 3: the base URL depth (1) plus the requested maxDepth (2)
+    urls.forEach((url: string) => {
+      const pathSplits = new URL(url).pathname.split('/');
+      const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
+      expect(depth).toBeLessThanOrEqual(3);
+    });
+  }, 120000);
+
   // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => {
   //   const crawlResponse = await request(TEST_URL)
   //     .post("/v0/crawl")
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 67d73d4..250931b 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -166,8 +166,7 @@ export class WebScraperDataProvider {
     const pathSplits = new URL(this.urls[0]).pathname.split('/');
     const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
     const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth;
-
-
+
     const crawler = new WebCrawler({
       initialUrl: this.urls[0],
       includes: this.includes,
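
Editor's note: the depth formula appears twice in this patch (in the new test and in WebScraperDataProvider). Below is a minimal, self-contained TypeScript sketch, not part of the commit, that isolates that formula; the helper name getURLDepth and the second example URL are illustrative only, and it assumes a runtime with the global URL class (Node.js or a browser).

// Hypothetical helper (not in the patch) mirroring the depth formula used in
// both the new test and WebScraperDataProvider.
function getURLDepth(url: string): number {
  const pathSplits = new URL(url).pathname.split('/');
  // Subtract one when both the leading and trailing segments are empty
  // (e.g. "/" or "/a/b/"), exactly as the formula in the patch does.
  return (
    pathSplits.length -
    (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0)
  );
}

// Worked example matching the test's assertion:
//   baseURLDepth("https://www.scrapethissite.com") = 1
//   adjustedMaxDepth = maxDepth (2) + baseURLDepth (1) = 3
// hence expect(depth).toBeLessThanOrEqual(3) for every crawled URL.
console.log(getURLDepth('https://www.scrapethissite.com'));        // 1
console.log(getURLDepth('https://www.scrapethissite.com/pages/')); // 2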