From a6b71977375b5f53569fdacd486484330331bdba Mon Sep 17 00:00:00 2001
From: Eric Ciarla
Date: Fri, 14 Jun 2024 19:40:37 -0400
Subject: [PATCH] Fix for maxDepth

---
 .../src/__tests__/e2e_withAuth/index.test.ts  | 69 ++++++++++++++++++-
 apps/api/src/scraper/WebScraper/crawler.ts    |  8 ++-
 apps/api/src/scraper/WebScraper/index.ts      |  4 +-
 3 files changed, 76 insertions(+), 5 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 7c234ef..9a574f3 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -619,13 +619,14 @@ describe("E2E Tests for API Routes", () => {
   }, 180000);
 
   it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => {
+
     const crawlResponse = await request(TEST_URL)
       .post("/v0/crawl")
       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
       .set("Content-Type", "application/json")
       .send({
-        url: "https://www.scrapethissite.com",
-        crawlerOptions: { maxDepth: 0 },
+        url: "https://www.mendable.ai",
+        crawlerOptions: { maxDepth: 2 },
       });
     expect(crawlResponse.statusCode).toBe(200);
 
@@ -651,6 +652,70 @@ describe("E2E Tests for API Routes", () => {
       .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
 
+    const testurls = completedResponse.body.data.map(
+      (item: any) => item.metadata?.sourceURL
+    );
+    console.log(testurls)
+
+    expect(completedResponse.statusCode).toBe(200);
+    expect(completedResponse.body).toHaveProperty("status");
+    expect(completedResponse.body.status).toBe("completed");
+    expect(completedResponse.body).toHaveProperty("data");
+    expect(completedResponse.body.data[0]).toHaveProperty("content");
+    expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+    expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+    const urls = completedResponse.body.data.map(
+      (item: any) => item.metadata?.sourceURL
+    );
+    expect(urls.length).toBeGreaterThanOrEqual(1);
+
+    // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1
+    urls.forEach((url: string) => {
+      const pathSplits = new URL(url).pathname.split('/');
+      const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
+      expect(depth).toBeLessThanOrEqual(1);
+    });
+  }, 180000);
+
+  it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => {
+
+    const crawlResponse = await request(TEST_URL)
+      .post("/v0/crawl")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send({
+        url: "https://www.scrapethissite.com",
+        crawlerOptions: { maxDepth: 2 },
+      });
+    expect(crawlResponse.statusCode).toBe(200);
+
+    const response = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty("status");
+    expect(["active", "waiting"]).toContain(response.body.status);
+    // wait for 60 seconds
+    let isCompleted = false;
+    while (!isCompleted) {
+      const statusCheckResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(statusCheckResponse.statusCode).toBe(200);
+      isCompleted = statusCheckResponse.body.status === "completed";
+      if (!isCompleted) {
+        await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+      }
+    }
+    const completedResponse = await request(TEST_URL)
+      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+    const testurls = completedResponse.body.data.map(
+      (item: any) => item.metadata?.sourceURL
+    );
+    console.log(testurls)
+
     expect(completedResponse.statusCode).toBe(200);
     expect(completedResponse.body).toHaveProperty("status");
     expect(completedResponse.body.status).toBe("completed");
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index ba5e003..3171ec7 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -60,8 +60,11 @@ export class WebCrawler {
       .filter((link) => {
         const url = new URL(link);
         const path = url.pathname;
-        const depth = url.pathname.split('/').length - 1;
+
+        const pathSplits = new URL(url).pathname.split('/');
+        const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) -1;
+
         // Check if the link exceeds the maximum depth allowed
         if (depth > maxDepth) {
           return false;
         }
@@ -136,8 +139,10 @@ export class WebCrawler {
 
     if(!crawlerOptions?.ignoreSitemap){
       const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
+
       if (sitemapLinks.length > 0) {
         let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
+
         return filteredLinks.map(link => ({ url: link, html: "" }));
       }
     }
@@ -148,6 +153,7 @@ export class WebCrawler {
       concurrencyLimit,
       inProgress
     );
+
     if (
       urls.length === 0 &&
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 030f795..c19711d 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -164,9 +164,9 @@ export class WebScraperDataProvider {
   ): Promise {
 
     const pathSplits = new URL(this.urls[0]).pathname.split('/');
-    const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
+    const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) -1;
     const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth;
-
+
     const crawler = new WebCrawler({
       initialUrl: this.urls[0],
       includes: this.includes,
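
The hunks above converge on one way of measuring crawl depth: split a URL's pathname on "/", ignore the empty segments produced by a leading or trailing slash, and treat maxDepth as relative to the starting URL by adding the base URL's own depth before links are filtered. The short TypeScript sketch below only illustrates that arithmetic; the helpers pathDepth and isWithinMaxDepth are hypothetical names invented for this note and do not appear in the patch or the repository.

// Illustrative sketch only: pathDepth and isWithinMaxDepth are hypothetical
// helpers; the arithmetic mirrors the expressions used in the patch.

// Count the non-empty path segments of a URL, e.g. "https://site.com/a/b/" -> 2.
function pathDepth(rawUrl: string): number {
  const pathSplits = new URL(rawUrl).pathname.split("/");
  // The leading slash always yields one empty segment; subtract one more
  // only when a trailing slash produces a second empty segment.
  const trailingSlashAdjustment =
    pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0;
  return pathSplits.length - trailingSlashAdjustment - 1;
}

// maxDepth is relative to the base URL: keep a link only if its own path depth
// does not exceed the base URL's depth plus the requested maxDepth.
function isWithinMaxDepth(link: string, baseUrl: string, maxDepth: number): boolean {
  const adjustedMaxDepth = pathDepth(baseUrl) + maxDepth;
  return pathDepth(link) <= adjustedMaxDepth;
}

console.log(pathDepth("https://www.mendable.ai"));           // 0
console.log(pathDepth("https://www.mendable.ai/blog/post")); // 2
console.log(isWithinMaxDepth("https://www.mendable.ai/blog/post", "https://www.mendable.ai", 2)); // true
console.log(isWithinMaxDepth("https://www.mendable.ai/a/b/c", "https://www.mendable.ai", 2));     // false

With maxDepth: 2 and a base URL of https://www.mendable.ai (base depth 0), a link such as /blog/post (depth 2) passes the filter while /a/b/c (depth 3) is dropped.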