From a6b71977375b5f53569fdacd486484330331bdba Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 14 Jun 2024 19:40:37 -0400 Subject: [PATCH 1/6] Fix for maxDepth --- .../src/__tests__/e2e_withAuth/index.test.ts | 69 ++++++++++++++++++- apps/api/src/scraper/WebScraper/crawler.ts | 8 ++- apps/api/src/scraper/WebScraper/index.ts | 4 +- 3 files changed, 76 insertions(+), 5 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 7c234ef..9a574f3 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -619,13 +619,14 @@ describe("E2E Tests for API Routes", () => { }, 180000); it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => { + const crawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send({ - url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 0 }, + url: "https://www.mendable.ai", + crawlerOptions: { maxDepth: 2 }, }); expect(crawlResponse.statusCode).toBe(200); @@ -651,6 +652,70 @@ describe("E2E Tests for API Routes", () => { .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + const testurls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + console.log(testurls) + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThanOrEqual(1); + + // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split('/'); + const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); + expect(depth).toBeLessThanOrEqual(1); + }); + }, 180000); + + it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => { + + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + crawlerOptions: { maxDepth: 2 }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(["active", "waiting"]).toContain(response.body.status); + // wait for 60 seconds + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + const testurls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + console.log(testurls) + expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body.status).toBe("completed"); diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index ba5e003..3171ec7 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -60,8 +60,11 @@ export class WebCrawler { .filter((link) => { const url = new URL(link); const path = url.pathname; - const depth = url.pathname.split('/').length - 1; + + const pathSplits = new URL(url).pathname.split('/'); + const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) -1; + // Check if the link exceeds the maximum depth allowed if (depth > maxDepth) { return false; @@ -136,8 +139,10 @@ export class WebCrawler { if(!crawlerOptions?.ignoreSitemap){ const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); + if (sitemapLinks.length > 0) { let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); + return filteredLinks.map(link => ({ url: link, html: "" })); } } @@ -148,6 +153,7 @@ export class WebCrawler { concurrencyLimit, inProgress ); + if ( urls.length === 0 && diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 030f795..c19711d 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -164,9 +164,9 @@ export class WebScraperDataProvider { ): Promise { const pathSplits = new URL(this.urls[0]).pathname.split('/'); - const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); + const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0) -1; const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth; - + const crawler = new WebCrawler({ initialUrl: this.urls[0], includes: this.includes, From f22759b2e70052d1629b2c994527b64f42e581e6 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 14 Jun 2024 19:42:11 -0400 Subject: [PATCH 2/6] Update index.test.ts --- .../src/__tests__/e2e_withAuth/index.test.ts | 61 +------------------ 1 file changed, 1 insertion(+), 60 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 9a574f3..fe316d0 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -626,66 +626,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://www.mendable.ai", - crawlerOptions: { maxDepth: 2 }, - }); - expect(crawlResponse.statusCode).toBe(200); - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - // wait for 60 seconds - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - const testurls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - console.log(testurls) - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThanOrEqual(1); - - // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 - urls.forEach((url: string) => { - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); - expect(depth).toBeLessThanOrEqual(1); - }); - }, 180000); - - it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => { - - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 2 }, + crawlerOptions: { maxDepth: 0 }, }); expect(crawlResponse.statusCode).toBe(200); From 2b40729cc29ea9d12f63e4f13649bf4c12233149 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Sat, 15 Jun 2024 08:56:32 -0400 Subject: [PATCH 3/6] Update index.test.ts --- .../api/src/__tests__/e2e_withAuth/index.test.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index fe316d0..3b1c596 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -563,7 +563,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(2); }); - }, 180000); + }, 240000); it.concurrent("should return a successful response with relative max depth option for a valid crawl job", async () => { const crawlResponse = await request(TEST_URL) @@ -616,7 +616,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(3); }); - }, 180000); + }, 240000); it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => { @@ -675,7 +675,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(1); }); - }, 180000); + }, 240000); it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepth equals to 2", async () => { const crawlResponse = await request(TEST_URL) @@ -728,7 +728,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); expect(depth).toBeLessThanOrEqual(3); }); - }, 180000); + }, 240000); // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => { // const crawlResponse = await request(TEST_URL) @@ -828,7 +828,7 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - }, 180000); + }, 240000); }); @@ -971,7 +971,7 @@ describe("E2E Tests for API Routes", () => { ); expect(childrenLinks.length).toBe(completedResponse.body.data.length); - }, 180000); // 120 seconds + }, 240000); // 120 seconds it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => { const crawlResponse = await request(TEST_URL) @@ -1012,7 +1012,7 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - }, 180000); // 120 seconds + }, 240000); // 120 seconds it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { @@ -1062,7 +1062,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(2); }); - }, 180000); + }, 240000); it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (2)", async () => { const crawlResponse = await request(TEST_URL) From 34e37c5671eb5693a18c5ee0d8e5b936cd58df73 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Sat, 15 Jun 2024 16:43:37 -0400 Subject: [PATCH 4/6] Add unit tests to replace e2e --- .../src/__tests__/e2e_withAuth/index.test.ts | 67 +------ .../WebScraper/__tests__/crawler.test.ts | 163 ++++++++++++++++++ apps/api/src/scraper/WebScraper/crawler.ts | 4 +- apps/api/src/scraper/WebScraper/index.ts | 10 +- .../utils/__tests__/maxDepthUtils.test.ts | 37 ++++ .../scraper/WebScraper/utils/maxDepthUtils.ts | 12 ++ 6 files changed, 226 insertions(+), 67 deletions(-) create mode 100644 apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 3b1c596..c11f398 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -563,7 +563,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(2); }); - }, 240000); + }, 180000); it.concurrent("should return a successful response with relative max depth option for a valid crawl job", async () => { const crawlResponse = await request(TEST_URL) @@ -616,7 +616,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); expect(depth).toBeLessThanOrEqual(3); }); - }, 240000); + }, 180000); it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => { @@ -675,60 +675,11 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(1); }); - }, 240000); + }, 180000); - it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepth equals to 2", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 2, limit: 5 }, - }); - expect(crawlResponse.statusCode).toBe(200); + - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - // wait for 60 seconds - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThanOrEqual(1); - - // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 - urls.forEach((url: string) => { - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); - expect(depth).toBeLessThanOrEqual(3); - }); - }, 240000); + // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => { // const crawlResponse = await request(TEST_URL) @@ -828,7 +779,7 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - }, 240000); + }, 180000); }); @@ -971,7 +922,7 @@ describe("E2E Tests for API Routes", () => { ); expect(childrenLinks.length).toBe(completedResponse.body.data.length); - }, 240000); // 120 seconds + }, 180000); // 120 seconds it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => { const crawlResponse = await request(TEST_URL) @@ -1012,7 +963,7 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - }, 240000); // 120 seconds + }, 180000); // 120 seconds it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { @@ -1062,7 +1013,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(2); }); - }, 240000); + }, 180000); it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (2)", async () => { const crawlResponse = await request(TEST_URL) diff --git a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts new file mode 100644 index 0000000..6f7b632 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts @@ -0,0 +1,163 @@ +// crawler.test.ts +import { WebCrawler } from '../crawler'; +import axios from 'axios'; +import robotsParser from 'robots-parser'; +import { getAdjustedMaxDepth } from '../utils/maxDepthUtils'; + +jest.mock('axios'); +jest.mock('robots-parser'); + +describe('WebCrawler maxDepth and filterLinks', () => { + let crawler: WebCrawler; + const mockAxios = axios as jest.Mocked; + const mockRobotsParser = robotsParser as jest.MockedFunction; + + let maxCrawledDepth: number; + + beforeEach(() => { + // Setup default mocks + mockAxios.get.mockImplementation((url) => { + if (url.includes('robots.txt')) { + return Promise.resolve({ data: 'User-agent: *\nAllow: /' }); + } else if (url.includes('sitemap.xml')) { + return Promise.resolve({ data: 'sitemap content' }); // You would normally parse this to URLs + } + return Promise.resolve({ data: '' }); + }); + + mockRobotsParser.mockReturnValue({ + isAllowed: jest.fn().mockReturnValue(true), + isDisallowed: jest.fn().mockReturnValue(false), + getMatchingLineNumber: jest.fn().mockReturnValue(0), + getCrawlDelay: jest.fn().mockReturnValue(0), + getSitemaps: jest.fn().mockReturnValue([]), + getPreferredHost: jest.fn().mockReturnValue('example.com') + }); + }); + + it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 0 ', async () => { + const initialUrl = 'http://example.com'; // Set initial URL for this test + const enteredMaxCrawledDepth = 2; + maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); + + + crawler = new 
WebCrawler({ + initialUrl: initialUrl, + includes: [], + excludes: [], + limit: 100, + maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing + }); + + // Mock sitemap fetching function to return controlled links + crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([ + initialUrl, // depth 0 + initialUrl + '/page1', // depth 1 + initialUrl + '/page1/page2', // depth 2 + initialUrl + '/page1/page2/page3' // depth 3, should be filtered out + ]); + + const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth); + expect(results).toEqual([ + { url: initialUrl, html: '' }, + { url: initialUrl + '/page1', html: '' }, + { url: initialUrl + '/page1/page2', html: '' } + ]); + + + // Ensure that the link with depth 3 is not included + expect(results.some(r => r.url === initialUrl + '/page1/page2/page3')).toBe(false); + }); + + it('should filter out links that exceed maxDepth param of 0 based on enterURL depth of 0 ', async () => { + const initialUrl = 'http://example.com'; // Set initial URL for this test + const enteredMaxCrawledDepth = 0; + maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); + console.log(maxCrawledDepth); + + crawler = new WebCrawler({ + initialUrl: initialUrl, + includes: [], + excludes: [], + limit: 100, + maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing + }); + + // Mock sitemap fetching function to return controlled links + crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([ + initialUrl, // depth 0 + initialUrl + '/page1', // depth 1 + initialUrl + '/page1/page2', // depth 2 + initialUrl + '/page1/page2/page3' // depth 3, should be filtered out + ]); + + const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth); + expect(results).toEqual([ + { url: initialUrl, html: '' }, + ]); + }); + + it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 1 ', async () => { + const initialUrl = 'http://example.com/page1'; // Set initial URL for this test + const enteredMaxCrawledDepth = 1; + maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); + console.log(maxCrawledDepth); + + crawler = new WebCrawler({ + initialUrl: initialUrl, + includes: [], + excludes: [], + limit: 100, + maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing + }); + + // Mock sitemap fetching function to return controlled links + crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([ + initialUrl, // depth 0 + initialUrl + '/page2', // depth 1 + initialUrl + '/page2/page3', // depth 2 + initialUrl + '/page2/page3/page4' // depth 3, should be filtered out + ]); + + const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth); + expect(results).toEqual([ + { url: initialUrl, html: '' }, + { url: initialUrl + '/page2', html: '' } + ]); + }); + + it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 2 ', async () => { + const initialUrl = 'http://example.com/page1'; // Set initial URL for this test + const enteredMaxCrawledDepth = 2; + maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); + console.log(maxCrawledDepth); + + crawler = new WebCrawler({ + initialUrl: initialUrl, + includes: [], + excludes: [], + limit: 100, + maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing + }); + + // Mock sitemap fetching function to return controlled links + crawler['tryFetchSitemapLinks'] = 
jest.fn().mockResolvedValue([ + initialUrl, // depth 0 + initialUrl + '/page2', // depth 1 + initialUrl + '/page2/page3', // depth 2 + initialUrl + '/page2/page3/page4' // depth 3, should be filtered out + ]); + + const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth); + expect(results).toEqual([ + { url: initialUrl, html: '' }, + { url: initialUrl + '/page2', html: '' }, + { url: initialUrl + '/page2/page3', html: '' } + ]); + }); + + + + // Add more tests to cover other scenarios, such as checking includes and excludes +}); + diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 3171ec7..e1a5e05 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -6,6 +6,7 @@ import async from "async"; import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities"; import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url"; import robotsParser from "robots-parser"; +import { getURLDepth } from "./utils/maxDepthUtils"; export class WebCrawler { private initialUrl: string; @@ -61,8 +62,7 @@ export class WebCrawler { const url = new URL(link); const path = url.pathname; - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) -1; + const depth = getURLDepth(url.toString()); // Check if the link exceeds the maximum depth allowed diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index c19711d..21301af 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -18,6 +18,7 @@ import { import { generateCompletions } from "../../lib/LLM-extraction"; import { getWebScraperQueue } from "../../../src/services/queue-service"; import { fetchAndProcessDocx } from "./utils/docxProcessor"; +import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils"; export class WebScraperDataProvider { private bullJobId: string; @@ -163,16 +164,12 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { - const pathSplits = new URL(this.urls[0]).pathname.split('/'); - const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0) -1; - const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth; - const crawler = new WebCrawler({ initialUrl: this.urls[0], includes: this.includes, excludes: this.excludes, maxCrawledLinks: this.maxCrawledLinks, - maxCrawledDepth: adjustedMaxDepth, + maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth), limit: this.limit, generateImgAltText: this.generateImgAltText, allowBackwardCrawling: this.allowBackwardCrawling, @@ -580,8 +577,7 @@ export class WebScraperDataProvider { filterDepth(documents: Document[]): Document[] { return documents.filter((document) => { const url = new URL(document.metadata.sourceURL); - const path = url.pathname; - return path.split("/").length <= this.maxCrawledDepth; + return getURLDepth(url.toString()) <= this.maxCrawledDepth; }); } } diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts new file mode 100644 index 0000000..e77242c --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts @@ -0,0 +1,37 @@ +import { getURLDepth, getAdjustedMaxDepth } from '../maxDepthUtils'; + +describe('Testing getURLDepth and getAdjustedMaxDepth', () => { + it('should return 0 for root - mendable.ai', () => { + const enteredURL = "https://www.mendable.ai/" + expect(getURLDepth(enteredURL)).toBe(0); + }); + + it('should return 0 for root - scrapethissite.com', () => { + const enteredURL = "https://scrapethissite.com/" + expect(getURLDepth(enteredURL)).toBe(0); + }); + + it('should return 1 for scrapethissite.com/pages', () => { + const enteredURL = "https://scrapethissite.com/pages" + expect(getURLDepth(enteredURL)).toBe(1); + }); + + it('should return 2 for scrapethissite.com/pages/articles', () => { + const enteredURL = "https://scrapethissite.com/pages/articles" + expect(getURLDepth(enteredURL)).toBe(2); + + }); + + it('Adjusted maxDepth should return 1 for scrapethissite.com', () => { + const enteredURL = "https://scrapethissite.com" + expect(getAdjustedMaxDepth(enteredURL, 1)).toBe(1); + + }); + + it('Adjusted maxDepth should return 5 for scrapethissite.com/pages/articles', () => { + const enteredURL = "https://scrapethissite.com/pages/articles" + expect(getAdjustedMaxDepth(enteredURL, 2)).toBe(5); + }); + + +}); diff --git a/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts b/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts new file mode 100644 index 0000000..c1fea7f --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts @@ -0,0 +1,12 @@ + + +export function getAdjustedMaxDepth(url: string, maxCrawlDepth: number): number { + const baseURLDepth = getURLDepth(url); + const adjustedMaxDepth = maxCrawlDepth + baseURLDepth; + return adjustedMaxDepth; +} + +export function getURLDepth(url: string): number { + const pathSplits = new URL(url).pathname.split('/'); + return pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0) - 1; +} From ff7b52cab1862ba2cbbf18f78c3d69da18f78e96 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Sat, 15 Jun 2024 16:51:50 -0400 Subject: [PATCH 5/6] Delete one more e2e test --- .../src/__tests__/e2e_withAuth/index.test.ts | 48 ------------------- 1 file changed, 48 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index c11f398..106528c 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -966,54 +966,6 @@ describe("E2E Tests for API Routes", () => { }, 180000); // 120 seconds - it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 1 }, - }); - expect(crawlResponse.statusCode).toBe(200); - - let isCompleted = false; - let completedResponse; - - while (!isCompleted) { - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - - if (response.body.status === "completed") { - isCompleted = true; - completedResponse = response; - } - } - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(1); - - // Check if all URLs have a maximum depth of 1 - urls.forEach((url) => { - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); - expect(depth).toBeLessThanOrEqual(2); - }); - }, 180000); it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (2)", async () => { const crawlResponse = await request(TEST_URL) From 519ab1aecb6607659c0335f79db3e1df3ffbc7c8 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Sat, 15 Jun 2024 17:14:09 -0400 Subject: [PATCH 6/6] Update unit tests --- .../api/src/__tests__/e2e_withAuth/index.test.ts | 2 +- .../scraper/WebScraper/__tests__/crawler.test.ts | 6 +++--- .../utils/__tests__/maxDepthUtils.test.ts | 16 +++++++++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 3b27cbf..9f04093 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -655,7 +655,7 @@ describe("E2E Tests for API Routes", () => { const testurls = completedResponse.body.data.map( (item: any) => item.metadata?.sourceURL ); - console.log(testurls) + //console.log(testurls) expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); diff --git a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts index 6f7b632..c7c54aa 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts @@ -73,7 +73,7 @@ describe('WebCrawler maxDepth and filterLinks', () => { const initialUrl = 'http://example.com'; // Set initial URL for this test const enteredMaxCrawledDepth = 0; maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); - console.log(maxCrawledDepth); + crawler = new WebCrawler({ initialUrl: initialUrl, @@ -101,7 +101,7 @@ describe('WebCrawler maxDepth and filterLinks', () => { const initialUrl = 'http://example.com/page1'; // Set initial URL for this test const enteredMaxCrawledDepth = 1; maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); - console.log(maxCrawledDepth); + crawler = new WebCrawler({ initialUrl: initialUrl, @@ -130,7 +130,7 @@ describe('WebCrawler maxDepth and filterLinks', () => { const initialUrl = 'http://example.com/page1'; // Set initial URL for this test const enteredMaxCrawledDepth = 2; maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); - console.log(maxCrawledDepth); + crawler = new WebCrawler({ initialUrl: initialUrl, diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts index e77242c..863a689 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts @@ -22,15 +22,25 @@ describe('Testing getURLDepth and getAdjustedMaxDepth', () => { }); - it('Adjusted maxDepth should return 1 for scrapethissite.com', () => { + it('Adjusted maxDepth should return 1 for scrapethissite.com and max depth param of 1', () => { const enteredURL = "https://scrapethissite.com" expect(getAdjustedMaxDepth(enteredURL, 1)).toBe(1); }); + it('Adjusted maxDepth should return 0 for scrapethissite.com and max depth param of 0', () => { + const enteredURL = "https://scrapethissite.com" + expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0); - it('Adjusted maxDepth should return 5 for scrapethissite.com/pages/articles', () => { + }); + + 
it('Adjusted maxDepth should return 0 for mendable.ai and max depth param of 0', () => { + const enteredURL = "https://mendable.ai" + expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0); + }); + + it('Adjusted maxDepth should return 4 for scrapethissite.com/pages/articles and max depth param of 2', () => { const enteredURL = "https://scrapethissite.com/pages/articles" - expect(getAdjustedMaxDepth(enteredURL, 2)).toBe(5); + expect(getAdjustedMaxDepth(enteredURL, 2)).toBe(4); });
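Note (not part of the patches above): the depth semantics this series settles on are that getURLDepth counts the non-empty path segments of a URL's pathname (site root = depth 0), and getAdjustedMaxDepth shifts the user-supplied maxDepth by the entered URL's own depth so a crawl started below the root keeps its relative meaning. The following is a minimal standalone sketch that mirrors the helpers added in apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts and prints the values asserted by the unit tests in PATCH 4/6 and PATCH 6/6; it is illustrative only (run it e.g. with ts-node), not an additional change to the codebase.

// Sketch mirroring getURLDepth / getAdjustedMaxDepth from maxDepthUtils.ts.
// new URL(url).pathname always starts with '/', so the first split element is
// empty and is discounted by the trailing "- 1"; if the path also ends with '/'
// (including the bare root "/"), the last element is empty too and the ternary
// discounts it as well.
function getURLDepth(url: string): number {
  const pathSplits = new URL(url).pathname.split('/');
  return pathSplits.length -
    (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) - 1;
}

// The requested maxDepth is relative to the entered URL: a crawl entered at
// /pages/articles (depth 2) with maxDepth 2 may reach absolute depth 4.
function getAdjustedMaxDepth(url: string, maxCrawlDepth: number): number {
  return maxCrawlDepth + getURLDepth(url);
}

// Expected outputs match the assertions in maxDepthUtils.test.ts:
console.log(getURLDepth("https://www.mendable.ai/"));                             // 0
console.log(getURLDepth("https://scrapethissite.com/pages"));                     // 1
console.log(getURLDepth("https://scrapethissite.com/pages/articles"));            // 2
console.log(getAdjustedMaxDepth("https://scrapethissite.com", 1));                // 1
console.log(getAdjustedMaxDepth("https://scrapethissite.com", 0));                // 0
console.log(getAdjustedMaxDepth("https://scrapethissite.com/pages/articles", 2)); // 4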