From a6b71977375b5f53569fdacd486484330331bdba Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 14 Jun 2024 19:40:37 -0400 Subject: [PATCH 1/6] Fix for maxDepth --- .../src/__tests__/e2e_withAuth/index.test.ts | 69 ++++++++++++++++++- apps/api/src/scraper/WebScraper/crawler.ts | 8 ++- apps/api/src/scraper/WebScraper/index.ts | 4 +- 3 files changed, 76 insertions(+), 5 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 7c234ef..9a574f3 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -619,13 +619,14 @@ describe("E2E Tests for API Routes", () => { }, 180000); it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => { + const crawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send({ - url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 0 }, + url: "https://www.mendable.ai", + crawlerOptions: { maxDepth: 2 }, }); expect(crawlResponse.statusCode).toBe(200); @@ -651,6 +652,70 @@ describe("E2E Tests for API Routes", () => { .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + const testurls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + console.log(testurls) + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThanOrEqual(1); + + // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split('/'); + const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); + expect(depth).toBeLessThanOrEqual(1); + }); + }, 180000); + + it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => { + + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + crawlerOptions: { maxDepth: 2 }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(["active", "waiting"]).toContain(response.body.status); + // wait for 60 seconds + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + const testurls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + console.log(testurls) + expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body.status).toBe("completed"); diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index ba5e003..3171ec7 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -60,8 +60,11 @@ export class WebCrawler { .filter((link) => { const url = new URL(link); const path = url.pathname; - const depth = url.pathname.split('/').length - 1; + + const pathSplits = new URL(url).pathname.split('/'); + const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) -1; + // Check if the link exceeds the maximum depth allowed if (depth > maxDepth) { return false; @@ -136,8 +139,10 @@ export class WebCrawler { if(!crawlerOptions?.ignoreSitemap){ const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); + if (sitemapLinks.length > 0) { let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); + return filteredLinks.map(link => ({ url: link, html: "" })); } } @@ -148,6 +153,7 @@ export class WebCrawler { concurrencyLimit, inProgress ); + if ( urls.length === 0 && diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 030f795..c19711d 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -164,9 +164,9 @@ export class WebScraperDataProvider { ): Promise { const pathSplits = new URL(this.urls[0]).pathname.split('/'); - const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); + const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0) -1; const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth; - + const crawler = new WebCrawler({ initialUrl: this.urls[0], includes: this.includes, From f22759b2e70052d1629b2c994527b64f42e581e6 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 14 Jun 2024 19:42:11 -0400 Subject: [PATCH 2/6] Update index.test.ts --- .../src/__tests__/e2e_withAuth/index.test.ts | 61 +------------------ 1 file changed, 1 insertion(+), 60 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 9a574f3..fe316d0 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -626,66 +626,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://www.mendable.ai", - crawlerOptions: { maxDepth: 2 }, - }); - expect(crawlResponse.statusCode).toBe(200); - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - // wait for 60 seconds - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - const testurls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - console.log(testurls) - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThanOrEqual(1); - - // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 - urls.forEach((url: string) => { - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); - expect(depth).toBeLessThanOrEqual(1); - }); - }, 180000); - - it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => { - - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 2 }, + crawlerOptions: { maxDepth: 0 }, }); expect(crawlResponse.statusCode).toBe(200); From 2b40729cc29ea9d12f63e4f13649bf4c12233149 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Sat, 15 Jun 2024 08:56:32 -0400 Subject: [PATCH 3/6] Update index.test.ts --- .../api/src/__tests__/e2e_withAuth/index.test.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index fe316d0..3b1c596 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -563,7 +563,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(2); }); - }, 180000); + }, 240000); it.concurrent("should return a successful response with relative max depth option for a valid crawl job", async () => { const crawlResponse = await request(TEST_URL) @@ -616,7 +616,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(3); }); - }, 180000); + }, 240000); it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => { @@ -675,7 +675,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(1); }); - }, 180000); + }, 240000); it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepth equals to 2", async () => { const crawlResponse = await request(TEST_URL) @@ -728,7 +728,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); expect(depth).toBeLessThanOrEqual(3); }); - }, 180000); + }, 240000); // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => { // const crawlResponse = await request(TEST_URL) @@ -828,7 +828,7 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - }, 180000); + }, 240000); }); @@ -971,7 +971,7 @@ describe("E2E Tests for API Routes", () => { ); expect(childrenLinks.length).toBe(completedResponse.body.data.length); - }, 180000); // 120 seconds + }, 240000); // 120 seconds it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => { const crawlResponse = await request(TEST_URL) @@ -1012,7 +1012,7 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - }, 180000); // 120 seconds + }, 240000); // 120 seconds it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { @@ -1062,7 +1062,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(2); }); - }, 180000); + }, 240000); it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (2)", async () => { const crawlResponse = await request(TEST_URL) From 34e37c5671eb5693a18c5ee0d8e5b936cd58df73 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Sat, 15 Jun 2024 16:43:37 -0400 Subject: [PATCH 4/6] Add unit tests to replace e2e --- .../src/__tests__/e2e_withAuth/index.test.ts | 67 +------ .../WebScraper/__tests__/crawler.test.ts | 163 ++++++++++++++++++ apps/api/src/scraper/WebScraper/crawler.ts | 4 +- apps/api/src/scraper/WebScraper/index.ts | 10 +- .../utils/__tests__/maxDepthUtils.test.ts | 37 ++++ .../scraper/WebScraper/utils/maxDepthUtils.ts | 12 ++ 6 files changed, 226 insertions(+), 67 deletions(-) create mode 100644 apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 3b1c596..c11f398 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -563,7 +563,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(2); }); - }, 240000); + }, 180000); it.concurrent("should return a successful response with relative max depth option for a valid crawl job", async () => { const crawlResponse = await request(TEST_URL) @@ -616,7 +616,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); expect(depth).toBeLessThanOrEqual(3); }); - }, 240000); + }, 180000); it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => { @@ -675,60 +675,11 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(1); }); - }, 240000); + }, 180000); - it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepth equals to 2", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 2, limit: 5 }, - }); - expect(crawlResponse.statusCode).toBe(200); + - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - // wait for 60 seconds - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThanOrEqual(1); - - // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 - urls.forEach((url: string) => { - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); - expect(depth).toBeLessThanOrEqual(3); - }); - }, 240000); + // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => { // const crawlResponse = await request(TEST_URL) @@ -828,7 +779,7 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - }, 240000); + }, 180000); }); @@ -971,7 +922,7 @@ describe("E2E Tests for API Routes", () => { ); expect(childrenLinks.length).toBe(completedResponse.body.data.length); - }, 240000); // 120 seconds + }, 180000); // 120 seconds it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => { const crawlResponse = await request(TEST_URL) @@ -1012,7 +963,7 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - }, 240000); // 120 seconds + }, 180000); // 120 seconds it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { @@ -1062,7 +1013,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(2); }); - }, 240000); + }, 180000); it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (2)", async () => { const crawlResponse = await request(TEST_URL) diff --git a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts new file mode 100644 index 0000000..6f7b632 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts @@ -0,0 +1,163 @@ +// crawler.test.ts +import { WebCrawler } from '../crawler'; +import axios from 'axios'; +import robotsParser from 'robots-parser'; +import { getAdjustedMaxDepth } from '../utils/maxDepthUtils'; + +jest.mock('axios'); +jest.mock('robots-parser'); + +describe('WebCrawler maxDepth and filterLinks', () => { + let crawler: WebCrawler; + const mockAxios = axios as jest.Mocked; + const mockRobotsParser = robotsParser as jest.MockedFunction; + + let maxCrawledDepth: number; + + beforeEach(() => { + // Setup default mocks + mockAxios.get.mockImplementation((url) => { + if (url.includes('robots.txt')) { + return Promise.resolve({ data: 'User-agent: *\nAllow: /' }); + } else if (url.includes('sitemap.xml')) { + return Promise.resolve({ data: 'sitemap content' }); // You would normally parse this to URLs + } + return Promise.resolve({ data: '' }); + }); + + mockRobotsParser.mockReturnValue({ + isAllowed: jest.fn().mockReturnValue(true), + isDisallowed: jest.fn().mockReturnValue(false), + getMatchingLineNumber: jest.fn().mockReturnValue(0), + getCrawlDelay: jest.fn().mockReturnValue(0), + getSitemaps: jest.fn().mockReturnValue([]), + getPreferredHost: jest.fn().mockReturnValue('example.com') + }); + }); + + it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 0 ', async () => { + const initialUrl = 'http://example.com'; // Set initial URL for this test + const enteredMaxCrawledDepth = 2; + maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); + + + crawler = new 
WebCrawler({ + initialUrl: initialUrl, + includes: [], + excludes: [], + limit: 100, + maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing + }); + + // Mock sitemap fetching function to return controlled links + crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([ + initialUrl, // depth 0 + initialUrl + '/page1', // depth 1 + initialUrl + '/page1/page2', // depth 2 + initialUrl + '/page1/page2/page3' // depth 3, should be filtered out + ]); + + const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth); + expect(results).toEqual([ + { url: initialUrl, html: '' }, + { url: initialUrl + '/page1', html: '' }, + { url: initialUrl + '/page1/page2', html: '' } + ]); + + + // Ensure that the link with depth 3 is not included + expect(results.some(r => r.url === initialUrl + '/page1/page2/page3')).toBe(false); + }); + + it('should filter out links that exceed maxDepth param of 0 based on enterURL depth of 0 ', async () => { + const initialUrl = 'http://example.com'; // Set initial URL for this test + const enteredMaxCrawledDepth = 0; + maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); + console.log(maxCrawledDepth); + + crawler = new WebCrawler({ + initialUrl: initialUrl, + includes: [], + excludes: [], + limit: 100, + maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing + }); + + // Mock sitemap fetching function to return controlled links + crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([ + initialUrl, // depth 0 + initialUrl + '/page1', // depth 1 + initialUrl + '/page1/page2', // depth 2 + initialUrl + '/page1/page2/page3' // depth 3, should be filtered out + ]); + + const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth); + expect(results).toEqual([ + { url: initialUrl, html: '' }, + ]); + }); + + it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 1 ', async () => { + const initialUrl = 'http://example.com/page1'; // Set initial URL for this test + const enteredMaxCrawledDepth = 1; + maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); + console.log(maxCrawledDepth); + + crawler = new WebCrawler({ + initialUrl: initialUrl, + includes: [], + excludes: [], + limit: 100, + maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing + }); + + // Mock sitemap fetching function to return controlled links + crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([ + initialUrl, // depth 0 + initialUrl + '/page2', // depth 1 + initialUrl + '/page2/page3', // depth 2 + initialUrl + '/page2/page3/page4' // depth 3, should be filtered out + ]); + + const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth); + expect(results).toEqual([ + { url: initialUrl, html: '' }, + { url: initialUrl + '/page2', html: '' } + ]); + }); + + it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 2 ', async () => { + const initialUrl = 'http://example.com/page1'; // Set initial URL for this test + const enteredMaxCrawledDepth = 2; + maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); + console.log(maxCrawledDepth); + + crawler = new WebCrawler({ + initialUrl: initialUrl, + includes: [], + excludes: [], + limit: 100, + maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing + }); + + // Mock sitemap fetching function to return controlled links + crawler['tryFetchSitemapLinks'] = 
jest.fn().mockResolvedValue([ + initialUrl, // depth 0 + initialUrl + '/page2', // depth 1 + initialUrl + '/page2/page3', // depth 2 + initialUrl + '/page2/page3/page4' // depth 3, should be filtered out + ]); + + const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth); + expect(results).toEqual([ + { url: initialUrl, html: '' }, + { url: initialUrl + '/page2', html: '' }, + { url: initialUrl + '/page2/page3', html: '' } + ]); + }); + + + + // Add more tests to cover other scenarios, such as checking includes and excludes +}); + diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 3171ec7..e1a5e05 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -6,6 +6,7 @@ import async from "async"; import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities"; import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url"; import robotsParser from "robots-parser"; +import { getURLDepth } from "./utils/maxDepthUtils"; export class WebCrawler { private initialUrl: string; @@ -61,8 +62,7 @@ export class WebCrawler { const url = new URL(link); const path = url.pathname; - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) -1; + const depth = getURLDepth(url.toString()); // Check if the link exceeds the maximum depth allowed diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index c19711d..21301af 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -18,6 +18,7 @@ import { import { generateCompletions } from "../../lib/LLM-extraction"; import { getWebScraperQueue } from "../../../src/services/queue-service"; import { fetchAndProcessDocx } from "./utils/docxProcessor"; +import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils"; export class WebScraperDataProvider { private bullJobId: string; @@ -163,16 +164,12 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { - const pathSplits = new URL(this.urls[0]).pathname.split('/'); - const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0) -1; - const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth; - const crawler = new WebCrawler({ initialUrl: this.urls[0], includes: this.includes, excludes: this.excludes, maxCrawledLinks: this.maxCrawledLinks, - maxCrawledDepth: adjustedMaxDepth, + maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth), limit: this.limit, generateImgAltText: this.generateImgAltText, allowBackwardCrawling: this.allowBackwardCrawling, @@ -580,8 +577,7 @@ export class WebScraperDataProvider { filterDepth(documents: Document[]): Document[] { return documents.filter((document) => { const url = new URL(document.metadata.sourceURL); - const path = url.pathname; - return path.split("/").length <= this.maxCrawledDepth; + return getURLDepth(url.toString()) <= this.maxCrawledDepth; }); } } diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts new file mode 100644 index 0000000..e77242c --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts @@ -0,0 +1,37 @@ +import { getURLDepth, getAdjustedMaxDepth } from '../maxDepthUtils'; + +describe('Testing getURLDepth and getAdjustedMaxDepth', () => { + it('should return 0 for root - mendable.ai', () => { + const enteredURL = "https://www.mendable.ai/" + expect(getURLDepth(enteredURL)).toBe(0); + }); + + it('should return 0 for root - scrapethissite.com', () => { + const enteredURL = "https://scrapethissite.com/" + expect(getURLDepth(enteredURL)).toBe(0); + }); + + it('should return 1 for scrapethissite.com/pages', () => { + const enteredURL = "https://scrapethissite.com/pages" + expect(getURLDepth(enteredURL)).toBe(1); + }); + + it('should return 2 for scrapethissite.com/pages/articles', () => { + const enteredURL = "https://scrapethissite.com/pages/articles" + expect(getURLDepth(enteredURL)).toBe(2); + + }); + + it('Adjusted maxDepth should return 1 for scrapethissite.com', () => { + const enteredURL = "https://scrapethissite.com" + expect(getAdjustedMaxDepth(enteredURL, 1)).toBe(1); + + }); + + it('Adjusted maxDepth should return 5 for scrapethissite.com/pages/articles', () => { + const enteredURL = "https://scrapethissite.com/pages/articles" + expect(getAdjustedMaxDepth(enteredURL, 2)).toBe(5); + }); + + +}); diff --git a/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts b/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts new file mode 100644 index 0000000..c1fea7f --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts @@ -0,0 +1,12 @@ + + +export function getAdjustedMaxDepth(url: string, maxCrawlDepth: number): number { + const baseURLDepth = getURLDepth(url); + const adjustedMaxDepth = maxCrawlDepth + baseURLDepth; + return adjustedMaxDepth; +} + +export function getURLDepth(url: string): number { + const pathSplits = new URL(url).pathname.split('/'); + return pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0) - 1; +} From ff7b52cab1862ba2cbbf18f78c3d69da18f78e96 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Sat, 15 Jun 2024 16:51:50 -0400 Subject: [PATCH 5/6] Delete one more e2e test --- .../src/__tests__/e2e_withAuth/index.test.ts | 48 ------------------- 1 file changed, 48 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index c11f398..106528c 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -966,54 +966,6 @@ describe("E2E Tests for API Routes", () => { }, 180000); // 120 seconds - it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 1 }, - }); - expect(crawlResponse.statusCode).toBe(200); - - let isCompleted = false; - let completedResponse; - - while (!isCompleted) { - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - - if (response.body.status === "completed") { - isCompleted = true; - completedResponse = response; - } - } - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(1); - - // Check if all URLs have a maximum depth of 1 - urls.forEach((url) => { - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); - expect(depth).toBeLessThanOrEqual(2); - }); - }, 180000); it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (2)", async () => { const crawlResponse = await request(TEST_URL) From 519ab1aecb6607659c0335f79db3e1df3ffbc7c8 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Sat, 15 Jun 2024 17:14:09 -0400 Subject: [PATCH 6/6] Update unit tests --- .../api/src/__tests__/e2e_withAuth/index.test.ts | 2 +- .../scraper/WebScraper/__tests__/crawler.test.ts | 6 +++--- .../utils/__tests__/maxDepthUtils.test.ts | 16 +++++++++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 3b27cbf..9f04093 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -655,7 +655,7 @@ describe("E2E Tests for API Routes", () => { const testurls = completedResponse.body.data.map( (item: any) => item.metadata?.sourceURL ); - console.log(testurls) + //console.log(testurls) expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); diff --git a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts index 6f7b632..c7c54aa 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts @@ -73,7 +73,7 @@ describe('WebCrawler maxDepth and filterLinks', () => { const initialUrl = 'http://example.com'; // Set initial URL for this test const enteredMaxCrawledDepth = 0; maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); - console.log(maxCrawledDepth); + crawler = new WebCrawler({ initialUrl: initialUrl, @@ -101,7 +101,7 @@ describe('WebCrawler maxDepth and filterLinks', () => { const initialUrl = 'http://example.com/page1'; // Set initial URL for this test const enteredMaxCrawledDepth = 1; maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); - console.log(maxCrawledDepth); + crawler = new WebCrawler({ initialUrl: initialUrl, @@ -130,7 +130,7 @@ describe('WebCrawler maxDepth and filterLinks', () => { const initialUrl = 'http://example.com/page1'; // Set initial URL for this test const enteredMaxCrawledDepth = 2; maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); - console.log(maxCrawledDepth); + crawler = new WebCrawler({ initialUrl: initialUrl, diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts index e77242c..863a689 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts @@ -22,15 +22,25 @@ describe('Testing getURLDepth and getAdjustedMaxDepth', () => { }); - it('Adjusted maxDepth should return 1 for scrapethissite.com', () => { + it('Adjusted maxDepth should return 1 for scrapethissite.com and max depth param of 1', () => { const enteredURL = "https://scrapethissite.com" expect(getAdjustedMaxDepth(enteredURL, 1)).toBe(1); }); + it('Adjusted maxDepth should return 0 for scrapethissite.com and max depth param of 0', () => { + const enteredURL = "https://scrapethissite.com" + expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0); - it('Adjusted maxDepth should return 5 for scrapethissite.com/pages/articles', () => { + }); + + 
it('Adjusted maxDepth should return 0 for mendable.ai and max depth param of 0', () => { + const enteredURL = "https://mendable.ai" + expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0); + }); + + it('Adjusted maxDepth should return 4 for scrapethissite.com/pages/articles and max depth param of 2', () => { const enteredURL = "https://scrapethissite.com/pages/articles" - expect(getAdjustedMaxDepth(enteredURL, 2)).toBe(5); + expect(getAdjustedMaxDepth(enteredURL, 2)).toBe(4); });
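Note (not part of the patches above): the depth semantics this series settles on are that getURLDepth counts the non-empty path segments of a URL's pathname (site root = depth 0), and getAdjustedMaxDepth shifts the user-supplied maxDepth by the entered URL's own depth so a crawl started below the root keeps its relative meaning. The following is a minimal standalone sketch that mirrors the helpers added in apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts and prints the values asserted by the unit tests in PATCH 4/6 and PATCH 6/6; it is illustrative only (run it e.g. with ts-node), not an additional change to the codebase.

// Sketch mirroring getURLDepth / getAdjustedMaxDepth from maxDepthUtils.ts.
// new URL(url).pathname always starts with '/', so the first split element is
// empty and is discounted by the trailing "- 1"; if the path also ends with '/'
// (including the bare root "/"), the last element is empty too and the ternary
// discounts it as well.
function getURLDepth(url: string): number {
  const pathSplits = new URL(url).pathname.split('/');
  return pathSplits.length -
    (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) - 1;
}

// The requested maxDepth is relative to the entered URL: a crawl entered at
// /pages/articles (depth 2) with maxDepth 2 may reach absolute depth 4.
function getAdjustedMaxDepth(url: string, maxCrawlDepth: number): number {
  return maxCrawlDepth + getURLDepth(url);
}

// Expected outputs match the assertions in maxDepthUtils.test.ts:
console.log(getURLDepth("https://www.mendable.ai/"));                             // 0
console.log(getURLDepth("https://scrapethissite.com/pages"));                     // 1
console.log(getURLDepth("https://scrapethissite.com/pages/articles"));            // 2
console.log(getAdjustedMaxDepth("https://scrapethissite.com", 1));                // 1
console.log(getAdjustedMaxDepth("https://scrapethissite.com", 0));                // 0
console.log(getAdjustedMaxDepth("https://scrapethissite.com/pages/articles", 2)); // 4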