Merge branch 'main' into nsc/rate-limiter-tests

commit ab038051e9
@@ -619,12 +619,13 @@ describe("E2E Tests for API Routes", () => {
     }, 180000);

     it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => {
+
       const crawlResponse = await request(TEST_URL)
         .post("/v0/crawl")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
         .send({
-          url: "https://www.scrapethissite.com",
+          url: "https://www.mendable.ai",
           crawlerOptions: { maxDepth: 0 },
         });
       expect(crawlResponse.statusCode).toBe(200);
@@ -651,6 +652,11 @@ describe("E2E Tests for API Routes", () => {
         .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

+      const testurls = completedResponse.body.data.map(
+        (item: any) => item.metadata?.sourceURL
+      );
+      //console.log(testurls)
+
       expect(completedResponse.statusCode).toBe(200);
       expect(completedResponse.body).toHaveProperty("status");
       expect(completedResponse.body.status).toBe("completed");
@@ -671,58 +677,9 @@ describe("E2E Tests for API Routes", () => {
       });
     }, 180000);

-    it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepth equals to 2", async () => {
-      const crawlResponse = await request(TEST_URL)
-        .post("/v0/crawl")
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-        .set("Content-Type", "application/json")
-        .send({
-          url: "proxyway.com",
-          crawlerOptions: { maxDepth: 2, limit: 5 },
-        });
-      expect(crawlResponse.statusCode).toBe(200);

-      const response = await request(TEST_URL)
-        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-      expect(response.statusCode).toBe(200);
-      expect(response.body).toHaveProperty("status");
-      expect(["active", "waiting"]).toContain(response.body.status);
-      // wait for 60 seconds
-      let isCompleted = false;
-      while (!isCompleted) {
-        const statusCheckResponse = await request(TEST_URL)
-          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-        expect(statusCheckResponse.statusCode).toBe(200);
-        isCompleted = statusCheckResponse.body.status === "completed";
-        if (!isCompleted) {
-          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
-        }
-      }
-      const completedResponse = await request(TEST_URL)
-        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

-      expect(completedResponse.statusCode).toBe(200);
-      expect(completedResponse.body).toHaveProperty("status");
-      expect(completedResponse.body.status).toBe("completed");
-      expect(completedResponse.body).toHaveProperty("data");
-      expect(completedResponse.body.data[0]).toHaveProperty("content");
-      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
-      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-      const urls = completedResponse.body.data.map(
-        (item: any) => item.metadata?.sourceURL
-      );
-      expect(urls.length).toBeGreaterThanOrEqual(1);

-      // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1
-      urls.forEach((url: string) => {
-        const pathSplits = new URL(url).pathname.split('/');
-        const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
-        expect(depth).toBeLessThanOrEqual(3);
-      });
-    }, 180000);

     // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => {
     //   const crawlResponse = await request(TEST_URL)
@@ -1009,54 +966,6 @@ describe("E2E Tests for API Routes", () => {
     }, 180000); // 120 seconds


-    it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
-      const crawlResponse = await request(TEST_URL)
-        .post("/v0/crawl")
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-        .set("Content-Type", "application/json")
-        .send({
-          url: "https://www.scrapethissite.com",
-          crawlerOptions: { maxDepth: 1 },
-        });
-      expect(crawlResponse.statusCode).toBe(200);
-
-      let isCompleted = false;
-      let completedResponse;
-
-      while (!isCompleted) {
-        const response = await request(TEST_URL)
-          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-        expect(response.statusCode).toBe(200);
-        expect(response.body).toHaveProperty("status");
-
-        if (response.body.status === "completed") {
-          isCompleted = true;
-          completedResponse = response;
-        }
-      }
-      expect(completedResponse.statusCode).toBe(200);
-      expect(completedResponse.body).toHaveProperty("status");
-      expect(completedResponse.body.status).toBe("completed");
-      expect(completedResponse.body).toHaveProperty("data");
-      expect(completedResponse.body.data[0]).toHaveProperty("content");
-      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
-      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
-      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
-
-      const urls = completedResponse.body.data.map(
-        (item: any) => item.metadata?.sourceURL
-      );
-      expect(urls.length).toBeGreaterThan(1);
-
-      // Check if all URLs have a maximum depth of 1
-      urls.forEach((url) => {
-        const pathSplits = new URL(url).pathname.split('/');
-        const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
-        expect(depth).toBeLessThanOrEqual(2);
-      });
-    }, 180000);

     it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (2)", async () => {
       const crawlResponse = await request(TEST_URL)
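
Note on the depth expression in the two tests removed above: the split on "/" leaves empty strings at both ends of the pathname, which is what the ternary compensates for. A minimal standalone sketch of that expression (the pathDepth name is illustrative, not from the diff):

// Sketch of the inline depth check used by the removed E2E tests.
function pathDepth(urlString: string): number {
  const pathSplits = new URL(urlString).pathname.split('/');
  // "/" splits into ["", ""], "/pages" into ["", "pages"]; subtract one only when both
  // the first and the last segment are empty (a bare "/" or a trailing slash).
  return pathSplits.length -
    (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
}

console.log(pathDepth("https://www.scrapethissite.com/"));      // 1 (site root)
console.log(pathDepth("https://www.scrapethissite.com/pages")); // 2

The getURLDepth helper added later in this diff uses the same split but subtracts one more, so the site root counts as depth 0.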

apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts (new file, 163 lines)
@@ -0,0 +1,163 @@
+// crawler.test.ts
+import { WebCrawler } from '../crawler';
+import axios from 'axios';
+import robotsParser from 'robots-parser';
+import { getAdjustedMaxDepth } from '../utils/maxDepthUtils';
+
+jest.mock('axios');
+jest.mock('robots-parser');
+
+describe('WebCrawler maxDepth and filterLinks', () => {
+  let crawler: WebCrawler;
+  const mockAxios = axios as jest.Mocked<typeof axios>;
+  const mockRobotsParser = robotsParser as jest.MockedFunction<typeof robotsParser>;
+
+  let maxCrawledDepth: number;
+
+  beforeEach(() => {
+    // Setup default mocks
+    mockAxios.get.mockImplementation((url) => {
+      if (url.includes('robots.txt')) {
+        return Promise.resolve({ data: 'User-agent: *\nAllow: /' });
+      } else if (url.includes('sitemap.xml')) {
+        return Promise.resolve({ data: 'sitemap content' }); // You would normally parse this to URLs
+      }
+      return Promise.resolve({ data: '<html></html>' });
+    });
+
+    mockRobotsParser.mockReturnValue({
+      isAllowed: jest.fn().mockReturnValue(true),
+      isDisallowed: jest.fn().mockReturnValue(false),
+      getMatchingLineNumber: jest.fn().mockReturnValue(0),
+      getCrawlDelay: jest.fn().mockReturnValue(0),
+      getSitemaps: jest.fn().mockReturnValue([]),
+      getPreferredHost: jest.fn().mockReturnValue('example.com')
+    });
+  });
+
+  it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 0', async () => {
+    const initialUrl = 'http://example.com'; // Set initial URL for this test
+    const enteredMaxCrawledDepth = 2;
+    maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
+
+    crawler = new WebCrawler({
+      initialUrl: initialUrl,
+      includes: [],
+      excludes: [],
+      limit: 100,
+      maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
+    });
+
+    // Mock sitemap fetching function to return controlled links
+    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
+      initialUrl, // depth 0
+      initialUrl + '/page1', // depth 1
+      initialUrl + '/page1/page2', // depth 2
+      initialUrl + '/page1/page2/page3' // depth 3, should be filtered out
+    ]);
+
+    const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
+    expect(results).toEqual([
+      { url: initialUrl, html: '' },
+      { url: initialUrl + '/page1', html: '' },
+      { url: initialUrl + '/page1/page2', html: '' }
+    ]);
+
+    // Ensure that the link with depth 3 is not included
+    expect(results.some(r => r.url === initialUrl + '/page1/page2/page3')).toBe(false);
+  });
+
+  it('should filter out links that exceed maxDepth param of 0 based on enterURL depth of 0', async () => {
+    const initialUrl = 'http://example.com'; // Set initial URL for this test
+    const enteredMaxCrawledDepth = 0;
+    maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
+
+    crawler = new WebCrawler({
+      initialUrl: initialUrl,
+      includes: [],
+      excludes: [],
+      limit: 100,
+      maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
+    });
+
+    // Mock sitemap fetching function to return controlled links
+    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
+      initialUrl, // depth 0
+      initialUrl + '/page1', // depth 1
+      initialUrl + '/page1/page2', // depth 2
+      initialUrl + '/page1/page2/page3' // depth 3, should be filtered out
+    ]);
+
+    const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
+    expect(results).toEqual([
+      { url: initialUrl, html: '' },
+    ]);
+  });
+
+  it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 1', async () => {
+    const initialUrl = 'http://example.com/page1'; // Set initial URL for this test
+    const enteredMaxCrawledDepth = 1;
+    maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
+
+    crawler = new WebCrawler({
+      initialUrl: initialUrl,
+      includes: [],
+      excludes: [],
+      limit: 100,
+      maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
+    });
+
+    // Mock sitemap fetching function to return controlled links
+    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
+      initialUrl, // depth 0
+      initialUrl + '/page2', // depth 1
+      initialUrl + '/page2/page3', // depth 2
+      initialUrl + '/page2/page3/page4' // depth 3, should be filtered out
+    ]);
+
+    const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
+    expect(results).toEqual([
+      { url: initialUrl, html: '' },
+      { url: initialUrl + '/page2', html: '' }
+    ]);
+  });
+
+  it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 1', async () => {
+    const initialUrl = 'http://example.com/page1'; // Set initial URL for this test
+    const enteredMaxCrawledDepth = 2;
+    maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
+
+    crawler = new WebCrawler({
+      initialUrl: initialUrl,
+      includes: [],
+      excludes: [],
+      limit: 100,
+      maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
+    });
+
+    // Mock sitemap fetching function to return controlled links
+    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
+      initialUrl, // depth 0
+      initialUrl + '/page2', // depth 1
+      initialUrl + '/page2/page3', // depth 2
+      initialUrl + '/page2/page3/page4' // depth 3, should be filtered out
+    ]);
+
+    const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
+    expect(results).toEqual([
+      { url: initialUrl, html: '' },
+      { url: initialUrl + '/page2', html: '' },
+      { url: initialUrl + '/page2/page3', html: '' }
+    ]);
+  });
+
+  // Add more tests to cover other scenarios, such as checking includes and excludes
+});
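
The new crawler.test.ts never touches the network: axios and robots-parser are jest-mocked, and the private sitemap fetcher is swapped out through bracket access on the instance. A minimal, self-contained sketch of that stubbing pattern, assuming the usual Jest globals (the Fetcher class and its method names are made up for illustration; they are not part of the diff):

// TypeScript only enforces `private` on dotted access, so a test can replace a private
// method through an index expression, exactly as crawler['tryFetchSitemapLinks'] does above.
class Fetcher {
  async listLinks(base: string): Promise<string[]> {
    return this.fetchSitemap(base); // would normally hit the network
  }
  private async fetchSitemap(base: string): Promise<string[]> {
    throw new Error('network disabled in tests');
  }
}

test('listLinks uses the stubbed sitemap fetcher', async () => {
  const fetcher = new Fetcher();
  fetcher['fetchSitemap'] = jest.fn().mockResolvedValue(['http://example.com/page1']);
  await expect(fetcher.listLinks('http://example.com')).resolves.toEqual([
    'http://example.com/page1',
  ]);
});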

apps/api/src/scraper/WebScraper/crawler.ts
@@ -6,6 +6,7 @@ import async from "async";
 import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
+import { getURLDepth } from "./utils/maxDepthUtils";

 export class WebCrawler {
   private initialUrl: string;
@@ -60,8 +61,10 @@ export class WebCrawler {
       .filter((link) => {
         const url = new URL(link);
         const path = url.pathname;
-        const depth = url.pathname.split('/').length - 1;
+        const depth = getURLDepth(url.toString());
+

         // Check if the link exceeds the maximum depth allowed
         if (depth > maxDepth) {
           return false;
@@ -136,8 +139,10 @@ export class WebCrawler {

     if(!crawlerOptions?.ignoreSitemap){
       const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
+
       if (sitemapLinks.length > 0) {
         let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
+
         return filteredLinks.map(link => ({ url: link, html: "" }));
       }
     }
@@ -148,6 +153,7 @@ export class WebCrawler {
       concurrencyLimit,
       inProgress
     );
+

     if (
       urls.length === 0 &&
@@ -224,11 +230,10 @@ export class WebCrawler {
   }

   async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
-    const normalizedUrl = this.normalizeCrawlUrl(url);
-    if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
+    if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
       return [];
     }
-    this.visited.add(normalizedUrl);
+    this.visited.add(url);

     if (!url.startsWith("http")) {
       url = "https://" + url;
@@ -276,15 +281,16 @@ export class WebCrawler {
           const urlObj = new URL(fullUrl);
           const path = urlObj.pathname;
+

           if (
             this.isInternalLink(fullUrl) &&
-            this.matchesPattern(fullUrl) &&
             this.noSections(fullUrl) &&
             // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
             // this.matchesIncludes(path) &&
             !this.matchesExcludes(path) &&
-            this.robots.isAllowed(fullUrl, "FireCrawlAgent")
+            this.isRobotsAllowed(fullUrl)
           ) {
+
             links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
           }
         }
@@ -294,12 +300,15 @@ export class WebCrawler {
         return links;
       }
       // Create a new list to return to avoid modifying the visited list
-      return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
+      return links.filter((link) => !this.visited.has(link.url));
     } catch (error) {
       return [];
     }
   }

+  private isRobotsAllowed(url: string): boolean {
+    return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
+  }
   private normalizeCrawlUrl(url: string): string {
     try{
       const urlObj = new URL(url);
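
The isRobotsAllowed helper added in this hunk defaults to "allowed" both when no robots parser has been initialised yet and when robots-parser returns undefined for a URL it has no rule for. A small sketch of that fallback with a made-up robots.txt (the constructor call is standard robots-parser usage, not something shown in this diff):

import robotsParser from 'robots-parser';

const robots = robotsParser(
  'http://example.com/robots.txt',
  'User-agent: *\nDisallow: /private'
);

// Mirrors the guard added above: a missing parser or an undefined answer counts as allowed.
function isRobotsAllowed(url: string): boolean {
  return robots ? (robots.isAllowed(url, 'FireCrawlAgent') ?? true) : true;
}

console.log(isRobotsAllowed('http://example.com/docs'));    // true
console.log(isRobotsAllowed('http://example.com/private')); // false
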
@@ -326,12 +335,10 @@ export class WebCrawler {

   private isInternalLink(link: string): boolean {
     const urlObj = new URL(link, this.baseUrl);
-    const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, "");
-    return urlObj.hostname === domainWithoutProtocol;
-  }
-
-  private matchesPattern(link: string): boolean {
-    return true; // Placeholder for future pattern matching implementation
+    const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
+    const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
+    return linkDomain === baseDomain;
   }

   private isFile(url: string): boolean {
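
The reworked isInternalLink treats www and non-www hosts of the same domain as internal by stripping the protocol and a leading "www." from both sides before comparing. A standalone sketch of that comparison, assuming the base URL is a bare origin such as "https://www.mendable.ai" with no path:

function isInternalLink(link: string, baseUrl: string): boolean {
  const urlObj = new URL(link, baseUrl);
  const baseDomain = baseUrl.replace(/^https?:\/\//, '').replace(/^www\./, '').trim();
  const linkDomain = urlObj.hostname.replace(/^www\./, '').trim();
  return linkDomain === baseDomain;
}

console.log(isInternalLink('https://mendable.ai/pricing', 'https://www.mendable.ai'));  // true
console.log(isInternalLink('https://github.com/anything', 'https://www.mendable.ai')); // false
console.log(isInternalLink('/blog', 'https://www.mendable.ai'));                       // true (relative links resolve against the base)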

apps/api/src/scraper/WebScraper/index.ts
@@ -18,6 +18,7 @@ import {
 import { generateCompletions } from "../../lib/LLM-extraction";
 import { getWebScraperQueue } from "../../../src/services/queue-service";
 import { fetchAndProcessDocx } from "./utils/docxProcessor";
+import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";

 export class WebScraperDataProvider {
   private bullJobId: string;
@@ -163,16 +164,12 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {

-    const pathSplits = new URL(this.urls[0]).pathname.split('/');
-    const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
-    const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth;
-
     const crawler = new WebCrawler({
       initialUrl: this.urls[0],
       includes: this.includes,
       excludes: this.excludes,
       maxCrawledLinks: this.maxCrawledLinks,
-      maxCrawledDepth: adjustedMaxDepth,
+      maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
       limit: this.limit,
       generateImgAltText: this.generateImgAltText,
       allowBackwardCrawling: this.allowBackwardCrawling,
@@ -580,8 +577,7 @@ export class WebScraperDataProvider {
   filterDepth(documents: Document[]): Document[] {
     return documents.filter((document) => {
       const url = new URL(document.metadata.sourceURL);
-      const path = url.pathname;
-      return path.split("/").length <= this.maxCrawledDepth;
+      return getURLDepth(url.toString()) <= this.maxCrawledDepth;
     });
   }
 }
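
filterDepth (and the crawler's link filter earlier in this diff) now measures depth with getURLDepth instead of counting raw pathname segments, which shifts every count down. A small sketch of the difference for one URL, with the new expression copied from maxDepthUtils.ts below:

const url = new URL('https://example.com/a/b');

// Old check in filterDepth: raw segment count ("/a/b" -> ["", "a", "b"] -> 3).
const oldDepth = url.pathname.split('/').length;

// New check: getURLDepth-style count, where the site root is 0 and "/a/b" is 2.
const pathSplits = url.pathname.split('/');
const newDepth =
  pathSplits.length -
  (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) - 1;

console.log(oldDepth, newDepth); // 3 2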

apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts (new file)
@@ -0,0 +1,47 @@
+import { getURLDepth, getAdjustedMaxDepth } from '../maxDepthUtils';
+
+describe('Testing getURLDepth and getAdjustedMaxDepth', () => {
+  it('should return 0 for root - mendable.ai', () => {
+    const enteredURL = "https://www.mendable.ai/"
+    expect(getURLDepth(enteredURL)).toBe(0);
+  });
+
+  it('should return 0 for root - scrapethissite.com', () => {
+    const enteredURL = "https://scrapethissite.com/"
+    expect(getURLDepth(enteredURL)).toBe(0);
+  });
+
+  it('should return 1 for scrapethissite.com/pages', () => {
+    const enteredURL = "https://scrapethissite.com/pages"
+    expect(getURLDepth(enteredURL)).toBe(1);
+  });
+
+  it('should return 2 for scrapethissite.com/pages/articles', () => {
+    const enteredURL = "https://scrapethissite.com/pages/articles"
+    expect(getURLDepth(enteredURL)).toBe(2);
+  });
+
+  it('Adjusted maxDepth should return 1 for scrapethissite.com and max depth param of 1', () => {
+    const enteredURL = "https://scrapethissite.com"
+    expect(getAdjustedMaxDepth(enteredURL, 1)).toBe(1);
+  });
+
+  it('Adjusted maxDepth should return 0 for scrapethissite.com and max depth param of 0', () => {
+    const enteredURL = "https://scrapethissite.com"
+    expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0);
+  });
+
+  it('Adjusted maxDepth should return 0 for mendable.ai and max depth param of 0', () => {
+    const enteredURL = "https://mendable.ai"
+    expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0);
+  });
+
+  it('Adjusted maxDepth should return 4 for scrapethissite.com/pages/articles and max depth param of 2', () => {
+    const enteredURL = "https://scrapethissite.com/pages/articles"
+    expect(getAdjustedMaxDepth(enteredURL, 2)).toBe(4);
+  });
+
+});

apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts (new file)
@@ -0,0 +1,12 @@
+
+
+export function getAdjustedMaxDepth(url: string, maxCrawlDepth: number): number {
+  const baseURLDepth = getURLDepth(url);
+  const adjustedMaxDepth = maxCrawlDepth + baseURLDepth;
+  return adjustedMaxDepth;
+}
+
+export function getURLDepth(url: string): number {
+  const pathSplits = new URL(url).pathname.split('/');
+  return pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) - 1;
+}
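
A short worked example of the two helpers, using the same URLs the unit tests above assert on (the relative import path is an assumption for the sketch):

import { getURLDepth, getAdjustedMaxDepth } from './maxDepthUtils';

// getURLDepth counts path segments, so a site root is depth 0:
console.log(getURLDepth('https://www.mendable.ai/'));                  // 0
console.log(getURLDepth('https://scrapethissite.com/pages'));          // 1
console.log(getURLDepth('https://scrapethissite.com/pages/articles')); // 2

// getAdjustedMaxDepth adds the entry URL's own depth to the requested maxDepth, so a crawl
// started at /pages/articles with maxDepth 2 keeps links up to absolute depth 4:
console.log(getAdjustedMaxDepth('https://scrapethissite.com/pages/articles', 2)); // 4
console.log(getAdjustedMaxDepth('https://scrapethissite.com', 0));                // 0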

File diff suppressed because it is too large