From 3091f0134cc95f47fe7d993b5fab5536868dd29e Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 10 Jun 2024 16:27:10 -0700
Subject: [PATCH 1/2] Nick:

---
 apps/api/src/scraper/WebScraper/crawler.ts | 14 +++++++++-----
 apps/api/src/scraper/WebScraper/index.ts   |  1 +
 apps/api/src/scraper/WebScraper/sitemap.ts |  3 +++
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 9340aa8..ee9baff 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -3,7 +3,7 @@
 import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import async from "async";
-import { Progress } from "../../lib/entities";
+import { PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
@@ -108,6 +108,7 @@ export class WebCrawler {
 
   public async start(
     inProgress?: (progress: Progress) => void,
+    pageOptions?: PageOptions,
     concurrencyLimit: number = 5,
     limit: number = 10000,
     maxDepth: number = 10
@@ -130,6 +131,7 @@ export class WebCrawler {
 
     const urls = await this.crawlUrls(
       [this.initialUrl],
+      pageOptions,
       concurrencyLimit,
       inProgress
     );
@@ -148,6 +150,7 @@ export class WebCrawler {
 
   private async crawlUrls(
     urls: string[],
+    pageOptions: PageOptions,
     concurrencyLimit: number,
     inProgress?: (progress: Progress) => void,
   ): Promise<{ url: string, html: string }[]> {
@@ -158,7 +161,7 @@ export class WebCrawler {
         }
         return;
       }
-      const newUrls = await this.crawl(task);
+      const newUrls = await this.crawl(task, pageOptions);
       // add the initial url if not already added
       // if (this.visited.size === 1) {
       //   let normalizedInitial = this.initialUrl;
@@ -188,7 +191,7 @@ export class WebCrawler {
           currentDocumentUrl: task,
         });
       }
-      await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
+      await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
       if (callback && typeof callback === "function") {
         callback();
       }
@@ -207,7 +210,7 @@ export class WebCrawler {
     return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
   }
 
-  async crawl(url: string): Promise<{url: string, html: string}[]> {
+  async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
     if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
       return [];
     }
@@ -231,7 +234,8 @@
       let content : string = "";
       // If it is the first link, fetch with single url
       if (this.visited.size === 1) {
-        const page = await scrapSingleUrl(url, {includeHtml: true});
+        console.log(pageOptions)
+        const page = await scrapSingleUrl(url, {...pageOptions, includeHtml: true});
         content = page.html ?? ""
       } else {
         const response = await axios.get(url);
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index e3a3cc6..824ec06 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -173,6 +173,7 @@ export class WebScraperDataProvider {
 
     let links = await crawler.start(
       inProgress,
+      this.pageOptions,
       5,
       this.limit,
       this.maxCrawledDepth
diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts
index 0ac4338..5a89183 100644
--- a/apps/api/src/scraper/WebScraper/sitemap.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap.ts
@@ -12,6 +12,8 @@ export async function getLinksFromSitemap(
       content = response.data;
     } catch (error) {
       console.error(`Request failed for ${sitemapUrl}: ${error}`);
+      console.log(allUrls)
+      return allUrls;
     }
 
@@ -34,6 +36,7 @@ export async function getLinksFromSitemap(
   } catch (error) {
     console.error(`Error processing ${sitemapUrl}: ${error}`);
   }
+  console.log(allUrls)
   return allUrls;
 }

From f6b06ac27a829172416419c4fff02d0f71579050 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 10 Jun 2024 18:12:41 -0700
Subject: [PATCH 2/2] Nick: ignoreSitemap, better crawling algo

---
 apps/api/src/lib/entities.ts               | 25 +++++----
 apps/api/src/scraper/WebScraper/crawler.ts | 65 ++++++++++++----------
 apps/api/src/scraper/WebScraper/index.ts   |  6 ++
 apps/api/src/scraper/WebScraper/sitemap.ts |  2 -
 4 files changed, 57 insertions(+), 41 deletions(-)

diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 5511623..744c07b 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -35,20 +35,23 @@ export type SearchOptions = {
   location?: string;
 };
 
+export type CrawlerOptions = {
+  returnOnlyUrls?: boolean;
+  includes?: string[];
+  excludes?: string[];
+  maxCrawledLinks?: number;
+  maxDepth?: number;
+  limit?: number;
+  generateImgAltText?: boolean;
+  replaceAllPathsWithAbsolutePaths?: boolean;
+  ignoreSitemap?: boolean;
+  mode?: "default" | "fast"; // have a mode of some sort
+}
+
 export type WebScraperOptions = {
   urls: string[];
   mode: "single_urls" | "sitemap" | "crawl";
-  crawlerOptions?: {
-    returnOnlyUrls?: boolean;
-    includes?: string[];
-    excludes?: string[];
-    maxCrawledLinks?: number;
-    maxDepth?: number;
-    limit?: number;
-    generateImgAltText?: boolean;
-    replaceAllPathsWithAbsolutePaths?: boolean;
-    mode?: "default" | "fast"; // have a mode of some sort
-  };
+  crawlerOptions?: CrawlerOptions;
   pageOptions?: PageOptions;
   extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index ee9baff..fc95e7c 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -3,7 +3,7 @@
 import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import async from "async";
-import { PageOptions, Progress } from "../../lib/entities";
+import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
@@ -109,6 +109,7 @@ export class WebCrawler {
   public async start(
     inProgress?: (progress: Progress) => void,
     pageOptions?: PageOptions,
+    crawlerOptions?: CrawlerOptions,
     concurrencyLimit: number = 5,
     limit: number = 10000,
     maxDepth: number = 10
@@ -123,10 +124,12 @@ export class WebCrawler {
     }
 
-    const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
-    if (sitemapLinks.length > 0) {
-      let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
-      return filteredLinks.map(link => ({ url: link, html: "" }));
+    if(!crawlerOptions?.ignoreSitemap){
+      const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
+      if (sitemapLinks.length > 0) {
+        let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
+        return filteredLinks.map(link => ({ url: link, html: "" }));
+      }
     }
 
     const urls = await this.crawlUrls(
@@ -135,6 +138,7 @@ export class WebCrawler {
       concurrencyLimit,
       inProgress
     );
+
     if (
       urls.length === 0 &&
       this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
@@ -142,9 +146,9 @@ export class WebCrawler {
       return [{ url: this.initialUrl, html: "" }];
     }
 
-    // make sure to run include exclude here again
     const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
+
     return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
   }
 
@@ -211,46 +215,41 @@ export class WebCrawler {
   }
 
   async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
-    if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
+    const normalizedUrl = this.normalizeCrawlUrl(url);
+    if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
       return [];
     }
-    this.visited.add(url);
-
+    this.visited.add(normalizedUrl);
 
     if (!url.startsWith("http")) {
       url = "https://" + url;
-
     }
     if (url.endsWith("/")) {
       url = url.slice(0, -1);
-
     }
-
+
     if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
       return [];
     }
 
     try {
-      let content : string = "";
+      let content: string = "";
       // If it is the first link, fetch with single url
       if (this.visited.size === 1) {
-        console.log(pageOptions)
-        const page = await scrapSingleUrl(url, {...pageOptions, includeHtml: true});
-        content = page.html ?? ""
+        const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
+        content = page.html ?? "";
       } else {
         const response = await axios.get(url);
         content = response.data ?? "";
       }
       const $ = load(content);
-      let links: {url: string, html: string}[] = [];
+      let links: { url: string, html: string }[] = [];
 
       // Add the initial URL to the list of links
-      if(this.visited.size === 1)
-      {
-        links.push({url, html: content});
+      if (this.visited.size === 1) {
+        links.push({ url, html: content });
       }
-
       $("a").each((_, element) => {
         const href = $(element).attr("href");
         if (href) {
           let fullUrl = href;
           if (!href.startsWith("http")) {
             fullUrl = new URL(href, this.baseUrl).toString();
           }
-          const url = new URL(fullUrl);
-          const path = url.pathname;
+          const urlObj = new URL(fullUrl);
+          const path = urlObj.pathname;
 
           if (
             this.isInternalLink(fullUrl) &&
             this.matchesPattern(fullUrl) &&
             this.noSections(fullUrl) &&
-            this.matchesIncludes(path) &&
+            // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
+            // this.matchesIncludes(path) &&
             !this.matchesExcludes(path) &&
             this.robots.isAllowed(fullUrl, "FireCrawlAgent")
           ) {
-            links.push({url: fullUrl, html: content});
+            links.push({ url: fullUrl, html: content });
           }
         }
       });
 
-      if(this.visited.size === 1){
+      if (this.visited.size === 1) {
         return links;
       }
-
       // Create a new list to return to avoid modifying the visited list
-      return links.filter((link) => !this.visited.has(link.url));
+      return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
     } catch (error) {
       return [];
     }
   }
 
+  private normalizeCrawlUrl(url: string): string {
+    try{
+      const urlObj = new URL(url);
+      urlObj.searchParams.sort(); // Sort query parameters to normalize
+      return urlObj.toString();
+    } catch (error) {
+      return url;
+    }
+  }
+
   private matchesIncludes(url: string): boolean {
     if (this.includes.length === 0 || this.includes[0] == "") return true;
     return this.includes.some((pattern) => new RegExp(pattern).test(url));
   }
@@ -392,7 +402,6 @@ export class WebCrawler {
 
     // Normalize and check if the URL is present in any of the sitemaps
     const normalizedUrl = normalizeUrl(url);
-
     const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
 
     // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 824ec06..7dcd175 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -31,6 +31,7 @@ export class WebScraperDataProvider {
   private limit: number = 10000;
   private concurrentRequests: number = 20;
   private generateImgAltText: boolean = false;
+  private ignoreSitemap: boolean = false;
   private pageOptions?: PageOptions;
   private extractorOptions?: ExtractorOptions;
   private replaceAllPathsWithAbsolutePaths?: boolean = false;
@@ -38,6 +39,7 @@ export class WebScraperDataProvider {
     "gpt-4-turbo";
   private crawlerMode: string = "default";
 
+
   authorize(): void {
     throw new Error("Method not implemented.");
   }
@@ -174,6 +176,9 @@ export class WebScraperDataProvider {
     let links = await crawler.start(
       inProgress,
       this.pageOptions,
+      {
+        ignoreSitemap: this.ignoreSitemap,
+      },
       5,
       this.limit,
       this.maxCrawledDepth
@@ -474,6 +479,7 @@ export class WebScraperDataProvider {
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
     this.crawlerMode = options.crawlerOptions?.mode ?? "default";
+    this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
 
     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {
diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts
index 5a89183..c6dbf11 100644
--- a/apps/api/src/scraper/WebScraper/sitemap.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap.ts
@@ -12,7 +12,6 @@ export async function getLinksFromSitemap(
       content = response.data;
     } catch (error) {
       console.error(`Request failed for ${sitemapUrl}: ${error}`);
-      console.log(allUrls)
       return allUrls;
     }
 
@@ -36,7 +35,6 @@ export async function getLinksFromSitemap(
   } catch (error) {
     console.error(`Error processing ${sitemapUrl}: ${error}`);
   }
-  console.log(allUrls)
   return allUrls;
 }