From 3091f0134cc95f47fe7d993b5fab5536868dd29e Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 10 Jun 2024 16:27:10 -0700
Subject: [PATCH] Nick:

---
 apps/api/src/scraper/WebScraper/crawler.ts | 14 +++++++++-----
 apps/api/src/scraper/WebScraper/index.ts   |  1 +
 apps/api/src/scraper/WebScraper/sitemap.ts |  3 +++
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 9340aa8..ee9baff 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -3,7 +3,7 @@ import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import async from "async";
-import { Progress } from "../../lib/entities";
+import { PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
 
@@ -108,6 +108,7 @@ export class WebCrawler {
 
   public async start(
     inProgress?: (progress: Progress) => void,
+    pageOptions?: PageOptions,
     concurrencyLimit: number = 5,
     limit: number = 10000,
     maxDepth: number = 10
@@ -130,6 +131,7 @@ export class WebCrawler {
 
     const urls = await this.crawlUrls(
       [this.initialUrl],
+      pageOptions,
       concurrencyLimit,
       inProgress
     );
@@ -148,6 +150,7 @@ export class WebCrawler {
 
   private async crawlUrls(
     urls: string[],
+    pageOptions: PageOptions,
     concurrencyLimit: number,
     inProgress?: (progress: Progress) => void,
   ): Promise<{ url: string, html: string }[]> {
@@ -158,7 +161,7 @@ export class WebCrawler {
         }
         return;
       }
-      const newUrls = await this.crawl(task);
+      const newUrls = await this.crawl(task, pageOptions);
      // add the initial url if not already added
      // if (this.visited.size === 1) {
      //   let normalizedInitial = this.initialUrl;
@@ -188,7 +191,7 @@ export class WebCrawler {
           currentDocumentUrl: task,
         });
       }
-      await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
+      await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
       if (callback && typeof callback === "function") {
         callback();
       }
@@ -207,7 +210,7 @@ export class WebCrawler {
     return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
   }
 
-  async crawl(url: string): Promise<{url: string, html: string}[]> {
+  async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
     if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
       return [];
     }
@@ -231,7 +234,8 @@ export class WebCrawler {
     let content : string = "";
     // If it is the first link, fetch with single url
     if (this.visited.size === 1) {
-      const page = await scrapSingleUrl(url, {includeHtml: true});
+      console.log(pageOptions)
+      const page = await scrapSingleUrl(url, {...pageOptions, includeHtml: true});
       content = page.html ?? "";
     } else {
       const response = await axios.get(url);
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index e3a3cc6..824ec06 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -173,6 +173,7 @@ export class WebScraperDataProvider {
 
     let links = await crawler.start(
       inProgress,
+      this.pageOptions,
       5,
       this.limit,
       this.maxCrawledDepth
diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts
index 0ac4338..5a89183 100644
--- a/apps/api/src/scraper/WebScraper/sitemap.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap.ts
@@ -12,6 +12,8 @@ export async function getLinksFromSitemap(
       content = response.data;
     } catch (error) {
       console.error(`Request failed for ${sitemapUrl}: ${error}`);
+      console.log(allUrls)
+
       return allUrls;
     }
 
@@ -34,6 +36,7 @@
   } catch (error) {
     console.error(`Error processing ${sitemapUrl}: ${error}`);
   }
+  console.log(allUrls)
 
   return allUrls;
 }