From bfccaf670d3ea00e6460c015b50367d019e322aa Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Wed, 15 May 2024 15:30:37 -0700
Subject: [PATCH] Nick: fixes most of it

---
 apps/api/src/scraper/WebScraper/crawler.ts | 39 ++++++++++++++++++----
 apps/api/src/scraper/WebScraper/index.ts   | 33 +++++++++++-------
 apps/test-suite/data/crawl.json            |  2 +-
 3 files changed, 55 insertions(+), 19 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 7cfd1be..98a0738 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -76,9 +76,22 @@ export class WebCrawler {
 
         // Check if the link matches the include patterns, if any are specified
         if (this.includes.length > 0 && this.includes[0] !== "") {
-          return this.includes.some((includePattern) =>
+          if (!this.includes.some((includePattern) =>
             new RegExp(includePattern).test(path)
-          );
+          )) {
+            return false;
+          }
+        }
+
+        // Normalize the initial URL and the link to account for www and non-www versions
+        const normalizedInitialUrl = new URL(this.initialUrl);
+        const normalizedLink = new URL(link);
+        const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, '');
+        const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
+
+        // Ensure the hostname matches (ignoring a leading "www.") and the path starts with the initial URL's path; the protocol is not compared
+        if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
+          return false;
         }
 
         const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
@@ -88,10 +101,6 @@ export class WebCrawler {
           return false;
         }
 
-        if (!this.initialUrl.includes(link)) {
-          return false;
-        }
-
         return true;
       })
       .slice(0, limit);
@@ -109,11 +118,15 @@ export class WebCrawler {
       this.robots = robotsParser(this.robotsTxtUrl, response.data);
     } catch (error) {
       console.error(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
+
     }
 
+    console.log("Initial URL: ", this.initialUrl);
+
     const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
     if (sitemapLinks.length > 0) {
       let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
+      console.log("Filtered links: ", filteredLinks.length);
       return filteredLinks.map(link => ({ url: link, html: "" }));
     }
 
@@ -310,7 +323,21 @@ export class WebCrawler {
       }
     } catch (error) {
       // Error handling for failed sitemap fetch
+      // console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
     }
+
+    // If the first one doesn't work, try the base URL
+    const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
+    try {
+      const response = await axios.get(baseUrlSitemap);
+      if (response.status === 200) {
+        return await getLinksFromSitemap(baseUrlSitemap);
+      }
+    } catch (error) {
+      // Error handling for failed base URL sitemap fetch
+      console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+    }
+
     return [];
   }
 }
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 7e19357..3ba5a1d 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -130,6 +130,21 @@ export class WebScraperDataProvider {
     }
   }
 
+  // Keeps only links on the first configured URL's host (a leading "www." is ignored) whose pathname
+  // starts with that URL's pathname. Plain prefix match: "/blog" also admits "/blogging". `new URL` throws on a malformed link.
+  private async cleanIrrelevantPath(links: string[]){
+    if (links.length === 0) return []; // nothing to filter, so the initial URL is never parsed (as before)
+    // Parse and normalize the initial URL once; it is identical for every link.
+    const initialUrl = new URL(this.urls[0]);
+    const initialHostname = initialUrl.hostname.replace(/^www\./, '');
+    return links.filter(link => {
+      const linkUrl = new URL(link);
+      // Only the hostname and the path prefix are compared; the protocol is not checked.
+      const linkHostname = linkUrl.hostname.replace(/^www\./, '');
+      return linkHostname === initialHostname && linkUrl.pathname.startsWith(initialUrl.pathname);
+    });
+  }
+
   private async handleCrawlMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
@@ -149,11 +164,11 @@ export class WebScraperDataProvider {
     let allLinks = links.map((e) => e.url);
     const allHtmls = links.map((e)=> e.html);
 
-    allLinks = allLinks.filter(link => {
-      const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
-      const normalizedLink = link.endsWith('/') ? link : `${link}/`;
-      return normalizedLink.startsWith(normalizedInitialUrl);
-    });
+    console.log(">>>>>> all links >>>>", {allLinks})
+    // allLinks = await this.cleanIrrelevantPath(allLinks);
+
+
+    
     console.log('>>>>>??>?>?>?>?.', {allLinks})
 
     if (this.returnOnlyUrls) {
@@ -183,13 +198,7 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
     let links = await getLinksFromSitemap(this.urls[0]);
-    links = links.filter(link => {
-      const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
-      const normalizedLink = link.endsWith('/') ? link : `${link}/`;
-      return normalizedLink.startsWith(normalizedInitialUrl);
-    });
-
-    console.log('>>>>>??>?>?>?>?.', {links})
+    links = await this.cleanIrrelevantPath(links);
 
     if (this.returnOnlyUrls) {
       return this.returnOnlyUrlsResponse(links, inProgress);
diff --git a/apps/test-suite/data/crawl.json b/apps/test-suite/data/crawl.json
index d729644..651468a 100644
--- a/apps/test-suite/data/crawl.json
+++ b/apps/test-suite/data/crawl.json
@@ -27,7 +27,7 @@
     ]
   },
   {
-    "website": "https://agentops.ai",
+    "website": "https://agentops.ai/blog",
     "expected_min_num_of_pages": 7,
     "expected_crawled_pages": [
       "https://www.agentops.ai/blog/effortless-hr-management-with-saas",