From e88cb314c81a63fdeab774feedbdf2048060e5e1 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 14 Jun 2024 13:44:54 -0700
Subject: [PATCH] Update crawler.ts

---
 apps/api/src/scraper/WebScraper/crawler.ts | 25 +++++++++++-----------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index ba5e003..32abb1b 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -224,11 +224,10 @@ export class WebCrawler {
   }
 
   async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
-    const normalizedUrl = this.normalizeCrawlUrl(url);
-    if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
+    if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
       return [];
     }
-    this.visited.add(normalizedUrl);
+    this.visited.add(url);
 
     if (!url.startsWith("http")) {
       url = "https://" + url;
@@ -276,15 +275,16 @@ export class WebCrawler {
           const urlObj = new URL(fullUrl);
           const path = urlObj.pathname;
 
+
           if (
             this.isInternalLink(fullUrl) &&
-            this.matchesPattern(fullUrl) &&
             this.noSections(fullUrl) &&
             // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
             // this.matchesIncludes(path) &&
             !this.matchesExcludes(path) &&
-            this.robots.isAllowed(fullUrl, "FireCrawlAgent")
+            this.isRobotsAllowed(fullUrl)
           ) {
+
             links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
           }
         }
@@ -294,12 +294,15 @@ export class WebCrawler {
         return links;
       }
       // Create a new list to return to avoid modifying the visited list
-      return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
+      return links.filter((link) => !this.visited.has(link.url));
     } catch (error) {
       return [];
     }
   }
 
+  private isRobotsAllowed(url: string): boolean {
+    return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
+  }
   private normalizeCrawlUrl(url: string): string {
     try{
       const urlObj = new URL(url);
@@ -326,12 +329,10 @@ export class WebCrawler {
 
   private isInternalLink(link: string): boolean {
     const urlObj = new URL(link, this.baseUrl);
-    const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, "");
-    return urlObj.hostname === domainWithoutProtocol;
-  }
-
-  private matchesPattern(link: string): boolean {
-    return true; // Placeholder for future pattern matching implementation
+    const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
+    const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
+
+    return linkDomain === baseDomain;
   }
 
   private isFile(url: string): boolean {
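
Note for review: the sketch below is a minimal, self-contained TypeScript illustration of the two behavioral changes in this patch, the robots guard and the www-insensitive domain comparison. The RobotsChecker interface, the free-standing function signatures, and the sample URLs are hypothetical stand-ins for the class members in the patch, not code from the repository.

interface RobotsChecker {
  isAllowed(url: string, agent: string): boolean | undefined;
}

// Mirrors the new isRobotsAllowed(): tolerate a missing robots parser and
// an undefined verdict, defaulting to "allowed" in both cases.
function isRobotsAllowed(robots: RobotsChecker | undefined, url: string): boolean {
  return robots ? (robots.isAllowed(url, "FireCrawlAgent") ?? true) : true;
}

// Mirrors the new isInternalLink(): strip the protocol and a leading "www."
// from both sides, so https://www.example.com and https://example.com
// count as the same site.
function isInternalLink(link: string, baseUrl: string): boolean {
  const urlObj = new URL(link, baseUrl);
  const baseDomain = baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
  const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
  return linkDomain === baseDomain;
}

console.log(isRobotsAllowed(undefined, "https://example.com/"));                    // true
console.log(isInternalLink("https://www.example.com/docs", "https://example.com")); // true
console.log(isInternalLink("https://other.com/page", "https://example.com"));       // false

One caveat, which applies to the sketch and the patched method alike: the comparison assumes baseUrl is a bare origin, since the regex strips only the protocol. A baseUrl carrying a path (e.g. https://example.com/docs) would leave the path in baseDomain and never match.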