Merge pull request #283 from mendableai/nsc/crawler-fixes

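Fixes the crawler getting confused when the base URL's hostname contains a `www.` prefix: internal links are now matched with `www.` stripped from both the base domain and the link domain.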
2024-06-14 13:50:32 -07:00 · 2024-06-14 13:50:32 -07:00 · 4ec863718b
commit 4ec863718b
parent 361cba4119 e88cb314c8
1 changed files with 13 additions and 12 deletions
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@ -224,11 +224,10 @@ export class WebCrawler {
  }

  async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
-    const normalizedUrl = this.normalizeCrawlUrl(url);
-    if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
+    if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
      return [];
    }
-    this.visited.add(normalizedUrl);
+    this.visited.add(url);

    if (!url.startsWith("http")) {
      url = "https://" + url;
@ -276,15 +275,16 @@ export class WebCrawler {
          const urlObj = new URL(fullUrl);
          const path = urlObj.pathname;

+
          if (
            this.isInternalLink(fullUrl) &&
-            this.matchesPattern(fullUrl) &&
            this.noSections(fullUrl) &&
            // The include check below is intentionally commented out to allow wider website coverage; includes are filtered afterwards anyway.
            // this.matchesIncludes(path) &&
            !this.matchesExcludes(path) &&
-            this.robots.isAllowed(fullUrl, "FireCrawlAgent")
+            this.isRobotsAllowed(fullUrl)
          ) {
+
            links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
          }
        }
@ -294,12 +294,15 @@ export class WebCrawler {
        return links;
      }
      // Create a new list to return to avoid modifying the visited list
-      return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
+      return links.filter((link) => !this.visited.has(link.url));
    } catch (error) {
      return [];
    }
  }

+  private isRobotsAllowed(url: string): boolean {
+    return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
+  }
  private normalizeCrawlUrl(url: string): string {
    try{
      const urlObj = new URL(url);
@ -326,12 +329,10 @@ export class WebCrawler {

  private isInternalLink(link: string): boolean {
    const urlObj = new URL(link, this.baseUrl);
-    const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, "");
-    return urlObj.hostname === domainWithoutProtocol;
-  }
-
-  private matchesPattern(link: string): boolean {
-    return true; // Placeholder for future pattern matching implementation
+    const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
+    const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
+    
+    return linkDomain === baseDomain;
  }

  private isFile(url: string): boolean {