From e88cb314c81a63fdeab774feedbdf2048060e5e1 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 14 Jun 2024 13:44:54 -0700
Subject: [PATCH] Update crawler.ts

---
 apps/api/src/scraper/WebScraper/crawler.ts | 25 +++++++++++-----------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index ba5e003..32abb1b 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -224,11 +224,10 @@ export class WebCrawler {
   }
 
   async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
-    const normalizedUrl = this.normalizeCrawlUrl(url);
-    if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
+    if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
       return [];
     }
-    this.visited.add(normalizedUrl);
+    this.visited.add(url);
 
     if (!url.startsWith("http")) {
       url = "https://" + url;
@@ -276,15 +275,16 @@ export class WebCrawler {
           const urlObj = new URL(fullUrl);
           const path = urlObj.pathname;
 
+
           if (
             this.isInternalLink(fullUrl) &&
-            this.matchesPattern(fullUrl) &&
             this.noSections(fullUrl) &&
             // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
             // this.matchesIncludes(path) &&
             !this.matchesExcludes(path) &&
-            this.robots.isAllowed(fullUrl, "FireCrawlAgent")
+            this.isRobotsAllowed(fullUrl)
           ) {
+
             links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
           }
         }
@@ -294,12 +294,15 @@ export class WebCrawler {
         return links;
       }
       // Create a new list to return to avoid modifying the visited list
-      return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
+      return links.filter((link) => !this.visited.has(link.url));
     } catch (error) {
       return [];
     }
   }
 
+  private isRobotsAllowed(url: string): boolean {
+    return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
+  }
   private normalizeCrawlUrl(url: string): string {
     try{
       const urlObj = new URL(url);
@@ -326,12 +329,10 @@ export class WebCrawler {
 
   private isInternalLink(link: string): boolean {
     const urlObj = new URL(link, this.baseUrl);
-    const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, "");
-    return urlObj.hostname === domainWithoutProtocol;
-  }
-
-  private matchesPattern(link: string): boolean {
-    return true; // Placeholder for future pattern matching implementation
+    const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
+    const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
+
+    return linkDomain === baseDomain;
   }
 
   private isFile(url: string): boolean {
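
Note for review: the sketch below is a minimal, self-contained TypeScript illustration of the two behavioral changes in this patch, the robots guard and the www-insensitive domain comparison. The RobotsChecker interface, the free-standing function signatures, and the sample URLs are hypothetical stand-ins for the class members in the patch, not code from the repository.

interface RobotsChecker {
  isAllowed(url: string, agent: string): boolean | undefined;
}

// Mirrors the new isRobotsAllowed(): tolerate a missing robots parser and
// an undefined verdict, defaulting to "allowed" in both cases.
function isRobotsAllowed(robots: RobotsChecker | undefined, url: string): boolean {
  return robots ? (robots.isAllowed(url, "FireCrawlAgent") ?? true) : true;
}

// Mirrors the new isInternalLink(): strip the protocol and a leading "www."
// from both sides, so https://www.example.com and https://example.com
// count as the same site.
function isInternalLink(link: string, baseUrl: string): boolean {
  const urlObj = new URL(link, baseUrl);
  const baseDomain = baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
  const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
  return linkDomain === baseDomain;
}

console.log(isRobotsAllowed(undefined, "https://example.com/"));                    // true
console.log(isInternalLink("https://www.example.com/docs", "https://example.com")); // true
console.log(isInternalLink("https://other.com/page", "https://example.com"));       // false

One caveat, which applies to the sketch and the patched method alike: the comparison assumes baseUrl is a bare origin, since the regex strips only the protocol. A baseUrl carrying a path (e.g. https://example.com/docs) would leave the path in baseDomain and never match.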