From bfccaf670d3ea00e6460c015b50367d019e322aa Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 15:30:37 -0700 Subject: [PATCH] Nick: fixes most of it --- apps/api/src/scraper/WebScraper/crawler.ts | 39 ++++++++++++++++++---- apps/api/src/scraper/WebScraper/index.ts | 33 +++++++++++------- apps/test-suite/data/crawl.json | 2 +- 3 files changed, 55 insertions(+), 19 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 7cfd1be..98a0738 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -76,9 +76,22 @@ export class WebCrawler { // Check if the link matches the include patterns, if any are specified if (this.includes.length > 0 && this.includes[0] !== "") { - return this.includes.some((includePattern) => + if (!this.includes.some((includePattern) => new RegExp(includePattern).test(path) - ); + )) { + return false; + } + } + + // Normalize the initial URL and the link to account for www and non-www versions + const normalizedInitialUrl = new URL(this.initialUrl); + const normalizedLink = new URL(link); + const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, ''); + const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); + + // Ensure the protocol and hostname match, and the path starts with the initial URL's path + if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { + return false; } const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true; @@ -88,10 +101,6 @@ export class WebCrawler { return false; } - if (!this.initialUrl.includes(link)) { - return false; - } - return true; }) .slice(0, limit); @@ -109,11 +118,15 @@ export class WebCrawler { this.robots = robotsParser(this.robotsTxtUrl, response.data); } catch (error) { console.error(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); + } + console.log("Initial URL: ", this.initialUrl); + const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); if (sitemapLinks.length > 0) { let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); + console.log("Filtered links: ", filteredLinks.length); return filteredLinks.map(link => ({ url: link, html: "" })); } @@ -310,7 +323,21 @@ export class WebCrawler { } } catch (error) { // Error handling for failed sitemap fetch + // console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`); } + + // If the first one doesn't work, try the base URL + const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; + try { + const response = await axios.get(baseUrlSitemap); + if (response.status === 200) { + return await getLinksFromSitemap(baseUrlSitemap); + } + } catch (error) { + // Error handling for failed base URL sitemap fetch + console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); + } + return []; } } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 7e19357..3ba5a1d 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -130,6 +130,21 @@ export class WebScraperDataProvider { } } + private async cleanIrrelevantPath(links: string[]){ + return links.filter(link => { + const normalizedInitialUrl = new URL(this.urls[0]); + const normalizedLink = new URL(link); + + // Normalize the hostname to account for www and non-www versions + const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, ''); + const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); + + // Ensure the protocol and hostname match, and the path starts with the initial URL's path + return linkHostname === initialHostname && + normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname); + }); + } + private async handleCrawlMode( inProgress?: (progress: Progress) => void ): Promise { @@ -149,11 +164,11 @@ export class WebScraperDataProvider { let allLinks = links.map((e) => e.url); const allHtmls = links.map((e)=> e.html); - allLinks = allLinks.filter(link => { - const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`; - const normalizedLink = link.endsWith('/') ? link : `${link}/`; - return normalizedLink.startsWith(normalizedInitialUrl); - }); + console.log(">>>>>> all links >>>>", {allLinks}) + // allLinks = await this.cleanIrrelevantPath(allLinks); + + + console.log('>>>>>??>?>?>?>?.', {allLinks}) if (this.returnOnlyUrls) { @@ -183,13 +198,7 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { let links = await getLinksFromSitemap(this.urls[0]); - links = links.filter(link => { - const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`; - const normalizedLink = link.endsWith('/') ? link : `${link}/`; - return normalizedLink.startsWith(normalizedInitialUrl); - }); - - console.log('>>>>>??>?>?>?>?.', {links}) + links = await this.cleanIrrelevantPath(links); if (this.returnOnlyUrls) { return this.returnOnlyUrlsResponse(links, inProgress); diff --git a/apps/test-suite/data/crawl.json b/apps/test-suite/data/crawl.json index d729644..651468a 100644 --- a/apps/test-suite/data/crawl.json +++ b/apps/test-suite/data/crawl.json @@ -27,7 +27,7 @@ ] }, { - "website": "https://agentops.ai", + "website": "https://agentops.ai/blog", "expected_min_num_of_pages": 7, "expected_crawled_pages": [ "https://www.agentops.ai/blog/effortless-hr-management-with-saas",