Nick: fixes most of it

commit bfccaf670d
parent d91043376c
@@ -76,9 +76,22 @@ export class WebCrawler {
       // Check if the link matches the include patterns, if any are specified
       if (this.includes.length > 0 && this.includes[0] !== "") {
-        return this.includes.some((includePattern) =>
+        if (!this.includes.some((includePattern) =>
           new RegExp(includePattern).test(path)
-        );
+        )) {
+          return false;
+        }
       }
 
+      // Normalize the initial URL and the link to account for www and non-www versions
+      const normalizedInitialUrl = new URL(this.initialUrl);
+      const normalizedLink = new URL(link);
+      const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, '');
+      const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
+
+      // Ensure the protocol and hostname match, and the path starts with the initial URL's path
+      if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
+        return false;
+      }
 
       const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
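The hunk above turns the include-pattern check into a guard, so execution can fall through to the new hostname comparison, and strips a leading `www.` before comparing hosts. Despite its comment, the committed guard compares hostname and path only, not protocol. A minimal standalone sketch of that comparison; the function name and sample URLs are illustrative, not part of the commit:

```ts
// Sketch: compare two URLs the way the new guard does, ignoring a leading
// "www." and requiring the link's path to extend the start URL's path.
function isSameSite(initialUrl: string, link: string): boolean {
  const base = new URL(initialUrl);
  const candidate = new URL(link);
  const baseHost = base.hostname.replace(/^www\./, '');
  const candidateHost = candidate.hostname.replace(/^www\./, '');
  return candidateHost === baseHost &&
    candidate.pathname.startsWith(base.pathname);
}

// "www" and non-www now match:
console.log(isSameSite("https://agentops.ai/blog", "https://www.agentops.ai/blog/post")); // true
// A different host is still rejected:
console.log(isSameSite("https://agentops.ai/blog", "https://example.com/blog"));          // false
```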
@@ -88,10 +101,6 @@ export class WebCrawler {
         return false;
       }
 
-      if (!this.initialUrl.includes(link)) {
-        return false;
-      }
-
       return true;
     })
     .slice(0, limit);
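The guard deleted above was inverted: `this.initialUrl.includes(link)` is true only when the link is a substring of the start URL, so every page *below* the start URL failed the check and was dropped. A quick illustration with hypothetical values:

```ts
const initialUrl = "https://agentops.ai/blog";
const link = "https://agentops.ai/blog/some-post";

// The removed check: true only when the link is a substring of the start URL.
console.log(initialUrl.includes(link)); // false -> child pages were rejected
// The direction a prefix check would normally go:
console.log(link.includes(initialUrl)); // true
```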
@@ -109,11 +118,15 @@ export class WebCrawler {
       this.robots = robotsParser(this.robotsTxtUrl, response.data);
     } catch (error) {
       console.error(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
     }
 
+    console.log("Initial URL: ", this.initialUrl);
+
     const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
     if (sitemapLinks.length > 0) {
       let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
+      console.log("Filtered links: ", filteredLinks.length);
       return filteredLinks.map(link => ({ url: link, html: "" }));
     }
 
@@ -310,7 +323,21 @@ export class WebCrawler {
       }
     } catch (error) {
       // Error handling for failed sitemap fetch
+      // console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
     }
 
+    // If the first one doesn't work, try the base URL
+    const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
+    try {
+      const response = await axios.get(baseUrlSitemap);
+      if (response.status === 200) {
+        return await getLinksFromSitemap(baseUrlSitemap);
+      }
+    } catch (error) {
+      // Error handling for failed base URL sitemap fetch
+      console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+    }
+
     return [];
   }
 }
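The new block gives the sitemap probe a second chance: if the first sitemap fetch fails, it tries `${this.baseUrl}/sitemap.xml` before returning an empty list. A self-contained sketch of the same fallback shape; the candidate URLs, `fetchSitemap` (a stand-in for the diff's `getLinksFromSitemap`), and the naive `<loc>` regex are all assumptions for illustration:

```ts
import axios from "axios";

// Hypothetical stand-in for the diff's getLinksFromSitemap.
async function fetchSitemap(url: string): Promise<string[]> {
  const res = await axios.get(url);
  // Naive <loc> extraction, just to keep the sketch self-contained.
  return (String(res.data).match(/<loc>(.*?)<\/loc>/g) ?? [])
    .map((m) => m.replace(/<\/?loc>/g, ""));
}

// Same two-step probe shape as the committed code.
async function trySitemaps(url: string, baseUrl: string): Promise<string[]> {
  for (const candidate of [`${url}/sitemap.xml`, `${baseUrl}/sitemap.xml`]) {
    try {
      const response = await axios.get(candidate);
      if (response.status === 200) {
        return await fetchSitemap(candidate);
      }
    } catch (error) {
      // Swallow and fall through to the next candidate, as the diff does.
    }
  }
  return [];
}
```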
@@ -130,6 +130,21 @@ export class WebScraperDataProvider {
     }
   }
 
+  private async cleanIrrelevantPath(links: string[]){
+    return links.filter(link => {
+      const normalizedInitialUrl = new URL(this.urls[0]);
+      const normalizedLink = new URL(link);
+
+      // Normalize the hostname to account for www and non-www versions
+      const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, '');
+      const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
+
+      // Ensure the protocol and hostname match, and the path starts with the initial URL's path
+      return linkHostname === initialHostname &&
+        normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname);
+    });
+  }
+
   private async handleCrawlMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
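`cleanIrrelevantPath` packages the same www-insensitive host and path-prefix filter that the first hunk added to `WebCrawler`, so sitemap links pointing at other hosts or outside the starting path are dropped in one shared place (the sitemap mode switches to it two hunks below, while the crawl-mode call stays commented out). Note it is declared `async` although nothing inside awaits, so it resolves to a `Promise<string[]>` and callers must `await` it. A standalone sketch of equivalent behavior, with made-up inputs; `start` plays the role of `this.urls[0]`:

```ts
// Equivalent free-function form of cleanIrrelevantPath, for illustration only.
function cleanIrrelevantPath(start: string, links: string[]): string[] {
  const base = new URL(start);
  const baseHost = base.hostname.replace(/^www\./, '');
  return links.filter((link) => {
    const candidate = new URL(link);
    return candidate.hostname.replace(/^www\./, '') === baseHost &&
      candidate.pathname.startsWith(base.pathname);
  });
}

// With start = "https://www.agentops.ai/blog":
// kept:    https://agentops.ai/blog/effortless-hr-management-with-saas
// dropped: https://agentops.ai/pricing       (path outside /blog)
// dropped: https://other-site.com/blog/post  (different hostname)
```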
@@ -149,11 +164,11 @@ export class WebScraperDataProvider {
     let allLinks = links.map((e) => e.url);
     const allHtmls = links.map((e)=> e.html);
 
-    allLinks = allLinks.filter(link => {
-      const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
-      const normalizedLink = link.endsWith('/') ? link : `${link}/`;
-      return normalizedLink.startsWith(normalizedInitialUrl);
-    });
+    console.log(">>>>>> all links >>>>", {allLinks})
+    // allLinks = await this.cleanIrrelevantPath(allLinks);
+
     console.log('>>>>>??>?>?>?>?.', {allLinks})
 
     if (this.returnOnlyUrls) {
@@ -183,13 +198,7 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
     let links = await getLinksFromSitemap(this.urls[0]);
-    links = links.filter(link => {
-      const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
-      const normalizedLink = link.endsWith('/') ? link : `${link}/`;
-      return normalizedLink.startsWith(normalizedInitialUrl);
-    });
-
-    console.log('>>>>>??>?>?>?>?.', {links})
+    links = await this.cleanIrrelevantPath(links);
 
     if (this.returnOnlyUrls) {
       return this.returnOnlyUrlsResponse(links, inProgress);
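The inline filter deleted in this hunk compared raw strings after appending a trailing slash, so a `www.` mismatch between sitemap entries and the start URL rejected everything; the shared helper's URL-based comparison tolerates it. For example, with hypothetical URLs:

```ts
const start = "https://agentops.ai/blog/";
const link = "https://www.agentops.ai/blog/some-post/";

// Old string-prefix check: fails purely because of the "www." prefix.
console.log(link.startsWith(start)); // false

// URL-based check, as in cleanIrrelevantPath: passes.
const sameHost =
  new URL(link).hostname.replace(/^www\./, '') ===
  new URL(start).hostname.replace(/^www\./, '');
console.log(sameHost &&
  new URL(link).pathname.startsWith(new URL(start).pathname)); // true
```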
@@ -27,7 +27,7 @@
     ]
   },
   {
-    "website": "https://agentops.ai",
+    "website": "https://agentops.ai/blog",
     "expected_min_num_of_pages": 7,
     "expected_crawled_pages": [
       "https://www.agentops.ai/blog/effortless-hr-management-with-saas",