
Merge pull request #263 from mendableai/nsc/pageoptions-crawler

ignoreSitemap feature, pageOptions now respected in the initial crawl as well
Nicolas 2024-06-10 18:22:51 -07:00 committed by GitHub
commit 15e791ffb1
4 changed files with 65 additions and 41 deletions
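This PR splits the crawler settings out into their own CrawlerOptions type and adds an ignoreSitemap flag (see the first file below). A minimal TypeScript sketch of how a caller might assemble the new options; the type declarations are trimmed copies of the ones in this diff, and the URL and option values are illustrative only:

```ts
// Trimmed copies of the types introduced in this PR (illustrative, not the full definitions).
type CrawlerOptions = {
  includes?: string[];
  excludes?: string[];
  limit?: number;
  ignoreSitemap?: boolean;
  mode?: "default" | "fast";
};

type PageOptions = {
  includeHtml?: boolean; // the only PageOptions field visible in this diff; the real type has more
};

type WebScraperOptions = {
  urls: string[];
  mode: "single_urls" | "sitemap" | "crawl";
  crawlerOptions?: CrawlerOptions;
  pageOptions?: PageOptions;
};

// Skip the sitemap entirely and apply pageOptions starting from the very first crawled page.
const options: WebScraperOptions = {
  urls: ["https://example.com"],
  mode: "crawl",
  crawlerOptions: { ignoreSitemap: true, limit: 100 },
  pageOptions: { includeHtml: true },
};

console.log(JSON.stringify(options, null, 2));
```

With ignoreSitemap set, the crawler skips the sitemap lookup and discovers links by crawling from the initial page, which is now scraped with the supplied pageOptions.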

View File

@@ -35,10 +35,7 @@ export type SearchOptions = {
   location?: string;
 };
 
-export type WebScraperOptions = {
-  urls: string[];
-  mode: "single_urls" | "sitemap" | "crawl";
-  crawlerOptions?: {
+export type CrawlerOptions = {
   returnOnlyUrls?: boolean;
   includes?: string[];
   excludes?: string[];
@@ -47,8 +44,14 @@ export type WebScraperOptions = {
   limit?: number;
   generateImgAltText?: boolean;
   replaceAllPathsWithAbsolutePaths?: boolean;
+  ignoreSitemap?: boolean;
   mode?: "default" | "fast"; // have a mode of some sort
-  };
+}
+
+export type WebScraperOptions = {
+  urls: string[];
+  mode: "single_urls" | "sitemap" | "crawl";
+  crawlerOptions?: CrawlerOptions;
   pageOptions?: PageOptions;
   extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;

View File

@@ -3,7 +3,7 @@ import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import async from "async";
-import { Progress } from "../../lib/entities";
+import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
@@ -108,6 +108,8 @@ export class WebCrawler {
   public async start(
     inProgress?: (progress: Progress) => void,
+    pageOptions?: PageOptions,
+    crawlerOptions?: CrawlerOptions,
     concurrencyLimit: number = 5,
     limit: number = 10000,
     maxDepth: number = 10
@@ -122,17 +124,21 @@
     }
 
+    if(!crawlerOptions?.ignoreSitemap){
     const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
     if (sitemapLinks.length > 0) {
       let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
       return filteredLinks.map(link => ({ url: link, html: "" }));
     }
+    }
 
     const urls = await this.crawlUrls(
       [this.initialUrl],
+      pageOptions,
       concurrencyLimit,
       inProgress
     );
 
     if (
       urls.length === 0 &&
       this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
@@ -140,14 +146,15 @@
       return [{ url: this.initialUrl, html: "" }];
     }
 
     // make sure to run include exclude here again
     const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
     return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
   }
 
   private async crawlUrls(
     urls: string[],
+    pageOptions: PageOptions,
     concurrencyLimit: number,
     inProgress?: (progress: Progress) => void,
   ): Promise<{ url: string, html: string }[]> {
@@ -158,7 +165,7 @@
         }
         return;
       }
-      const newUrls = await this.crawl(task);
+      const newUrls = await this.crawl(task, pageOptions);
       // add the initial url if not already added
       // if (this.visited.size === 1) {
       //   let normalizedInitial = this.initialUrl;
@@ -188,7 +195,7 @@
           currentDocumentUrl: task,
         });
       }
-      await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
+      await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
       if (callback && typeof callback === "function") {
         callback();
       }
@@ -207,20 +214,18 @@
     return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
   }
 
-  async crawl(url: string): Promise<{url: string, html: string}[]> {
-    if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
+  async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
+    const normalizedUrl = this.normalizeCrawlUrl(url);
+    if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
       return [];
     }
-    this.visited.add(url);
+    this.visited.add(normalizedUrl);
     if (!url.startsWith("http")) {
       url = "https://" + url;
     }
     if (url.endsWith("/")) {
       url = url.slice(0, -1);
     }
     if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
@@ -231,8 +236,8 @@
       let content: string = "";
       // If it is the first link, fetch with single url
       if (this.visited.size === 1) {
-        const page = await scrapSingleUrl(url, {includeHtml: true});
-        content = page.html ?? ""
+        const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
+        content = page.html ?? "";
       } else {
         const response = await axios.get(url);
         content = response.data ?? "";
@@ -241,12 +246,10 @@
       let links: { url: string, html: string }[] = [];
 
       // Add the initial URL to the list of links
-      if(this.visited.size === 1)
-      {
+      if (this.visited.size === 1) {
         links.push({ url, html: content });
       }
 
       $("a").each((_, element) => {
         const href = $(element).attr("href");
         if (href) {
@@ -254,14 +257,15 @@
           if (!href.startsWith("http")) {
             fullUrl = new URL(href, this.baseUrl).toString();
           }
-          const url = new URL(fullUrl);
-          const path = url.pathname;
+          const urlObj = new URL(fullUrl);
+          const path = urlObj.pathname;
 
           if (
             this.isInternalLink(fullUrl) &&
             this.matchesPattern(fullUrl) &&
             this.noSections(fullUrl) &&
-            this.matchesIncludes(path) &&
+            // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
+            // this.matchesIncludes(path) &&
             !this.matchesExcludes(path) &&
             this.robots.isAllowed(fullUrl, "FireCrawlAgent")
           ) {
@@ -274,12 +278,22 @@
           return links;
         }
 
       // Create a new list to return to avoid modifying the visited list
-      return links.filter((link) => !this.visited.has(link.url));
+      return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
     } catch (error) {
       return [];
     }
   }
 
+  private normalizeCrawlUrl(url: string): string {
+    try{
+      const urlObj = new URL(url);
+      urlObj.searchParams.sort(); // Sort query parameters to normalize
+      return urlObj.toString();
+    } catch (error) {
+      return url;
+    }
+  }
+
   private matchesIncludes(url: string): boolean {
     if (this.includes.length === 0 || this.includes[0] == "") return true;
     return this.includes.some((pattern) => new RegExp(pattern).test(url));
@@ -388,7 +402,6 @@ export class WebCrawler {
     // Normalize and check if the URL is present in any of the sitemaps
     const normalizedUrl = normalizeUrl(url);
     const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
     // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
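The new normalizeCrawlUrl helper added above dedupes visited URLs by sorting query parameters before they go into the visited set. A standalone sketch of that idea (same logic, outside the class), assuming Node's global URL; the example URLs are made up:

```ts
// Standalone version of the normalization used by WebCrawler.normalizeCrawlUrl:
// sorting query parameters makes param order irrelevant when deduplicating URLs.
function normalizeCrawlUrl(url: string): string {
  try {
    const urlObj = new URL(url);
    urlObj.searchParams.sort(); // "?b=2&a=1" and "?a=1&b=2" become the same string
    return urlObj.toString();
  } catch {
    return url; // leave unparseable URLs untouched, as the method does
  }
}

const visited = new Set<string>();
for (const link of [
  "https://example.com/page?b=2&a=1",
  "https://example.com/page?a=1&b=2",
]) {
  visited.add(normalizeCrawlUrl(link));
}
console.log(visited.size); // 1: both orderings collapse to a single visited entry
```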

View File

@@ -31,6 +31,7 @@ export class WebScraperDataProvider {
   private limit: number = 10000;
   private concurrentRequests: number = 20;
   private generateImgAltText: boolean = false;
+  private ignoreSitemap: boolean = false;
   private pageOptions?: PageOptions;
   private extractorOptions?: ExtractorOptions;
   private replaceAllPathsWithAbsolutePaths?: boolean = false;
@@ -38,6 +39,7 @@ export class WebScraperDataProvider {
     "gpt-4-turbo";
   private crawlerMode: string = "default";
 
   authorize(): void {
     throw new Error("Method not implemented.");
   }
@@ -173,6 +175,10 @@
     let links = await crawler.start(
       inProgress,
+      this.pageOptions,
+      {
+        ignoreSitemap: this.ignoreSitemap,
+      },
       5,
       this.limit,
       this.maxCrawledDepth
@@ -473,6 +479,7 @@
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
     this.crawlerMode = options.crawlerOptions?.mode ?? "default";
+    this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
 
     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {
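The provider changes above only thread the new settings through: ignoreSitemap is read from crawlerOptions with a false default and handed to crawler.start together with pageOptions. A simplified sketch of that propagation; SketchProvider and SketchCrawler are made-up names, and the start signature is reduced to the parameters relevant here:

```ts
// Simplified sketch of how the provider forwards the new options.
// The real WebScraperDataProvider and WebCrawler carry far more state than this.
type PageOptions = { includeHtml?: boolean };
type CrawlerOptions = { ignoreSitemap?: boolean };

class SketchCrawler {
  async start(pageOptions?: PageOptions, crawlerOptions?: CrawlerOptions): Promise<void> {
    if (!crawlerOptions?.ignoreSitemap) {
      console.log("would try the sitemap first");
    }
    console.log("first page scraped with", pageOptions);
  }
}

class SketchProvider {
  private ignoreSitemap = false;
  private pageOptions?: PageOptions;

  setOptions(options: { crawlerOptions?: CrawlerOptions; pageOptions?: PageOptions }): void {
    // Same defaulting pattern as the diff: an absent flag means "use the sitemap".
    this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
    this.pageOptions = options.pageOptions;
  }

  async run(): Promise<void> {
    const crawler = new SketchCrawler();
    await crawler.start(this.pageOptions, { ignoreSitemap: this.ignoreSitemap });
  }
}

const provider = new SketchProvider();
provider.setOptions({ crawlerOptions: { ignoreSitemap: true }, pageOptions: { includeHtml: true } });
provider.run().catch(console.error);
```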

View File

@@ -12,6 +12,7 @@ export async function getLinksFromSitemap(
     content = response.data;
   } catch (error) {
     console.error(`Request failed for ${sitemapUrl}: ${error}`);
     return allUrls;
   }