v-firecrawl/apps/api/src/scraper/WebScraper/crawler.ts

import axios from "axios";
import cheerio, { load } from "cheerio";
import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
import async from "async";
import { Progress } from "../../lib/entities";
import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
import robotsParser from "robots-parser";

/**
 * Discovers the pages of a single website.
 *
 * `start()` first tries to read the site's sitemap; when no sitemap links are
 * found it falls back to following `<a href>` links from the initial URL.
 * Every candidate URL is checked against robots.txt, the include/exclude
 * regular expressions, a maximum path depth and the initial URL's host/path.
 */
export class WebCrawler {
  /** User-agent token passed to every robots.txt check. */
  private static readonly ROBOTS_USER_AGENT = "FireCrawlAgent";

  /**
   * Path suffixes that are never fetched as HTML pages.
   * ".pdf" is deliberately absent, so PDF links are not skipped by `isFile`.
   */
  private static readonly FILE_EXTENSIONS: readonly string[] = [
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".css",
    ".js",
    ".ico",
    ".svg",
    // ".pdf",
    ".zip",
    ".exe",
    ".dmg",
    ".mp4",
    ".mp3",
    ".pptx",
    ".docx",
    ".xlsx",
    ".xml",
  ];

  /** Substrings that mark a URL as a social-media or e-mail link. */
  private static readonly SOCIAL_MEDIA_OR_EMAIL: readonly string[] = [
    "facebook.com",
    "twitter.com",
    "linkedin.com",
    "instagram.com",
    "pinterest.com",
    "mailto:",
  ];

  private initialUrl: string;
  /** Origin (protocol + host + optional port) of `initialUrl`. */
  private baseUrl: string;
  /** Regular-expression sources; a path must match at least one (when non-empty). */
  private includes: string[];
  /** Regular-expression sources; a path matching any of them is dropped. */
  private excludes: string[];
  private maxCrawledLinks: number;
  private maxCrawledDepth: number;
  /** URLs already handed to `crawl`, exactly as they were passed in. */
  private visited: Set<string> = new Set();
  /** Discovered URL -> HTML stored alongside it by `crawlUrls`. */
  private crawledUrls: Map<string, string> = new Map();
  private limit: number;
  private robotsTxtUrl: string;
  private robots: any;
  private generateImgAltText: boolean;

  /**
   * @param initialUrl Absolute URL where crawling starts; it must be parseable
   *   by `new URL`, otherwise the constructor throws.
   * @param includes Regular-expression sources tested against URL paths.
   * @param excludes Regular-expression sources tested against URL paths.
   * @param maxCrawledLinks Deprecated, use `limit` instead. Falls back to `limit`.
   * @param limit Maximum number of links (default 10000).
   * @param generateImgAltText Stored on the instance; not read inside this class.
   * @param maxCrawledDepth Maximum path depth for crawled links (default 10).
   */
  constructor({
    initialUrl,
    includes,
    excludes,
    maxCrawledLinks,
    limit = 10000,
    generateImgAltText = false,
    maxCrawledDepth = 10,
  }: {
    initialUrl: string;
    includes?: string[];
    excludes?: string[];
    maxCrawledLinks?: number;
    limit?: number;
    generateImgAltText?: boolean;
    maxCrawledDepth?: number;
  }) {
    this.initialUrl = initialUrl;
    this.baseUrl = new URL(initialUrl).origin;
    this.includes = includes ?? [];
    this.excludes = excludes ?? [];
    this.limit = limit;
    this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
    // Start with an empty rule set; `start()` replaces it with the fetched robots.txt.
    this.robots = robotsParser(this.robotsTxtUrl, "");
    // Deprecated, use limit instead
    this.maxCrawledLinks = maxCrawledLinks ?? limit;
    this.maxCrawledDepth = maxCrawledDepth ?? 10;
    this.generateImgAltText = generateImgAltText ?? false;
  }

  /**
   * Keeps only the links that may be returned to the caller.
   *
   * A link survives when it parses as a URL, is not deeper than `maxDepth`,
   * passes the exclude/include patterns, lives on the initial URL's host
   * (ignoring a leading "www."), has a path starting with the initial URL's
   * path, and is not disallowed by robots.txt.
   *
   * @param sitemapLinks Candidate absolute URLs. Malformed entries are skipped.
   * @param limit Maximum number of links returned.
   * @param maxDepth Maximum number of "/"-separated path segments.
   * @returns The accepted links, in input order, truncated to `limit`.
   */
  private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
    // Invariant across all links, so parse it once.
    const initial = new URL(this.initialUrl);
    const initialHostname = initial.hostname.replace(/^www\./, '');

    return sitemapLinks
      .filter((link) => {
        let parsed: URL;
        try {
          parsed = new URL(link);
        } catch {
          // One malformed entry must not abort the whole crawl.
          return false;
        }
        const path = parsed.pathname;

        // Check if the link exceeds the maximum depth allowed
        const depth = path.split('/').length - 1;
        if (depth > maxDepth) {
          return false;
        }

        // Check if the link should be excluded
        if (this.matchesExcludes(path)) {
          return false;
        }

        // Check if the link matches the include patterns, if any are specified
        if (!this.matchesIncludes(path)) {
          return false;
        }

        // Ensure the hostname matches (www and non-www are treated alike) and
        // the path starts with the initial URL's path
        const linkHostname = parsed.hostname.replace(/^www\./, '');
        if (linkHostname !== initialHostname || !path.startsWith(initial.pathname)) {
          return false;
        }

        // A nullish answer from the robots parser is treated as "allowed" here.
        const isAllowed = this.robots.isAllowed(link, WebCrawler.ROBOTS_USER_AGENT) ?? true;
        if (!isAllowed) {
          console.log(`Link disallowed by robots.txt: ${link}`);
          return false;
        }

        return true;
      })
      .slice(0, limit);
  }

  /**
   * Runs the discovery: robots.txt, then sitemap, then link crawling.
   *
   * @param inProgress Optional callback invoked after each crawled page.
   * @param concurrencyLimit Concurrency handed to the crawl queue.
   * @param limit Maximum number of URLs returned.
   * @param maxDepth Maximum path depth applied to sitemap links. Crawled links
   *   are filtered with the instance's `maxCrawledDepth` instead.
   * @returns The accepted URLs. `html` is "" for sitemap results and for the
   *   bare initial-URL fallback.
   */
  public async start(
    inProgress?: (progress: Progress) => void,
    concurrencyLimit: number = 5,
    limit: number = 10000,
    maxDepth: number = 10
  ): Promise<{ url: string, html: string }[]> {
    // Fetch and parse robots.txt; on failure the empty rule set stays in place.
    try {
      const response = await axios.get(this.robotsTxtUrl);
      this.robots = robotsParser(this.robotsTxtUrl, response.data);
    } catch {
      console.error(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
    }

    const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
    if (sitemapLinks.length > 0) {
      const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
      return filteredLinks.map(link => ({ url: link, html: "" }));
    }

    const urls = await this.crawlUrls(
      [this.initialUrl],
      concurrencyLimit,
      inProgress
    );
    if (
      urls.length === 0 &&
      this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
    ) {
      return [{ url: this.initialUrl, html: "" }];
    }

    // make sure to run include exclude here again
    const htmlByUrl = new Map<string, string>();
    for (const page of urls) {
      htmlByUrl.set(page.url, page.html);
    }
    const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
    return filteredUrls.map(url => ({ url, html: htmlByUrl.get(url) || "" }));
  }

  /**
   * Crawls the given URLs through a queue and recurses into the links each
   * page yields, until `maxCrawledLinks` entries have been collected.
   *
   * @param urls URLs to crawl; visited or robots-disallowed ones are skipped.
   * @param concurrencyLimit Concurrency of the queue created for this call.
   * @param inProgress Optional progress callback.
   * @returns Everything accumulated in `crawledUrls` so far (instance-wide,
   *   not only the results of this call).
   */
  private async crawlUrls(
    urls: string[],
    concurrencyLimit: number,
    inProgress?: (progress: Progress) => void,
  ): Promise<{ url: string, html: string }[]> {
    const queue = async.queue(async (task: string, callback) => {
      // The queue may or may not supply a completion callback; call it only if present.
      const done = () => {
        if (callback && typeof callback === "function") {
          callback();
        }
      };

      if (this.crawledUrls.size >= this.maxCrawledLinks) {
        done();
        return;
      }

      const newUrls = await this.crawl(task);
      newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));

      if (inProgress) {
        inProgress({
          current: this.crawledUrls.size,
          total: this.maxCrawledLinks,
          status: "SCRAPING",
          // Report the last discovered link, or the page itself when it yielded none.
          currentDocumentUrl:
            newUrls.length > 0 ? newUrls[newUrls.length - 1].url : task,
        });
      }

      await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
      done();
    }, concurrencyLimit);

    queue.push(
      urls.filter(
        (url) =>
          !this.visited.has(url) &&
          this.robots.isAllowed(url, WebCrawler.ROBOTS_USER_AGENT)
      ),
      (err) => {
        if (err) console.error(err);
      }
    );
    await queue.drain();
    return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
  }

  /**
   * Fetches one page and extracts the internal links it contains.
   *
   * The very first page crawled by this instance is fetched with
   * `scrapSingleUrl`; all later pages are fetched with a plain `axios.get`.
   *
   * @param url Page to fetch. "https://" is prepended when the string does not
   *   start with "http", and one trailing "/" is removed before fetching.
   * @returns The accepted links found on the page. For the first page the
   *   page itself is included and already-visited links are not removed.
   *   Returns [] when the URL was already visited, is disallowed, is a
   *   file/social/e-mail URL, or the fetch fails.
   */
  async crawl(url: string): Promise<{url: string, html: string}[]> {
    if (
      this.visited.has(url) ||
      !this.robots.isAllowed(url, WebCrawler.ROBOTS_USER_AGENT)
    ) {
      return [];
    }
    this.visited.add(url);
    // Decide this once, before any await, so later additions to `visited`
    // cannot change how this page is treated halfway through.
    const isFirstPage = this.visited.size === 1;

    if (!url.startsWith("http")) {
      url = "https://" + url;
    }
    // Relative hrefs resolve against the document URL; keep the form that
    // still has its trailing slash, because stripping it changes the result.
    const documentUrl = url;
    if (url.endsWith("/")) {
      url = url.slice(0, -1);
    }

    if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
      return [];
    }

    try {
      let content: string = "";
      // If it is the first link, fetch with single url
      if (isFirstPage) {
        const page = await scrapSingleUrl(url, { includeHtml: true });
        content = page.html ?? "";
      } else {
        const response = await axios.get(url);
        content = response.data ?? "";
      }
      const $ = load(content);
      const links: {url: string, html: string}[] = [];

      // Add the initial URL to the list of links
      if (isFirstPage) {
        links.push({ url, html: content });
      }

      $("a").each((_, element) => {
        const href = $(element).attr("href");
        if (!href) {
          return;
        }

        let fullUrl: string;
        let path: string;
        try {
          // Absolute http(s) hrefs are kept verbatim; everything else is
          // resolved against the page it was found on.
          fullUrl = /^https?:\/\//.test(href)
            ? href
            : new URL(href, documentUrl).toString();
          path = new URL(fullUrl).pathname;
        } catch {
          // A single unparseable href must not discard the page's other links.
          return;
        }

        if (
          this.isInternalLink(fullUrl) &&
          this.matchesPattern(fullUrl) &&
          this.noSections(fullUrl) &&
          this.matchesIncludes(path) &&
          !this.matchesExcludes(path) &&
          this.robots.isAllowed(fullUrl, WebCrawler.ROBOTS_USER_AGENT)
        ) {
          // NOTE(review): each discovered link is stored with the HTML of the
          // page it was found on, not its own HTML - confirm consumers expect this.
          links.push({ url: fullUrl, html: content });
        }
      });

      if (isFirstPage) {
        return links;
      }
      // Create a new list to return to avoid modifying the visited list
      return links.filter((link) => !this.visited.has(link.url));
    } catch {
      // Best effort: a page that cannot be fetched or parsed yields no links.
      return [];
    }
  }

  /**
   * @param url Path to test.
   * @returns true when no include patterns are configured (empty list or a
   *   leading "" entry) or when at least one pattern matches.
   */
  private matchesIncludes(url: string): boolean {
    if (this.includes.length === 0 || this.includes[0] === "") return true;
    return this.includes.some((pattern) => new RegExp(pattern).test(url));
  }

  /**
   * @param url Path to test.
   * @returns true when exclude patterns are configured and at least one matches.
   */
  private matchesExcludes(url: string): boolean {
    if (this.excludes.length === 0 || this.excludes[0] === "") return false;
    return this.excludes.some((pattern) => new RegExp(pattern).test(url));
  }

  /** @returns true when the link contains no "#" character. */
  private noSections(link: string): boolean {
    return !link.includes("#");
  }

  /**
   * @param link Absolute URL, or a URL relative to `baseUrl`.
   * @returns true when the link's host (hostname plus any non-default port)
   *   equals the host of `baseUrl`.
   */
  private isInternalLink(link: string): boolean {
    const urlObj = new URL(link, this.baseUrl);
    const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, "");
    // `baseUrl` is an origin and keeps a non-default port, so compare against
    // `host`; `hostname` drops the port and would never match such a site.
    return urlObj.host === domainWithoutProtocol;
  }

  /** Always true. Placeholder for future pattern matching implementation. */
  private matchesPattern(link: string): boolean {
    return true;
  }

  /**
   * @param url URL to classify.
   * @returns true when the URL's path ends with one of `FILE_EXTENSIONS`,
   *   ignoring letter case and any query string or fragment.
   */
  private isFile(url: string): boolean {
    let path = url;
    try {
      path = new URL(url).pathname;
    } catch {
      // Not an absolute URL: fall back to testing the raw string.
    }
    const lowerPath = path.toLowerCase();
    return WebCrawler.FILE_EXTENSIONS.some((ext) => lowerPath.endsWith(ext));
  }

  /** @returns true when the URL contains any `SOCIAL_MEDIA_OR_EMAIL` substring. */
  private isSocialMediaOrEmail(url: string): boolean {
    return WebCrawler.SOCIAL_MEDIA_OR_EMAIL.some((ext) => url.includes(ext));
  }

  /**
   * Downloads one sitemap and returns its links.
   *
   * @param sitemapUrl Absolute URL of the sitemap.
   * @returns The links reported by `getLinksFromSitemap`, or [] when the
   *   request fails, does not answer 200, or the sitemap cannot be read.
   */
  private async fetchSitemap(sitemapUrl: string): Promise<string[]> {
    try {
      const response = await axios.get(sitemapUrl);
      if (response.status === 200) {
        return await getLinksFromSitemap(sitemapUrl);
      }
    } catch {
      // Best effort: a missing or unreadable sitemap is not an error.
    }
    return [];
  }

  /**
   * Looks for a sitemap next to `url`, then at the site root.
   *
   * @param url The URL crawling starts from.
   * @returns The sitemap links, with `url` appended when the sitemap does not
   *   already list it; [] when no sitemap links were found.
   */
  private async tryFetchSitemapLinks(url: string): Promise<string[]> {
    // Strips protocol, a leading "www." and one trailing "/" for comparison.
    const normalizeUrl = (value: string) => {
      value = value.replace(/^https?:\/\//, "").replace(/^www\./, "");
      if (value.endsWith("/")) {
        value = value.slice(0, -1);
      }
      return value;
    };

    // Drop trailing slashes first so the result never contains "//sitemap.xml".
    const sitemapUrl = url.endsWith("/sitemap.xml")
      ? url
      : `${url.replace(/\/+$/, "")}/sitemap.xml`;

    let sitemapLinks = await this.fetchSitemap(sitemapUrl);

    if (sitemapLinks.length === 0) {
      // If the first one doesn't work, try the base URL
      sitemapLinks = await this.fetchSitemap(`${this.baseUrl}/sitemap.xml`);
    }

    // Normalize and check if the URL is present in any of the sitemaps
    const normalizedUrl = normalizeUrl(url);
    const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));

    // Only add the initial URL when a sitemap was found: an empty result must
    // stay empty so that `start()` falls back to crawling.
    if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {
      // do not push the normalized url
      sitemapLinks.push(url);
    }

    return sitemapLinks;
  }
}
Initial commit 2024-04-15 17:01:47 -04:00			`import axios from "axios";`
			`import cheerio, { load } from "cheerio";`
			`import { URL } from "url";`
			`import { getLinksFromSitemap } from "./sitemap";`
			`import async from "async";`
			`import { Progress } from "../../lib/entities";`
Nick: 4x speed 2024-05-13 23:45:11 -04:00			`import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";`
Initial commit 2024-04-15 17:01:47 -04:00			`import robotsParser from "robots-parser";`

			`export class WebCrawler {`
			`private initialUrl: string;`
			`private baseUrl: string;`
			`private includes: string[];`
			`private excludes: string[];`
			`private maxCrawledLinks: number;`
Added max depth option 2024-05-07 10:06:26 -04:00			`private maxCrawledDepth: number;`
Initial commit 2024-04-15 17:01:47 -04:00			`private visited: Set<string> = new Set();`
Nick: 2024-05-14 15:12:40 -04:00			`private crawledUrls: Map<string, string> = new Map();`
Initial commit 2024-04-15 17:01:47 -04:00			`private limit: number;`
			`private robotsTxtUrl: string;`
			`private robots: any;`
Nick: 2024-04-16 12:49:14 -04:00			`private generateImgAltText: boolean;`
Initial commit 2024-04-15 17:01:47 -04:00
			`constructor({`
			`initialUrl,`
			`includes,`
			`excludes,`
			`maxCrawledLinks,`
			`limit = 10000,`
Nick: 2024-04-16 12:49:14 -04:00			`generateImgAltText = false,`
Added max depth option 2024-05-07 10:06:26 -04:00			`maxCrawledDepth = 10,`
Initial commit 2024-04-15 17:01:47 -04:00			`}: {`
			`initialUrl: string;`
			`includes?: string[];`
			`excludes?: string[];`
			`maxCrawledLinks?: number;`
			`limit?: number;`
Nick: 2024-04-16 12:49:14 -04:00			`generateImgAltText?: boolean;`
Added max depth option 2024-05-07 10:06:26 -04:00			`maxCrawledDepth?: number;`
Initial commit 2024-04-15 17:01:47 -04:00			`}) {`
			`this.initialUrl = initialUrl;`
			`this.baseUrl = new URL(initialUrl).origin;`
			`this.includes = includes ?? [];`
			`this.excludes = excludes ?? [];`
			`this.limit = limit;`
			this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
			`this.robots = robotsParser(this.robotsTxtUrl, "");`
			`// Deprecated, use limit instead`
			`this.maxCrawledLinks = maxCrawledLinks ?? limit;`
Added max depth option 2024-05-07 10:06:26 -04:00			`this.maxCrawledDepth = maxCrawledDepth ?? 10;`
Nick: 2024-04-16 12:49:14 -04:00			`this.generateImgAltText = generateImgAltText ?? false;`
Initial commit 2024-04-15 17:01:47 -04:00			`}`

Added max depth option 2024-05-07 10:06:26 -04:00			`private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {`
Initial commit 2024-04-15 17:01:47 -04:00			`return sitemapLinks`
			`.filter((link) => {`
			`const url = new URL(link);`
			`const path = url.pathname;`
Added max depth option 2024-05-07 10:06:26 -04:00			`const depth = url.pathname.split('/').length - 1;`

			`// Check if the link exceeds the maximum depth allowed`
			`if (depth > maxDepth) {`
			`return false;`
			`}`
Initial commit 2024-04-15 17:01:47 -04:00
			`// Check if the link should be excluded`
			`if (this.excludes.length > 0 && this.excludes[0] !== "") {`
			`if (`
			`this.excludes.some((excludePattern) =>`
			`new RegExp(excludePattern).test(path)`
			`)`
			`) {`
			`return false;`
			`}`
			`}`

			`// Check if the link matches the include patterns, if any are specified`
			`if (this.includes.length > 0 && this.includes[0] !== "") {`
Nick: fixes most of it 2024-05-15 18:30:37 -04:00			`if (!this.includes.some((includePattern) =>`
Initial commit 2024-04-15 17:01:47 -04:00			`new RegExp(includePattern).test(path)`
Nick: fixes most of it 2024-05-15 18:30:37 -04:00			`)) {`
			`return false;`
			`}`
			`}`

			`// Normalize the initial URL and the link to account for www and non-www versions`
			`const normalizedInitialUrl = new URL(this.initialUrl);`
			`const normalizedLink = new URL(link);`
			`const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, '');`
			`const linkHostname = normalizedLink.hostname.replace(/^www\./, '');`

			`// Ensure the protocol and hostname match, and the path starts with the initial URL's path`
			`if (linkHostname !== initialHostname \|\| !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {`
			`return false;`
Initial commit 2024-04-15 17:01:47 -04:00			`}`

			`const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;`
			`// Check if the link is disallowed by robots.txt`
			`if (!isAllowed) {`
			console.log(`Link disallowed by robots.txt: ${link}`);
			`return false;`
			`}`

			`return true;`
			`})`
			`.slice(0, limit);`
			`}`

			`public async start(`
			`inProgress?: (progress: Progress) => void,`
			`concurrencyLimit: number = 5,`
Added max depth option 2024-05-07 10:06:26 -04:00			`limit: number = 10000,`
			`maxDepth: number = 10`
Nick: 4x speed 2024-05-13 23:45:11 -04:00			`): Promise<{ url: string, html: string }[]> {`
Initial commit 2024-04-15 17:01:47 -04:00			`// Fetch and parse robots.txt`
			`try {`
			`const response = await axios.get(this.robotsTxtUrl);`
			`this.robots = robotsParser(this.robotsTxtUrl, response.data);`
			`} catch (error) {`
			console.error(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
Nick: fixes most of it 2024-05-15 18:30:37 -04:00
Initial commit 2024-04-15 17:01:47 -04:00			`}`

Nick: fixes most of it 2024-05-15 18:30:37 -04:00
Initial commit 2024-04-15 17:01:47 -04:00			`const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);`
			`if (sitemapLinks.length > 0) {`
Fixing child links only bug 2024-05-15 17:35:09 -04:00			`let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);`
Nick: 4x speed 2024-05-13 23:45:11 -04:00			`return filteredLinks.map(link => ({ url: link, html: "" }));`
Initial commit 2024-04-15 17:01:47 -04:00			`}`

			`const urls = await this.crawlUrls(`
			`[this.initialUrl],`
			`concurrencyLimit,`
			`inProgress`
			`);`
			`if (`
			`urls.length === 0 &&`
Added max depth option 2024-05-07 10:06:26 -04:00			`this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0`
Initial commit 2024-04-15 17:01:47 -04:00			`) {`
Nick: 4x speed 2024-05-13 23:45:11 -04:00			`return [{ url: this.initialUrl, html: "" }];`
Initial commit 2024-04-15 17:01:47 -04:00			`}`

Nick: working 2024-05-15 20:13:04 -04:00
Initial commit 2024-04-15 17:01:47 -04:00			`// make sure to run include exclude here again`
Nick: 4x speed 2024-05-13 23:45:11 -04:00			`const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);`
			`return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html \|\| "" }));`
Initial commit 2024-04-15 17:01:47 -04:00			`}`

			`private async crawlUrls(`
			`urls: string[],`
			`concurrencyLimit: number,`
Nick: working 2024-05-15 20:13:04 -04:00			`inProgress?: (progress: Progress) => void,`
Nick: 4x speed 2024-05-13 23:45:11 -04:00			`): Promise<{ url: string, html: string }[]> {`
Initial commit 2024-04-15 17:01:47 -04:00			`const queue = async.queue(async (task: string, callback) => {`
Nick: 2024-05-14 15:04:36 -04:00			`if (this.crawledUrls.size >= this.maxCrawledLinks) {`
Initial commit 2024-04-15 17:01:47 -04:00			`if (callback && typeof callback === "function") {`
			`callback();`
			`}`
			`return;`
			`}`
			`const newUrls = await this.crawl(task);`
Nick: working 2024-05-15 20:13:04 -04:00			`// add the initial url if not already added`
			`// if (this.visited.size === 1) {`
			`// let normalizedInitial = this.initialUrl;`
			`// if (!normalizedInitial.endsWith("/")) {`
			`// normalizedInitial = normalizedInitial + "/";`
			`// }`
			`// if (!newUrls.some(page => page.url === this.initialUrl)) {`
			`// newUrls.push({ url: this.initialUrl, html: "" });`
			`// }`
			`// }`


Nick: 2024-05-14 15:12:40 -04:00			`newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));`
Nick: working 2024-05-15 20:13:04 -04:00
Initial commit 2024-04-15 17:01:47 -04:00			`if (inProgress && newUrls.length > 0) {`
			`inProgress({`
Nick: 2024-05-14 15:04:36 -04:00			`current: this.crawledUrls.size,`
Initial commit 2024-04-15 17:01:47 -04:00			`total: this.maxCrawledLinks,`
			`status: "SCRAPING",`
Nick: 4x speed 2024-05-13 23:45:11 -04:00			`currentDocumentUrl: newUrls[newUrls.length - 1].url,`
Initial commit 2024-04-15 17:01:47 -04:00			`});`
			`} else if (inProgress) {`
			`inProgress({`
Nick: 2024-05-14 15:04:36 -04:00			`current: this.crawledUrls.size,`
Initial commit 2024-04-15 17:01:47 -04:00			`total: this.maxCrawledLinks,`
			`status: "SCRAPING",`
			`currentDocumentUrl: task,`
			`});`
			`}`
Nick: 4x speed 2024-05-13 23:45:11 -04:00			`await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);`
Initial commit 2024-04-15 17:01:47 -04:00			`if (callback && typeof callback === "function") {`
			`callback();`
			`}`
			`}, concurrencyLimit);`

			`queue.push(`
			`urls.filter(`
			`(url) =>`
			`!this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent")`
			`),`
			`(err) => {`
			`if (err) console.error(err);`
			`}`
			`);`
			`await queue.drain();`
Nick: 2024-05-14 15:12:40 -04:00			`return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));`
Initial commit 2024-04-15 17:01:47 -04:00			`}`

Nick: 4x speed 2024-05-13 23:45:11 -04:00			`async crawl(url: string): Promise<{url: string, html: string}[]> {`
Nick: working 2024-05-15 20:13:04 -04:00			`if (this.visited.has(url) \|\| !this.robots.isAllowed(url, "FireCrawlAgent")){`
Initial commit 2024-04-15 17:01:47 -04:00			`return [];`
Nick: working 2024-05-15 20:13:04 -04:00			`}`
Initial commit 2024-04-15 17:01:47 -04:00			`this.visited.add(url);`
Nick: working 2024-05-15 20:13:04 -04:00

Initial commit 2024-04-15 17:01:47 -04:00			`if (!url.startsWith("http")) {`
			`url = "https://" + url;`
Nick: working 2024-05-15 20:13:04 -04:00
Initial commit 2024-04-15 17:01:47 -04:00			`}`
			`if (url.endsWith("/")) {`
			`url = url.slice(0, -1);`
Nick: working 2024-05-15 20:13:04 -04:00
Initial commit 2024-04-15 17:01:47 -04:00			`}`
Nick: working 2024-05-15 20:13:04 -04:00
Initial commit 2024-04-15 17:01:47 -04:00			`if (this.isFile(url) \|\| this.isSocialMediaOrEmail(url)) {`
			`return [];`
			`}`

			`try {`
Nick: 4x speed 2024-05-13 23:45:11 -04:00			`let content : string = "";`
			`// If it is the first link, fetch with single url`
Initial commit 2024-04-15 17:01:47 -04:00			`if (this.visited.size === 1) {`
Nick: 4x speed 2024-05-13 23:45:11 -04:00			`const page = await scrapSingleUrl(url, {includeHtml: true});`
			`content = page.html ?? ""`
Initial commit 2024-04-15 17:01:47 -04:00			`} else {`
			`const response = await axios.get(url);`
Nick: 4x speed 2024-05-13 23:45:11 -04:00			`content = response.data ?? "";`
Initial commit 2024-04-15 17:01:47 -04:00			`}`
			`const $ = load(content);`
Nick: 4x speed 2024-05-13 23:45:11 -04:00			`let links: {url: string, html: string}[] = [];`
Initial commit 2024-04-15 17:01:47 -04:00
Nick: working 2024-05-15 20:13:04 -04:00			`// Add the initial URL to the list of links`
			`if(this.visited.size === 1)`
			`{`
			`links.push({url, html: content});`
			`}`


Initial commit 2024-04-15 17:01:47 -04:00			`$("a").each((_, element) => {`
			`const href = $(element).attr("href");`
			`if (href) {`
			`let fullUrl = href;`
			`if (!href.startsWith("http")) {`
			`fullUrl = new URL(href, this.baseUrl).toString();`
			`}`
			`const url = new URL(fullUrl);`
			`const path = url.pathname;`

			`if (`
			`this.isInternalLink(fullUrl) &&`
			`this.matchesPattern(fullUrl) &&`
			`this.noSections(fullUrl) &&`
			`this.matchesIncludes(path) &&`
			`!this.matchesExcludes(path) &&`
			`this.robots.isAllowed(fullUrl, "FireCrawlAgent")`
			`) {`
Nick: 4x speed 2024-05-13 23:45:11 -04:00			`links.push({url: fullUrl, html: content});`
Initial commit 2024-04-15 17:01:47 -04:00			`}`
			`}`
			`});`

Nick: working 2024-05-15 20:13:04 -04:00			`if(this.visited.size === 1){`
			`return links;`
			`}`
Nick: 4x speed 2024-05-13 23:45:11 -04:00			`// Create a new list to return to avoid modifying the visited list`
Nick: 2024-05-14 00:10:58 -04:00			`return links.filter((link) => !this.visited.has(link.url));`
Initial commit 2024-04-15 17:01:47 -04:00			`} catch (error) {`
			`return [];`
			`}`
			`}`

			`private matchesIncludes(url: string): boolean {`
			`if (this.includes.length === 0 \|\| this.includes[0] == "") return true;`
			`return this.includes.some((pattern) => new RegExp(pattern).test(url));`
			`}`

			`private matchesExcludes(url: string): boolean {`
			`if (this.excludes.length === 0 \|\| this.excludes[0] == "") return false;`
			`return this.excludes.some((pattern) => new RegExp(pattern).test(url));`
			`}`

			`private noSections(link: string): boolean {`
			`return !link.includes("#");`
			`}`

			`private isInternalLink(link: string): boolean {`
			`const urlObj = new URL(link, this.baseUrl);`
			`const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, "");`
			`return urlObj.hostname === domainWithoutProtocol;`
			`}`

			`private matchesPattern(link: string): boolean {`
			`return true; // Placeholder for future pattern matching implementation`
			`}`

			`private isFile(url: string): boolean {`
			`const fileExtensions = [`
			`".png",`
			`".jpg",`
			`".jpeg",`
			`".gif",`
			`".css",`
			`".js",`
			`".ico",`
			`".svg",`
[Feat] Adding pdf parser 2024-04-18 10:43:57 -04:00			`// ".pdf",`
Initial commit 2024-04-15 17:01:47 -04:00			`".zip",`
			`".exe",`
			`".dmg",`
			`".mp4",`
			`".mp3",`
			`".pptx",`
			`".docx",`
			`".xlsx",`
			`".xml",`
			`];`
			`return fileExtensions.some((ext) => url.endsWith(ext));`
			`}`

			`private isSocialMediaOrEmail(url: string): boolean {`
			`const socialMediaOrEmail = [`
			`"facebook.com",`
			`"twitter.com",`
			`"linkedin.com",`
			`"instagram.com",`
			`"pinterest.com",`
			`"mailto:",`
			`];`
			`return socialMediaOrEmail.some((ext) => url.includes(ext));`
			`}`

Nick: working 2024-05-15 20:13:04 -04:00			`//`
Initial commit 2024-04-15 17:01:47 -04:00			`private async tryFetchSitemapLinks(url: string): Promise<string[]> {`
Nick: working 2024-05-15 20:13:04 -04:00			`const normalizeUrl = (url: string) => {`
			`url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");`
			`if (url.endsWith("/")) {`
			`url = url.slice(0, -1);`
			`}`
			`return url;`
			`};`

Initial commit 2024-04-15 17:01:47 -04:00			`const sitemapUrl = url.endsWith("/sitemap.xml")`
			`? url`
			: `${url}/sitemap.xml`;
Nick: working 2024-05-15 20:13:04 -04:00
			`let sitemapLinks: string[] = [];`

Initial commit 2024-04-15 17:01:47 -04:00			`try {`
			`const response = await axios.get(sitemapUrl);`
			`if (response.status === 200) {`
Nick: working 2024-05-15 20:13:04 -04:00			`sitemapLinks = await getLinksFromSitemap(sitemapUrl);`
Initial commit 2024-04-15 17:01:47 -04:00			`}`
			`} catch (error) {`
			`// Error handling for failed sitemap fetch`
Nick: fixes most of it 2024-05-15 18:30:37 -04:00			// console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
Initial commit 2024-04-15 17:01:47 -04:00			`}`
Nick: fixes most of it 2024-05-15 18:30:37 -04:00
Nick: working 2024-05-15 20:13:04 -04:00			`if (sitemapLinks.length === 0) {`
			`// If the first one doesn't work, try the base URL`
			const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
			`try {`
			`const response = await axios.get(baseUrlSitemap);`
			`if (response.status === 200) {`
			`sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);`
			`}`
			`} catch (error) {`
			`// Error handling for failed base URL sitemap fetch`
			// console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
Nick: fixes most of it 2024-05-15 18:30:37 -04:00			`}`
			`}`

Nick: working 2024-05-15 20:13:04 -04:00			`// Normalize and check if the URL is present in any of the sitemaps`
			`const normalizedUrl = normalizeUrl(url);`

			`const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));`

			`// has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl`
			`if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {`
			`// do not push the normalized url`
			`sitemapLinks.push(url);`
			`}`

			`return sitemapLinks;`
Initial commit 2024-04-15 17:01:47 -04:00			`}`
			`}`