Nick: 4x speed

2024-05-13 20:45:11 -07:00 · 2024-05-13 20:45:11 -07:00 · a96fc5b96d
commit a96fc5b96d
parent e26008a833
5 changed files with 90 additions and 36 deletions
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@ -44,6 +44,7 @@ export type WebScraperOptions = {
    limit?: number;
    generateImgAltText?: boolean;
    replaceAllPathsWithAbsolutePaths?: boolean;
+    fastMode?: boolean; // have a mode of some sort
  };
  pageOptions?: PageOptions;
  extractorOptions?: ExtractorOptions;
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@ -4,7 +4,7 @@ import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import async from "async";
 import { Progress } from "../../lib/entities";
-import { scrapWithScrapingBee } from "./single_url";
+import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";

 export class WebCrawler {
@ -15,11 +15,12 @@ export class WebCrawler {
  private maxCrawledLinks: number;
  private maxCrawledDepth: number;
  private visited: Set<string> = new Set();
-  private crawledUrls: Set<string> = new Set();
+  private crawledUrls: { url: string, html: string }[] = [];
  private limit: number;
  private robotsTxtUrl: string;
  private robots: any;
  private generateImgAltText: boolean;
+  private fastMode: boolean = false;

  constructor({
    initialUrl,
@ -49,9 +50,9 @@ export class WebCrawler {
    this.maxCrawledLinks = maxCrawledLinks ?? limit;
    this.maxCrawledDepth = maxCrawledDepth ?? 10;
    this.generateImgAltText = generateImgAltText ?? false;
+    this.fastMode = false;
  }

-
  private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
    return sitemapLinks
      .filter((link) => {
@ -99,7 +100,7 @@ export class WebCrawler {
    concurrencyLimit: number = 5,
    limit: number = 10000,
    maxDepth: number = 10
-  ): Promise<string[]> {
+  ): Promise<{ url: string, html: string }[]> {
    // Fetch and parse robots.txt
    try {
      const response = await axios.get(this.robotsTxtUrl);
@ -111,7 +112,7 @@ export class WebCrawler {
    const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
    if (sitemapLinks.length > 0) {
      const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
-      return filteredLinks;
+      return filteredLinks.map(link => ({ url: link, html: "" }));
    }

    const urls = await this.crawlUrls(
@ -123,43 +124,44 @@ export class WebCrawler {
      urls.length === 0 &&
      this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
    ) {
-      return [this.initialUrl];
+      return [{ url: this.initialUrl, html: "" }];
    }

    // make sure to run include exclude here again
-    return this.filterLinks(urls, limit, this.maxCrawledDepth);
+    const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
+    return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
  }

  private async crawlUrls(
    urls: string[],
    concurrencyLimit: number,
    inProgress?: (progress: Progress) => void
-  ): Promise<string[]> {
+  ): Promise<{ url: string, html: string }[]> {
    const queue = async.queue(async (task: string, callback) => {
-      if (this.crawledUrls.size >= this.maxCrawledLinks) {
+      if (this.crawledUrls.length >= this.maxCrawledLinks) {
        if (callback && typeof callback === "function") {
          callback();
        }
        return;
      }
      const newUrls = await this.crawl(task);
-      newUrls.forEach((url) => this.crawledUrls.add(url));
+      newUrls.forEach((page) => this.crawledUrls.push(page));
      if (inProgress && newUrls.length > 0) {
        inProgress({
-          current: this.crawledUrls.size,
+          current: this.crawledUrls.length,
          total: this.maxCrawledLinks,
          status: "SCRAPING",
-          currentDocumentUrl: newUrls[newUrls.length - 1],
+          currentDocumentUrl: newUrls[newUrls.length - 1].url,
        });
      } else if (inProgress) {
        inProgress({
-          current: this.crawledUrls.size,
+          current: this.crawledUrls.length,
          total: this.maxCrawledLinks,
          status: "SCRAPING",
          currentDocumentUrl: task,
        });
      }
-      await this.crawlUrls(newUrls, concurrencyLimit, inProgress);
+      await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
      if (callback && typeof callback === "function") {
        callback();
      }
@ -175,10 +177,10 @@ export class WebCrawler {
      }
    );
    await queue.drain();
-    return Array.from(this.crawledUrls);
+    return this.crawledUrls;
  }

-  async crawl(url: string): Promise<string[]> {
+  async crawl(url: string): Promise<{url: string, html: string}[]> {
    if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent"))
      return [];
    this.visited.add(url);
@ -193,16 +195,17 @@ export class WebCrawler {
    }

    try {
-      let content;
-      // If it is the first link, fetch with scrapingbee
+      let content : string = "";
+      // If it is the first link, fetch it with scrapSingleUrl
      if (this.visited.size === 1) {
-        content = await scrapWithScrapingBee(url, "load");
+        const page = await scrapSingleUrl(url, {includeHtml: true});
+        content = page.html ?? ""
      } else {
        const response = await axios.get(url);
-        content = response.data;
+        content = response.data ?? "";
      }
      const $ = load(content);
-      let links: string[] = [];
+      let links: {url: string, html: string}[] = [];

      $("a").each((_, element) => {
        const href = $(element).attr("href");
@ -215,7 +218,6 @@ export class WebCrawler {
          const path = url.pathname;

          if (
-            // fullUrl.startsWith(this.initialUrl) && // this condition makes it stop crawling back the url
            this.isInternalLink(fullUrl) &&
            this.matchesPattern(fullUrl) &&
            this.noSections(fullUrl) &&
@ -223,12 +225,14 @@ export class WebCrawler {
            !this.matchesExcludes(path) &&
            this.robots.isAllowed(fullUrl, "FireCrawlAgent")
          ) {
-            links.push(fullUrl);
+            links.push({url: fullUrl, html: content});
          }
        }
      });

-      return links.filter((link) => !this.visited.has(link));
+      // Create a new list to return to avoid modifying the visited list
+      const filteredLinks = links.filter((link) => !this.visited.has(link.url));
+      return filteredLinks;
    } catch (error) {
      return [];
    }
@ -309,3 +313,4 @@ export class WebCrawler {
    return [];
  }
 }
+
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@ -17,7 +17,20 @@ import {
 } from "./utils/replacePaths";
 import { generateCompletions } from "../../lib/LLM-extraction";
 import { getWebScraperQueue } from "../../../src/services/queue-service";
-
+import { parseMarkdown } from "../../lib/html-to-markdown";
+import cheerio from "cheerio";
+import { excludeNonMainTags } from "./utils/excludeTags";
+// Strips non-content markup from an HTML string and returns the resulting HTML.
+// - html: the HTML source to clean.
+// - pageOptions: when `onlyMainContent` is truthy, every selector listed in
+//   `excludeNonMainTags` is removed as well.
+const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
+  const soup = cheerio.load(html);
+  // Always drop scripts, styles, iframes, noscript blocks, meta tags and the head.
+  soup("script, style, iframe, noscript, meta, head").remove();
+  if (pageOptions.onlyMainContent) {
+    // remove any other tags that are not in the main content
+    excludeNonMainTags.forEach((tag) => {
+      soup(tag).remove();
+    });
+  }
+  // Serialize the pruned document back to an HTML string.
+  return soup.html();
+};
 export class WebScraperDataProvider {
  private bullJobId: string;
  private urls: string[] = [""];
@ -35,6 +48,7 @@ export class WebScraperDataProvider {
  private replaceAllPathsWithAbsolutePaths?: boolean = false;
  private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
    "gpt-4-turbo";
+  private fastMode: boolean = false;

  authorize(): void {
    throw new Error("Method not implemented.");
@ -46,7 +60,8 @@ export class WebScraperDataProvider {

  private async convertUrlsToDocuments(
    urls: string[],
-    inProgress?: (progress: Progress) => void
+    inProgress?: (progress: Progress) => void,
+    allHtmls?: string[]
  ): Promise<Document[]> {
    const totalUrls = urls.length;
    let processedUrls = 0;
@ -56,7 +71,8 @@ export class WebScraperDataProvider {
      const batchUrls = urls.slice(i, i + this.concurrentRequests);
      await Promise.all(
        batchUrls.map(async (url, index) => {
-          const result = await scrapSingleUrl(url, this.pageOptions);
+          const existingText = allHtmls ? allHtmls[i + index] : "";
+          const result = await scrapSingleUrl(url, this.pageOptions, existingText);
          processedUrls++;
          if (inProgress) {
            inProgress({
@ -139,13 +155,33 @@ export class WebScraperDataProvider {
      limit: this.limit,
      generateImgAltText: this.generateImgAltText,
    });
+    let start = Date.now();
    let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
+    console.log(links.length)
+    let end = Date.now();
+    console.log("Crawl end in seconds ", (end - start) / 1000);
+    const allLinks = links.map((e) => e.url);
+    const allHtmls = links.map((e)=> e.html);
+    console.log("All links", allLinks.length);
+    console.log("All htmls", allHtmls.length);
+
    if (this.returnOnlyUrls) {
-      return this.returnOnlyUrlsResponse(links, inProgress);
+      return this.returnOnlyUrlsResponse(allLinks , inProgress);
    }
    
-    let documents = await this.processLinks(links, inProgress);
-    return this.cacheAndFinalizeDocuments(documents, links);
+
+    let fastDocs = []
+    let documents = [];
+    // check if fast mode is enabled and there is html inside the links
+    if (this.fastMode && links.some((link) => link.html)) {
+      console.log("Fast mode enabled");
+      documents = await this.processLinks(allLinks, inProgress, allHtmls);
+
+    }else{
+      documents = await this.convertUrlsToDocuments(allLinks, inProgress, allHtmls);
+    }
+
+    return this.cacheAndFinalizeDocuments(documents, allLinks);
  }

  private async handleSingleUrlsMode(
@ -187,14 +223,17 @@ export class WebScraperDataProvider {

  private async processLinks(
    links: string[],
-    inProgress?: (progress: Progress) => void
+    inProgress?: (progress: Progress) => void,
+    allHtmls?: string[]
  ): Promise<Document[]> {
    let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
    let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
    links = links.filter((link) => !link.endsWith(".pdf"));
    
-    let documents = await this.convertUrlsToDocuments(links, inProgress);
+    let documents = await this.convertUrlsToDocuments(links, inProgress, allHtmls);
    documents = await this.getSitemapData(this.urls[0], documents);
+
+
    documents = this.applyPathReplacements(documents);
    // documents = await this.applyImgAltText(documents);

@ -238,6 +277,8 @@ export class WebScraperDataProvider {
  ): Promise<Document[]> {
    await this.setCachedDocuments(documents, links);
    documents = this.removeChildLinks(documents);
+    documents = this.filterDocsExcludeInclude(documents);
+    documents = this.filterDepth(documents);
    return documents.splice(0, this.limit);
  }

@ -397,6 +438,7 @@ export class WebScraperDataProvider {
    this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
    //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find the source of the issue, so adding this check
    this.excludes = this.excludes.filter((item) => item !== "");
+    this.fastMode = options.crawlerOptions?.fastMode ?? false;

    // make sure all urls start with https://
    this.urls = this.urls.map((url) => {
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@ -106,7 +106,8 @@ export async function scrapWithPlaywright(url: string): Promise<string> {

 export async function scrapSingleUrl(
  urlToScrap: string,
-  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }
+  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
+  existingText: string = ""
 ): Promise<Document> {
  urlToScrap = urlToScrap.trim();

@ -197,8 +198,13 @@ export async function scrapSingleUrl(
      : ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];

    for (const scraper of scrapersInOrder) {
+      // If text coming from the crawler already exists, use it
+      if (existingText && existingText.trim().length >= 100) {
+        text = existingText;
+        break;
+      }
      [text, html] = await attemptScraping(urlToScrap, scraper);
-      if (text && text.length >= 100) break;
+      if (text && text.trim().length >= 100) break;
      console.log(`Falling back to ${scraper}`);
    }

--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@ -26,7 +26,7 @@ getWebScraperQueue().process(
        success: success,
        result: {
          links: docs.map((doc) => {
-            return { content: doc, source: doc.metadata.sourceURL };
+            return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" };
          }),
        },
        project_id: job.data.project_id,