From a96fc5b96d4e2144ed933d8a445900ec653c208a Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 13 May 2024 20:45:11 -0700
Subject: [PATCH 1/9] Nick: 4x speed

---
 apps/api/src/lib/entities.ts                  |  1 +
 apps/api/src/scraper/WebScraper/crawler.ts    | 53 ++++++++--------
 apps/api/src/scraper/WebScraper/index.ts      | 60 ++++++++++++++++---
 apps/api/src/scraper/WebScraper/single_url.ts | 10 +++-
 apps/api/src/services/queue-worker.ts         |  2 +-
 5 files changed, 90 insertions(+), 36 deletions(-)

diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index a387b54..0c34126 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -44,6 +44,7 @@ export type WebScraperOptions = {
     limit?: number;
     generateImgAltText?: boolean;
     replaceAllPathsWithAbsolutePaths?: boolean;
+    fastMode?: boolean; // have a mode of some sort
   };
   pageOptions?: PageOptions;
   extractorOptions?: ExtractorOptions;
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 0248df2..25f2e9d 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -4,7 +4,7 @@ import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import async from "async";
 import { Progress } from "../../lib/entities";
-import { scrapWithScrapingBee } from "./single_url";
+import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
 
 export class WebCrawler {
@@ -15,11 +15,12 @@ export class WebCrawler {
   private maxCrawledLinks: number;
   private maxCrawledDepth: number;
   private visited: Set<string> = new Set();
-  private crawledUrls: Set<string> = new Set();
+  private crawledUrls: { url: string, html: string }[] = [];
   private limit: number;
   private robotsTxtUrl: string;
   private robots: any;
   private generateImgAltText: boolean;
+  private fastMode: boolean = false;
 
   constructor({
     initialUrl,
@@ -49,9 +50,9 @@ export class WebCrawler {
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
     this.maxCrawledDepth = maxCrawledDepth ?? 10;
     this.generateImgAltText = generateImgAltText ??
false; + this.fastMode = false; } - private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { return sitemapLinks .filter((link) => { @@ -99,7 +100,7 @@ export class WebCrawler { concurrencyLimit: number = 5, limit: number = 10000, maxDepth: number = 10 - ): Promise { + ): Promise<{ url: string, html: string }[]> { // Fetch and parse robots.txt try { const response = await axios.get(this.robotsTxtUrl); @@ -111,7 +112,7 @@ export class WebCrawler { const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); if (sitemapLinks.length > 0) { const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); - return filteredLinks; + return filteredLinks.map(link => ({ url: link, html: "" })); } const urls = await this.crawlUrls( @@ -123,43 +124,44 @@ export class WebCrawler { urls.length === 0 && this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0 ) { - return [this.initialUrl]; + return [{ url: this.initialUrl, html: "" }]; } // make sure to run include exclude here again - return this.filterLinks(urls, limit, this.maxCrawledDepth); + const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth); + return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" })); } private async crawlUrls( urls: string[], concurrencyLimit: number, inProgress?: (progress: Progress) => void - ): Promise { + ): Promise<{ url: string, html: string }[]> { const queue = async.queue(async (task: string, callback) => { - if (this.crawledUrls.size >= this.maxCrawledLinks) { + if (this.crawledUrls.length >= this.maxCrawledLinks) { if (callback && typeof callback === "function") { callback(); } return; } const newUrls = await this.crawl(task); - newUrls.forEach((url) => this.crawledUrls.add(url)); + newUrls.forEach((page) => this.crawledUrls.push(page)); if (inProgress && newUrls.length > 0) { inProgress({ - current: this.crawledUrls.size, + current: this.crawledUrls.length, total: this.maxCrawledLinks, status: "SCRAPING", - currentDocumentUrl: newUrls[newUrls.length - 1], + currentDocumentUrl: newUrls[newUrls.length - 1].url, }); } else if (inProgress) { inProgress({ - current: this.crawledUrls.size, + current: this.crawledUrls.length, total: this.maxCrawledLinks, status: "SCRAPING", currentDocumentUrl: task, }); } - await this.crawlUrls(newUrls, concurrencyLimit, inProgress); + await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress); if (callback && typeof callback === "function") { callback(); } @@ -175,10 +177,10 @@ export class WebCrawler { } ); await queue.drain(); - return Array.from(this.crawledUrls); + return this.crawledUrls; } - async crawl(url: string): Promise { + async crawl(url: string): Promise<{url: string, html: string}[]> { if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) return []; this.visited.add(url); @@ -193,16 +195,17 @@ export class WebCrawler { } try { - let content; - // If it is the first link, fetch with scrapingbee + let content : string = ""; + // If it is the first link, fetch with single url if (this.visited.size === 1) { - content = await scrapWithScrapingBee(url, "load"); + const page = await scrapSingleUrl(url, {includeHtml: true}); + content = page.html ?? "" } else { const response = await axios.get(url); - content = response.data; + content = response.data ?? 
""; } const $ = load(content); - let links: string[] = []; + let links: {url: string, html: string}[] = []; $("a").each((_, element) => { const href = $(element).attr("href"); @@ -215,7 +218,6 @@ export class WebCrawler { const path = url.pathname; if ( - // fullUrl.startsWith(this.initialUrl) && // this condition makes it stop crawling back the url this.isInternalLink(fullUrl) && this.matchesPattern(fullUrl) && this.noSections(fullUrl) && @@ -223,12 +225,14 @@ export class WebCrawler { !this.matchesExcludes(path) && this.robots.isAllowed(fullUrl, "FireCrawlAgent") ) { - links.push(fullUrl); + links.push({url: fullUrl, html: content}); } } }); - return links.filter((link) => !this.visited.has(link)); + // Create a new list to return to avoid modifying the visited list + const filteredLinks = links.filter((link) => !this.visited.has(link.url)); + return filteredLinks; } catch (error) { return []; } @@ -309,3 +313,4 @@ export class WebCrawler { return []; } } + diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 7ef0a10..9221666 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -17,7 +17,20 @@ import { } from "./utils/replacePaths"; import { generateCompletions } from "../../lib/LLM-extraction"; import { getWebScraperQueue } from "../../../src/services/queue-service"; - +import { parseMarkdown } from "../../lib/html-to-markdown"; +import cheerio from "cheerio"; +import { excludeNonMainTags } from "./utils/excludeTags"; +const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { + const soup = cheerio.load(html); + soup("script, style, iframe, noscript, meta, head").remove(); + if (pageOptions.onlyMainContent) { + // remove any other tags that are not in the main content + excludeNonMainTags.forEach((tag) => { + soup(tag).remove(); + }); + } + return soup.html(); +}; export class WebScraperDataProvider { private bullJobId: string; private urls: string[] = [""]; @@ -35,6 +48,7 @@ export class WebScraperDataProvider { private replaceAllPathsWithAbsolutePaths?: boolean = false; private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo"; + private fastMode: boolean = false; authorize(): void { throw new Error("Method not implemented."); @@ -46,7 +60,8 @@ export class WebScraperDataProvider { private async convertUrlsToDocuments( urls: string[], - inProgress?: (progress: Progress) => void + inProgress?: (progress: Progress) => void, + allHtmls?: string[] ): Promise { const totalUrls = urls.length; let processedUrls = 0; @@ -56,7 +71,8 @@ export class WebScraperDataProvider { const batchUrls = urls.slice(i, i + this.concurrentRequests); await Promise.all( batchUrls.map(async (url, index) => { - const result = await scrapSingleUrl(url, this.pageOptions); + const existingText = allHtmls ? 
allHtmls[i + index] : ""; + const result = await scrapSingleUrl(url, this.pageOptions, existingText); processedUrls++; if (inProgress) { inProgress({ @@ -139,13 +155,33 @@ export class WebScraperDataProvider { limit: this.limit, generateImgAltText: this.generateImgAltText, }); + let start = Date.now(); let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth); + console.log(links.length) + let end = Date.now(); + console.log("Crawl end in seconds ", (end - start) / 1000); + const allLinks = links.map((e) => e.url); + const allHtmls = links.map((e)=> e.html); + console.log("All links", allLinks.length); + console.log("All htmls", allHtmls.length); + if (this.returnOnlyUrls) { - return this.returnOnlyUrlsResponse(links, inProgress); + return this.returnOnlyUrlsResponse(allLinks , inProgress); + } + + + let fastDocs = [] + let documents = []; + // check if fast mode is enabled and there is html inside the links + if (this.fastMode && links.some((link) => link.html)) { + console.log("Fast mode enabled"); + documents = await this.processLinks(allLinks, inProgress, allHtmls); + + }else{ + documents = await this.convertUrlsToDocuments(allLinks, inProgress, allHtmls); } - let documents = await this.processLinks(links, inProgress); - return this.cacheAndFinalizeDocuments(documents, links); + return this.cacheAndFinalizeDocuments(documents, allLinks); } private async handleSingleUrlsMode( @@ -187,14 +223,17 @@ export class WebScraperDataProvider { private async processLinks( links: string[], - inProgress?: (progress: Progress) => void + inProgress?: (progress: Progress) => void, + allHtmls?: string[] ): Promise { let pdfLinks = links.filter((link) => link.endsWith(".pdf")); let pdfDocuments = await this.fetchPdfDocuments(pdfLinks); links = links.filter((link) => !link.endsWith(".pdf")); - - let documents = await this.convertUrlsToDocuments(links, inProgress); + + let documents = await this.convertUrlsToDocuments(links, inProgress, allHtmls); documents = await this.getSitemapData(this.urls[0], documents); + + documents = this.applyPathReplacements(documents); // documents = await this.applyImgAltText(documents); @@ -238,6 +277,8 @@ export class WebScraperDataProvider { ): Promise { await this.setCachedDocuments(documents, links); documents = this.removeChildLinks(documents); + documents = this.filterDocsExcludeInclude(documents); + documents = this.filterDepth(documents); return documents.splice(0, this.limit); } @@ -397,6 +438,7 @@ export class WebScraperDataProvider { this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); + this.fastMode = options.crawlerOptions?.fastMode ?? 
false; // make sure all urls start with https:// this.urls = this.urls.map((url) => { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c43ea40..c41beb5 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -106,7 +106,8 @@ export async function scrapWithPlaywright(url: string): Promise { export async function scrapSingleUrl( urlToScrap: string, - pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false } + pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }, + existingText: string = "" ): Promise { urlToScrap = urlToScrap.trim(); @@ -197,8 +198,13 @@ export async function scrapSingleUrl( : ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"]; for (const scraper of scrapersInOrder) { + // If exists text coming from crawler, use it + if (existingText && existingText.trim().length >= 100) { + text = existingText; + break; + } [text, html] = await attemptScraping(urlToScrap, scraper); - if (text && text.length >= 100) break; + if (text && text.trim().length >= 100) break; console.log(`Falling back to ${scraper}`); } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 78ea030..ef7bb1f 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -26,7 +26,7 @@ getWebScraperQueue().process( success: success, result: { links: docs.map((doc) => { - return { content: doc, source: doc.metadata.sourceURL }; + return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" }; }), }, project_id: job.data.project_id, From 8a72cf556bf8cff1b21983a8fd50f56abc2ec8af Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 13 May 2024 21:10:58 -0700 Subject: [PATCH 2/9] Nick: --- apps/api/src/lib/entities.ts | 2 +- apps/api/src/scraper/WebScraper/crawler.ts | 5 +---- apps/api/src/scraper/WebScraper/index.ts | 6 +++--- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 0c34126..15550be 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -44,7 +44,7 @@ export type WebScraperOptions = { limit?: number; generateImgAltText?: boolean; replaceAllPathsWithAbsolutePaths?: boolean; - fastMode?: boolean; // have a mode of some sort + mode?: "default" | "fast"; // have a mode of some sort }; pageOptions?: PageOptions; extractorOptions?: ExtractorOptions; diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 25f2e9d..4509531 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -20,7 +20,6 @@ export class WebCrawler { private robotsTxtUrl: string; private robots: any; private generateImgAltText: boolean; - private fastMode: boolean = false; constructor({ initialUrl, @@ -50,7 +49,6 @@ export class WebCrawler { this.maxCrawledLinks = maxCrawledLinks ?? limit; this.maxCrawledDepth = maxCrawledDepth ?? 10; this.generateImgAltText = generateImgAltText ?? 
false; - this.fastMode = false; } private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { @@ -231,8 +229,7 @@ export class WebCrawler { }); // Create a new list to return to avoid modifying the visited list - const filteredLinks = links.filter((link) => !this.visited.has(link.url)); - return filteredLinks; + return links.filter((link) => !this.visited.has(link.url)); } catch (error) { return []; } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 9221666..1eeb65f 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -48,7 +48,7 @@ export class WebScraperDataProvider { private replaceAllPathsWithAbsolutePaths?: boolean = false; private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo"; - private fastMode: boolean = false; + private crawlerMode: string = "default"; authorize(): void { throw new Error("Method not implemented."); @@ -173,7 +173,7 @@ export class WebScraperDataProvider { let fastDocs = [] let documents = []; // check if fast mode is enabled and there is html inside the links - if (this.fastMode && links.some((link) => link.html)) { + if (this.crawlerMode === "fast" && links.some((link) => link.html)) { console.log("Fast mode enabled"); documents = await this.processLinks(allLinks, inProgress, allHtmls); @@ -438,7 +438,7 @@ export class WebScraperDataProvider { this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); - this.fastMode = options.crawlerOptions?.fastMode ?? false; + this.crawlerMode = options.crawlerOptions?.mode ?? 
"default"; // make sure all urls start with https:// this.urls = this.urls.map((url) => { From 7f31959be7a3333b32bc6b3d2dcc128fa07fb5b6 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 14 May 2024 12:04:36 -0700 Subject: [PATCH 3/9] Nick: --- apps/api/src/scraper/WebScraper/crawler.ts | 13 +++++++------ apps/api/src/scraper/WebScraper/index.ts | 2 -- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 4509531..3dc6dc4 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -15,7 +15,7 @@ export class WebCrawler { private maxCrawledLinks: number; private maxCrawledDepth: number; private visited: Set = new Set(); - private crawledUrls: { url: string, html: string }[] = []; + private crawledUrls: Set<{ url: string, html: string }> = new Set(); private limit: number; private robotsTxtUrl: string; private robots: any; @@ -136,24 +136,24 @@ export class WebCrawler { inProgress?: (progress: Progress) => void ): Promise<{ url: string, html: string }[]> { const queue = async.queue(async (task: string, callback) => { - if (this.crawledUrls.length >= this.maxCrawledLinks) { + if (this.crawledUrls.size >= this.maxCrawledLinks) { if (callback && typeof callback === "function") { callback(); } return; } const newUrls = await this.crawl(task); - newUrls.forEach((page) => this.crawledUrls.push(page)); + newUrls.forEach((page) => this.crawledUrls.add(page)); if (inProgress && newUrls.length > 0) { inProgress({ - current: this.crawledUrls.length, + current: this.crawledUrls.size, total: this.maxCrawledLinks, status: "SCRAPING", currentDocumentUrl: newUrls[newUrls.length - 1].url, }); } else if (inProgress) { inProgress({ - current: this.crawledUrls.length, + current: this.crawledUrls.size, total: this.maxCrawledLinks, status: "SCRAPING", currentDocumentUrl: task, @@ -175,7 +175,7 @@ export class WebCrawler { } ); await queue.drain(); - return this.crawledUrls; + return Array.from(this.crawledUrls); } async crawl(url: string): Promise<{url: string, html: string}[]> { @@ -311,3 +311,4 @@ export class WebCrawler { } } + diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 1eeb65f..1f5a785 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -277,8 +277,6 @@ export class WebScraperDataProvider { ): Promise { await this.setCachedDocuments(documents, links); documents = this.removeChildLinks(documents); - documents = this.filterDocsExcludeInclude(documents); - documents = this.filterDepth(documents); return documents.splice(0, this.limit); } From a0fdc6f7c6ec646f9a1627baf1afff314628b487 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 14 May 2024 12:12:40 -0700 Subject: [PATCH 4/9] Nick: --- apps/api/src/scraper/WebScraper/crawler.ts | 8 +++----- apps/api/src/scraper/WebScraper/index.ts | 3 +-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 3dc6dc4..521b1e1 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -15,7 +15,7 @@ export class WebCrawler { private maxCrawledLinks: number; private maxCrawledDepth: number; private visited: Set = new Set(); - private crawledUrls: Set<{ url: string, html: string }> = new Set(); + private crawledUrls: Map = new Map(); private limit: number; private 
robotsTxtUrl: string; private robots: any; @@ -143,7 +143,7 @@ export class WebCrawler { return; } const newUrls = await this.crawl(task); - newUrls.forEach((page) => this.crawledUrls.add(page)); + newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html)); if (inProgress && newUrls.length > 0) { inProgress({ current: this.crawledUrls.size, @@ -175,7 +175,7 @@ export class WebCrawler { } ); await queue.drain(); - return Array.from(this.crawledUrls); + return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); } async crawl(url: string): Promise<{url: string, html: string}[]> { @@ -310,5 +310,3 @@ export class WebCrawler { return []; } } - - diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 1f5a785..13f39c2 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -176,9 +176,8 @@ export class WebScraperDataProvider { if (this.crawlerMode === "fast" && links.some((link) => link.html)) { console.log("Fast mode enabled"); documents = await this.processLinks(allLinks, inProgress, allHtmls); - }else{ - documents = await this.convertUrlsToDocuments(allLinks, inProgress, allHtmls); + documents = await this.processLinks(allLinks, inProgress); } return this.cacheAndFinalizeDocuments(documents, allLinks); From 27e1e22a0abdd49ebcb9574f24c5934e19240241 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 14 May 2024 12:28:25 -0700 Subject: [PATCH 5/9] Update index.test.ts --- .../src/__tests__/e2e_withAuth/index.test.ts | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 0e2caeb..35ae746 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -511,6 +511,107 @@ describe("E2E Tests for API Routes", () => { // }, 120000); // 120 secs // }); + describe("POST /v0/crawl with fast mode", () => { + it("should complete the crawl under 20 seconds", async () => { + const startTime = Date.now(); + + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://flutterbricks.com", + crawlerOptions: { + mode: "fast" + } + }); + + expect(crawlResponse.statusCode).toBe(200); + + const jobId = crawlResponse.body.jobId; + let statusResponse; + let isFinished = false; + + while (!isFinished) { + statusResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(statusResponse.statusCode).toBe(200); + isFinished = statusResponse.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const endTime = Date.now(); + const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds + + console.log(`Time elapsed: ${timeElapsed} seconds`); + + expect(statusResponse.body.status).toBe("completed"); + expect(statusResponse.body).toHaveProperty("data"); + expect(statusResponse.body.data[0]).toHaveProperty("content"); + expect(statusResponse.body.data[0]).toHaveProperty("markdown"); + const results = statusResponse.body.data; + // results.forEach((result, i) => { + // console.log(result.metadata.sourceURL); + // }); + expect(results.length).toBeGreaterThanOrEqual(10); + 
expect(results.length).toBeLessThanOrEqual(15); + + }, 20000); + + // it("should complete the crawl in more than 10 seconds", async () => { + // const startTime = Date.now(); + + // const crawlResponse = await request(TEST_URL) + // .post("/v0/crawl") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ + // url: "https://flutterbricks.com", + // }); + + // expect(crawlResponse.statusCode).toBe(200); + + // const jobId = crawlResponse.body.jobId; + // let statusResponse; + // let isFinished = false; + + // while (!isFinished) { + // statusResponse = await request(TEST_URL) + // .get(`/v0/crawl/status/${jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + // expect(statusResponse.statusCode).toBe(200); + // isFinished = statusResponse.body.status === "completed"; + + // if (!isFinished) { + // await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + // } + // } + + // const endTime = Date.now(); + // const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds + + // console.log(`Time elapsed: ${timeElapsed} seconds`); + + // expect(statusResponse.body.status).toBe("completed"); + // expect(statusResponse.body).toHaveProperty("data"); + // expect(statusResponse.body.data[0]).toHaveProperty("content"); + // expect(statusResponse.body.data[0]).toHaveProperty("markdown"); + // const results = statusResponse.body.data; + // // results.forEach((result, i) => { + // // console.log(result.metadata.sourceURL); + // // }); + // expect(results.length).toBeGreaterThanOrEqual(10); + // expect(results.length).toBeLessThanOrEqual(15); + + // }, 50000);// 15 seconds timeout to account for network delays + }); + describe("GET /is-production", () => { it("should return the production status", async () => { const response = await request(TEST_URL).get("/is-production"); From 87570bdfa1dab843710352098d19bd687acdf3c0 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 11:06:03 -0700 Subject: [PATCH 6/9] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 13f39c2..bdc7483 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -155,22 +155,16 @@ export class WebScraperDataProvider { limit: this.limit, generateImgAltText: this.generateImgAltText, }); - let start = Date.now(); + let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth); - console.log(links.length) - let end = Date.now(); - console.log("Crawl end in seconds ", (end - start) / 1000); + const allLinks = links.map((e) => e.url); const allHtmls = links.map((e)=> e.html); - console.log("All links", allLinks.length); - console.log("All htmls", allHtmls.length); if (this.returnOnlyUrls) { return this.returnOnlyUrlsResponse(allLinks , inProgress); } - - let fastDocs = [] let documents = []; // check if fast mode is enabled and there is html inside the links if (this.crawlerMode === "fast" && links.some((link) => link.html)) { From d10f81e7feecf2250b4ca102899dcc33660468bd Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 11:28:20 -0700 Subject: [PATCH 7/9] Nick: fixes --- apps/api/src/scraper/WebScraper/index.ts | 4 ++-- apps/api/src/scraper/WebScraper/single_url.ts | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git 
a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index bdc7483..0a86a90 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -71,8 +71,8 @@ export class WebScraperDataProvider { const batchUrls = urls.slice(i, i + this.concurrentRequests); await Promise.all( batchUrls.map(async (url, index) => { - const existingText = allHtmls ? allHtmls[i + index] : ""; - const result = await scrapSingleUrl(url, this.pageOptions, existingText); + const existingHTML = allHtmls ? allHtmls[i + index] : ""; + const result = await scrapSingleUrl(url, this.pageOptions, existingHTML); processedUrls++; if (inProgress) { inProgress({ diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c41beb5..4bbaee7 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -107,7 +107,7 @@ export async function scrapWithPlaywright(url: string): Promise { export async function scrapSingleUrl( urlToScrap: string, pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }, - existingText: string = "" + existingHtml: string = "" ): Promise { urlToScrap = urlToScrap.trim(); @@ -199,8 +199,10 @@ export async function scrapSingleUrl( for (const scraper of scrapersInOrder) { // If exists text coming from crawler, use it - if (existingText && existingText.trim().length >= 100) { - text = existingText; + if (existingHtml && existingHtml.trim().length >= 100) { + let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions); + text = await parseMarkdown(cleanedHtml); + html = existingHtml; break; } [text, html] = await attemptScraping(urlToScrap, scraper); From 1b0d6341d3e5126fd5e7dbe3e9b997becd249aae Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 11:48:12 -0700 Subject: [PATCH 8/9] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 0a86a90..c95e889 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -17,20 +17,7 @@ import { } from "./utils/replacePaths"; import { generateCompletions } from "../../lib/LLM-extraction"; import { getWebScraperQueue } from "../../../src/services/queue-service"; -import { parseMarkdown } from "../../lib/html-to-markdown"; -import cheerio from "cheerio"; -import { excludeNonMainTags } from "./utils/excludeTags"; -const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { - const soup = cheerio.load(html); - soup("script, style, iframe, noscript, meta, head").remove(); - if (pageOptions.onlyMainContent) { - // remove any other tags that are not in the main content - excludeNonMainTags.forEach((tag) => { - soup(tag).remove(); - }); - } - return soup.html(); -}; + export class WebScraperDataProvider { private bullJobId: string; private urls: string[] = [""]; From fd82982a3198e68a136c2f8ce99a89639ee495d5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 12:11:16 -0700 Subject: [PATCH 9/9] Nick: --- apps/api/openapi.json | 121 +++++++++++++++++++++++++++++++++- apps/test-suite/index.test.ts | 2 +- 2 files changed, 120 insertions(+), 3 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 127fe51..b0f8b99 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -18,8 +18,8 @@ "paths": { 
"/scrape": { "post": { - "summary": "Scrape a single URL", - "operationId": "scrapeSingleUrl", + "summary": "Scrape a single URL and optionally extract information using an LLM", + "operationId": "scrapeAndExtractFromUrl", "tags": ["Scraping"], "security": [ { @@ -45,8 +45,43 @@ "type": "boolean", "description": "Only return the main content of the page excluding headers, navs, footers, etc.", "default": false + }, + "includeHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. Will output a html key in the response.", + "default": false } } + }, + "extractorOptions": { + "type": "object", + "description": "Options for LLM-based extraction of structured information from the page content", + "properties": { + "mode": { + "type": "string", + "enum": ["llm-extraction"], + "description": "The extraction mode to use, currently supports 'llm-extraction'" + }, + "extractionPrompt": { + "type": "string", + "description": "A prompt describing what information to extract from the page" + }, + "extractionSchema": { + "type": "object", + "additionalProperties": true, + "description": "The schema for the data to be extracted", + "required": [ + "company_mission", + "supports_sso", + "is_open_source" + ] + } + } + }, + "timeout": { + "type": "integer", + "description": "Timeout in milliseconds for the request", + "default": 30000 } }, "required": ["url"] @@ -126,6 +161,16 @@ "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.", "default": false }, + "maxDepth": { + "type": "integer", + "description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on." + }, + "mode": { + "type": "string", + "enum": ["default", "fast"], + "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.", + "default": "default" + }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl", @@ -140,6 +185,11 @@ "type": "boolean", "description": "Only return the main content of the page excluding headers, navs, footers, etc.", "default": false + }, + "includeHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. Will output a html key in the response.", + "default": false } } } @@ -206,6 +256,11 @@ "type": "boolean", "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.", "default": true + }, + "includeHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. Will output a html key in the response.", + "default": false } } }, @@ -302,6 +357,63 @@ "$ref": "#/components/schemas/ScrapeResponse" }, "description": "Data returned from the job (null when it is in progress)" + }, + "partial_data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ScrapeResponse" + }, + "description": "Partial documents returned as it is being crawls (streaming). When a page is ready it will append to the parial_data array - so no need to wait for all the website to be crawled." 
+ } + } + } + } + } + }, + "402": { + "description": "Payment required" + }, + "429": { + "description": "Too many requests" + }, + "500": { + "description": "Server error" + } + } + } + }, + "/crawl/cancel/{jobId}": { + "delete": { + "tags": ["Crawl"], + "summary": "Cancel a crawl job", + "operationId": "cancelCrawlJob", + "security": [ + { + "bearerAuth": [] + } + ], + "parameters": [ + { + "name": "jobId", + "in": "path", + "description": "ID of the crawl job", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "status": { + "type": "string", + "description": "Returns cancelled." } } } @@ -344,6 +456,11 @@ "content": { "type": "string" }, + "html": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeHtml` is true" + }, "metadata": { "type": "object", "properties": { diff --git a/apps/test-suite/index.test.ts b/apps/test-suite/index.test.ts index 8d6c31f..7b38791 100644 --- a/apps/test-suite/index.test.ts +++ b/apps/test-suite/index.test.ts @@ -183,7 +183,7 @@ describe("Scraping/Crawling Checkup (E2E)", () => { } - expect(score).toBeGreaterThanOrEqual(75); + expect(score).toBeGreaterThanOrEqual(70); }, 350000); // 150 seconds timeout }); });