From 0e89f8b9a3520017ba1956bb2db083ec8385a1f7 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 7 Jun 2024 09:35:56 -0300 Subject: [PATCH 01/29] fixing workflow --- .github/workflows/fly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml index 77b3fd3..28af7bd 100644 --- a/.github/workflows/fly.yml +++ b/.github/workflows/fly.yml @@ -213,7 +213,7 @@ jobs: working-directory: ./apps/python-sdk - name: Publish to PyPI - if: ${{ env.VERSION_INCREMENTED == 'true' }} + if: ${{ env.PYTHON_SDK_VERSION_INCREMENTED == 'true' }} env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} From 556c57648e99f61b95583c7c7fbc2fa3221a6119 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 7 Jun 2024 09:40:40 -0300 Subject: [PATCH 02/29] Update fly.yml --- .github/workflows/fly.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml index 28af7bd..84017b1 100644 --- a/.github/workflows/fly.yml +++ b/.github/workflows/fly.yml @@ -183,6 +183,7 @@ jobs: FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} build-and-publish-python-sdk: + name: Build and publish Python SDK runs-on: ubuntu-latest needs: deploy @@ -222,6 +223,7 @@ jobs: working-directory: ./apps/python-sdk build-and-publish-js-sdk: + name: Build and publish JavaScript SDK runs-on: ubuntu-latest needs: deploy From f24ca766182e1d12a4c3f71bdf8a70fe242c324a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 7 Jun 2024 10:39:11 -0700 Subject: [PATCH 03/29] Nick: removing rate limit emails for now --- apps/api/src/controllers/auth.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 2c4b0c7..ea789fe 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -143,7 +143,7 @@ export async function supaAuthenticateUser( const startDate = new Date(); const endDate = new Date(); endDate.setDate(endDate.getDate() + 7); - await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString()); + // await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString()); return { success: false, error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`, From 827354a116a4ea424af7c1994aae7214d78c8032 Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Mon, 10 Jun 2024 21:21:23 +1000 Subject: [PATCH 04/29] Added logging to python sdk FIRECRAWL_LOGGING_LEVEL Instantiates the logger early and depends on env to set. --- apps/python-sdk/firecrawl/__init__.py | 54 ++++++++++++++++++++++++++ apps/python-sdk/firecrawl/firecrawl.py | 10 ++++- 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index ecb017f..4e53e77 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -1,3 +1,57 @@ +""" +This is the Firecrawl package. + +This package provides a Python SDK for interacting with the Firecrawl API. 
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs, +and check the status of these jobs. + +For more information visit https://github.com/firecrawl/ +""" + +import logging +import os + from .firecrawl import FirecrawlApp __version__ = "0.0.14" + +# Define the logger for the Firecrawl project +logger: logging.Logger = logging.getLogger("firecrawl") + + +def _basic_config() -> None: + """Set up basic configuration for logging with a specific format and date format.""" + try: + logging.basicConfig( + format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + except Exception as e: + logger.error("Failed to configure logging: %s", e) + + +def setup_logging() -> None: + """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable.""" + env = os.environ.get( + "FIRECRAWL_LOGGING_LEVEL", "INFO" + ).upper() # Default to 'INFO' level + _basic_config() + + if env == "DEBUG": + logger.setLevel(logging.DEBUG) + elif env == "INFO": + logger.setLevel(logging.INFO) + elif env == "WARNING": + logger.setLevel(logging.WARNING) + elif env == "ERROR": + logger.setLevel(logging.ERROR) + elif env == "CRITICAL": + logger.setLevel(logging.CRITICAL) + else: + logger.setLevel(logging.INFO) + logger.warning("Unknown logging level: %s, defaulting to INFO", env) + + +# Initialize logging configuration when the module is imported +setup_logging() +logger.debug("Debugging logger setup") diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index b9a823f..f20d4bd 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -9,13 +9,14 @@ and handles retries for certain HTTP status codes. Classes: - FirecrawlApp: Main class for interacting with the Firecrawl API. """ - +import logging import os import time from typing import Any, Dict, Optional import requests +logger : logging.Logger = logging.getLogger("firecrawl") class FirecrawlApp: """ @@ -28,8 +29,15 @@ class FirecrawlApp: def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') if self.api_key is None: + logger.warning("No API key provided") raise ValueError('No API key provided') + else: + logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key) + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') + if self.api_url != 'https://api.firecrawl.dev': + logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url) + def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: """ Scrape the specified URL using the Firecrawl API. 
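Note on the logging patch above: setup_logging() runs at import time (see __init__.py in PATCH 04), so FIRECRAWL_LOGGING_LEVEL must be exported before the package is first imported. A minimal usage sketch follows; the API key and URL below are placeholders, not values taken from the patch:

    import os

    # The level is read once, at import time, so it must be set before the
    # first `import firecrawl`.
    os.environ["FIRECRAWL_LOGGING_LEVEL"] = "DEBUG"

    from firecrawl import FirecrawlApp

    # Per the PATCH 04 diff, the constructor logs a warning and raises
    # ValueError when no key is supplied here or via FIRECRAWL_API_KEY.
    app = FirecrawlApp(api_key="fc-...")  # placeholder key

    # With DEBUG active, the "firecrawl" logger reports the API key and API
    # URL used at initialization before any request is made.
    page = app.scrape_url("https://example.com")  # placeholder URL

Unknown level names fall back to INFO with a warning, matching the final else branch of setup_logging() above.
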
From 3091f0134cc95f47fe7d993b5fab5536868dd29e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 10 Jun 2024 16:27:10 -0700 Subject: [PATCH 05/29] Nick: --- apps/api/src/scraper/WebScraper/crawler.ts | 14 +++++++++----- apps/api/src/scraper/WebScraper/index.ts | 1 + apps/api/src/scraper/WebScraper/sitemap.ts | 3 +++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 9340aa8..ee9baff 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -3,7 +3,7 @@ import cheerio, { load } from "cheerio"; import { URL } from "url"; import { getLinksFromSitemap } from "./sitemap"; import async from "async"; -import { Progress } from "../../lib/entities"; +import { PageOptions, Progress } from "../../lib/entities"; import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url"; import robotsParser from "robots-parser"; @@ -108,6 +108,7 @@ export class WebCrawler { public async start( inProgress?: (progress: Progress) => void, + pageOptions?: PageOptions, concurrencyLimit: number = 5, limit: number = 10000, maxDepth: number = 10 @@ -130,6 +131,7 @@ export class WebCrawler { const urls = await this.crawlUrls( [this.initialUrl], + pageOptions, concurrencyLimit, inProgress ); @@ -148,6 +150,7 @@ export class WebCrawler { private async crawlUrls( urls: string[], + pageOptions: PageOptions, concurrencyLimit: number, inProgress?: (progress: Progress) => void, ): Promise<{ url: string, html: string }[]> { @@ -158,7 +161,7 @@ export class WebCrawler { } return; } - const newUrls = await this.crawl(task); + const newUrls = await this.crawl(task, pageOptions); // add the initial url if not already added // if (this.visited.size === 1) { // let normalizedInitial = this.initialUrl; @@ -188,7 +191,7 @@ export class WebCrawler { currentDocumentUrl: task, }); } - await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress); + await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress); if (callback && typeof callback === "function") { callback(); } @@ -207,7 +210,7 @@ export class WebCrawler { return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); } - async crawl(url: string): Promise<{url: string, html: string}[]> { + async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> { if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){ return []; } @@ -231,7 +234,8 @@ export class WebCrawler { let content : string = ""; // If it is the first link, fetch with single url if (this.visited.size === 1) { - const page = await scrapSingleUrl(url, {includeHtml: true}); + console.log(pageOptions) + const page = await scrapSingleUrl(url, {...pageOptions, includeHtml: true}); content = page.html ?? 
"" } else { const response = await axios.get(url); diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index e3a3cc6..824ec06 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -173,6 +173,7 @@ export class WebScraperDataProvider { let links = await crawler.start( inProgress, + this.pageOptions, 5, this.limit, this.maxCrawledDepth diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 0ac4338..5a89183 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -12,6 +12,8 @@ export async function getLinksFromSitemap( content = response.data; } catch (error) { console.error(`Request failed for ${sitemapUrl}: ${error}`); + console.log(allUrls) + return allUrls; } @@ -34,6 +36,7 @@ export async function getLinksFromSitemap( } catch (error) { console.error(`Error processing ${sitemapUrl}: ${error}`); } + console.log(allUrls) return allUrls; } From 913c1dd56839875ab4946d4fd085af9f01f841db Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 10 Jun 2024 16:49:03 -0700 Subject: [PATCH 06/29] Nick: fetch -> axios and fix timeouts --- apps/api/src/scraper/WebScraper/single_url.ts | 71 +++++++++++++------ 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 689d5e7..9a61888 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -8,6 +8,7 @@ import { excludeNonMainTags } from "./utils/excludeTags"; import { urlSpecificParams } from "./utils/custom/website_params"; import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { handleCustomScraping } from "./custom/handleCustomScraping"; +import axios from "axios"; dotenv.config(); @@ -19,6 +20,8 @@ const baseScrapers = [ "fetch", ] as const; +const universalTimeout = 15000; + export async function generateRequestParams( url: string, wait_browser: string = "domcontentloaded", @@ -59,21 +62,24 @@ export async function scrapWithFireEngine( `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}` ); - const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ + const response = await axios.post( + process.env.FIRE_ENGINE_BETA_URL + "/scrape", + { url: url, wait: waitParam, screenshot: screenshotParam, headers: headers, - pageOptions: pageOptions - }), - }); + pageOptions: pageOptions, + }, + { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout + waitParam + } + ); - if (!response.ok) { + if (response.status !== 200) { console.error( `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` ); @@ -84,7 +90,7 @@ export async function scrapWithFireEngine( if (contentType && contentType.includes("application/pdf")) { return { html: await fetchAndProcessPdf(url), screenshot: "" }; } else { - const data = await response.json(); + const data = response.data; const html = data.content; const screenshot = data.screenshot; return { html: html ?? "", screenshot: screenshot ?? 
"" }; @@ -98,7 +104,7 @@ export async function scrapWithFireEngine( export async function scrapWithScrapingBee( url: string, wait_browser: string = "domcontentloaded", - timeout: number = 15000 + timeout: number = universalTimeout ): Promise { try { const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); @@ -141,15 +147,19 @@ export async function scrapWithPlaywright( // If the user has passed a wait parameter in the request, use that const waitParam = reqParams["params"]?.wait ?? waitFor; - const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, { - method: "POST", + const response = await axios.post(process.env.PLAYWRIGHT_MICROSERVICE_URL, { + url: url, + wait_after_load: waitParam, + headers: headers, + }, { headers: { "Content-Type": "application/json", }, - body: JSON.stringify({ url: url, wait_after_load: waitParam, headers: headers }), + timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time + transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically }); - if (!response.ok) { + if (response.status !== 200) { console.error( `[Playwright] Error fetching url: ${url} with status: ${response.status}` ); @@ -160,7 +170,7 @@ export async function scrapWithPlaywright( if (contentType && contentType.includes("application/pdf")) { return fetchAndProcessPdf(url); } else { - const textData = await response.text(); + const textData = response.data; try { const data = JSON.parse(textData); const html = data.content; @@ -171,17 +181,28 @@ export async function scrapWithPlaywright( } } } catch (error) { - console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); + if (error.code === 'ECONNABORTED') { + console.log(`[Playwright] Request timed out for ${url}`); + } else { + console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); + } return ""; } } export async function scrapWithFetch(url: string): Promise { try { - const response = await fetch(url); - if (!response.ok) { + const response = await axios.get(url, { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout, + transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically + }); + + if (response.status !== 200) { console.error( - `[Fetch] Error fetching url: ${url} with status: ${response.status}` + `[Axios] Error fetching url: ${url} with status: ${response.status}` ); return ""; } @@ -190,11 +211,15 @@ export async function scrapWithFetch(url: string): Promise { if (contentType && contentType.includes("application/pdf")) { return fetchAndProcessPdf(url); } else { - const text = await response.text(); + const text = response.data; return text; } } catch (error) { - console.error(`[Fetch][c] Error fetching url: ${url} -> ${error}`); + if (error.code === 'ECONNABORTED') { + console.log(`[Axios] Request timed out for ${url}`); + } else { + console.error(`[Axios] Error fetching url: ${url} -> ${error}`); + } return ""; } } From 7ae97786428f7c7911a232f8eea1c07e189f6726 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 10 Jun 2024 16:57:31 -0700 Subject: [PATCH 07/29] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 9a61888..c2dcea1 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -96,7 +96,11 @@ export async function 
scrapWithFireEngine( return { html: html ?? "", screenshot: screenshot ?? "" }; } } catch (error) { - console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); + if (error.code === 'ECONNABORTED') { + console.log(`[Fire-Engine] Request timed out for ${url}`); + } else { + console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); + } return { html: "", screenshot: "" }; } } From 99f2ffd6d591398a4baef347306d25371b381793 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 10 Jun 2024 17:03:10 -0700 Subject: [PATCH 08/29] Update webhook.ts --- apps/api/src/services/webhook.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index eca7d09..1f8d647 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -3,15 +3,17 @@ import { supabase_service } from "./supabase"; export const callWebhook = async (teamId: string, data: any) => { try { const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; let webhookUrl = selfHostedUrl; - if (!selfHostedUrl) { + // Only fetch the webhook URL from the database if the self-hosted webhook URL is not set + // and the USE_DB_AUTHENTICATION environment variable is set to true + if (!selfHostedUrl && useDbAuthentication) { const { data: webhooksData, error } = await supabase_service .from("webhooks") .select("url") .eq("team_id", teamId) .limit(1); - if (error) { console.error( `Error fetching webhook URL for team ID: ${teamId}`, From f6b06ac27a829172416419c4fff02d0f71579050 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 10 Jun 2024 18:12:41 -0700 Subject: [PATCH 09/29] Nick: ignoreSitemap, better crawling algo --- apps/api/src/lib/entities.ts | 25 +++++---- apps/api/src/scraper/WebScraper/crawler.ts | 65 ++++++++++++---------- apps/api/src/scraper/WebScraper/index.ts | 6 ++ apps/api/src/scraper/WebScraper/sitemap.ts | 2 - 4 files changed, 57 insertions(+), 41 deletions(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 5511623..744c07b 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -35,20 +35,23 @@ export type SearchOptions = { location?: string; }; +export type CrawlerOptions = { + returnOnlyUrls?: boolean; + includes?: string[]; + excludes?: string[]; + maxCrawledLinks?: number; + maxDepth?: number; + limit?: number; + generateImgAltText?: boolean; + replaceAllPathsWithAbsolutePaths?: boolean; + ignoreSitemap?: boolean; + mode?: "default" | "fast"; // have a mode of some sort +} + export type WebScraperOptions = { urls: string[]; mode: "single_urls" | "sitemap" | "crawl"; - crawlerOptions?: { - returnOnlyUrls?: boolean; - includes?: string[]; - excludes?: string[]; - maxCrawledLinks?: number; - maxDepth?: number; - limit?: number; - generateImgAltText?: boolean; - replaceAllPathsWithAbsolutePaths?: boolean; - mode?: "default" | "fast"; // have a mode of some sort - }; + crawlerOptions?: CrawlerOptions; pageOptions?: PageOptions; extractorOptions?: ExtractorOptions; concurrentRequests?: number; diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index ee9baff..fc95e7c 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -3,7 +3,7 @@ import cheerio, { load } from "cheerio"; import { URL } from "url"; import { getLinksFromSitemap } from "./sitemap"; import async from 
"async"; -import { PageOptions, Progress } from "../../lib/entities"; +import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities"; import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url"; import robotsParser from "robots-parser"; @@ -109,6 +109,7 @@ export class WebCrawler { public async start( inProgress?: (progress: Progress) => void, pageOptions?: PageOptions, + crawlerOptions?: CrawlerOptions, concurrencyLimit: number = 5, limit: number = 10000, maxDepth: number = 10 @@ -123,10 +124,12 @@ export class WebCrawler { } - const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); - if (sitemapLinks.length > 0) { - let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); - return filteredLinks.map(link => ({ url: link, html: "" })); + if(!crawlerOptions?.ignoreSitemap){ + const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); + if (sitemapLinks.length > 0) { + let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); + return filteredLinks.map(link => ({ url: link, html: "" })); + } } const urls = await this.crawlUrls( @@ -135,6 +138,7 @@ export class WebCrawler { concurrencyLimit, inProgress ); + if ( urls.length === 0 && this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0 @@ -142,9 +146,9 @@ export class WebCrawler { return [{ url: this.initialUrl, html: "" }]; } - // make sure to run include exclude here again const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth); + return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" })); } @@ -211,46 +215,41 @@ export class WebCrawler { } async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> { - if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){ + const normalizedUrl = this.normalizeCrawlUrl(url); + if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) { return []; } - this.visited.add(url); - + this.visited.add(normalizedUrl); if (!url.startsWith("http")) { url = "https://" + url; - } if (url.endsWith("/")) { url = url.slice(0, -1); - } - + if (this.isFile(url) || this.isSocialMediaOrEmail(url)) { return []; } try { - let content : string = ""; + let content: string = ""; // If it is the first link, fetch with single url if (this.visited.size === 1) { - console.log(pageOptions) - const page = await scrapSingleUrl(url, {...pageOptions, includeHtml: true}); - content = page.html ?? "" + const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true }); + content = page.html ?? ""; } else { const response = await axios.get(url); content = response.data ?? 
""; } const $ = load(content); - let links: {url: string, html: string}[] = []; + let links: { url: string, html: string }[] = []; // Add the initial URL to the list of links - if(this.visited.size === 1) - { - links.push({url, html: content}); + if (this.visited.size === 1) { + links.push({ url, html: content }); } - $("a").each((_, element) => { const href = $(element).attr("href"); if (href) { @@ -258,32 +257,43 @@ export class WebCrawler { if (!href.startsWith("http")) { fullUrl = new URL(href, this.baseUrl).toString(); } - const url = new URL(fullUrl); - const path = url.pathname; + const urlObj = new URL(fullUrl); + const path = urlObj.pathname; if ( this.isInternalLink(fullUrl) && this.matchesPattern(fullUrl) && this.noSections(fullUrl) && - this.matchesIncludes(path) && + // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards + // this.matchesIncludes(path) && !this.matchesExcludes(path) && this.robots.isAllowed(fullUrl, "FireCrawlAgent") ) { - links.push({url: fullUrl, html: content}); + links.push({ url: fullUrl, html: content }); } } }); - if(this.visited.size === 1){ + if (this.visited.size === 1) { return links; } // Create a new list to return to avoid modifying the visited list - return links.filter((link) => !this.visited.has(link.url)); + return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url))); } catch (error) { return []; } } + private normalizeCrawlUrl(url: string): string { + try{ + const urlObj = new URL(url); + urlObj.searchParams.sort(); // Sort query parameters to normalize + return urlObj.toString(); + } catch (error) { + return url; + } + } + private matchesIncludes(url: string): boolean { if (this.includes.length === 0 || this.includes[0] == "") return true; return this.includes.some((pattern) => new RegExp(pattern).test(url)); @@ -392,7 +402,6 @@ export class WebCrawler { // Normalize and check if the URL is present in any of the sitemaps const normalizedUrl = normalizeUrl(url); - const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link)); // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 824ec06..7dcd175 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -31,6 +31,7 @@ export class WebScraperDataProvider { private limit: number = 10000; private concurrentRequests: number = 20; private generateImgAltText: boolean = false; + private ignoreSitemap: boolean = false; private pageOptions?: PageOptions; private extractorOptions?: ExtractorOptions; private replaceAllPathsWithAbsolutePaths?: boolean = false; @@ -38,6 +39,7 @@ export class WebScraperDataProvider { "gpt-4-turbo"; private crawlerMode: string = "default"; + authorize(): void { throw new Error("Method not implemented."); } @@ -174,6 +176,9 @@ export class WebScraperDataProvider { let links = await crawler.start( inProgress, this.pageOptions, + { + ignoreSitemap: this.ignoreSitemap, + }, 5, this.limit, this.maxCrawledDepth @@ -474,6 +479,7 @@ export class WebScraperDataProvider { //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); this.crawlerMode = options.crawlerOptions?.mode ?? 
"default"; + this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false; // make sure all urls start with https:// this.urls = this.urls.map((url) => { diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 5a89183..c6dbf11 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -12,7 +12,6 @@ export async function getLinksFromSitemap( content = response.data; } catch (error) { console.error(`Request failed for ${sitemapUrl}: ${error}`); - console.log(allUrls) return allUrls; } @@ -36,7 +35,6 @@ export async function getLinksFromSitemap( } catch (error) { console.error(`Error processing ${sitemapUrl}: ${error}`); } - console.log(allUrls) return allUrls; } From 9390816c1b7975b3349f402f562d1846e6845e2a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 10 Jun 2024 18:26:25 -0700 Subject: [PATCH 10/29] Update openapi.json --- apps/api/openapi.json | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index ab452ff..55bfe1c 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -51,10 +51,19 @@ "description": "Include the raw HTML content of the page. Will output a html key in the response.", "default": false }, + "screenshot": { + "type": "boolean", + "description": "Include a screenshot of the top of the page that you are scraping.", + "default": false + }, "waitFor": { "type": "integer", "description": "Wait x amount of milliseconds for the page to load to fetch content", "default": 0 + }, + "headers": { + "type": "object", + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." } } }, @@ -176,6 +185,11 @@ "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.", "default": "default" }, + "ignoreSitemap": { + "type": "boolean", + "description": "Ignore the website sitemap when crawling", + "default": false + }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl", @@ -195,6 +209,15 @@ "type": "boolean", "description": "Include the raw HTML content of the page. Will output a html key in the response.", "default": false + }, + "screenshot": { + "type": "boolean", + "description": "Include a screenshot of the top of the page that you are scraping.", + "default": false + }, + "headers": { + "type": "object", + "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc." } } } @@ -368,7 +391,7 @@ "items": { "$ref": "#/components/schemas/CrawlStatusResponseObj" }, - "description": "Partial documents returned as it is being crawls (streaming). When a page is ready it will append to the parial_data array - so no need to wait for all the website to be crawled." + "description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array." 
} } } @@ -513,6 +536,10 @@ "nullable": true, "description": "Raw HTML content of the page if `includeHtml` is true" }, + "index": { + "type": "integer", + "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from." + }, "metadata": { "type": "object", "properties": { From 00c23855b180f9f84a7032d40054b8fcf661e86e Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 11 Jun 2024 11:46:35 -0400 Subject: [PATCH 11/29] Update examples --- .../{k8n => kubernetes-cluster-install}/README.md | 0 .../{k8n => kubernetes-cluster-install}/api.yaml | 0 .../configmap.yaml | 0 .../playwright-service.yaml | 0 .../{k8n => kubernetes-cluster-install}/redis.yaml | 0 .../{k8n => kubernetes-cluster-install}/secret.yaml | 0 .../{k8n => kubernetes-cluster-install}/worker.yaml | 0 .../.eslintrc.json | 0 .../.gitignore | 0 .../README.md | 0 .../components.json | 0 .../next.config.mjs | 0 .../package-lock.json | 0 .../package.json | 0 .../postcss.config.mjs | 0 .../public/android-chrome-192x192.png | Bin .../public/android-chrome-512x512.png | Bin .../public/apple-touch-icon.png | Bin .../public/bgd.png | Bin .../public/favicon-16x16.png | Bin .../public/favicon-32x32.png | Bin .../public/favicon.ico | Bin .../public/next.svg | 0 .../public/og.png | Bin .../public/site.webmanifest | 0 .../public/vercel.svg | 0 .../src/app/favicon.ico | Bin .../src/app/globals.css | 0 .../src/app/hooks/useGithubStars.ts | 0 .../src/app/layout.tsx | 0 .../src/app/page.tsx | 0 .../src/components/github-button.tsx | 0 .../src/components/main.tsx | 0 .../src/components/ui/button.tsx | 0 .../src/components/ui/dialog.tsx | 0 .../src/components/ui/dropdown-menu.tsx | 0 .../src/components/ui/input.tsx | 0 .../src/components/ui/select.tsx | 0 .../src/components/ui/sonner.tsx | 0 .../src/components/ui/switch.tsx | 0 .../src/components/ui/textarea.tsx | 0 .../src/lib/LLM/llm.ts | 2 +- .../src/lib/LLM/testing_constants.ts | 0 .../src/lib/utils.ts | 0 .../src/pages/api/roastWebsite.ts | 0 .../tailwind.config.ts | 0 .../tsconfig.json | 0 ...rag-llama3.mdx => web-data-rag--with-llama3.mdx} | 0 48 files changed, 1 insertion(+), 1 deletion(-) rename examples/{k8n => kubernetes-cluster-install}/README.md (100%) rename examples/{k8n => kubernetes-cluster-install}/api.yaml (100%) rename examples/{k8n => kubernetes-cluster-install}/configmap.yaml (100%) rename examples/{k8n => kubernetes-cluster-install}/playwright-service.yaml (100%) rename examples/{k8n => kubernetes-cluster-install}/redis.yaml (100%) rename examples/{k8n => kubernetes-cluster-install}/secret.yaml (100%) rename examples/{k8n => kubernetes-cluster-install}/worker.yaml (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/.eslintrc.json (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/.gitignore (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/README.md (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/components.json (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/next.config.mjs (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/package-lock.json (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/package.json (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/postcss.config.mjs (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/android-chrome-192x192.png (100%) rename examples/{roastmywebsite => 
roastmywebsite-example-app}/public/android-chrome-512x512.png (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/apple-touch-icon.png (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/bgd.png (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/favicon-16x16.png (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/favicon-32x32.png (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/favicon.ico (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/next.svg (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/og.png (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/site.webmanifest (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/vercel.svg (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/app/favicon.ico (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/app/globals.css (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/app/hooks/useGithubStars.ts (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/app/layout.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/app/page.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/github-button.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/main.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/button.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/dialog.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/dropdown-menu.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/input.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/select.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/sonner.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/switch.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/textarea.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/lib/LLM/llm.ts (98%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/lib/LLM/testing_constants.ts (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/lib/utils.ts (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/pages/api/roastWebsite.ts (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/tailwind.config.ts (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/tsconfig.json (100%) rename examples/{rag-llama3.mdx => web-data-rag--with-llama3.mdx} (100%) diff --git a/examples/k8n/README.md b/examples/kubernetes-cluster-install/README.md similarity index 100% rename from examples/k8n/README.md rename to examples/kubernetes-cluster-install/README.md diff --git a/examples/k8n/api.yaml b/examples/kubernetes-cluster-install/api.yaml similarity index 100% rename from examples/k8n/api.yaml rename to examples/kubernetes-cluster-install/api.yaml diff --git a/examples/k8n/configmap.yaml b/examples/kubernetes-cluster-install/configmap.yaml similarity index 100% rename from examples/k8n/configmap.yaml rename to 
examples/kubernetes-cluster-install/configmap.yaml diff --git a/examples/k8n/playwright-service.yaml b/examples/kubernetes-cluster-install/playwright-service.yaml similarity index 100% rename from examples/k8n/playwright-service.yaml rename to examples/kubernetes-cluster-install/playwright-service.yaml diff --git a/examples/k8n/redis.yaml b/examples/kubernetes-cluster-install/redis.yaml similarity index 100% rename from examples/k8n/redis.yaml rename to examples/kubernetes-cluster-install/redis.yaml diff --git a/examples/k8n/secret.yaml b/examples/kubernetes-cluster-install/secret.yaml similarity index 100% rename from examples/k8n/secret.yaml rename to examples/kubernetes-cluster-install/secret.yaml diff --git a/examples/k8n/worker.yaml b/examples/kubernetes-cluster-install/worker.yaml similarity index 100% rename from examples/k8n/worker.yaml rename to examples/kubernetes-cluster-install/worker.yaml diff --git a/examples/roastmywebsite/.eslintrc.json b/examples/roastmywebsite-example-app/.eslintrc.json similarity index 100% rename from examples/roastmywebsite/.eslintrc.json rename to examples/roastmywebsite-example-app/.eslintrc.json diff --git a/examples/roastmywebsite/.gitignore b/examples/roastmywebsite-example-app/.gitignore similarity index 100% rename from examples/roastmywebsite/.gitignore rename to examples/roastmywebsite-example-app/.gitignore diff --git a/examples/roastmywebsite/README.md b/examples/roastmywebsite-example-app/README.md similarity index 100% rename from examples/roastmywebsite/README.md rename to examples/roastmywebsite-example-app/README.md diff --git a/examples/roastmywebsite/components.json b/examples/roastmywebsite-example-app/components.json similarity index 100% rename from examples/roastmywebsite/components.json rename to examples/roastmywebsite-example-app/components.json diff --git a/examples/roastmywebsite/next.config.mjs b/examples/roastmywebsite-example-app/next.config.mjs similarity index 100% rename from examples/roastmywebsite/next.config.mjs rename to examples/roastmywebsite-example-app/next.config.mjs diff --git a/examples/roastmywebsite/package-lock.json b/examples/roastmywebsite-example-app/package-lock.json similarity index 100% rename from examples/roastmywebsite/package-lock.json rename to examples/roastmywebsite-example-app/package-lock.json diff --git a/examples/roastmywebsite/package.json b/examples/roastmywebsite-example-app/package.json similarity index 100% rename from examples/roastmywebsite/package.json rename to examples/roastmywebsite-example-app/package.json diff --git a/examples/roastmywebsite/postcss.config.mjs b/examples/roastmywebsite-example-app/postcss.config.mjs similarity index 100% rename from examples/roastmywebsite/postcss.config.mjs rename to examples/roastmywebsite-example-app/postcss.config.mjs diff --git a/examples/roastmywebsite/public/android-chrome-192x192.png b/examples/roastmywebsite-example-app/public/android-chrome-192x192.png similarity index 100% rename from examples/roastmywebsite/public/android-chrome-192x192.png rename to examples/roastmywebsite-example-app/public/android-chrome-192x192.png diff --git a/examples/roastmywebsite/public/android-chrome-512x512.png b/examples/roastmywebsite-example-app/public/android-chrome-512x512.png similarity index 100% rename from examples/roastmywebsite/public/android-chrome-512x512.png rename to examples/roastmywebsite-example-app/public/android-chrome-512x512.png diff --git a/examples/roastmywebsite/public/apple-touch-icon.png 
b/examples/roastmywebsite-example-app/public/apple-touch-icon.png similarity index 100% rename from examples/roastmywebsite/public/apple-touch-icon.png rename to examples/roastmywebsite-example-app/public/apple-touch-icon.png diff --git a/examples/roastmywebsite/public/bgd.png b/examples/roastmywebsite-example-app/public/bgd.png similarity index 100% rename from examples/roastmywebsite/public/bgd.png rename to examples/roastmywebsite-example-app/public/bgd.png diff --git a/examples/roastmywebsite/public/favicon-16x16.png b/examples/roastmywebsite-example-app/public/favicon-16x16.png similarity index 100% rename from examples/roastmywebsite/public/favicon-16x16.png rename to examples/roastmywebsite-example-app/public/favicon-16x16.png diff --git a/examples/roastmywebsite/public/favicon-32x32.png b/examples/roastmywebsite-example-app/public/favicon-32x32.png similarity index 100% rename from examples/roastmywebsite/public/favicon-32x32.png rename to examples/roastmywebsite-example-app/public/favicon-32x32.png diff --git a/examples/roastmywebsite/public/favicon.ico b/examples/roastmywebsite-example-app/public/favicon.ico similarity index 100% rename from examples/roastmywebsite/public/favicon.ico rename to examples/roastmywebsite-example-app/public/favicon.ico diff --git a/examples/roastmywebsite/public/next.svg b/examples/roastmywebsite-example-app/public/next.svg similarity index 100% rename from examples/roastmywebsite/public/next.svg rename to examples/roastmywebsite-example-app/public/next.svg diff --git a/examples/roastmywebsite/public/og.png b/examples/roastmywebsite-example-app/public/og.png similarity index 100% rename from examples/roastmywebsite/public/og.png rename to examples/roastmywebsite-example-app/public/og.png diff --git a/examples/roastmywebsite/public/site.webmanifest b/examples/roastmywebsite-example-app/public/site.webmanifest similarity index 100% rename from examples/roastmywebsite/public/site.webmanifest rename to examples/roastmywebsite-example-app/public/site.webmanifest diff --git a/examples/roastmywebsite/public/vercel.svg b/examples/roastmywebsite-example-app/public/vercel.svg similarity index 100% rename from examples/roastmywebsite/public/vercel.svg rename to examples/roastmywebsite-example-app/public/vercel.svg diff --git a/examples/roastmywebsite/src/app/favicon.ico b/examples/roastmywebsite-example-app/src/app/favicon.ico similarity index 100% rename from examples/roastmywebsite/src/app/favicon.ico rename to examples/roastmywebsite-example-app/src/app/favicon.ico diff --git a/examples/roastmywebsite/src/app/globals.css b/examples/roastmywebsite-example-app/src/app/globals.css similarity index 100% rename from examples/roastmywebsite/src/app/globals.css rename to examples/roastmywebsite-example-app/src/app/globals.css diff --git a/examples/roastmywebsite/src/app/hooks/useGithubStars.ts b/examples/roastmywebsite-example-app/src/app/hooks/useGithubStars.ts similarity index 100% rename from examples/roastmywebsite/src/app/hooks/useGithubStars.ts rename to examples/roastmywebsite-example-app/src/app/hooks/useGithubStars.ts diff --git a/examples/roastmywebsite/src/app/layout.tsx b/examples/roastmywebsite-example-app/src/app/layout.tsx similarity index 100% rename from examples/roastmywebsite/src/app/layout.tsx rename to examples/roastmywebsite-example-app/src/app/layout.tsx diff --git a/examples/roastmywebsite/src/app/page.tsx b/examples/roastmywebsite-example-app/src/app/page.tsx similarity index 100% rename from examples/roastmywebsite/src/app/page.tsx rename 
to examples/roastmywebsite-example-app/src/app/page.tsx diff --git a/examples/roastmywebsite/src/components/github-button.tsx b/examples/roastmywebsite-example-app/src/components/github-button.tsx similarity index 100% rename from examples/roastmywebsite/src/components/github-button.tsx rename to examples/roastmywebsite-example-app/src/components/github-button.tsx diff --git a/examples/roastmywebsite/src/components/main.tsx b/examples/roastmywebsite-example-app/src/components/main.tsx similarity index 100% rename from examples/roastmywebsite/src/components/main.tsx rename to examples/roastmywebsite-example-app/src/components/main.tsx diff --git a/examples/roastmywebsite/src/components/ui/button.tsx b/examples/roastmywebsite-example-app/src/components/ui/button.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/button.tsx rename to examples/roastmywebsite-example-app/src/components/ui/button.tsx diff --git a/examples/roastmywebsite/src/components/ui/dialog.tsx b/examples/roastmywebsite-example-app/src/components/ui/dialog.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/dialog.tsx rename to examples/roastmywebsite-example-app/src/components/ui/dialog.tsx diff --git a/examples/roastmywebsite/src/components/ui/dropdown-menu.tsx b/examples/roastmywebsite-example-app/src/components/ui/dropdown-menu.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/dropdown-menu.tsx rename to examples/roastmywebsite-example-app/src/components/ui/dropdown-menu.tsx diff --git a/examples/roastmywebsite/src/components/ui/input.tsx b/examples/roastmywebsite-example-app/src/components/ui/input.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/input.tsx rename to examples/roastmywebsite-example-app/src/components/ui/input.tsx diff --git a/examples/roastmywebsite/src/components/ui/select.tsx b/examples/roastmywebsite-example-app/src/components/ui/select.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/select.tsx rename to examples/roastmywebsite-example-app/src/components/ui/select.tsx diff --git a/examples/roastmywebsite/src/components/ui/sonner.tsx b/examples/roastmywebsite-example-app/src/components/ui/sonner.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/sonner.tsx rename to examples/roastmywebsite-example-app/src/components/ui/sonner.tsx diff --git a/examples/roastmywebsite/src/components/ui/switch.tsx b/examples/roastmywebsite-example-app/src/components/ui/switch.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/switch.tsx rename to examples/roastmywebsite-example-app/src/components/ui/switch.tsx diff --git a/examples/roastmywebsite/src/components/ui/textarea.tsx b/examples/roastmywebsite-example-app/src/components/ui/textarea.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/textarea.tsx rename to examples/roastmywebsite-example-app/src/components/ui/textarea.tsx diff --git a/examples/roastmywebsite/src/lib/LLM/llm.ts b/examples/roastmywebsite-example-app/src/lib/LLM/llm.ts similarity index 98% rename from examples/roastmywebsite/src/lib/LLM/llm.ts rename to examples/roastmywebsite-example-app/src/lib/LLM/llm.ts index 39dcf10..1d290a3 100644 --- a/examples/roastmywebsite/src/lib/LLM/llm.ts +++ b/examples/roastmywebsite-example-app/src/lib/LLM/llm.ts @@ -1,4 +1,4 @@ -import OpenAI from "openai"; +import OpenAI from "openai/index.mjs"; import { encoding_for_model } from 
"@dqbd/tiktoken"; /** diff --git a/examples/roastmywebsite/src/lib/LLM/testing_constants.ts b/examples/roastmywebsite-example-app/src/lib/LLM/testing_constants.ts similarity index 100% rename from examples/roastmywebsite/src/lib/LLM/testing_constants.ts rename to examples/roastmywebsite-example-app/src/lib/LLM/testing_constants.ts diff --git a/examples/roastmywebsite/src/lib/utils.ts b/examples/roastmywebsite-example-app/src/lib/utils.ts similarity index 100% rename from examples/roastmywebsite/src/lib/utils.ts rename to examples/roastmywebsite-example-app/src/lib/utils.ts diff --git a/examples/roastmywebsite/src/pages/api/roastWebsite.ts b/examples/roastmywebsite-example-app/src/pages/api/roastWebsite.ts similarity index 100% rename from examples/roastmywebsite/src/pages/api/roastWebsite.ts rename to examples/roastmywebsite-example-app/src/pages/api/roastWebsite.ts diff --git a/examples/roastmywebsite/tailwind.config.ts b/examples/roastmywebsite-example-app/tailwind.config.ts similarity index 100% rename from examples/roastmywebsite/tailwind.config.ts rename to examples/roastmywebsite-example-app/tailwind.config.ts diff --git a/examples/roastmywebsite/tsconfig.json b/examples/roastmywebsite-example-app/tsconfig.json similarity index 100% rename from examples/roastmywebsite/tsconfig.json rename to examples/roastmywebsite-example-app/tsconfig.json diff --git a/examples/rag-llama3.mdx b/examples/web-data-rag--with-llama3.mdx similarity index 100% rename from examples/rag-llama3.mdx rename to examples/web-data-rag--with-llama3.mdx From 06b0d01fd430a8686748b8926668f55a02324d17 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 11 Jun 2024 12:23:36 -0400 Subject: [PATCH 12/29] Update examples --- ...ing-llms.mdx => web-data-contradiction-testing-using-llms.mdx} | 0 ...traction-using-llms.mdx => web-data-extraction-using-llms.mdx} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename examples/{contradiction-testing-using-llms.mdx => web-data-contradiction-testing-using-llms.mdx} (100%) rename examples/{data-extraction-using-llms.mdx => web-data-extraction-using-llms.mdx} (100%) diff --git a/examples/contradiction-testing-using-llms.mdx b/examples/web-data-contradiction-testing-using-llms.mdx similarity index 100% rename from examples/contradiction-testing-using-llms.mdx rename to examples/web-data-contradiction-testing-using-llms.mdx diff --git a/examples/data-extraction-using-llms.mdx b/examples/web-data-extraction-using-llms.mdx similarity index 100% rename from examples/data-extraction-using-llms.mdx rename to examples/web-data-extraction-using-llms.mdx From a9f93c2f1e9d02303b24dd49862602d0fd5828dd Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 11 Jun 2024 14:18:05 -0300 Subject: [PATCH 13/29] Added route to clean completed jobs and a github action cron that triggers every 24h --- .../clean-before-24h-complete-jobs.yml | 17 +++++++++++++++ apps/api/src/index.ts | 21 +++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 .github/workflows/clean-before-24h-complete-jobs.yml diff --git a/.github/workflows/clean-before-24h-complete-jobs.yml b/.github/workflows/clean-before-24h-complete-jobs.yml new file mode 100644 index 0000000..2fd3b22 --- /dev/null +++ b/.github/workflows/clean-before-24h-complete-jobs.yml @@ -0,0 +1,17 @@ +name: Clean Before 24h Completed Jobs +on: + schedule: + - cron: '0 0 * * *' + +jobs: + clean-jobs: + runs-on: ubuntu-latest + steps: + - name: Send GET request to clean jobs + run: | 
+ response=$(curl --write-out '%{http_code}' --silent --output /dev/null https://api.firecrawl.dev/clean-before-24h-complete-jobs) + if [ "$response" -ne 200 ]; then + echo "Failed to clean jobs. Response: $response" + exit 1 + fi + echo "Successfully cleaned jobs. Response: $response" diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 0246a1e..eac8204 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -164,6 +164,27 @@ app.get('/serverHealthCheck/notify', async (req, res) => { } }); +app.get('/clean-before-24h-complete-jobs', async (req, res) => { + try { + const webScraperQueue = getWebScraperQueue(); + const completedJobs = await webScraperQueue.getJobs(['completed']); + const before24hJobs = completedJobs.filter(job => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000); + const jobIds = before24hJobs.map(job => job.id) as string[]; + let count = 0; + for (const jobId of jobIds) { + try { + await webScraperQueue.removeJobs(jobId); + count++; + } catch (jobError) { + console.error(`Failed to remove job with ID ${jobId}:`, jobError); + } + } + res.status(200).send(`Removed ${count} completed jobs.`); + } catch (error) { + console.error('Failed to clean last 24h complete jobs:', error); + res.status(500).send('Failed to clean jobs'); + } +}); app.get("/is-production", (req, res) => { res.send({ isProduction: global.isProduction }); From ee282c3d5537f87ee81f84cf6ea6999c422268c0 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 11 Jun 2024 15:24:39 -0300 Subject: [PATCH 14/29] Added allowBackwardCrawling option --- apps/api/src/controllers/crawl.ts | 8 +++----- apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/crawler.ts | 12 +++++++++++- apps/api/src/scraper/WebScraper/index.ts | 4 +++- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 5345b4f..55c3a2e 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -55,7 +55,7 @@ export async function crawlController(req: Request, res: Response) { } const mode = req.body.mode ?? "crawl"; - const crawlerOptions = req.body.crawlerOptions ?? {}; + const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false, returnOnlyUrls: true }; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; if (mode === "single_urls" && !url.includes(",")) { @@ -64,9 +64,7 @@ export async function crawlController(req: Request, res: Response) { await a.setOptions({ mode: "single_urls", urls: [url], - crawlerOptions: { - returnOnlyUrls: true, - }, + crawlerOptions: crawlerOptions, pageOptions: pageOptions, }); @@ -91,7 +89,7 @@ export async function crawlController(req: Request, res: Response) { const job = await addWebScraperJob({ url: url, mode: mode ?? "crawl", // fix for single urls not working - crawlerOptions: { ...crawlerOptions }, + crawlerOptions: crawlerOptions, team_id: team_id, pageOptions: pageOptions, origin: req.body.origin ?? 
"api", diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 744c07b..facc81e 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -46,6 +46,7 @@ export type CrawlerOptions = { replaceAllPathsWithAbsolutePaths?: boolean; ignoreSitemap?: boolean; mode?: "default" | "fast"; // have a mode of some sort + allowBackwardCrawling?: boolean; } export type WebScraperOptions = { diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index fc95e7c..7720991 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -20,6 +20,7 @@ export class WebCrawler { private robotsTxtUrl: string; private robots: any; private generateImgAltText: boolean; + private allowBackwardCrawling: boolean; constructor({ initialUrl, @@ -29,6 +30,7 @@ export class WebCrawler { limit = 10000, generateImgAltText = false, maxCrawledDepth = 10, + allowBackwardCrawling = false }: { initialUrl: string; includes?: string[]; @@ -37,6 +39,7 @@ export class WebCrawler { limit?: number; generateImgAltText?: boolean; maxCrawledDepth?: number; + allowBackwardCrawling?: boolean; }) { this.initialUrl = initialUrl; this.baseUrl = new URL(initialUrl).origin; @@ -49,6 +52,7 @@ export class WebCrawler { this.maxCrawledLinks = maxCrawledLinks ?? limit; this.maxCrawledDepth = maxCrawledDepth ?? 10; this.generateImgAltText = generateImgAltText ?? false; + this.allowBackwardCrawling = allowBackwardCrawling ?? false; } private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { @@ -90,10 +94,16 @@ export class WebCrawler { const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); // Ensure the protocol and hostname match, and the path starts with the initial URL's path - if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { + if (linkHostname !== initialHostname) { return false; } + if (!this.allowBackwardCrawling) { + if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { + return false; + } + } + const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true; // Check if the link is disallowed by robots.txt if (!isAllowed) { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 7dcd175..5344320 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -38,8 +38,8 @@ export class WebScraperDataProvider { private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo"; private crawlerMode: string = "default"; + private allowBackwardCrawling: boolean = false; - authorize(): void { throw new Error("Method not implemented."); } @@ -171,6 +171,7 @@ export class WebScraperDataProvider { maxCrawledDepth: this.maxCrawledDepth, limit: this.limit, generateImgAltText: this.generateImgAltText, + allowBackwardCrawling: this.allowBackwardCrawling, }); let links = await crawler.start( @@ -480,6 +481,7 @@ export class WebScraperDataProvider { this.excludes = this.excludes.filter((item) => item !== ""); this.crawlerMode = options.crawlerOptions?.mode ?? "default"; this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false; + this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? 
false; // make sure all urls start with https:// this.urls = this.urls.map((url) => { From b87725c683fff5ac4bdaeb6464a6b6dd1755e3b7 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 12:08:49 -0700 Subject: [PATCH 15/29] Update openapi.json --- apps/api/openapi.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 55bfe1c..7147af1 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -190,6 +190,11 @@ "description": "Ignore the website sitemap when crawling", "default": false }, + "replaceAllPathsWithAbsolutePaths": { + "type": "boolean", + "description": "Replace all relative paths with absolute paths for images and links", + "default": false + }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl", From 520739c9f44b77d94288f3ea9e0433330ae1bc12 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 12:43:16 -0700 Subject: [PATCH 16/29] Nick: fixed bugs associated with absolute path replacements --- apps/api/openapi.json | 10 +++++----- apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/index.ts | 11 +++++----- .../scraper/WebScraper/utils/replacePaths.ts | 20 +++++++++++-------- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 7147af1..a755e37 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -190,11 +190,6 @@ "description": "Ignore the website sitemap when crawling", "default": false }, - "replaceAllPathsWithAbsolutePaths": { - "type": "boolean", - "description": "Replace all relative paths with absolute paths for images and links", - "default": false - }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl", @@ -223,6 +218,11 @@ "headers": { "type": "object", "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc." + }, + "replaceAllPathsWithAbsolutePaths": { + "type": "boolean", + "description": "Replace all relative paths with absolute paths for images and links", + "default": false } } } diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 744c07b..d5002c7 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -18,6 +18,7 @@ export type PageOptions = { waitFor?: number; screenshot?: boolean; headers?: Record; + replaceAllPathsWithAbsolutePaths?: boolean; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 7dcd175..54897f1 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -302,9 +302,10 @@ export class WebScraperDataProvider { } private applyPathReplacements(documents: Document[]): Document[] { - return this.replaceAllPathsWithAbsolutePaths - ? replacePathsWithAbsolutePaths(documents) - : replaceImgPathsWithAbsolutePaths(documents); + if (this.replaceAllPathsWithAbsolutePaths) { + documents = replacePathsWithAbsolutePaths(documents); + } + return replaceImgPathsWithAbsolutePaths(documents); } private async applyImgAltText(documents: Document[]): Promise { @@ -473,9 +474,9 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false }; + this.pageOptions = options.pageOptions ?? 
{ onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false }; this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} - this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; + this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); this.crawlerMode = options.crawlerOptions?.mode ?? "default"; diff --git a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts index d652611..788916c 100644 --- a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts +++ b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts @@ -10,7 +10,8 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] ) || []; paths.forEach((path: string) => { - const isImage = path.startsWith("!"); + try { + const isImage = path.startsWith("!"); let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/); let url = matchedUrl[1]; @@ -22,18 +23,18 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] } const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0]; - if (isImage) { - document.content = document.content.replace( - path, - `${markdownLinkOrImageText}(${url})` - ); - } else { + // Image is handled afterwards + if (!isImage) { document.content = document.content.replace( path, `${markdownLinkOrImageText}(${url})` ); + } + } catch (error) { + } }); + document.markdown = document.content; }); return documents; @@ -60,8 +61,10 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen if (!imageUrl.startsWith("http")) { if (imageUrl.startsWith("/")) { imageUrl = imageUrl.substring(1); + imageUrl = new URL(imageUrl, baseUrl).toString(); + } else { + imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString(); } - imageUrl = new URL(imageUrl, baseUrl).toString(); } } @@ -70,6 +73,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen `![${altText}](${imageUrl})` ); }); + document.markdown = document.content; }); return documents; From 2239e03269ec8ef3c3dba2596ac8994fa4562b05 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 12:54:02 -0700 Subject: [PATCH 17/29] Update replacePaths.test.ts --- .../WebScraper/utils/__tests__/replacePaths.test.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts index aae567c..6ecd990 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts @@ -6,12 +6,12 @@ describe('replacePaths', () => { it('should replace relative paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).' + content: 'This is a [link](/path/to/resource).' 
}]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).' + content: 'This is a [link](https://example.com/path/to/resource).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -21,7 +21,7 @@ describe('replacePaths', () => { it('should not alter absolute URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).' + content: 'This is an [external link](https://external.com/path).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -41,12 +41,12 @@ describe('replacePaths', () => { it('should handle multiple links and images correctly', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).' + content: 'Here are two links: [link1](/path1) and [link2](/path2).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).' + content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).' }]; const result = replacePathsWithAbsolutePaths(documents); From 1e3e06a1d57bffdafb7f562ca9fd5a4cb15ad05f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 13:02:39 -0700 Subject: [PATCH 18/29] Update replacePaths.test.ts --- .../utils/__tests__/replacePaths.test.ts | 39 ++++++++++++------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts index 6ecd990..e201926 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts @@ -6,12 +6,14 @@ describe('replacePaths', () => { it('should replace relative paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](/path/to/resource).' + content: 'This is a [link](/path/to/resource).', + markdown: 'This is a [link](/path/to/resource).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](https://example.com/path/to/resource).' + content: 'This is a [link](https://example.com/path/to/resource).', + markdown: 'This is a [link](https://example.com/path/to/resource).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -21,7 +23,8 @@ describe('replacePaths', () => { it('should not alter absolute URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an [external link](https://external.com/path).' + content: 'This is an [external link](https://external.com/path).', + markdown: 'This is an [external link](https://external.com/path).' 
}]; const result = replacePathsWithAbsolutePaths(documents); @@ -31,7 +34,8 @@ describe('replacePaths', () => { it('should not alter data URLs for images', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).' + content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).', + markdown: 'This is an image: ![alt text](data:image/png;base64,ABC123==).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -41,12 +45,14 @@ describe('replacePaths', () => { it('should handle multiple links and images correctly', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](/path1) and [link2](/path2).' + content: 'Here are two links: [link1](/path1) and [link2](/path2).', + markdown: 'Here are two links: [link1](/path1) and [link2](/path2).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).' + content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).', + markdown: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -56,12 +62,14 @@ describe('replacePaths', () => { it('should correctly handle a mix of absolute and relative paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' + content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).', + markdown: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' + content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).', + markdown: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -74,12 +82,14 @@ describe('replacePaths', () => { it('should replace relative image paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here is an image: ![alt text](/path/to/image.jpg).' + content: 'Here is an image: ![alt text](/path/to/image.jpg).', + markdown: 'Here is an image: ![alt text](/path/to/image.jpg).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' + content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).', + markdown: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' 
}];
 
     const result = replaceImgPathsWithAbsolutePaths(documents);
@@ -89,7 +99,8 @@ describe('replacePaths', () => {
   it('should not alter data:image URLs', () => {
     const documents: Document[] = [{
       metadata: { sourceURL: 'https://example.com' },
-      content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).'
+      content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).',
+      markdown: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).'
     }];
 
     const result = replaceImgPathsWithAbsolutePaths(documents);
@@ -99,12 +110,14 @@ describe('replacePaths', () => {
   it('should handle multiple images with a mix of data and relative URLs', () => {
     const documents: Document[] = [{
       metadata: { sourceURL: 'https://example.com' },
-      content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).'
+      content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).',
+      markdown: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).'
     }];
 
     const expectedDocuments: Document[] = [{
       metadata: { sourceURL: 'https://example.com' },
-      content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).'
+      content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).',
+      markdown: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).'
     }];
 
     const result = replaceImgPathsWithAbsolutePaths(documents);

From def2ba998717fcbf97d9fe0679bc92e4e4657fa6 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 11 Jun 2024 17:46:25 -0300
Subject: [PATCH 19/29] added tests

---
 .../src/__tests__/e2e_withAuth/index.test.ts  | 114 ++++++++++++++----
 1 file changed, 90 insertions(+), 24 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index f619254..05dd7ff 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -596,7 +596,7 @@ describe("E2E Tests for API Routes", () => {
       .post("/v0/crawl")
       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
       .set("Content-Type", "application/json")
-      .send({ url: "https://roastmywebsite.ai" });
+      .send({ url: "https://mendable.ai/blog" });
     expect(crawlResponse.statusCode).toBe(200);
 
     let isCompleted = false;
@@ -622,7 +622,13 @@ describe("E2E Tests for API Routes", () => {
     expect(completedResponse.body.data[0]).toHaveProperty("content");
     expect(completedResponse.body.data[0]).toHaveProperty("markdown");
     expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-    expect(completedResponse.body.data[0].content).toContain("_Roast_");
+    expect(completedResponse.body.data[0].content).toContain("Mendable");
+
+    const childrenLinks = completedResponse.body.data.filter(doc =>
+      doc.sourceURL && doc.sourceURL.startsWith("https://mendable.ai/blog")
+    );
+
+    expect(childrenLinks.length).toBe(completedResponse.body.data.length);
   }, 120000); // 120 seconds
 
   it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
@@ -757,40 +763,100 @@ describe("E2E Tests for API Routes", () => {
   }, 60000);
   }); // 60 seconds
 
- 
it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { + it.concurrent("should return a successful response for a valid crawl job with allowBackwardCrawling set to true option", async () => { const crawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://jestjs.io" }); + .send({ + url: "https://mendable.ai/blog", + pageOptions: { includeHtml: true }, + crawlerOptions: { allowBackwardCrawling: true }, + }); expect(crawlResponse.statusCode).toBe(200); + + let isFinished = false; + let completedResponse; - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 20000)); + while (!isFinished) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); - const response = await request(TEST_URL) - .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("cancelled"); + if (response.body.status === "completed") { + isFinished = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } - await new Promise((r) => setTimeout(r, 10000)); - - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("failed"); + expect(completedResponse.body.status).toBe("completed"); expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data).toEqual(null); - expect(completedResponse.body).toHaveProperty("partial_data"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0]).toHaveProperty("html"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].markdown).toContain("Mendable"); + + const onlyChildrenLinks = completedResponse.body.data.filter(doc => { + return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") + }); + + expect(completedResponse.body.data.length).toBeGreaterThan(onlyChildrenLinks.length); + }, 60000); + + // it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { + // const crawlResponse = await request(TEST_URL) + // .post("/v0/crawl") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: "https://scrapethissite.com" }); + + // expect(crawlResponse.statusCode).toBe(200); + + // await new Promise((r) => setTimeout(r, 2000)); // Wait for 1 
seconds before cancelling the job + + // const responseCancel = await request(TEST_URL) + // .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + // expect(responseCancel.statusCode).toBe(200); + + // let isFinished = false; + // let completedResponse; + + // while (!isFinished) { + // const response = await request(TEST_URL) + // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty("status"); + // console.log(response.body.status) + + // if (response.body.status === "failed") { + // isFinished = true; + // completedResponse = response; + // } else { + // await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + // } + // } + + // expect(completedResponse.statusCode).toBe(200); + // expect(completedResponse.body).toHaveProperty("status"); + // expect(completedResponse.body.status).toBe("failed"); + // expect(completedResponse.body).toHaveProperty("data"); + // expect(completedResponse.body.data).toBeNull(); + // expect(completedResponse.body).toHaveProperty("partial_data"); + // expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); + // expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); + // expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); - }, 60000); // 60 seconds + // }, 60000); // 60 seconds describe("POST /v0/scrape with LLM Extraction", () => { it.concurrent("should extract data using LLM extraction mode", async () => { From df3a678cf485107558f38b66db96381ca5012d14 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 11 Jun 2024 17:46:56 -0300 Subject: [PATCH 20/29] getting back the cancel test, this should work --- .../src/__tests__/e2e_withAuth/index.test.ts | 76 +++++++++---------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 05dd7ff..5adf05d 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -811,52 +811,52 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data.length).toBeGreaterThan(onlyChildrenLinks.length); }, 60000); - // it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { - // const crawlResponse = await request(TEST_URL) - // .post("/v0/crawl") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ url: "https://scrapethissite.com" }); + it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://scrapethissite.com" }); - // expect(crawlResponse.statusCode).toBe(200); + expect(crawlResponse.statusCode).toBe(200); - // await new Promise((r) => setTimeout(r, 2000)); // Wait for 1 seconds before cancelling the job + await new Promise((r) => setTimeout(r, 2000)); // Wait for 1 seconds before cancelling the job - // const responseCancel = await request(TEST_URL) - // .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) - // 
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - // expect(responseCancel.statusCode).toBe(200); + const responseCancel = await request(TEST_URL) + .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(responseCancel.statusCode).toBe(200); - // let isFinished = false; - // let completedResponse; + let isFinished = false; + let completedResponse; - // while (!isFinished) { - // const response = await request(TEST_URL) - // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - // expect(response.statusCode).toBe(200); - // expect(response.body).toHaveProperty("status"); - // console.log(response.body.status) + while (!isFinished) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + console.log(response.body.status) - // if (response.body.status === "failed") { - // isFinished = true; - // completedResponse = response; - // } else { - // await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again - // } - // } + if (response.body.status === "failed") { + isFinished = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } - // expect(completedResponse.statusCode).toBe(200); - // expect(completedResponse.body).toHaveProperty("status"); - // expect(completedResponse.body.status).toBe("failed"); - // expect(completedResponse.body).toHaveProperty("data"); - // expect(completedResponse.body.data).toBeNull(); - // expect(completedResponse.body).toHaveProperty("partial_data"); - // expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); - // expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); - // expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("failed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data).toBeNull(); + expect(completedResponse.body).toHaveProperty("partial_data"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); - // }, 60000); // 60 seconds + }, 60000); // 60 seconds describe("POST /v0/scrape with LLM Extraction", () => { it.concurrent("should extract data using LLM extraction mode", async () => { From 157fbe4a1ea67e4807426696b5f9b3de446641c8 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 11 Jun 2024 17:52:01 -0300 Subject: [PATCH 21/29] added bull auth key --- .github/workflows/clean-before-24h-complete-jobs.yml | 5 ++++- apps/api/src/index.ts | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/clean-before-24h-complete-jobs.yml b/.github/workflows/clean-before-24h-complete-jobs.yml index 2fd3b22..2ced537 100644 --- a/.github/workflows/clean-before-24h-complete-jobs.yml +++ b/.github/workflows/clean-before-24h-complete-jobs.yml @@ -3,13 +3,16 @@ on: schedule: - cron: '0 0 * 
* *' +env: + BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} + jobs: clean-jobs: runs-on: ubuntu-latest steps: - name: Send GET request to clean jobs run: | - response=$(curl --write-out '%{http_code}' --silent --output /dev/null https://api.firecrawl.dev/clean-before-24h-complete-jobs) + response=$(curl --write-out '%{http_code}' --silent --output /dev/null https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/clean-before-24h-complete-jobs) if [ "$response" -ne 200 ]; then echo "Failed to clean jobs. Response: $response" exit 1 diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index eac8204..cc8376b 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -164,7 +164,7 @@ app.get('/serverHealthCheck/notify', async (req, res) => { } }); -app.get('/clean-before-24h-complete-jobs', async (req, res) => { +app.get(`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, async (req, res) => { try { const webScraperQueue = getWebScraperQueue(); const completedJobs = await webScraperQueue.getJobs(['completed']); From d4df6f049d842c975fc5df15e24fd80fb031f322 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 15:49:30 -0700 Subject: [PATCH 22/29] Nick: --- .github/{workflows => archive}/js-sdk.yml | 0 .github/{workflows => archive}/publish-js-sdk.yml | 0 .github/{workflows => archive}/publish-python-sdk.yml | 0 .github/{workflows => archive}/python-sdk.yml | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename .github/{workflows => archive}/js-sdk.yml (100%) rename .github/{workflows => archive}/publish-js-sdk.yml (100%) rename .github/{workflows => archive}/publish-python-sdk.yml (100%) rename .github/{workflows => archive}/python-sdk.yml (100%) diff --git a/.github/workflows/js-sdk.yml b/.github/archive/js-sdk.yml similarity index 100% rename from .github/workflows/js-sdk.yml rename to .github/archive/js-sdk.yml diff --git a/.github/workflows/publish-js-sdk.yml b/.github/archive/publish-js-sdk.yml similarity index 100% rename from .github/workflows/publish-js-sdk.yml rename to .github/archive/publish-js-sdk.yml diff --git a/.github/workflows/publish-python-sdk.yml b/.github/archive/publish-python-sdk.yml similarity index 100% rename from .github/workflows/publish-python-sdk.yml rename to .github/archive/publish-python-sdk.yml diff --git a/.github/workflows/python-sdk.yml b/.github/archive/python-sdk.yml similarity index 100% rename from .github/workflows/python-sdk.yml rename to .github/archive/python-sdk.yml From 01c9f071fa554ec687882ad3727e480b3cc09dcd Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 12 Jun 2024 11:27:06 -0300 Subject: [PATCH 23/29] fixed --- .../src/__tests__/e2e_withAuth/index.test.ts | 36 +++++++------------ apps/api/src/controllers/crawl.ts | 4 +-- 2 files changed, 14 insertions(+), 26 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 5adf05d..02e4a47 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -624,11 +624,11 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("Mendable"); - const childrenLinks = completedResponse.body.data.filter(doc => - doc.sourceURL && doc.sourceURL.startsWith("https://mendable.ai/blog") - ); + const childrenLinks = completedResponse.body.data.filter(doc => + 
doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") + ); - expect(childrenLinks.length).toBe(completedResponse.body.data.length); + expect(childrenLinks.length).toBe(completedResponse.body.data.length); }, 120000); // 120 seconds it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => { @@ -816,35 +816,23 @@ describe("E2E Tests for API Routes", () => { .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://scrapethissite.com" }); + .send({ url: "https://jestjs.io" }); expect(crawlResponse.statusCode).toBe(200); - await new Promise((r) => setTimeout(r, 2000)); // Wait for 1 seconds before cancelling the job + await new Promise((r) => setTimeout(r, 20000)); const responseCancel = await request(TEST_URL) .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(responseCancel.statusCode).toBe(200); + expect(responseCancel.body).toHaveProperty("status"); + expect(responseCancel.body.status).toBe("cancelled"); - let isFinished = false; - let completedResponse; - - while (!isFinished) { - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - console.log(response.body.status) - - if (response.body.status === "failed") { - isFinished = true; - completedResponse = response; - } else { - await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again - } - } + await new Promise((r) => setTimeout(r, 10000)); + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 55c3a2e..58d01e2 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -55,7 +55,7 @@ export async function crawlController(req: Request, res: Response) { } const mode = req.body.mode ?? "crawl"; - const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false, returnOnlyUrls: true }; + const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false }; const pageOptions = req.body.pageOptions ?? 
{ onlyMainContent: false, includeHtml: false };
 
   if (mode === "single_urls" && !url.includes(",")) {
     try {
@@ -64,7 +64,7 @@ export async function crawlController(req: Request, res: Response) {
       await a.setOptions({
         mode: "single_urls",
         urls: [url],
-        crawlerOptions: crawlerOptions,
+        crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
         pageOptions: pageOptions,
       });
 
From d20af257baebbeea8fe907f9c3447e2e12eb1d1b Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 12 Jun 2024 15:38:41 -0300
Subject: [PATCH 24/29] Added jobId to webhook data

---
 apps/api/src/services/queue-worker.ts | 4 ++--
 apps/api/src/services/webhook.ts      | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 6772c57..a42b3e8 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -38,7 +38,7 @@ getWebScraperQueue().process(
       error: message /* etc... */,
     };
-    await callWebhook(job.data.team_id, data);
+    await callWebhook(job.data.team_id, job.id as string, data);
     await logJob({
       success: success,
@@ -78,7 +78,7 @@ getWebScraperQueue().process(
       error:
         "Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
     };
-    await callWebhook(job.data.team_id, data);
+    await callWebhook(job.data.team_id, job.id as string, data);
     await logJob({
       success: false,
       message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"),
diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts
index 1f8d647..fc5962b 100644
--- a/apps/api/src/services/webhook.ts
+++ b/apps/api/src/services/webhook.ts
@@ -1,6 +1,6 @@
 import { supabase_service } from "./supabase";
 
-export const callWebhook = async (teamId: string, data: any) => {
+export const callWebhook = async (teamId: string, jobId: string, data: any) => {
   try {
     const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL;
     const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
@@ -47,6 +47,7 @@ export const callWebhook = async (teamId: string, data: any) => {
       },
       body: JSON.stringify({
         success: data.success,
+        jobId: jobId,
         data: dataToSend,
         error: data.error || undefined,
       }),

From 67dc46b454cb07d50ae3bc7fca219f597a009a83 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Wed, 12 Jun 2024 17:53:04 -0700
Subject: [PATCH 25/29] Nick: clusters

---
 .../src/__tests__/e2e_noAuth/index.test.ts    |   1 -
 .../src/__tests__/e2e_withAuth/index.test.ts  |   3 +-
 apps/api/src/index.ts                         | 331 ++++++++++--------
 apps/api/src/services/redis.ts                |  31 +-
 4 files changed, 208 insertions(+), 158 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts
index c443e71..acb2278 100644
--- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts
@@ -1,5 +1,4 @@
 import request from "supertest";
-import { app } from "../../index";
 import dotenv from "dotenv";
 const fs = require("fs");
 const path = require("path");
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 02e4a47..431c7d1 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -1,5 +1,4 @@
 import request from "supertest";
-import { app } from "../../index";
 import dotenv from "dotenv";
 import { v4 as uuidv4 } from "uuid";
 
@@ -35,7 
+34,7 @@ describe("E2E Tests for API Routes", () => { describe("POST /v0/scrape", () => { it.concurrent("should require authorization", async () => { - const response = await request(app).post("/v0/scrape"); + const response = await request(TEST_URL).post("/v0/scrape"); expect(response.statusCode).toBe(401); }); diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index cc8376b..6b62f06 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -5,190 +5,215 @@ import "dotenv/config"; import { getWebScraperQueue } from "./services/queue-service"; import { redisClient } from "./services/rate-limiter"; import { v0Router } from "./routes/v0"; -import { initSDK } from '@hyperdx/node-opentelemetry'; +import { initSDK } from "@hyperdx/node-opentelemetry"; +import cluster from "cluster"; +import os from "os"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { ExpressAdapter } = require("@bull-board/express"); -export const app = express(); +const numCPUs = os.cpus().length; +console.log(`Number of CPUs: ${numCPUs} available`); -global.isProduction = process.env.IS_PRODUCTION === "true"; +if (cluster.isMaster) { + console.log(`Master ${process.pid} is running`); -app.use(bodyParser.urlencoded({ extended: true })); -app.use(bodyParser.json({ limit: "10mb" })); + // Fork workers. + for (let i = 0; i < numCPUs; i++) { + cluster.fork(); + } -app.use(cors()); // Add this line to enable CORS - -const serverAdapter = new ExpressAdapter(); -serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); - -const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({ - queues: [new BullAdapter(getWebScraperQueue())], - serverAdapter: serverAdapter, -}); - -app.use( - `/admin/${process.env.BULL_AUTH_KEY}/queues`, - serverAdapter.getRouter() -); - -app.get("/", (req, res) => { - res.send("SCRAPERS-JS: Hello, world! Fly.io"); -}); - -//write a simple test function -app.get("/test", async (req, res) => { - res.send("Hello, world!"); -}); - -// register router -app.use(v0Router); - -const DEFAULT_PORT = process.env.PORT ?? 3002; -const HOST = process.env.HOST ?? "localhost"; -redisClient.connect(); - -// HyperDX OpenTelemetry -if(process.env.ENV === 'production') { - initSDK({ consoleCapture: true, additionalInstrumentations: []}); -} - - -export function startServer(port = DEFAULT_PORT) { - const server = app.listen(Number(port), HOST, () => { - console.log(`Server listening on port ${port}`); - console.log( - `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` - ); - console.log(""); - console.log("1. Make sure Redis is running on port 6379 by default"); - console.log( - "2. 
If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 " - ); + cluster.on("exit", (worker, code, signal) => { + console.log(`Worker ${worker.process.pid} exited`); + console.log("Starting a new worker"); + cluster.fork(); }); - return server; -} +} else { + const app = express(); -if (require.main === module) { - startServer(); -} + global.isProduction = process.env.IS_PRODUCTION === "true"; -// Use this as a "health check" that way we dont destroy the server -app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => { - try { - const webScraperQueue = getWebScraperQueue(); - const [webScraperActive] = await Promise.all([ - webScraperQueue.getActiveCount(), - ]); + app.use(bodyParser.urlencoded({ extended: true })); + app.use(bodyParser.json({ limit: "10mb" })); - const noActiveJobs = webScraperActive === 0; - // 200 if no active jobs, 503 if there are active jobs - return res.status(noActiveJobs ? 200 : 500).json({ - webScraperActive, - noActiveJobs, - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); + app.use(cors()); // Add this line to enable CORS + + const serverAdapter = new ExpressAdapter(); + serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); + + const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({ + queues: [new BullAdapter(getWebScraperQueue())], + serverAdapter: serverAdapter, + }); + + app.use( + `/admin/${process.env.BULL_AUTH_KEY}/queues`, + serverAdapter.getRouter() + ); + + app.get("/", (req, res) => { + res.send("SCRAPERS-JS: Hello, world! Fly.io"); + }); + + //write a simple test function + app.get("/test", async (req, res) => { + res.send("Hello, world!"); + }); + + // register router + app.use(v0Router); + + const DEFAULT_PORT = process.env.PORT ?? 3002; + const HOST = process.env.HOST ?? "localhost"; + redisClient.connect(); + + // HyperDX OpenTelemetry + if (process.env.ENV === "production") { + initSDK({ consoleCapture: true, additionalInstrumentations: [] }); } -}); -app.get(`/serverHealthCheck`, async (req, res) => { - try { - const webScraperQueue = getWebScraperQueue(); - const [waitingJobs] = await Promise.all([ - webScraperQueue.getWaitingCount(), - ]); - - const noWaitingJobs = waitingJobs === 0; - // 200 if no active jobs, 503 if there are active jobs - return res.status(noWaitingJobs ? 200 : 500).json({ - waitingJobs, + function startServer(port = DEFAULT_PORT) { + const server = app.listen(Number(port), HOST, () => { + console.log(`Worker ${process.pid} listening on port ${port}`); + console.log( + `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` + ); + console.log(""); + console.log("1. Make sure Redis is running on port 6379 by default"); + console.log( + "2. 
If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 " + ); }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); + return server; } -}); -app.get('/serverHealthCheck/notify', async (req, res) => { - if (process.env.SLACK_WEBHOOK_URL) { - const treshold = 1; // The treshold value for the active jobs - const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds + if (require.main === module) { + startServer(); + } - const getWaitingJobsCount = async () => { + // Use this as a "health check" that way we dont destroy the server + app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => { + try { const webScraperQueue = getWebScraperQueue(); - const [waitingJobsCount] = await Promise.all([ + const [webScraperActive] = await Promise.all([ + webScraperQueue.getActiveCount(), + ]); + + const noActiveJobs = webScraperActive === 0; + // 200 if no active jobs, 503 if there are active jobs + return res.status(noActiveJobs ? 200 : 500).json({ + webScraperActive, + noActiveJobs, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } + }); + + app.get(`/serverHealthCheck`, async (req, res) => { + try { + const webScraperQueue = getWebScraperQueue(); + const [waitingJobs] = await Promise.all([ webScraperQueue.getWaitingCount(), ]); - return waitingJobsCount; - }; + const noWaitingJobs = waitingJobs === 0; + // 200 if no active jobs, 503 if there are active jobs + return res.status(noWaitingJobs ? 200 : 500).json({ + waitingJobs, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } + }); - res.status(200).json({ message: "Check initiated" }); + app.get("/serverHealthCheck/notify", async (req, res) => { + if (process.env.SLACK_WEBHOOK_URL) { + const treshold = 1; // The treshold value for the active jobs + const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds - const checkWaitingJobs = async () => { - try { - let waitingJobsCount = await getWaitingJobsCount(); - if (waitingJobsCount >= treshold) { - setTimeout(async () => { - // Re-check the waiting jobs count after the timeout - waitingJobsCount = await getWaitingJobsCount(); - if (waitingJobsCount >= treshold) { - const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL; - const message = { - text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${timeout/60000} minute(s).`, - }; + const getWaitingJobsCount = async () => { + const webScraperQueue = getWebScraperQueue(); + const [waitingJobsCount] = await Promise.all([ + webScraperQueue.getWaitingCount(), + ]); - const response = await fetch(slackWebhookUrl, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - body: JSON.stringify(message), - }) - - if (!response.ok) { - console.error('Failed to send Slack notification') + return waitingJobsCount; + }; + + res.status(200).json({ message: "Check initiated" }); + + const checkWaitingJobs = async () => { + try { + let waitingJobsCount = await getWaitingJobsCount(); + if (waitingJobsCount >= treshold) { + setTimeout(async () => { + // Re-check the waiting jobs count after the timeout + waitingJobsCount = await getWaitingJobsCount(); + if (waitingJobsCount >= treshold) { + const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL; + const message = { + text: `⚠️ Warning: The number of active jobs 
(${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${ + timeout / 60000 + } minute(s).`, + }; + + const response = await fetch(slackWebhookUrl, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify(message), + }); + + if (!response.ok) { + console.error("Failed to send Slack notification"); + } } - } - }, timeout); + }, timeout); + } + } catch (error) { + console.error(error); } - } catch (error) { - console.error(error); - } - }; + }; - checkWaitingJobs(); - } -}); + checkWaitingJobs(); + } + }); -app.get(`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, async (req, res) => { - try { - const webScraperQueue = getWebScraperQueue(); - const completedJobs = await webScraperQueue.getJobs(['completed']); - const before24hJobs = completedJobs.filter(job => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000); - const jobIds = before24hJobs.map(job => job.id) as string[]; - let count = 0; - for (const jobId of jobIds) { + app.get( + `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, + async (req, res) => { try { - await webScraperQueue.removeJobs(jobId); - count++; - } catch (jobError) { - console.error(`Failed to remove job with ID ${jobId}:`, jobError); + const webScraperQueue = getWebScraperQueue(); + const completedJobs = await webScraperQueue.getJobs(["completed"]); + const before24hJobs = completedJobs.filter( + (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 + ); + const jobIds = before24hJobs.map((job) => job.id) as string[]; + let count = 0; + for (const jobId of jobIds) { + try { + await webScraperQueue.removeJobs(jobId); + count++; + } catch (jobError) { + console.error(`Failed to remove job with ID ${jobId}:`, jobError); + } + } + res.status(200).send(`Removed ${count} completed jobs.`); + } catch (error) { + console.error("Failed to clean last 24h complete jobs:", error); + res.status(500).send("Failed to clean jobs"); } } - res.status(200).send(`Removed ${count} completed jobs.`); - } catch (error) { - console.error('Failed to clean last 24h complete jobs:', error); - res.status(500).send('Failed to clean jobs'); - } -}); + ); -app.get("/is-production", (req, res) => { - res.send({ isProduction: global.isProduction }); -}); + app.get("/is-production", (req, res) => { + res.send({ isProduction: global.isProduction }); + }); - -// /workers health check, cant act as load balancer, just has to be a pre deploy thing \ No newline at end of file + console.log(`Worker ${process.pid} started`); +} diff --git a/apps/api/src/services/redis.ts b/apps/api/src/services/redis.ts index f2cedd1..491eeb1 100644 --- a/apps/api/src/services/redis.ts +++ b/apps/api/src/services/redis.ts @@ -1,8 +1,35 @@ -import Redis from 'ioredis'; +import Redis from "ioredis"; // Initialize Redis client const redis = new Redis(process.env.REDIS_URL); +// Listen to 'error' events to the Redis connection +redis.on("error", (error) => { + try { + if (error.message === "ECONNRESET") { + console.log("Connection to Redis Session Store timed out."); + } else if (error.message === "ECONNREFUSED") { + console.log("Connection to Redis Session Store refused!"); + } else console.log(error); + } catch (error) {} +}); + +// Listen to 'reconnecting' event to Redis +redis.on("reconnecting", (err) => { + try { + if (redis.status === "reconnecting") + console.log("Reconnecting to Redis Session Store..."); + else console.log("Error reconnecting to Redis Session Store."); + } catch (error) {} +}); + +// Listen to the 
'connect' event to Redis +redis.on("connect", (err) => { + try { + if (!err) console.log("Connected to Redis Session Store!"); + } catch (error) {} +}); + /** * Set a value in Redis with an optional expiration time. * @param {string} key The key under which to store the value. @@ -11,7 +38,7 @@ const redis = new Redis(process.env.REDIS_URL); */ const setValue = async (key: string, value: string, expire?: number) => { if (expire) { - await redis.set(key, value, 'EX', expire); + await redis.set(key, value, "EX", expire); } else { await redis.set(key, value); } From 11b6d5afa5285476d934900ee6e4db8b8f48710c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 12 Jun 2024 18:00:22 -0700 Subject: [PATCH 26/29] Update fly.toml --- apps/api/fly.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/fly.toml b/apps/api/fly.toml index 6bc8266..468695d 100644 --- a/apps/api/fly.toml +++ b/apps/api/fly.toml @@ -54,7 +54,7 @@ kill_timeout = '5s' soft_limit = 12 [[vm]] - size = 'performance-8x' + size = 'performance-4x' processes = ['app'] From 182f8d4d6c3fbc9598d054d0c13aadbe1dba8b52 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 12 Jun 2024 18:07:05 -0700 Subject: [PATCH 27/29] Update index.ts --- apps/api/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 6b62f06..494b4d5 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -13,7 +13,7 @@ const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { ExpressAdapter } = require("@bull-board/express"); -const numCPUs = os.cpus().length; +const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; console.log(`Number of CPUs: ${numCPUs} available`); if (cluster.isMaster) { From 676d6e8ab5f7a1fd14ff5b76f8289db7543082c4 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 13 Jun 2024 10:51:05 -0300 Subject: [PATCH 28/29] Added pageOptions.removeTags --- apps/api/openapi.json | 19 +++++++++++ .../src/__tests__/e2e_withAuth/index.test.ts | 34 +++++++++++++++++++ apps/api/src/controllers/crawl.ts | 10 ++++-- apps/api/src/controllers/crawlPreview.ts | 2 +- apps/api/src/controllers/search.ts | 2 ++ apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/index.ts | 7 +++- apps/api/src/scraper/WebScraper/single_url.ts | 13 +++++++ 8 files changed, 84 insertions(+), 4 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index a755e37..b07e43f 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -61,6 +61,13 @@ "description": "Wait x amount of milliseconds for the page to load to fetch content", "default": 0 }, + "removeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" + }, "headers": { "type": "object", "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." @@ -194,6 +201,11 @@ "type": "integer", "description": "Maximum number of pages to crawl", "default": 10000 + }, + "allowBackwardCrawling": { + "type": "boolean", + "description": "Allow backward crawling (crawl from the base URL to the previous URLs)", + "default": false } } }, @@ -219,6 +231,13 @@ "type": "object", "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc." 
}, + "removeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" + }, "replaceAllPathsWithAbsolutePaths": { "type": "boolean", "description": "Replace all relative paths with absolute paths for images and links", diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 02e4a47..3423b3a 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -136,6 +136,40 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); }, 60000); // 60 seconds + it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { + const responseWithoutRemoveTags = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com/" }); + expect(responseWithoutRemoveTags.statusCode).toBe(200); + expect(responseWithoutRemoveTags.body).toHaveProperty("data"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("content"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); + expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); + expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site"); + expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer + expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav + expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong + + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.content).toContain("Scrape This Site"); + expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer + expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav + expect(response.body.data.content).not.toContain("web scraping"); // strong + }, 30000); // 30 seconds timeout + // TODO: add this test back once we nail the waitFor option to be more deterministic // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => { // const startTime = Date.now(); diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 58d01e2..7eab78f 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -55,8 +55,14 @@ export async function crawlController(req: Request, res: Response) { } const mode = req.body.mode ?? "crawl"; - const crawlerOptions = req.body.crawlerOptions ?? 
{ allowBackwardCrawling: false };
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+  const crawlerOptions = req.body.crawlerOptions ?? {
+    allowBackwardCrawling: false
+  };
+  const pageOptions = req.body.pageOptions ?? {
+    onlyMainContent: false,
+    includeHtml: false,
+    removeTags: []
+  };
 
   if (mode === "single_urls" && !url.includes(",")) {
     try {
diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts
index d3e9afe..2c3dc4e 100644
--- a/apps/api/src/controllers/crawlPreview.ts
+++ b/apps/api/src/controllers/crawlPreview.ts
@@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
 
   const mode = req.body.mode ?? "crawl";
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
 
   const job = await addWebScraperJob({
     url: url,
diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts
index 7474aae..abbc357 100644
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@@ -85,6 +85,7 @@ export async function searchHelper(
       onlyMainContent: pageOptions?.onlyMainContent ?? true,
       fetchPageContent: pageOptions?.fetchPageContent ?? true,
       includeHtml: pageOptions?.includeHtml ?? false,
+      removeTags: pageOptions?.removeTags ?? [],
       fallback: false,
     },
   });
@@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) {
       includeHtml: false,
       onlyMainContent: true,
       fetchPageContent: true,
+      removeTags: [],
       fallback: false,
     };
   const origin = req.body.origin ?? "api";
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 81bf12c..92170c1 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -19,6 +19,7 @@ export type PageOptions = {
   screenshot?: boolean;
   headers?: Record<string, string>;
   replaceAllPathsWithAbsolutePaths?: boolean;
+  removeTags?: string | string[];
 };
 
 export type ExtractorOptions = {
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index f432f43..1a6ffd0 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -475,7 +475,12 @@ export class WebScraperDataProvider {
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
+    this.pageOptions = options.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      replaceAllPathsWithAbsolutePaths: false,
+      removeTags: []
+    };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
     //! @nicolas, for some reason this was being injected and breaking everything. 
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index c2dcea1..a16f6f0 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -304,6 +304,19 @@ export async function scrapSingleUrl(
   const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
     const soup = cheerio.load(html);
     soup("script, style, iframe, noscript, meta, head").remove();
+
+    if (pageOptions.removeTags) {
+      if (typeof pageOptions.removeTags === 'string') {
+        pageOptions.removeTags.split(',').forEach((tag) => {
+          soup(tag.trim()).remove();
+        });
+      } else if (Array.isArray(pageOptions.removeTags)) {
+        pageOptions.removeTags.forEach((tag) => {
+          soup(tag).remove();
+        });
+      }
+    }
+
     if (pageOptions.onlyMainContent) {
       // remove any other tags that are not in the main content
       excludeNonMainTags.forEach((tag) => {
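To make the new branch in removeUnwantedElements concrete, here is a self-contained sketch of the same cheerio calls on toy input (the HTML and selectors are illustrative, not from the patch):

import * as cheerio from "cheerio";

// Mirrors the array branch above: each selector is removed from the document.
const html =
  '<body><nav class="nav">menu</nav><p>Keep <strong>bold</strong></p><div id="footer">bye</div></body>';
const soup = cheerio.load(html);
[".nav", "#footer", "strong"].forEach((tag) => soup(tag).remove());

console.log(soup("body").html()); // -> "<p>Keep </p>"

Since these are ordinary cheerio/CSS selectors, tags ('strong'), classes ('.nav') and ids ('#footer') all work, matching the description added to the JSON schema above.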
From 6963a490f1284d89756ce9f6290b5c654ae14b79 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Fri, 14 Jun 2024 10:21:44 -0300
Subject: [PATCH 29/29] Updated version

---
 apps/python-sdk/firecrawl/__init__.py              |   2 +-
 .../test.cpython-311-pytest-8.2.1.pyc              | Bin 0 -> 44947 bytes
 2 files changed, 1 insertion(+), 1 deletion(-)
 create mode 100644 apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc

diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index 4e53e77..2fe16ba 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp
 
-__version__ = "0.0.14"
+__version__ = "0.0.15"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ba1f1324fe139772739cdae776d127cd5002ca8
GIT binary patch
literal 44947
[44947 bytes of base85-encoded binary payload omitted]
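PATCH 29/29 is the release bump for the SDK changes in this series; note that it also checks in a compiled pytest cache (test.cpython-311-pytest-8.2.1.pyc under __pycache__), the kind of generated file that is normally kept out of version control. A quick post-publish sanity check, sketched on the assumption that the bumped release has been installed from PyPI:

import firecrawl

# Expect "0.0.15" once the bumped package is installed.
print(firecrawl.__version__)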
ztyLeHLWm}>g2`T(bs_iuVwv&i9Pk!6(?Ji6gF7AHnZqC~j)D6+^V=p4I(r!PHlZ<$e5KMkp zdxUxt5$~}n8)5A;FN!VM2(46)7wqD7(|H*@nz5S1rtClzCIr!UC@iIOlmI=qVODzK zBtF)so9gpK-mZ1N3}UbdhHP?95sT9V&JcJKKn>0GtaFyo+X*~D;0*%b0dQ%&;yD^6 zVbwf!nh7m|gt}AIci&BXJWMnHw zGKs2-Qmw9v;U*X<`EV8qnU)EL;E6PdHF|D~DLJuZKJGQHS|h<2nOHO*k1DE)A|?pU zN+u}M*h&oQYSkqZjaX)v%w1mNs)>YzjO4_kxhtxuDvFpOG%NX_h#9IF)YYm>CK|Dt zy~Bz?2IJIn9emaBf2tG=&UAtj^$sThbOdMQ7B)VA0*U__OMT234GOnhi6iKySuqhV zGIXXzVWTix>?TTmc3`Cf)@+_#1n(JV-2^>DnpSUZ4jt)1qS0Yymp|fq199p{;WJ%Nn%#c zZKyL^gpN$2p9hG#rJq-|+0W|&yt$vt6t~|01iuGn>@p|J}pCm404< zHR#XEyR9K>xYF%E9Vu1Q3UK1Ss%SIdV6Xa52Zf+cBADiC%)${?RGEeLR(dMEn3Yqo z+d`lP)@%`$hXT*7Axiyu2-M);tfI}gBdB_V_iC;6-;h3QowdHw=l4N(8-37@Uq=u6 zAU@w~J*M}8wfdXfA8L^Ese#73S>Dn}=^K&}cE3OJw3iKc8|&}cz`qf{R{s=5N)n&> zgesl%w`e|%XEZIJW)X;=-;%)iCZFd~qFvteY;C&FQLwA=ee~Xq90en5U=-Yhb4b*+ z+MC z_e2ttcMBWp1MGm#k1f>L4t8QF@FjYiew=9rs`X+=st8VysMe|T+iEi}&M+CQQLcL!&;D{G4nV{Ng` z0Mz|rrHHnawjLeA^IwQ{PU}{bN4cKIM`xJU!&srkJ{Wbz+&n1ES{0lZ#|%8{{rvVQ ziQw8@uofN4fgN87<&YfiW%M}P^U+nziq zCyCYOm8S{6c;Gi_S|r=t`G%)yz4A20RqL339OHaa*bRl31@AlW#Z3~g&I}RTnQx1$ed|0?6;1nor*I(VF0P_x;m{3VL zIDk?*I3S@pI6R{|IP3>U;ZGnP9O&GtXB;w?P-_32daTt!d92mJz)8gPT z)WYDzC41s>R_;NNognWeeHY&b;lHpzeBL1N9Rg$sQvDtFDIO5=2&x11-j`;skq5(c zt~|a+9F4A-grwucW6~hha&y86^dq!GmM6C#rq(bc5F34-_4dE|#Z%rpS?}#X@{Z@c zeHT0{UrkTd(o=sxMxG^e)C;>7&CyyI9>i5q!~`-6yu@&pF|L~L?f12H9njggNT1=wX{HQKwEEewo#}@8Fzigh zSxnnr>njJuL4NBis*O1yyo~68xI0C);JI$ViqSlzcA5Hj#2i>#A?4U8NvoecaM=6d z?pk6y-Vgq9s;XTm#B+ZD9?Kn-l&{b9RC*#k_g6uWfh~ooTlATs`_yM1tmppF@BiV5 z5l@Jw{aeHABSc4KHS7qzZD>iFr%iBt@!t^g-0zuB2@T0VD(iNNX3QBZ;5ygttQbk> zXrZmpTNLUcJx2xa5{OzZZ}Fdo4rS+@iAxXGg<7Z?K@CGF6>{a7JgK8xr_(}%koP3C zhM$1=kEz0c$9Ip?8sGbgl!wes_?Ot*_d0mW=D}@B)axq387YeRf z+D!Sb7CO~U^Qb4Tnt+P}R?VX_D~lKc4}#Dl(8yX$z)U2Dv!Ik1Kt{G=WTR1axjE*h zVp@@sk4A&mfl)7s0kKBU_o))MrduP2&?1T$`L93*JS?XQtINW<;MlUAurRUpVd|s! zMZ-${RScUa;)oF*U-p@MePSuPKXgNPx=nQ@xWvrMFCk-{sz`ELr^8Z9I%49lFGV-v zTASGLF!)fMF62^l%-^0HwG=%poGw=1httI)>}zX;x5YnR6mbvE6hps*UQJkQ152{aj8Tb~$=lffd;%TpGtxoD2W|jK_bb{; z(&qaewY>UZ%@OM(-g#|#)q3nc?^^8_i0jYrCqI#k!jUyFQau)N*V=NgwZS)0bhcrX zXlF@2j$bihZEXJJ*yh&Eto;6l8}-yi{+j>Xet%(*8ubJ54mDyiP1{8)ccEbK%H`_= zh&k)zCd#EoAc495LScHRkHvQOxqtm~4tpXprJ^_|rHHnfyqm!hZJ@Am%7T;0<3NVlnVFJPb~ESBo!Oxmg1j_i7c*{Y z)|s&DU8M_C2s?<#z1L=t!x0e(GgwbtD3uC!t~eI2r~MYQetY%a2K%h*mg@a^H%t7o zlHXX|`FD7u>!}$W2w^+ryzSPLj{WXz9-$1u&73Y-Hunu~{|yIUsH|enVWA$U+8+iR z=UW6`B5;JjD+Im`P^Z46sJ_(0;G+3HI6o)CzW|tfF6ZJXin4HMc&_25fJCdC!VW=$ z5Df^(P*3IEd=Xv(MRbu#oiXOeWFWhmGeCgu!}&{q@L0%gTL=G&`1}Xr)7_3I!oMeS z;S)fUICd$J&ICik6sYyhe*_(AN$&qNqs@WufB3#Px_2qLerfoLTOG#c$&Qb}gFQYGHT~S49yM$Sm*@!&$~~l~4?j@d;2*GfV$ORrlWRK$SP{Q0A|9-Rt)n84X3#wBv3`G#oSc-CG!Q( ze4!@bVgpti7!>h^Qjn=d5(CJ{Rt&03BoZ1KXPIUhtR!2>L2F3m4+42DnTI{|a81C) z2COzPDB=sHAXAGZ29S}h7*v-?Bs4P4GR-nrNw$)M){x2{1nMO$!ecw?$qTdjLK!2r zBfQ?$WoNF@NVrQ5{tytm9Q&%1FWdEQM%5EOPK?wLWY~h|lneO_yJpMzLcN>waw{Qu zlGQs(Zrz-n&0AtuV!g|vjgD6wjH`x2Gr;J6%DPS^OLfZkb~loqHLiwAr4Nen^aIAfW(|F*pfhCx+ft_ zg&$kJtAI0=Y^UC1zXu+YQf{Jrnr>+?fdd525unJ1;`CN=5RwRm@6Z-Qhr%9;$T|+~ zI&{dK=FlRNFs?ZyKdu|1tA&)gojfF4deQl>D55#M+YJo|_On=Qi9}5JKhIdr*OJk< zApR{GJqzOBlF_{&{w*0JUhrBnM!n#5lWP62?*p*7ncBRNSX%##XRKLDul-Z!N7sHf zT1{`SrMEBi-ZVNFI=!wJye$U_cD+zF4%Li9o^j}AV&H@KzfA1Bk=XfZ(R=Z&YU1r$ z;_U@=|fX{@f*m_f>noxt8L1@?9di~{>RV>@mc4EmFlWiXtIJ$cJuu(m5!1VaT5Bm_o* zgJ!HG8fAz^DXqZ%uGns3