From 0e89f8b9a3520017ba1956bb2db083ec8385a1f7 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 7 Jun 2024 09:35:56 -0300 Subject: [PATCH 01/29] fixing workflow --- .github/workflows/fly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml index 77b3fd3..28af7bd 100644 --- a/.github/workflows/fly.yml +++ b/.github/workflows/fly.yml @@ -213,7 +213,7 @@ jobs: working-directory: ./apps/python-sdk - name: Publish to PyPI - if: ${{ env.VERSION_INCREMENTED == 'true' }} + if: ${{ env.PYTHON_SDK_VERSION_INCREMENTED == 'true' }} env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} From 556c57648e99f61b95583c7c7fbc2fa3221a6119 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 7 Jun 2024 09:40:40 -0300 Subject: [PATCH 02/29] Update fly.yml --- .github/workflows/fly.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml index 28af7bd..84017b1 100644 --- a/.github/workflows/fly.yml +++ b/.github/workflows/fly.yml @@ -183,6 +183,7 @@ jobs: FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} build-and-publish-python-sdk: + name: Build and publish Python SDK runs-on: ubuntu-latest needs: deploy @@ -222,6 +223,7 @@ jobs: working-directory: ./apps/python-sdk build-and-publish-js-sdk: + name: Build and publish JavaScript SDK runs-on: ubuntu-latest needs: deploy From f24ca766182e1d12a4c3f71bdf8a70fe242c324a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 7 Jun 2024 10:39:11 -0700 Subject: [PATCH 03/29] Nick: removing rate limit emails for now --- apps/api/src/controllers/auth.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 2c4b0c7..ea789fe 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -143,7 +143,7 @@ export async function supaAuthenticateUser( const startDate = new Date(); const endDate = new Date(); endDate.setDate(endDate.getDate() + 7); - await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString()); + // await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString()); return { success: false, error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`, From 827354a116a4ea424af7c1994aae7214d78c8032 Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Mon, 10 Jun 2024 21:21:23 +1000 Subject: [PATCH 04/29] Added logging to python sdk FIRECRAWL_LOGGING_LEVEL Instantiates the logger early and depends on env to set. --- apps/python-sdk/firecrawl/__init__.py | 54 ++++++++++++++++++++++++++ apps/python-sdk/firecrawl/firecrawl.py | 10 ++++- 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index ecb017f..4e53e77 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -1,3 +1,57 @@ +""" +This is the Firecrawl package. + +This package provides a Python SDK for interacting with the Firecrawl API. +It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs, +and check the status of these jobs. + +For more information visit https://github.com/firecrawl/ +""" + +import logging +import os + from .firecrawl import FirecrawlApp __version__ = "0.0.14" + +# Define the logger for the Firecrawl project +logger: logging.Logger = logging.getLogger("firecrawl") + + +def _basic_config() -> None: + """Set up basic configuration for logging with a specific format and date format.""" + try: + logging.basicConfig( + format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + except Exception as e: + logger.error("Failed to configure logging: %s", e) + + +def setup_logging() -> None: + """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable.""" + env = os.environ.get( + "FIRECRAWL_LOGGING_LEVEL", "INFO" + ).upper() # Default to 'INFO' level + _basic_config() + + if env == "DEBUG": + logger.setLevel(logging.DEBUG) + elif env == "INFO": + logger.setLevel(logging.INFO) + elif env == "WARNING": + logger.setLevel(logging.WARNING) + elif env == "ERROR": + logger.setLevel(logging.ERROR) + elif env == "CRITICAL": + logger.setLevel(logging.CRITICAL) + else: + logger.setLevel(logging.INFO) + logger.warning("Unknown logging level: %s, defaulting to INFO", env) + + +# Initialize logging configuration when the module is imported +setup_logging() +logger.debug("Debugging logger setup") diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index b9a823f..f20d4bd 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -9,13 +9,14 @@ and handles retries for certain HTTP status codes. Classes: - FirecrawlApp: Main class for interacting with the Firecrawl API. """ - +import logging import os import time from typing import Any, Dict, Optional import requests +logger : logging.Logger = logging.getLogger("firecrawl") class FirecrawlApp: """ @@ -28,8 +29,15 @@ class FirecrawlApp: def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') if self.api_key is None: + logger.warning("No API key provided") raise ValueError('No API key provided') + else: + logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key) + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') + if self.api_url != 'https://api.firecrawl.dev': + logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url) + def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: """ Scrape the specified URL using the Firecrawl API. From 3091f0134cc95f47fe7d993b5fab5536868dd29e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 10 Jun 2024 16:27:10 -0700 Subject: [PATCH 05/29] Nick: --- apps/api/src/scraper/WebScraper/crawler.ts | 14 +++++++++----- apps/api/src/scraper/WebScraper/index.ts | 1 + apps/api/src/scraper/WebScraper/sitemap.ts | 3 +++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 9340aa8..ee9baff 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -3,7 +3,7 @@ import cheerio, { load } from "cheerio"; import { URL } from "url"; import { getLinksFromSitemap } from "./sitemap"; import async from "async"; -import { Progress } from "../../lib/entities"; +import { PageOptions, Progress } from "../../lib/entities"; import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url"; import robotsParser from "robots-parser"; @@ -108,6 +108,7 @@ export class WebCrawler { public async start( inProgress?: (progress: Progress) => void, + pageOptions?: PageOptions, concurrencyLimit: number = 5, limit: number = 10000, maxDepth: number = 10 @@ -130,6 +131,7 @@ export class WebCrawler { const urls = await this.crawlUrls( [this.initialUrl], + pageOptions, concurrencyLimit, inProgress ); @@ -148,6 +150,7 @@ export class WebCrawler { private async crawlUrls( urls: string[], + pageOptions: PageOptions, concurrencyLimit: number, inProgress?: (progress: Progress) => void, ): Promise<{ url: string, html: string }[]> { @@ -158,7 +161,7 @@ export class WebCrawler { } return; } - const newUrls = await this.crawl(task); + const newUrls = await this.crawl(task, pageOptions); // add the initial url if not already added // if (this.visited.size === 1) { // let normalizedInitial = this.initialUrl; @@ -188,7 +191,7 @@ export class WebCrawler { currentDocumentUrl: task, }); } - await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress); + await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress); if (callback && typeof callback === "function") { callback(); } @@ -207,7 +210,7 @@ export class WebCrawler { return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); } - async crawl(url: string): Promise<{url: string, html: string}[]> { + async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> { if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){ return []; } @@ -231,7 +234,8 @@ export class WebCrawler { let content : string = ""; // If it is the first link, fetch with single url if (this.visited.size === 1) { - const page = await scrapSingleUrl(url, {includeHtml: true}); + console.log(pageOptions) + const page = await scrapSingleUrl(url, {...pageOptions, includeHtml: true}); content = page.html ?? "" } else { const response = await axios.get(url); diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index e3a3cc6..824ec06 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -173,6 +173,7 @@ export class WebScraperDataProvider { let links = await crawler.start( inProgress, + this.pageOptions, 5, this.limit, this.maxCrawledDepth diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 0ac4338..5a89183 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -12,6 +12,8 @@ export async function getLinksFromSitemap( content = response.data; } catch (error) { console.error(`Request failed for ${sitemapUrl}: ${error}`); + console.log(allUrls) + return allUrls; } @@ -34,6 +36,7 @@ export async function getLinksFromSitemap( } catch (error) { console.error(`Error processing ${sitemapUrl}: ${error}`); } + console.log(allUrls) return allUrls; } From 913c1dd56839875ab4946d4fd085af9f01f841db Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 10 Jun 2024 16:49:03 -0700 Subject: [PATCH 06/29] Nick: fetch -> axios and fix timeouts --- apps/api/src/scraper/WebScraper/single_url.ts | 71 +++++++++++++------ 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 689d5e7..9a61888 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -8,6 +8,7 @@ import { excludeNonMainTags } from "./utils/excludeTags"; import { urlSpecificParams } from "./utils/custom/website_params"; import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { handleCustomScraping } from "./custom/handleCustomScraping"; +import axios from "axios"; dotenv.config(); @@ -19,6 +20,8 @@ const baseScrapers = [ "fetch", ] as const; +const universalTimeout = 15000; + export async function generateRequestParams( url: string, wait_browser: string = "domcontentloaded", @@ -59,21 +62,24 @@ export async function scrapWithFireEngine( `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}` ); - const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ + const response = await axios.post( + process.env.FIRE_ENGINE_BETA_URL + "/scrape", + { url: url, wait: waitParam, screenshot: screenshotParam, headers: headers, - pageOptions: pageOptions - }), - }); + pageOptions: pageOptions, + }, + { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout + waitParam + } + ); - if (!response.ok) { + if (response.status !== 200) { console.error( `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` ); @@ -84,7 +90,7 @@ export async function scrapWithFireEngine( if (contentType && contentType.includes("application/pdf")) { return { html: await fetchAndProcessPdf(url), screenshot: "" }; } else { - const data = await response.json(); + const data = response.data; const html = data.content; const screenshot = data.screenshot; return { html: html ?? "", screenshot: screenshot ?? "" }; @@ -98,7 +104,7 @@ export async function scrapWithFireEngine( export async function scrapWithScrapingBee( url: string, wait_browser: string = "domcontentloaded", - timeout: number = 15000 + timeout: number = universalTimeout ): Promise { try { const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); @@ -141,15 +147,19 @@ export async function scrapWithPlaywright( // If the user has passed a wait parameter in the request, use that const waitParam = reqParams["params"]?.wait ?? waitFor; - const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, { - method: "POST", + const response = await axios.post(process.env.PLAYWRIGHT_MICROSERVICE_URL, { + url: url, + wait_after_load: waitParam, + headers: headers, + }, { headers: { "Content-Type": "application/json", }, - body: JSON.stringify({ url: url, wait_after_load: waitParam, headers: headers }), + timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time + transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically }); - if (!response.ok) { + if (response.status !== 200) { console.error( `[Playwright] Error fetching url: ${url} with status: ${response.status}` ); @@ -160,7 +170,7 @@ export async function scrapWithPlaywright( if (contentType && contentType.includes("application/pdf")) { return fetchAndProcessPdf(url); } else { - const textData = await response.text(); + const textData = response.data; try { const data = JSON.parse(textData); const html = data.content; @@ -171,17 +181,28 @@ export async function scrapWithPlaywright( } } } catch (error) { - console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); + if (error.code === 'ECONNABORTED') { + console.log(`[Playwright] Request timed out for ${url}`); + } else { + console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); + } return ""; } } export async function scrapWithFetch(url: string): Promise { try { - const response = await fetch(url); - if (!response.ok) { + const response = await axios.get(url, { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout, + transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically + }); + + if (response.status !== 200) { console.error( - `[Fetch] Error fetching url: ${url} with status: ${response.status}` + `[Axios] Error fetching url: ${url} with status: ${response.status}` ); return ""; } @@ -190,11 +211,15 @@ export async function scrapWithFetch(url: string): Promise { if (contentType && contentType.includes("application/pdf")) { return fetchAndProcessPdf(url); } else { - const text = await response.text(); + const text = response.data; return text; } } catch (error) { - console.error(`[Fetch][c] Error fetching url: ${url} -> ${error}`); + if (error.code === 'ECONNABORTED') { + console.log(`[Axios] Request timed out for ${url}`); + } else { + console.error(`[Axios] Error fetching url: ${url} -> ${error}`); + } return ""; } } From 7ae97786428f7c7911a232f8eea1c07e189f6726 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 10 Jun 2024 16:57:31 -0700 Subject: [PATCH 07/29] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 9a61888..c2dcea1 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -96,7 +96,11 @@ export async function scrapWithFireEngine( return { html: html ?? "", screenshot: screenshot ?? "" }; } } catch (error) { - console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); + if (error.code === 'ECONNABORTED') { + console.log(`[Fire-Engine] Request timed out for ${url}`); + } else { + console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); + } return { html: "", screenshot: "" }; } } From 99f2ffd6d591398a4baef347306d25371b381793 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 10 Jun 2024 17:03:10 -0700 Subject: [PATCH 08/29] Update webhook.ts --- apps/api/src/services/webhook.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index eca7d09..1f8d647 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -3,15 +3,17 @@ import { supabase_service } from "./supabase"; export const callWebhook = async (teamId: string, data: any) => { try { const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; let webhookUrl = selfHostedUrl; - if (!selfHostedUrl) { + // Only fetch the webhook URL from the database if the self-hosted webhook URL is not set + // and the USE_DB_AUTHENTICATION environment variable is set to true + if (!selfHostedUrl && useDbAuthentication) { const { data: webhooksData, error } = await supabase_service .from("webhooks") .select("url") .eq("team_id", teamId) .limit(1); - if (error) { console.error( `Error fetching webhook URL for team ID: ${teamId}`, From f6b06ac27a829172416419c4fff02d0f71579050 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 10 Jun 2024 18:12:41 -0700 Subject: [PATCH 09/29] Nick: ignoreSitemap, better crawling algo --- apps/api/src/lib/entities.ts | 25 +++++---- apps/api/src/scraper/WebScraper/crawler.ts | 65 ++++++++++++---------- apps/api/src/scraper/WebScraper/index.ts | 6 ++ apps/api/src/scraper/WebScraper/sitemap.ts | 2 - 4 files changed, 57 insertions(+), 41 deletions(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 5511623..744c07b 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -35,20 +35,23 @@ export type SearchOptions = { location?: string; }; +export type CrawlerOptions = { + returnOnlyUrls?: boolean; + includes?: string[]; + excludes?: string[]; + maxCrawledLinks?: number; + maxDepth?: number; + limit?: number; + generateImgAltText?: boolean; + replaceAllPathsWithAbsolutePaths?: boolean; + ignoreSitemap?: boolean; + mode?: "default" | "fast"; // have a mode of some sort +} + export type WebScraperOptions = { urls: string[]; mode: "single_urls" | "sitemap" | "crawl"; - crawlerOptions?: { - returnOnlyUrls?: boolean; - includes?: string[]; - excludes?: string[]; - maxCrawledLinks?: number; - maxDepth?: number; - limit?: number; - generateImgAltText?: boolean; - replaceAllPathsWithAbsolutePaths?: boolean; - mode?: "default" | "fast"; // have a mode of some sort - }; + crawlerOptions?: CrawlerOptions; pageOptions?: PageOptions; extractorOptions?: ExtractorOptions; concurrentRequests?: number; diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index ee9baff..fc95e7c 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -3,7 +3,7 @@ import cheerio, { load } from "cheerio"; import { URL } from "url"; import { getLinksFromSitemap } from "./sitemap"; import async from "async"; -import { PageOptions, Progress } from "../../lib/entities"; +import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities"; import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url"; import robotsParser from "robots-parser"; @@ -109,6 +109,7 @@ export class WebCrawler { public async start( inProgress?: (progress: Progress) => void, pageOptions?: PageOptions, + crawlerOptions?: CrawlerOptions, concurrencyLimit: number = 5, limit: number = 10000, maxDepth: number = 10 @@ -123,10 +124,12 @@ export class WebCrawler { } - const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); - if (sitemapLinks.length > 0) { - let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); - return filteredLinks.map(link => ({ url: link, html: "" })); + if(!crawlerOptions?.ignoreSitemap){ + const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); + if (sitemapLinks.length > 0) { + let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); + return filteredLinks.map(link => ({ url: link, html: "" })); + } } const urls = await this.crawlUrls( @@ -135,6 +138,7 @@ export class WebCrawler { concurrencyLimit, inProgress ); + if ( urls.length === 0 && this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0 @@ -142,9 +146,9 @@ export class WebCrawler { return [{ url: this.initialUrl, html: "" }]; } - // make sure to run include exclude here again const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth); + return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" })); } @@ -211,46 +215,41 @@ export class WebCrawler { } async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> { - if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){ + const normalizedUrl = this.normalizeCrawlUrl(url); + if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) { return []; } - this.visited.add(url); - + this.visited.add(normalizedUrl); if (!url.startsWith("http")) { url = "https://" + url; - } if (url.endsWith("/")) { url = url.slice(0, -1); - } - + if (this.isFile(url) || this.isSocialMediaOrEmail(url)) { return []; } try { - let content : string = ""; + let content: string = ""; // If it is the first link, fetch with single url if (this.visited.size === 1) { - console.log(pageOptions) - const page = await scrapSingleUrl(url, {...pageOptions, includeHtml: true}); - content = page.html ?? "" + const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true }); + content = page.html ?? ""; } else { const response = await axios.get(url); content = response.data ?? ""; } const $ = load(content); - let links: {url: string, html: string}[] = []; + let links: { url: string, html: string }[] = []; // Add the initial URL to the list of links - if(this.visited.size === 1) - { - links.push({url, html: content}); + if (this.visited.size === 1) { + links.push({ url, html: content }); } - $("a").each((_, element) => { const href = $(element).attr("href"); if (href) { @@ -258,32 +257,43 @@ export class WebCrawler { if (!href.startsWith("http")) { fullUrl = new URL(href, this.baseUrl).toString(); } - const url = new URL(fullUrl); - const path = url.pathname; + const urlObj = new URL(fullUrl); + const path = urlObj.pathname; if ( this.isInternalLink(fullUrl) && this.matchesPattern(fullUrl) && this.noSections(fullUrl) && - this.matchesIncludes(path) && + // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards + // this.matchesIncludes(path) && !this.matchesExcludes(path) && this.robots.isAllowed(fullUrl, "FireCrawlAgent") ) { - links.push({url: fullUrl, html: content}); + links.push({ url: fullUrl, html: content }); } } }); - if(this.visited.size === 1){ + if (this.visited.size === 1) { return links; } // Create a new list to return to avoid modifying the visited list - return links.filter((link) => !this.visited.has(link.url)); + return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url))); } catch (error) { return []; } } + private normalizeCrawlUrl(url: string): string { + try{ + const urlObj = new URL(url); + urlObj.searchParams.sort(); // Sort query parameters to normalize + return urlObj.toString(); + } catch (error) { + return url; + } + } + private matchesIncludes(url: string): boolean { if (this.includes.length === 0 || this.includes[0] == "") return true; return this.includes.some((pattern) => new RegExp(pattern).test(url)); @@ -392,7 +402,6 @@ export class WebCrawler { // Normalize and check if the URL is present in any of the sitemaps const normalizedUrl = normalizeUrl(url); - const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link)); // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 824ec06..7dcd175 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -31,6 +31,7 @@ export class WebScraperDataProvider { private limit: number = 10000; private concurrentRequests: number = 20; private generateImgAltText: boolean = false; + private ignoreSitemap: boolean = false; private pageOptions?: PageOptions; private extractorOptions?: ExtractorOptions; private replaceAllPathsWithAbsolutePaths?: boolean = false; @@ -38,6 +39,7 @@ export class WebScraperDataProvider { "gpt-4-turbo"; private crawlerMode: string = "default"; + authorize(): void { throw new Error("Method not implemented."); } @@ -174,6 +176,9 @@ export class WebScraperDataProvider { let links = await crawler.start( inProgress, this.pageOptions, + { + ignoreSitemap: this.ignoreSitemap, + }, 5, this.limit, this.maxCrawledDepth @@ -474,6 +479,7 @@ export class WebScraperDataProvider { //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); this.crawlerMode = options.crawlerOptions?.mode ?? "default"; + this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false; // make sure all urls start with https:// this.urls = this.urls.map((url) => { diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 5a89183..c6dbf11 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -12,7 +12,6 @@ export async function getLinksFromSitemap( content = response.data; } catch (error) { console.error(`Request failed for ${sitemapUrl}: ${error}`); - console.log(allUrls) return allUrls; } @@ -36,7 +35,6 @@ export async function getLinksFromSitemap( } catch (error) { console.error(`Error processing ${sitemapUrl}: ${error}`); } - console.log(allUrls) return allUrls; } From 9390816c1b7975b3349f402f562d1846e6845e2a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 10 Jun 2024 18:26:25 -0700 Subject: [PATCH 10/29] Update openapi.json --- apps/api/openapi.json | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index ab452ff..55bfe1c 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -51,10 +51,19 @@ "description": "Include the raw HTML content of the page. Will output a html key in the response.", "default": false }, + "screenshot": { + "type": "boolean", + "description": "Include a screenshot of the top of the page that you are scraping.", + "default": false + }, "waitFor": { "type": "integer", "description": "Wait x amount of milliseconds for the page to load to fetch content", "default": 0 + }, + "headers": { + "type": "object", + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." } } }, @@ -176,6 +185,11 @@ "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.", "default": "default" }, + "ignoreSitemap": { + "type": "boolean", + "description": "Ignore the website sitemap when crawling", + "default": false + }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl", @@ -195,6 +209,15 @@ "type": "boolean", "description": "Include the raw HTML content of the page. Will output a html key in the response.", "default": false + }, + "screenshot": { + "type": "boolean", + "description": "Include a screenshot of the top of the page that you are scraping.", + "default": false + }, + "headers": { + "type": "object", + "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc." } } } @@ -368,7 +391,7 @@ "items": { "$ref": "#/components/schemas/CrawlStatusResponseObj" }, - "description": "Partial documents returned as it is being crawls (streaming). When a page is ready it will append to the parial_data array - so no need to wait for all the website to be crawled." + "description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array." } } } @@ -513,6 +536,10 @@ "nullable": true, "description": "Raw HTML content of the page if `includeHtml` is true" }, + "index": { + "type": "integer", + "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from." + }, "metadata": { "type": "object", "properties": { From 00c23855b180f9f84a7032d40054b8fcf661e86e Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 11 Jun 2024 11:46:35 -0400 Subject: [PATCH 11/29] Update examples --- .../{k8n => kubernetes-cluster-install}/README.md | 0 .../{k8n => kubernetes-cluster-install}/api.yaml | 0 .../configmap.yaml | 0 .../playwright-service.yaml | 0 .../{k8n => kubernetes-cluster-install}/redis.yaml | 0 .../{k8n => kubernetes-cluster-install}/secret.yaml | 0 .../{k8n => kubernetes-cluster-install}/worker.yaml | 0 .../.eslintrc.json | 0 .../.gitignore | 0 .../README.md | 0 .../components.json | 0 .../next.config.mjs | 0 .../package-lock.json | 0 .../package.json | 0 .../postcss.config.mjs | 0 .../public/android-chrome-192x192.png | Bin .../public/android-chrome-512x512.png | Bin .../public/apple-touch-icon.png | Bin .../public/bgd.png | Bin .../public/favicon-16x16.png | Bin .../public/favicon-32x32.png | Bin .../public/favicon.ico | Bin .../public/next.svg | 0 .../public/og.png | Bin .../public/site.webmanifest | 0 .../public/vercel.svg | 0 .../src/app/favicon.ico | Bin .../src/app/globals.css | 0 .../src/app/hooks/useGithubStars.ts | 0 .../src/app/layout.tsx | 0 .../src/app/page.tsx | 0 .../src/components/github-button.tsx | 0 .../src/components/main.tsx | 0 .../src/components/ui/button.tsx | 0 .../src/components/ui/dialog.tsx | 0 .../src/components/ui/dropdown-menu.tsx | 0 .../src/components/ui/input.tsx | 0 .../src/components/ui/select.tsx | 0 .../src/components/ui/sonner.tsx | 0 .../src/components/ui/switch.tsx | 0 .../src/components/ui/textarea.tsx | 0 .../src/lib/LLM/llm.ts | 2 +- .../src/lib/LLM/testing_constants.ts | 0 .../src/lib/utils.ts | 0 .../src/pages/api/roastWebsite.ts | 0 .../tailwind.config.ts | 0 .../tsconfig.json | 0 ...rag-llama3.mdx => web-data-rag--with-llama3.mdx} | 0 48 files changed, 1 insertion(+), 1 deletion(-) rename examples/{k8n => kubernetes-cluster-install}/README.md (100%) rename examples/{k8n => kubernetes-cluster-install}/api.yaml (100%) rename examples/{k8n => kubernetes-cluster-install}/configmap.yaml (100%) rename examples/{k8n => kubernetes-cluster-install}/playwright-service.yaml (100%) rename examples/{k8n => kubernetes-cluster-install}/redis.yaml (100%) rename examples/{k8n => kubernetes-cluster-install}/secret.yaml (100%) rename examples/{k8n => kubernetes-cluster-install}/worker.yaml (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/.eslintrc.json (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/.gitignore (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/README.md (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/components.json (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/next.config.mjs (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/package-lock.json (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/package.json (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/postcss.config.mjs (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/android-chrome-192x192.png (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/android-chrome-512x512.png (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/apple-touch-icon.png (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/bgd.png (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/favicon-16x16.png (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/favicon-32x32.png (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/favicon.ico (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/next.svg (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/og.png (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/site.webmanifest (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/public/vercel.svg (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/app/favicon.ico (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/app/globals.css (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/app/hooks/useGithubStars.ts (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/app/layout.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/app/page.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/github-button.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/main.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/button.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/dialog.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/dropdown-menu.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/input.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/select.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/sonner.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/switch.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/components/ui/textarea.tsx (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/lib/LLM/llm.ts (98%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/lib/LLM/testing_constants.ts (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/lib/utils.ts (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/src/pages/api/roastWebsite.ts (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/tailwind.config.ts (100%) rename examples/{roastmywebsite => roastmywebsite-example-app}/tsconfig.json (100%) rename examples/{rag-llama3.mdx => web-data-rag--with-llama3.mdx} (100%) diff --git a/examples/k8n/README.md b/examples/kubernetes-cluster-install/README.md similarity index 100% rename from examples/k8n/README.md rename to examples/kubernetes-cluster-install/README.md diff --git a/examples/k8n/api.yaml b/examples/kubernetes-cluster-install/api.yaml similarity index 100% rename from examples/k8n/api.yaml rename to examples/kubernetes-cluster-install/api.yaml diff --git a/examples/k8n/configmap.yaml b/examples/kubernetes-cluster-install/configmap.yaml similarity index 100% rename from examples/k8n/configmap.yaml rename to examples/kubernetes-cluster-install/configmap.yaml diff --git a/examples/k8n/playwright-service.yaml b/examples/kubernetes-cluster-install/playwright-service.yaml similarity index 100% rename from examples/k8n/playwright-service.yaml rename to examples/kubernetes-cluster-install/playwright-service.yaml diff --git a/examples/k8n/redis.yaml b/examples/kubernetes-cluster-install/redis.yaml similarity index 100% rename from examples/k8n/redis.yaml rename to examples/kubernetes-cluster-install/redis.yaml diff --git a/examples/k8n/secret.yaml b/examples/kubernetes-cluster-install/secret.yaml similarity index 100% rename from examples/k8n/secret.yaml rename to examples/kubernetes-cluster-install/secret.yaml diff --git a/examples/k8n/worker.yaml b/examples/kubernetes-cluster-install/worker.yaml similarity index 100% rename from examples/k8n/worker.yaml rename to examples/kubernetes-cluster-install/worker.yaml diff --git a/examples/roastmywebsite/.eslintrc.json b/examples/roastmywebsite-example-app/.eslintrc.json similarity index 100% rename from examples/roastmywebsite/.eslintrc.json rename to examples/roastmywebsite-example-app/.eslintrc.json diff --git a/examples/roastmywebsite/.gitignore b/examples/roastmywebsite-example-app/.gitignore similarity index 100% rename from examples/roastmywebsite/.gitignore rename to examples/roastmywebsite-example-app/.gitignore diff --git a/examples/roastmywebsite/README.md b/examples/roastmywebsite-example-app/README.md similarity index 100% rename from examples/roastmywebsite/README.md rename to examples/roastmywebsite-example-app/README.md diff --git a/examples/roastmywebsite/components.json b/examples/roastmywebsite-example-app/components.json similarity index 100% rename from examples/roastmywebsite/components.json rename to examples/roastmywebsite-example-app/components.json diff --git a/examples/roastmywebsite/next.config.mjs b/examples/roastmywebsite-example-app/next.config.mjs similarity index 100% rename from examples/roastmywebsite/next.config.mjs rename to examples/roastmywebsite-example-app/next.config.mjs diff --git a/examples/roastmywebsite/package-lock.json b/examples/roastmywebsite-example-app/package-lock.json similarity index 100% rename from examples/roastmywebsite/package-lock.json rename to examples/roastmywebsite-example-app/package-lock.json diff --git a/examples/roastmywebsite/package.json b/examples/roastmywebsite-example-app/package.json similarity index 100% rename from examples/roastmywebsite/package.json rename to examples/roastmywebsite-example-app/package.json diff --git a/examples/roastmywebsite/postcss.config.mjs b/examples/roastmywebsite-example-app/postcss.config.mjs similarity index 100% rename from examples/roastmywebsite/postcss.config.mjs rename to examples/roastmywebsite-example-app/postcss.config.mjs diff --git a/examples/roastmywebsite/public/android-chrome-192x192.png b/examples/roastmywebsite-example-app/public/android-chrome-192x192.png similarity index 100% rename from examples/roastmywebsite/public/android-chrome-192x192.png rename to examples/roastmywebsite-example-app/public/android-chrome-192x192.png diff --git a/examples/roastmywebsite/public/android-chrome-512x512.png b/examples/roastmywebsite-example-app/public/android-chrome-512x512.png similarity index 100% rename from examples/roastmywebsite/public/android-chrome-512x512.png rename to examples/roastmywebsite-example-app/public/android-chrome-512x512.png diff --git a/examples/roastmywebsite/public/apple-touch-icon.png b/examples/roastmywebsite-example-app/public/apple-touch-icon.png similarity index 100% rename from examples/roastmywebsite/public/apple-touch-icon.png rename to examples/roastmywebsite-example-app/public/apple-touch-icon.png diff --git a/examples/roastmywebsite/public/bgd.png b/examples/roastmywebsite-example-app/public/bgd.png similarity index 100% rename from examples/roastmywebsite/public/bgd.png rename to examples/roastmywebsite-example-app/public/bgd.png diff --git a/examples/roastmywebsite/public/favicon-16x16.png b/examples/roastmywebsite-example-app/public/favicon-16x16.png similarity index 100% rename from examples/roastmywebsite/public/favicon-16x16.png rename to examples/roastmywebsite-example-app/public/favicon-16x16.png diff --git a/examples/roastmywebsite/public/favicon-32x32.png b/examples/roastmywebsite-example-app/public/favicon-32x32.png similarity index 100% rename from examples/roastmywebsite/public/favicon-32x32.png rename to examples/roastmywebsite-example-app/public/favicon-32x32.png diff --git a/examples/roastmywebsite/public/favicon.ico b/examples/roastmywebsite-example-app/public/favicon.ico similarity index 100% rename from examples/roastmywebsite/public/favicon.ico rename to examples/roastmywebsite-example-app/public/favicon.ico diff --git a/examples/roastmywebsite/public/next.svg b/examples/roastmywebsite-example-app/public/next.svg similarity index 100% rename from examples/roastmywebsite/public/next.svg rename to examples/roastmywebsite-example-app/public/next.svg diff --git a/examples/roastmywebsite/public/og.png b/examples/roastmywebsite-example-app/public/og.png similarity index 100% rename from examples/roastmywebsite/public/og.png rename to examples/roastmywebsite-example-app/public/og.png diff --git a/examples/roastmywebsite/public/site.webmanifest b/examples/roastmywebsite-example-app/public/site.webmanifest similarity index 100% rename from examples/roastmywebsite/public/site.webmanifest rename to examples/roastmywebsite-example-app/public/site.webmanifest diff --git a/examples/roastmywebsite/public/vercel.svg b/examples/roastmywebsite-example-app/public/vercel.svg similarity index 100% rename from examples/roastmywebsite/public/vercel.svg rename to examples/roastmywebsite-example-app/public/vercel.svg diff --git a/examples/roastmywebsite/src/app/favicon.ico b/examples/roastmywebsite-example-app/src/app/favicon.ico similarity index 100% rename from examples/roastmywebsite/src/app/favicon.ico rename to examples/roastmywebsite-example-app/src/app/favicon.ico diff --git a/examples/roastmywebsite/src/app/globals.css b/examples/roastmywebsite-example-app/src/app/globals.css similarity index 100% rename from examples/roastmywebsite/src/app/globals.css rename to examples/roastmywebsite-example-app/src/app/globals.css diff --git a/examples/roastmywebsite/src/app/hooks/useGithubStars.ts b/examples/roastmywebsite-example-app/src/app/hooks/useGithubStars.ts similarity index 100% rename from examples/roastmywebsite/src/app/hooks/useGithubStars.ts rename to examples/roastmywebsite-example-app/src/app/hooks/useGithubStars.ts diff --git a/examples/roastmywebsite/src/app/layout.tsx b/examples/roastmywebsite-example-app/src/app/layout.tsx similarity index 100% rename from examples/roastmywebsite/src/app/layout.tsx rename to examples/roastmywebsite-example-app/src/app/layout.tsx diff --git a/examples/roastmywebsite/src/app/page.tsx b/examples/roastmywebsite-example-app/src/app/page.tsx similarity index 100% rename from examples/roastmywebsite/src/app/page.tsx rename to examples/roastmywebsite-example-app/src/app/page.tsx diff --git a/examples/roastmywebsite/src/components/github-button.tsx b/examples/roastmywebsite-example-app/src/components/github-button.tsx similarity index 100% rename from examples/roastmywebsite/src/components/github-button.tsx rename to examples/roastmywebsite-example-app/src/components/github-button.tsx diff --git a/examples/roastmywebsite/src/components/main.tsx b/examples/roastmywebsite-example-app/src/components/main.tsx similarity index 100% rename from examples/roastmywebsite/src/components/main.tsx rename to examples/roastmywebsite-example-app/src/components/main.tsx diff --git a/examples/roastmywebsite/src/components/ui/button.tsx b/examples/roastmywebsite-example-app/src/components/ui/button.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/button.tsx rename to examples/roastmywebsite-example-app/src/components/ui/button.tsx diff --git a/examples/roastmywebsite/src/components/ui/dialog.tsx b/examples/roastmywebsite-example-app/src/components/ui/dialog.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/dialog.tsx rename to examples/roastmywebsite-example-app/src/components/ui/dialog.tsx diff --git a/examples/roastmywebsite/src/components/ui/dropdown-menu.tsx b/examples/roastmywebsite-example-app/src/components/ui/dropdown-menu.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/dropdown-menu.tsx rename to examples/roastmywebsite-example-app/src/components/ui/dropdown-menu.tsx diff --git a/examples/roastmywebsite/src/components/ui/input.tsx b/examples/roastmywebsite-example-app/src/components/ui/input.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/input.tsx rename to examples/roastmywebsite-example-app/src/components/ui/input.tsx diff --git a/examples/roastmywebsite/src/components/ui/select.tsx b/examples/roastmywebsite-example-app/src/components/ui/select.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/select.tsx rename to examples/roastmywebsite-example-app/src/components/ui/select.tsx diff --git a/examples/roastmywebsite/src/components/ui/sonner.tsx b/examples/roastmywebsite-example-app/src/components/ui/sonner.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/sonner.tsx rename to examples/roastmywebsite-example-app/src/components/ui/sonner.tsx diff --git a/examples/roastmywebsite/src/components/ui/switch.tsx b/examples/roastmywebsite-example-app/src/components/ui/switch.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/switch.tsx rename to examples/roastmywebsite-example-app/src/components/ui/switch.tsx diff --git a/examples/roastmywebsite/src/components/ui/textarea.tsx b/examples/roastmywebsite-example-app/src/components/ui/textarea.tsx similarity index 100% rename from examples/roastmywebsite/src/components/ui/textarea.tsx rename to examples/roastmywebsite-example-app/src/components/ui/textarea.tsx diff --git a/examples/roastmywebsite/src/lib/LLM/llm.ts b/examples/roastmywebsite-example-app/src/lib/LLM/llm.ts similarity index 98% rename from examples/roastmywebsite/src/lib/LLM/llm.ts rename to examples/roastmywebsite-example-app/src/lib/LLM/llm.ts index 39dcf10..1d290a3 100644 --- a/examples/roastmywebsite/src/lib/LLM/llm.ts +++ b/examples/roastmywebsite-example-app/src/lib/LLM/llm.ts @@ -1,4 +1,4 @@ -import OpenAI from "openai"; +import OpenAI from "openai/index.mjs"; import { encoding_for_model } from "@dqbd/tiktoken"; /** diff --git a/examples/roastmywebsite/src/lib/LLM/testing_constants.ts b/examples/roastmywebsite-example-app/src/lib/LLM/testing_constants.ts similarity index 100% rename from examples/roastmywebsite/src/lib/LLM/testing_constants.ts rename to examples/roastmywebsite-example-app/src/lib/LLM/testing_constants.ts diff --git a/examples/roastmywebsite/src/lib/utils.ts b/examples/roastmywebsite-example-app/src/lib/utils.ts similarity index 100% rename from examples/roastmywebsite/src/lib/utils.ts rename to examples/roastmywebsite-example-app/src/lib/utils.ts diff --git a/examples/roastmywebsite/src/pages/api/roastWebsite.ts b/examples/roastmywebsite-example-app/src/pages/api/roastWebsite.ts similarity index 100% rename from examples/roastmywebsite/src/pages/api/roastWebsite.ts rename to examples/roastmywebsite-example-app/src/pages/api/roastWebsite.ts diff --git a/examples/roastmywebsite/tailwind.config.ts b/examples/roastmywebsite-example-app/tailwind.config.ts similarity index 100% rename from examples/roastmywebsite/tailwind.config.ts rename to examples/roastmywebsite-example-app/tailwind.config.ts diff --git a/examples/roastmywebsite/tsconfig.json b/examples/roastmywebsite-example-app/tsconfig.json similarity index 100% rename from examples/roastmywebsite/tsconfig.json rename to examples/roastmywebsite-example-app/tsconfig.json diff --git a/examples/rag-llama3.mdx b/examples/web-data-rag--with-llama3.mdx similarity index 100% rename from examples/rag-llama3.mdx rename to examples/web-data-rag--with-llama3.mdx From 06b0d01fd430a8686748b8926668f55a02324d17 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 11 Jun 2024 12:23:36 -0400 Subject: [PATCH 12/29] Update examples --- ...ing-llms.mdx => web-data-contradiction-testing-using-llms.mdx} | 0 ...traction-using-llms.mdx => web-data-extraction-using-llms.mdx} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename examples/{contradiction-testing-using-llms.mdx => web-data-contradiction-testing-using-llms.mdx} (100%) rename examples/{data-extraction-using-llms.mdx => web-data-extraction-using-llms.mdx} (100%) diff --git a/examples/contradiction-testing-using-llms.mdx b/examples/web-data-contradiction-testing-using-llms.mdx similarity index 100% rename from examples/contradiction-testing-using-llms.mdx rename to examples/web-data-contradiction-testing-using-llms.mdx diff --git a/examples/data-extraction-using-llms.mdx b/examples/web-data-extraction-using-llms.mdx similarity index 100% rename from examples/data-extraction-using-llms.mdx rename to examples/web-data-extraction-using-llms.mdx From a9f93c2f1e9d02303b24dd49862602d0fd5828dd Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 11 Jun 2024 14:18:05 -0300 Subject: [PATCH 13/29] Added route to clean completed jobs and a github action cron that triggers every 24h --- .../clean-before-24h-complete-jobs.yml | 17 +++++++++++++++ apps/api/src/index.ts | 21 +++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 .github/workflows/clean-before-24h-complete-jobs.yml diff --git a/.github/workflows/clean-before-24h-complete-jobs.yml b/.github/workflows/clean-before-24h-complete-jobs.yml new file mode 100644 index 0000000..2fd3b22 --- /dev/null +++ b/.github/workflows/clean-before-24h-complete-jobs.yml @@ -0,0 +1,17 @@ +name: Clean Before 24h Completed Jobs +on: + schedule: + - cron: '0 0 * * *' + +jobs: + clean-jobs: + runs-on: ubuntu-latest + steps: + - name: Send GET request to clean jobs + run: | + response=$(curl --write-out '%{http_code}' --silent --output /dev/null https://api.firecrawl.dev/clean-before-24h-complete-jobs) + if [ "$response" -ne 200 ]; then + echo "Failed to clean jobs. Response: $response" + exit 1 + fi + echo "Successfully cleaned jobs. Response: $response" diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 0246a1e..eac8204 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -164,6 +164,27 @@ app.get('/serverHealthCheck/notify', async (req, res) => { } }); +app.get('/clean-before-24h-complete-jobs', async (req, res) => { + try { + const webScraperQueue = getWebScraperQueue(); + const completedJobs = await webScraperQueue.getJobs(['completed']); + const before24hJobs = completedJobs.filter(job => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000); + const jobIds = before24hJobs.map(job => job.id) as string[]; + let count = 0; + for (const jobId of jobIds) { + try { + await webScraperQueue.removeJobs(jobId); + count++; + } catch (jobError) { + console.error(`Failed to remove job with ID ${jobId}:`, jobError); + } + } + res.status(200).send(`Removed ${count} completed jobs.`); + } catch (error) { + console.error('Failed to clean last 24h complete jobs:', error); + res.status(500).send('Failed to clean jobs'); + } +}); app.get("/is-production", (req, res) => { res.send({ isProduction: global.isProduction }); From ee282c3d5537f87ee81f84cf6ea6999c422268c0 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 11 Jun 2024 15:24:39 -0300 Subject: [PATCH 14/29] Added allowBackwardCrawling option --- apps/api/src/controllers/crawl.ts | 8 +++----- apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/crawler.ts | 12 +++++++++++- apps/api/src/scraper/WebScraper/index.ts | 4 +++- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 5345b4f..55c3a2e 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -55,7 +55,7 @@ export async function crawlController(req: Request, res: Response) { } const mode = req.body.mode ?? "crawl"; - const crawlerOptions = req.body.crawlerOptions ?? {}; + const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false, returnOnlyUrls: true }; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; if (mode === "single_urls" && !url.includes(",")) { @@ -64,9 +64,7 @@ export async function crawlController(req: Request, res: Response) { await a.setOptions({ mode: "single_urls", urls: [url], - crawlerOptions: { - returnOnlyUrls: true, - }, + crawlerOptions: crawlerOptions, pageOptions: pageOptions, }); @@ -91,7 +89,7 @@ export async function crawlController(req: Request, res: Response) { const job = await addWebScraperJob({ url: url, mode: mode ?? "crawl", // fix for single urls not working - crawlerOptions: { ...crawlerOptions }, + crawlerOptions: crawlerOptions, team_id: team_id, pageOptions: pageOptions, origin: req.body.origin ?? "api", diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 744c07b..facc81e 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -46,6 +46,7 @@ export type CrawlerOptions = { replaceAllPathsWithAbsolutePaths?: boolean; ignoreSitemap?: boolean; mode?: "default" | "fast"; // have a mode of some sort + allowBackwardCrawling?: boolean; } export type WebScraperOptions = { diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index fc95e7c..7720991 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -20,6 +20,7 @@ export class WebCrawler { private robotsTxtUrl: string; private robots: any; private generateImgAltText: boolean; + private allowBackwardCrawling: boolean; constructor({ initialUrl, @@ -29,6 +30,7 @@ export class WebCrawler { limit = 10000, generateImgAltText = false, maxCrawledDepth = 10, + allowBackwardCrawling = false }: { initialUrl: string; includes?: string[]; @@ -37,6 +39,7 @@ export class WebCrawler { limit?: number; generateImgAltText?: boolean; maxCrawledDepth?: number; + allowBackwardCrawling?: boolean; }) { this.initialUrl = initialUrl; this.baseUrl = new URL(initialUrl).origin; @@ -49,6 +52,7 @@ export class WebCrawler { this.maxCrawledLinks = maxCrawledLinks ?? limit; this.maxCrawledDepth = maxCrawledDepth ?? 10; this.generateImgAltText = generateImgAltText ?? false; + this.allowBackwardCrawling = allowBackwardCrawling ?? false; } private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { @@ -90,10 +94,16 @@ export class WebCrawler { const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); // Ensure the protocol and hostname match, and the path starts with the initial URL's path - if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { + if (linkHostname !== initialHostname) { return false; } + if (!this.allowBackwardCrawling) { + if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { + return false; + } + } + const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true; // Check if the link is disallowed by robots.txt if (!isAllowed) { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 7dcd175..5344320 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -38,8 +38,8 @@ export class WebScraperDataProvider { private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo"; private crawlerMode: string = "default"; + private allowBackwardCrawling: boolean = false; - authorize(): void { throw new Error("Method not implemented."); } @@ -171,6 +171,7 @@ export class WebScraperDataProvider { maxCrawledDepth: this.maxCrawledDepth, limit: this.limit, generateImgAltText: this.generateImgAltText, + allowBackwardCrawling: this.allowBackwardCrawling, }); let links = await crawler.start( @@ -480,6 +481,7 @@ export class WebScraperDataProvider { this.excludes = this.excludes.filter((item) => item !== ""); this.crawlerMode = options.crawlerOptions?.mode ?? "default"; this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false; + this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false; // make sure all urls start with https:// this.urls = this.urls.map((url) => { From b87725c683fff5ac4bdaeb6464a6b6dd1755e3b7 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 12:08:49 -0700 Subject: [PATCH 15/29] Update openapi.json --- apps/api/openapi.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 55bfe1c..7147af1 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -190,6 +190,11 @@ "description": "Ignore the website sitemap when crawling", "default": false }, + "replaceAllPathsWithAbsolutePaths": { + "type": "boolean", + "description": "Replace all relative paths with absolute paths for images and links", + "default": false + }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl", From 520739c9f44b77d94288f3ea9e0433330ae1bc12 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 12:43:16 -0700 Subject: [PATCH 16/29] Nick: fixed bugs associated with absolute path replacements --- apps/api/openapi.json | 10 +++++----- apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/index.ts | 11 +++++----- .../scraper/WebScraper/utils/replacePaths.ts | 20 +++++++++++-------- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 7147af1..a755e37 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -190,11 +190,6 @@ "description": "Ignore the website sitemap when crawling", "default": false }, - "replaceAllPathsWithAbsolutePaths": { - "type": "boolean", - "description": "Replace all relative paths with absolute paths for images and links", - "default": false - }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl", @@ -223,6 +218,11 @@ "headers": { "type": "object", "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc." + }, + "replaceAllPathsWithAbsolutePaths": { + "type": "boolean", + "description": "Replace all relative paths with absolute paths for images and links", + "default": false } } } diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 744c07b..d5002c7 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -18,6 +18,7 @@ export type PageOptions = { waitFor?: number; screenshot?: boolean; headers?: Record; + replaceAllPathsWithAbsolutePaths?: boolean; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 7dcd175..54897f1 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -302,9 +302,10 @@ export class WebScraperDataProvider { } private applyPathReplacements(documents: Document[]): Document[] { - return this.replaceAllPathsWithAbsolutePaths - ? replacePathsWithAbsolutePaths(documents) - : replaceImgPathsWithAbsolutePaths(documents); + if (this.replaceAllPathsWithAbsolutePaths) { + documents = replacePathsWithAbsolutePaths(documents); + } + return replaceImgPathsWithAbsolutePaths(documents); } private async applyImgAltText(documents: Document[]): Promise { @@ -473,9 +474,9 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false }; + this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false }; this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} - this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; + this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); this.crawlerMode = options.crawlerOptions?.mode ?? "default"; diff --git a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts index d652611..788916c 100644 --- a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts +++ b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts @@ -10,7 +10,8 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] ) || []; paths.forEach((path: string) => { - const isImage = path.startsWith("!"); + try { + const isImage = path.startsWith("!"); let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/); let url = matchedUrl[1]; @@ -22,18 +23,18 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] } const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0]; - if (isImage) { - document.content = document.content.replace( - path, - `${markdownLinkOrImageText}(${url})` - ); - } else { + // Image is handled afterwards + if (!isImage) { document.content = document.content.replace( path, `${markdownLinkOrImageText}(${url})` ); + } + } catch (error) { + } }); + document.markdown = document.content; }); return documents; @@ -60,8 +61,10 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen if (!imageUrl.startsWith("http")) { if (imageUrl.startsWith("/")) { imageUrl = imageUrl.substring(1); + imageUrl = new URL(imageUrl, baseUrl).toString(); + } else { + imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString(); } - imageUrl = new URL(imageUrl, baseUrl).toString(); } } @@ -70,6 +73,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen `![${altText}](${imageUrl})` ); }); + document.markdown = document.content; }); return documents; From 2239e03269ec8ef3c3dba2596ac8994fa4562b05 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 12:54:02 -0700 Subject: [PATCH 17/29] Update replacePaths.test.ts --- .../WebScraper/utils/__tests__/replacePaths.test.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts index aae567c..6ecd990 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts @@ -6,12 +6,12 @@ describe('replacePaths', () => { it('should replace relative paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).' + content: 'This is a [link](/path/to/resource).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).' + content: 'This is a [link](https://example.com/path/to/resource).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -21,7 +21,7 @@ describe('replacePaths', () => { it('should not alter absolute URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).' + content: 'This is an [external link](https://external.com/path).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -41,12 +41,12 @@ describe('replacePaths', () => { it('should handle multiple links and images correctly', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).' + content: 'Here are two links: [link1](/path1) and [link2](/path2).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).' + content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).' }]; const result = replacePathsWithAbsolutePaths(documents); From 1e3e06a1d57bffdafb7f562ca9fd5a4cb15ad05f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 13:02:39 -0700 Subject: [PATCH 18/29] Update replacePaths.test.ts --- .../utils/__tests__/replacePaths.test.ts | 39 ++++++++++++------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts index 6ecd990..e201926 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts @@ -6,12 +6,14 @@ describe('replacePaths', () => { it('should replace relative paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](/path/to/resource).' + content: 'This is a [link](/path/to/resource).', + markdown: 'This is a [link](/path/to/resource).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](https://example.com/path/to/resource).' + content: 'This is a [link](https://example.com/path/to/resource).', + markdown: 'This is a [link](https://example.com/path/to/resource).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -21,7 +23,8 @@ describe('replacePaths', () => { it('should not alter absolute URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an [external link](https://external.com/path).' + content: 'This is an [external link](https://external.com/path).', + markdown: 'This is an [external link](https://external.com/path).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -31,7 +34,8 @@ describe('replacePaths', () => { it('should not alter data URLs for images', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an image: ![alt text]().' + content: 'This is an image: ![alt text]().', + markdown: 'This is an image: ![alt text]().' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -41,12 +45,14 @@ describe('replacePaths', () => { it('should handle multiple links and images correctly', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](/path1) and [link2](/path2).' + content: 'Here are two links: [link1](/path1) and [link2](/path2).', + markdown: 'Here are two links: [link1](/path1) and [link2](/path2).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).' + content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).', + markdown: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -56,12 +62,14 @@ describe('replacePaths', () => { it('should correctly handle a mix of absolute and relative paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image]().' + content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image]().', + markdown: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image]().' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image]().' + content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image]().', + markdown: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image]().' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -74,12 +82,14 @@ describe('replacePaths', () => { it('should replace relative image paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here is an image: ![alt text](/path/to/image.jpg).' + content: 'Here is an image: ![alt text](/path/to/image.jpg).', + markdown: 'Here is an image: ![alt text](/path/to/image.jpg).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' + content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).', + markdown: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' }]; const result = replaceImgPathsWithAbsolutePaths(documents); @@ -89,7 +99,8 @@ describe('replacePaths', () => { it('should not alter data:image URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'An image with a data URL: ![alt text]().' + content: 'An image with a data URL: ![alt text]().', + markdown: 'An image with a data URL: ![alt text](data:image/png;base4,ABC123==).' }]; const result = replaceImgPathsWithAbsolutePaths(documents); @@ -99,12 +110,14 @@ describe('replacePaths', () => { it('should handle multiple images with a mix of data and relative URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Multiple images: ![img1](/img1.jpg) ![img2]() ![img3](/img3.jpg).' + content: 'Multiple images: ![img1](/img1.jpg) ![img2]() ![img3](/img3.jpg).', + markdown: 'Multiple images: ![img1](/img1.jpg) ![img2]() ![img3](/img3.jpg).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2]() ![img3](https://example.com/img3.jpg).' + content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2]() ![img3](https://example.com/img3.jpg).', + markdown: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2]() ![img3](https://example.com/img3.jpg).' }]; const result = replaceImgPathsWithAbsolutePaths(documents); From def2ba998717fcbf97d9fe0679bc92e4e4657fa6 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 11 Jun 2024 17:46:25 -0300 Subject: [PATCH 19/29] added tests --- .../src/__tests__/e2e_withAuth/index.test.ts | 114 ++++++++++++++---- 1 file changed, 90 insertions(+), 24 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index f619254..05dd7ff 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -596,7 +596,7 @@ describe("E2E Tests for API Routes", () => { .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://roastmywebsite.ai" }); + .send({ url: "https://mendable.ai/blog" }); expect(crawlResponse.statusCode).toBe(200); let isCompleted = false; @@ -622,7 +622,13 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("_Roast_"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + + const childrenLinks = completedResponse.body.data.filter(doc => + doc.sourceURL && doc.sourceURL.startsWith("https://mendable.ai/blog") + ); + + expect(childrenLinks.length).toBe(completedResponse.body.data.length); }, 120000); // 120 seconds it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => { @@ -757,40 +763,100 @@ describe("E2E Tests for API Routes", () => { }, 60000); }); // 60 seconds - it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { + it.concurrent("should return a successful response for a valid crawl job with allowBackwardCrawling set to true option", async () => { const crawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://jestjs.io" }); + .send({ + url: "https://mendable.ai/blog", + pageOptions: { includeHtml: true }, + crawlerOptions: { allowBackwardCrawling: true }, + }); expect(crawlResponse.statusCode).toBe(200); + + let isFinished = false; + let completedResponse; - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 20000)); + while (!isFinished) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); - const response = await request(TEST_URL) - .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("cancelled"); + if (response.body.status === "completed") { + isFinished = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } - await new Promise((r) => setTimeout(r, 10000)); - - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("failed"); + expect(completedResponse.body.status).toBe("completed"); expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data).toEqual(null); - expect(completedResponse.body).toHaveProperty("partial_data"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0]).toHaveProperty("html"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].markdown).toContain("Mendable"); + + const onlyChildrenLinks = completedResponse.body.data.filter(doc => { + return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") + }); + + expect(completedResponse.body.data.length).toBeGreaterThan(onlyChildrenLinks.length); + }, 60000); + + // it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { + // const crawlResponse = await request(TEST_URL) + // .post("/v0/crawl") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: "https://scrapethissite.com" }); + + // expect(crawlResponse.statusCode).toBe(200); + + // await new Promise((r) => setTimeout(r, 2000)); // Wait for 1 seconds before cancelling the job + + // const responseCancel = await request(TEST_URL) + // .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + // expect(responseCancel.statusCode).toBe(200); + + // let isFinished = false; + // let completedResponse; + + // while (!isFinished) { + // const response = await request(TEST_URL) + // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty("status"); + // console.log(response.body.status) + + // if (response.body.status === "failed") { + // isFinished = true; + // completedResponse = response; + // } else { + // await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + // } + // } + + // expect(completedResponse.statusCode).toBe(200); + // expect(completedResponse.body).toHaveProperty("status"); + // expect(completedResponse.body.status).toBe("failed"); + // expect(completedResponse.body).toHaveProperty("data"); + // expect(completedResponse.body.data).toBeNull(); + // expect(completedResponse.body).toHaveProperty("partial_data"); + // expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); + // expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); + // expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); - }, 60000); // 60 seconds + // }, 60000); // 60 seconds describe("POST /v0/scrape with LLM Extraction", () => { it.concurrent("should extract data using LLM extraction mode", async () => { From df3a678cf485107558f38b66db96381ca5012d14 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 11 Jun 2024 17:46:56 -0300 Subject: [PATCH 20/29] getting back the cancel test, this should work --- .../src/__tests__/e2e_withAuth/index.test.ts | 76 +++++++++---------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 05dd7ff..5adf05d 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -811,52 +811,52 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data.length).toBeGreaterThan(onlyChildrenLinks.length); }, 60000); - // it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { - // const crawlResponse = await request(TEST_URL) - // .post("/v0/crawl") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ url: "https://scrapethissite.com" }); + it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://scrapethissite.com" }); - // expect(crawlResponse.statusCode).toBe(200); + expect(crawlResponse.statusCode).toBe(200); - // await new Promise((r) => setTimeout(r, 2000)); // Wait for 1 seconds before cancelling the job + await new Promise((r) => setTimeout(r, 2000)); // Wait for 1 seconds before cancelling the job - // const responseCancel = await request(TEST_URL) - // .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - // expect(responseCancel.statusCode).toBe(200); + const responseCancel = await request(TEST_URL) + .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(responseCancel.statusCode).toBe(200); - // let isFinished = false; - // let completedResponse; + let isFinished = false; + let completedResponse; - // while (!isFinished) { - // const response = await request(TEST_URL) - // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - // expect(response.statusCode).toBe(200); - // expect(response.body).toHaveProperty("status"); - // console.log(response.body.status) + while (!isFinished) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + console.log(response.body.status) - // if (response.body.status === "failed") { - // isFinished = true; - // completedResponse = response; - // } else { - // await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again - // } - // } + if (response.body.status === "failed") { + isFinished = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } - // expect(completedResponse.statusCode).toBe(200); - // expect(completedResponse.body).toHaveProperty("status"); - // expect(completedResponse.body.status).toBe("failed"); - // expect(completedResponse.body).toHaveProperty("data"); - // expect(completedResponse.body.data).toBeNull(); - // expect(completedResponse.body).toHaveProperty("partial_data"); - // expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); - // expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); - // expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("failed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data).toBeNull(); + expect(completedResponse.body).toHaveProperty("partial_data"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); - // }, 60000); // 60 seconds + }, 60000); // 60 seconds describe("POST /v0/scrape with LLM Extraction", () => { it.concurrent("should extract data using LLM extraction mode", async () => { From 157fbe4a1ea67e4807426696b5f9b3de446641c8 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 11 Jun 2024 17:52:01 -0300 Subject: [PATCH 21/29] added bull auth key --- .github/workflows/clean-before-24h-complete-jobs.yml | 5 ++++- apps/api/src/index.ts | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/clean-before-24h-complete-jobs.yml b/.github/workflows/clean-before-24h-complete-jobs.yml index 2fd3b22..2ced537 100644 --- a/.github/workflows/clean-before-24h-complete-jobs.yml +++ b/.github/workflows/clean-before-24h-complete-jobs.yml @@ -3,13 +3,16 @@ on: schedule: - cron: '0 0 * * *' +env: + BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} + jobs: clean-jobs: runs-on: ubuntu-latest steps: - name: Send GET request to clean jobs run: | - response=$(curl --write-out '%{http_code}' --silent --output /dev/null https://api.firecrawl.dev/clean-before-24h-complete-jobs) + response=$(curl --write-out '%{http_code}' --silent --output /dev/null https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/clean-before-24h-complete-jobs) if [ "$response" -ne 200 ]; then echo "Failed to clean jobs. Response: $response" exit 1 diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index eac8204..cc8376b 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -164,7 +164,7 @@ app.get('/serverHealthCheck/notify', async (req, res) => { } }); -app.get('/clean-before-24h-complete-jobs', async (req, res) => { +app.get(`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, async (req, res) => { try { const webScraperQueue = getWebScraperQueue(); const completedJobs = await webScraperQueue.getJobs(['completed']); From d4df6f049d842c975fc5df15e24fd80fb031f322 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 15:49:30 -0700 Subject: [PATCH 22/29] Nick: --- .github/{workflows => archive}/js-sdk.yml | 0 .github/{workflows => archive}/publish-js-sdk.yml | 0 .github/{workflows => archive}/publish-python-sdk.yml | 0 .github/{workflows => archive}/python-sdk.yml | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename .github/{workflows => archive}/js-sdk.yml (100%) rename .github/{workflows => archive}/publish-js-sdk.yml (100%) rename .github/{workflows => archive}/publish-python-sdk.yml (100%) rename .github/{workflows => archive}/python-sdk.yml (100%) diff --git a/.github/workflows/js-sdk.yml b/.github/archive/js-sdk.yml similarity index 100% rename from .github/workflows/js-sdk.yml rename to .github/archive/js-sdk.yml diff --git a/.github/workflows/publish-js-sdk.yml b/.github/archive/publish-js-sdk.yml similarity index 100% rename from .github/workflows/publish-js-sdk.yml rename to .github/archive/publish-js-sdk.yml diff --git a/.github/workflows/publish-python-sdk.yml b/.github/archive/publish-python-sdk.yml similarity index 100% rename from .github/workflows/publish-python-sdk.yml rename to .github/archive/publish-python-sdk.yml diff --git a/.github/workflows/python-sdk.yml b/.github/archive/python-sdk.yml similarity index 100% rename from .github/workflows/python-sdk.yml rename to .github/archive/python-sdk.yml From 01c9f071fa554ec687882ad3727e480b3cc09dcd Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 12 Jun 2024 11:27:06 -0300 Subject: [PATCH 23/29] fixed --- .../src/__tests__/e2e_withAuth/index.test.ts | 36 +++++++------------ apps/api/src/controllers/crawl.ts | 4 +-- 2 files changed, 14 insertions(+), 26 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 5adf05d..02e4a47 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -624,11 +624,11 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("Mendable"); - const childrenLinks = completedResponse.body.data.filter(doc => - doc.sourceURL && doc.sourceURL.startsWith("https://mendable.ai/blog") - ); + const childrenLinks = completedResponse.body.data.filter(doc => + doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") + ); - expect(childrenLinks.length).toBe(completedResponse.body.data.length); + expect(childrenLinks.length).toBe(completedResponse.body.data.length); }, 120000); // 120 seconds it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => { @@ -816,35 +816,23 @@ describe("E2E Tests for API Routes", () => { .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://scrapethissite.com" }); + .send({ url: "https://jestjs.io" }); expect(crawlResponse.statusCode).toBe(200); - await new Promise((r) => setTimeout(r, 2000)); // Wait for 1 seconds before cancelling the job + await new Promise((r) => setTimeout(r, 20000)); const responseCancel = await request(TEST_URL) .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(responseCancel.statusCode).toBe(200); + expect(responseCancel.body).toHaveProperty("status"); + expect(responseCancel.body.status).toBe("cancelled"); - let isFinished = false; - let completedResponse; - - while (!isFinished) { - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - console.log(response.body.status) - - if (response.body.status === "failed") { - isFinished = true; - completedResponse = response; - } else { - await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again - } - } + await new Promise((r) => setTimeout(r, 10000)); + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 55c3a2e..58d01e2 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -55,7 +55,7 @@ export async function crawlController(req: Request, res: Response) { } const mode = req.body.mode ?? "crawl"; - const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false, returnOnlyUrls: true }; + const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false }; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; if (mode === "single_urls" && !url.includes(",")) { @@ -64,7 +64,7 @@ export async function crawlController(req: Request, res: Response) { await a.setOptions({ mode: "single_urls", urls: [url], - crawlerOptions: crawlerOptions, + crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true }, pageOptions: pageOptions, }); From d20af257baebbeea8fe907f9c3447e2e12eb1d1b Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 12 Jun 2024 15:38:41 -0300 Subject: [PATCH 24/29] Added jobId to webhook data --- apps/api/src/services/queue-worker.ts | 4 ++-- apps/api/src/services/webhook.ts | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 6772c57..a42b3e8 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -38,7 +38,7 @@ getWebScraperQueue().process( error: message /* etc... */, }; - await callWebhook(job.data.team_id, data); + await callWebhook(job.data.team_id, job.id as string, data); await logJob({ success: success, @@ -78,7 +78,7 @@ getWebScraperQueue().process( error: "Something went wrong... Contact help@mendable.ai or try again." /* etc... */, }; - await callWebhook(job.data.team_id, data); + await callWebhook(job.data.team_id, job.id as string, data); await logJob({ success: false, message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"), diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index 1f8d647..fc5962b 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -1,6 +1,6 @@ import { supabase_service } from "./supabase"; -export const callWebhook = async (teamId: string, data: any) => { +export const callWebhook = async (teamId: string, jobId: string,data: any) => { try { const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL; const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; @@ -47,6 +47,7 @@ export const callWebhook = async (teamId: string, data: any) => { }, body: JSON.stringify({ success: data.success, + jobId: jobId, data: dataToSend, error: data.error || undefined, }), From 67dc46b454cb07d50ae3bc7fca219f597a009a83 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 12 Jun 2024 17:53:04 -0700 Subject: [PATCH 25/29] Nick: clusters --- .../src/__tests__/e2e_noAuth/index.test.ts | 1 - .../src/__tests__/e2e_withAuth/index.test.ts | 3 +- apps/api/src/index.ts | 331 ++++++++++-------- apps/api/src/services/redis.ts | 31 +- 4 files changed, 208 insertions(+), 158 deletions(-) diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index c443e71..acb2278 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -1,5 +1,4 @@ import request from "supertest"; -import { app } from "../../index"; import dotenv from "dotenv"; const fs = require("fs"); const path = require("path"); diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 02e4a47..431c7d1 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -1,5 +1,4 @@ import request from "supertest"; -import { app } from "../../index"; import dotenv from "dotenv"; import { v4 as uuidv4 } from "uuid"; @@ -35,7 +34,7 @@ describe("E2E Tests for API Routes", () => { describe("POST /v0/scrape", () => { it.concurrent("should require authorization", async () => { - const response = await request(app).post("/v0/scrape"); + const response = await request(TEST_URL).post("/v0/scrape"); expect(response.statusCode).toBe(401); }); diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index cc8376b..6b62f06 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -5,190 +5,215 @@ import "dotenv/config"; import { getWebScraperQueue } from "./services/queue-service"; import { redisClient } from "./services/rate-limiter"; import { v0Router } from "./routes/v0"; -import { initSDK } from '@hyperdx/node-opentelemetry'; +import { initSDK } from "@hyperdx/node-opentelemetry"; +import cluster from "cluster"; +import os from "os"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { ExpressAdapter } = require("@bull-board/express"); -export const app = express(); +const numCPUs = os.cpus().length; +console.log(`Number of CPUs: ${numCPUs} available`); -global.isProduction = process.env.IS_PRODUCTION === "true"; +if (cluster.isMaster) { + console.log(`Master ${process.pid} is running`); -app.use(bodyParser.urlencoded({ extended: true })); -app.use(bodyParser.json({ limit: "10mb" })); + // Fork workers. + for (let i = 0; i < numCPUs; i++) { + cluster.fork(); + } -app.use(cors()); // Add this line to enable CORS - -const serverAdapter = new ExpressAdapter(); -serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); - -const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({ - queues: [new BullAdapter(getWebScraperQueue())], - serverAdapter: serverAdapter, -}); - -app.use( - `/admin/${process.env.BULL_AUTH_KEY}/queues`, - serverAdapter.getRouter() -); - -app.get("/", (req, res) => { - res.send("SCRAPERS-JS: Hello, world! Fly.io"); -}); - -//write a simple test function -app.get("/test", async (req, res) => { - res.send("Hello, world!"); -}); - -// register router -app.use(v0Router); - -const DEFAULT_PORT = process.env.PORT ?? 3002; -const HOST = process.env.HOST ?? "localhost"; -redisClient.connect(); - -// HyperDX OpenTelemetry -if(process.env.ENV === 'production') { - initSDK({ consoleCapture: true, additionalInstrumentations: []}); -} - - -export function startServer(port = DEFAULT_PORT) { - const server = app.listen(Number(port), HOST, () => { - console.log(`Server listening on port ${port}`); - console.log( - `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` - ); - console.log(""); - console.log("1. Make sure Redis is running on port 6379 by default"); - console.log( - "2. If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 " - ); + cluster.on("exit", (worker, code, signal) => { + console.log(`Worker ${worker.process.pid} exited`); + console.log("Starting a new worker"); + cluster.fork(); }); - return server; -} +} else { + const app = express(); -if (require.main === module) { - startServer(); -} + global.isProduction = process.env.IS_PRODUCTION === "true"; -// Use this as a "health check" that way we dont destroy the server -app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => { - try { - const webScraperQueue = getWebScraperQueue(); - const [webScraperActive] = await Promise.all([ - webScraperQueue.getActiveCount(), - ]); + app.use(bodyParser.urlencoded({ extended: true })); + app.use(bodyParser.json({ limit: "10mb" })); - const noActiveJobs = webScraperActive === 0; - // 200 if no active jobs, 503 if there are active jobs - return res.status(noActiveJobs ? 200 : 500).json({ - webScraperActive, - noActiveJobs, - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); + app.use(cors()); // Add this line to enable CORS + + const serverAdapter = new ExpressAdapter(); + serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); + + const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({ + queues: [new BullAdapter(getWebScraperQueue())], + serverAdapter: serverAdapter, + }); + + app.use( + `/admin/${process.env.BULL_AUTH_KEY}/queues`, + serverAdapter.getRouter() + ); + + app.get("/", (req, res) => { + res.send("SCRAPERS-JS: Hello, world! Fly.io"); + }); + + //write a simple test function + app.get("/test", async (req, res) => { + res.send("Hello, world!"); + }); + + // register router + app.use(v0Router); + + const DEFAULT_PORT = process.env.PORT ?? 3002; + const HOST = process.env.HOST ?? "localhost"; + redisClient.connect(); + + // HyperDX OpenTelemetry + if (process.env.ENV === "production") { + initSDK({ consoleCapture: true, additionalInstrumentations: [] }); } -}); -app.get(`/serverHealthCheck`, async (req, res) => { - try { - const webScraperQueue = getWebScraperQueue(); - const [waitingJobs] = await Promise.all([ - webScraperQueue.getWaitingCount(), - ]); - - const noWaitingJobs = waitingJobs === 0; - // 200 if no active jobs, 503 if there are active jobs - return res.status(noWaitingJobs ? 200 : 500).json({ - waitingJobs, + function startServer(port = DEFAULT_PORT) { + const server = app.listen(Number(port), HOST, () => { + console.log(`Worker ${process.pid} listening on port ${port}`); + console.log( + `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` + ); + console.log(""); + console.log("1. Make sure Redis is running on port 6379 by default"); + console.log( + "2. If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 " + ); }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); + return server; } -}); -app.get('/serverHealthCheck/notify', async (req, res) => { - if (process.env.SLACK_WEBHOOK_URL) { - const treshold = 1; // The treshold value for the active jobs - const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds + if (require.main === module) { + startServer(); + } - const getWaitingJobsCount = async () => { + // Use this as a "health check" that way we dont destroy the server + app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => { + try { const webScraperQueue = getWebScraperQueue(); - const [waitingJobsCount] = await Promise.all([ + const [webScraperActive] = await Promise.all([ + webScraperQueue.getActiveCount(), + ]); + + const noActiveJobs = webScraperActive === 0; + // 200 if no active jobs, 503 if there are active jobs + return res.status(noActiveJobs ? 200 : 500).json({ + webScraperActive, + noActiveJobs, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } + }); + + app.get(`/serverHealthCheck`, async (req, res) => { + try { + const webScraperQueue = getWebScraperQueue(); + const [waitingJobs] = await Promise.all([ webScraperQueue.getWaitingCount(), ]); - return waitingJobsCount; - }; + const noWaitingJobs = waitingJobs === 0; + // 200 if no active jobs, 503 if there are active jobs + return res.status(noWaitingJobs ? 200 : 500).json({ + waitingJobs, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } + }); - res.status(200).json({ message: "Check initiated" }); + app.get("/serverHealthCheck/notify", async (req, res) => { + if (process.env.SLACK_WEBHOOK_URL) { + const treshold = 1; // The treshold value for the active jobs + const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds - const checkWaitingJobs = async () => { - try { - let waitingJobsCount = await getWaitingJobsCount(); - if (waitingJobsCount >= treshold) { - setTimeout(async () => { - // Re-check the waiting jobs count after the timeout - waitingJobsCount = await getWaitingJobsCount(); - if (waitingJobsCount >= treshold) { - const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL; - const message = { - text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${timeout/60000} minute(s).`, - }; + const getWaitingJobsCount = async () => { + const webScraperQueue = getWebScraperQueue(); + const [waitingJobsCount] = await Promise.all([ + webScraperQueue.getWaitingCount(), + ]); - const response = await fetch(slackWebhookUrl, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - body: JSON.stringify(message), - }) - - if (!response.ok) { - console.error('Failed to send Slack notification') + return waitingJobsCount; + }; + + res.status(200).json({ message: "Check initiated" }); + + const checkWaitingJobs = async () => { + try { + let waitingJobsCount = await getWaitingJobsCount(); + if (waitingJobsCount >= treshold) { + setTimeout(async () => { + // Re-check the waiting jobs count after the timeout + waitingJobsCount = await getWaitingJobsCount(); + if (waitingJobsCount >= treshold) { + const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL; + const message = { + text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${ + timeout / 60000 + } minute(s).`, + }; + + const response = await fetch(slackWebhookUrl, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify(message), + }); + + if (!response.ok) { + console.error("Failed to send Slack notification"); + } } - } - }, timeout); + }, timeout); + } + } catch (error) { + console.error(error); } - } catch (error) { - console.error(error); - } - }; + }; - checkWaitingJobs(); - } -}); + checkWaitingJobs(); + } + }); -app.get(`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, async (req, res) => { - try { - const webScraperQueue = getWebScraperQueue(); - const completedJobs = await webScraperQueue.getJobs(['completed']); - const before24hJobs = completedJobs.filter(job => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000); - const jobIds = before24hJobs.map(job => job.id) as string[]; - let count = 0; - for (const jobId of jobIds) { + app.get( + `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, + async (req, res) => { try { - await webScraperQueue.removeJobs(jobId); - count++; - } catch (jobError) { - console.error(`Failed to remove job with ID ${jobId}:`, jobError); + const webScraperQueue = getWebScraperQueue(); + const completedJobs = await webScraperQueue.getJobs(["completed"]); + const before24hJobs = completedJobs.filter( + (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 + ); + const jobIds = before24hJobs.map((job) => job.id) as string[]; + let count = 0; + for (const jobId of jobIds) { + try { + await webScraperQueue.removeJobs(jobId); + count++; + } catch (jobError) { + console.error(`Failed to remove job with ID ${jobId}:`, jobError); + } + } + res.status(200).send(`Removed ${count} completed jobs.`); + } catch (error) { + console.error("Failed to clean last 24h complete jobs:", error); + res.status(500).send("Failed to clean jobs"); } } - res.status(200).send(`Removed ${count} completed jobs.`); - } catch (error) { - console.error('Failed to clean last 24h complete jobs:', error); - res.status(500).send('Failed to clean jobs'); - } -}); + ); -app.get("/is-production", (req, res) => { - res.send({ isProduction: global.isProduction }); -}); + app.get("/is-production", (req, res) => { + res.send({ isProduction: global.isProduction }); + }); - -// /workers health check, cant act as load balancer, just has to be a pre deploy thing \ No newline at end of file + console.log(`Worker ${process.pid} started`); +} diff --git a/apps/api/src/services/redis.ts b/apps/api/src/services/redis.ts index f2cedd1..491eeb1 100644 --- a/apps/api/src/services/redis.ts +++ b/apps/api/src/services/redis.ts @@ -1,8 +1,35 @@ -import Redis from 'ioredis'; +import Redis from "ioredis"; // Initialize Redis client const redis = new Redis(process.env.REDIS_URL); +// Listen to 'error' events to the Redis connection +redis.on("error", (error) => { + try { + if (error.message === "ECONNRESET") { + console.log("Connection to Redis Session Store timed out."); + } else if (error.message === "ECONNREFUSED") { + console.log("Connection to Redis Session Store refused!"); + } else console.log(error); + } catch (error) {} +}); + +// Listen to 'reconnecting' event to Redis +redis.on("reconnecting", (err) => { + try { + if (redis.status === "reconnecting") + console.log("Reconnecting to Redis Session Store..."); + else console.log("Error reconnecting to Redis Session Store."); + } catch (error) {} +}); + +// Listen to the 'connect' event to Redis +redis.on("connect", (err) => { + try { + if (!err) console.log("Connected to Redis Session Store!"); + } catch (error) {} +}); + /** * Set a value in Redis with an optional expiration time. * @param {string} key The key under which to store the value. @@ -11,7 +38,7 @@ const redis = new Redis(process.env.REDIS_URL); */ const setValue = async (key: string, value: string, expire?: number) => { if (expire) { - await redis.set(key, value, 'EX', expire); + await redis.set(key, value, "EX", expire); } else { await redis.set(key, value); } From 11b6d5afa5285476d934900ee6e4db8b8f48710c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 12 Jun 2024 18:00:22 -0700 Subject: [PATCH 26/29] Update fly.toml --- apps/api/fly.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/fly.toml b/apps/api/fly.toml index 6bc8266..468695d 100644 --- a/apps/api/fly.toml +++ b/apps/api/fly.toml @@ -54,7 +54,7 @@ kill_timeout = '5s' soft_limit = 12 [[vm]] - size = 'performance-8x' + size = 'performance-4x' processes = ['app'] From 182f8d4d6c3fbc9598d054d0c13aadbe1dba8b52 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 12 Jun 2024 18:07:05 -0700 Subject: [PATCH 27/29] Update index.ts --- apps/api/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 6b62f06..494b4d5 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -13,7 +13,7 @@ const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { ExpressAdapter } = require("@bull-board/express"); -const numCPUs = os.cpus().length; +const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; console.log(`Number of CPUs: ${numCPUs} available`); if (cluster.isMaster) { From 676d6e8ab5f7a1fd14ff5b76f8289db7543082c4 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 13 Jun 2024 10:51:05 -0300 Subject: [PATCH 28/29] Added pageOptions.removeTags --- apps/api/openapi.json | 19 +++++++++++ .../src/__tests__/e2e_withAuth/index.test.ts | 34 +++++++++++++++++++ apps/api/src/controllers/crawl.ts | 10 ++++-- apps/api/src/controllers/crawlPreview.ts | 2 +- apps/api/src/controllers/search.ts | 2 ++ apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/index.ts | 7 +++- apps/api/src/scraper/WebScraper/single_url.ts | 13 +++++++ 8 files changed, 84 insertions(+), 4 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index a755e37..b07e43f 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -61,6 +61,13 @@ "description": "Wait x amount of milliseconds for the page to load to fetch content", "default": 0 }, + "removeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" + }, "headers": { "type": "object", "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." @@ -194,6 +201,11 @@ "type": "integer", "description": "Maximum number of pages to crawl", "default": 10000 + }, + "allowBackwardCrawling": { + "type": "boolean", + "description": "Allow backward crawling (crawl from the base URL to the previous URLs)", + "default": false } } }, @@ -219,6 +231,13 @@ "type": "object", "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc." }, + "removeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" + }, "replaceAllPathsWithAbsolutePaths": { "type": "boolean", "description": "Replace all relative paths with absolute paths for images and links", diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 02e4a47..3423b3a 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -136,6 +136,40 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); }, 60000); // 60 seconds + it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { + const responseWithoutRemoveTags = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com/" }); + expect(responseWithoutRemoveTags.statusCode).toBe(200); + expect(responseWithoutRemoveTags.body).toHaveProperty("data"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("content"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); + expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); + expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site"); + expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer + expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav + expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong + + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.content).toContain("Scrape This Site"); + expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer + expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav + expect(response.body.data.content).not.toContain("web scraping"); // strong + }, 30000); // 30 seconds timeout + // TODO: add this test back once we nail the waitFor option to be more deterministic // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => { // const startTime = Date.now(); diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 58d01e2..7eab78f 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -55,8 +55,14 @@ export async function crawlController(req: Request, res: Response) { } const mode = req.body.mode ?? "crawl"; - const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false }; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; + const crawlerOptions = req.body.crawlerOptions ?? { + allowBackwardCrawling: false + }; + const pageOptions = req.body.pageOptions ?? { + onlyMainContent: false, + includeHtml: false, + removeTags: [] + }; if (mode === "single_urls" && !url.includes(",")) { try { diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index d3e9afe..2c3dc4e 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) { const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] }; const job = await addWebScraperJob({ url: url, diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 7474aae..abbc357 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -85,6 +85,7 @@ export async function searchHelper( onlyMainContent: pageOptions?.onlyMainContent ?? true, fetchPageContent: pageOptions?.fetchPageContent ?? true, includeHtml: pageOptions?.includeHtml ?? false, + removeTags: pageOptions?.removeTags ?? [], fallback: false, }, }); @@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) { includeHtml: false, onlyMainContent: true, fetchPageContent: true, + removeTags: [], fallback: false, }; const origin = req.body.origin ?? "api"; diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 81bf12c..92170c1 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -19,6 +19,7 @@ export type PageOptions = { screenshot?: boolean; headers?: Record; replaceAllPathsWithAbsolutePaths?: boolean; + removeTags?: string | string[]; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index f432f43..1a6ffd0 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -475,7 +475,12 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false }; + this.pageOptions = options.pageOptions ?? { + onlyMainContent: false, + includeHtml: false, + replaceAllPathsWithAbsolutePaths: false, + removeTags: [] + }; this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c2dcea1..a16f6f0 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -304,6 +304,19 @@ export async function scrapSingleUrl( const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { const soup = cheerio.load(html); soup("script, style, iframe, noscript, meta, head").remove(); + + if (pageOptions.removeTags) { + if (typeof pageOptions.removeTags === 'string') { + pageOptions.removeTags.split(',').forEach((tag) => { + soup(tag.trim()).remove(); + }); + } else if (Array.isArray(pageOptions.removeTags)) { + pageOptions.removeTags.forEach((tag) => { + soup(tag).remove(); + }); + } + } + if (pageOptions.onlyMainContent) { // remove any other tags that are not in the main content excludeNonMainTags.forEach((tag) => { From 6963a490f1284d89756ce9f6290b5c654ae14b79 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 14 Jun 2024 10:21:44 -0300 Subject: [PATCH 29/29] Updated version --- apps/python-sdk/firecrawl/__init__.py | 2 +- .../test.cpython-311-pytest-8.2.1.pyc | Bin 0 -> 44947 bytes 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 4e53e77..2fe16ba 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp -__version__ = "0.0.14" +__version__ = "0.0.15" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ba1f1324fe139772739cdae776d127cd5002ca8 GIT binary patch literal 44947 zcmeHwZ)_V!mS`a`Do!E};jGaF-9vrx|x@A+Q zNTr*yV-1zl++`PwU~n@y?B2}mW`Nw@Z8k=Bf%$T{J77OtK5c-zJv0pp(Gdt>;IJR& za34IE`7rRO`@O2>V|B6Hre%9PI}zP_u6p(ARkPS#uimRyul}a5FJ<8KC-DoD3mXjM ze^aLMG5y5Xe;+f9pBk27#R|sxnE1bAIzG`6Gp5b+oiSO`bv_|VlIN4+|L*hM;{Vk7 z6#nlh^i21j?-jK1Lf>@%`Tm&EVc7kjppnlSEgC2{&!?@<^8+YLO|5DaSY4ms9zMgL z{G1;|`H+>c2maFdMMtnKX4r|T;RfZ$s~aixe=BJZe9{4q_>-UWBY|3L8hG`8dnDM> z`L*`Q)Vc=6$Lkv@^?!S9unz8S?bKrpit~?I-5(g|H&`jajaCofsMQO&$?5~lSp9&T ztu){kYXIS_}BJ6+35)t@{-=Ved+=}k`MrDgJTo^knRo07CwH~5 zn^U`HuGM?rJbLcU?2$K)XHOpe-rTw)FP}T}`de=v%^o{`_UJ2TkG%7G_KhQNzV>{k z-g~UU;K(80^L8jCrG+0NVeQiJ$-Z z+gOlxo}O6z3;y0QquBGfF>bi#)sFY#-!rbpV#fFI{NV}gnD3|G&>mI<(yt z;Mk?QClfqeuH+)n3Q*LKQ+{rH|Sw{C)R&se(NHFK?OyXAV9lgqod zTkk&l-h@3<&X=|dGG(D4Z z?0R?B%}v^bsrQ01dto+TDCdiAy{k|Hv0Go8buX1>3zn#ry;vw+$Q821+_YWqKO*iE z%^!7~l2aebPL`bMTsdpMH&e(Jb97^4Nr&FtdQZNHd&*nc+{{e9hZ>S!a*vvMPR;CB zGtbD3>%gd&#DG|%=lfKNThlJdjK3H;6#skkh>`d_9s>-QmLrgvL1|g$8H|DX+23{q zr5vOr%P~k+lDOid%FE!m1<4w#7#}OO8g;;58)3z4?p`FT3CS9pqTxb(AXyb@c~{BG zvq5|l9`!BAT}ak0lEN*LRjncBxzucnWKEQn+1?b($q>n^^5Z!{nI%GqWL0bM9YoiF zbW2#tHt%P+ta9i3sc7>bNLH1O??R_T+sOz@?RSr$(j{v*OV*Utqe#|`^4BLA;Ka|u zvLtKo>3X_3&WmC4dWS>9vy`_OHgAi0u8WCob2@9;SLU`J%jFBUl_{4pE~dLPb_Q~6 zSLR$fSDtk<6D7-jKJ)CJafco{hh|kW6Tr;Xv~;9O@Qll)xqe83LOLYyr5QbRMTP{mRZ01fC?YmB2Ov+X;*T z1SFF46fpHoLVPrv&uabT>;)CwlKa1;I2j!M>#N>@fnjm;E9EkEfVIy$L}m9 zhrUdX-bjvmoAy{tWHx<*0lzcQAv<{4VNeqZJdcIGU zxHbJX4FkU%IWidke9$~HlK6aG3@}{KJRt;)6@vg_bqueoB5YLUtVQg^ROM`{sp|0H zqYsr3I}i;YvuMDb@<&$z?pf@_=i`-*DOMNfO?B9*Xu}p1IVfvT=(;LhLZKVt8!@K% z{-MfsLF@?H7O|t&U`0(AZ%|{{bO3?I!*__-QTeg3DGxO7D=5OD(5W@}4x($2Qe`FD zyr1C!cfKE8tU~OB-i1y_A}@kc`(+}iI)mTdO8R0aVRc)nO2Sv@l4TlI{ZD`s_hTg) z!LaB6l7xbImlex?GP8xWfU1+sU8MC(? z8M8`}F*Hxp1dTIJxOF!lviP{U7%K3`iOp zjUi)Bc*iq?-?8P4RUd-wxO1&2Y?Ki)(X4HR_eQotU)L!gDFZQDs zV^*pu^yd@ktAyXHg+L$ij0uVp^=g_Zm)ENSzRkYZjF;Ac|NWEJDr>Nk^q->cHlCvH z;Bynq`BIZ4X2(1hh+qO8I$hsTzLa;fxTa=d*UDvQ9Q#V%zM3tUF55*RFK@4F2vw() zbIa4$uG$yeeA(WW%U_S(?#jEFVyO%x=?IxWVW!MTOop6Wq~q%gWRNNKJe zXb+}YXAsD-9z}c$QQcGlHSJGmgqx{hp#r9y{gi%*z%~HYMmet9D4$g``-Hlv((YA@ zeY2;uXYO}ta9~O9i}G=)KF&@|*seP{OD0^|x57}Fm8R9;Pk0V&=oZ~+jMT7Cl8E>A zoc2;@s;M)z)R~1>mePYie*eSwz0rf!^z*g!^9!#nnVY<@Yth_P3&VrB+9;AVoJA4W zVq_~OluWV(R4*`*#2OQ3t`aYq887TwG&8j@Jcz5JhzVpCc!}XGW4KBv21t3KBBxah z*lKO5O>mVZ^J%Yf)fx%L$i$-gbW~AQ6fr?)Rx&}6##UlbSF0|WXvAvvYdm#E_)Oyq zNZx4UX+WQi>1I^SIcT=h^072&95Ux%Qzhg*n^mzONOKO8W+(c%j-@`<22qFS3n6n3 zm3MP&Z#4j_bes;Eb99!u6fJXF*V(KtmXNXZGUrfhaJr(;IUZqOTfGmp7irGXx6+IB%HGGb*qFJg^Iv|NR-Q#Hr?jI*P%oza zXAv_&p+$uE;(&OgSFOC!s=HHT7L8sU6k0@gq!tn0t8$zi3e_Ui8l2Y0$-o=bq|&vB zVb&s6TO*1Vad)|JuUf>K(?U;h4&n874ioqmftLszA@B-;Zxc95fbQ8jM&MO|dU87F zT((MAi%8k!oRur*>gJ{LbYTqI1yl<-U5xqK1*rlO)6g!Q*ND$?0w)NZBtUcf>%Gd{ zUYO?YWrFX#4i=#^@F9Hx2VMfs_d!jjMfV?|E&6lpfEjINip?`wB;*WNoAv_#sHfPQ zJu~E5G&k46@F1>=A|{Yo;3bB$jNvMw7$D_^ikwz4V5_yIHo;YvTB^$;uFI61kbIbc z*-8v&K_5kfB9%;X28NqZF<_$cX{`(TKcFwtRH~v>G}f+E6-uT<)itJ9^)RWcsT?F` z8QPokCI+-{?X8Xx{K0E)R@^eJ&U`GdYkaM?W0rb>SznjX;}W5IT=J3WamfgIt!b-P zH)|l|qVX{FxZbujkUlZ%>%Wg#-y`g6EB#P=k><_=E4_$;;ePeFRV%&t0D6%$kh^Qc zD{nM)wdUlm=y5~nMXZ+HqaG)Fb@-v{ajRL68?n|rG(ArC!N51V57vt3VAVIe4`@|z zoubdzduDPM?K5I?flJ=|6NOpJehu3BoA<8jJ8$DPbiPaA9Rg9h`&|6crE%vxkwog@ zhuT>Q^L_NQxuOL}OL6O2vMe}R@FoWscMseM?yRu?ms4My^1h$-PX3YiLav&=P)lD} zSs-iZ>a5JKXz#4(-nY`b(KAD?MRQ{<3=iU}C}IMc1zut}%NVW_iUCqysK{v*1GZXQ zY7<;#MSFiF+WV5Z!)si%MuIUiv1smyDyoViCJ4<+CMeR_N(}01)g=>+SZ0^ZC%nd0 zYa|#W6N}~(QAJfz!~~&P$pl3jTZutkt-55Q5v$odSVj)P(){2pBSAl&m33_hZt)3N zR#w)vHD3exEq1P~Yd4-1-NORjpQ;s+(>mSns28P2;_}uK!#xuD8wI>aG@pxzKG4?^ zqdXEH{959W)IH{&YWN)HocHoqc9ooqyJxJ)-Ej7EN;_vR?LN47&-k7_Qjm1)oPuUaF)7@1f!r=yChqKFAXvyuslG`13h zx>|L~L?c$ScbL9ezmw@(V9+Jk1o6?gOwiUkZOcTcE8<|w@j!t zn%`LyrPwm@$oe{R%fy4}#W2S^wNF-h@m?J7?zHjbPQ5x1y40Z7hs|&Av zWhOs3@u%lOAm=^Fd4uTjztceFLwS@%2iKAM^q~&6(p$j32*8 zSnlcuBg~Q2d0^yR8gt}--Z6fho#+bb`Mn3Jxsz`d5%3Lm6ol`wDG*$yG%+#jILKbT z1k**P>|DziFN)AU1uwGtxALki-nUKgmqe7B1gY9 zRc$|T&wf8}&&C_cjo#>U)#Scfa-WynN8uFqQT+|;O~te#B_E9jtplT8 z5(8q5p6^p7ZcRJS;wKJx#j>!FaeAVnC){~YBLbZ+FHVmU5vNB}$XDX@;Ab5-vUe@Ah8WE`H24dt&46L;Bmg0XrYq&#^bv3&A)f|e7@M}`X6N406vaF>vAv~0t> z{$>209a4vEL|-k>I>oFIuoP!CtH<>zfC24C^5gonhV)q)c@B|&^f0B{WL%%V5Bq*n zswTJAl3TsxRuV+pNWe%yf@m8FA_dnhZKixdq*JX3B2}K;uOK>z-#8$M$OQiT*UT_JObi;f|fp`*2Jf6mxP4aqLB>UqR}0&Qn5`j#8#~?ZIbvEpOAUDO2=uP z4)gHtfJMGZsoSbkyt$wU!F3+O77aD*n|`0LWYZS%j!M^MlfE!Q)+F5j7r680T0(y% zAwt!Fx+SE+K%4!%F2I}nxl9o>{ZG&_1wxejPSqKsYoPB3(EF@QsTTR)pWi+FTj|?! ztyLeHLWm}>g2`T(bs_iuVwv&i9Pk!6(?Ji6gF7AHnZqC~j)D6+^V=p4I(r!PHlZ<$e5KMkp zdxUxt5$~}n8)5A;FN!VM2(46)7wqD7(|H*@nz5S1rtClzCIr!UC@iIOlmI=qVODzK zBtF)so9gpK-mZ1N3}UbdhHP?95sT9V&JcJKKn>0GtaFyo+X*~D;0*%b0dQ%&;yD^6 zVbwf!nh7m|gt}AIci&BXJWMnHw zGKs2-Qmw9v;U*X<`EV8qnU)EL;E6PdHF|D~DLJuZKJGQHS|h<2nOHO*k1DE)A|?pU zN+u}M*h&oQYSkqZjaX)v%w1mNs)>YzjO4_kxhtxuDvFpOG%NX_h#9IF)YYm>CK|Dt zy~Bz?2IJIn9emaBf2tG=&UAtj^$sThbOdMQ7B)VA0*U__OMT234GOnhi6iKySuqhV zGIXXzVWTix>?TTmc3`Cf)@+_#1n(JV-2^>DnpSUZ4jt)1qS0Yymp|fq199p{;WJ%Nn%#c zZKyL^gpN$2p9hG#rJq-|+0W|&yt$vt6t~|01iuGn>@p|J}pCm404< zHR#XEyR9K>xYF%E9Vu1Q3UK1Ss%SIdV6Xa52Zf+cBADiC%)${?RGEeLR(dMEn3Yqo z+d`lP)@%`$hXT*7Axiyu2-M);tfI}gBdB_V_iC;6-;h3QowdHw=l4N(8-37@Uq=u6 zAU@w~J*M}8wfdXfA8L^Ese#73S>Dn}=^K&}cE3OJw3iKc8|&}cz`qf{R{s=5N)n&> zgesl%w`e|%XEZIJW)X;=-;%)iCZFd~qFvteY;C&FQLwA=ee~Xq90en5U=-Yhb4b*+ z+MC z_e2ttcMBWp1MGm#k1f>L4t8QF@FjYiew=9rs`X+=st8VysMe|T+iEi}&M+CQQLcL!&;D{G4nV{Ng` z0Mz|rrHHnawjLeA^IwQ{PU}{bN4cKIM`xJU!&srkJ{Wbz+&n1ES{0lZ#|%8{{rvVQ ziQw8@uofN4fgN87<&YfiW%M}P^U+nziq zCyCYOm8S{6c;Gi_S|r=t`G%)yz4A20RqL339OHaa*bRl31@AlW#Z3~g&I}RTnQx1$ed|0?6;1nor*I(VF0P_x;m{3VL zIDk?*I3S@pI6R{|IP3>U;ZGnP9O&GtXB;w?P-_32daTt!d92mJz)8gPT z)WYDzC41s>R_;NNognWeeHY&b;lHpzeBL1N9Rg$sQvDtFDIO5=2&x11-j`;skq5(c zt~|a+9F4A-grwucW6~hha&y86^dq!GmM6C#rq(bc5F34-_4dE|#Z%rpS?}#X@{Z@c zeHT0{UrkTd(o=sxMxG^e)C;>7&CyyI9>i5q!~`-6yu@&pF|L~L?f12H9njggNT1=wX{HQKwEEewo#}@8Fzigh zSxnnr>njJuL4NBis*O1yyo~68xI0C);JI$ViqSlzcA5Hj#2i>#A?4U8NvoecaM=6d z?pk6y-Vgq9s;XTm#B+ZD9?Kn-l&{b9RC*#k_g6uWfh~ooTlATs`_yM1tmppF@BiV5 z5l@Jw{aeHABSc4KHS7qzZD>iFr%iBt@!t^g-0zuB2@T0VD(iNNX3QBZ;5ygttQbk> zXrZmpTNLUcJx2xa5{OzZZ}Fdo4rS+@iAxXGg<7Z?K@CGF6>{a7JgK8xr_(}%koP3C zhM$1=kEz0c$9Ip?8sGbgl!wes_?Ot*_d0mW=D}@B)axq387YeRf z+D!Sb7CO~U^Qb4Tnt+P}R?VX_D~lKc4}#Dl(8yX$z)U2Dv!Ik1Kt{G=WTR1axjE*h zVp@@sk4A&mfl)7s0kKBU_o))MrduP2&?1T$`L93*JS?XQtINW<;MlUAurRUpVd|s! zMZ-${RScUa;)oF*U-p@MePSuPKXgNPx=nQ@xWvrMFCk-{sz`ELr^8Z9I%49lFGV-v zTASGLF!)fMF62^l%-^0HwG=%poGw=1httI)>}zX;x5YnR6mbvE6hps*UQJkQ152{aj8Tb~$=lffd;%TpGtxoD2W|jK_bb{; z(&qaewY>UZ%@OM(-g#|#)q3nc?^^8_i0jYrCqI#k!jUyFQau)N*V=NgwZS)0bhcrX zXlF@2j$bihZEXJJ*yh&Eto;6l8}-yi{+j>Xet%(*8ubJ54mDyiP1{8)ccEbK%H`_= zh&k)zCd#EoAc495LScHRkHvQOxqtm~4tpXprJ^_|rHHnfyqm!hZJ@Am%7T;0<3NVlnVFJPb~ESBo!Oxmg1j_i7c*{Y z)|s&DU8M_C2s?<#z1L=t!x0e(GgwbtD3uC!t~eI2r~MYQetY%a2K%h*mg@a^H%t7o zlHXX|`FD7u>!}$W2w^+ryzSPLj{WXz9-$1u&73Y-Hunu~{|yIUsH|enVWA$U+8+iR z=UW6`B5;JjD+Im`P^Z46sJ_(0;G+3HI6o)CzW|tfF6ZJXin4HMc&_25fJCdC!VW=$ z5Df^(P*3IEd=Xv(MRbu#oiXOeWFWhmGeCgu!}&{q@L0%gTL=G&`1}Xr)7_3I!oMeS z;S)fUICd$J&ICik6sYyhe*_(AN$&qNqs@WufB3#Px_2qLerfoLTOG#c$&Qb}gFQYGHT~S49yM$Sm*@!&$~~l~4?j@d;2*GfV$ORrlWRK$SP{Q0A|9-Rt)n84X3#wBv3`G#oSc-CG!Q( ze4!@bVgpti7!>h^Qjn=d5(CJ{Rt&03BoZ1KXPIUhtR!2>L2F3m4+42DnTI{|a81C) z2COzPDB=sHAXAGZ29S}h7*v-?Bs4P4GR-nrNw$)M){x2{1nMO$!ecw?$qTdjLK!2r zBfQ?$WoNF@NVrQ5{tytm9Q&%1FWdEQM%5EOPK?wLWY~h|lneO_yJpMzLcN>waw{Qu zlGQs(Zrz-n&0AtuV!g|vjgD6wjH`x2Gr;J6%DPS^OLfZkb~loqHLiwAr4Nen^aIAfW(|F*pfhCx+ft_ zg&$kJtAI0=Y^UC1zXu+YQf{Jrnr>+?fdd525unJ1;`CN=5RwRm@6Z-Qhr%9;$T|+~ zI&{dK=FlRNFs?ZyKdu|1tA&)gojfF4deQl>D55#M+YJo|_On=Qi9}5JKhIdr*OJk< zApR{GJqzOBlF_{&{w*0JUhrBnM!n#5lWP62?*p*7ncBRNSX%##XRKLDul-Z!N7sHf zT1{`SrMEBi-ZVNFI=!wJye$U_cD+zF4%Li9o^j}AV&H@KzfA1Bk=XfZ(R=Z&YU1r$ z;_U@=|fX{@f*m_f>noxt8L1@?9di~{>RV>@mc4EmFlWiXtIJ$cJuu(m5!1VaT5Bm_o* zgJ!HG8fAz^DXqZ%uGns3