From 93627ae87c612e985a59cf26501988f0043b0b8c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 16 Apr 2024 12:06:46 -0400 Subject: [PATCH] Nick: --- README.md | 2 +- SELF_HOST.md | 2 +- apps/api/src/lib/parseApi.ts | 1 - apps/api/src/scraper/WebScraper/single_url.ts | 67 ++++++++++++++----- 4 files changed, 52 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 2e6d317..53446df 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Crawl and convert any website into LLM-ready markdown. Build by [Mendable.ai](https://mendable.ai?ref=gfirecrawl) -*This repo is still in early development and its main purpose is to help improve accuracy of LLM response given clean data* +*This repository is currently in its early stages of development. We are in the process of merging custom modules into this monorepo. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not yet ready for full self-hosting - we're working on it* ## What is Firecrawl? diff --git a/SELF_HOST.md b/SELF_HOST.md index 3536ac3..ba0ae23 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -2,5 +2,5 @@ Guide coming soon. - +*This repository is currently in its early stages of development. We are in the process of merging custom modules into this monorepo. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. 
It is not yet ready for full self-hosting - we're working on it* diff --git a/apps/api/src/lib/parseApi.ts b/apps/api/src/lib/parseApi.ts index 8e1a354..4b03a40 100644 --- a/apps/api/src/lib/parseApi.ts +++ b/apps/api/src/lib/parseApi.ts @@ -13,7 +13,6 @@ export function parseApi(api: string) { return uuid; } -console.log(parseApi("fc-a6a2d63aed2b46a9946d2a7207efed4d")) export function uuidToFcUuid(uuid: string) { const uuidWithoutDashes = uuid.replace(/-/g, ""); diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 8bb5160..0cdbe51 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -1,6 +1,5 @@ import * as cheerio from "cheerio"; import { ScrapingBeeClient } from "scrapingbee"; -import { attemptScrapWithRequests, sanitizeText } from "./utils/utils"; import { extractMetadata } from "./utils/metadata"; import dotenv from "dotenv"; import { Document } from "../../lib/entities"; @@ -9,9 +8,23 @@ import { parseMarkdown } from "../../lib/html-to-markdown"; dotenv.config(); +export async function scrapWithCustomFirecrawl( + url: string, + options?: any +): Promise { + try { + // TODO: merge the custom firecrawl scraper into mono-repo when ready + return null; + } catch (error) { + console.error(`Error scraping with custom firecrawl-scraper: ${error}`); + return ""; + } +} - -export async function scrapWithScrapingBee(url: string, wait_browser:string = "domcontentloaded"): Promise { +export async function scrapWithScrapingBee( + url: string, + wait_browser: string = "domcontentloaded" +): Promise { try { const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); const response = await client.get({ @@ -35,11 +48,10 @@ export async function scrapWithScrapingBee(url: string, wait_browser:string = "d } } - export async function scrapWithPlaywright(url: string): Promise { try { const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, 
{ - method: 'POST', + method: "POST", headers: { "Content-Type": "application/json", }, @@ -47,7 +59,9 @@ export async function scrapWithPlaywright(url: string): Promise { }); if (!response.ok) { - console.error(`Error fetching w/ playwright server -> URL: ${url} with status: ${response.status}`); + console.error( + `Error fetching w/ playwright server -> URL: ${url} with status: ${response.status}` + ); return ""; } @@ -73,29 +87,42 @@ export async function scrapSingleUrl( return soup.html(); }; - const attemptScraping = async (url: string, method: 'scrapingBee' | 'playwright' | 'scrapingBeeLoad' | 'fetch') => { + const attemptScraping = async ( + url: string, + method: + | "firecrawl-scraper" + | "scrapingBee" + | "playwright" + | "scrapingBeeLoad" + | "fetch" + ) => { let text = ""; switch (method) { - case 'scrapingBee': + case "firecrawl-scraper": + text = await scrapWithCustomFirecrawl(url); + break; + case "scrapingBee": if (process.env.SCRAPING_BEE_API_KEY) { text = await scrapWithScrapingBee(url); } break; - case 'playwright': + case "playwright": if (process.env.PLAYWRIGHT_MICROSERVICE_URL) { text = await scrapWithPlaywright(url); } break; - case 'scrapingBeeLoad': + case "scrapingBeeLoad": if (process.env.SCRAPING_BEE_API_KEY) { text = await scrapWithScrapingBee(url, "networkidle2"); } break; - case 'fetch': + case "fetch": try { const response = await fetch(url); if (!response.ok) { - console.error(`Error fetching URL: ${url} with status: ${response.status}`); + console.error( + `Error fetching URL: ${url} with status: ${response.status}` + ); return ""; } text = await response.text(); @@ -104,26 +131,32 @@ export async function scrapSingleUrl( return ""; } break; - } const cleanedHtml = removeUnwantedElements(text); return [await parseMarkdown(cleanedHtml), text]; }; try { - let [text, html ] = await attemptScraping(urlToScrap, 'scrapingBee'); + // TODO: uncomment this once we're ready to merge firecrawl-scraper into the mono-repo + // let [text, 
html] = await attemptScraping(urlToScrap, 'firecrawl-scraper'); + // if (!text || text.length < 100) { + // console.log("Falling back to scraping bee load"); + // [text, html] = await attemptScraping(urlToScrap, 'scrapingBeeLoad'); + // } + + let [text, html] = await attemptScraping(urlToScrap, "scrapingBee"); if (!text || text.length < 100) { console.log("Falling back to playwright"); - [text, html] = await attemptScraping(urlToScrap, 'playwright'); + [text, html] = await attemptScraping(urlToScrap, "playwright"); } if (!text || text.length < 100) { console.log("Falling back to scraping bee load"); - [text, html] = await attemptScraping(urlToScrap, 'scrapingBeeLoad'); + [text, html] = await attemptScraping(urlToScrap, "scrapingBeeLoad"); } if (!text || text.length < 100) { console.log("Falling back to fetch"); - [text, html] = await attemptScraping(urlToScrap, 'fetch'); + [text, html] = await attemptScraping(urlToScrap, "fetch"); } const soup = cheerio.load(html);