From a5e718b0840a2888be2ad5105dfdcdc132313651 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Tue, 21 May 2024 18:34:23 -0700
Subject: [PATCH] Nick: improvements

---
 apps/api/.env.example                         |  4 +-
 apps/api/src/lib/load-testing-example.ts      | 42 ++++++++++
 apps/api/src/scraper/WebScraper/single_url.ts | 82 +++++++++++++------
 .../WebScraper/utils/custom/website_params.ts |  2 +-
 4 files changed, 102 insertions(+), 28 deletions(-)
 create mode 100644 apps/api/src/lib/load-testing-example.ts

diff --git a/apps/api/.env.example b/apps/api/.env.example
index a2bffd0..659d68f 100644
--- a/apps/api/.env.example
+++ b/apps/api/.env.example
@@ -33,4 +33,6 @@
 STRIPE_PRICE_ID_STANDARD=
 STRIPE_PRICE_ID_SCALE=
 HYPERDX_API_KEY=
-HDX_NODE_BETA_MODE=1
\ No newline at end of file
+HDX_NODE_BETA_MODE=1
+
+FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta
\ No newline at end of file

diff --git a/apps/api/src/lib/load-testing-example.ts b/apps/api/src/lib/load-testing-example.ts
new file mode 100644
index 0000000..6fd56fc
--- /dev/null
+++ b/apps/api/src/lib/load-testing-example.ts
@@ -0,0 +1,42 @@
+import { scrapWithFireEngine } from "../../src/scraper/WebScraper/single_url";
+
+const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
+
+const scrapInBatches = async (
+  urls: string[],
+  batchSize: number,
+  delayMs: number
+) => {
+  let successCount = 0;
+  let errorCount = 0;
+
+  for (let i = 0; i < urls.length; i += batchSize) {
+    const batch = urls
+      .slice(i, i + batchSize)
+      .map((url) => scrapWithFireEngine(url));
+    try {
+      const results = await Promise.all(batch);
+      results.forEach((data, index) => {
+        if (data.trim() === "") {
+          errorCount++;
+        } else {
+          successCount++;
+          console.log(
+            `Scraping result ${i + index + 1}:`,
+            data.trim().substring(0, 20) + "..."
+          );
+        }
+      });
+    } catch (error) {
+      console.error("Error during scraping:", error);
+    }
+    await delay(delayMs);
+  }
+
+  console.log(`Total successful scrapes: ${successCount}`);
+  console.log(`Total errored scrapes: ${errorCount}`);
+};
+function run() {
+  const urls = Array.from({ length: 200 }, () => "https://scrapethissite.com");
+  scrapInBatches(urls, 10, 1000);
+}

diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 4c08168..f58ec77 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -10,6 +10,15 @@ import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 
 dotenv.config();
 
+const baseScrapers = [
+  "fire-engine",
+  "scrapingBee",
+  "playwright",
+  "scrapingBeeLoad",
+  "fetch",
+] as const;
+
+
 export async function generateRequestParams(
   url: string,
   wait_browser: string = "domcontentloaded",
@@ -33,15 +42,39 @@
     return defaultParams;
   }
 }
-export async function scrapWithCustomFirecrawl(
+export async function scrapWithFireEngine(
   url: string,
   options?: any
 ): Promise<string> {
   try {
-    // TODO: merge the custom firecrawl scraper into mono-repo when ready
-    return null;
+    const reqParams = await generateRequestParams(url);
+    const wait_playwright = reqParams["params"]?.wait ?? 0;
+
+    const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({ url: url, wait: wait_playwright }),
+    });
+
+    if (!response.ok) {
+      console.error(
+        `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
+      );
+      return "";
+    }
+
+    const contentType = response.headers.get("content-type");
+    if (contentType && contentType.includes("application/pdf")) {
+      return fetchAndProcessPdf(url);
+    } else {
+      const data = await response.json();
+      const html = data.content;
+      return html ?? "";
+    }
   } catch (error) {
-    console.error(`Error scraping with custom firecrawl-scraper: ${error}`);
+    console.error(`Error scraping with Fire Engine: ${error}`);
     return "";
   }
 }
@@ -63,7 +96,7 @@
 
     if (response.status !== 200 && response.status !== 404) {
       console.error(
-        `Scraping bee error in ${url} with status code ${response.status}`
+        `[ScrapingBee] Error fetching url: ${url} with status code ${response.status}`
       );
       return "";
     }
@@ -77,7 +110,7 @@
       return text;
     }
   } catch (error) {
-    console.error(`Error scraping with Scraping Bee: ${error}`);
+    console.error(`[ScrapingBee] Error fetching url: ${url} -> ${error}`);
     return "";
   }
 }
@@ -97,7 +130,7 @@
 
     if (!response.ok) {
       console.error(
-        `Error fetching w/ playwright server -> URL: ${url} with status: ${response.status}`
+        `[Playwright] Error fetching url: ${url} with status: ${response.status}`
       );
       return "";
     }
@@ -111,11 +144,18 @@
       return html ?? "";
     }
   } catch (error) {
-    console.error(`Error scraping with Puppeteer: ${error}`);
+    console.error(`Error scraping with Playwright: ${error}`);
     return "";
   }
 }
 
+function getScrapingFallbackOrder(defaultScraper?: string) {
+  const fireEngineScraper = process.env.FIRE_ENGINE_BETA_URL ? ["fire-engine"] : [];
+  const uniqueScrapers = new Set(defaultScraper ? [defaultScraper, ...fireEngineScraper, ...baseScrapers] : [...fireEngineScraper, ...baseScrapers]);
+  const scrapersInOrder = Array.from(uniqueScrapers);
+  return scrapersInOrder as typeof baseScrapers[number][];
+}
+
 export async function scrapSingleUrl(
   urlToScrap: string,
   pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
@@ -137,17 +177,12 @@
 
   const attemptScraping = async (
     url: string,
-    method:
-      | "firecrawl-scraper"
-      | "scrapingBee"
-      | "playwright"
-      | "scrapingBeeLoad"
-      | "fetch"
+    method: typeof baseScrapers[number]
   ) => {
     let text = "";
     switch (method) {
-      case "firecrawl-scraper":
-        text = await scrapWithCustomFirecrawl(url);
+      case "fire-engine":
+        text = await scrapWithFireEngine(url);
         break;
       case "scrapingBee":
         if (process.env.SCRAPING_BEE_API_KEY) {
@@ -205,15 +240,7 @@
     console.error(`Invalid URL key, trying: ${urlToScrap}`);
   }
   const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
-  const scrapersInOrder = defaultScraper
-    ? [
-        defaultScraper,
-        "scrapingBee",
-        "playwright",
-        "scrapingBeeLoad",
-        "fetch",
-      ]
-    : ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
+  const scrapersInOrder = getScrapingFallbackOrder(defaultScraper);
 
   for (const scraper of scrapersInOrder) {
     // If exists text coming from crawler, use it
@@ -225,7 +252,10 @@
     }
     [text, html] = await attemptScraping(urlToScrap, scraper);
     if (text && text.trim().length >= 100) break;
-    console.log(`Falling back to ${scraper}`);
+    const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
+    if (nextScraperIndex < scrapersInOrder.length) {
+      console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
+    }
   }
 
   if (!text) {

diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
index 32f5c08..9094fc3 100644
--- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
+++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
@@ -63,7 +63,7 @@
     },
   },
   "ycombinator.com":{
-    defaultScraper: "playwright",
+    defaultScraper: "fire-engine",
     params: {
      wait_browser: "networkidle2",
      block_resources: false,
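
Editor's note: a quick way to sanity-check the new fallback ordering introduced in single_url.ts. This is a minimal standalone sketch, not part of the patch: it copies baseScrapers and getScrapingFallbackOrder out of the patched file so it can run on its own, and the FIRE_ENGINE_BETA_URL value below is a hypothetical placeholder.

// fallback-order-check.ts (sketch; run with: npx ts-node fallback-order-check.ts)
const baseScrapers = [
  "fire-engine",
  "scrapingBee",
  "playwright",
  "scrapingBeeLoad",
  "fetch",
] as const;

function getScrapingFallbackOrder(defaultScraper?: string) {
  // Fire Engine is only promoted to the front of the order when the beta URL is configured.
  const fireEngineScraper = process.env.FIRE_ENGINE_BETA_URL ? ["fire-engine"] : [];
  // Set preserves first-insertion order, so a site-specific defaultScraper wins,
  // then fire-engine (if enabled), then the remaining base scrapers, deduplicated.
  const uniqueScrapers = new Set(
    defaultScraper
      ? [defaultScraper, ...fireEngineScraper, ...baseScrapers]
      : [...fireEngineScraper, ...baseScrapers]
  );
  return Array.from(uniqueScrapers) as (typeof baseScrapers)[number][];
}

process.env.FIRE_ENGINE_BETA_URL = "http://localhost:3000"; // hypothetical beta endpoint
console.log(getScrapingFallbackOrder());
// -> [ "fire-engine", "scrapingBee", "playwright", "scrapingBeeLoad", "fetch" ]
console.log(getScrapingFallbackOrder("playwright"));
// -> [ "playwright", "fire-engine", "scrapingBee", "scrapingBeeLoad", "fetch" ]

Note that because baseScrapers itself lists "fire-engine", that scraper stays in the fallback order even when FIRE_ENGINE_BETA_URL is unset; the env check only controls whether it is tried first.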