diff --git a/apps/api/.env.example b/apps/api/.env.example
index a2bffd0..659d68f 100644
--- a/apps/api/.env.example
+++ b/apps/api/.env.example
@@ -33,4 +33,6 @@
 STRIPE_PRICE_ID_STANDARD=
 STRIPE_PRICE_ID_SCALE=
 HYPERDX_API_KEY=
-HDX_NODE_BETA_MODE=1
\ No newline at end of file
+HDX_NODE_BETA_MODE=1
+
+FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta
\ No newline at end of file
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index e9082ca..331283e 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -81,7 +81,7 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty("markdown");
       expect(response.body.data).toHaveProperty("metadata");
       expect(response.body.data).not.toHaveProperty("html");
-      expect(response.body.data.content).toContain("🔥 FireCrawl");
+      expect(response.body.data.content).toContain("🔥 Firecrawl");
     }, 30000); // 30 seconds timeout

     it("should return a successful response with a valid API key and includeHtml set to true", async () => {
@@ -99,8 +99,8 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty("markdown");
       expect(response.body.data).toHaveProperty("html");
       expect(response.body.data).toHaveProperty("metadata");
-      expect(response.body.data.content).toContain("🔥 FireCrawl");
-      expect(response.body.data.markdown).toContain("🔥 FireCrawl");
+      expect(response.body.data.content).toContain("🔥 Firecrawl");
+      expect(response.body.data.markdown).toContain("🔥 Firecrawl");
       expect(response.body.data.html).toContain("<h1");
@@ ... @@ describe("E2E Tests for API Routes", () => {
       // 120 seconds
       expect(completedResponse.body.data[0]).toHaveProperty("html");
       expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-      expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
-      expect(completedResponse.body.data[0].markdown).toContain("FireCrawl");
+      expect(completedResponse.body.data[0].content).toContain("🔥 Firecrawl");
+      expect(completedResponse.body.data[0].markdown).toContain("Firecrawl");
       expect(completedResponse.body.data[0].html).toContain("<h1");
@@ ... @@ describe("E2E Tests for API Routes", () => {
       expect(completedResponse.body.data[0]).toHaveProperty("content");
       expect(completedResponse.body.data[0]).toHaveProperty("markdown");
       expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-      expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
+      expect(completedResponse.body.data[0].content).toContain("🔥 Firecrawl");
     }, 60000); // 60 seconds

     it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
@@ -697,8 +697,8 @@ describe("E2E Tests for API Routes", () => {
       // 120 seconds
       expect(completedResponse.body.data[0]).toHaveProperty("html");
       expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-      expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
-      expect(completedResponse.body.data[0].markdown).toContain("FireCrawl");
+      expect(completedResponse.body.data[0].content).toContain("🔥 Firecrawl");
+      expect(completedResponse.body.data[0].markdown).toContain("Firecrawl");
       expect(completedResponse.body.data[0].html).toContain("<h1");
diff --git ...
+const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
+
+const scrapInBatches = async (
+  urls: string[],
+  batchSize: number,
+  delayMs: number
+) => {
+  let successCount = 0;
+  let errorCount = 0;
+
+  for (let i = 0; i < urls.length; i += batchSize) {
+    const batch = urls
+      .slice(i, i + batchSize)
+      .map((url) => scrapWithFireEngine(url));
+    try {
+      const results = await Promise.all(batch);
+      results.forEach((data, index) => {
+        if (data.trim() === "") {
+          errorCount++;
+        } else {
+          successCount++;
+          console.log(
+            `Scraping result ${i + index + 1}:`,
+            data.trim().substring(0, 20) + "..."
+          );
+        }
+      });
+    } catch (error) {
+      console.error("Error during scraping:", error);
+    }
+    await delay(delayMs);
+  }
+
+  console.log(`Total successful scrapes: ${successCount}`);
+  console.log(`Total errored scrapes: ${errorCount}`);
+};
+function run() {
+  const urls = Array.from({ length: 200 }, () => "https://scrapethissite.com");
+  scrapInBatches(urls, 10, 1000);
+}
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 4c08168..419bdba 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -10,6 +10,15 @@ import { fetchAndProcessPdf } from "./utils/pdfProcessor";

 dotenv.config();

+const baseScrapers = [
+  "fire-engine",
+  "scrapingBee",
+  "playwright",
+  "scrapingBeeLoad",
+  "fetch",
+] as const;
+
+
 export async function generateRequestParams(
   url: string,
   wait_browser: string = "domcontentloaded",
@@ -33,15 +42,39 @@ export async function generateRequestParams(
     return defaultParams;
   }
 }
-export async function scrapWithCustomFirecrawl(
+export async function scrapWithFireEngine(
   url: string,
   options?: any
 ): Promise<string> {
   try {
-    // TODO: merge the custom firecrawl scraper into mono-repo when ready
-    return null;
+    const reqParams = await generateRequestParams(url);
+    const wait_playwright = reqParams["params"]?.wait ?? 0;
+
+    const response = await fetch(process.env.FIRE_ENGINE_BETA_URL+ "/scrape", {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({ url: url, wait: wait_playwright }),
+    });
+
+    if (!response.ok) {
+      console.error(
+        `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
+      );
+      return "";
+    }
+
+    const contentType = response.headers['content-type'];
+    if (contentType && contentType.includes('application/pdf')) {
+      return fetchAndProcessPdf(url);
+    } else {
+      const data = await response.json();
+      const html = data.content;
+      return html ?? "";
+    }
   } catch (error) {
-    console.error(`Error scraping with custom firecrawl-scraper: ${error}`);
+    console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
     return "";
   }
 }
@@ -63,7 +96,7 @@ export async function scrapWithScrapingBee(

     if (response.status !== 200 && response.status !== 404) {
       console.error(
-        `Scraping bee error in ${url} with status code ${response.status}`
+        `[ScrapingBee] Error fetching url: ${url} with status code ${response.status}`
       );
       return "";
     }
@@ -77,7 +110,7 @@ export async function scrapWithScrapingBee(
       return text;
     }
   } catch (error) {
-    console.error(`Error scraping with Scraping Bee: ${error}`);
+    console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
     return "";
   }
 }
@@ -97,7 +130,7 @@ export async function scrapWithPlaywright(url: string): Promise<string> {

     if (!response.ok) {
       console.error(
-        `Error fetching w/ playwright server -> URL: ${url} with status: ${response.status}`
+        `[Playwright] Error fetching url: ${url} with status: ${response.status}`
       );
       return "";
     }
@@ -111,11 +144,62 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
       return html ?? "";
""; } } catch (error) { - console.error(`Error scraping with Puppeteer: ${error}`); + console.error(`[Playwright][c] Error fetching url: ${url} -> ${error}`); return ""; } } +export async function scrapWithFetch(url: string): Promise { + try { + const response = await fetch(url); + if (!response.ok) { + console.error( + `[Fetch] Error fetching url: ${url} with status: ${response.status}` + ); + return ""; + } + + const contentType = response.headers['content-type']; + if (contentType && contentType.includes('application/pdf')) { + return fetchAndProcessPdf(url); + } else { + const text = await response.text(); + return text; + } + } catch (error) { + console.error(`[Fetch][c] Error fetching url: ${url} -> ${error}`); + return ""; + } +} + +/** + * Get the order of scrapers to be used for scraping a URL + * If the user doesn't have envs set for a specific scraper, it will be removed from the order. + * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined + * @returns The order of scrapers to be used for scraping a URL + */ +function getScrapingFallbackOrder(defaultScraper?: string) { + const availableScrapers = baseScrapers.filter(scraper => { + switch (scraper) { + case "scrapingBee": + case "scrapingBeeLoad": + return !!process.env.SCRAPING_BEE_API_KEY; + case "fire-engine": + return !!process.env.FIRE_ENGINE_BETA_URL; + case "playwright": + return !!process.env.PLAYWRIGHT_MICROSERVICE_URL; + default: + return true; + } + }); + + const defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"]; + const filteredDefaultOrder = defaultOrder.filter((scraper: typeof baseScrapers[number]) => availableScrapers.includes(scraper)); + const uniqueScrapers = new Set(defaultScraper ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers] : [...filteredDefaultOrder, ...availableScrapers]); + const scrapersInOrder = Array.from(uniqueScrapers); + return scrapersInOrder as typeof baseScrapers[number][]; +} + export async function scrapSingleUrl( urlToScrap: string, pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }, @@ -137,17 +221,14 @@ export async function scrapSingleUrl( const attemptScraping = async ( url: string, - method: - | "firecrawl-scraper" - | "scrapingBee" - | "playwright" - | "scrapingBeeLoad" - | "fetch" + method: typeof baseScrapers[number] ) => { let text = ""; switch (method) { - case "firecrawl-scraper": - text = await scrapWithCustomFirecrawl(url); + case "fire-engine": + if (process.env.FIRE_ENGINE_BETA_URL) { + text = await scrapWithFireEngine(url); + } break; case "scrapingBee": if (process.env.SCRAPING_BEE_API_KEY) { @@ -169,25 +250,7 @@ export async function scrapSingleUrl( } break; case "fetch": - try { - const response = await fetch(url); - if (!response.ok) { - console.error( - `Error fetching URL: ${url} with status: ${response.status}` - ); - return ""; - } - - const contentType = response.headers['content-type']; - if (contentType && contentType.includes('application/pdf')) { - return fetchAndProcessPdf(url); - } else { - text = await response.text(); - } - } catch (error) { - console.error(`Error scraping URL: ${error}`); - return ""; - } + text = await scrapWithFetch(url); break; } @@ -205,15 +268,7 @@ export async function scrapSingleUrl( console.error(`Invalid URL key, trying: ${urlToScrap}`); } const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? ""; - const scrapersInOrder = defaultScraper - ? 
-        defaultScraper,
-        "scrapingBee",
-        "playwright",
-        "scrapingBeeLoad",
-        "fetch",
-      ]
-    : ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
+  const scrapersInOrder = getScrapingFallbackOrder(defaultScraper)

   for (const scraper of scrapersInOrder) {
     // If exists text coming from crawler, use it
@@ -225,7 +280,10 @@ export async function scrapSingleUrl(
     }
     [text, html] = await attemptScraping(urlToScrap, scraper);
     if (text && text.trim().length >= 100) break;
-    console.log(`Falling back to ${scraper}`);
+    const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
+    if (nextScraperIndex < scrapersInOrder.length) {
+      console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
+    }
   }

   if (!text) {
diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
index 32f5c08..9094fc3 100644
--- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
+++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
@@ -63,7 +63,7 @@ export const urlSpecificParams = {
     },
   },
   "ycombinator.com":{
-    defaultScraper: "playwright",
+    defaultScraper: "fire-engine",
     params: {
       wait_browser: "networkidle2",
       block_resources: false,
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 065f6d7..6772c57 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -7,7 +7,9 @@ import { callWebhook } from "./webhook";
 import { logJob } from "./logging/log_job";
 import { initSDK } from '@hyperdx/node-opentelemetry';

-initSDK({ consoleCapture: true, additionalInstrumentations: []});
+if(process.env.ENV === 'production') {
+  initSDK({ consoleCapture: true, additionalInstrumentations: []});
+}

 getWebScraperQueue().process(
   Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts
index 5bc48c9..5fa0964 100644
--- a/apps/api/src/services/rate-limiter.ts
+++ b/apps/api/src/services/rate-limiter.ts
@@ -54,7 +54,7 @@ export const testSuiteRateLimiter = new RateLimiterRedis({

 export function getRateLimiter(mode: RateLimiterMode, token: string, plan?: string){
   // Special test suite case. TODO: Change this later.
-  if (token.includes("5089cefa58")){
+  if (token.includes("5089cefa58") || token.includes("6254cf9")){
     return testSuiteRateLimiter;
   }
   switch (mode) {