From 1b3547dcf2944fb3dfdd11c57edba05c91b4bc5f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 28 May 2024 12:56:24 -0700 Subject: [PATCH] Nick: --- apps/api/openapi.json | 5 +++ .../src/__tests__/e2e_withAuth/index.test.ts | 20 +++++++++++ apps/api/src/controllers/scrape.ts | 2 +- apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/single_url.ts | 34 +++++++++++++------ 5 files changed, 50 insertions(+), 12 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index b483bc4..ab452ff 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -50,6 +50,11 @@ "type": "boolean", "description": "Include the raw HTML content of the page. Will output a html key in the response.", "default": false + }, + "waitFor": { + "type": "integer", + "description": "Number of milliseconds to wait for the page to load before fetching its content.", + "default": 0 } } }, diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index e039531..797c6f2 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -134,6 +134,26 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); }, 60000); // 60 seconds + + it("should return a successful response with a valid API key and waitFor option", async () => { + const startTime = Date.now(); + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev", pageOptions: { waitFor: 7000 } }); + const endTime = Date.now(); + const duration = endTime - startTime; + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + 
expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.content).toContain("🔥 Firecrawl"); + expect(duration).toBeGreaterThanOrEqual(7000); + }, 12000); // 12 seconds timeout }); describe("POST /v0/crawl", () => { diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 0b3f146..f01e7e4 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -102,7 +102,7 @@ export async function scrapeController(req: Request, res: Response) { return res.status(status).json({ error }); } const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0 }; const extractorOptions = req.body.extractorOptions ?? { mode: "markdown" } diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index ab0a0ef..1bd774d 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -15,6 +15,7 @@ export type PageOptions = { includeHtml?: boolean; fallback?: boolean; fetchPageContent?: boolean; + waitFor?: number; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 419bdba..7022504 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -44,18 +44,21 @@ export async function generateRequestParams( } export async function scrapWithFireEngine( url: string, + waitFor: number = 0, options?: any ): Promise { try { const reqParams = await generateRequestParams(url); - const wait_playwright = reqParams["params"]?.wait ?? 
0; + // Prefer the wait value from the generated request params; otherwise fall back to the caller-supplied waitFor + const waitParam = reqParams["params"]?.wait ?? waitFor; + console.log(`[Fire-Engine] Scraping ${url} with wait: ${waitParam}`); const response = await fetch(process.env.FIRE_ENGINE_BETA_URL+ "/scrape", { method: "POST", headers: { "Content-Type": "application/json", }, - body: JSON.stringify({ url: url, wait: wait_playwright }), + body: JSON.stringify({ url: url, wait: waitParam }), }); if (!response.ok) { @@ -115,17 +118,18 @@ export async function scrapWithScrapingBee( } } -export async function scrapWithPlaywright(url: string): Promise { +export async function scrapWithPlaywright(url: string, waitFor: number = 0): Promise { try { const reqParams = await generateRequestParams(url); - const wait_playwright = reqParams["params"]?.wait ?? 0; + // Prefer the wait value from the generated request params; otherwise fall back to the caller-supplied waitFor + const waitParam = reqParams["params"]?.wait ?? waitFor; const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, { method: "POST", headers: { "Content-Type": "application/json", }, - body: JSON.stringify({ url: url, wait: wait_playwright }), + body: JSON.stringify({ url: url, wait: waitParam }), }); if (!response.ok) { @@ -178,7 +182,7 @@ export async function scrapWithFetch(url: string): Promise { * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined * @returns The order of scrapers to be used for scraping a URL */ -function getScrapingFallbackOrder(defaultScraper?: string) { +function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false) { const availableScrapers = baseScrapers.filter(scraper => { switch (scraper) { case "scrapingBee": @@ -193,16 +197,22 @@ function getScrapingFallbackOrder(defaultScraper?: string) { } }); - const defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"]; + let defaultOrder = ["scrapingBee", "fire-engine", "playwright", 
"scrapingBeeLoad", "fetch"]; + + if (isWaitPresent) { + defaultOrder = ["fire-engine", "playwright", ...defaultOrder.filter(scraper => scraper !== "fire-engine" && scraper !== "playwright")]; + } + const filteredDefaultOrder = defaultOrder.filter((scraper: typeof baseScrapers[number]) => availableScrapers.includes(scraper)); const uniqueScrapers = new Set(defaultScraper ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers] : [...filteredDefaultOrder, ...availableScrapers]); const scrapersInOrder = Array.from(uniqueScrapers); + console.log(`Scrapers in order: ${scrapersInOrder}`); return scrapersInOrder as typeof baseScrapers[number][]; } export async function scrapSingleUrl( urlToScrap: string, - pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }, + pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0}, existingHtml: string = "" ): Promise { urlToScrap = urlToScrap.trim(); @@ -227,7 +237,9 @@ export async function scrapSingleUrl( switch (method) { case "fire-engine": if (process.env.FIRE_ENGINE_BETA_URL) { - text = await scrapWithFireEngine(url); + console.log(`Scraping ${url} with Fire Engine`); + + text = await scrapWithFireEngine(url, pageOptions.waitFor); } break; case "scrapingBee": @@ -241,7 +253,7 @@ export async function scrapSingleUrl( break; case "playwright": if (process.env.PLAYWRIGHT_MICROSERVICE_URL) { - text = await scrapWithPlaywright(url); + text = await scrapWithPlaywright(url, pageOptions.waitFor); } break; case "scrapingBeeLoad": @@ -268,7 +280,7 @@ export async function scrapSingleUrl( console.error(`Invalid URL key, trying: ${urlToScrap}`); } const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? 
""; - const scrapersInOrder = getScrapingFallbackOrder(defaultScraper) + const scrapersInOrder = getScrapingFallbackOrder(defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0) for (const scraper of scrapersInOrder) { // If exists text coming from crawler, use it