From a0e404f94ea52f12259713f7bbe6c151ff41e1ec Mon Sep 17 00:00:00 2001
From: Eric Ciarla
Date: Wed, 29 May 2024 18:56:57 -0400
Subject: [PATCH 1/3] init commit

---
 apps/api/src/controllers/scrape.ts            |  2 +-
 apps/api/src/lib/entities.ts                  |  1 +
 apps/api/src/scraper/WebScraper/single_url.ts | 60 ++++++++++++-------
 3 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index f01e7e4..b554165 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -102,7 +102,7 @@ export async function scrapeController(req: Request, res: Response) {
       return res.status(status).json({ error });
     }
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0 };
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
     const extractorOptions = req.body.extractorOptions ?? {
       mode: "markdown"
     }
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 1bd774d..23469f0 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -16,6 +16,7 @@ export type PageOptions = {
   fallback?: boolean;
   fetchPageContent?: boolean;
   waitFor?: number;
+  screenshot?: boolean;
 };
 
 export type ExtractorOptions = {
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 4dfe79f..7e33827 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -45,12 +45,14 @@ export async function generateRequestParams(
 export async function scrapWithFireEngine(
   url: string,
   waitFor: number = 0,
+  screenshot: boolean = false,
   options?: any
-): Promise<string> {
+): Promise<[string, string]> {
   try {
     const reqParams = await generateRequestParams(url);
     // If the user has passed a wait parameter in the request, use that
     const waitParam = reqParams["params"]?.wait ?? waitFor;
+    const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
     console.log(`[Fire-Engine] Scraping ${url} with wait: ${waitParam}`);
 
     const response = await fetch(process.env.FIRE_ENGINE_BETA_URL+ "/scrape", {
@@ -58,27 +60,28 @@ export async function scrapWithFireEngine(
       method: "POST",
       headers: {
         "Content-Type": "application/json",
       },
-      body: JSON.stringify({ url: url, wait: waitParam }),
+      body: JSON.stringify({ url: url, wait: waitParam, screenshot: screenshotParam }),
     });
 
     if (!response.ok) {
       console.error(
         `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
       );
-      return "";
+      return ["", ""];
     }
 
     const contentType = response.headers['content-type'];
     if (contentType && contentType.includes('application/pdf')) {
-      return fetchAndProcessPdf(url);
+      return [await fetchAndProcessPdf(url), ""];
     } else {
       const data = await response.json();
       const html = data.content;
-      return html ?? "";
+      const screenshot = data.screenshot;
+      return [html ?? "", screenshot ?? ""];
""]; } } catch (error) { console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); - return ""; + return ["", ""]; } } @@ -182,7 +185,7 @@ export async function scrapWithFetch(url: string): Promise { * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined * @returns The order of scrapers to be used for scraping a URL */ -function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false) { +function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false, isScreenshotPresent: boolean = false) { const availableScrapers = baseScrapers.filter(scraper => { switch (scraper) { case "scrapingBee": @@ -199,7 +202,7 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea let defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"]; - if (isWaitPresent) { + if (isWaitPresent || isScreenshotPresent) { defaultOrder = ["fire-engine", "playwright", ...defaultOrder.filter(scraper => scraper !== "fire-engine" && scraper !== "playwright")]; } @@ -210,7 +213,7 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea return scrapersInOrder as typeof baseScrapers[number][]; } -async function handleCustomScraping(text: string, url: string): Promise { +async function handleCustomScraping(text: string, url: string): Promise<[string, string] | null> { if (text.includes(' { urlToScrap = urlToScrap.trim(); @@ -242,12 +245,13 @@ export async function scrapSingleUrl( method: typeof baseScrapers[number] ) => { let text = ""; + let screenshot = ""; switch (method) { case "fire-engine": if (process.env.FIRE_ENGINE_BETA_URL) { console.log(`Scraping ${url} with Fire Engine`); - text = await scrapWithFireEngine(url, pageOptions.waitFor); + [text, screenshot] = await scrapWithFireEngine(url, pageOptions.waitFor, pageOptions.screenshot); } break; case "scrapingBee": @@ -277,16 +281,17 @@ export async function scrapSingleUrl( // Check for custom scraping conditions const customScrapedContent = await handleCustomScraping(text, url); if (customScrapedContent) { - text = customScrapedContent; + text = customScrapedContent[0]; + screenshot = customScrapedContent[1]; } //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(text, pageOptions); - return [await parseMarkdown(cleanedHtml), text]; + return [await parseMarkdown(cleanedHtml), text, screenshot]; }; try { - let [text, html] = ["", ""]; + let [text, html, screenshot] = ["", "", ""]; let urlKey = urlToScrap; try { urlKey = new URL(urlToScrap).hostname.replace(/^www\./, ""); @@ -294,7 +299,7 @@ export async function scrapSingleUrl( console.error(`Invalid URL key, trying: ${urlToScrap}`); } const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? 
""; - const scrapersInOrder = getScrapingFallbackOrder(defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0) + const scrapersInOrder = getScrapingFallbackOrder(defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, pageOptions && pageOptions.screenshot && pageOptions.screenshot === true) for (const scraper of scrapersInOrder) { // If exists text coming from crawler, use it @@ -304,7 +309,7 @@ export async function scrapSingleUrl( html = existingHtml; break; } - [text, html] = await attemptScraping(urlToScrap, scraper); + [text, html, screenshot] = await attemptScraping(urlToScrap, scraper); if (text && text.trim().length >= 100) break; const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; if (nextScraperIndex < scrapersInOrder.length) { @@ -318,12 +323,23 @@ export async function scrapSingleUrl( const soup = cheerio.load(html); const metadata = extractMetadata(soup, urlToScrap); - const document: Document = { - content: text, - markdown: text, - html: pageOptions.includeHtml ? html : undefined, - metadata: { ...metadata, sourceURL: urlToScrap }, - }; + + let document: Document; + if(screenshot && screenshot.length > 0) { + document = { + content: text, + markdown: text, + html: pageOptions.includeHtml ? html : undefined, + metadata: { ...metadata, screenshot_base64: screenshot, sourceURL: urlToScrap, }, + } + }else{ + document = { + content: text, + markdown: text, + html: pageOptions.includeHtml ? html : undefined, + metadata: { ...metadata, sourceURL: urlToScrap, }, + } + } return document; } catch (error) { From 37915e11e85d363483794f8f1ad3ff3519dd61f3 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Wed, 29 May 2024 21:18:24 -0400 Subject: [PATCH 2/3] Final push --- apps/api/src/scraper/WebScraper/single_url.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 7e33827..4f521e8 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -53,7 +53,7 @@ export async function scrapWithFireEngine( // If the user has passed a wait parameter in the request, use that const waitParam = reqParams["params"]?.wait ?? waitFor; const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; - console.log(`[Fire-Engine] Scraping ${url} with wait: ${waitParam}`); + console.log(`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`); const response = await fetch(process.env.FIRE_ENGINE_BETA_URL+ "/scrape", { method: "POST", @@ -330,7 +330,7 @@ export async function scrapSingleUrl( content: text, markdown: text, html: pageOptions.includeHtml ? 
-        metadata: { ...metadata, screenshot_base64: screenshot, sourceURL: urlToScrap, },
+        metadata: { ...metadata, screenshot: screenshot, sourceURL: urlToScrap, },
       }
     }else{
       document = {

From 6c939d534d26342e936b97f3a88f936d96a39dc4 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Wed, 29 May 2024 19:43:51 -0700
Subject: [PATCH 3/3] Nick: small refactor

---
 apps/api/src/lib/entities.ts                  |  5 +++++
 apps/api/src/scraper/WebScraper/single_url.ts | 19 ++++++++++---------
 apps/api/src/types.ts                         |  2 +-
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 23469f0..48d3e89 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -106,4 +106,9 @@ export class SearchResult {
   toString(): string {
     return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`;
   }
+}
+
+export interface FireEngineResponse {
+  html: string;
+  screenshot: string;
 }
\ No newline at end of file
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 4f521e8..c7f9469 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -2,7 +2,7 @@ import * as cheerio from "cheerio";
 import { ScrapingBeeClient } from "scrapingbee";
 import { extractMetadata } from "./utils/metadata";
 import dotenv from "dotenv";
-import { Document, PageOptions } from "../../lib/entities";
+import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { excludeNonMainTags } from "./utils/excludeTags";
 import { urlSpecificParams } from "./utils/custom/website_params";
@@ -47,7 +47,7 @@ export async function scrapWithFireEngine(
   waitFor: number = 0,
   screenshot: boolean = false,
   options?: any
-): Promise<[string, string]> {
+): Promise<FireEngineResponse> {
   try {
     const reqParams = await generateRequestParams(url);
     // If the user has passed a wait parameter in the request, use that
@@ -67,21 +67,21 @@ export async function scrapWithFireEngine(
       console.error(
         `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
       );
-      return ["", ""];
+      return { html: "", screenshot: "" };
     }
 
     const contentType = response.headers['content-type'];
     if (contentType && contentType.includes('application/pdf')) {
-      return [await fetchAndProcessPdf(url), ""];
+      return { html: await fetchAndProcessPdf(url), screenshot: "" };
     } else {
       const data = await response.json();
       const html = data.content;
       const screenshot = data.screenshot;
-      return [html ?? "", screenshot ?? ""];
+      return { html: html ?? "", screenshot: screenshot ?? "" };
     }
   } catch (error) {
     console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
-    return ["", ""];
+    return { html: "", screenshot: "" };
   }
 }
 
@@ -213,7 +213,7 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea
   return scrapersInOrder as typeof baseScrapers[number][];
 }
 
-async function handleCustomScraping(text: string, url: string): Promise<[string, string] | null> {
+async function handleCustomScraping(text: string, url: string): Promise<FireEngineResponse | null> {
   if (text.includes('<meta name="readme-deploy"')) {
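
The quoted hunks of patch 3 end mid-file: the remaining hunks (the attemptScraping call site in scrapSingleUrl and the one-line types.ts change listed in the diffstat) are cut off above. For orientation only, here is a hypothetical sketch of what a call site looks like once scrapWithFireEngine returns a FireEngineResponse instead of a tuple. The helper name, the relative import paths, and the tuple-shaped return are illustrative assumptions, not the actual remainder of the patch:

    // Hypothetical call-site shape after the FireEngineResponse refactor.
    // Import paths assume a sibling module layout like apps/api/src/scraper/WebScraper/.
    import { FireEngineResponse } from "../../lib/entities";
    import { scrapWithFireEngine } from "./single_url";

    async function fireEngineAttempt(
      url: string,
      waitFor: number,
      screenshot: boolean
    ): Promise<[string, string]> {
      // Read the named fields instead of positional tuple slots
      const response: FireEngineResponse = await scrapWithFireEngine(url, waitFor, screenshot);
      return [response.html, response.screenshot];
    }

The named-field shape is the point of the refactor: callers can no longer silently swap the html and screenshot positions, which the earlier [string, string] tuple allowed.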
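Taken together, the series threads a screenshot flag from the request body (pageOptions.screenshot) through PageOptions into Fire Engine, and surfaces the captured image as a base64 string on the document metadata (screenshot_base64 in patch 1, renamed to screenshot in patch 2). A minimal sketch of exercising the option end to end against the scrape endpoint; the localhost port, the /v0/scrape route, the bearer-token auth, and the { success, data } response envelope are assumptions about the surrounding API, not shown in these patches:

    // sketch.ts - hypothetical client for the new screenshot option (Node 18+, global fetch)
    async function scrapeWithScreenshot(url: string): Promise<void> {
      const res = await fetch("http://localhost:3002/v0/scrape", {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          Authorization: `Bearer ${process.env.API_KEY}`, // assumed auth scheme
        },
        body: JSON.stringify({
          url,
          // Mirrors the default object in scrapeController, with screenshot enabled
          pageOptions: { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: true },
        }),
      });
      const body = await res.json();
      // Per patch 2, the base64 image is exposed as metadata.screenshot
      const screenshot: string | undefined = body?.data?.metadata?.screenshot;
      console.log(screenshot ? `screenshot: ${screenshot.length} base64 chars` : "no screenshot returned");
    }

Note the fallback-order consequence in getScrapingFallbackOrder: requesting a screenshot promotes fire-engine (then playwright) to the front of the scraper order, since only Fire Engine can produce the image.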