From 6bea803120cad898d8e7f5b5e99d24dc137bdef6 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 31 May 2024 15:39:54 -0700
Subject: [PATCH 1/4] Nick:

---
 apps/api/src/lib/entities.ts                  |   1 +
 apps/api/src/scraper/WebScraper/single_url.ts | 128 +++++++++++++-----
 apps/playwright-service/main.py               |  45 +++---
 3 files changed, 123 insertions(+), 51 deletions(-)

diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 48d3e89..dd5fc72 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -17,6 +17,7 @@ export type PageOptions = {
   fetchPageContent?: boolean;
   waitFor?: number;
   screenshot?: boolean;
+  headers?: Record<string, string>;
 };
 
 export type ExtractorOptions = {
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index c7f9469..3a04004 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -18,7 +18,6 @@ const baseScrapers = [
   "fetch",
 ] as const;
 
-
 export async function generateRequestParams(
   url: string,
   wait_browser: string = "domcontentloaded",
@@ -46,6 +45,7 @@ export async function scrapWithFireEngine(
   url: string,
   waitFor: number = 0,
   screenshot: boolean = false,
+  headers?: Record<string, string>,
   options?: any
 ): Promise<FireEngineResponse> {
   try {
@@ -53,14 +53,21 @@
     // If the user has passed a wait parameter in the request, use that
     const waitParam = reqParams["params"]?.wait ?? waitFor;
     const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
-    console.log(`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`);
+    console.log(
+      `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`
+    );
 
-    const response = await fetch(process.env.FIRE_ENGINE_BETA_URL+ "/scrape", {
+    const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", {
       method: "POST",
       headers: {
         "Content-Type": "application/json",
       },
-      body: JSON.stringify({ url: url, wait: waitParam, screenshot: screenshotParam }),
+      body: JSON.stringify({
+        url: url,
+        wait: waitParam,
+        screenshot: screenshotParam,
+        headers: headers,
+      }),
     });
 
     if (!response.ok) {
@@ -70,8 +77,8 @@
       return { html: "", screenshot: "" };
     }
 
-    const contentType = response.headers['content-type'];
-    if (contentType && contentType.includes('application/pdf')) {
+    const contentType = response.headers["content-type"];
+    if (contentType && contentType.includes("application/pdf")) {
       return { html: await fetchAndProcessPdf(url), screenshot: "" };
     } else {
       const data = await response.json();
@@ -106,9 +113,9 @@ export async function scrapWithScrapingBee(
     );
     return "";
   }
-  
-  const contentType = response.headers['content-type'];
-  if (contentType && contentType.includes('application/pdf')) {
+
+  const contentType = response.headers["content-type"];
+  if (contentType && contentType.includes("application/pdf")) {
     return fetchAndProcessPdf(url);
   } else {
     const decoder = new TextDecoder();
@@ -121,7 +128,10 @@
   }
 }
 
-export async function scrapWithPlaywright(url: string, waitFor: number = 0): Promise<string> {
+export async function scrapWithPlaywright(
+  url: string,
+  waitFor: number = 0
+): Promise<string> {
   try {
     const reqParams = await generateRequestParams(url);
     // If the user has passed a wait parameter in the request, use that
@@ -142,8 +152,8 @@
       return "";
     }
 
-    const contentType = response.headers['content-type'];
-    if (contentType && contentType.includes('application/pdf')) {
+    const contentType = response.headers["content-type"];
+    if (contentType && contentType.includes("application/pdf")) {
       return fetchAndProcessPdf(url);
     } else {
       const data = await response.json();
@@ -166,8 +176,8 @@ export async function scrapWithFetch(url: string): Promise<string> {
     return "";
   }
 
-  const contentType = response.headers['content-type'];
-  if (contentType && contentType.includes('application/pdf')) {
+  const contentType = response.headers["content-type"];
+  if (contentType && contentType.includes("application/pdf")) {
     return fetchAndProcessPdf(url);
   } else {
     const text = await response.text();
@@ -185,8 +195,13 @@
  * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
  * @returns The order of scrapers to be used for scraping a URL
  */
-function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false, isScreenshotPresent: boolean = false) {
-  const availableScrapers = baseScrapers.filter(scraper => {
+function getScrapingFallbackOrder(
+  defaultScraper?: string,
+  isWaitPresent: boolean = false,
+  isScreenshotPresent: boolean = false,
+  isHeadersPresent: boolean = false
+) {
+  const availableScrapers = baseScrapers.filter((scraper) => {
     switch (scraper) {
       case "scrapingBee":
       case "scrapingBeeLoad":
@@ -200,22 +215,46 @@
       }
   });
 
-  let defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"];
-
-  if (isWaitPresent || isScreenshotPresent) {
-    defaultOrder = ["fire-engine", "playwright", ...defaultOrder.filter(scraper => scraper !== "fire-engine" && scraper !== "playwright")];
+  let defaultOrder = [
+    "scrapingBee",
+    "fire-engine",
+    "playwright",
+    "scrapingBeeLoad",
+    "fetch",
+  ];
+
+  if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
+    defaultOrder = [
+      "fire-engine",
+      "playwright",
+      ...defaultOrder.filter(
+        (scraper) => scraper !== "fire-engine" && scraper !== "playwright"
+      ),
+    ];
   }
 
-  const filteredDefaultOrder = defaultOrder.filter((scraper: typeof baseScrapers[number]) => availableScrapers.includes(scraper));
-  const uniqueScrapers = new Set(defaultScraper ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers] : [...filteredDefaultOrder, ...availableScrapers]);
+  const filteredDefaultOrder = defaultOrder.filter(
+    (scraper: (typeof baseScrapers)[number]) =>
+      availableScrapers.includes(scraper)
+  );
+  const uniqueScrapers = new Set(
+    defaultScraper
+      ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers]
+      : [...filteredDefaultOrder, ...availableScrapers]
+  );
   const scrapersInOrder = Array.from(uniqueScrapers);
   console.log(`Scrapers in order: ${scrapersInOrder}`);
-  return scrapersInOrder as typeof baseScrapers[number][];
+  return scrapersInOrder as (typeof baseScrapers)[number][];
 }
 
-async function handleCustomScraping(text: string, url: string): Promise<FireEngineResponse | null> {
+async function handleCustomScraping(
+  text: string,
+  url: string
+): Promise<FireEngineResponse | null> {
   if (text.includes('<meta name="readme-deploy"')) {
@@ ... @@ export async function scrapSingleUrl(
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
 
@@ -242,7 +286,7 @@
   const attemptScraping = async (
     url: string,
-    method: typeof baseScrapers[number]
+    method: (typeof baseScrapers)[number]
   ) => {
     let text = "";
     let screenshot = "";
     switch (method) {
@@ -250,7 +294,12 @@
       case "fire-engine":
         if (process.env.FIRE_ENGINE_BETA_URL) {
           console.log(`Scraping ${url} with Fire Engine`);
-          const response = await scrapWithFireEngine(url, pageOptions.waitFor, pageOptions.screenshot);
+          const response = await scrapWithFireEngine(
+            url,
+            pageOptions.waitFor,
+            pageOptions.screenshot,
+            pageOptions.headers
+          );
           text = response.html;
           screenshot = response.screenshot;
         }
@@ -300,7 +349,12 @@
     console.error(`Invalid URL key, trying: ${urlToScrap}`);
   }
   const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
-  const scrapersInOrder = getScrapingFallbackOrder(defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, pageOptions && pageOptions.screenshot && pageOptions.screenshot === true)
+  const scrapersInOrder = getScrapingFallbackOrder(
+    defaultScraper,
+    pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
+    pageOptions && pageOptions.screenshot && pageOptions.screenshot === true,
+    pageOptions && pageOptions.headers && pageOptions.headers !== undefined
+  );
 
   for (const scraper of scrapersInOrder) {
     // If exists text coming from crawler, use it
@@ -326,20 +380,24 @@
     const metadata = extractMetadata(soup, urlToScrap);
 
     let document: Document;
-    if(screenshot && screenshot.length > 0) {
+    if (screenshot && screenshot.length > 0) {
       document = {
         content: text,
         markdown: text,
         html: pageOptions.includeHtml ? html : undefined,
-        metadata: { ...metadata, screenshot: screenshot, sourceURL: urlToScrap, },
-      }
-    }else{
+        metadata: {
+          ...metadata,
+          screenshot: screenshot,
+          sourceURL: urlToScrap,
+        },
+      };
+    } else {
       document = {
         content: text,
         markdown: text,
         html: pageOptions.includeHtml ? html : undefined,
-        metadata: { ...metadata, sourceURL: urlToScrap, },
-      }
+        metadata: { ...metadata, sourceURL: urlToScrap },
+      };
     }
 
     return document;
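
Aside (not part of the patch): the reordering that the getScrapingFallbackOrder hunk above introduces is easiest to see in isolation. The following is a simplified sketch, not the patched function itself; the environment-based availability filtering and the defaultScraper merge are omitted, and only the promotion logic from the hunk is reproduced.

// Simplified sketch of the reordering added above. Assumption: this mirrors
// the hunk's behavior with availability filtering left out.
function fallbackOrder(
  isWaitPresent: boolean,
  isScreenshotPresent: boolean,
  isHeadersPresent: boolean
): string[] {
  let defaultOrder = [
    "scrapingBee",
    "fire-engine",
    "playwright",
    "scrapingBeeLoad",
    "fetch",
  ];
  if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
    // Promote the two scrapers that can honor wait/screenshot/headers.
    defaultOrder = [
      "fire-engine",
      "playwright",
      ...defaultOrder.filter(
        (scraper) => scraper !== "fire-engine" && scraper !== "playwright"
      ),
    ];
  }
  return defaultOrder;
}

console.log(fallbackOrder(false, false, false));
// -> ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"]
console.log(fallbackOrder(false, false, true));
// -> ["fire-engine", "playwright", "scrapingBee", "scrapingBeeLoad", "fetch"]

In other words, once a request carries custom headers, the header-capable scrapers are tried first and the cheaper ones become fallbacks.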
diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py
index 9cb0c4e..b4ac715 100644
--- a/apps/playwright-service/main.py
+++ b/apps/playwright-service/main.py
@@ -4,10 +4,10 @@ from fastapi.responses import JSONResponse
 from pydantic import BaseModel
 from os import environ
 
-PROXY_SERVER = environ.get('PROXY_SERVER', None)
-PROXY_USERNAME = environ.get('PROXY_USERNAME', None)
-PROXY_PASSWORD = environ.get('PROXY_PASSWORD', None)
-BLOCK_MEDIA = environ.get('BLOCK_MEDIA', 'False').upper() == 'TRUE'
+PROXY_SERVER = environ.get("PROXY_SERVER", None)
+PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
+PROXY_PASSWORD = environ.get("PROXY_PASSWORD", None)
+BLOCK_MEDIA = environ.get("BLOCK_MEDIA", "False").upper() == "TRUE"
 
 app = FastAPI()
 
@@ -15,6 +15,8 @@ app = FastAPI()
 class UrlModel(BaseModel):
     url: str
     wait: int = None
+    wait_until: str = "load"
+    headers: dict = None
 
 
 browser: Browser = None
@@ -36,26 +38,37 @@ async def shutdown_event():
 async def root(body: UrlModel):
     context = None
     if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
-        context = await browser.new_context(proxy={"server": PROXY_SERVER,
-                                                   "username": PROXY_USERNAME,
-                                                   "password": PROXY_PASSWORD})
+        context = await browser.new_context(
+            proxy={
+                "server": PROXY_SERVER,
+                "username": PROXY_USERNAME,
+                "password": PROXY_PASSWORD,
+            }
+        )
     else:
         context = await browser.new_context()
 
     if BLOCK_MEDIA:
-        await context.route("**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}",
-                            handler=lambda route, request: route.abort())
+        await context.route(
+            "**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}",
+            handler=lambda route, request: route.abort(),
+        )
 
     page = await context.new_page()
+
+    # Set headers if provided
+    if body.headers:
+        await page.set_extra_http_headers(body.headers)
+
     await page.goto(
         body.url,
-        wait_until="load",
-        timeout=body.timeout if body.timeout else 15000,
-    )
-    # Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases were "load" / "networkidle" is not enough
-    if body.wait:
-        await page.wait_for_timeout(body.wait)
-
+        timeout=15000,
+        wait_until=body.wait_until if body.wait_until else "load",
+    )  # Set max timeout to 15s
+    if body.wait:  # Check if wait parameter is provided in the request body
+        await page.wait_for_timeout(
+            body.wait
+        )  # Convert seconds to milliseconds for playwright
     page_content = await page.content()
     await context.close()
     json_compatible_item_data = {"content": page_content}
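
Taken together, this first patch threads a headers option from PageOptions through scrapWithFireEngine and into the playwright service. A minimal caller sketch follows; the PageOptions shape is restated locally so the snippet is self-contained, the endpoint and response shape come from the patched code above, and the concrete header values are illustrative only.

// Sketch of a caller exercising the new headers field. Assumes
// FIRE_ENGINE_BETA_URL is set and the service returns { html, screenshot }.
type PageOptions = {
  onlyMainContent?: boolean;
  includeHtml?: boolean;
  fetchPageContent?: boolean;
  waitFor?: number;
  screenshot?: boolean;
  headers?: Record<string, string>; // field added by this patch
};

async function scrapeWithCustomHeaders(
  url: string,
  pageOptions: PageOptions
): Promise<{ html: string; screenshot: string }> {
  const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      url: url,
      wait: pageOptions.waitFor ?? 0,
      screenshot: pageOptions.screenshot ?? false,
      headers: pageOptions.headers, // forwarded to the headless browser
    }),
  });
  if (!response.ok) {
    return { html: "", screenshot: "" };
  }
  return (await response.json()) as { html: string; screenshot: string };
}

scrapeWithCustomHeaders("https://example.com", {
  waitFor: 1000,
  headers: { "accept-language": "en-US,en;q=0.9" },
}).then((result) => console.log(result.html.length));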
From 3b8059edb64ba4ef2362e72a6c740ad64ed5b6e1 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 31 May 2024 15:43:06 -0700
Subject: [PATCH 2/4] Update single_url.ts

---
 apps/api/src/scraper/WebScraper/single_url.ts | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 3a04004..b73d121 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -130,7 +130,8 @@ export async function scrapWithScrapingBee(
 
 export async function scrapWithPlaywright(
   url: string,
-  waitFor: number = 0
+  waitFor: number = 0,
+  headers?: Record<string, string>
 ): Promise<string> {
   try {
     const reqParams = await generateRequestParams(url);
@@ -142,7 +143,7 @@
       headers: {
         "Content-Type": "application/json",
       },
-      body: JSON.stringify({ url: url, wait: waitParam }),
+      body: JSON.stringify({ url: url, wait: waitParam, headers: headers }),
     });
 
     if (!response.ok) {
@@ -315,7 +316,7 @@
         break;
       case "playwright":
         if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
-          text = await scrapWithPlaywright(url, pageOptions.waitFor);
+          text = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
         }
         break;
       case "scrapingBeeLoad":

From 8cb62dde9287c8a5dac1d81f2eede5147762936b Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 31 May 2024 16:09:39 -0700
Subject: [PATCH 3/4] Update website_params.ts

---
 .../WebScraper/utils/custom/website_params.ts | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
index 9094fc3..01f6614 100644
--- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
+++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
@@ -141,5 +141,23 @@ export const urlSpecificParams = {
       accept:
         "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
     },
   },
+  "firecrawl.dev":{
+    defaultScraper: "fire-engine",
+    params: {
+
+    },
+    headers: {
+      "User-Agent":
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+      "sec-fetch-site": "same-origin",
+      "sec-fetch-mode": "cors",
+      "sec-fetch-dest": "empty",
+      referer: "https://www.google.com/",
+      "accept-language": "en-US,en;q=0.9",
+      "accept-encoding": "gzip, deflate, br",
+      accept:
+        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    },
+  }
 };
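
The firecrawl.dev entry above is consumed through the `urlSpecificParams[urlKey]?.defaultScraper` lookup shown in the first patch. A hedged sketch of that lookup follows; the getUrlKey helper and its hostname-minus-"www." key derivation are assumptions inferred from the "Invalid URL key" error handling, not something this diff confirms.

// Sketch: how a urlSpecificParams entry can steer scraper choice.
// getUrlKey is hypothetical; only the defaultScraper lookup appears
// verbatim in the patches above.
const urlSpecificParams: Record<string, { defaultScraper?: string }> = {
  "firecrawl.dev": { defaultScraper: "fire-engine" },
};

function getUrlKey(urlToScrap: string): string {
  try {
    return new URL(urlToScrap).hostname.replace(/^www\./, "");
  } catch (error) {
    console.error(`Invalid URL key, trying: ${urlToScrap}`);
    return "";
  }
}

const urlKey = getUrlKey("https://www.firecrawl.dev/pricing");
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
console.log(defaultScraper); // -> "fire-engine", tried ahead of the fallbacks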
From fde522c3e16ec57ebf156eb6fc11137850982c43 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Sun, 2 Jun 2024 20:23:45 -0700
Subject: [PATCH 4/4] Update single_url.ts

---
 apps/api/src/scraper/WebScraper/single_url.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index b73d121..56514c0 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -268,6 +268,7 @@ export async function scrapSingleUrl(
     includeHtml: false,
     waitFor: 0,
     screenshot: false,
+    headers: {}
   },
   existingHtml: string = ""
 ): Promise<Document> {
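
End to end, the series lets a scrape request carry per-site HTTP headers all the way to the headless browser. To close, a sketch of a direct request to the patched playwright service; the base URL is deployment-specific (PLAYWRIGHT_MICROSERVICE_URL in the API code) and therefore an assumption, while the body fields match UrlModel above, with wait in milliseconds as page.wait_for_timeout expects.

// Sketch: exercising the patched playwright service directly. The endpoint
// value is hypothetical; the field names (url, wait, wait_until, headers)
// come from UrlModel in the patch above.
async function fetchRenderedHtml(target: string): Promise<string> {
  const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL ?? "", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      url: target,
      wait: 1000,                // extra wait after load, in milliseconds
      wait_until: "networkidle", // new field, forwarded to page.goto
      headers: {                 // new field, applied via set_extra_http_headers
        "accept-language": "en-US,en;q=0.9",
        referer: "https://www.google.com/",
      },
    }),
  });
  const data = (await response.json()) as { content: string };
  return data.content;
}

fetchRenderedHtml("https://firecrawl.dev").then((html) =>
  console.log(`fetched ${html.length} bytes of rendered HTML`)
);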