From 6c939d534d26342e936b97f3a88f936d96a39dc4 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 29 May 2024 19:43:51 -0700 Subject: [PATCH] Nick: small refactor --- apps/api/src/lib/entities.ts | 5 +++++ apps/api/src/scraper/WebScraper/single_url.ts | 19 ++++++++++--------- apps/api/src/types.ts | 2 +- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 23469f0..48d3e89 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -106,4 +106,9 @@ export class SearchResult { toString(): string { return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`; } +} + +export interface FireEngineResponse { + html: string; + screenshot: string; } \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 4f521e8..c7f9469 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -2,7 +2,7 @@ import * as cheerio from "cheerio"; import { ScrapingBeeClient } from "scrapingbee"; import { extractMetadata } from "./utils/metadata"; import dotenv from "dotenv"; -import { Document, PageOptions } from "../../lib/entities"; +import { Document, PageOptions, FireEngineResponse } from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; import { excludeNonMainTags } from "./utils/excludeTags"; import { urlSpecificParams } from "./utils/custom/website_params"; @@ -47,7 +47,7 @@ export async function scrapWithFireEngine( waitFor: number = 0, screenshot: boolean = false, options?: any -): Promise<[string, string]> { +): Promise<FireEngineResponse> { try { const reqParams = await generateRequestParams(url); // If the user has passed a wait parameter in the request, use that @@ -67,21 +67,21 @@ export async function scrapWithFireEngine( console.error( `[Fire-Engine] Error fetching url: ${url} with status:
${response.status}` ); - return ["", ""]; + return { html: "", screenshot: "" }; } const contentType = response.headers['content-type']; if (contentType && contentType.includes('application/pdf')) { - return [await fetchAndProcessPdf(url), ""]; + return { html: await fetchAndProcessPdf(url), screenshot: "" }; } else { const data = await response.json(); const html = data.content; const screenshot = data.screenshot; - return [html ?? "", screenshot ?? ""]; + return { html: html ?? "", screenshot: screenshot ?? "" }; } } catch (error) { console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); - return ["", ""]; + return { html: "", screenshot: "" }; } } @@ -213,7 +213,7 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea return scrapersInOrder as typeof baseScrapers[number][]; } -async function handleCustomScraping(text: string, url: string): Promise<[string, string] | null> { +async function handleCustomScraping(text: string, url: string): Promise<FireEngineResponse | null> { if (text.includes('