0

Nick: small refactor

This commit is contained in:
Nicolas 2024-05-29 19:43:51 -07:00
parent 37915e11e8
commit 6c939d534d
3 changed files with 16 additions and 10 deletions

View File

@ -107,3 +107,8 @@ export class SearchResult {
return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`; return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`;
} }
} }
/**
 * Result object returned by the Fire Engine scraping path
 * (`scrapWithFireEngine`), replacing the earlier `[html, screenshot]` tuple.
 *
 * Both fields are always present; the scraper falls back to empty strings
 * when the request fails, returns a non-OK status, or a value is missing.
 */
export interface FireEngineResponse {
/**
 * Scraped page content: the `content` field of the Fire Engine JSON
 * response, or the output of `fetchAndProcessPdf` when the response is a
 * PDF. Empty string on failure.
 */
html: string;
/**
 * The `screenshot` field of the Fire Engine JSON response, or an empty
 * string when absent, on failure, or for PDF responses.
 * NOTE(review): the encoding (URL vs. base64 data) is not visible here — confirm.
 */
screenshot: string;
}

View File

@ -2,7 +2,7 @@ import * as cheerio from "cheerio";
import { ScrapingBeeClient } from "scrapingbee"; import { ScrapingBeeClient } from "scrapingbee";
import { extractMetadata } from "./utils/metadata"; import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv"; import dotenv from "dotenv";
import { Document, PageOptions } from "../../lib/entities"; import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown"; import { parseMarkdown } from "../../lib/html-to-markdown";
import { excludeNonMainTags } from "./utils/excludeTags"; import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params"; import { urlSpecificParams } from "./utils/custom/website_params";
@ -47,7 +47,7 @@ export async function scrapWithFireEngine(
waitFor: number = 0, waitFor: number = 0,
screenshot: boolean = false, screenshot: boolean = false,
options?: any options?: any
): Promise<[string, string]> { ): Promise<FireEngineResponse> {
try { try {
const reqParams = await generateRequestParams(url); const reqParams = await generateRequestParams(url);
// If the user has passed a wait parameter in the request, use that // If the user has passed a wait parameter in the request, use that
@ -67,21 +67,21 @@ export async function scrapWithFireEngine(
console.error( console.error(
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
); );
return ["", ""]; return { html: "", screenshot: "" };
} }
const contentType = response.headers['content-type']; const contentType = response.headers['content-type'];
if (contentType && contentType.includes('application/pdf')) { if (contentType && contentType.includes('application/pdf')) {
return [await fetchAndProcessPdf(url), ""]; return { html: await fetchAndProcessPdf(url), screenshot: "" };
} else { } else {
const data = await response.json(); const data = await response.json();
const html = data.content; const html = data.content;
const screenshot = data.screenshot; const screenshot = data.screenshot;
return [html ?? "", screenshot ?? ""]; return { html: html ?? "", screenshot: screenshot ?? "" };
} }
} catch (error) { } catch (error) {
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
return ["", ""]; return { html: "", screenshot: "" };
} }
} }
@ -213,7 +213,7 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea
return scrapersInOrder as typeof baseScrapers[number][]; return scrapersInOrder as typeof baseScrapers[number][];
} }
async function handleCustomScraping(text: string, url: string): Promise<[string, string] | null> { async function handleCustomScraping(text: string, url: string): Promise<FireEngineResponse | null> {
if (text.includes('<meta name="readme-deploy"')) { if (text.includes('<meta name="readme-deploy"')) {
console.log(`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`); console.log(`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`);
return await scrapWithFireEngine(url, 1000); return await scrapWithFireEngine(url, 1000);
@ -250,8 +250,9 @@ export async function scrapSingleUrl(
case "fire-engine": case "fire-engine":
if (process.env.FIRE_ENGINE_BETA_URL) { if (process.env.FIRE_ENGINE_BETA_URL) {
console.log(`Scraping ${url} with Fire Engine`); console.log(`Scraping ${url} with Fire Engine`);
const response = await scrapWithFireEngine(url, pageOptions.waitFor, pageOptions.screenshot);
[text, screenshot] = await scrapWithFireEngine(url, pageOptions.waitFor, pageOptions.screenshot); text = response.html;
screenshot = response.screenshot;
} }
break; break;
case "scrapingBee": case "scrapingBee":