Nick: small refactor
This commit is contained in:
parent
37915e11e8
commit
6c939d534d
@ -107,3 +107,8 @@ export class SearchResult {
|
|||||||
return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`;
|
return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export interface FireEngineResponse {
|
||||||
|
html: string;
|
||||||
|
screenshot: string;
|
||||||
|
}
|
@ -2,7 +2,7 @@ import * as cheerio from "cheerio";
|
|||||||
import { ScrapingBeeClient } from "scrapingbee";
|
import { ScrapingBeeClient } from "scrapingbee";
|
||||||
import { extractMetadata } from "./utils/metadata";
|
import { extractMetadata } from "./utils/metadata";
|
||||||
import dotenv from "dotenv";
|
import dotenv from "dotenv";
|
||||||
import { Document, PageOptions } from "../../lib/entities";
|
import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
|
||||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||||
import { excludeNonMainTags } from "./utils/excludeTags";
|
import { excludeNonMainTags } from "./utils/excludeTags";
|
||||||
import { urlSpecificParams } from "./utils/custom/website_params";
|
import { urlSpecificParams } from "./utils/custom/website_params";
|
||||||
@ -47,7 +47,7 @@ export async function scrapWithFireEngine(
|
|||||||
waitFor: number = 0,
|
waitFor: number = 0,
|
||||||
screenshot: boolean = false,
|
screenshot: boolean = false,
|
||||||
options?: any
|
options?: any
|
||||||
): Promise<[string, string]> {
|
): Promise<FireEngineResponse> {
|
||||||
try {
|
try {
|
||||||
const reqParams = await generateRequestParams(url);
|
const reqParams = await generateRequestParams(url);
|
||||||
// If the user has passed a wait parameter in the request, use that
|
// If the user has passed a wait parameter in the request, use that
|
||||||
@ -67,21 +67,21 @@ export async function scrapWithFireEngine(
|
|||||||
console.error(
|
console.error(
|
||||||
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
|
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
|
||||||
);
|
);
|
||||||
return ["", ""];
|
return { html: "", screenshot: "" };
|
||||||
}
|
}
|
||||||
|
|
||||||
const contentType = response.headers['content-type'];
|
const contentType = response.headers['content-type'];
|
||||||
if (contentType && contentType.includes('application/pdf')) {
|
if (contentType && contentType.includes('application/pdf')) {
|
||||||
return [await fetchAndProcessPdf(url), ""];
|
return { html: await fetchAndProcessPdf(url), screenshot: "" };
|
||||||
} else {
|
} else {
|
||||||
const data = await response.json();
|
const data = await response.json();
|
||||||
const html = data.content;
|
const html = data.content;
|
||||||
const screenshot = data.screenshot;
|
const screenshot = data.screenshot;
|
||||||
return [html ?? "", screenshot ?? ""];
|
return { html: html ?? "", screenshot: screenshot ?? "" };
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
|
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
|
||||||
return ["", ""];
|
return { html: "", screenshot: "" };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -213,7 +213,7 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea
|
|||||||
return scrapersInOrder as typeof baseScrapers[number][];
|
return scrapersInOrder as typeof baseScrapers[number][];
|
||||||
}
|
}
|
||||||
|
|
||||||
async function handleCustomScraping(text: string, url: string): Promise<[string, string] | null> {
|
async function handleCustomScraping(text: string, url: string): Promise<FireEngineResponse | null> {
|
||||||
if (text.includes('<meta name="readme-deploy"')) {
|
if (text.includes('<meta name="readme-deploy"')) {
|
||||||
console.log(`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`);
|
console.log(`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`);
|
||||||
return await scrapWithFireEngine(url, 1000);
|
return await scrapWithFireEngine(url, 1000);
|
||||||
@ -250,8 +250,9 @@ export async function scrapSingleUrl(
|
|||||||
case "fire-engine":
|
case "fire-engine":
|
||||||
if (process.env.FIRE_ENGINE_BETA_URL) {
|
if (process.env.FIRE_ENGINE_BETA_URL) {
|
||||||
console.log(`Scraping ${url} with Fire Engine`);
|
console.log(`Scraping ${url} with Fire Engine`);
|
||||||
|
const response = await scrapWithFireEngine(url, pageOptions.waitFor, pageOptions.screenshot);
|
||||||
[text, screenshot] = await scrapWithFireEngine(url, pageOptions.waitFor, pageOptions.screenshot);
|
text = response.html;
|
||||||
|
screenshot = response.screenshot;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case "scrapingBee":
|
case "scrapingBee":
|
||||||
|
Loading…
Reference in New Issue
Block a user