
Merge pull request #207 from mendableai/feat/screenshot-support

Feat/screenshot support
Nicolas 2024-05-30 15:36:09 -07:00 committed by GitHub
commit d486d7da1c
4 changed files with 50 additions and 27 deletions


@@ -102,7 +102,7 @@ export async function scrapeController(req: Request, res: Response) {
     return res.status(status).json({ error });
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0 };
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
   const extractorOptions = req.body.extractorOptions ?? {
     mode: "markdown"
   }
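
With this controller change, API clients can opt into screenshots per request. A minimal sketch of such a request (the /v0/scrape path and the response envelope are assumptions based on the controller above, not part of this diff):

  // Hypothetical client call; adjust host, path, and auth to your deployment.
  const res = await fetch("https://api.firecrawl.dev/v0/scrape", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      url: "https://example.com",
      pageOptions: { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: true },
    }),
  });
  const { data } = await res.json();
  // If a screenshot was captured, it is attached to the document metadata (see the last hunk below).
  console.log(data?.metadata?.screenshot);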


@@ -16,6 +16,7 @@ export type PageOptions = {
   fallback?: boolean;
   fetchPageContent?: boolean;
   waitFor?: number;
+  screenshot?: boolean;
 };

 export type ExtractorOptions = {
@@ -106,3 +107,8 @@ export class SearchResult {
     return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`;
   }
 }
+
+export interface FireEngineResponse {
+  html: string;
+  screenshot: string;
+}
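
The screenshot flag threads through PageOptions, and the new FireEngineResponse pairs a page's HTML with its screenshot. A small consumer sketch (the summarize helper is illustrative, not part of the codebase):

  import { PageOptions, FireEngineResponse } from "../../lib/entities";

  // Illustrative helper: by convention in this diff, an empty screenshot
  // string means no screenshot was captured.
  function summarize(res: FireEngineResponse): { htmlChars: number; hasScreenshot: boolean } {
    return { htmlChars: res.html.length, hasScreenshot: res.screenshot.length > 0 };
  }

  const opts: PageOptions = { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: true };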


@@ -2,7 +2,7 @@ import * as cheerio from "cheerio";
 import { ScrapingBeeClient } from "scrapingbee";
 import { extractMetadata } from "./utils/metadata";
 import dotenv from "dotenv";
-import { Document, PageOptions } from "../../lib/entities";
+import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { excludeNonMainTags } from "./utils/excludeTags";
 import { urlSpecificParams } from "./utils/custom/website_params";
@@ -45,40 +45,43 @@ export async function generateRequestParams(
 export async function scrapWithFireEngine(
   url: string,
   waitFor: number = 0,
+  screenshot: boolean = false,
   options?: any
-): Promise<string> {
+): Promise<FireEngineResponse> {
   try {
     const reqParams = await generateRequestParams(url);
     // If the user has passed a wait parameter in the request, use that
     const waitParam = reqParams["params"]?.wait ?? waitFor;
-    console.log(`[Fire-Engine] Scraping ${url} with wait: ${waitParam}`);
+    const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
+    console.log(`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`);

     const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", {
       method: "POST",
       headers: {
         "Content-Type": "application/json",
       },
-      body: JSON.stringify({ url: url, wait: waitParam }),
+      body: JSON.stringify({ url: url, wait: waitParam, screenshot: screenshotParam }),
     });

     if (!response.ok) {
       console.error(
         `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
       );
-      return "";
+      return { html: "", screenshot: "" };
     }

     const contentType = response.headers['content-type'];
     if (contentType && contentType.includes('application/pdf')) {
-      return fetchAndProcessPdf(url);
+      return { html: await fetchAndProcessPdf(url), screenshot: "" };
     } else {
       const data = await response.json();
       const html = data.content;
-      return html ?? "";
+      const screenshot = data.screenshot;
+      return { html: html ?? "", screenshot: screenshot ?? "" };
     }
   } catch (error) {
     console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
-    return "";
+    return { html: "", screenshot: "" };
   }
 }
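
With the new signature, scrapWithFireEngine always resolves to an object, so call sites no longer have to guess whether a bare string is HTML or an error sentinel. A quick sketch of the call-site contract (assumes FIRE_ENGINE_BETA_URL is configured; the 2000ms wait is arbitrary):

  // Wait 2s for the page to settle and ask Fire Engine for a screenshot.
  const { html, screenshot } = await scrapWithFireEngine("https://example.com", 2000, true);
  if (!html) {
    console.warn("Fire Engine returned no content; a fallback scraper should run next.");
  } else if (screenshot) {
    console.log(`Captured screenshot (${screenshot.length} chars, presumably base64)`);
  }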
@@ -182,7 +185,7 @@ export async function scrapWithFetch(url: string): Promise<string> {
  * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
  * @returns The order of scrapers to be used for scraping a URL
  */
-function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false) {
+function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false, isScreenshotPresent: boolean = false) {
   const availableScrapers = baseScrapers.filter(scraper => {
     switch (scraper) {
       case "scrapingBee":
@@ -199,7 +202,7 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea
   let defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"];
-  if (isWaitPresent) {
+  if (isWaitPresent || isScreenshotPresent) {
     defaultOrder = ["fire-engine", "playwright", ...defaultOrder.filter(scraper => scraper !== "fire-engine" && scraper !== "playwright")];
   }
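
The new flag triggers the same promotion that waitFor already did: Fire Engine and Playwright move to the front of the fallback order, since those are the engines that honor waits and (for Fire Engine) screenshots. Assuming every scraper is available:

  // Default order:           ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"]
  // With wait or screenshot: ["fire-engine", "playwright", "scrapingBee", "scrapingBeeLoad", "fetch"]
  const order = getScrapingFallbackOrder("", false, true);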
@@ -210,7 +213,7 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea
   return scrapersInOrder as typeof baseScrapers[number][];
 }

-async function handleCustomScraping(text: string, url: string): Promise<string | null> {
+async function handleCustomScraping(text: string, url: string): Promise<FireEngineResponse | null> {
   if (text.includes('<meta name="readme-deploy"')) {
     console.log(`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`);
     return await scrapWithFireEngine(url, 1000);
@@ -220,7 +223,7 @@ async function handleCustomScraping(text: string, url: string): Promise<string |
 export async function scrapSingleUrl(
   urlToScrap: string,
-  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0 },
+  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0, screenshot: false },
   existingHtml: string = ""
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
@@ -242,12 +245,14 @@ export async function scrapSingleUrl(
     method: typeof baseScrapers[number]
   ) => {
     let text = "";
+    let screenshot = "";
     switch (method) {
       case "fire-engine":
         if (process.env.FIRE_ENGINE_BETA_URL) {
           console.log(`Scraping ${url} with Fire Engine`);
-          text = await scrapWithFireEngine(url, pageOptions.waitFor);
+          const response = await scrapWithFireEngine(url, pageOptions.waitFor, pageOptions.screenshot);
+          text = response.html;
+          screenshot = response.screenshot;
         }
         break;
       case "scrapingBee":
@@ -277,16 +282,17 @@ export async function scrapSingleUrl(
     // Check for custom scraping conditions
     const customScrapedContent = await handleCustomScraping(text, url);
     if (customScrapedContent) {
-      text = customScrapedContent;
+      text = customScrapedContent.html;
+      screenshot = customScrapedContent.screenshot;
     }

     //* TODO: add an optional to return markdown or structured/extracted content
     let cleanedHtml = removeUnwantedElements(text, pageOptions);

-    return [await parseMarkdown(cleanedHtml), text];
+    return [await parseMarkdown(cleanedHtml), text, screenshot];
   };

   try {
-    let [text, html] = ["", ""];
+    let [text, html, screenshot] = ["", "", ""];
     let urlKey = urlToScrap;
     try {
       urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
@@ -294,7 +300,7 @@ export async function scrapSingleUrl(
       console.error(`Invalid URL key, trying: ${urlToScrap}`);
     }
     const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
-    const scrapersInOrder = getScrapingFallbackOrder(defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0)
+    const scrapersInOrder = getScrapingFallbackOrder(defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, pageOptions && pageOptions.screenshot && pageOptions.screenshot === true)

     for (const scraper of scrapersInOrder) {
       // If exists text coming from crawler, use it
@@ -304,7 +310,7 @@ export async function scrapSingleUrl(
         html = existingHtml;
         break;
       }
-      [text, html] = await attemptScraping(urlToScrap, scraper);
+      [text, html, screenshot] = await attemptScraping(urlToScrap, scraper);
       if (text && text.trim().length >= 100) break;
       const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
       if (nextScraperIndex < scrapersInOrder.length) {
@@ -318,12 +324,23 @@ export async function scrapSingleUrl(
     const soup = cheerio.load(html);
     const metadata = extractMetadata(soup, urlToScrap);

-    const document: Document = {
-      content: text,
-      markdown: text,
-      html: pageOptions.includeHtml ? html : undefined,
-      metadata: { ...metadata, sourceURL: urlToScrap },
-    };
+    let document: Document;
+    if (screenshot && screenshot.length > 0) {
+      document = {
+        content: text,
+        markdown: text,
+        html: pageOptions.includeHtml ? html : undefined,
+        metadata: { ...metadata, screenshot: screenshot, sourceURL: urlToScrap },
+      };
+    } else {
+      document = {
+        content: text,
+        markdown: text,
+        html: pageOptions.includeHtml ? html : undefined,
+        metadata: { ...metadata, sourceURL: urlToScrap },
+      };
+    }

     return document;
   } catch (error) {
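
The upshot for consumers: a captured screenshot rides along in the document metadata rather than as a top-level field. A sketch of the resulting shape (values illustrative):

  const doc = await scrapSingleUrl("https://example.com", {
    onlyMainContent: false,
    includeHtml: false,
    waitFor: 0,
    screenshot: true,
  });
  // doc.metadata.sourceURL  -> "https://example.com"
  // doc.metadata.screenshot -> present only when a screenshot was actually captured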