
Merge pull request #207 from mendableai/feat/screenshot-support

Feat/screenshot support
Nicolas 2024-05-30 15:36:09 -07:00 committed by GitHub
commit d486d7da1c
4 changed files with 50 additions and 27 deletions

@@ -102,7 +102,7 @@ export async function scrapeController(req: Request, res: Response) {
     return res.status(status).json({ error });
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0 };
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
   const extractorOptions = req.body.extractorOptions ?? {
     mode: "markdown"
   }
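With screenshot now part of the default pageOptions, callers opt in per request. A minimal sketch of such a call (the port, endpoint path, and response envelope are assumptions about the surrounding API, not shown in this diff):

// Hypothetical client call; assumes the API listens locally on port 3002
// and that scrapeController handles POST /v0/scrape.
const res = await fetch("http://localhost:3002/v0/scrape", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    url: "https://example.com",
    pageOptions: { screenshot: true },
  }),
});
const body = await res.json();
// If a screenshot was captured, it lands under the document's metadata
// (see the document construction at the end of this PR).
console.log(body?.data?.metadata?.screenshot);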

@@ -16,6 +16,7 @@ export type PageOptions = {
   fallback?: boolean;
   fetchPageContent?: boolean;
   waitFor?: number;
+  screenshot?: boolean;
 };
 
 export type ExtractorOptions = {

@@ -106,3 +107,8 @@ export class SearchResult {
     return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`;
   }
 }
+
+export interface FireEngineResponse {
+  html: string;
+  screenshot: string;
+}
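FireEngineResponse pairs the scraped HTML with the screenshot payload, using "" for "no screenshot". A minimal sketch of consuming it, assuming the updated scrapWithFireEngine below and a base64-encoded screenshot (the encoding is not specified in this diff):

const { html, screenshot }: FireEngineResponse =
  await scrapWithFireEngine("https://example.com", 0, true);
if (screenshot.length > 0) {
  // Presumably base64 image data; persist or forward as needed.
  console.log(`Captured screenshot (${screenshot.length} chars)`);
}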

@@ -2,7 +2,7 @@ import * as cheerio from "cheerio";
 import { ScrapingBeeClient } from "scrapingbee";
 import { extractMetadata } from "./utils/metadata";
 import dotenv from "dotenv";
-import { Document, PageOptions } from "../../lib/entities";
+import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { excludeNonMainTags } from "./utils/excludeTags";
 import { urlSpecificParams } from "./utils/custom/website_params";
@@ -45,40 +45,43 @@ export async function generateRequestParams(
 export async function scrapWithFireEngine(
   url: string,
   waitFor: number = 0,
+  screenshot: boolean = false,
   options?: any
-): Promise<string> {
+): Promise<FireEngineResponse> {
   try {
     const reqParams = await generateRequestParams(url);
     // If the user has passed a wait parameter in the request, use that
     const waitParam = reqParams["params"]?.wait ?? waitFor;
-    console.log(`[Fire-Engine] Scraping ${url} with wait: ${waitParam}`);
+    const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
+    console.log(`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`);
 
     const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", {
       method: "POST",
       headers: {
         "Content-Type": "application/json",
       },
-      body: JSON.stringify({ url: url, wait: waitParam }),
+      body: JSON.stringify({ url: url, wait: waitParam, screenshot: screenshotParam }),
     });
 
     if (!response.ok) {
       console.error(
         `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
       );
-      return "";
+      return { html: "", screenshot: "" };
     }
 
     const contentType = response.headers['content-type'];
     if (contentType && contentType.includes('application/pdf')) {
-      return fetchAndProcessPdf(url);
+      return { html: await fetchAndProcessPdf(url), screenshot: "" };
     } else {
       const data = await response.json();
       const html = data.content;
-      return html ?? "";
+      const screenshot = data.screenshot;
+      return { html: html ?? "", screenshot: screenshot ?? "" };
     }
   } catch (error) {
     console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
-    return "";
+    return { html: "", screenshot: "" };
   }
 }
 
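The function only reads content and screenshot from the Fire Engine JSON body. A sketch of the response shape it assumes (the service is external to this repo, so anything beyond those two fields is conjecture):

// Assumed shape of a successful POST ${FIRE_ENGINE_BETA_URL}/scrape response:
type FireEngineScrapeResponse = {
  content?: string;    // page HTML, mapped to FireEngineResponse.html
  screenshot?: string; // screenshot payload, presumably base64-encoded
};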
@@ -182,7 +185,7 @@ export async function scrapWithFetch(url: string): Promise<string> {
  * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
  * @returns The order of scrapers to be used for scraping a URL
  */
-function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false) {
+function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false, isScreenshotPresent: boolean = false) {
   const availableScrapers = baseScrapers.filter(scraper => {
     switch (scraper) {
       case "scrapingBee":
@@ -199,7 +202,7 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea
 
   let defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"];
 
-  if (isWaitPresent) {
+  if (isWaitPresent || isScreenshotPresent) {
     defaultOrder = ["fire-engine", "playwright", ...defaultOrder.filter(scraper => scraper !== "fire-engine" && scraper !== "playwright")];
   }
 
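A screenshot request now gets the same treatment as an explicit wait: both promote the headless-browser scrapers to the front of the fallback list. An illustration, assuming every scraper's environment variable is set so none are filtered out:

// Default order: ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"]
const order = getScrapingFallbackOrder(undefined, false, true);
// -> ["fire-engine", "playwright", "scrapingBee", "scrapingBeeLoad", "fetch"]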
@@ -210,7 +213,7 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea
   return scrapersInOrder as typeof baseScrapers[number][];
 }
 
-async function handleCustomScraping(text: string, url: string): Promise<string | null> {
+async function handleCustomScraping(text: string, url: string): Promise<FireEngineResponse | null> {
   if (text.includes('<meta name="readme-deploy"')) {
     console.log(`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`);
     return await scrapWithFireEngine(url, 1000);
@@ -220,7 +223,7 @@ async function handleCustomScraping(text: string, url: string): Promise<string |
 
 export async function scrapSingleUrl(
   urlToScrap: string,
-  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0 },
+  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0, screenshot: false },
   existingHtml: string = ""
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
@@ -242,12 +245,14 @@ export async function scrapSingleUrl(
     method: typeof baseScrapers[number]
   ) => {
     let text = "";
+    let screenshot = "";
     switch (method) {
       case "fire-engine":
         if (process.env.FIRE_ENGINE_BETA_URL) {
          console.log(`Scraping ${url} with Fire Engine`);
-          text = await scrapWithFireEngine(url, pageOptions.waitFor);
+          const response = await scrapWithFireEngine(url, pageOptions.waitFor, pageOptions.screenshot);
+          text = response.html;
+          screenshot = response.screenshot;
         }
         break;
       case "scrapingBee":
@@ -277,16 +282,17 @@ export async function scrapSingleUrl(
     // Check for custom scraping conditions
     const customScrapedContent = await handleCustomScraping(text, url);
     if (customScrapedContent) {
-      text = customScrapedContent;
+      text = customScrapedContent.html;
+      screenshot = customScrapedContent.screenshot;
     }
 
     //* TODO: add an optional to return markdown or structured/extracted content
     let cleanedHtml = removeUnwantedElements(text, pageOptions);
-    return [await parseMarkdown(cleanedHtml), text];
+    return [await parseMarkdown(cleanedHtml), text, screenshot];
   };
 
   try {
-    let [text, html] = ["", ""];
+    let [text, html, screenshot] = ["", "", ""];
     let urlKey = urlToScrap;
     try {
       urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
@@ -294,7 +300,7 @@ export async function scrapSingleUrl(
       console.error(`Invalid URL key, trying: ${urlToScrap}`);
     }
     const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
-    const scrapersInOrder = getScrapingFallbackOrder(defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0)
+    const scrapersInOrder = getScrapingFallbackOrder(defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, pageOptions && pageOptions.screenshot && pageOptions.screenshot === true)
 
     for (const scraper of scrapersInOrder) {
       // If exists text coming from crawler, use it
@@ -304,7 +310,7 @@ export async function scrapSingleUrl(
         html = existingHtml;
         break;
       }
-      [text, html] = await attemptScraping(urlToScrap, scraper);
+      [text, html, screenshot] = await attemptScraping(urlToScrap, scraper);
       if (text && text.trim().length >= 100) break;
       const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
       if (nextScraperIndex < scrapersInOrder.length) {
@@ -318,12 +324,23 @@ export async function scrapSingleUrl(
 
     const soup = cheerio.load(html);
     const metadata = extractMetadata(soup, urlToScrap);
-    const document: Document = {
-      content: text,
-      markdown: text,
-      html: pageOptions.includeHtml ? html : undefined,
-      metadata: { ...metadata, sourceURL: urlToScrap },
-    };
+
+    let document: Document;
+    if (screenshot && screenshot.length > 0) {
+      document = {
+        content: text,
+        markdown: text,
+        html: pageOptions.includeHtml ? html : undefined,
+        metadata: { ...metadata, screenshot: screenshot, sourceURL: urlToScrap },
+      };
+    } else {
+      document = {
+        content: text,
+        markdown: text,
+        html: pageOptions.includeHtml ? html : undefined,
+        metadata: { ...metadata, sourceURL: urlToScrap },
+      };
+    }
 
     return document;
   } catch (error) {
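Net effect: a scrape with screenshot: true returns a Document whose metadata carries the screenshot alongside sourceURL. An illustrative shape (values are made up):

const example: Document = {
  content: "# Example Domain ...",
  markdown: "# Example Domain ...",
  html: undefined, // populated only when pageOptions.includeHtml is true
  metadata: {
    sourceURL: "https://example.com",
    screenshot: "iVBORw0KGgo...", // assumed base64 image data
  },
};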