Merge pull request #207 from mendableai/feat/screenshot-support
Feat/screenshot support
commit d486d7da1c
@@ -102,7 +102,7 @@ export async function scrapeController(req: Request, res: Response) {
     return res.status(status).json({ error });
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0 };
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
   const extractorOptions = req.body.extractorOptions ?? {
     mode: "markdown"
   }
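
For context, a request that opts into the new flag could look like the sketch below. Only pageOptions.screenshot and its default of false come from the hunk above; the endpoint URL, port, auth header, and response shape are assumptions for illustration.

// Sketch of a client call that sets the new pageOptions.screenshot flag.
// The base URL, route, and Authorization header are hypothetical.
async function requestScrapeWithScreenshot(targetUrl: string): Promise<void> {
  const res = await fetch("http://localhost:3002/v0/scrape", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: "Bearer fc-YOUR_API_KEY",
    },
    body: JSON.stringify({
      url: targetUrl,
      pageOptions: { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: true },
    }),
  });
  const body = await res.json();
  // Where the screenshot surfaces in the response is defined further down in this diff
  // (document metadata); the access path here is an assumption.
  console.log(body?.data?.metadata?.screenshot ?? "no screenshot returned");
}
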
@@ -16,6 +16,7 @@ export type PageOptions = {
   fallback?: boolean;
   fetchPageContent?: boolean;
   waitFor?: number;
+  screenshot?: boolean;
 };

 export type ExtractorOptions = {
@@ -105,4 +106,9 @@ export class SearchResult {
   toString(): string {
     return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`;
   }
+}
+
+export interface FireEngineResponse {
+  html: string;
+  screenshot: string;
 }
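
Taken together, the two additions above give the scraper a typed way to carry a screenshot alongside HTML. A minimal self-contained sketch of how the shapes fit (the helper function is hypothetical; the empty-string convention for a missing screenshot comes from the fire-engine hunks below):

// Local copies of the new shapes, for illustration only.
interface FireEngineResponse {
  html: string;
  screenshot: string; // "" when no screenshot was requested or returned
}

type PageOptions = {
  onlyMainContent?: boolean;
  includeHtml?: boolean;
  waitFor?: number;
  screenshot?: boolean;
};

// Hypothetical helper: did the response satisfy the caller's screenshot request?
function gotRequestedScreenshot(options: PageOptions, response: FireEngineResponse): boolean {
  return !options.screenshot || response.screenshot.length > 0;
}
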
@@ -2,7 +2,7 @@ import * as cheerio from "cheerio";
 import { ScrapingBeeClient } from "scrapingbee";
 import { extractMetadata } from "./utils/metadata";
 import dotenv from "dotenv";
-import { Document, PageOptions } from "../../lib/entities";
+import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { excludeNonMainTags } from "./utils/excludeTags";
 import { urlSpecificParams } from "./utils/custom/website_params";
@@ -45,40 +45,43 @@ export async function generateRequestParams(
 export async function scrapWithFireEngine(
   url: string,
   waitFor: number = 0,
+  screenshot: boolean = false,
   options?: any
-): Promise<string> {
+): Promise<FireEngineResponse> {
   try {
     const reqParams = await generateRequestParams(url);
     // If the user has passed a wait parameter in the request, use that
     const waitParam = reqParams["params"]?.wait ?? waitFor;
-    console.log(`[Fire-Engine] Scraping ${url} with wait: ${waitParam}`);
+    const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
+    console.log(`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`);

     const response = await fetch(process.env.FIRE_ENGINE_BETA_URL+ "/scrape", {
       method: "POST",
       headers: {
         "Content-Type": "application/json",
       },
-      body: JSON.stringify({ url: url, wait: waitParam }),
+      body: JSON.stringify({ url: url, wait: waitParam, screenshot: screenshotParam }),
     });

     if (!response.ok) {
       console.error(
         `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
       );
-      return "";
+      return { html: "", screenshot: "" };
     }

     const contentType = response.headers['content-type'];
     if (contentType && contentType.includes('application/pdf')) {
-      return fetchAndProcessPdf(url);
+      return { html: await fetchAndProcessPdf(url), screenshot: "" };
     } else {
       const data = await response.json();
       const html = data.content;
-      return html ?? "";
+      const screenshot = data.screenshot;
+      return { html: html ?? "", screenshot: screenshot ?? "" };
     }
   } catch (error) {
     console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
-    return "";
+    return { html: "", screenshot: "" };
   }
 }

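
With this change, errors and PDF pages come back with an empty screenshot string, so callers can branch on screenshot.length. A short caller sketch follows; the declare block mirrors the new signature above, and the logging is illustrative only.

// Illustrative caller for the new return shape.
declare function scrapWithFireEngine(
  url: string,
  waitFor?: number,
  screenshot?: boolean,
  options?: any
): Promise<{ html: string; screenshot: string }>;

async function scrapeAndReport(url: string): Promise<void> {
  const result = await scrapWithFireEngine(url, 0, true); // waitFor = 0, screenshot = true
  if (result.screenshot.length > 0) {
    console.log(`[example] ${url}: html ${result.html.length} chars, screenshot captured`);
  } else {
    // An empty screenshot covers the error, PDF, and screenshot-not-requested paths above.
    console.log(`[example] ${url}: html ${result.html.length} chars, no screenshot`);
  }
}
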
@@ -182,7 +185,7 @@ export async function scrapWithFetch(url: string): Promise<string> {
  * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
  * @returns The order of scrapers to be used for scraping a URL
  */
-function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false) {
+function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false, isScreenshotPresent: boolean = false) {
   const availableScrapers = baseScrapers.filter(scraper => {
     switch (scraper) {
       case "scrapingBee":
@@ -199,7 +202,7 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false) {

   let defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"];

-  if (isWaitPresent) {
+  if (isWaitPresent || isScreenshotPresent) {
     defaultOrder = ["fire-engine", "playwright", ...defaultOrder.filter(scraper => scraper !== "fire-engine" && scraper !== "playwright")];
   }

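
The effect of the new parameter on ordering, shown as a standalone sketch (the real function also filters by configured providers, which is omitted here):

// Standalone sketch of the reordering rule above: a wait OR a screenshot request
// promotes fire-engine and playwright to the front of the fallback order.
function sketchFallbackOrder(isWaitPresent = false, isScreenshotPresent = false): string[] {
  let defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"];
  if (isWaitPresent || isScreenshotPresent) {
    defaultOrder = [
      "fire-engine",
      "playwright",
      ...defaultOrder.filter((s) => s !== "fire-engine" && s !== "playwright"),
    ];
  }
  return defaultOrder;
}

// sketchFallbackOrder()            -> ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"]
// sketchFallbackOrder(false, true) -> ["fire-engine", "playwright", "scrapingBee", "scrapingBeeLoad", "fetch"]
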
@@ -210,7 +213,7 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false) {
   return scrapersInOrder as typeof baseScrapers[number][];
 }

-async function handleCustomScraping(text: string, url: string): Promise<string | null> {
+async function handleCustomScraping(text: string, url: string): Promise<FireEngineResponse | null> {
   if (text.includes('<meta name="readme-deploy"')) {
     console.log(`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`);
     return await scrapWithFireEngine(url, 1000);
@@ -220,7 +223,7 @@ async function handleCustomScraping(text: string, url: string): Promise<string | null> {

 export async function scrapSingleUrl(
   urlToScrap: string,
-  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0},
+  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0, screenshot: false },
   existingHtml: string = ""
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
@@ -242,12 +245,14 @@ export async function scrapSingleUrl(
     method: typeof baseScrapers[number]
   ) => {
     let text = "";
+    let screenshot = "";
     switch (method) {
       case "fire-engine":
         if (process.env.FIRE_ENGINE_BETA_URL) {
           console.log(`Scraping ${url} with Fire Engine`);
-          text = await scrapWithFireEngine(url, pageOptions.waitFor);
+          const response = await scrapWithFireEngine(url, pageOptions.waitFor, pageOptions.screenshot);
+          text = response.html;
+          screenshot = response.screenshot;
         }
         break;
       case "scrapingBee":
@@ -277,16 +282,17 @@ export async function scrapSingleUrl(
     // Check for custom scraping conditions
     const customScrapedContent = await handleCustomScraping(text, url);
     if (customScrapedContent) {
-      text = customScrapedContent;
+      text = customScrapedContent[0];
+      screenshot = customScrapedContent[1];
     }

     //* TODO: add an optional to return markdown or structured/extracted content
     let cleanedHtml = removeUnwantedElements(text, pageOptions);

-    return [await parseMarkdown(cleanedHtml), text];
+    return [await parseMarkdown(cleanedHtml), text, screenshot];
   };
   try {
-    let [text, html] = ["", ""];
+    let [text, html, screenshot] = ["", "", ""];
     let urlKey = urlToScrap;
     try {
       urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
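
attemptScraping now returns a three-element tuple, [markdown, rawHtml, screenshot], and the outer loop destructures it accordingly. A reduced sketch of that flow (the inner scraper is stubbed out; only the tuple shape comes from this diff):

// Reduced sketch of the three-element tuple flow used by scrapSingleUrl.
async function attemptScrapingSketch(url: string): Promise<[string, string, string]> {
  const html = `<html><body><p>content from ${url}</p></body></html>`; // stub scraper result
  const screenshot = ""; // stub: populated only on the fire-engine path
  const markdown = html.replace(/<[^>]+>/g, "").trim(); // stand-in for parseMarkdown
  return [markdown, html, screenshot];
}

async function runSketch(url: string): Promise<void> {
  let [text, html, screenshot] = ["", "", ""];
  [text, html, screenshot] = await attemptScrapingSketch(url);
  console.log({ textLength: text.length, htmlLength: html.length, hasScreenshot: screenshot.length > 0 });
}
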
@@ -294,7 +300,7 @@ export async function scrapSingleUrl(
       console.error(`Invalid URL key, trying: ${urlToScrap}`);
     }
     const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
-    const scrapersInOrder = getScrapingFallbackOrder(defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0)
+    const scrapersInOrder = getScrapingFallbackOrder(defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, pageOptions && pageOptions.screenshot && pageOptions.screenshot === true)

     for (const scraper of scrapersInOrder) {
       // If exists text coming from crawler, use it
@@ -304,7 +310,7 @@ export async function scrapSingleUrl(
         html = existingHtml;
         break;
       }
-      [text, html] = await attemptScraping(urlToScrap, scraper);
+      [text, html, screenshot] = await attemptScraping(urlToScrap, scraper);
       if (text && text.trim().length >= 100) break;
       const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
       if (nextScraperIndex < scrapersInOrder.length) {
@@ -318,12 +324,23 @@ export async function scrapSingleUrl(

     const soup = cheerio.load(html);
     const metadata = extractMetadata(soup, urlToScrap);
-    const document: Document = {
-      content: text,
-      markdown: text,
-      html: pageOptions.includeHtml ? html : undefined,
-      metadata: { ...metadata, sourceURL: urlToScrap },
-    };
+    let document: Document;
+    if(screenshot && screenshot.length > 0) {
+      document = {
+        content: text,
+        markdown: text,
+        html: pageOptions.includeHtml ? html : undefined,
+        metadata: { ...metadata, screenshot: screenshot, sourceURL: urlToScrap, },
+      }
+    }else{
+      document = {
+        content: text,
+        markdown: text,
+        html: pageOptions.includeHtml ? html : undefined,
+        metadata: { ...metadata, sourceURL: urlToScrap, },
+      }
+    }

     return document;
   } catch (error) {
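
So when a screenshot was captured it is surfaced on the returned document under metadata.screenshot; otherwise the metadata is unchanged. A sketch of what a consumer of the returned document might do (the Document type here is simplified to the fields this diff touches; everything else is an assumption):

// Reading the screenshot off a returned document.
type SketchDocument = {
  content: string;
  markdown: string;
  html?: string;
  metadata: { sourceURL: string; screenshot?: string; [key: string]: unknown };
};

function extractScreenshot(doc: SketchDocument): string | null {
  // metadata.screenshot is only present when pageOptions.screenshot was true and the
  // fire-engine scraper actually returned one (see the hunks above).
  return doc.metadata.screenshot && doc.metadata.screenshot.length > 0
    ? doc.metadata.screenshot
    : null;
}
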
@@ -58,5 +58,5 @@ export interface AuthResponse {
   error?: string;
   status?: number;
 }
