0
This commit is contained in:
Nicolas 2024-05-31 15:39:54 -07:00
parent 2139129296
commit 6bea803120
3 changed files with 123 additions and 51 deletions

View File

@ -17,6 +17,7 @@ export type PageOptions = {
fetchPageContent?: boolean; fetchPageContent?: boolean;
waitFor?: number; waitFor?: number;
screenshot?: boolean; screenshot?: boolean;
headers?: Record<string, string>;
}; };
export type ExtractorOptions = { export type ExtractorOptions = {

View File

@ -18,7 +18,6 @@ const baseScrapers = [
"fetch", "fetch",
] as const; ] as const;
export async function generateRequestParams( export async function generateRequestParams(
url: string, url: string,
wait_browser: string = "domcontentloaded", wait_browser: string = "domcontentloaded",
@ -46,6 +45,7 @@ export async function scrapWithFireEngine(
url: string, url: string,
waitFor: number = 0, waitFor: number = 0,
screenshot: boolean = false, screenshot: boolean = false,
headers?: Record<string, string>,
options?: any options?: any
): Promise<FireEngineResponse> { ): Promise<FireEngineResponse> {
try { try {
@ -53,14 +53,21 @@ export async function scrapWithFireEngine(
// If the user has passed a wait parameter in the request, use that // If the user has passed a wait parameter in the request, use that
const waitParam = reqParams["params"]?.wait ?? waitFor; const waitParam = reqParams["params"]?.wait ?? waitFor;
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
console.log(`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`); console.log(
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`
);
const response = await fetch(process.env.FIRE_ENGINE_BETA_URL+ "/scrape", { const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", {
method: "POST", method: "POST",
headers: { headers: {
"Content-Type": "application/json", "Content-Type": "application/json",
}, },
body: JSON.stringify({ url: url, wait: waitParam, screenshot: screenshotParam }), body: JSON.stringify({
url: url,
wait: waitParam,
screenshot: screenshotParam,
headers: headers,
}),
}); });
if (!response.ok) { if (!response.ok) {
@ -70,8 +77,8 @@ export async function scrapWithFireEngine(
return { html: "", screenshot: "" }; return { html: "", screenshot: "" };
} }
const contentType = response.headers['content-type']; const contentType = response.headers["content-type"];
if (contentType && contentType.includes('application/pdf')) { if (contentType && contentType.includes("application/pdf")) {
return { html: await fetchAndProcessPdf(url), screenshot: "" }; return { html: await fetchAndProcessPdf(url), screenshot: "" };
} else { } else {
const data = await response.json(); const data = await response.json();
@ -106,9 +113,9 @@ export async function scrapWithScrapingBee(
); );
return ""; return "";
} }
const contentType = response.headers['content-type']; const contentType = response.headers["content-type"];
if (contentType && contentType.includes('application/pdf')) { if (contentType && contentType.includes("application/pdf")) {
return fetchAndProcessPdf(url); return fetchAndProcessPdf(url);
} else { } else {
const decoder = new TextDecoder(); const decoder = new TextDecoder();
@ -121,7 +128,10 @@ export async function scrapWithScrapingBee(
} }
} }
export async function scrapWithPlaywright(url: string, waitFor: number = 0): Promise<string> { export async function scrapWithPlaywright(
url: string,
waitFor: number = 0
): Promise<string> {
try { try {
const reqParams = await generateRequestParams(url); const reqParams = await generateRequestParams(url);
// If the user has passed a wait parameter in the request, use that // If the user has passed a wait parameter in the request, use that
@ -142,8 +152,8 @@ export async function scrapWithPlaywright(url: string, waitFor: number = 0): Pro
return ""; return "";
} }
const contentType = response.headers['content-type']; const contentType = response.headers["content-type"];
if (contentType && contentType.includes('application/pdf')) { if (contentType && contentType.includes("application/pdf")) {
return fetchAndProcessPdf(url); return fetchAndProcessPdf(url);
} else { } else {
const data = await response.json(); const data = await response.json();
@ -166,8 +176,8 @@ export async function scrapWithFetch(url: string): Promise<string> {
return ""; return "";
} }
const contentType = response.headers['content-type']; const contentType = response.headers["content-type"];
if (contentType && contentType.includes('application/pdf')) { if (contentType && contentType.includes("application/pdf")) {
return fetchAndProcessPdf(url); return fetchAndProcessPdf(url);
} else { } else {
const text = await response.text(); const text = await response.text();
@ -185,8 +195,13 @@ export async function scrapWithFetch(url: string): Promise<string> {
* @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
* @returns The order of scrapers to be used for scraping a URL * @returns The order of scrapers to be used for scraping a URL
*/ */
function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false, isScreenshotPresent: boolean = false) { function getScrapingFallbackOrder(
const availableScrapers = baseScrapers.filter(scraper => { defaultScraper?: string,
isWaitPresent: boolean = false,
isScreenshotPresent: boolean = false,
isHeadersPresent: boolean = false
) {
const availableScrapers = baseScrapers.filter((scraper) => {
switch (scraper) { switch (scraper) {
case "scrapingBee": case "scrapingBee":
case "scrapingBeeLoad": case "scrapingBeeLoad":
@ -200,22 +215,46 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea
} }
}); });
let defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"]; let defaultOrder = [
"scrapingBee",
if (isWaitPresent || isScreenshotPresent) { "fire-engine",
defaultOrder = ["fire-engine", "playwright", ...defaultOrder.filter(scraper => scraper !== "fire-engine" && scraper !== "playwright")]; "playwright",
"scrapingBeeLoad",
"fetch",
];
if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
defaultOrder = [
"fire-engine",
"playwright",
...defaultOrder.filter(
(scraper) => scraper !== "fire-engine" && scraper !== "playwright"
),
];
} }
const filteredDefaultOrder = defaultOrder.filter((scraper: typeof baseScrapers[number]) => availableScrapers.includes(scraper)); const filteredDefaultOrder = defaultOrder.filter(
const uniqueScrapers = new Set(defaultScraper ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers] : [...filteredDefaultOrder, ...availableScrapers]); (scraper: (typeof baseScrapers)[number]) =>
availableScrapers.includes(scraper)
);
const uniqueScrapers = new Set(
defaultScraper
? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers]
: [...filteredDefaultOrder, ...availableScrapers]
);
const scrapersInOrder = Array.from(uniqueScrapers); const scrapersInOrder = Array.from(uniqueScrapers);
console.log(`Scrapers in order: ${scrapersInOrder}`); console.log(`Scrapers in order: ${scrapersInOrder}`);
return scrapersInOrder as typeof baseScrapers[number][]; return scrapersInOrder as (typeof baseScrapers)[number][];
} }
async function handleCustomScraping(text: string, url: string): Promise<FireEngineResponse | null> { async function handleCustomScraping(
text: string,
url: string
): Promise<FireEngineResponse | null> {
if (text.includes('<meta name="readme-deploy"')) { if (text.includes('<meta name="readme-deploy"')) {
console.log(`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`); console.log(
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
);
return await scrapWithFireEngine(url, 1000); return await scrapWithFireEngine(url, 1000);
} }
return null; return null;
@ -223,7 +262,12 @@ async function handleCustomScraping(text: string, url: string): Promise<FireEngi
export async function scrapSingleUrl( export async function scrapSingleUrl(
urlToScrap: string, urlToScrap: string,
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0, screenshot: false }, pageOptions: PageOptions = {
onlyMainContent: true,
includeHtml: false,
waitFor: 0,
screenshot: false,
},
existingHtml: string = "" existingHtml: string = ""
): Promise<Document> { ): Promise<Document> {
urlToScrap = urlToScrap.trim(); urlToScrap = urlToScrap.trim();
@ -242,7 +286,7 @@ export async function scrapSingleUrl(
const attemptScraping = async ( const attemptScraping = async (
url: string, url: string,
method: typeof baseScrapers[number] method: (typeof baseScrapers)[number]
) => { ) => {
let text = ""; let text = "";
let screenshot = ""; let screenshot = "";
@ -250,7 +294,12 @@ export async function scrapSingleUrl(
case "fire-engine": case "fire-engine":
if (process.env.FIRE_ENGINE_BETA_URL) { if (process.env.FIRE_ENGINE_BETA_URL) {
console.log(`Scraping ${url} with Fire Engine`); console.log(`Scraping ${url} with Fire Engine`);
const response = await scrapWithFireEngine(url, pageOptions.waitFor, pageOptions.screenshot); const response = await scrapWithFireEngine(
url,
pageOptions.waitFor,
pageOptions.screenshot,
pageOptions.headers
);
text = response.html; text = response.html;
screenshot = response.screenshot; screenshot = response.screenshot;
} }
@ -300,7 +349,12 @@ export async function scrapSingleUrl(
console.error(`Invalid URL key, trying: ${urlToScrap}`); console.error(`Invalid URL key, trying: ${urlToScrap}`);
} }
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? ""; const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
const scrapersInOrder = getScrapingFallbackOrder(defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, pageOptions && pageOptions.screenshot && pageOptions.screenshot === true) const scrapersInOrder = getScrapingFallbackOrder(
defaultScraper,
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
pageOptions && pageOptions.screenshot && pageOptions.screenshot === true,
pageOptions && pageOptions.headers && pageOptions.headers !== undefined
);
for (const scraper of scrapersInOrder) { for (const scraper of scrapersInOrder) {
// If exists text coming from crawler, use it // If exists text coming from crawler, use it
@ -326,20 +380,24 @@ export async function scrapSingleUrl(
const metadata = extractMetadata(soup, urlToScrap); const metadata = extractMetadata(soup, urlToScrap);
let document: Document; let document: Document;
if(screenshot && screenshot.length > 0) { if (screenshot && screenshot.length > 0) {
document = { document = {
content: text, content: text,
markdown: text, markdown: text,
html: pageOptions.includeHtml ? html : undefined, html: pageOptions.includeHtml ? html : undefined,
metadata: { ...metadata, screenshot: screenshot, sourceURL: urlToScrap, }, metadata: {
} ...metadata,
}else{ screenshot: screenshot,
sourceURL: urlToScrap,
},
};
} else {
document = { document = {
content: text, content: text,
markdown: text, markdown: text,
html: pageOptions.includeHtml ? html : undefined, html: pageOptions.includeHtml ? html : undefined,
metadata: { ...metadata, sourceURL: urlToScrap, }, metadata: { ...metadata, sourceURL: urlToScrap },
} };
} }
return document; return document;

View File

@ -4,10 +4,10 @@ from fastapi.responses import JSONResponse
from pydantic import BaseModel from pydantic import BaseModel
from os import environ from os import environ
PROXY_SERVER = environ.get('PROXY_SERVER', None) PROXY_SERVER = environ.get("PROXY_SERVER", None)
PROXY_USERNAME = environ.get('PROXY_USERNAME', None) PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
PROXY_PASSWORD = environ.get('PROXY_PASSWORD', None) PROXY_PASSWORD = environ.get("PROXY_PASSWORD", None)
BLOCK_MEDIA = environ.get('BLOCK_MEDIA', 'False').upper() == 'TRUE' BLOCK_MEDIA = environ.get("BLOCK_MEDIA", "False").upper() == "TRUE"
app = FastAPI() app = FastAPI()
@ -15,6 +15,8 @@ app = FastAPI()
class UrlModel(BaseModel): class UrlModel(BaseModel):
url: str url: str
wait: int = None wait: int = None
wait_until: str = "load"
headers: dict = None
browser: Browser = None browser: Browser = None
@ -36,26 +38,37 @@ async def shutdown_event():
async def root(body: UrlModel): async def root(body: UrlModel):
context = None context = None
if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD: if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
context = await browser.new_context(proxy={"server": PROXY_SERVER, context = await browser.new_context(
"username": PROXY_USERNAME, proxy={
"password": PROXY_PASSWORD}) "server": PROXY_SERVER,
"username": PROXY_USERNAME,
"password": PROXY_PASSWORD,
}
)
else: else:
context = await browser.new_context() context = await browser.new_context()
if BLOCK_MEDIA: if BLOCK_MEDIA:
await context.route("**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}", await context.route(
handler=lambda route, request: route.abort()) "**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}",
handler=lambda route, request: route.abort(),
)
page = await context.new_page() page = await context.new_page()
# Set headers if provided
if body.headers:
await page.set_extra_http_headers(body.headers)
await page.goto( await page.goto(
body.url, body.url,
wait_until="load", timeout=15000,
timeout=body.timeout if body.timeout else 15000, wait_until=body.wait_until if body.wait_until else "load",
) ) # Set max timeout to 15s
# Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases where "load" / "networkidle" is not enough if body.wait: # Check if wait parameter is provided in the request body
if body.wait: await page.wait_for_timeout(
await page.wait_for_timeout(body.wait) body.wait
) # body.wait is passed through unchanged; wait_for_timeout expects milliseconds, so no unit conversion happens here
page_content = await page.content() page_content = await page.content()
await context.close() await context.close()
json_compatible_item_data = {"content": page_content} json_compatible_item_data = {"content": page_content}