Nick:
This commit is contained in:
parent
2139129296
commit
6bea803120
@ -17,6 +17,7 @@ export type PageOptions = {
|
|||||||
fetchPageContent?: boolean;
|
fetchPageContent?: boolean;
|
||||||
waitFor?: number;
|
waitFor?: number;
|
||||||
screenshot?: boolean;
|
screenshot?: boolean;
|
||||||
|
headers?: Record<string, string>;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ExtractorOptions = {
|
export type ExtractorOptions = {
|
||||||
|
@ -18,7 +18,6 @@ const baseScrapers = [
|
|||||||
"fetch",
|
"fetch",
|
||||||
] as const;
|
] as const;
|
||||||
|
|
||||||
|
|
||||||
export async function generateRequestParams(
|
export async function generateRequestParams(
|
||||||
url: string,
|
url: string,
|
||||||
wait_browser: string = "domcontentloaded",
|
wait_browser: string = "domcontentloaded",
|
||||||
@ -46,6 +45,7 @@ export async function scrapWithFireEngine(
|
|||||||
url: string,
|
url: string,
|
||||||
waitFor: number = 0,
|
waitFor: number = 0,
|
||||||
screenshot: boolean = false,
|
screenshot: boolean = false,
|
||||||
|
headers?: Record<string, string>,
|
||||||
options?: any
|
options?: any
|
||||||
): Promise<FireEngineResponse> {
|
): Promise<FireEngineResponse> {
|
||||||
try {
|
try {
|
||||||
@ -53,14 +53,21 @@ export async function scrapWithFireEngine(
|
|||||||
// If the user has passed a wait parameter in the request, use that
|
// If the user has passed a wait parameter in the request, use that
|
||||||
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||||
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||||
console.log(`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`);
|
console.log(
|
||||||
|
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`
|
||||||
|
);
|
||||||
|
|
||||||
const response = await fetch(process.env.FIRE_ENGINE_BETA_URL+ "/scrape", {
|
const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: {
|
headers: {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
},
|
},
|
||||||
body: JSON.stringify({ url: url, wait: waitParam, screenshot: screenshotParam }),
|
body: JSON.stringify({
|
||||||
|
url: url,
|
||||||
|
wait: waitParam,
|
||||||
|
screenshot: screenshotParam,
|
||||||
|
headers: headers,
|
||||||
|
}),
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!response.ok) {
|
if (!response.ok) {
|
||||||
@ -70,8 +77,8 @@ export async function scrapWithFireEngine(
|
|||||||
return { html: "", screenshot: "" };
|
return { html: "", screenshot: "" };
|
||||||
}
|
}
|
||||||
|
|
||||||
const contentType = response.headers['content-type'];
|
const contentType = response.headers["content-type"];
|
||||||
if (contentType && contentType.includes('application/pdf')) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
return { html: await fetchAndProcessPdf(url), screenshot: "" };
|
return { html: await fetchAndProcessPdf(url), screenshot: "" };
|
||||||
} else {
|
} else {
|
||||||
const data = await response.json();
|
const data = await response.json();
|
||||||
@ -106,9 +113,9 @@ export async function scrapWithScrapingBee(
|
|||||||
);
|
);
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
const contentType = response.headers['content-type'];
|
const contentType = response.headers["content-type"];
|
||||||
if (contentType && contentType.includes('application/pdf')) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
return fetchAndProcessPdf(url);
|
return fetchAndProcessPdf(url);
|
||||||
} else {
|
} else {
|
||||||
const decoder = new TextDecoder();
|
const decoder = new TextDecoder();
|
||||||
@ -121,7 +128,10 @@ export async function scrapWithScrapingBee(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function scrapWithPlaywright(url: string, waitFor: number = 0): Promise<string> {
|
export async function scrapWithPlaywright(
|
||||||
|
url: string,
|
||||||
|
waitFor: number = 0
|
||||||
|
): Promise<string> {
|
||||||
try {
|
try {
|
||||||
const reqParams = await generateRequestParams(url);
|
const reqParams = await generateRequestParams(url);
|
||||||
// If the user has passed a wait parameter in the request, use that
|
// If the user has passed a wait parameter in the request, use that
|
||||||
@ -142,8 +152,8 @@ export async function scrapWithPlaywright(url: string, waitFor: number = 0): Pro
|
|||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
const contentType = response.headers['content-type'];
|
const contentType = response.headers["content-type"];
|
||||||
if (contentType && contentType.includes('application/pdf')) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
return fetchAndProcessPdf(url);
|
return fetchAndProcessPdf(url);
|
||||||
} else {
|
} else {
|
||||||
const data = await response.json();
|
const data = await response.json();
|
||||||
@ -166,8 +176,8 @@ export async function scrapWithFetch(url: string): Promise<string> {
|
|||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
const contentType = response.headers['content-type'];
|
const contentType = response.headers["content-type"];
|
||||||
if (contentType && contentType.includes('application/pdf')) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
return fetchAndProcessPdf(url);
|
return fetchAndProcessPdf(url);
|
||||||
} else {
|
} else {
|
||||||
const text = await response.text();
|
const text = await response.text();
|
||||||
@ -185,8 +195,13 @@ export async function scrapWithFetch(url: string): Promise<string> {
|
|||||||
* @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
|
* @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
|
||||||
* @returns The order of scrapers to be used for scraping a URL
|
* @returns The order of scrapers to be used for scraping a URL
|
||||||
*/
|
*/
|
||||||
function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolean = false, isScreenshotPresent: boolean = false) {
|
function getScrapingFallbackOrder(
|
||||||
const availableScrapers = baseScrapers.filter(scraper => {
|
defaultScraper?: string,
|
||||||
|
isWaitPresent: boolean = false,
|
||||||
|
isScreenshotPresent: boolean = false,
|
||||||
|
isHeadersPresent: boolean = false
|
||||||
|
) {
|
||||||
|
const availableScrapers = baseScrapers.filter((scraper) => {
|
||||||
switch (scraper) {
|
switch (scraper) {
|
||||||
case "scrapingBee":
|
case "scrapingBee":
|
||||||
case "scrapingBeeLoad":
|
case "scrapingBeeLoad":
|
||||||
@ -200,22 +215,46 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
let defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"];
|
let defaultOrder = [
|
||||||
|
"scrapingBee",
|
||||||
if (isWaitPresent || isScreenshotPresent) {
|
"fire-engine",
|
||||||
defaultOrder = ["fire-engine", "playwright", ...defaultOrder.filter(scraper => scraper !== "fire-engine" && scraper !== "playwright")];
|
"playwright",
|
||||||
|
"scrapingBeeLoad",
|
||||||
|
"fetch",
|
||||||
|
];
|
||||||
|
|
||||||
|
if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
|
||||||
|
defaultOrder = [
|
||||||
|
"fire-engine",
|
||||||
|
"playwright",
|
||||||
|
...defaultOrder.filter(
|
||||||
|
(scraper) => scraper !== "fire-engine" && scraper !== "playwright"
|
||||||
|
),
|
||||||
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
const filteredDefaultOrder = defaultOrder.filter((scraper: typeof baseScrapers[number]) => availableScrapers.includes(scraper));
|
const filteredDefaultOrder = defaultOrder.filter(
|
||||||
const uniqueScrapers = new Set(defaultScraper ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers] : [...filteredDefaultOrder, ...availableScrapers]);
|
(scraper: (typeof baseScrapers)[number]) =>
|
||||||
|
availableScrapers.includes(scraper)
|
||||||
|
);
|
||||||
|
const uniqueScrapers = new Set(
|
||||||
|
defaultScraper
|
||||||
|
? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers]
|
||||||
|
: [...filteredDefaultOrder, ...availableScrapers]
|
||||||
|
);
|
||||||
const scrapersInOrder = Array.from(uniqueScrapers);
|
const scrapersInOrder = Array.from(uniqueScrapers);
|
||||||
console.log(`Scrapers in order: ${scrapersInOrder}`);
|
console.log(`Scrapers in order: ${scrapersInOrder}`);
|
||||||
return scrapersInOrder as typeof baseScrapers[number][];
|
return scrapersInOrder as (typeof baseScrapers)[number][];
|
||||||
}
|
}
|
||||||
|
|
||||||
async function handleCustomScraping(text: string, url: string): Promise<FireEngineResponse | null> {
|
async function handleCustomScraping(
|
||||||
|
text: string,
|
||||||
|
url: string
|
||||||
|
): Promise<FireEngineResponse | null> {
|
||||||
if (text.includes('<meta name="readme-deploy"')) {
|
if (text.includes('<meta name="readme-deploy"')) {
|
||||||
console.log(`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`);
|
console.log(
|
||||||
|
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
|
||||||
|
);
|
||||||
return await scrapWithFireEngine(url, 1000);
|
return await scrapWithFireEngine(url, 1000);
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
@ -223,7 +262,12 @@ async function handleCustomScraping(text: string, url: string): Promise<FireEngi
|
|||||||
|
|
||||||
export async function scrapSingleUrl(
|
export async function scrapSingleUrl(
|
||||||
urlToScrap: string,
|
urlToScrap: string,
|
||||||
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0, screenshot: false },
|
pageOptions: PageOptions = {
|
||||||
|
onlyMainContent: true,
|
||||||
|
includeHtml: false,
|
||||||
|
waitFor: 0,
|
||||||
|
screenshot: false,
|
||||||
|
},
|
||||||
existingHtml: string = ""
|
existingHtml: string = ""
|
||||||
): Promise<Document> {
|
): Promise<Document> {
|
||||||
urlToScrap = urlToScrap.trim();
|
urlToScrap = urlToScrap.trim();
|
||||||
@ -242,7 +286,7 @@ export async function scrapSingleUrl(
|
|||||||
|
|
||||||
const attemptScraping = async (
|
const attemptScraping = async (
|
||||||
url: string,
|
url: string,
|
||||||
method: typeof baseScrapers[number]
|
method: (typeof baseScrapers)[number]
|
||||||
) => {
|
) => {
|
||||||
let text = "";
|
let text = "";
|
||||||
let screenshot = "";
|
let screenshot = "";
|
||||||
@ -250,7 +294,12 @@ export async function scrapSingleUrl(
|
|||||||
case "fire-engine":
|
case "fire-engine":
|
||||||
if (process.env.FIRE_ENGINE_BETA_URL) {
|
if (process.env.FIRE_ENGINE_BETA_URL) {
|
||||||
console.log(`Scraping ${url} with Fire Engine`);
|
console.log(`Scraping ${url} with Fire Engine`);
|
||||||
const response = await scrapWithFireEngine(url, pageOptions.waitFor, pageOptions.screenshot);
|
const response = await scrapWithFireEngine(
|
||||||
|
url,
|
||||||
|
pageOptions.waitFor,
|
||||||
|
pageOptions.screenshot,
|
||||||
|
pageOptions.headers
|
||||||
|
);
|
||||||
text = response.html;
|
text = response.html;
|
||||||
screenshot = response.screenshot;
|
screenshot = response.screenshot;
|
||||||
}
|
}
|
||||||
@ -300,7 +349,12 @@ export async function scrapSingleUrl(
|
|||||||
console.error(`Invalid URL key, trying: ${urlToScrap}`);
|
console.error(`Invalid URL key, trying: ${urlToScrap}`);
|
||||||
}
|
}
|
||||||
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
|
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
|
||||||
const scrapersInOrder = getScrapingFallbackOrder(defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, pageOptions && pageOptions.screenshot && pageOptions.screenshot === true)
|
const scrapersInOrder = getScrapingFallbackOrder(
|
||||||
|
defaultScraper,
|
||||||
|
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
||||||
|
pageOptions && pageOptions.screenshot && pageOptions.screenshot === true,
|
||||||
|
pageOptions && pageOptions.headers && pageOptions.headers !== undefined
|
||||||
|
);
|
||||||
|
|
||||||
for (const scraper of scrapersInOrder) {
|
for (const scraper of scrapersInOrder) {
|
||||||
// If exists text coming from crawler, use it
|
// If exists text coming from crawler, use it
|
||||||
@ -326,20 +380,24 @@ export async function scrapSingleUrl(
|
|||||||
const metadata = extractMetadata(soup, urlToScrap);
|
const metadata = extractMetadata(soup, urlToScrap);
|
||||||
|
|
||||||
let document: Document;
|
let document: Document;
|
||||||
if(screenshot && screenshot.length > 0) {
|
if (screenshot && screenshot.length > 0) {
|
||||||
document = {
|
document = {
|
||||||
content: text,
|
content: text,
|
||||||
markdown: text,
|
markdown: text,
|
||||||
html: pageOptions.includeHtml ? html : undefined,
|
html: pageOptions.includeHtml ? html : undefined,
|
||||||
metadata: { ...metadata, screenshot: screenshot, sourceURL: urlToScrap, },
|
metadata: {
|
||||||
}
|
...metadata,
|
||||||
}else{
|
screenshot: screenshot,
|
||||||
|
sourceURL: urlToScrap,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
} else {
|
||||||
document = {
|
document = {
|
||||||
content: text,
|
content: text,
|
||||||
markdown: text,
|
markdown: text,
|
||||||
html: pageOptions.includeHtml ? html : undefined,
|
html: pageOptions.includeHtml ? html : undefined,
|
||||||
metadata: { ...metadata, sourceURL: urlToScrap, },
|
metadata: { ...metadata, sourceURL: urlToScrap },
|
||||||
}
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
return document;
|
return document;
|
||||||
|
@ -4,10 +4,10 @@ from fastapi.responses import JSONResponse
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from os import environ
|
from os import environ
|
||||||
|
|
||||||
PROXY_SERVER = environ.get('PROXY_SERVER', None)
|
PROXY_SERVER = environ.get("PROXY_SERVER", None)
|
||||||
PROXY_USERNAME = environ.get('PROXY_USERNAME', None)
|
PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
|
||||||
PROXY_PASSWORD = environ.get('PROXY_PASSWORD', None)
|
PROXY_PASSWORD = environ.get("PROXY_PASSWORD", None)
|
||||||
BLOCK_MEDIA = environ.get('BLOCK_MEDIA', 'False').upper() == 'TRUE'
|
BLOCK_MEDIA = environ.get("BLOCK_MEDIA", "False").upper() == "TRUE"
|
||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
|
|
||||||
@ -15,6 +15,8 @@ app = FastAPI()
|
|||||||
class UrlModel(BaseModel):
|
class UrlModel(BaseModel):
|
||||||
url: str
|
url: str
|
||||||
wait: int = None
|
wait: int = None
|
||||||
|
wait_until: str = "load"
|
||||||
|
headers: dict = None
|
||||||
|
|
||||||
|
|
||||||
browser: Browser = None
|
browser: Browser = None
|
||||||
@ -36,26 +38,37 @@ async def shutdown_event():
|
|||||||
async def root(body: UrlModel):
|
async def root(body: UrlModel):
|
||||||
context = None
|
context = None
|
||||||
if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
|
if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
|
||||||
context = await browser.new_context(proxy={"server": PROXY_SERVER,
|
context = await browser.new_context(
|
||||||
"username": PROXY_USERNAME,
|
proxy={
|
||||||
"password": PROXY_PASSWORD})
|
"server": PROXY_SERVER,
|
||||||
|
"username": PROXY_USERNAME,
|
||||||
|
"password": PROXY_PASSWORD,
|
||||||
|
}
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
context = await browser.new_context()
|
context = await browser.new_context()
|
||||||
|
|
||||||
if BLOCK_MEDIA:
|
if BLOCK_MEDIA:
|
||||||
await context.route("**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}",
|
await context.route(
|
||||||
handler=lambda route, request: route.abort())
|
"**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}",
|
||||||
|
handler=lambda route, request: route.abort(),
|
||||||
|
)
|
||||||
|
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
|
|
||||||
|
# Set headers if provided
|
||||||
|
if body.headers:
|
||||||
|
await page.set_extra_http_headers(body.headers)
|
||||||
|
|
||||||
await page.goto(
|
await page.goto(
|
||||||
body.url,
|
body.url,
|
||||||
wait_until="load",
|
timeout=15000,
|
||||||
timeout=body.timeout if body.timeout else 15000,
|
wait_until=body.wait_until if body.wait_until else "load",
|
||||||
)
|
) # Set max timeout to 15s
|
||||||
# Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases were "load" / "networkidle" is not enough
|
if body.wait: # Check if wait parameter is provided in the request body
|
||||||
if body.wait:
|
await page.wait_for_timeout(
|
||||||
await page.wait_for_timeout(body.wait)
|
body.wait
|
||||||
|
) # Convert seconds to milliseconds for playwright
|
||||||
page_content = await page.content()
|
page_content = await page.content()
|
||||||
await context.close()
|
await context.close()
|
||||||
json_compatible_item_data = {"content": page_content}
|
json_compatible_item_data = {"content": page_content}
|
||||||
|
Loading…
Reference in New Issue
Block a user