Merge branch 'main' into feat/better-gdrive-pdf-fetch
commit 9e000ded03
@@ -3,7 +3,7 @@ import { fetchAndProcessPdf } from "../utils/pdfProcessor";
 export async function handleCustomScraping(
   text: string,
   url: string
-): Promise<{ scraper: string; url: string; wait_after_load?: number } | null> {
+): Promise<{ scraper: string; url: string; waitAfterLoad: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
   // Check for Readme Docs special case
   if (text.includes('<meta name="readme-deploy"')) {
     console.log(
@@ -12,7 +12,10 @@ export async function handleCustomScraping(
     return {
       scraper: "fire-engine",
       url: url,
-      wait_after_load: 1000,
+      waitAfterLoad: 1000,
+      pageOptions: {
+        scrollXPaths: ['//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]']
+      }
     };
   }
 
@@ -24,7 +27,7 @@ export async function handleCustomScraping(
     return {
       scraper: "fire-engine",
       url: url,
-      wait_after_load: 3000,
+      waitAfterLoad: 3000,
     };
   }
 
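For context on what these hunks change: handleCustomScraping now returns a camelCase waitAfterLoad plus an optional pageOptions object, so the Readme Docs case can tell the downstream scraper which XPaths to scroll before capture. A minimal self-contained sketch of that branch follows; the CustomScraperResult alias and the Sketch-suffixed function name are hypothetical (the real code inlines the type in the signature and keeps the original name):

type CustomScraperResult = {
  scraper: string;
  url: string;
  waitAfterLoad: number;
  pageOptions?: { scrollXPaths?: string[] };
};

// Hypothetical stand-in for the readme-deploy branch shown in the hunks above.
function handleCustomScrapingSketch(text: string, url: string): CustomScraperResult | null {
  if (text.includes('<meta name="readme-deploy"')) {
    // Route Readme Docs pages to fire-engine and point it at the API
    // playground element so its lazily rendered content can be scrolled
    // into view before the page is captured.
    return {
      scraper: "fire-engine",
      url,
      waitAfterLoad: 1000,
      pageOptions: {
        scrollXPaths: ['//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]'],
      },
    };
  }
  return null;
}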
@@ -46,6 +46,7 @@ export async function scrapWithFireEngine(
   url: string,
   waitFor: number = 0,
   screenshot: boolean = false,
+  pageOptions: { scrollXPaths?: string[] } = {},
   headers?: Record<string, string>,
   options?: any
 ): Promise<FireEngineResponse> {
@@ -68,6 +69,7 @@ export async function scrapWithFireEngine(
       wait: waitParam,
       screenshot: screenshotParam,
       headers: headers,
+      pageOptions: pageOptions
     }),
   });
 
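scrapWithFireEngine gains a pageOptions parameter (defaulting to an empty object) and forwards it verbatim in the JSON body of the fire-engine request. A rough sketch of that flow, assuming a fetch-based client; the endpoint URL, the Sketch-suffixed name, and the response handling are illustrative assumptions, not taken from this diff:

// Illustrative only: the endpoint URL and error handling are assumptions.
async function postToFireEngineSketch(
  url: string,
  waitParam: number,
  screenshotParam: boolean,
  pageOptions: { scrollXPaths?: string[] } = {},
  headers?: Record<string, string>
): Promise<unknown> {
  const fireEngineUrl = "http://localhost:3000/scrape"; // assumed endpoint
  const response = await fetch(fireEngineUrl, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      url: url,
      wait: waitParam,
      screenshot: screenshotParam,
      headers: headers,
      // Forwarded unchanged so fire-engine can act on scrollXPaths
      // (e.g. scroll those elements into view) before capture.
      pageOptions: pageOptions,
    }),
  });
  return response.json();
}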
@@ -334,7 +336,7 @@ export async function scrapSingleUrl(
     if (customScraperResult){
       switch (customScraperResult.scraper) {
         case "fire-engine":
-          customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.wait_after_load)
+          customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.waitAfterLoad, false, customScraperResult.pageOptions)
         case "pdf":
           customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
       }
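Finally, the call site in scrapSingleUrl passes the renamed waitAfterLoad and the new pageOptions through to scrapWithFireEngine, with screenshots explicitly disabled on this path. A sketch of that dispatch under the new signatures; the functions are re-declared here (with a simplified FireEngineResponse shape) only to keep the sketch self-contained, since the real import paths are not visible in this diff:

// Declarations mirror the signatures in the hunks above; the response shape
// is simplified to the fields this sketch actually touches.
declare function handleCustomScraping(
  text: string,
  url: string
): Promise<{ scraper: string; url: string; waitAfterLoad: number; pageOptions?: { scrollXPaths?: string[] } } | null>;
declare function scrapWithFireEngine(
  url: string,
  waitFor?: number,
  screenshot?: boolean,
  pageOptions?: { scrollXPaths?: string[] }
): Promise<{ html: string; screenshot: string }>;
declare function fetchAndProcessPdf(url: string): Promise<string>;

async function dispatchCustomScrapeSketch(text: string, url: string) {
  const customScraperResult = await handleCustomScraping(text, url);
  if (!customScraperResult) return null;

  switch (customScraperResult.scraper) {
    case "fire-engine":
      // waitAfterLoad and pageOptions both come from handleCustomScraping;
      // `false` keeps screenshots off for this path.
      return scrapWithFireEngine(
        customScraperResult.url,
        customScraperResult.waitAfterLoad,
        false,
        customScraperResult.pageOptions
      );
    case "pdf":
      // The real code reuses an in-scope `screenshot` value here; an empty
      // string stands in for it so the sketch compiles on its own.
      return { html: await fetchAndProcessPdf(customScraperResult.url), screenshot: "" };
    default:
      return null;
  }
}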