0

Merge branch 'main' into feat/better-gdrive-pdf-fetch

This commit is contained in:
Rafael Miller 2024-06-05 14:07:56 -03:00 committed by GitHub
commit 9e000ded03
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 9 additions and 4 deletions

View File

@@ -3,7 +3,7 @@ import { fetchAndProcessPdf } from "../utils/pdfProcessor";
export async function handleCustomScraping(
text: string,
url: string
): Promise<{ scraper: string; url: string; wait_after_load?: number } | null> {
): Promise<{ scraper: string; url: string; waitAfterLoad: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
// Check for Readme Docs special case
if (text.includes('<meta name="readme-deploy"')) {
console.log(
@@ -12,7 +12,10 @@ export async function handleCustomScraping(
return {
scraper: "fire-engine",
url: url,
wait_after_load: 1000,
waitAfterLoad: 1000,
pageOptions: {
scrollXPaths: ['//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]']
}
};
}
@@ -24,7 +27,7 @@ export async function handleCustomScraping(
return {
scraper: "fire-engine",
url: url,
wait_after_load: 3000,
waitAfterLoad: 3000,
};
}

View File

@@ -46,6 +46,7 @@ export async function scrapWithFireEngine(
url: string,
waitFor: number = 0,
screenshot: boolean = false,
pageOptions: { scrollXPaths?: string[] } = {},
headers?: Record<string, string>,
options?: any
): Promise<FireEngineResponse> {
@@ -68,6 +69,7 @@ export async function scrapWithFireEngine(
wait: waitParam,
screenshot: screenshotParam,
headers: headers,
pageOptions: pageOptions
}),
});
@@ -334,7 +336,7 @@ export async function scrapSingleUrl(
if (customScraperResult){
switch (customScraperResult.scraper) {
case "fire-engine":
customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.wait_after_load)
customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.waitAfterLoad, false, customScraperResult.pageOptions)
case "pdf":
customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
}