0

Merge branch 'main' into feat/better-gdrive-pdf-fetch

This commit is contained in:
Rafael Miller 2024-06-05 14:07:56 -03:00 committed by GitHub
commit 9e000ded03
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 9 additions and 4 deletions

View File

@ -3,7 +3,7 @@ import { fetchAndProcessPdf } from "../utils/pdfProcessor";
export async function handleCustomScraping( export async function handleCustomScraping(
text: string, text: string,
url: string url: string
): Promise<{ scraper: string; url: string; wait_after_load?: number } | null> { ): Promise<{ scraper: string; url: string; waitAfterLoad: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
// Check for Readme Docs special case // Check for Readme Docs special case
if (text.includes('<meta name="readme-deploy"')) { if (text.includes('<meta name="readme-deploy"')) {
console.log( console.log(
@ -12,7 +12,10 @@ export async function handleCustomScraping(
return { return {
scraper: "fire-engine", scraper: "fire-engine",
url: url, url: url,
wait_after_load: 1000, waitAfterLoad: 1000,
pageOptions: {
scrollXPaths: ['//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]']
}
}; };
} }
@ -24,7 +27,7 @@ export async function handleCustomScraping(
return { return {
scraper: "fire-engine", scraper: "fire-engine",
url: url, url: url,
wait_after_load: 3000, waitAfterLoad: 3000,
}; };
} }

View File

@ -46,6 +46,7 @@ export async function scrapWithFireEngine(
url: string, url: string,
waitFor: number = 0, waitFor: number = 0,
screenshot: boolean = false, screenshot: boolean = false,
pageOptions: { scrollXPaths?: string[] } = {},
headers?: Record<string, string>, headers?: Record<string, string>,
options?: any options?: any
): Promise<FireEngineResponse> { ): Promise<FireEngineResponse> {
@ -68,6 +69,7 @@ export async function scrapWithFireEngine(
wait: waitParam, wait: waitParam,
screenshot: screenshotParam, screenshot: screenshotParam,
headers: headers, headers: headers,
pageOptions: pageOptions
}), }),
}); });
@ -334,7 +336,7 @@ export async function scrapSingleUrl(
if (customScraperResult){ if (customScraperResult){
switch (customScraperResult.scraper) { switch (customScraperResult.scraper) {
case "fire-engine": case "fire-engine":
customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.wait_after_load) customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.waitAfterLoad, false, customScraperResult.pageOptions)
case "pdf": case "pdf":
customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot } customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
} }