Merge branch 'main' into feat/better-gdrive-pdf-fetch
This commit is contained in:
commit
9e000ded03
@ -3,7 +3,7 @@ import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
|||||||
export async function handleCustomScraping(
|
export async function handleCustomScraping(
|
||||||
text: string,
|
text: string,
|
||||||
url: string
|
url: string
|
||||||
): Promise<{ scraper: string; url: string; wait_after_load?: number } | null> {
|
): Promise<{ scraper: string; url: string; waitAfterLoad: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
|
||||||
// Check for Readme Docs special case
|
// Check for Readme Docs special case
|
||||||
if (text.includes('<meta name="readme-deploy"')) {
|
if (text.includes('<meta name="readme-deploy"')) {
|
||||||
console.log(
|
console.log(
|
||||||
@ -12,7 +12,10 @@ export async function handleCustomScraping(
|
|||||||
return {
|
return {
|
||||||
scraper: "fire-engine",
|
scraper: "fire-engine",
|
||||||
url: url,
|
url: url,
|
||||||
wait_after_load: 1000,
|
waitAfterLoad: 1000,
|
||||||
|
pageOptions: {
|
||||||
|
scrollXPaths: ['//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]']
|
||||||
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -24,7 +27,7 @@ export async function handleCustomScraping(
|
|||||||
return {
|
return {
|
||||||
scraper: "fire-engine",
|
scraper: "fire-engine",
|
||||||
url: url,
|
url: url,
|
||||||
wait_after_load: 3000,
|
waitAfterLoad: 3000,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -46,6 +46,7 @@ export async function scrapWithFireEngine(
|
|||||||
url: string,
|
url: string,
|
||||||
waitFor: number = 0,
|
waitFor: number = 0,
|
||||||
screenshot: boolean = false,
|
screenshot: boolean = false,
|
||||||
|
pageOptions: { scrollXPaths?: string[] } = {},
|
||||||
headers?: Record<string, string>,
|
headers?: Record<string, string>,
|
||||||
options?: any
|
options?: any
|
||||||
): Promise<FireEngineResponse> {
|
): Promise<FireEngineResponse> {
|
||||||
@ -68,6 +69,7 @@ export async function scrapWithFireEngine(
|
|||||||
wait: waitParam,
|
wait: waitParam,
|
||||||
screenshot: screenshotParam,
|
screenshot: screenshotParam,
|
||||||
headers: headers,
|
headers: headers,
|
||||||
|
pageOptions: pageOptions
|
||||||
}),
|
}),
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -334,7 +336,7 @@ export async function scrapSingleUrl(
|
|||||||
if (customScraperResult){
|
if (customScraperResult){
|
||||||
switch (customScraperResult.scraper) {
|
switch (customScraperResult.scraper) {
|
||||||
case "fire-engine":
|
case "fire-engine":
|
||||||
customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.wait_after_load)
|
customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.waitAfterLoad, false, customScraperResult.pageOptions)
|
||||||
case "pdf":
|
case "pdf":
|
||||||
customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
|
customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user