0

Merge pull request #239 from mendableai/feat/scroll-xpaths

[Feat] Added scroll XPaths to fire-engine for handling ReadMe docs
This commit is contained in:
Nicolas 2024-06-05 10:05:47 -07:00 committed by GitHub
commit 6d76037f6d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 10 additions and 5 deletions

View File

@ -1,7 +1,7 @@
export async function handleCustomScraping( export async function handleCustomScraping(
text: string, text: string,
url: string url: string
): Promise<{ scraper: string; url: string; wait_after_load: number } | null> { ): Promise<{ scraper: string; url: string; waitAfterLoad: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
// Check for Readme Docs special case // Check for Readme Docs special case
if (text.includes('<meta name="readme-deploy"')) { if (text.includes('<meta name="readme-deploy"')) {
console.log( console.log(
@ -10,7 +10,10 @@ export async function handleCustomScraping(
return { return {
scraper: "fire-engine", scraper: "fire-engine",
url: url, url: url,
wait_after_load: 1000, waitAfterLoad: 1000,
pageOptions: {
scrollXPaths: ['//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]']
}
}; };
} }
@ -22,7 +25,7 @@ export async function handleCustomScraping(
return { return {
scraper: "fire-engine", scraper: "fire-engine",
url: url, url: url,
wait_after_load: 3000, waitAfterLoad: 3000,
}; };
} }
@ -37,7 +40,7 @@ export async function handleCustomScraping(
return { return {
scraper: "fire-engine", scraper: "fire-engine",
url: url, url: url,
wait_after_load: 1000, waitAfterLoad: 1000,
}; };
} }

View File

@ -46,6 +46,7 @@ export async function scrapWithFireEngine(
url: string, url: string,
waitFor: number = 0, waitFor: number = 0,
screenshot: boolean = false, screenshot: boolean = false,
pageOptions: { scrollXPaths?: string[] } = {},
headers?: Record<string, string>, headers?: Record<string, string>,
options?: any options?: any
): Promise<FireEngineResponse> { ): Promise<FireEngineResponse> {
@ -68,6 +69,7 @@ export async function scrapWithFireEngine(
wait: waitParam, wait: waitParam,
screenshot: screenshotParam, screenshot: screenshotParam,
headers: headers, headers: headers,
pageOptions: pageOptions
}), }),
}); });
@ -332,7 +334,7 @@ export async function scrapSingleUrl(
const customScraperResult = await handleCustomScraping(text, url); const customScraperResult = await handleCustomScraping(text, url);
if(customScraperResult){ if(customScraperResult){
customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.wait_after_load) customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.waitAfterLoad, false, customScraperResult.pageOptions)
} }
if (customScrapedContent) { if (customScrapedContent) {