0

Merge pull request #239 from mendableai/feat/scroll-xpaths

[Feat] Added scroll XPaths on fire-engine for handling readme docs
This commit is contained in:
Nicolas 2024-06-05 10:05:47 -07:00 committed by GitHub
commit 6d76037f6d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 10 additions and 5 deletions

View File

@@ -1,7 +1,7 @@
export async function handleCustomScraping(
text: string,
url: string
): Promise<{ scraper: string; url: string; wait_after_load: number } | null> {
): Promise<{ scraper: string; url: string; waitAfterLoad: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
// Check for Readme Docs special case
if (text.includes('<meta name="readme-deploy"')) {
console.log(
@@ -10,7 +10,10 @@ export async function handleCustomScraping(
return {
scraper: "fire-engine",
url: url,
wait_after_load: 1000,
waitAfterLoad: 1000,
pageOptions: {
scrollXPaths: ['//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]']
}
};
}
@@ -22,7 +25,7 @@ export async function handleCustomScraping(
return {
scraper: "fire-engine",
url: url,
wait_after_load: 3000,
waitAfterLoad: 3000,
};
}
@@ -37,7 +40,7 @@ export async function handleCustomScraping(
return {
scraper: "fire-engine",
url: url,
wait_after_load: 1000,
waitAfterLoad: 1000,
};
}

View File

@@ -46,6 +46,7 @@ export async function scrapWithFireEngine(
url: string,
waitFor: number = 0,
screenshot: boolean = false,
pageOptions: { scrollXPaths?: string[] } = {},
headers?: Record<string, string>,
options?: any
): Promise<FireEngineResponse> {
@@ -68,6 +69,7 @@ export async function scrapWithFireEngine(
wait: waitParam,
screenshot: screenshotParam,
headers: headers,
pageOptions: pageOptions
}),
});
@@ -332,7 +334,7 @@ export async function scrapSingleUrl(
const customScraperResult = await handleCustomScraping(text, url);
if(customScraperResult){
customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.wait_after_load)
customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.waitAfterLoad, false, customScraperResult.pageOptions)
}
if (customScrapedContent) {