Merge pull request #239 from mendableai/feat/scroll-xpaths

[Feat] Added a scrollXPaths page option to fire-engine scraping for handling Readme Docs pages
2024-06-05 10:05:47 -07:00 · 2024-06-05 10:05:47 -07:00 · 6d76037f6d
commit 6d76037f6d
parent a547f9a78e ccc55127d6
2 changed files with 10 additions and 5 deletions
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@ -1,7 +1,7 @@
 export async function handleCustomScraping(
  text: string,
  url: string
-): Promise<{ scraper: string; url: string; wait_after_load: number } | null> {
+): Promise<{ scraper: string; url: string; waitAfterLoad: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
  // Check for Readme Docs special case
  if (text.includes('<meta name="readme-deploy"')) {
    console.log(
@ -10,7 +10,10 @@ export async function handleCustomScraping(
    return {
      scraper: "fire-engine",
      url: url,
-      wait_after_load: 1000,
+      waitAfterLoad: 1000,
+      pageOptions: {
+        scrollXPaths: ['//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]']
+      }
    };
  }

@ -22,7 +25,7 @@ export async function handleCustomScraping(
    return {
      scraper: "fire-engine",
      url: url,
-      wait_after_load: 3000,
+      waitAfterLoad: 3000,
    };
  }

@ -37,7 +40,7 @@ export async function handleCustomScraping(
    return {
      scraper: "fire-engine",
      url: url,
-      wait_after_load: 1000,
+      waitAfterLoad: 1000,
    };
  }

--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@ -46,6 +46,7 @@ export async function scrapWithFireEngine(
  url: string,
  waitFor: number = 0,
  screenshot: boolean = false,
+  pageOptions: { scrollXPaths?: string[] } = {},
  headers?: Record<string, string>,
  options?: any
 ): Promise<FireEngineResponse> {
@ -68,6 +69,7 @@ export async function scrapWithFireEngine(
        wait: waitParam,
        screenshot: screenshotParam,
        headers: headers,
+        pageOptions: pageOptions
      }),
    });

@ -332,7 +334,7 @@ export async function scrapSingleUrl(
    const customScraperResult = await handleCustomScraping(text, url);

    if(customScraperResult){
-      customScrapedContent  = await scrapWithFireEngine(customScraperResult.url, customScraperResult.wait_after_load)
+      customScrapedContent  = await scrapWithFireEngine(customScraperResult.url, customScraperResult.waitAfterLoad, false, customScraperResult.pageOptions)
    }

    if (customScrapedContent) {