Merge pull request #204 from mendableai/feat/custom-scraping-readme

[Feat] Added custom scraping conditions for readme docs
2024-05-29 10:00:24 -07:00 · 2024-05-29 10:00:24 -07:00 · 51b0b88cd4
commit 51b0b88cd4
parent 8911ddf10c ee9a2184e2
1 changed files with 14 additions and 0 deletions
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@ -210,6 +210,14 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea
  return scrapersInOrder as typeof baseScrapers[number][];
 }
 /**
  * Applies site-specific scraping overrides based on content that was already fetched.
  *
  * There is currently a single rule: when `text` contains the substring
  * `<meta name="readme-deploy"`, the page is fetched again through
  * `scrapWithFireEngine`, passing 1000 as the second argument (the log
  * message describes this as a 1000ms wait time).
  *
  * @param text - Content fetched so far for the page; searched for the marker.
  * @param url - URL of the page; used in the log message and for the re-fetch.
  * @returns Whatever `scrapWithFireEngine` resolves to when the rule matches,
  *          or `null` when no custom rule applies.
  */
 async function handleCustomScraping(text: string, url: string): Promise<string | null> {
  const readmeDeployMarker = '<meta name="readme-deploy"';

  // Guard clause: nothing special to do for pages without the marker.
  if (!text.includes(readmeDeployMarker)) {
    return null;
  }

  console.log(`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`);
  const refetchedContent = await scrapWithFireEngine(url, 1000);
  return refetchedContent;
 }
 export async function scrapSingleUrl(
  urlToScrap: string,
  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0},
@ -266,6 +274,12 @@ export async function scrapSingleUrl(
        break;
    }
    // Check for custom scraping conditions
    const customScrapedContent = await handleCustomScraping(text, url);
    if (customScrapedContent) {
      text = customScrapedContent;
    }
    //* TODO: add an option to return markdown or structured/extracted content
    let cleanedHtml = removeUnwantedElements(text, pageOptions);