Merge pull request #204 from mendableai/feat/custom-scraping-readme
[Feat] Added custom scraping conditions for readme docs
This commit is contained in:
commit
51b0b88cd4
@ -210,6 +210,14 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea
|
|||||||
return scrapersInOrder as typeof baseScrapers[number][];
|
return scrapersInOrder as typeof baseScrapers[number][];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Applies site-specific scraping overrides to a page that has already been fetched.
 *
 * If the fetched markup contains the `<meta name="readme-deploy"` marker, the
 * URL is scraped again via `scrapWithFireEngine`, passing 1000 as its second
 * argument (logged as a 1000ms wait time), and that result is returned.
 *
 * @param text - Markup already retrieved for the page.
 * @param url - Address of the page; used in the log line and for the re-scrape.
 * @returns The `scrapWithFireEngine` result when the marker is present,
 *          otherwise `null` to signal that no override applies.
 */
async function handleCustomScraping(text: string, url: string): Promise<string | null> {
  const isReadmeDeployPage = text.includes('<meta name="readme-deploy"');

  // No special case matched: the caller keeps whatever it already scraped.
  if (!isReadmeDeployPage) {
    return null;
  }

  console.log(`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`);
  const fireEngineContent = await scrapWithFireEngine(url, 1000);
  return fireEngineContent;
}
|
||||||
|
|
||||||
export async function scrapSingleUrl(
|
export async function scrapSingleUrl(
|
||||||
urlToScrap: string,
|
urlToScrap: string,
|
||||||
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0},
|
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0},
|
||||||
@ -266,6 +274,12 @@ export async function scrapSingleUrl(
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check for custom scraping conditions
|
||||||
|
const customScrapedContent = await handleCustomScraping(text, url);
|
||||||
|
if (customScrapedContent) {
|
||||||
|
text = customScrapedContent;
|
||||||
|
}
|
||||||
|
|
||||||
//* TODO: add an option to return markdown or structured/extracted content
|
//* TODO: add an option to return markdown or structured/extracted content
|
||||||
let cleanedHtml = removeUnwantedElements(text, pageOptions);
|
let cleanedHtml = removeUnwantedElements(text, pageOptions);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user