
Added custom scraping conditions for readme docs

commit ee9a2184e2
parent 8911ddf10c
Author: rafaelsideguide
Date:   2024-05-29 13:39:43 -03:00


@@ -210,6 +210,14 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea
   return scrapersInOrder as typeof baseScrapers[number][];
 }
 
+async function handleCustomScraping(text: string, url: string): Promise<string | null> {
+  if (text.includes('<meta name="readme-deploy"')) {
+    console.log(`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`);
+    return await scrapWithFireEngine(url, 1000);
+  }
+  return null;
+}
+
 export async function scrapSingleUrl(
   urlToScrap: string,
   pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0},
@@ -266,6 +274,12 @@ export async function scrapSingleUrl(
       break;
     }
 
+    // Check for custom scraping conditions
+    const customScrapedContent = await handleCustomScraping(text, url);
+    if (customScrapedContent) {
+      text = customScrapedContent;
+    }
+
     //* TODO: add an optional to return markdown or structured/extracted content
     let cleanedHtml = removeUnwantedElements(text, pageOptions);
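For context, the hook added here is easy to exercise in isolation. The sketch below is a minimal, self-contained illustration of the detection logic, with scrapWithFireEngine stubbed out (the real implementation lives elsewhere in this file and is not part of this diff); the demo URLs and meta-tag content are made up for illustration.

// sketch.ts — illustration only, not the repository's code
// Stand-in for the real Fire Engine scraper, which performs the actual
// request using the given wait time before returning the page HTML.
async function scrapWithFireEngine(url: string, waitFor: number): Promise<string> {
  return `<html><!-- fetched ${url} after waiting ${waitFor}ms --></html>`;
}

async function handleCustomScraping(text: string, url: string): Promise<string | null> {
  // readme-hosted docs embed a "readme-deploy" meta tag; when it is present,
  // re-scrape with Fire Engine and a 1000ms wait so client-rendered content loads.
  if (text.includes('<meta name="readme-deploy"')) {
    console.log(`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`);
    return await scrapWithFireEngine(url, 1000);
  }
  return null;
}

async function demo() {
  const readmeHtml = '<head><meta name="readme-deploy" content="5.x"></head>';
  const plainHtml = '<head><title>Plain page</title></head>';

  console.log(await handleCustomScraping(readmeHtml, "https://docs.example.com")); // re-scraped HTML
  console.log(await handleCustomScraping(plainHtml, "https://example.com"));       // null
}

demo();

In scrapSingleUrl, a non-null return simply replaces text before removeUnwantedElements runs, so the rest of the cleaning and conversion pipeline is unchanged.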