From 8b3c3aae911c0d6bef8e65ceae26f9fac15af288 Mon Sep 17 00:00:00 2001 From: AndyMik90 Date: Tue, 18 Jun 2024 07:31:46 +0200 Subject: [PATCH] Added support for RegEx in removeTags --- apps/api/src/scraper/WebScraper/single_url.ts | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index db8f0ae..354a5cb 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -316,15 +316,26 @@ export async function scrapSingleUrl( const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { const soup = cheerio.load(html); soup("script, style, iframe, noscript, meta, head").remove(); - + if (pageOptions.removeTags) { if (typeof pageOptions.removeTags === 'string') { - pageOptions.removeTags.split(',').forEach((tag) => { - soup(tag.trim()).remove(); - }); - } else if (Array.isArray(pageOptions.removeTags)) { + pageOptions.removeTags = [pageOptions.removeTags]; + } + + if (Array.isArray(pageOptions.removeTags)) { pageOptions.removeTags.forEach((tag) => { - soup(tag).remove(); + let elementsToRemove; + if (tag.startsWith("*") && tag.endsWith("*")) { + const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`); + elementsToRemove = soup('*').filter((index, element) => { + const classNames = soup(element).attr('class'); + return classNames && classNames.split(/\s+/).some(className => regexPattern.test(className)); + }); + } else { + elementsToRemove = soup(tag); + } + + elementsToRemove.remove(); }); } } @@ -332,11 +343,13 @@ export async function scrapSingleUrl( if (pageOptions.onlyMainContent) { // remove any other tags that are not in the main content excludeNonMainTags.forEach((tag) => { - soup(tag).remove(); + const elementsToRemove = soup(tag); + elementsToRemove.remove(); }); } - return soup.html(); - }; + const cleanedHtml = soup.html(); + return cleanedHtml; +}; const attemptScraping = async ( url: string,