From 8b3c3aae911c0d6bef8e65ceae26f9fac15af288 Mon Sep 17 00:00:00 2001
From: AndyMik90 <andre@mikalsenutvikling.no>
Date: Tue, 18 Jun 2024 07:31:46 +0200
Subject: [PATCH] Added support for RegEx in removeTags

---
 apps/api/src/scraper/WebScraper/single_url.ts | 31 +++++++++++++------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index db8f0ae..354a5cb 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -316,15 +316,26 @@ export async function scrapSingleUrl(
   const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
     const soup = cheerio.load(html);
     soup("script, style, iframe, noscript, meta, head").remove();
 
     if (pageOptions.removeTags) {
+      // Use a local variable (comma-separated strings are split) so pageOptions is not mutated.
+      let tagsToRemove = pageOptions.removeTags;
       if (typeof pageOptions.removeTags === 'string') {
-        pageOptions.removeTags.split(',').forEach((tag) => {
-          soup(tag.trim()).remove();
-        });
-      } else if (Array.isArray(pageOptions.removeTags)) {
-        pageOptions.removeTags.forEach((tag) => {
-          soup(tag).remove();
+        tagsToRemove = pageOptions.removeTags.split(',').map((tag) => tag.trim());
+      }
+
+      if (Array.isArray(tagsToRemove)) {
+        tagsToRemove.forEach((tag) => {
+          // "*pattern*" (non-empty pattern) is a regex tested per class name; otherwise a CSS selector.
+          if (tag.length > 2 && tag.startsWith("*") && tag.endsWith("*")) {
+            const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`);
+            soup('*').filter((_index, element) => {
+              const classNames = soup(element).attr('class') ?? '';
+              return classNames.split(/\s+/).some((className) => regexPattern.test(className));
+            }).remove();
+          } else {
+            soup(tag).remove();
+          }
         });
       }
     }
@@ -332,11 +343,13 @@ export async function scrapSingleUrl(
     if (pageOptions.onlyMainContent) {
       // remove any other tags that are not in the main content
       excludeNonMainTags.forEach((tag) => {
-        soup(tag).remove();
+        const elementsToRemove = soup(tag);
+        elementsToRemove.remove();
       });
     }
-    return soup.html();
-  };
+    const cleanedHtml = soup.html();
+    return cleanedHtml;
+  };
 
   const attemptScraping = async (
     url: string,