From 8b3c3aae911c0d6bef8e65ceae26f9fac15af288 Mon Sep 17 00:00:00 2001 From: AndyMik90 Date: Tue, 18 Jun 2024 07:31:46 +0200 Subject: [PATCH 1/4] Added support for RegEx in removeTags --- apps/api/src/scraper/WebScraper/single_url.ts | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index db8f0ae..354a5cb 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -316,15 +316,26 @@ export async function scrapSingleUrl( const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { const soup = cheerio.load(html); soup("script, style, iframe, noscript, meta, head").remove(); - + if (pageOptions.removeTags) { if (typeof pageOptions.removeTags === 'string') { - pageOptions.removeTags.split(',').forEach((tag) => { - soup(tag.trim()).remove(); - }); - } else if (Array.isArray(pageOptions.removeTags)) { + pageOptions.removeTags = [pageOptions.removeTags]; + } + + if (Array.isArray(pageOptions.removeTags)) { pageOptions.removeTags.forEach((tag) => { - soup(tag).remove(); + let elementsToRemove; + if (tag.startsWith("*") && tag.endsWith("*")) { + const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`); + elementsToRemove = soup('*').filter((index, element) => { + const classNames = soup(element).attr('class'); + return classNames && classNames.split(/\s+/).some(className => regexPattern.test(className)); + }); + } else { + elementsToRemove = soup(tag); + } + + elementsToRemove.remove(); }); } } @@ -332,11 +343,13 @@ export async function scrapSingleUrl( if (pageOptions.onlyMainContent) { // remove any other tags that are not in the main content excludeNonMainTags.forEach((tag) => { - soup(tag).remove(); + const elementsToRemove = soup(tag); + elementsToRemove.remove(); }); } - return soup.html(); - }; + const cleanedHtml = soup.html(); + return cleanedHtml; +}; const attemptScraping = async ( url: string, From 6c726a02eb64df41f64011d7bd87e5b6ccb6c844 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 18 Jun 2024 09:46:42 -0300 Subject: [PATCH 2/4] Moved to utils/removeUnwantedElements, added unit tests --- apps/api/src/scraper/WebScraper/single_url.ts | 40 +----------- .../__tests__/removeUnwantedElements.test.ts | 63 +++++++++++++++++++ .../utils/removeUnwantedElements.ts | 41 ++++++++++++ 3 files changed, 105 insertions(+), 39 deletions(-) create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 354a5cb..e112cd4 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -4,10 +4,10 @@ import { extractMetadata } from "./utils/metadata"; import dotenv from "dotenv"; import { Document, PageOptions, FireEngineResponse } from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; -import { excludeNonMainTags } from "./utils/excludeTags"; import { urlSpecificParams } from "./utils/custom/website_params"; import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { handleCustomScraping } from "./custom/handleCustomScraping"; +import { removeUnwantedElements } from "./utils/removeUnwantedElements"; import axios from "axios"; dotenv.config(); @@ -313,44 +313,6 @@ export async function scrapSingleUrl( ): Promise { urlToScrap = urlToScrap.trim(); - const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { - const soup = cheerio.load(html); - soup("script, style, iframe, noscript, meta, head").remove(); - - if (pageOptions.removeTags) { - if (typeof pageOptions.removeTags === 'string') { - pageOptions.removeTags = [pageOptions.removeTags]; - } - - if (Array.isArray(pageOptions.removeTags)) { - pageOptions.removeTags.forEach((tag) => { - let elementsToRemove; - if (tag.startsWith("*") && tag.endsWith("*")) { - const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`); - elementsToRemove = soup('*').filter((index, element) => { - const classNames = soup(element).attr('class'); - return classNames && classNames.split(/\s+/).some(className => regexPattern.test(className)); - }); - } else { - elementsToRemove = soup(tag); - } - - elementsToRemove.remove(); - }); - } - } - - if (pageOptions.onlyMainContent) { - // remove any other tags that are not in the main content - excludeNonMainTags.forEach((tag) => { - const elementsToRemove = soup(tag); - elementsToRemove.remove(); - }); - } - const cleanedHtml = soup.html(); - return cleanedHtml; -}; - const attemptScraping = async ( url: string, method: (typeof baseScrapers)[number] diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts new file mode 100644 index 0000000..cfa49e7 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts @@ -0,0 +1,63 @@ +import { removeUnwantedElements } from "../removeUnwantedElements"; +import { PageOptions } from "../../../../lib/entities"; + +describe('removeUnwantedElements', () => { + it('should remove script, style, iframe, noscript, meta, and head tags', () => { + const html = `Test
Content
`; + const options: PageOptions = {}; + const result = removeUnwantedElements(html, options); + expect(result).not.toContain('