0

Added support for RegEx class-name patterns (tags wrapped in `*…*`) in removeTags

This commit is contained in:
AndyMik90 2024-06-18 07:31:46 +02:00
parent a20d002a6b
commit 8b3c3aae91

View File

@ -316,15 +316,26 @@ export async function scrapSingleUrl(
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
const soup = cheerio.load(html); const soup = cheerio.load(html);
soup("script, style, iframe, noscript, meta, head").remove(); soup("script, style, iframe, noscript, meta, head").remove();
if (pageOptions.removeTags) { if (pageOptions.removeTags) {
if (typeof pageOptions.removeTags === 'string') { if (typeof pageOptions.removeTags === 'string') {
pageOptions.removeTags.split(',').forEach((tag) => { pageOptions.removeTags = [pageOptions.removeTags];
soup(tag.trim()).remove(); }
});
} else if (Array.isArray(pageOptions.removeTags)) { if (Array.isArray(pageOptions.removeTags)) {
pageOptions.removeTags.forEach((tag) => { pageOptions.removeTags.forEach((tag) => {
soup(tag).remove(); let elementsToRemove;
if (tag.startsWith("*") && tag.endsWith("*")) {
const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`);
elementsToRemove = soup('*').filter((index, element) => {
const classNames = soup(element).attr('class');
return classNames && classNames.split(/\s+/).some(className => regexPattern.test(className));
});
} else {
elementsToRemove = soup(tag);
}
elementsToRemove.remove();
}); });
} }
} }
@ -332,11 +343,13 @@ export async function scrapSingleUrl(
if (pageOptions.onlyMainContent) { if (pageOptions.onlyMainContent) {
// remove any other tags that are not in the main content // remove any other tags that are not in the main content
excludeNonMainTags.forEach((tag) => { excludeNonMainTags.forEach((tag) => {
soup(tag).remove(); const elementsToRemove = soup(tag);
elementsToRemove.remove();
}); });
} }
return soup.html(); const cleanedHtml = soup.html();
}; return cleanedHtml;
};
const attemptScraping = async ( const attemptScraping = async (
url: string, url: string,