Added support for RegEx in removeTags
This commit is contained in:
parent
a20d002a6b
commit
8b3c3aae91
@ -319,12 +319,23 @@ export async function scrapSingleUrl(
|
|||||||
|
|
||||||
if (pageOptions.removeTags) {
|
if (pageOptions.removeTags) {
|
||||||
if (typeof pageOptions.removeTags === 'string') {
|
if (typeof pageOptions.removeTags === 'string') {
|
||||||
pageOptions.removeTags.split(',').forEach((tag) => {
|
pageOptions.removeTags = [pageOptions.removeTags];
|
||||||
soup(tag.trim()).remove();
|
}
|
||||||
});
|
|
||||||
} else if (Array.isArray(pageOptions.removeTags)) {
|
if (Array.isArray(pageOptions.removeTags)) {
|
||||||
pageOptions.removeTags.forEach((tag) => {
|
pageOptions.removeTags.forEach((tag) => {
|
||||||
soup(tag).remove();
|
let elementsToRemove;
|
||||||
|
if (tag.startsWith("*") && tag.endsWith("*")) {
|
||||||
|
const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`);
|
||||||
|
elementsToRemove = soup('*').filter((index, element) => {
|
||||||
|
const classNames = soup(element).attr('class');
|
||||||
|
return classNames && classNames.split(/\s+/).some(className => regexPattern.test(className));
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
elementsToRemove = soup(tag);
|
||||||
|
}
|
||||||
|
|
||||||
|
elementsToRemove.remove();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -332,11 +343,13 @@ export async function scrapSingleUrl(
|
|||||||
if (pageOptions.onlyMainContent) {
|
if (pageOptions.onlyMainContent) {
|
||||||
// remove any other tags that are not in the main content
|
// remove any other tags that are not in the main content
|
||||||
excludeNonMainTags.forEach((tag) => {
|
excludeNonMainTags.forEach((tag) => {
|
||||||
soup(tag).remove();
|
const elementsToRemove = soup(tag);
|
||||||
|
elementsToRemove.remove();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
return soup.html();
|
const cleanedHtml = soup.html();
|
||||||
};
|
return cleanedHtml;
|
||||||
|
};
|
||||||
|
|
||||||
const attemptScraping = async (
|
const attemptScraping = async (
|
||||||
url: string,
|
url: string,
|
||||||
|
Loading…
Reference in New Issue
Block a user