fix for some complex cases
This commit is contained in:
parent
c2fc69af1c
commit
9f7afd1e88
@ -60,4 +60,44 @@ describe('removeUnwantedElements', () => {
|
||||
expect(result).toContain('Main Content');
|
||||
expect(result).not.toContain('<aside>');
|
||||
});
|
||||
|
||||
// A removeTags entry wrapped in `*` is compiled to a regular expression by
// removeUnwantedElements. The `[a-z]+` suffix must therefore leave the
// numerically-suffixed class in place while dropping the alphabetic ones.
it('should handle complex regex patterns for class names', () => {
  const options: PageOptions = { removeTags: ['*.test-[a-z]+*'] };
  const cleaned = removeUnwantedElements(
    `<div class="test-123">Remove</div><div class="test-abc">Remove</div><div class="keep">Keep</div><div class="test-xyz">Remove</div>`,
    options,
  );

  // Each entry is [substring, whether it must still be present]; the order
  // mirrors the sequence in which the expectations were originally checked.
  const expectations: Array<[string, boolean]> = [
    ['class="test-123"', true], // digits do not satisfy [a-z]+
    ['class="test-abc"', false],
    ['class="test-xyz"', false],
    ['class="keep"', true],
  ];

  for (const [needle, shouldRemain] of expectations) {
    if (shouldRemain) {
      expect(cleaned).toContain(needle);
    } else {
      expect(cleaned).not.toContain(needle);
    }
  }
});
|
||||
|
||||
// Regex entries are also tested against each `name="value"` attribute pair,
// so a pattern can target an attribute by its value.
it('should handle complex regex patterns for attributes', () => {
  // The quotes in the pattern anchor both ends of the value, so only
  // data-info values consisting entirely of digits match.
  const options: PageOptions = { removeTags: ['*data-info="\\d+"*'] };
  const cleaned = removeUnwantedElements(
    `<div data-info="12345">Remove</div><div data-info="abcde">Keep</div><div data-info="67890">Remove</div>`,
    options,
  );

  // Both all-digit elements must be gone...
  for (const removed of ['data-info="12345"', 'data-info="67890"']) {
    expect(cleaned).not.toContain(removed);
  }
  // ...while the alphabetic one is untouched.
  expect(cleaned).toContain('data-info="abcde"');
});
|
||||
|
||||
// Entries that are not wrapped in `*` are passed through as ordinary
// selectors; this case combines a class selector with an id selector.
it('should handle mixed selectors with regex', () => {
  const options: PageOptions = { removeTags: ['.remove-this', '#remove-this'] };
  const cleaned = removeUnwantedElements(
    `<div class="remove-this">Remove</div><div id="remove-this">Remove</div><div class="keep-this">Keep</div>`,
    options,
  );

  // Elements targeted by either selector must have been stripped.
  for (const removed of ['class="remove-this"', 'id="remove-this"']) {
    expect(cleaned).not.toContain(removed);
  }
  // The element matched by neither selector must survive.
  expect(cleaned).toContain('class="keep-this"');
});
|
||||
|
||||
// Combines regex entries (`*...*`) with plain class/id selectors in a single
// removeTags list and checks that every targeted element is stripped.
it('should handle multiple regex patterns', () => {
  const html = `<div attr="test-123">Remove</div><div class="class-remove">Remove</div><div class="keep">Keep</div><div class="remove-this">Remove</div><div id="remove-this">Remove</div>`;
  const options: PageOptions = { removeTags: ['*test*', '.class-remove', '*.remove-[a-z]+*', '#remove-this'] };
  const result = removeUnwantedElements(html, options);

  // Assert only on markup that is actually present in `html`. Checking for
  // substrings that never occur in the input would pass even if nothing
  // were removed, so such assertions prove nothing.
  expect(result).not.toContain('attr="test-123"'); // matched by *test*
  expect(result).not.toContain('class="class-remove"'); // matched by .class-remove
  expect(result).not.toContain('class="remove-this"'); // matched by *.remove-[a-z]+*
  expect(result).not.toContain('id="remove-this"'); // matched by #remove-this (and the regex above)
  expect(result).toContain('class="keep"');
});
|
||||
});
|
||||
|
@ -15,22 +15,34 @@ export const removeUnwantedElements = (html: string, pageOptions: PageOptions) =
|
||||
pageOptions.removeTags.forEach((tag) => {
|
||||
let elementsToRemove: Cheerio<AnyNode>;
|
||||
if (tag.startsWith("*") && tag.endsWith("*")) {
|
||||
const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`);
|
||||
elementsToRemove = soup('*').filter((index, element) => {
|
||||
const classNames = soup(element).attr('class');
|
||||
return classNames && classNames.split(/\s+/).some(className => regexPattern.test(className));
|
||||
let classMatch = false;
|
||||
|
||||
const regexPattern = new RegExp(tag.slice(1, -1), 'i');
|
||||
elementsToRemove = soup('*').filter((i, element) => {
|
||||
if (element.type === 'tag') {
|
||||
const attributes = element.attribs;
|
||||
const tagNameMatches = regexPattern.test(element.name);
|
||||
const attributesMatch = Object.keys(attributes).some(attr =>
|
||||
regexPattern.test(`${attr}="${attributes[attr]}"`)
|
||||
);
|
||||
if (tag.startsWith('*.')) {
|
||||
classMatch = Object.keys(attributes).some(attr =>
|
||||
regexPattern.test(`class="${attributes[attr]}"`)
|
||||
);
|
||||
}
|
||||
return tagNameMatches || attributesMatch || classMatch;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
} else {
|
||||
elementsToRemove = soup(tag);
|
||||
}
|
||||
|
||||
elementsToRemove.remove();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
if (pageOptions.onlyMainContent) {
|
||||
// remove any other tags that are not in the main content
|
||||
excludeNonMainTags.forEach((tag) => {
|
||||
const elementsToRemove = soup(tag);
|
||||
elementsToRemove.remove();
|
||||
|
Loading…
Reference in New Issue
Block a user