0

fix for some complex cases

This commit is contained in:
rafaelsideguide 2024-06-18 14:36:51 -03:00
parent c2fc69af1c
commit 9f7afd1e88
2 changed files with 58 additions and 6 deletions

View File

@ -60,4 +60,44 @@ describe('removeUnwantedElements', () => {
expect(result).toContain('Main Content'); expect(result).toContain('Main Content');
expect(result).not.toContain('<aside>'); expect(result).not.toContain('<aside>');
}); });
it('should handle complex regex patterns for class names', () => {
  // Four sibling divs: two whose class is "test-" followed by letters, one
  // whose class is "test-" followed by digits, and one unrelated class.
  const markup = `<div class="test-123">Remove</div><div class="test-abc">Remove</div><div class="keep">Keep</div><div class="test-xyz">Remove</div>`;
  const pageOptions: PageOptions = { removeTags: ['*.test-[a-z]+*'] };
  const cleaned = removeUnwantedElements(markup, pageOptions);

  // "[a-z]+" needs at least one letter after "test-", so the "test-123"
  // element is expected to survive alongside the unrelated "keep" element.
  for (const survivor of ['class="test-123"', 'class="keep"']) {
    expect(cleaned).toContain(survivor);
  }
  for (const stripped of ['class="test-abc"', 'class="test-xyz"']) {
    expect(cleaned).not.toContain(stripped);
  }
});
it('should handle complex regex patterns for attributes', () => {
  const markup = `<div data-info="12345">Remove</div><div data-info="abcde">Keep</div><div data-info="67890">Remove</div>`;
  // The pattern only matches a data-info value that consists entirely of
  // digits (the closing quote follows "\d+" directly).
  const pageOptions: PageOptions = { removeTags: ['*data-info="\\d+"*'] };
  const cleaned = removeUnwantedElements(markup, pageOptions);

  // Numeric values must be gone; the alphabetic one must remain.
  ['data-info="12345"', 'data-info="67890"'].forEach((numericAttr) => {
    expect(cleaned).not.toContain(numericAttr);
  });
  expect(cleaned).toContain('data-info="abcde"');
});
it('should handle mixed selectors with regex', () => {
  const markup = `<div class="remove-this">Remove</div><div id="remove-this">Remove</div><div class="keep-this">Keep</div>`;
  // Note: both entries are plain CSS selectors (a class and an id); neither
  // is wrapped in "*", so no wildcard/regex pattern is exercised here.
  const selectors = ['.remove-this', '#remove-this'];
  const pageOptions: PageOptions = { removeTags: selectors };
  const cleaned = removeUnwantedElements(markup, pageOptions);

  const mustBeGone = ['class="remove-this"', 'id="remove-this"'];
  mustBeGone.forEach((fragment) => {
    expect(cleaned).not.toContain(fragment);
  });
  expect(cleaned).toContain('class="keep-this"');
});
it('should handle multiple regex patterns', () => {
  // Fixture: one element per removal pattern below, plus one element
  // ("keep") that none of the patterns should touch.
  const html = `<div attr="test-123">Remove</div><div class="class-remove">Remove</div><div class="keep">Keep</div><div class="remove-this">Remove</div><div id="remove-this">Remove</div>`;
  const options: PageOptions = { removeTags: ['*test*', '.class-remove', '*.remove-[a-z]+*', '#remove-this'] };
  const result = removeUnwantedElements(html, options);

  // Every negative assertion targets markup that is actually present in the
  // fixture; asserting on strings that never occur in the input would pass
  // even if nothing were removed.
  expect(result).not.toContain('attr="test-123"'); // '*test*'
  expect(result).not.toContain('class="class-remove"'); // '.class-remove'
  expect(result).not.toContain('class="remove-this"'); // '*.remove-[a-z]+*'
  expect(result).not.toContain('id="remove-this"'); // '#remove-this'
  expect(result).toContain('class="keep"');
});
}); });

View File

@ -15,22 +15,34 @@ export const removeUnwantedElements = (html: string, pageOptions: PageOptions) =
pageOptions.removeTags.forEach((tag) => { pageOptions.removeTags.forEach((tag) => {
let elementsToRemove: Cheerio<AnyNode>; let elementsToRemove: Cheerio<AnyNode>;
if (tag.startsWith("*") && tag.endsWith("*")) { if (tag.startsWith("*") && tag.endsWith("*")) {
const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`); let classMatch = false;
elementsToRemove = soup('*').filter((index, element) => {
const classNames = soup(element).attr('class'); const regexPattern = new RegExp(tag.slice(1, -1), 'i');
return classNames && classNames.split(/\s+/).some(className => regexPattern.test(className)); elementsToRemove = soup('*').filter((i, element) => {
if (element.type === 'tag') {
const attributes = element.attribs;
const tagNameMatches = regexPattern.test(element.name);
const attributesMatch = Object.keys(attributes).some(attr =>
regexPattern.test(`${attr}="${attributes[attr]}"`)
);
if (tag.startsWith('*.')) {
classMatch = Object.keys(attributes).some(attr =>
regexPattern.test(`class="${attributes[attr]}"`)
);
}
return tagNameMatches || attributesMatch || classMatch;
}
return false;
}); });
} else { } else {
elementsToRemove = soup(tag); elementsToRemove = soup(tag);
} }
elementsToRemove.remove(); elementsToRemove.remove();
}); });
} }
} }
if (pageOptions.onlyMainContent) { if (pageOptions.onlyMainContent) {
// remove any other tags that are not in the main content
excludeNonMainTags.forEach((tag) => { excludeNonMainTags.forEach((tag) => {
const elementsToRemove = soup(tag); const elementsToRemove = soup(tag);
elementsToRemove.remove(); elementsToRemove.remove();