fix for some complex cases
This commit is contained in:
parent
c2fc69af1c
commit
9f7afd1e88
@ -60,4 +60,44 @@ describe('removeUnwantedElements', () => {
|
|||||||
expect(result).toContain('Main Content');
|
expect(result).toContain('Main Content');
|
||||||
expect(result).not.toContain('<aside>');
|
expect(result).not.toContain('<aside>');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// A '*…*'-wrapped removeTags entry is used here as a regex selector. This test
// expects '.test-[a-z]+' to remove the divs whose class is "test-" followed by
// letters, while the numeric "test-123" and the unrelated "keep" survive.
it('should handle complex regex patterns for class names', () => {
  const markup = `<div class="test-123">Remove</div><div class="test-abc">Remove</div><div class="keep">Keep</div><div class="test-xyz">Remove</div>`;
  const pageOptions: PageOptions = { removeTags: ['*.test-[a-z]+*'] };

  const cleaned = removeUnwantedElements(markup, pageOptions);

  // [snippet, whether it must still be present in the cleaned output]
  const expectations: Array<[string, boolean]> = [
    ['class="test-123"', true],
    ['class="test-abc"', false],
    ['class="test-xyz"', false],
    ['class="keep"', true],
  ];
  for (const [snippet, mustRemain] of expectations) {
    if (mustRemain) {
      expect(cleaned).toContain(snippet);
    } else {
      expect(cleaned).not.toContain(snippet);
    }
  }
});
|
||||||
|
|
||||||
|
// Regex selector against a rendered attribute (name="value"). Because the
// pattern includes the closing quote, it matches only data-info values made
// up entirely of digits; the alphabetic value is expected to be kept.
it('should handle complex regex patterns for attributes', () => {
  const markup = `<div data-info="12345">Remove</div><div data-info="abcde">Keep</div><div data-info="67890">Remove</div>`;
  const pageOptions: PageOptions = { removeTags: ['*data-info="\\d+"*'] };

  const cleaned = removeUnwantedElements(markup, pageOptions);

  // [snippet, whether it must still be present in the cleaned output]
  const expectations: Array<[string, boolean]> = [
    ['data-info="12345"', false],
    ['data-info="67890"', false],
    ['data-info="abcde"', true],
  ];
  for (const [snippet, mustRemain] of expectations) {
    if (mustRemain) {
      expect(cleaned).toContain(snippet);
    } else {
      expect(cleaned).not.toContain(snippet);
    }
  }
});
|
||||||
|
|
||||||
|
// Combines a class selector and an id selector in one removeTags list.
// Note: both entries here are plain CSS selectors (no '*…*' regex form).
it('should handle mixed selectors with regex', () => {
  const markup = `<div class="remove-this">Remove</div><div id="remove-this">Remove</div><div class="keep-this">Keep</div>`;
  const pageOptions: PageOptions = { removeTags: ['.remove-this', '#remove-this'] };

  const cleaned = removeUnwantedElements(markup, pageOptions);

  // [snippet, whether it must still be present in the cleaned output]
  const expectations: Array<[string, boolean]> = [
    ['class="remove-this"', false],
    ['id="remove-this"', false],
    ['class="keep-this"', true],
  ];
  for (const [snippet, mustRemain] of expectations) {
    if (mustRemain) {
      expect(cleaned).toContain(snippet);
    } else {
      expect(cleaned).not.toContain(snippet);
    }
  }
});
|
||||||
|
|
||||||
|
// Applies several removeTags entries at once: two '*…*' regex selectors and
// two plain CSS selectors. Every "Remove" div in the fixture must disappear
// and the "keep" div must survive.
it('should handle multiple regex patterns', () => {
  const html = `<div attr="test-123">Remove</div><div class="class-remove">Remove</div><div class="keep">Keep</div><div class="remove-this">Remove</div><div id="remove-this">Remove</div>`;
  const options: PageOptions = { removeTags: ['*test*', '.class-remove', '*.remove-[a-z]+*', '#remove-this'] };
  const result = removeUnwantedElements(html, options);

  // Assert only on markup that actually occurs in the fixture above; a
  // not.toContain check on a string absent from the input passes trivially
  // and would not detect a selector that silently stopped working.
  expect(result).not.toContain('attr="test-123"'); // targeted by '*test*'
  expect(result).not.toContain('class="class-remove"'); // targeted by '.class-remove'
  expect(result).not.toContain('class="remove-this"'); // targeted by '*.remove-[a-z]+*'
  expect(result).not.toContain('id="remove-this"'); // targeted by '#remove-this'
  expect(result).toContain('class="keep"');
});
|
||||||
});
|
});
|
||||||
|
@ -15,22 +15,34 @@ export const removeUnwantedElements = (html: string, pageOptions: PageOptions) =
|
|||||||
pageOptions.removeTags.forEach((tag) => {
|
pageOptions.removeTags.forEach((tag) => {
|
||||||
let elementsToRemove: Cheerio<AnyNode>;
|
let elementsToRemove: Cheerio<AnyNode>;
|
||||||
if (tag.startsWith("*") && tag.endsWith("*")) {
|
if (tag.startsWith("*") && tag.endsWith("*")) {
|
||||||
const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`);
|
let classMatch = false;
|
||||||
elementsToRemove = soup('*').filter((index, element) => {
|
|
||||||
const classNames = soup(element).attr('class');
|
const regexPattern = new RegExp(tag.slice(1, -1), 'i');
|
||||||
return classNames && classNames.split(/\s+/).some(className => regexPattern.test(className));
|
elementsToRemove = soup('*').filter((i, element) => {
|
||||||
|
if (element.type === 'tag') {
|
||||||
|
const attributes = element.attribs;
|
||||||
|
const tagNameMatches = regexPattern.test(element.name);
|
||||||
|
const attributesMatch = Object.keys(attributes).some(attr =>
|
||||||
|
regexPattern.test(`${attr}="${attributes[attr]}"`)
|
||||||
|
);
|
||||||
|
if (tag.startsWith('*.')) {
|
||||||
|
classMatch = Object.keys(attributes).some(attr =>
|
||||||
|
regexPattern.test(`class="${attributes[attr]}"`)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
return tagNameMatches || attributesMatch || classMatch;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
elementsToRemove = soup(tag);
|
elementsToRemove = soup(tag);
|
||||||
}
|
}
|
||||||
|
|
||||||
elementsToRemove.remove();
|
elementsToRemove.remove();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pageOptions.onlyMainContent) {
|
if (pageOptions.onlyMainContent) {
|
||||||
// remove any other tags that are not in the main content
|
|
||||||
excludeNonMainTags.forEach((tag) => {
|
excludeNonMainTags.forEach((tag) => {
|
||||||
const elementsToRemove = soup(tag);
|
const elementsToRemove = soup(tag);
|
||||||
elementsToRemove.remove();
|
elementsToRemove.remove();
|
||||||
|
Loading…
Reference in New Issue
Block a user