Moved to utils/removeUnwantedElements, added unit tests
This commit is contained in:
parent
8b3c3aae91
commit
6c726a02eb
@ -4,10 +4,10 @@ import { extractMetadata } from "./utils/metadata";
|
|||||||
import dotenv from "dotenv";
|
import dotenv from "dotenv";
|
||||||
import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
|
import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
|
||||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||||
import { excludeNonMainTags } from "./utils/excludeTags";
|
|
||||||
import { urlSpecificParams } from "./utils/custom/website_params";
|
import { urlSpecificParams } from "./utils/custom/website_params";
|
||||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||||
import { handleCustomScraping } from "./custom/handleCustomScraping";
|
import { handleCustomScraping } from "./custom/handleCustomScraping";
|
||||||
|
import { removeUnwantedElements } from "./utils/removeUnwantedElements";
|
||||||
import axios from "axios";
|
import axios from "axios";
|
||||||
|
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
@ -313,44 +313,6 @@ export async function scrapSingleUrl(
|
|||||||
): Promise<Document> {
|
): Promise<Document> {
|
||||||
urlToScrap = urlToScrap.trim();
|
urlToScrap = urlToScrap.trim();
|
||||||
|
|
||||||
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
|
|
||||||
const soup = cheerio.load(html);
|
|
||||||
soup("script, style, iframe, noscript, meta, head").remove();
|
|
||||||
|
|
||||||
if (pageOptions.removeTags) {
|
|
||||||
if (typeof pageOptions.removeTags === 'string') {
|
|
||||||
pageOptions.removeTags = [pageOptions.removeTags];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (Array.isArray(pageOptions.removeTags)) {
|
|
||||||
pageOptions.removeTags.forEach((tag) => {
|
|
||||||
let elementsToRemove;
|
|
||||||
if (tag.startsWith("*") && tag.endsWith("*")) {
|
|
||||||
const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`);
|
|
||||||
elementsToRemove = soup('*').filter((index, element) => {
|
|
||||||
const classNames = soup(element).attr('class');
|
|
||||||
return classNames && classNames.split(/\s+/).some(className => regexPattern.test(className));
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
elementsToRemove = soup(tag);
|
|
||||||
}
|
|
||||||
|
|
||||||
elementsToRemove.remove();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (pageOptions.onlyMainContent) {
|
|
||||||
// remove any other tags that are not in the main content
|
|
||||||
excludeNonMainTags.forEach((tag) => {
|
|
||||||
const elementsToRemove = soup(tag);
|
|
||||||
elementsToRemove.remove();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
const cleanedHtml = soup.html();
|
|
||||||
return cleanedHtml;
|
|
||||||
};
|
|
||||||
|
|
||||||
const attemptScraping = async (
|
const attemptScraping = async (
|
||||||
url: string,
|
url: string,
|
||||||
method: (typeof baseScrapers)[number]
|
method: (typeof baseScrapers)[number]
|
||||||
|
@ -0,0 +1,63 @@
|
|||||||
|
import { removeUnwantedElements } from "../removeUnwantedElements";
|
||||||
|
import { PageOptions } from "../../../../lib/entities";
|
||||||
|
|
||||||
|
describe('removeUnwantedElements', () => {
  // Elements on the always-removed list disappear even with empty options.
  it('should remove script, style, iframe, noscript, meta, and head tags', () => {
    const markup = `<html><head><title>Test</title></head><body><script>alert('test');</script><div>Content</div></body></html>`;
    const pageOptions: PageOptions = {};

    const cleaned = removeUnwantedElements(markup, pageOptions);

    expect(cleaned).not.toContain('<script>');
    expect(cleaned).not.toContain('<head>');
    expect(cleaned).toContain('Content');
  });

  // A single selector may be supplied as a bare string.
  it('should remove specified tags passed as string', () => {
    const markup = `<div><span>Remove</span><p>Keep</p></div>`;
    const pageOptions: PageOptions = { removeTags: 'span' };

    const cleaned = removeUnwantedElements(markup, pageOptions);

    expect(cleaned).not.toContain('<span>');
    expect(cleaned).toContain('<p>Keep</p>');
  });

  // Several selectors may be supplied as an array.
  it('should remove specified tags passed as array', () => {
    const markup = `<div><span>Remove</span><p>Remove</p><a>Keep</a></div>`;
    const pageOptions: PageOptions = { removeTags: ['span', 'p'] };

    const cleaned = removeUnwantedElements(markup, pageOptions);

    expect(cleaned).not.toContain('<span>');
    expect(cleaned).not.toContain('<p>');
    expect(cleaned).toContain('<a>Keep</a>');
  });

  // `.name` entries are forwarded as ordinary CSS class selectors.
  it('should handle class selectors', () => {
    const markup = `<div class="test">Remove</div><div class="keep">Keep</div>`;
    const pageOptions: PageOptions = { removeTags: '.test' };

    const cleaned = removeUnwantedElements(markup, pageOptions);

    expect(cleaned).not.toContain('class="test"');
    expect(cleaned).toContain('class="keep"');
  });

  // `#name` entries are forwarded as ordinary CSS id selectors.
  it('should handle id selectors', () => {
    const markup = `<div id="test">Remove</div><div id="keep">Keep</div>`;
    const pageOptions: PageOptions = { removeTags: '#test' };

    const cleaned = removeUnwantedElements(markup, pageOptions);

    expect(cleaned).not.toContain('id="test"');
    expect(cleaned).toContain('id="keep"');
  });

  // Entries wrapped in asterisks are matched as patterns against class names.
  it('should handle regex patterns in class names', () => {
    const markup = `<div class="test-123">Remove</div><div class="test-abc">Remove</div><div class="keep">Keep</div>`;
    const pageOptions: PageOptions = { removeTags: ['*test*'] };

    const cleaned = removeUnwantedElements(markup, pageOptions);

    expect(cleaned).not.toContain('class="test-123"');
    expect(cleaned).not.toContain('class="test-abc"');
    expect(cleaned).toContain('class="keep"');
  });

  // With onlyMainContent set, non-main elements such as <aside> are dropped.
  it('should remove non-main content if onlyMainContent is true', () => {
    const markup = `<div><main>Main Content</main><aside>Remove</aside></div>`;
    const pageOptions: PageOptions = { onlyMainContent: true };

    const cleaned = removeUnwantedElements(markup, pageOptions);

    expect(cleaned).toContain('Main Content');
    expect(cleaned).not.toContain('<aside>');
  });
});
|
@ -0,0 +1,41 @@
|
|||||||
|
import cheerio, { AnyNode, Cheerio } from "cheerio";
|
||||||
|
import { PageOptions } from "../../../lib/entities";
|
||||||
|
import { excludeNonMainTags } from "./excludeTags";
|
||||||
|
|
||||||
|
/**
 * Normalises the `removeTags` option into a list of entries without touching
 * the value it was given: a non-empty string becomes a one-element list, an
 * array is returned as-is, and anything else yields an empty list.
 */
const toRemoveTagList = (removeTags: PageOptions["removeTags"]): string[] => {
  if (!removeTags) {
    return [];
  }
  if (typeof removeTags === "string") {
    return [removeTags];
  }
  return Array.isArray(removeTags) ? removeTags : [];
};

/**
 * Strips unwanted elements from an HTML string and returns the cleaned HTML.
 *
 * Processing order:
 *  1. `script`, `style`, `iframe`, `noscript`, `meta` and `head` elements are
 *     always removed.
 *  2. Every entry of `pageOptions.removeTags` (a single string or an array of
 *     strings) is removed. An entry that both starts and ends with `*` is
 *     treated as a pattern: the text between the asterisks is placed verbatim
 *     inside `\b...\b`, compiled as a regular expression, and any element
 *     having at least one class name matching it is removed. Any other entry
 *     is handed to cheerio as a selector.
 *  3. When `pageOptions.onlyMainContent` is truthy, every selector listed in
 *     `excludeNonMainTags` is removed as well.
 *
 * The `pageOptions` object is only read; it is never modified.
 *
 * @param html - The HTML markup to clean.
 * @param pageOptions - Options controlling which additional elements are removed.
 * @returns The serialised HTML after all removals.
 * @throws SyntaxError if a `*...*` entry does not form a valid regular expression
 *   (raised by the `RegExp` constructor).
 */
export const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
  const soup = cheerio.load(html);
  soup("script, style, iframe, noscript, meta, head").remove();

  // Work on a local list so the caller's options object is left untouched.
  for (const tag of toRemoveTagList(pageOptions.removeTags)) {
    let elementsToRemove: Cheerio<AnyNode>;

    if (tag.startsWith("*") && tag.endsWith("*")) {
      // The pattern is intentionally not escaped: it is used as a regular expression.
      const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`);
      elementsToRemove = soup("*").filter((_index, element) => {
        const classNames = soup(element).attr("class");
        if (!classNames) {
          return false;
        }
        // Test each class name on its own rather than the whole attribute value.
        return classNames.split(/\s+/).some((className) => regexPattern.test(className));
      });
    } else {
      elementsToRemove = soup(tag);
    }

    elementsToRemove.remove();
  }

  if (pageOptions.onlyMainContent) {
    // Remove any other tags that are not in the main content.
    for (const tag of excludeNonMainTags) {
      soup(tag).remove();
    }
  }

  return soup.html();
};
|
Loading…
Reference in New Issue
Block a user