0

Update index.ts

This commit is contained in:
Nicolas 2024-05-15 11:48:12 -07:00
parent d10f81e7fe
commit 1b0d6341d3

View File

@ -17,20 +17,7 @@ import {
} from "./utils/replacePaths"; } from "./utils/replacePaths";
import { generateCompletions } from "../../lib/LLM-extraction"; import { generateCompletions } from "../../lib/LLM-extraction";
import { getWebScraperQueue } from "../../../src/services/queue-service"; import { getWebScraperQueue } from "../../../src/services/queue-service";
import { parseMarkdown } from "../../lib/html-to-markdown";
import cheerio from "cheerio";
import { excludeNonMainTags } from "./utils/excludeTags";
/**
 * Strips non-content elements from an HTML string.
 *
 * The markup is parsed with cheerio and every `script`, `style`, `iframe`,
 * `noscript`, `meta` and `head` element is removed. When
 * `pageOptions.onlyMainContent` is truthy, each selector listed in
 * `excludeNonMainTags` is removed as well.
 *
 * @param html - Raw HTML markup to clean.
 * @param pageOptions - Scrape options; only `onlyMainContent` is read here.
 * @returns The HTML serialized by cheerio after the removals.
 */
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
  const $ = cheerio.load(html);

  // These elements are dropped regardless of the page options.
  $("script, style, iframe, noscript, meta, head").remove();

  if (!pageOptions.onlyMainContent) {
    return $.html();
  }

  // Main-content mode: additionally drop every selector in excludeNonMainTags.
  for (const selector of excludeNonMainTags) {
    $(selector).remove();
  }
  return $.html();
};
export class WebScraperDataProvider { export class WebScraperDataProvider {
private bullJobId: string; private bullJobId: string;
private urls: string[] = [""]; private urls: string[] = [""];