Update index.ts
This commit is contained in:
parent
d10f81e7fe
commit
1b0d6341d3
@ -17,20 +17,7 @@ import {
|
|||||||
} from "./utils/replacePaths";
|
} from "./utils/replacePaths";
|
||||||
import { generateCompletions } from "../../lib/LLM-extraction";
|
import { generateCompletions } from "../../lib/LLM-extraction";
|
||||||
import { getWebScraperQueue } from "../../../src/services/queue-service";
|
import { getWebScraperQueue } from "../../../src/services/queue-service";
|
||||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
|
||||||
import cheerio from "cheerio";
|
|
||||||
import { excludeNonMainTags } from "./utils/excludeTags";
|
|
||||||
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
|
|
||||||
const soup = cheerio.load(html);
|
|
||||||
soup("script, style, iframe, noscript, meta, head").remove();
|
|
||||||
if (pageOptions.onlyMainContent) {
|
|
||||||
// remove any other tags that are not in the main content
|
|
||||||
excludeNonMainTags.forEach((tag) => {
|
|
||||||
soup(tag).remove();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
return soup.html();
|
|
||||||
};
|
|
||||||
export class WebScraperDataProvider {
|
export class WebScraperDataProvider {
|
||||||
private bullJobId: string;
|
private bullJobId: string;
|
||||||
private urls: string[] = [""];
|
private urls: string[] = [""];
|
||||||
|
Loading…
Reference in New Issue
Block a user