Merge pull request #14 from mendableai/nsc/clean-content
Option to extract only the main content, excluding headers, navs, footers, etc.
This commit is contained in:
commit
7ce2dd976f
@ -110,6 +110,8 @@ app.post("/v0/scrape", async (req, res) => {
|
||||
return res.status(400).json({ error: "Url is required" });
|
||||
}
|
||||
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
||||
|
||||
try {
|
||||
const a = new WebScraperDataProvider();
|
||||
await a.setOptions({
|
||||
@ -118,6 +120,7 @@ app.post("/v0/scrape", async (req, res) => {
|
||||
crawlerOptions: {
|
||||
...crawlerOptions,
|
||||
},
|
||||
pageOptions: pageOptions,
|
||||
});
|
||||
|
||||
const docs = await a.getDocuments(false);
|
||||
@ -178,6 +181,7 @@ app.post("/v0/crawl", async (req, res) => {
|
||||
}
|
||||
const mode = req.body.mode ?? "crawl";
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
||||
|
||||
if (mode === "single_urls" && !url.includes(",")) {
|
||||
try {
|
||||
@ -188,6 +192,7 @@ app.post("/v0/crawl", async (req, res) => {
|
||||
crawlerOptions: {
|
||||
returnOnlyUrls: true,
|
||||
},
|
||||
pageOptions: pageOptions,
|
||||
});
|
||||
|
||||
const docs = await a.getDocuments(false, (progress) => {
|
||||
@ -212,6 +217,8 @@ app.post("/v0/crawl", async (req, res) => {
|
||||
mode: mode ?? "crawl", // fix for single urls not working
|
||||
crawlerOptions: { ...crawlerOptions },
|
||||
team_id: team_id,
|
||||
pageOptions: pageOptions,
|
||||
|
||||
});
|
||||
|
||||
res.json({ jobId: job.id });
|
||||
@ -239,11 +246,13 @@ app.post("/v0/crawlWebsitePreview", async (req, res) => {
|
||||
}
|
||||
const mode = req.body.mode ?? "crawl";
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
||||
const job = await addWebScraperJob({
|
||||
url: url,
|
||||
mode: mode ?? "crawl", // fix for single urls not working
|
||||
crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 },
|
||||
team_id: "preview",
|
||||
pageOptions: pageOptions,
|
||||
});
|
||||
|
||||
res.json({ jobId: job.id });
|
||||
|
@ -9,6 +9,24 @@ export interface Progress {
|
||||
currentDocumentUrl?: string;
|
||||
}
|
||||
|
||||
/**
 * Per-page extraction settings used when a fetched page's HTML is cleaned
 * before being converted to markdown.
 */
export type PageOptions = {
  /**
   * When true, the HTML cleaner removes every selector listed in
   * `excludeNonMainTags` (headers, footers, navs, sidebars, ...) in addition
   * to the tags it always strips.
   *
   * The API handlers and `WebScraperDataProvider.setOptions` fall back to
   * `{ onlyMainContent: false }` when no page options are supplied.
   * NOTE(review): `scrapSingleUrl` declares its own parameter default as
   * `{ onlyMainContent: true }` — confirm which default is intended.
   */
  onlyMainContent?: boolean;
};
|
||||
/**
 * Configuration object handed to `WebScraperDataProvider.setOptions`.
 */
export type WebScraperOptions = {
  /** URLs to process. */
  urls: string[];
  /** Processing strategy for `urls`. */
  mode: "single_urls" | "sitemap" | "crawl";
  /** Crawl-level tuning; every field is optional. */
  crawlerOptions?: {
    /** NOTE(review): presumably skips content extraction and returns links only — confirm. */
    returnOnlyUrls?: boolean;
    /** NOTE(review): presumably URL patterns to keep — confirm against the crawler. */
    includes?: string[];
    /** NOTE(review): presumably URL patterns to skip; the provider drops empty strings from this list. */
    excludes?: string[];
    maxCrawledLinks?: number;
    /** The provider falls back to 10000 when this is omitted. */
    limit?: number;
    /** The provider falls back to `false` when this is omitted. */
    generateImgAltText?: boolean;
  };
  /** Per-page extraction settings; the provider falls back to `{ onlyMainContent: false }`. */
  pageOptions?: PageOptions;
  /** Batch size for scraping URLs in parallel; the provider's field default is 20. */
  concurrentRequests?: number;
};
|
||||
|
||||
export class Document {
|
||||
id?: string;
|
||||
content: string;
|
||||
|
@ -13,6 +13,7 @@ export async function startWebScraperPipeline({
|
||||
url: job.data.url,
|
||||
mode: job.data.mode,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
inProgress: (progress) => {
|
||||
job.progress(progress);
|
||||
},
|
||||
@ -29,6 +30,7 @@ export async function runWebScraper({
|
||||
url,
|
||||
mode,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
inProgress,
|
||||
onSuccess,
|
||||
onError,
|
||||
@ -37,6 +39,7 @@ export async function runWebScraper({
|
||||
url: string;
|
||||
mode: "crawl" | "single_urls" | "sitemap";
|
||||
crawlerOptions: any;
|
||||
pageOptions?: any;
|
||||
inProgress: (progress: any) => void;
|
||||
onSuccess: (result: any) => void;
|
||||
onError: (error: any) => void;
|
||||
@ -44,18 +47,19 @@ export async function runWebScraper({
|
||||
}): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> {
|
||||
try {
|
||||
const provider = new WebScraperDataProvider();
|
||||
|
||||
if (mode === "crawl") {
|
||||
await provider.setOptions({
|
||||
mode: mode,
|
||||
urls: [url],
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
});
|
||||
} else {
|
||||
await provider.setOptions({
|
||||
mode: mode,
|
||||
urls: url.split(","),
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
});
|
||||
}
|
||||
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
||||
|
@ -13,6 +13,10 @@ describe("WebScraperDataProvider", () => {
|
||||
metadata: { sourceURL: "https://example.com/another-page" },
|
||||
content: "![another alt text](./another-image.png)",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: "https://example.com/another-page" },
|
||||
content: "![another alt text](./another-image.webp)",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: "https://example.com/data-image" },
|
||||
content: "![data image](data:image/png;base64,...)",
|
||||
@ -28,6 +32,10 @@ describe("WebScraperDataProvider", () => {
|
||||
metadata: { sourceURL: "https://example.com/another-page" },
|
||||
content: "![another alt text](https://example.com/another-image.png)",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: "https://example.com/another-page" },
|
||||
content: "![another alt text](https://example.com/another-image.webp)",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: "https://example.com/data-image" },
|
||||
content: "![data image](data:image/png;base64,...)",
|
||||
|
@ -1,4 +1,4 @@
|
||||
import { Document } from "../../lib/entities";
|
||||
import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
|
||||
import { Progress } from "../../lib/entities";
|
||||
import { scrapSingleUrl } from "./single_url";
|
||||
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
||||
@ -6,19 +6,7 @@ import { WebCrawler } from "./crawler";
|
||||
import { getValue, setValue } from "../../services/redis";
|
||||
import { getImageDescription } from "./utils/gptVision";
|
||||
|
||||
export type WebScraperOptions = {
|
||||
urls: string[];
|
||||
mode: "single_urls" | "sitemap" | "crawl";
|
||||
crawlerOptions?: {
|
||||
returnOnlyUrls?: boolean;
|
||||
includes?: string[];
|
||||
excludes?: string[];
|
||||
maxCrawledLinks?: number;
|
||||
limit?: number;
|
||||
generateImgAltText?: boolean;
|
||||
};
|
||||
concurrentRequests?: number;
|
||||
};
|
||||
|
||||
export class WebScraperDataProvider {
|
||||
private urls: string[] = [""];
|
||||
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
||||
@ -29,6 +17,7 @@ export class WebScraperDataProvider {
|
||||
private limit: number = 10000;
|
||||
private concurrentRequests: number = 20;
|
||||
private generateImgAltText: boolean = false;
|
||||
private pageOptions?: PageOptions;
|
||||
|
||||
authorize(): void {
|
||||
throw new Error("Method not implemented.");
|
||||
@ -51,7 +40,7 @@ export class WebScraperDataProvider {
|
||||
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
||||
await Promise.all(
|
||||
batchUrls.map(async (url, index) => {
|
||||
const result = await scrapSingleUrl(url, true);
|
||||
const result = await scrapSingleUrl(url, true, this.pageOptions);
|
||||
processedUrls++;
|
||||
if (inProgress) {
|
||||
inProgress({
|
||||
@ -321,6 +310,7 @@ export class WebScraperDataProvider {
|
||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||
this.generateImgAltText =
|
||||
options.crawlerOptions?.generateImgAltText ?? false;
|
||||
this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
|
||||
|
||||
//! @nicolas, for some reason an empty string was being injected here and breaking everything. Don't have time to find the source of the issue, so adding this check.
|
||||
this.excludes = this.excludes.filter((item) => item !== "");
|
||||
|
@ -2,9 +2,10 @@ import * as cheerio from "cheerio";
|
||||
import { ScrapingBeeClient } from "scrapingbee";
|
||||
import { extractMetadata } from "./utils/metadata";
|
||||
import dotenv from "dotenv";
|
||||
import { Document } from "../../lib/entities";
|
||||
import { Document, PageOptions } from "../../lib/entities";
|
||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||
import { parseTablesToMarkdown } from "./utils/parseTable";
|
||||
import { excludeNonMainTags } from "./utils/excludeTags";
|
||||
// import puppeteer from "puppeteer";
|
||||
|
||||
dotenv.config();
|
||||
@ -77,14 +78,21 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
|
||||
|
||||
export async function scrapSingleUrl(
|
||||
urlToScrap: string,
|
||||
toMarkdown: boolean = true
|
||||
toMarkdown: boolean = true,
|
||||
pageOptions: PageOptions = { onlyMainContent: true }
|
||||
): Promise<Document> {
|
||||
console.log(`Scraping URL: ${urlToScrap}`);
|
||||
urlToScrap = urlToScrap.trim();
|
||||
|
||||
/**
 * Strips non-content markup from raw HTML; the result is what gets passed
 * to the markdown converter.
 *
 * @param html - Raw HTML of the fetched page.
 * @param pageOptions - When `onlyMainContent` is truthy, every selector in
 *   `excludeNonMainTags` is removed as well.
 * @returns The serialized HTML of the cleaned document.
 */
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
  const soup = cheerio.load(html);

  // These tags are removed unconditionally.
  soup("script, style, iframe, noscript, meta, head").remove();

  if (pageOptions.onlyMainContent) {
    // Remove any other tags that are not in the main content.
    for (const selector of excludeNonMainTags) {
      soup(selector).remove();
    }
  }

  return soup.html();
};
|
||||
|
||||
@ -133,7 +141,7 @@ export async function scrapSingleUrl(
|
||||
}
|
||||
break;
|
||||
}
|
||||
let cleanedHtml = removeUnwantedElements(text);
|
||||
let cleanedHtml = removeUnwantedElements(text, pageOptions);
|
||||
return [await parseMarkdown(cleanedHtml), text];
|
||||
};
|
||||
|
||||
|
60
apps/api/src/scraper/WebScraper/utils/excludeTags.ts
Normal file
60
apps/api/src/scraper/WebScraper/utils/excludeTags.ts
Normal file
@ -0,0 +1,60 @@
|
||||
/**
 * CSS selectors for page chrome (anything that is not the main content).
 *
 * The HTML cleaner removes every element matching one of these selectors
 * when `pageOptions.onlyMainContent` is enabled. Entries are plain tag,
 * class or id selectors; order is preserved from the original list.
 */
export const excludeNonMainTags = [
  // Semantic layout tags
  "header", "footer", "nav", "aside",
  // Page header
  ".header", ".top", ".navbar", "#header",
  // Page footer
  ".footer", ".bottom", "#footer",
  // Sidebars
  ".sidebar", ".side", ".aside", "#sidebar",
  // Modals and overlays
  ".modal", ".popup", "#modal", ".overlay",
  // Advertising
  ".ad", ".ads", ".advert", "#ad",
  // Language pickers
  ".lang-selector", ".language", "#language-selector",
  // Social links
  ".social", ".social-media", ".social-links", "#social",
  // Menus and breadcrumbs
  ".menu", ".navigation", "#nav", ".breadcrumbs", "#breadcrumbs",
  // Forms and search
  ".form", "form", "#search-form", ".search", "#search",
  // Share widgets
  ".share", "#share",
  // Pagination
  ".pagination", "#pagination",
  // Widgets and related content
  ".widget", "#widget", ".related", "#related",
  // Taxonomy
  ".tag", "#tag", ".category", "#category",
  // Comments
  ".comment", "#comment", ".reply", "#reply",
  // Author boxes
  ".author", "#author",
];
|
@ -20,7 +20,9 @@ export interface WebScraperOptions {
|
||||
url: string;
|
||||
mode: "crawl" | "single_urls" | "sitemap";
|
||||
crawlerOptions: any;
|
||||
pageOptions: any;
|
||||
team_id: string;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user