Merge pull request #14 from mendableai/nsc/clean-content
Adds an option to extract only the main content of a page, excluding headers, navs, footers, etc.
This commit is contained in:
commit
7ce2dd976f
@ -110,6 +110,8 @@ app.post("/v0/scrape", async (req, res) => {
|
|||||||
return res.status(400).json({ error: "Url is required" });
|
return res.status(400).json({ error: "Url is required" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const a = new WebScraperDataProvider();
|
const a = new WebScraperDataProvider();
|
||||||
await a.setOptions({
|
await a.setOptions({
|
||||||
@ -118,6 +120,7 @@ app.post("/v0/scrape", async (req, res) => {
|
|||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
...crawlerOptions,
|
...crawlerOptions,
|
||||||
},
|
},
|
||||||
|
pageOptions: pageOptions,
|
||||||
});
|
});
|
||||||
|
|
||||||
const docs = await a.getDocuments(false);
|
const docs = await a.getDocuments(false);
|
||||||
@ -178,6 +181,7 @@ app.post("/v0/crawl", async (req, res) => {
|
|||||||
}
|
}
|
||||||
const mode = req.body.mode ?? "crawl";
|
const mode = req.body.mode ?? "crawl";
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
|
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
||||||
|
|
||||||
if (mode === "single_urls" && !url.includes(",")) {
|
if (mode === "single_urls" && !url.includes(",")) {
|
||||||
try {
|
try {
|
||||||
@ -188,6 +192,7 @@ app.post("/v0/crawl", async (req, res) => {
|
|||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
returnOnlyUrls: true,
|
returnOnlyUrls: true,
|
||||||
},
|
},
|
||||||
|
pageOptions: pageOptions,
|
||||||
});
|
});
|
||||||
|
|
||||||
const docs = await a.getDocuments(false, (progress) => {
|
const docs = await a.getDocuments(false, (progress) => {
|
||||||
@ -212,6 +217,8 @@ app.post("/v0/crawl", async (req, res) => {
|
|||||||
mode: mode ?? "crawl", // fix for single urls not working
|
mode: mode ?? "crawl", // fix for single urls not working
|
||||||
crawlerOptions: { ...crawlerOptions },
|
crawlerOptions: { ...crawlerOptions },
|
||||||
team_id: team_id,
|
team_id: team_id,
|
||||||
|
pageOptions: pageOptions,
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
res.json({ jobId: job.id });
|
res.json({ jobId: job.id });
|
||||||
@ -239,11 +246,13 @@ app.post("/v0/crawlWebsitePreview", async (req, res) => {
|
|||||||
}
|
}
|
||||||
const mode = req.body.mode ?? "crawl";
|
const mode = req.body.mode ?? "crawl";
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
|
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
||||||
const job = await addWebScraperJob({
|
const job = await addWebScraperJob({
|
||||||
url: url,
|
url: url,
|
||||||
mode: mode ?? "crawl", // fix for single urls not working
|
mode: mode ?? "crawl", // fix for single urls not working
|
||||||
crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 },
|
crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 },
|
||||||
team_id: "preview",
|
team_id: "preview",
|
||||||
|
pageOptions: pageOptions,
|
||||||
});
|
});
|
||||||
|
|
||||||
res.json({ jobId: job.id });
|
res.json({ jobId: job.id });
|
||||||
|
@ -9,6 +9,24 @@ export interface Progress {
|
|||||||
currentDocumentUrl?: string;
|
currentDocumentUrl?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export type PageOptions = {
|
||||||
|
onlyMainContent?: boolean;
|
||||||
|
};
|
||||||
|
export type WebScraperOptions = {
|
||||||
|
urls: string[];
|
||||||
|
mode: "single_urls" | "sitemap" | "crawl";
|
||||||
|
crawlerOptions?: {
|
||||||
|
returnOnlyUrls?: boolean;
|
||||||
|
includes?: string[];
|
||||||
|
excludes?: string[];
|
||||||
|
maxCrawledLinks?: number;
|
||||||
|
limit?: number;
|
||||||
|
generateImgAltText?: boolean;
|
||||||
|
};
|
||||||
|
pageOptions?: PageOptions;
|
||||||
|
concurrentRequests?: number;
|
||||||
|
};
|
||||||
|
|
||||||
export class Document {
|
export class Document {
|
||||||
id?: string;
|
id?: string;
|
||||||
content: string;
|
content: string;
|
||||||
|
@ -13,6 +13,7 @@ export async function startWebScraperPipeline({
|
|||||||
url: job.data.url,
|
url: job.data.url,
|
||||||
mode: job.data.mode,
|
mode: job.data.mode,
|
||||||
crawlerOptions: job.data.crawlerOptions,
|
crawlerOptions: job.data.crawlerOptions,
|
||||||
|
pageOptions: job.data.pageOptions,
|
||||||
inProgress: (progress) => {
|
inProgress: (progress) => {
|
||||||
job.progress(progress);
|
job.progress(progress);
|
||||||
},
|
},
|
||||||
@ -29,6 +30,7 @@ export async function runWebScraper({
|
|||||||
url,
|
url,
|
||||||
mode,
|
mode,
|
||||||
crawlerOptions,
|
crawlerOptions,
|
||||||
|
pageOptions,
|
||||||
inProgress,
|
inProgress,
|
||||||
onSuccess,
|
onSuccess,
|
||||||
onError,
|
onError,
|
||||||
@ -37,6 +39,7 @@ export async function runWebScraper({
|
|||||||
url: string;
|
url: string;
|
||||||
mode: "crawl" | "single_urls" | "sitemap";
|
mode: "crawl" | "single_urls" | "sitemap";
|
||||||
crawlerOptions: any;
|
crawlerOptions: any;
|
||||||
|
pageOptions?: any;
|
||||||
inProgress: (progress: any) => void;
|
inProgress: (progress: any) => void;
|
||||||
onSuccess: (result: any) => void;
|
onSuccess: (result: any) => void;
|
||||||
onError: (error: any) => void;
|
onError: (error: any) => void;
|
||||||
@ -44,18 +47,19 @@ export async function runWebScraper({
|
|||||||
}): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> {
|
}): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> {
|
||||||
try {
|
try {
|
||||||
const provider = new WebScraperDataProvider();
|
const provider = new WebScraperDataProvider();
|
||||||
|
|
||||||
if (mode === "crawl") {
|
if (mode === "crawl") {
|
||||||
await provider.setOptions({
|
await provider.setOptions({
|
||||||
mode: mode,
|
mode: mode,
|
||||||
urls: [url],
|
urls: [url],
|
||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions: crawlerOptions,
|
||||||
|
pageOptions: pageOptions,
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
await provider.setOptions({
|
await provider.setOptions({
|
||||||
mode: mode,
|
mode: mode,
|
||||||
urls: url.split(","),
|
urls: url.split(","),
|
||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions: crawlerOptions,
|
||||||
|
pageOptions: pageOptions,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
||||||
|
@ -13,6 +13,10 @@ describe("WebScraperDataProvider", () => {
|
|||||||
metadata: { sourceURL: "https://example.com/another-page" },
|
metadata: { sourceURL: "https://example.com/another-page" },
|
||||||
content: "![another alt text](./another-image.png)",
|
content: "![another alt text](./another-image.png)",
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/another-page" },
|
||||||
|
content: "![another alt text](./another-image.webp)",
|
||||||
|
},
|
||||||
{
|
{
|
||||||
metadata: { sourceURL: "https://example.com/data-image" },
|
metadata: { sourceURL: "https://example.com/data-image" },
|
||||||
content: "![data image](data:image/png;base64,...)",
|
content: "![data image](data:image/png;base64,...)",
|
||||||
@ -28,6 +32,10 @@ describe("WebScraperDataProvider", () => {
|
|||||||
metadata: { sourceURL: "https://example.com/another-page" },
|
metadata: { sourceURL: "https://example.com/another-page" },
|
||||||
content: "![another alt text](https://example.com/another-image.png)",
|
content: "![another alt text](https://example.com/another-image.png)",
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
metadata: { sourceURL: "https://example.com/another-page" },
|
||||||
|
content: "![another alt text](https://example.com/another-image.webp)",
|
||||||
|
},
|
||||||
{
|
{
|
||||||
metadata: { sourceURL: "https://example.com/data-image" },
|
metadata: { sourceURL: "https://example.com/data-image" },
|
||||||
content: "![data image](data:image/png;base64,...)",
|
content: "![data image](data:image/png;base64,...)",
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
import { Document } from "../../lib/entities";
|
import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
|
||||||
import { Progress } from "../../lib/entities";
|
import { Progress } from "../../lib/entities";
|
||||||
import { scrapSingleUrl } from "./single_url";
|
import { scrapSingleUrl } from "./single_url";
|
||||||
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
||||||
@ -6,19 +6,7 @@ import { WebCrawler } from "./crawler";
|
|||||||
import { getValue, setValue } from "../../services/redis";
|
import { getValue, setValue } from "../../services/redis";
|
||||||
import { getImageDescription } from "./utils/gptVision";
|
import { getImageDescription } from "./utils/gptVision";
|
||||||
|
|
||||||
export type WebScraperOptions = {
|
|
||||||
urls: string[];
|
|
||||||
mode: "single_urls" | "sitemap" | "crawl";
|
|
||||||
crawlerOptions?: {
|
|
||||||
returnOnlyUrls?: boolean;
|
|
||||||
includes?: string[];
|
|
||||||
excludes?: string[];
|
|
||||||
maxCrawledLinks?: number;
|
|
||||||
limit?: number;
|
|
||||||
generateImgAltText?: boolean;
|
|
||||||
};
|
|
||||||
concurrentRequests?: number;
|
|
||||||
};
|
|
||||||
export class WebScraperDataProvider {
|
export class WebScraperDataProvider {
|
||||||
private urls: string[] = [""];
|
private urls: string[] = [""];
|
||||||
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
||||||
@ -29,6 +17,7 @@ export class WebScraperDataProvider {
|
|||||||
private limit: number = 10000;
|
private limit: number = 10000;
|
||||||
private concurrentRequests: number = 20;
|
private concurrentRequests: number = 20;
|
||||||
private generateImgAltText: boolean = false;
|
private generateImgAltText: boolean = false;
|
||||||
|
private pageOptions?: PageOptions;
|
||||||
|
|
||||||
authorize(): void {
|
authorize(): void {
|
||||||
throw new Error("Method not implemented.");
|
throw new Error("Method not implemented.");
|
||||||
@ -51,7 +40,7 @@ export class WebScraperDataProvider {
|
|||||||
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
||||||
await Promise.all(
|
await Promise.all(
|
||||||
batchUrls.map(async (url, index) => {
|
batchUrls.map(async (url, index) => {
|
||||||
const result = await scrapSingleUrl(url, true);
|
const result = await scrapSingleUrl(url, true, this.pageOptions);
|
||||||
processedUrls++;
|
processedUrls++;
|
||||||
if (inProgress) {
|
if (inProgress) {
|
||||||
inProgress({
|
inProgress({
|
||||||
@ -321,6 +310,7 @@ export class WebScraperDataProvider {
|
|||||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||||
this.generateImgAltText =
|
this.generateImgAltText =
|
||||||
options.crawlerOptions?.generateImgAltText ?? false;
|
options.crawlerOptions?.generateImgAltText ?? false;
|
||||||
|
this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
|
||||||
|
|
||||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
||||||
this.excludes = this.excludes.filter((item) => item !== "");
|
this.excludes = this.excludes.filter((item) => item !== "");
|
||||||
|
@ -2,9 +2,10 @@ import * as cheerio from "cheerio";
|
|||||||
import { ScrapingBeeClient } from "scrapingbee";
|
import { ScrapingBeeClient } from "scrapingbee";
|
||||||
import { extractMetadata } from "./utils/metadata";
|
import { extractMetadata } from "./utils/metadata";
|
||||||
import dotenv from "dotenv";
|
import dotenv from "dotenv";
|
||||||
import { Document } from "../../lib/entities";
|
import { Document, PageOptions } from "../../lib/entities";
|
||||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||||
import { parseTablesToMarkdown } from "./utils/parseTable";
|
import { parseTablesToMarkdown } from "./utils/parseTable";
|
||||||
|
import { excludeNonMainTags } from "./utils/excludeTags";
|
||||||
// import puppeteer from "puppeteer";
|
// import puppeteer from "puppeteer";
|
||||||
|
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
@ -77,14 +78,21 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
|
|||||||
|
|
||||||
export async function scrapSingleUrl(
|
export async function scrapSingleUrl(
|
||||||
urlToScrap: string,
|
urlToScrap: string,
|
||||||
toMarkdown: boolean = true
|
toMarkdown: boolean = true,
|
||||||
|
pageOptions: PageOptions = { onlyMainContent: true }
|
||||||
): Promise<Document> {
|
): Promise<Document> {
|
||||||
console.log(`Scraping URL: ${urlToScrap}`);
|
console.log(`Scraping URL: ${urlToScrap}`);
|
||||||
urlToScrap = urlToScrap.trim();
|
urlToScrap = urlToScrap.trim();
|
||||||
|
|
||||||
const removeUnwantedElements = (html: string) => {
|
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
|
||||||
const soup = cheerio.load(html);
|
const soup = cheerio.load(html);
|
||||||
soup("script, style, iframe, noscript, meta, head").remove();
|
soup("script, style, iframe, noscript, meta, head").remove();
|
||||||
|
if (pageOptions.onlyMainContent) {
|
||||||
|
// remove any other tags that are not in the main content
|
||||||
|
excludeNonMainTags.forEach((tag) => {
|
||||||
|
soup(tag).remove();
|
||||||
|
});
|
||||||
|
}
|
||||||
return soup.html();
|
return soup.html();
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -133,7 +141,7 @@ export async function scrapSingleUrl(
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
let cleanedHtml = removeUnwantedElements(text);
|
let cleanedHtml = removeUnwantedElements(text, pageOptions);
|
||||||
return [await parseMarkdown(cleanedHtml), text];
|
return [await parseMarkdown(cleanedHtml), text];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
60
apps/api/src/scraper/WebScraper/utils/excludeTags.ts
Normal file
60
apps/api/src/scraper/WebScraper/utils/excludeTags.ts
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
export const excludeNonMainTags = [
|
||||||
|
"header",
|
||||||
|
"footer",
|
||||||
|
"nav",
|
||||||
|
"aside",
|
||||||
|
".header",
|
||||||
|
".top",
|
||||||
|
".navbar",
|
||||||
|
"#header",
|
||||||
|
".footer",
|
||||||
|
".bottom",
|
||||||
|
"#footer",
|
||||||
|
".sidebar",
|
||||||
|
".side",
|
||||||
|
".aside",
|
||||||
|
"#sidebar",
|
||||||
|
".modal",
|
||||||
|
".popup",
|
||||||
|
"#modal",
|
||||||
|
".overlay",
|
||||||
|
".ad",
|
||||||
|
".ads",
|
||||||
|
".advert",
|
||||||
|
"#ad",
|
||||||
|
".lang-selector",
|
||||||
|
".language",
|
||||||
|
"#language-selector",
|
||||||
|
".social",
|
||||||
|
".social-media",
|
||||||
|
".social-links",
|
||||||
|
"#social",
|
||||||
|
".menu",
|
||||||
|
".navigation",
|
||||||
|
"#nav",
|
||||||
|
".breadcrumbs",
|
||||||
|
"#breadcrumbs",
|
||||||
|
".form",
|
||||||
|
"form",
|
||||||
|
"#search-form",
|
||||||
|
".search",
|
||||||
|
"#search",
|
||||||
|
".share",
|
||||||
|
"#share",
|
||||||
|
".pagination",
|
||||||
|
"#pagination",
|
||||||
|
".widget",
|
||||||
|
"#widget",
|
||||||
|
".related",
|
||||||
|
"#related",
|
||||||
|
".tag",
|
||||||
|
"#tag",
|
||||||
|
".category",
|
||||||
|
"#category",
|
||||||
|
".comment",
|
||||||
|
"#comment",
|
||||||
|
".reply",
|
||||||
|
"#reply",
|
||||||
|
".author",
|
||||||
|
"#author",
|
||||||
|
];
|
@ -20,7 +20,9 @@ export interface WebScraperOptions {
|
|||||||
url: string;
|
url: string;
|
||||||
mode: "crawl" | "single_urls" | "sitemap";
|
mode: "crawl" | "single_urls" | "sitemap";
|
||||||
crawlerOptions: any;
|
crawlerOptions: any;
|
||||||
|
pageOptions: any;
|
||||||
team_id: string;
|
team_id: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user