Merge pull request #263 from mendableai/nsc/pageoptions-crawler
ignoreSitemap feature, pageOptions now respected in the initial crawl as well
commit 15e791ffb1
@@ -35,10 +35,7 @@ export type SearchOptions = {
   location?: string;
 };
 
-export type WebScraperOptions = {
-  urls: string[];
-  mode: "single_urls" | "sitemap" | "crawl";
-  crawlerOptions?: {
+export type CrawlerOptions = {
   returnOnlyUrls?: boolean;
   includes?: string[];
   excludes?: string[];
@@ -47,8 +44,14 @@ export type WebScraperOptions = {
   limit?: number;
   generateImgAltText?: boolean;
   replaceAllPathsWithAbsolutePaths?: boolean;
+  ignoreSitemap?: boolean;
   mode?: "default" | "fast"; // have a mode of some sort
-  };
+}
+
+export type WebScraperOptions = {
+  urls: string[];
+  mode: "single_urls" | "sitemap" | "crawl";
+  crawlerOptions?: CrawlerOptions;
   pageOptions?: PageOptions;
   extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
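For illustration (not part of this diff): a minimal sketch of how the split types could be used together after this change. The import path and the includeHtml page option are assumptions based on other hunks in this commit.

```ts
// Sketch only - not part of this commit.
// Assumes CrawlerOptions and WebScraperOptions are exported from the entities
// module edited above (import path is illustrative).
import { CrawlerOptions, WebScraperOptions } from "./lib/entities";

const crawlerOptions: CrawlerOptions = {
  includes: ["/blog/.*"],
  excludes: ["/login.*"],
  limit: 100,
  ignoreSitemap: true, // new flag: skip sitemap-based link discovery
};

const scraperOptions: WebScraperOptions = {
  urls: ["https://example.com"],
  mode: "crawl",
  crawlerOptions, // now typed as the standalone CrawlerOptions
  pageOptions: { includeHtml: false }, // includeHtml assumed from the crawler hunk below
};
```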
@@ -3,7 +3,7 @@ import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import async from "async";
-import { Progress } from "../../lib/entities";
+import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
 
@@ -108,6 +108,8 @@ export class WebCrawler {
 
   public async start(
     inProgress?: (progress: Progress) => void,
+    pageOptions?: PageOptions,
+    crawlerOptions?: CrawlerOptions,
     concurrencyLimit: number = 5,
     limit: number = 10000,
     maxDepth: number = 10
@@ -122,17 +124,21 @@ export class WebCrawler {
     }
 
+    if(!crawlerOptions?.ignoreSitemap){
       const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
       if (sitemapLinks.length > 0) {
         let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
         return filteredLinks.map(link => ({ url: link, html: "" }));
       }
+    }
 
     const urls = await this.crawlUrls(
       [this.initialUrl],
+      pageOptions,
       concurrencyLimit,
       inProgress
     );
 
     if (
       urls.length === 0 &&
       this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
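For illustration (not part of this diff): a sketch of calling the updated start() signature, with pageOptions and crawlerOptions passed positionally as added above. The async wrapper and the already-constructed WebCrawler instance are assumptions.

```ts
// Sketch only - the argument order follows the start() signature changed in this PR.
async function crawlWithoutSitemap(crawler: WebCrawler) {
  // With ignoreSitemap: true the sitemap shortcut above is skipped, so the
  // crawl begins at the initial URL and discovers links by following anchors.
  return crawler.start(
    undefined,               // inProgress callback
    { includeHtml: false },  // pageOptions, now forwarded to the initial scrape
    { ignoreSitemap: true }, // crawlerOptions
    5,                       // concurrencyLimit
    10000,                   // limit
    10                       // maxDepth
  );
}
```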
@@ -140,14 +146,15 @@ export class WebCrawler {
       return [{ url: this.initialUrl, html: "" }];
     }
 
     // make sure to run include exclude here again
     const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
 
     return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
   }
 
   private async crawlUrls(
     urls: string[],
+    pageOptions: PageOptions,
     concurrencyLimit: number,
     inProgress?: (progress: Progress) => void,
   ): Promise<{ url: string, html: string }[]> {
@@ -158,7 +165,7 @@ export class WebCrawler {
         }
         return;
       }
-      const newUrls = await this.crawl(task);
+      const newUrls = await this.crawl(task, pageOptions);
       // add the initial url if not already added
       // if (this.visited.size === 1) {
       //   let normalizedInitial = this.initialUrl;
@@ -188,7 +195,7 @@ export class WebCrawler {
           currentDocumentUrl: task,
         });
       }
-      await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
+      await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
       if (callback && typeof callback === "function") {
         callback();
       }
@@ -207,20 +214,18 @@ export class WebCrawler {
     return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
   }
 
-  async crawl(url: string): Promise<{url: string, html: string}[]> {
-    if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
+  async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
+    const normalizedUrl = this.normalizeCrawlUrl(url);
+    if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
       return [];
     }
-    this.visited.add(url);
-
+    this.visited.add(normalizedUrl);
 
     if (!url.startsWith("http")) {
       url = "https://" + url;
-
     }
     if (url.endsWith("/")) {
       url = url.slice(0, -1);
-
     }
 
     if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
@@ -231,8 +236,8 @@ export class WebCrawler {
       let content: string = "";
       // If it is the first link, fetch with single url
       if (this.visited.size === 1) {
-        const page = await scrapSingleUrl(url, {includeHtml: true});
-        content = page.html ?? ""
+        const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
+        content = page.html ?? "";
       } else {
         const response = await axios.get(url);
         content = response.data ?? "";
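Worth noting: because includeHtml is listed after the spread, it overrides whatever the caller set, so the first page is always scraped with HTML while the rest of the caller's pageOptions survive. A tiny sketch of that spread behavior (onlyMainContent is an assumed PageOptions field):

```ts
// Sketch of the spread semantics used above.
const pageOptions = { onlyMainContent: true, includeHtml: false }; // caller-supplied
const effective = { ...pageOptions, includeHtml: true };
// effective = { onlyMainContent: true, includeHtml: true }
// The forced includeHtml wins; the other caller options are preserved.
```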
@@ -241,12 +246,10 @@ export class WebCrawler {
       let links: { url: string, html: string }[] = [];
 
       // Add the initial URL to the list of links
-      if(this.visited.size === 1)
-      {
+      if (this.visited.size === 1) {
         links.push({ url, html: content });
       }
 
-
       $("a").each((_, element) => {
         const href = $(element).attr("href");
         if (href) {
@@ -254,14 +257,15 @@ export class WebCrawler {
           if (!href.startsWith("http")) {
             fullUrl = new URL(href, this.baseUrl).toString();
           }
-          const url = new URL(fullUrl);
-          const path = url.pathname;
+          const urlObj = new URL(fullUrl);
+          const path = urlObj.pathname;
 
           if (
             this.isInternalLink(fullUrl) &&
             this.matchesPattern(fullUrl) &&
             this.noSections(fullUrl) &&
-            this.matchesIncludes(path) &&
+            // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
+            // this.matchesIncludes(path) &&
             !this.matchesExcludes(path) &&
             this.robots.isAllowed(fullUrl, "FireCrawlAgent")
           ) {
@@ -274,12 +278,22 @@ export class WebCrawler {
         return links;
       }
       // Create a new list to return to avoid modifying the visited list
-      return links.filter((link) => !this.visited.has(link.url));
+      return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
     } catch (error) {
       return [];
     }
   }
 
+  private normalizeCrawlUrl(url: string): string {
+    try{
+      const urlObj = new URL(url);
+      urlObj.searchParams.sort(); // Sort query parameters to normalize
+      return urlObj.toString();
+    } catch (error) {
+      return url;
+    }
+  }
+
   private matchesIncludes(url: string): boolean {
     if (this.includes.length === 0 || this.includes[0] == "") return true;
     return this.includes.some((pattern) => new RegExp(pattern).test(url));
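For illustration (not part of this diff): the new normalizeCrawlUrl logic as a standalone function, showing why query-parameter order no longer causes duplicate visits.

```ts
// Standalone sketch of the normalization added above (same logic, outside the class).
import { URL } from "url";

function normalizeCrawlUrl(url: string): string {
  try {
    const urlObj = new URL(url);
    urlObj.searchParams.sort(); // sort query parameters so their order doesn't matter
    return urlObj.toString();
  } catch (error) {
    return url; // unparseable input is returned unchanged
  }
}

normalizeCrawlUrl("https://example.com/docs?b=2&a=1"); // -> "https://example.com/docs?a=1&b=2"
normalizeCrawlUrl("https://example.com/docs?a=1&b=2"); // -> "https://example.com/docs?a=1&b=2"
// Both spellings now map to the same key in the visited set, so the crawler
// no longer treats them as two different pages.
```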
@@ -388,7 +402,6 @@ export class WebCrawler {
 
     // Normalize and check if the URL is present in any of the sitemaps
     const normalizedUrl = normalizeUrl(url);
-
     const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
 
     // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
@@ -31,6 +31,7 @@ export class WebScraperDataProvider {
   private limit: number = 10000;
   private concurrentRequests: number = 20;
   private generateImgAltText: boolean = false;
+  private ignoreSitemap: boolean = false;
   private pageOptions?: PageOptions;
   private extractorOptions?: ExtractorOptions;
   private replaceAllPathsWithAbsolutePaths?: boolean = false;
@@ -38,6 +39,7 @@ export class WebScraperDataProvider {
     "gpt-4-turbo";
   private crawlerMode: string = "default";
 
+
   authorize(): void {
     throw new Error("Method not implemented.");
   }
@@ -173,6 +175,10 @@ export class WebScraperDataProvider {
 
     let links = await crawler.start(
       inProgress,
+      this.pageOptions,
+      {
+        ignoreSitemap: this.ignoreSitemap,
+      },
       5,
       this.limit,
       this.maxCrawledDepth
@@ -473,6 +479,7 @@ export class WebScraperDataProvider {
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
     this.crawlerMode = options.crawlerOptions?.mode ?? "default";
+    this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
 
     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {
@@ -12,6 +12,7 @@ export async function getLinksFromSitemap(
       content = response.data;
     } catch (error) {
       console.error(`Request failed for ${sitemapUrl}: ${error}`);
+
       return allUrls;
     }
 