
Merge pull request #263 from mendableai/nsc/pageoptions-crawler

ignoreSitemap feature, pageOptions now respected in the initial crawl as well
Nicolas 2024-06-10 18:22:51 -07:00 committed by GitHub
commit 15e791ffb1
4 changed files with 65 additions and 41 deletions
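This PR splits the crawler settings out into their own CrawlerOptions type and adds an ignoreSitemap flag (see the first file below). A minimal TypeScript sketch of how a caller might assemble the new options; the type declarations are trimmed copies of the ones in this diff, and the URL and option values are illustrative only:

```ts
// Trimmed copies of the types introduced in this PR (illustrative, not the full definitions).
type CrawlerOptions = {
  includes?: string[];
  excludes?: string[];
  limit?: number;
  ignoreSitemap?: boolean;
  mode?: "default" | "fast";
};

type PageOptions = {
  includeHtml?: boolean; // the only PageOptions field visible in this diff; the real type has more
};

type WebScraperOptions = {
  urls: string[];
  mode: "single_urls" | "sitemap" | "crawl";
  crawlerOptions?: CrawlerOptions;
  pageOptions?: PageOptions;
};

// Skip the sitemap entirely and apply pageOptions starting from the very first crawled page.
const options: WebScraperOptions = {
  urls: ["https://example.com"],
  mode: "crawl",
  crawlerOptions: { ignoreSitemap: true, limit: 100 },
  pageOptions: { includeHtml: true },
};

console.log(JSON.stringify(options, null, 2));
```

With ignoreSitemap set, the crawler skips the sitemap lookup and discovers links by crawling from the initial page, which is now scraped with the supplied pageOptions.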

View File

@@ -35,10 +35,7 @@ export type SearchOptions = {
   location?: string;
 };
 
-export type WebScraperOptions = {
-  urls: string[];
-  mode: "single_urls" | "sitemap" | "crawl";
-  crawlerOptions?: {
+export type CrawlerOptions = {
   returnOnlyUrls?: boolean;
   includes?: string[];
   excludes?: string[];
@@ -47,8 +44,14 @@ export type WebScraperOptions = {
   limit?: number;
   generateImgAltText?: boolean;
   replaceAllPathsWithAbsolutePaths?: boolean;
+  ignoreSitemap?: boolean;
   mode?: "default" | "fast"; // have a mode of some sort
-  };
+}
+
+export type WebScraperOptions = {
+  urls: string[];
+  mode: "single_urls" | "sitemap" | "crawl";
+  crawlerOptions?: CrawlerOptions;
   pageOptions?: PageOptions;
   extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;

View File

@@ -3,7 +3,7 @@ import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import async from "async";
-import { Progress } from "../../lib/entities";
+import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
@@ -108,6 +108,8 @@ export class WebCrawler {
   public async start(
     inProgress?: (progress: Progress) => void,
+    pageOptions?: PageOptions,
+    crawlerOptions?: CrawlerOptions,
     concurrencyLimit: number = 5,
     limit: number = 10000,
     maxDepth: number = 10
@@ -122,17 +124,21 @@
     }
 
+    if(!crawlerOptions?.ignoreSitemap){
     const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
     if (sitemapLinks.length > 0) {
       let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
       return filteredLinks.map(link => ({ url: link, html: "" }));
     }
+    }
 
     const urls = await this.crawlUrls(
       [this.initialUrl],
+      pageOptions,
       concurrencyLimit,
       inProgress
     );
 
     if (
       urls.length === 0 &&
       this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
@@ -140,14 +146,15 @@
       return [{ url: this.initialUrl, html: "" }];
     }
 
     // make sure to run include exclude here again
     const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
     return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
   }
 
   private async crawlUrls(
     urls: string[],
+    pageOptions: PageOptions,
     concurrencyLimit: number,
     inProgress?: (progress: Progress) => void,
   ): Promise<{ url: string, html: string }[]> {
@@ -158,7 +165,7 @@
         }
         return;
       }
-      const newUrls = await this.crawl(task);
+      const newUrls = await this.crawl(task, pageOptions);
       // add the initial url if not already added
       // if (this.visited.size === 1) {
       //   let normalizedInitial = this.initialUrl;
@@ -188,7 +195,7 @@
           currentDocumentUrl: task,
         });
       }
-      await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
+      await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
       if (callback && typeof callback === "function") {
         callback();
       }
@@ -207,20 +214,18 @@
     return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
   }
 
-  async crawl(url: string): Promise<{url: string, html: string}[]> {
-    if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
+  async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
+    const normalizedUrl = this.normalizeCrawlUrl(url);
+    if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
       return [];
     }
-    this.visited.add(url);
+    this.visited.add(normalizedUrl);
     if (!url.startsWith("http")) {
       url = "https://" + url;
     }
     if (url.endsWith("/")) {
       url = url.slice(0, -1);
     }
     if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
@@ -231,8 +236,8 @@
       let content: string = "";
       // If it is the first link, fetch with single url
       if (this.visited.size === 1) {
-        const page = await scrapSingleUrl(url, {includeHtml: true});
-        content = page.html ?? ""
+        const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
+        content = page.html ?? "";
       } else {
         const response = await axios.get(url);
         content = response.data ?? "";
@@ -241,12 +246,10 @@
       let links: { url: string, html: string }[] = [];
 
       // Add the initial URL to the list of links
-      if(this.visited.size === 1)
-      {
+      if (this.visited.size === 1) {
         links.push({ url, html: content });
       }
 
       $("a").each((_, element) => {
         const href = $(element).attr("href");
         if (href) {
@@ -254,14 +257,15 @@
           if (!href.startsWith("http")) {
             fullUrl = new URL(href, this.baseUrl).toString();
           }
-          const url = new URL(fullUrl);
-          const path = url.pathname;
+          const urlObj = new URL(fullUrl);
+          const path = urlObj.pathname;
 
           if (
             this.isInternalLink(fullUrl) &&
             this.matchesPattern(fullUrl) &&
             this.noSections(fullUrl) &&
-            this.matchesIncludes(path) &&
+            // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
+            // this.matchesIncludes(path) &&
             !this.matchesExcludes(path) &&
             this.robots.isAllowed(fullUrl, "FireCrawlAgent")
           ) {
@@ -274,12 +278,22 @@
           return links;
         }
 
       // Create a new list to return to avoid modifying the visited list
-      return links.filter((link) => !this.visited.has(link.url));
+      return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
     } catch (error) {
       return [];
     }
   }
 
+  private normalizeCrawlUrl(url: string): string {
+    try{
+      const urlObj = new URL(url);
+      urlObj.searchParams.sort(); // Sort query parameters to normalize
+      return urlObj.toString();
+    } catch (error) {
+      return url;
+    }
+  }
+
   private matchesIncludes(url: string): boolean {
     if (this.includes.length === 0 || this.includes[0] == "") return true;
     return this.includes.some((pattern) => new RegExp(pattern).test(url));
@@ -388,7 +402,6 @@ export class WebCrawler {
     // Normalize and check if the URL is present in any of the sitemaps
     const normalizedUrl = normalizeUrl(url);
     const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
     // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
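The new normalizeCrawlUrl helper added above dedupes visited URLs by sorting query parameters before they go into the visited set. A standalone sketch of that idea (same logic, outside the class), assuming Node's global URL; the example URLs are made up:

```ts
// Standalone version of the normalization used by WebCrawler.normalizeCrawlUrl:
// sorting query parameters makes param order irrelevant when deduplicating URLs.
function normalizeCrawlUrl(url: string): string {
  try {
    const urlObj = new URL(url);
    urlObj.searchParams.sort(); // "?b=2&a=1" and "?a=1&b=2" become the same string
    return urlObj.toString();
  } catch {
    return url; // leave unparseable URLs untouched, as the method does
  }
}

const visited = new Set<string>();
for (const link of [
  "https://example.com/page?b=2&a=1",
  "https://example.com/page?a=1&b=2",
]) {
  visited.add(normalizeCrawlUrl(link));
}
console.log(visited.size); // 1: both orderings collapse to a single visited entry
```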

View File

@@ -31,6 +31,7 @@ export class WebScraperDataProvider {
   private limit: number = 10000;
   private concurrentRequests: number = 20;
   private generateImgAltText: boolean = false;
+  private ignoreSitemap: boolean = false;
   private pageOptions?: PageOptions;
   private extractorOptions?: ExtractorOptions;
   private replaceAllPathsWithAbsolutePaths?: boolean = false;
@@ -38,6 +39,7 @@ export class WebScraperDataProvider {
     "gpt-4-turbo";
   private crawlerMode: string = "default";
 
   authorize(): void {
     throw new Error("Method not implemented.");
   }
@@ -173,6 +175,10 @@
     let links = await crawler.start(
       inProgress,
+      this.pageOptions,
+      {
+        ignoreSitemap: this.ignoreSitemap,
+      },
       5,
       this.limit,
       this.maxCrawledDepth
@@ -473,6 +479,7 @@
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
     this.crawlerMode = options.crawlerOptions?.mode ?? "default";
+    this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
 
     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {
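The provider changes above only thread the new settings through: ignoreSitemap is read from crawlerOptions with a false default and handed to crawler.start together with pageOptions. A simplified sketch of that propagation; SketchProvider and SketchCrawler are made-up names, and the start signature is reduced to the parameters relevant here:

```ts
// Simplified sketch of how the provider forwards the new options.
// The real WebScraperDataProvider and WebCrawler carry far more state than this.
type PageOptions = { includeHtml?: boolean };
type CrawlerOptions = { ignoreSitemap?: boolean };

class SketchCrawler {
  async start(pageOptions?: PageOptions, crawlerOptions?: CrawlerOptions): Promise<void> {
    if (!crawlerOptions?.ignoreSitemap) {
      console.log("would try the sitemap first");
    }
    console.log("first page scraped with", pageOptions);
  }
}

class SketchProvider {
  private ignoreSitemap = false;
  private pageOptions?: PageOptions;

  setOptions(options: { crawlerOptions?: CrawlerOptions; pageOptions?: PageOptions }): void {
    // Same defaulting pattern as the diff: an absent flag means "use the sitemap".
    this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
    this.pageOptions = options.pageOptions;
  }

  async run(): Promise<void> {
    const crawler = new SketchCrawler();
    await crawler.start(this.pageOptions, { ignoreSitemap: this.ignoreSitemap });
  }
}

const provider = new SketchProvider();
provider.setOptions({ crawlerOptions: { ignoreSitemap: true }, pageOptions: { includeHtml: true } });
provider.run().catch(console.error);
```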

View File

@@ -12,6 +12,7 @@ export async function getLinksFromSitemap(
     content = response.data;
   } catch (error) {
     console.error(`Request failed for ${sitemapUrl}: ${error}`);
     return allUrls;
   }