
Merge pull request #283 from mendableai/nsc/crawler-fixes

Fixes crawler getting confused with base paths that contain www.
Nicolas 2024-06-14 13:50:32 -07:00 committed by GitHub
commit 4ec863718b
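In practice the fix treats a leading "www." as insignificant when deciding whether a discovered link is internal to the crawl's base URL. Below is a minimal standalone sketch of that comparison; the isSameSite helper and the example URLs are illustrative only and are not part of this change, which lives in isInternalLink in the diff further down.

// Minimal sketch of www-insensitive host matching. isSameSite is a
// hypothetical standalone helper, not a method of the WebCrawler class.
function isSameSite(baseUrl: string, link: string): boolean {
  // Strip the protocol and a leading "www." from the base URL.
  const baseDomain = baseUrl
    .replace(/^https?:\/\//, "")
    .replace(/^www\./, "")
    .trim();
  // Resolve the link against the base and strip "www." the same way.
  const linkDomain = new URL(link, baseUrl).hostname.replace(/^www\./, "").trim();
  return linkDomain === baseDomain;
}

// Before the fix, a base URL of "https://www.example.com" would not match
// links served from "https://example.com"; with "www." stripped, both sides compare equal.
console.log(isSameSite("https://www.example.com", "https://example.com/docs")); // true
console.log(isSameSite("https://www.example.com", "https://other.com/docs"));   // false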


@@ -224,11 +224,10 @@ export class WebCrawler {
   }

   async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
-    const normalizedUrl = this.normalizeCrawlUrl(url);
-    if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
+    if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
       return [];
     }
-    this.visited.add(normalizedUrl);
+    this.visited.add(url);

     if (!url.startsWith("http")) {
       url = "https://" + url;
@@ -276,15 +275,16 @@ export class WebCrawler {
           const urlObj = new URL(fullUrl);
           const path = urlObj.pathname;

           if (
             this.isInternalLink(fullUrl) &&
-            this.matchesPattern(fullUrl) &&
             this.noSections(fullUrl) &&
             // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
             // this.matchesIncludes(path) &&
             !this.matchesExcludes(path) &&
-            this.robots.isAllowed(fullUrl, "FireCrawlAgent")
+            this.isRobotsAllowed(fullUrl)
           ) {
             links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
           }
         }
@@ -294,12 +294,15 @@ export class WebCrawler {
         return links;
       }

       // Create a new list to return to avoid modifying the visited list
-      return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
+      return links.filter((link) => !this.visited.has(link.url));
     } catch (error) {
       return [];
     }
   }

+  private isRobotsAllowed(url: string): boolean {
+    return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
+  }
+
   private normalizeCrawlUrl(url: string): string {
     try{
       const urlObj = new URL(url);
@@ -326,12 +329,10 @@ export class WebCrawler {
   private isInternalLink(link: string): boolean {
     const urlObj = new URL(link, this.baseUrl);
-    const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, "");
-    return urlObj.hostname === domainWithoutProtocol;
-  }
-
-  private matchesPattern(link: string): boolean {
-    return true; // Placeholder for future pattern matching implementation
+    const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
+    const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
+
+    return linkDomain === baseDomain;
   }

   private isFile(url: string): boolean {
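Beyond the www handling, the diff above swaps the direct this.robots.isAllowed(...) call for the new isRobotsAllowed helper, which defaults to allowing a URL when no robots parser is loaded or the parser gives no verdict. Below is a rough standalone sketch of that default-allow behaviour, assuming a parser object with the same isAllowed(url, agent) shape; the RobotsChecker type and the sample parser are illustrative only.

// Hypothetical standalone version of the default-allow robots check.
// "robots" mirrors the parser interface used in the diff: isAllowed()
// may return true, false, or undefined when no rule applies.
type RobotsChecker = { isAllowed(url: string, agent: string): boolean | undefined };

function isRobotsAllowed(robots: RobotsChecker | null, url: string): boolean {
  // No parser loaded, or an undefined verdict, both fall back to "allowed".
  return robots ? (robots.isAllowed(url, "FireCrawlAgent") ?? true) : true;
}

// Example: a parser that only answers for /private paths.
const robots: RobotsChecker = {
  isAllowed: (url) => (new URL(url).pathname.startsWith("/private") ? false : undefined),
};

console.log(isRobotsAllowed(robots, "https://example.com/private/a")); // false
console.log(isRobotsAllowed(robots, "https://example.com/docs"));      // true (undefined -> allowed)
console.log(isRobotsAllowed(null, "https://example.com/docs"));        // true (no parser -> allowed)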