0

Merge pull request #283 from mendableai/nsc/crawler-fixes

Fixes crawler getting confused with base paths that contain www.
This commit is contained in:
Nicolas 2024-06-14 13:50:32 -07:00 committed by GitHub
commit 4ec863718b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -224,11 +224,10 @@ export class WebCrawler {
}
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
const normalizedUrl = this.normalizeCrawlUrl(url);
if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
return [];
}
this.visited.add(normalizedUrl);
this.visited.add(url);
if (!url.startsWith("http")) {
url = "https://" + url;
@ -276,15 +275,16 @@ export class WebCrawler {
const urlObj = new URL(fullUrl);
const path = urlObj.pathname;
if (
this.isInternalLink(fullUrl) &&
this.matchesPattern(fullUrl) &&
this.noSections(fullUrl) &&
// The includes check below is intentionally commented out to allow wider website coverage; results are filtered against the includes afterwards anyway.
// this.matchesIncludes(path) &&
!this.matchesExcludes(path) &&
this.robots.isAllowed(fullUrl, "FireCrawlAgent")
this.isRobotsAllowed(fullUrl)
) {
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
}
}
@ -294,12 +294,15 @@ export class WebCrawler {
return links;
}
// Create a new list to return to avoid modifying the visited list
return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
return links.filter((link) => !this.visited.has(link.url));
} catch (error) {
return [];
}
}
/**
 * Asks the robots rules whether `url` may be fetched by the "FireCrawlAgent" user agent.
 *
 * @param url - The URL to check.
 * @returns `true` when no robots object is set or when the lookup yields
 *          `null`/`undefined`; otherwise the value reported by `isAllowed`.
 */
private isRobotsAllowed(url: string): boolean {
  // Without a robots object there is nothing to consult, so allow the URL.
  if (!this.robots) {
    return true;
  }
  const verdict = this.robots.isAllowed(url, "FireCrawlAgent");
  // A nullish verdict is treated as allowed.
  return verdict ?? true;
}
private normalizeCrawlUrl(url: string): string {
try{
const urlObj = new URL(url);
@ -326,12 +329,10 @@ export class WebCrawler {
/**
 * Reports whether `link` has the same hostname as the crawler's base URL.
 *
 * @param link - An absolute or relative URL; it is resolved against `this.baseUrl`.
 * @returns `true` when the resolved hostname is exactly equal to `this.baseUrl`
 *          with its leading `http://` or `https://` removed.
 */
private isInternalLink(link: string): boolean {
// Resolve against baseUrl so that relative links receive a hostname.
const urlObj = new URL(link, this.baseUrl);
// Only the scheme prefix is stripped; a "www." prefix is kept, so the
// comparison below is an exact string match against the rest of baseUrl.
const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, "");
return urlObj.hostname === domainWithoutProtocol;
}
const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
/**
 * Placeholder pattern check: currently accepts every link.
 *
 * @param link - The URL to test (not inspected at present).
 * @returns Always `true`.
 */
private matchesPattern(link: string): boolean {
return true; // Placeholder for future pattern matching implementation
// NOTE(review): unreachable after the return above, and neither linkDomain nor
// baseDomain is declared in this method; this line presumably belongs to
// isInternalLink in the diff this listing was rendered from — confirm.
return linkDomain === baseDomain;
}
private isFile(url: string): boolean {