Merge pull request #283 from mendableai/nsc/crawler-fixes
Fixes crawler getting confused with base paths that contain www.
This commit is contained in:
commit
4ec863718b
@ -224,11 +224,10 @@ export class WebCrawler {
|
||||
}
|
||||
|
||||
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
|
||||
const normalizedUrl = this.normalizeCrawlUrl(url);
|
||||
if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
|
||||
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
|
||||
return [];
|
||||
}
|
||||
this.visited.add(normalizedUrl);
|
||||
this.visited.add(url);
|
||||
|
||||
if (!url.startsWith("http")) {
|
||||
url = "https://" + url;
|
||||
@ -276,15 +275,16 @@ export class WebCrawler {
|
||||
const urlObj = new URL(fullUrl);
|
||||
const path = urlObj.pathname;
|
||||
|
||||
|
||||
if (
|
||||
this.isInternalLink(fullUrl) &&
|
||||
this.matchesPattern(fullUrl) &&
|
||||
this.noSections(fullUrl) &&
|
||||
// The includes check below is intentionally commented out to allow wider website coverage; we filter on it afterwards anyway
|
||||
// this.matchesIncludes(path) &&
|
||||
!this.matchesExcludes(path) &&
|
||||
this.robots.isAllowed(fullUrl, "FireCrawlAgent")
|
||||
this.isRobotsAllowed(fullUrl)
|
||||
) {
|
||||
|
||||
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
|
||||
}
|
||||
}
|
||||
@ -294,12 +294,15 @@ export class WebCrawler {
|
||||
return links;
|
||||
}
|
||||
// Create a new list to return to avoid modifying the visited list
|
||||
return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
|
||||
return links.filter((link) => !this.visited.has(link.url));
|
||||
} catch (error) {
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Reports whether the robots rules permit the "FireCrawlAgent" user agent
 * to fetch the given URL.
 *
 * @param url - The URL to check against `this.robots`.
 * @returns `true` when `this.robots` is falsy, or when `isAllowed` yields a
 *   nullish result for the URL; otherwise the value `isAllowed` returned.
 */
private isRobotsAllowed(url: string): boolean {
  // Without a robots object there is nothing to restrict the crawl.
  if (!this.robots) {
    return true;
  }

  // A nullish verdict from the robots object is treated as "allowed".
  return this.robots.isAllowed(url, "FireCrawlAgent") ?? true;
}
|
||||
private normalizeCrawlUrl(url: string): string {
|
||||
try{
|
||||
const urlObj = new URL(url);
|
||||
@ -326,12 +329,10 @@ export class WebCrawler {
|
||||
|
||||
private isInternalLink(link: string): boolean {
|
||||
const urlObj = new URL(link, this.baseUrl);
|
||||
const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, "");
|
||||
return urlObj.hostname === domainWithoutProtocol;
|
||||
}
|
||||
|
||||
private matchesPattern(link: string): boolean {
|
||||
return true; // Placeholder for future pattern matching implementation
|
||||
const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
|
||||
const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
|
||||
|
||||
return linkDomain === baseDomain;
|
||||
}
|
||||
|
||||
private isFile(url: string): boolean {
|
||||
|
Loading…
Reference in New Issue
Block a user