0

Update crawler.ts

This commit is contained in:
Nicolas 2024-06-14 13:44:54 -07:00
parent e37aa3db57
commit e88cb314c8

View File

@ -224,11 +224,10 @@ export class WebCrawler {
}
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
const normalizedUrl = this.normalizeCrawlUrl(url);
if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
return [];
}
this.visited.add(normalizedUrl);
this.visited.add(url);
if (!url.startsWith("http")) {
url = "https://" + url;
@ -276,15 +275,16 @@ export class WebCrawler {
const urlObj = new URL(fullUrl);
const path = urlObj.pathname;
if (
this.isInternalLink(fullUrl) &&
this.matchesPattern(fullUrl) &&
this.noSections(fullUrl) &&
// The includes check below is intentionally commented out to allow wider website coverage; this filtering is applied afterwards anyway.
// this.matchesIncludes(path) &&
!this.matchesExcludes(path) &&
this.robots.isAllowed(fullUrl, "FireCrawlAgent")
this.isRobotsAllowed(fullUrl)
) {
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
}
}
@ -294,12 +294,15 @@ export class WebCrawler {
return links;
}
// Create a new list to return to avoid modifying the visited list
return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
return links.filter((link) => !this.visited.has(link.url));
} catch (error) {
return [];
}
}
/**
 * Reports whether the robots rules permit fetching `url` for the
 * "FireCrawlAgent" user agent.
 *
 * @param url - The URL to check.
 * @returns `true` when no robots object is set on this instance, or when the
 *   robots check yields `null`/`undefined`; otherwise the result of the check.
 */
private isRobotsAllowed(url: string): boolean {
  // Without a robots object there is nothing to consult, so allow the URL.
  if (!this.robots) {
    return true;
  }
  const verdict = this.robots.isAllowed(url, "FireCrawlAgent");
  // A null/undefined verdict is treated as "allowed".
  return verdict ?? true;
}
private normalizeCrawlUrl(url: string): string {
try{
const urlObj = new URL(url);
@ -326,12 +329,10 @@ export class WebCrawler {
private isInternalLink(link: string): boolean {
const urlObj = new URL(link, this.baseUrl);
const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, "");
return urlObj.hostname === domainWithoutProtocol;
}
const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
private matchesPattern(link: string): boolean {
return true; // Placeholder for future pattern matching implementation
return linkDomain === baseDomain;
}
private isFile(url: string): boolean {