Update crawler.ts
parent e37aa3db57
commit e88cb314c8
@@ -224,11 +224,10 @@ export class WebCrawler {
   }
 
   async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
-    const normalizedUrl = this.normalizeCrawlUrl(url);
-    if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
+    if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
       return [];
     }
-    this.visited.add(normalizedUrl);
+    this.visited.add(url);
 
     if (!url.startsWith("http")) {
       url = "https://" + url;
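With this hunk the visited set is keyed on the raw URL instead of the value returned by normalizeCrawlUrl. A minimal illustration of what raw-URL keys imply (the URLs are made up for the example):

const visited = new Set<string>();
visited.add("https://example.com/docs?page=1&sort=asc");

// Keyed on the raw string, a reordered query string counts as a new URL:
visited.has("https://example.com/docs?sort=asc&page=1"); // false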
@@ -276,15 +275,16 @@ export class WebCrawler {
           const urlObj = new URL(fullUrl);
           const path = urlObj.pathname;
+
+
           if (
             this.isInternalLink(fullUrl) &&
-            this.matchesPattern(fullUrl) &&
             this.noSections(fullUrl) &&
             // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
             // this.matchesIncludes(path) &&
             !this.matchesExcludes(path) &&
-            this.robots.isAllowed(fullUrl, "FireCrawlAgent")
+            this.isRobotsAllowed(fullUrl)
           ) {
 
             links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
           }
         }
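The condition drops the matchesPattern call (its old body, removed in the last hunk, simply returned true) and routes the robots.txt check through this.isRobotsAllowed, a helper added in the next hunk. A small standalone sketch of that guard's behaviour, using a hypothetical RobotsLike stub in place of the real parser instance:

type RobotsLike = { isAllowed(url: string, userAgent: string): boolean | undefined };

function isRobotsAllowed(robots: RobotsLike | undefined, url: string): boolean {
  // Same shape as the class method: default to "allowed" when no parser
  // has been set up yet, or when the parser returns no verdict.
  return robots ? (robots.isAllowed(url, "FireCrawlAgent") ?? true) : true;
}

isRobotsAllowed(undefined, "https://example.com/");                      // true: no robots.txt loaded
isRobotsAllowed({ isAllowed: () => undefined }, "https://example.com/"); // true: no explicit rule
isRobotsAllowed({ isAllowed: () => false }, "https://example.com/");     // false: explicitly disallowed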
@@ -294,12 +294,15 @@ export class WebCrawler {
         return links;
       }
       // Create a new list to return to avoid modifying the visited list
-      return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
+      return links.filter((link) => !this.visited.has(link.url));
     } catch (error) {
       return [];
     }
   }
 
+  private isRobotsAllowed(url: string): boolean {
+    return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
+  }
   private normalizeCrawlUrl(url: string): string {
     try{
       const urlObj = new URL(url);
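normalizeCrawlUrl itself is unchanged here and its body is cut off at the hunk boundary; only the opening try { const urlObj = new URL(url); is visible. For orientation, a normalizer of that shape typically canonicalizes the query string and falls back to the raw input. Illustrative sketch only, not the method's actual body:

function normalizeCrawlUrl(url: string): string {
  try {
    const urlObj = new URL(url);
    urlObj.searchParams.sort(); // put query parameters in a stable order
    return urlObj.toString();
  } catch {
    return url; // leave unparseable input untouched
  }
}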
@@ -326,12 +329,10 @@ export class WebCrawler {
 
   private isInternalLink(link: string): boolean {
     const urlObj = new URL(link, this.baseUrl);
-    const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, "");
-    return urlObj.hostname === domainWithoutProtocol;
-  }
+    const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
+    const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
 
-  private matchesPattern(link: string): boolean {
-    return true; // Placeholder for future pattern matching implementation
+    return linkDomain === baseDomain;
   }
 
   private isFile(url: string): boolean {
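The rewritten isInternalLink compares hostnames with the www. prefix stripped on both sides instead of matching the link's hostname against the base URL verbatim. A standalone equivalent of the new comparison (assuming baseUrl carries no path, as in these examples):

function isInternalLink(link: string, baseUrl: string): boolean {
  const urlObj = new URL(link, baseUrl);
  const baseDomain = baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
  const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
  return linkDomain === baseDomain;
}

isInternalLink("https://www.example.com/blog", "https://example.com"); // true: www. is ignored
isInternalLink("https://docs.example.com/", "https://example.com");    // false: different subdomain
isInternalLink("/pricing", "https://www.example.com");                 // true: relative links resolve against baseUrl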