2024-04-15 17:01:47 -04:00
|
|
|
import axios from "axios";
|
|
|
|
import cheerio, { load } from "cheerio";
|
|
|
|
import { URL } from "url";
|
|
|
|
import { getLinksFromSitemap } from "./sitemap";
|
|
|
|
import async from "async";
|
|
|
|
import { Progress } from "../../lib/entities";
|
2024-05-13 23:45:11 -04:00
|
|
|
import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
|
2024-04-15 17:01:47 -04:00
|
|
|
import robotsParser from "robots-parser";
|
|
|
|
|
|
|
|
/**
 * Discovers the URLs of a website, either from its sitemap or by following
 * `<a href>` links starting at an initial URL.
 *
 * Discovery honours robots.txt for the site's origin, optional include and
 * exclude regular expressions (matched against the URL path), a maximum path
 * depth and a maximum number of links.
 */
export class WebCrawler {
  /** User agent name used when evaluating robots.txt rules. */
  private static readonly USER_AGENT = "FireCrawlAgent";

  /** Lower-case extensions of resources that are never crawled as pages. */
  private static readonly FILE_EXTENSIONS: readonly string[] = [
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".css",
    ".js",
    ".ico",
    ".svg",
    // ".pdf",
    ".zip",
    ".exe",
    ".dmg",
    ".mp4",
    ".mp3",
    ".pptx",
    ".docx",
    ".xlsx",
    ".xml",
  ];

  /** Substrings that mark a URL as a social-media or e-mail link. */
  private static readonly SOCIAL_MEDIA_OR_EMAIL: readonly string[] = [
    "facebook.com",
    "twitter.com",
    "linkedin.com",
    "instagram.com",
    "pinterest.com",
    "mailto:",
  ];

  private initialUrl: string;
  /** Origin (scheme + host [+ port]) of `initialUrl`. */
  private baseUrl: string;
  private includes: string[];
  private excludes: string[];
  private maxCrawledLinks: number;
  private maxCrawledDepth: number;
  /** URLs already handed to `crawl`, exactly as they were passed in. */
  private visited: Set<string> = new Set();
  /** Discovered URL -> HTML recorded for it by `crawl`. */
  private crawledUrls: Map<string, string> = new Map();
  private limit: number;
  private robotsTxtUrl: string;
  /** Parsed robots.txt; starts out empty and is replaced in `start`. */
  private robots: { isAllowed(url: string, ua?: string): boolean | undefined };
  /** Stored for callers; not read anywhere inside this class. */
  private generateImgAltText: boolean;

  /**
   * @param initialUrl Absolute URL where discovery starts; must be parseable by `URL`.
   * @param includes Regular-expression sources; when set, a path must match at least one.
   * @param excludes Regular-expression sources; a path matching any of them is dropped.
   * @param maxCrawledLinks Deprecated alias for `limit`.
   * @param limit Maximum number of links (default 10000).
   * @param generateImgAltText Stored flag (default false).
   * @param maxCrawledDepth Maximum path depth for crawled links (default 10).
   * @throws TypeError if `initialUrl` is not a valid absolute URL.
   */
  constructor({
    initialUrl,
    includes,
    excludes,
    maxCrawledLinks,
    limit = 10000,
    generateImgAltText = false,
    maxCrawledDepth = 10,
  }: {
    initialUrl: string;
    includes?: string[];
    excludes?: string[];
    maxCrawledLinks?: number;
    limit?: number;
    generateImgAltText?: boolean;
    maxCrawledDepth?: number;
  }) {
    this.initialUrl = initialUrl;
    this.baseUrl = new URL(initialUrl).origin;
    this.includes = includes ?? [];
    this.excludes = excludes ?? [];
    this.limit = limit;
    this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
    // Start with an empty rule set; `start` replaces it with the real file.
    this.robots = robotsParser(this.robotsTxtUrl, "");
    // Deprecated, use limit instead
    this.maxCrawledLinks = maxCrawledLinks ?? limit;
    // `??` also covers an explicit `null`, which parameter defaults do not.
    this.maxCrawledDepth = maxCrawledDepth ?? 10;
    this.generateImgAltText = generateImgAltText ?? false;
  }

  /**
   * Keeps the links that are within `maxDepth`, pass the include/exclude
   * patterns, live on the initial URL's host (ignoring a leading `www.`) and
   * under its path, and are not disallowed by robots.txt.
   *
   * Links that cannot be parsed as absolute URLs are skipped instead of
   * aborting the whole filter.
   *
   * @param sitemapLinks Candidate absolute URLs.
   * @param limit Maximum number of links returned.
   * @param maxDepth Maximum number of `/` separators allowed in the path.
   * @returns At most `limit` accepted links, in input order.
   */
  private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
    // The initial URL is fixed, so parse and normalize it once for all links.
    const initial = new URL(this.initialUrl);
    const initialHostname = initial.hostname.replace(/^www\./, "");

    return sitemapLinks
      .filter((link) => {
        let parsed: URL;
        try {
          parsed = new URL(link);
        } catch {
          // A single malformed entry must not discard every other link.
          return false;
        }

        const path = parsed.pathname;

        // Check if the link exceeds the maximum depth allowed
        const depth = path.split("/").length - 1;
        if (depth > maxDepth) {
          return false;
        }

        // Apply the exclude patterns first, then the include patterns.
        if (this.matchesExcludes(path) || !this.matchesIncludes(path)) {
          return false;
        }

        // Hosts are compared without "www." so both variants count as the same site.
        const linkHostname = parsed.hostname.replace(/^www\./, "");
        if (linkHostname !== initialHostname || !path.startsWith(initial.pathname)) {
          return false;
        }

        // An undefined answer from the robots parser is treated as "allowed".
        const isAllowed = this.robots.isAllowed(link, WebCrawler.USER_AGENT) ?? true;
        if (!isAllowed) {
          console.log(`Link disallowed by robots.txt: ${link}`);
          return false;
        }

        return true;
      })
      .slice(0, limit);
  }

  /**
   * Runs discovery: loads robots.txt, then uses the sitemap when one yields
   * links and otherwise crawls from the initial URL.
   *
   * @param inProgress Optional progress callback invoked while crawling.
   * @param concurrencyLimit Concurrency passed to each crawl queue (default 5).
   * @param limit Maximum number of results (default 10000).
   * @param maxDepth Depth limit applied to sitemap links (default 10).
   *   NOTE(review): crawled links are filtered with the constructor's
   *   `maxCrawledDepth` instead — confirm this asymmetry is intended.
   * @returns Accepted URLs; `html` is empty for sitemap results.
   */
  public async start(
    inProgress?: (progress: Progress) => void,
    concurrencyLimit: number = 5,
    limit: number = 10000,
    maxDepth: number = 10
  ): Promise<{ url: string, html: string }[]> {
    // Fetch and parse robots.txt; on failure the empty rule set stays in place.
    try {
      const response = await axios.get(this.robotsTxtUrl);
      this.robots = robotsParser(this.robotsTxtUrl, response.data);
    } catch (error) {
      console.error(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
    }

    const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
    if (sitemapLinks.length > 0) {
      return this.filterLinks(sitemapLinks, limit, maxDepth).map((link) => ({
        url: link,
        html: "",
      }));
    }

    const urls = await this.crawlUrls([this.initialUrl], concurrencyLimit, inProgress);

    // Nothing was discovered: fall back to the initial URL if it passes the filters.
    if (
      urls.length === 0 &&
      this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
    ) {
      return [{ url: this.initialUrl, html: "" }];
    }

    // Index the HTML by URL once instead of scanning `urls` for every result.
    const htmlByUrl = new Map<string, string>();
    for (const page of urls) {
      if (!htmlByUrl.has(page.url)) {
        htmlByUrl.set(page.url, page.html);
      }
    }

    // make sure to run include exclude here again
    const filteredUrls = this.filterLinks(
      urls.map((page) => page.url),
      limit,
      this.maxCrawledDepth
    );
    return filteredUrls.map((url) => ({ url, html: htmlByUrl.get(url) || "" }));
  }

  /**
   * Crawls the given URLs through a queue and recurses into every page they
   * yield, until `maxCrawledLinks` entries have been recorded.
   *
   * Each recursion level creates its own queue, so `concurrencyLimit` bounds
   * one queue rather than the crawl as a whole.
   *
   * @param urls URLs to crawl; visited or robots-disallowed ones are skipped.
   * @param concurrencyLimit Concurrency of the queue created by this call.
   * @param inProgress Optional progress callback, called once per processed task.
   * @returns Every URL recorded so far by this instance, with its stored HTML.
   */
  private async crawlUrls(
    urls: string[],
    concurrencyLimit: number,
    inProgress?: (progress: Progress) => void,
  ): Promise<{ url: string, html: string }[]> {
    const queue = async.queue(async (task: string, callback) => {
      try {
        if (this.crawledUrls.size >= this.maxCrawledLinks) {
          return;
        }

        const newUrls = await this.crawl(task);
        newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));

        if (inProgress) {
          inProgress({
            current: this.crawledUrls.size,
            total: this.maxCrawledLinks,
            status: "SCRAPING",
            // Report the most recently found page, or the task itself if none was found.
            currentDocumentUrl:
              newUrls.length > 0 ? newUrls[newUrls.length - 1].url : task,
          });
        }

        await this.crawlUrls(newUrls.map((page) => page.url), concurrencyLimit, inProgress);
      } finally {
        // Always release the queue slot, even when the task threw; otherwise a
        // callback-style queue would never drain.
        if (typeof callback === "function") {
          callback();
        }
      }
    }, concurrencyLimit);

    queue.push(
      urls.filter(
        (url) =>
          !this.visited.has(url) && this.robots.isAllowed(url, WebCrawler.USER_AGENT)
      ),
      (err) => {
        if (err) console.error(err);
      }
    );
    await queue.drain();
    return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
  }

  /**
   * Fetches one page and returns the internal links found on it.
   *
   * The first page crawled by this instance is fetched with `scrapSingleUrl`
   * and is itself included in the result; later pages are fetched with axios.
   * Any failure while fetching or parsing yields an empty list.
   *
   * @param url Page to crawl; "https://" is prepended when no scheme is present.
   * @returns Accepted links, each paired with the HTML of the page it was found on.
   */
  async crawl(url: string): Promise<{url: string, html: string}[]> {
    if (this.visited.has(url) || !this.robots.isAllowed(url, WebCrawler.USER_AGENT)) {
      return [];
    }
    this.visited.add(url);
    // Decide this once, before any await can let other crawls grow `visited`.
    const isFirstPage = this.visited.size === 1;

    if (!url.startsWith("http")) {
      url = "https://" + url;
    }
    if (url.endsWith("/")) {
      url = url.slice(0, -1);
    }

    if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
      return [];
    }

    try {
      let content: string = "";
      // If it is the first link, fetch with single url
      if (isFirstPage) {
        const page = await scrapSingleUrl(url, { includeHtml: true });
        content = page.html ?? "";
      } else {
        const response = await axios.get(url);
        content = response.data ?? "";
      }

      const $ = load(content);
      const links: { url: string, html: string }[] = [];

      // Add the initial URL to the list of links
      if (isFirstPage) {
        links.push({ url, html: content });
      }

      $("a").each((_, element) => {
        const href = $(element).attr("href");
        if (!href) {
          return;
        }

        let fullUrl = href;
        let path: string;
        try {
          if (!href.startsWith("http")) {
            // NOTE(review): relative hrefs resolve against the site origin,
            // not the current page — confirm this is intended.
            fullUrl = new URL(href, this.baseUrl).toString();
          }
          path = new URL(fullUrl).pathname;
        } catch {
          // Skip an unparsable href instead of losing every link on the page.
          return;
        }

        if (
          this.isInternalLink(fullUrl) &&
          this.matchesPattern(fullUrl) &&
          this.noSections(fullUrl) &&
          this.matchesIncludes(path) &&
          !this.matchesExcludes(path) &&
          this.robots.isAllowed(fullUrl, WebCrawler.USER_AGENT)
        ) {
          // NOTE(review): `html` is the HTML of the page containing the link,
          // not of the link target — confirm callers expect that.
          links.push({ url: fullUrl, html: content });
        }
      });

      if (isFirstPage) {
        return links;
      }
      // Create a new list to return to avoid modifying the visited list
      return links.filter((link) => !this.visited.has(link.url));
    } catch (error) {
      return [];
    }
  }

  /**
   * @param url Text to test (callers pass a URL path).
   * @returns True when no include pattern is configured or at least one matches.
   */
  private matchesIncludes(url: string): boolean {
    // A single empty first entry is treated the same as "no patterns".
    if (this.includes.length === 0 || this.includes[0] === "") return true;
    return this.includes.some((pattern) => new RegExp(pattern).test(url));
  }

  /**
   * @param url Text to test (callers pass a URL path).
   * @returns True when at least one configured exclude pattern matches.
   */
  private matchesExcludes(url: string): boolean {
    // A single empty first entry is treated the same as "no patterns".
    if (this.excludes.length === 0 || this.excludes[0] === "") return false;
    return this.excludes.some((pattern) => new RegExp(pattern).test(url));
  }

  /** @returns True when the link contains no `#` fragment marker. */
  private noSections(link: string): boolean {
    return !link.includes("#");
  }

  /**
   * @param link Absolute URL, or a relative one resolved against the site origin.
   * @returns True when the link's host (including any non-default port)
   *   equals the host of the site origin.
   * @throws TypeError if `link` cannot be parsed.
   */
  private isInternalLink(link: string): boolean {
    const target = new URL(link, this.baseUrl);
    // Compare `host` on both sides: the origin keeps a non-default port,
    // which `hostname` would drop and therefore never match.
    return target.host === new URL(this.baseUrl).host;
  }

  /** Accepts every link. */
  private matchesPattern(link: string): boolean {
    return true; // Placeholder for future pattern matching implementation
  }

  /**
   * @param url URL to inspect.
   * @returns True when the URL, ignoring any query string or fragment and
   *   letter case, ends with one of the non-page file extensions.
   */
  private isFile(url: string): boolean {
    // Drop "?query" / "#fragment" so "logo.png?v=2" is still seen as a file.
    const withoutQuery = url.replace(/[?#][\s\S]*$/, "").toLowerCase();
    return WebCrawler.FILE_EXTENSIONS.some((ext) => withoutQuery.endsWith(ext));
  }

  /** @returns True when the URL contains a known social-media domain or `mailto:`. */
  private isSocialMediaOrEmail(url: string): boolean {
    return WebCrawler.SOCIAL_MEDIA_OR_EMAIL.some((marker) => url.includes(marker));
  }

  /**
   * Collects links from `<url>/sitemap.xml`, falling back to the sitemap at
   * the site origin when the first attempt yields nothing.
   *
   * When a sitemap was found but does not list `url` itself (compared
   * without scheme, leading `www.` and trailing slash), `url` is appended.
   *
   * @param url Starting URL, or a URL already ending in `/sitemap.xml`.
   * @returns Sitemap links, or an empty list when no sitemap could be read.
   */
  private async tryFetchSitemapLinks(url: string): Promise<string[]> {
    const normalizeUrl = (raw: string): string => {
      const stripped = raw.replace(/^https?:\/\//, "").replace(/^www\./, "");
      return stripped.endsWith("/") ? stripped.slice(0, -1) : stripped;
    };

    // Strip trailing slashes first so the result never contains "//sitemap.xml".
    const sitemapUrl = url.endsWith("/sitemap.xml")
      ? url
      : `${url.replace(/\/+$/, "")}/sitemap.xml`;

    let sitemapLinks = await this.fetchSitemapLinks(sitemapUrl);

    if (sitemapLinks.length === 0) {
      // If the first one doesn't work, try the base URL
      const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
      // Skip the retry when it would request the exact same URL again.
      if (baseUrlSitemap !== sitemapUrl) {
        sitemapLinks = await this.fetchSitemapLinks(baseUrlSitemap);
      }
    }

    // Normalize and check if the URL is present in any of the sitemaps
    const normalizedUrl = normalizeUrl(url);
    const normalizedSitemapLinks = sitemapLinks.map((link) => normalizeUrl(link));

    // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
    if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {
      // do not push the normalized url
      sitemapLinks.push(url);
    }

    return sitemapLinks;
  }

  /** Returns the links of one sitemap, or an empty list if it cannot be read. */
  private async fetchSitemapLinks(sitemapUrl: string): Promise<string[]> {
    try {
      const response = await axios.get(sitemapUrl);
      if (response.status === 200) {
        return await getLinksFromSitemap(sitemapUrl);
      }
    } catch (error) {
      // Deliberately silent: a missing or broken sitemap just means "no links".
    }
    return [];
  }
}
|