// v-firecrawl/apps/api/src/scraper/WebScraper/crawler.ts
import axios from "axios";
import cheerio, { load } from "cheerio";
import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
import async from "async";
import { Progress } from "../../lib/entities";
import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
import robotsParser from "robots-parser";
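
/**
 * WebCrawler discovers pages starting from an initial URL. It prefers the
 * site's sitemap.xml when one is available and otherwise follows internal
 * links through a concurrency-limited queue, filtering candidates by
 * include/exclude patterns, crawl depth, robots.txt rules, and a hard limit.
 */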
export class WebCrawler {
private initialUrl: string;
private baseUrl: string;
private includes: string[];
private excludes: string[];
private maxCrawledLinks: number;
  private maxCrawledDepth: number;
  private visited: Set<string> = new Set();
  private crawledUrls: Map<string, string> = new Map();
private limit: number;
private robotsTxtUrl: string;
private robots: any;
  private generateImgAltText: boolean;
constructor({
initialUrl,
includes,
excludes,
maxCrawledLinks,
limit = 10000,
    generateImgAltText = false,
    maxCrawledDepth = 10,
}: {
initialUrl: string;
includes?: string[];
excludes?: string[];
maxCrawledLinks?: number;
limit?: number;
    generateImgAltText?: boolean;
    maxCrawledDepth?: number;
}) {
this.initialUrl = initialUrl;
this.baseUrl = new URL(initialUrl).origin;
this.includes = includes ?? [];
this.excludes = excludes ?? [];
this.limit = limit;
this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
this.robots = robotsParser(this.robotsTxtUrl, "");
// Deprecated, use limit instead
this.maxCrawledLinks = maxCrawledLinks ?? limit;
    this.maxCrawledDepth = maxCrawledDepth ?? 10;
    this.generateImgAltText = generateImgAltText ?? false;
}
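
  /**
   * Filters candidate links by depth, exclude/include regex patterns,
   * robots.txt rules, and scope relative to the initial URL, then truncates
   * the result to `limit` entries.
   */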
  private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
return sitemapLinks
.filter((link) => {
const url = new URL(link);
const path = url.pathname;
        const depth = url.pathname.split('/').length - 1;
        // Check if the link exceeds the maximum depth allowed
        if (depth > maxDepth) {
          return false;
        }
// Check if the link should be excluded
if (this.excludes.length > 0 && this.excludes[0] !== "") {
if (
this.excludes.some((excludePattern) =>
new RegExp(excludePattern).test(path)
)
) {
return false;
}
}
// Check if the link matches the include patterns, if any are specified
if (this.includes.length > 0 && this.includes[0] !== "") {
return this.includes.some((includePattern) =>
new RegExp(includePattern).test(path)
);
}
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
// Check if the link is disallowed by robots.txt
if (!isAllowed) {
console.log(`Link disallowed by robots.txt: ${link}`);
return false;
}
        // Restrict crawling to links that fall under the initial URL
        if (!link.startsWith(this.initialUrl)) {
          return false;
        }
return true;
})
.slice(0, limit);
}
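
  /**
   * Entry point for a crawl. Fetches robots.txt, then tries the sitemap:
   * if sitemap links are found they are filtered and returned with empty HTML.
   * Otherwise the site is crawled from the initial URL and the collected
   * pages are filtered again before being returned as { url, html } pairs.
   */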
public async start(
inProgress?: (progress: Progress) => void,
concurrencyLimit: number = 5,
    limit: number = 10000,
    maxDepth: number = 10
  ): Promise<{ url: string, html: string }[]> {
// Fetch and parse robots.txt
try {
const response = await axios.get(this.robotsTxtUrl);
this.robots = robotsParser(this.robotsTxtUrl, response.data);
} catch (error) {
console.error(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
}
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (sitemapLinks.length > 0) {
      let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
      return filteredLinks.map(link => ({ url: link, html: "" }));
}
const urls = await this.crawlUrls(
[this.initialUrl],
concurrencyLimit,
inProgress
);
if (
urls.length === 0 &&
      this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
    ) {
      return [{ url: this.initialUrl, html: "" }];
    }

    // Re-apply the include/exclude and depth filters to the crawled results
    const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
    return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
}
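
  /**
   * Pushes URLs through an async queue with the given concurrency, recursing
   * into newly discovered links until maxCrawledLinks is reached. Results are
   * accumulated in this.crawledUrls (url -> html).
   */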
private async crawlUrls(
urls: string[],
concurrencyLimit: number,
inProgress?: (progress: Progress) => void
  ): Promise<{ url: string, html: string }[]> {
    const queue = async.queue(async (task: string, callback) => {
      if (this.crawledUrls.size >= this.maxCrawledLinks) {
if (callback && typeof callback === "function") {
callback();
}
return;
}
const newUrls = await this.crawl(task);
      newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
if (inProgress && newUrls.length > 0) {
inProgress({
          current: this.crawledUrls.size,
total: this.maxCrawledLinks,
status: "SCRAPING",
          currentDocumentUrl: newUrls[newUrls.length - 1].url,
});
} else if (inProgress) {
inProgress({
          current: this.crawledUrls.size,
total: this.maxCrawledLinks,
status: "SCRAPING",
currentDocumentUrl: task,
});
}
      await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
if (callback && typeof callback === "function") {
callback();
}
}, concurrencyLimit);
queue.push(
urls.filter(
(url) =>
!this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent")
),
(err) => {
if (err) console.error(err);
}
);
await queue.drain();
    return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
}
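
  /**
   * Fetches a single page (the first page via scrapSingleUrl, later pages via
   * a plain GET), extracts its anchor tags, and returns the internal links
   * that pass the include/exclude, section, and robots.txt checks, each paired
   * with the HTML of the page they were found on.
   */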
async crawl(url: string): Promise<{url: string, html: string}[]> {
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent"))
return [];
this.visited.add(url);
if (!url.startsWith("http")) {
url = "https://" + url;
}
if (url.endsWith("/")) {
url = url.slice(0, -1);
}
if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
return [];
}
try {
      let content: string = "";
      // For the first page, use the single-URL scraper to get its HTML; later pages use a plain GET
      if (this.visited.size === 1) {
const page = await scrapSingleUrl(url, {includeHtml: true});
        content = page.html ?? "";
} else {
const response = await axios.get(url);
        content = response.data ?? "";
}
const $ = load(content);
      let links: {url: string, html: string}[] = [];
$("a").each((_, element) => {
const href = $(element).attr("href");
if (href) {
let fullUrl = href;
if (!href.startsWith("http")) {
fullUrl = new URL(href, this.baseUrl).toString();
}
const url = new URL(fullUrl);
const path = url.pathname;
if (
this.isInternalLink(fullUrl) &&
this.matchesPattern(fullUrl) &&
this.noSections(fullUrl) &&
this.matchesIncludes(path) &&
!this.matchesExcludes(path) &&
this.robots.isAllowed(fullUrl, "FireCrawlAgent")
) {
            links.push({url: fullUrl, html: content});
}
}
});
      // Return only the links that have not been visited yet
      return links.filter((link) => !this.visited.has(link.url));
} catch (error) {
return [];
}
}
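
  /** Returns true when the path matches at least one include pattern, or when no include patterns are set. */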
private matchesIncludes(url: string): boolean {
if (this.includes.length === 0 || this.includes[0] == "") return true;
return this.includes.some((pattern) => new RegExp(pattern).test(url));
}
private matchesExcludes(url: string): boolean {
if (this.excludes.length === 0 || this.excludes[0] == "") return false;
return this.excludes.some((pattern) => new RegExp(pattern).test(url));
}
private noSections(link: string): boolean {
return !link.includes("#");
}
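
  /** Treats a link as internal when its hostname matches the host of the base URL. */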
private isInternalLink(link: string): boolean {
const urlObj = new URL(link, this.baseUrl);
const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, "");
return urlObj.hostname === domainWithoutProtocol;
}
private matchesPattern(link: string): boolean {
return true; // Placeholder for future pattern matching implementation
}
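
  /** Skips URLs that point at static assets or binary downloads, by file extension. */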
private isFile(url: string): boolean {
const fileExtensions = [
".png",
".jpg",
".jpeg",
".gif",
".css",
".js",
".ico",
".svg",
      // ".pdf",
".zip",
".exe",
".dmg",
".mp4",
".mp3",
".pptx",
".docx",
".xlsx",
".xml",
];
return fileExtensions.some((ext) => url.endsWith(ext));
}
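
  /** Skips links to social media domains and mailto: addresses. */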
private isSocialMediaOrEmail(url: string): boolean {
const socialMediaOrEmail = [
"facebook.com",
"twitter.com",
"linkedin.com",
"instagram.com",
"pinterest.com",
"mailto:",
];
return socialMediaOrEmail.some((ext) => url.includes(ext));
}
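
  /**
   * Looks for a sitemap.xml under the given URL and, if the request succeeds,
   * returns the links it contains; returns an empty array otherwise.
   */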
private async tryFetchSitemapLinks(url: string): Promise<string[]> {
    const sitemapUrl = url.endsWith("/sitemap.xml")
      ? url
      : `${url.replace(/\/+$/, "")}/sitemap.xml`;
try {
const response = await axios.get(sitemapUrl);
if (response.status === 200) {
return await getLinksFromSitemap(sitemapUrl);
}
} catch (error) {
      // Sitemap not found or unreachable; fall back to crawling from the initial URL
}
return [];
}
}
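
/*
 * Example usage (an illustrative sketch only; the URL, patterns, and option
 * values below are placeholders rather than defaults taken from this file):
 *
 *   const crawler = new WebCrawler({
 *     initialUrl: "https://example.com/blog",
 *     includes: ["^/blog"],
 *     excludes: ["^/blog/private"],
 *     limit: 100,
 *     maxCrawledDepth: 3,
 *   });
 *
 *   // start(inProgress?, concurrencyLimit, limit, maxDepth); call from an async context
 *   const pages = await crawler.start(
 *     (progress) => console.log(progress.current, "/", progress.total),
 *     5,
 *     100,
 *     3
 *   );
 *   // pages is a { url: string, html: string }[]
 */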