Nicolas 2024-06-10 16:27:10 -07:00
parent aafd23fa8a
commit 3091f0134c
3 changed files with 13 additions and 5 deletions


@@ -3,7 +3,7 @@ import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import async from "async";
-import { Progress } from "../../lib/entities";
+import { PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
@@ -108,6 +108,7 @@ export class WebCrawler {
   public async start(
     inProgress?: (progress: Progress) => void,
+    pageOptions?: PageOptions,
     concurrencyLimit: number = 5,
     limit: number = 10000,
     maxDepth: number = 10
@@ -130,6 +131,7 @@ export class WebCrawler {
     const urls = await this.crawlUrls(
       [this.initialUrl],
+      pageOptions,
       concurrencyLimit,
       inProgress
     );
@@ -148,6 +150,7 @@ export class WebCrawler {
   private async crawlUrls(
     urls: string[],
+    pageOptions: PageOptions,
     concurrencyLimit: number,
     inProgress?: (progress: Progress) => void,
   ): Promise<{ url: string, html: string }[]> {
@@ -158,7 +161,7 @@ export class WebCrawler {
         }
         return;
       }
-      const newUrls = await this.crawl(task);
+      const newUrls = await this.crawl(task, pageOptions);
       // add the initial url if not already added
       // if (this.visited.size === 1) {
       //   let normalizedInitial = this.initialUrl;
@@ -188,7 +191,7 @@ export class WebCrawler {
           currentDocumentUrl: task,
         });
       }
-      await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
+      await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
       if (callback && typeof callback === "function") {
        callback();
      }
@@ -207,7 +210,7 @@ export class WebCrawler {
     return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
   }
 
-  async crawl(url: string): Promise<{url: string, html: string}[]> {
+  async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
     if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
       return [];
     }
@@ -231,7 +234,8 @@ export class WebCrawler {
     let content : string = "";
     // If it is the first link, fetch with single url
     if (this.visited.size === 1) {
-      const page = await scrapSingleUrl(url, {includeHtml: true});
+      console.log(pageOptions)
+      const page = await scrapSingleUrl(url, {...pageOptions, includeHtml: true});
       content = page.html ?? ""
     } else {
       const response = await axios.get(url);
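
The change above threads a single pageOptions value from start() through crawlUrls() into crawl(), where it is merged into the scrapSingleUrl call. Below is a minimal, self-contained sketch of that threading pattern; the PageOptions fields, the stub scraper, and the example URL are assumptions, only the parameter flow mirrors the diff.

// Illustrative sketch (not the repository code) of how pageOptions now rides
// along every hop of the crawl; stand-ins are marked as such.
type PageOptions = { includeHtml?: boolean; onlyMainContent?: boolean }; // assumed fields

async function scrapSingleUrlStub(url: string, opts: PageOptions): Promise<{ html?: string }> {
  // Stand-in for scrapSingleUrl: echoes which options reached the scraper.
  return { html: `<!-- ${url} scraped with ${JSON.stringify(opts)} -->` };
}

class CrawlerSketch {
  private visited = new Set<string>();

  public async start(pageOptions?: PageOptions): Promise<{ url: string; html: string }[]> {
    // Options enter once at the top and are forwarded on every recursive call.
    return this.crawlUrls(["https://example.com"], pageOptions);
  }

  private async crawlUrls(urls: string[], pageOptions?: PageOptions): Promise<{ url: string; html: string }[]> {
    const results: { url: string; html: string }[] = [];
    for (const url of urls) {
      results.push(...(await this.crawl(url, pageOptions))); // forwarded, as in the diff
    }
    return results;
  }

  private async crawl(url: string, pageOptions?: PageOptions): Promise<{ url: string; html: string }[]> {
    if (this.visited.has(url)) return [];
    this.visited.add(url);
    // Caller options are merged; includeHtml stays forced on, mirroring the change.
    const page = await scrapSingleUrlStub(url, { ...pageOptions, includeHtml: true });
    return [{ url, html: page.html ?? "" }];
  }
}

// Usage: options set by the caller reach the scrape of every visited page.
void new CrawlerSketch().start({ onlyMainContent: true }).then((pages) => console.log(pages));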


@@ -173,6 +173,7 @@ export class WebScraperDataProvider {
     let links = await crawler.start(
       inProgress,
+      this.pageOptions,
       5,
       this.limit,
       this.maxCrawledDepth
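
On the caller side, pageOptions now sits between the progress callback and the concurrency limit, so positional callers need the extra second argument. A hedged sketch of the updated signature and call shape; only the parameter order and defaults follow the diff, the types, body, and option values are stand-ins.

// Stub mirroring the updated start() signature so the call shape can be shown in isolation.
type PageOptions = { includeHtml?: boolean; onlyMainContent?: boolean }; // assumed fields
type Progress = { current: number; total: number; status: string; currentDocumentUrl?: string };

async function start(
  inProgress?: (progress: Progress) => void,
  pageOptions?: PageOptions,      // new second parameter introduced by this commit
  concurrencyLimit: number = 5,
  limit: number = 10000,
  maxDepth: number = 10
): Promise<{ url: string; html: string }[]> {
  console.log({ pageOptions, concurrencyLimit, limit, maxDepth }); // stand-in body
  return [];
}

// Existing positional call sites gain one argument in second position.
void start(undefined, { onlyMainContent: true }, 5, 10000, 10);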


@@ -12,6 +12,8 @@ export async function getLinksFromSitemap(
     content = response.data;
   } catch (error) {
     console.error(`Request failed for ${sitemapUrl}: ${error}`);
+    console.log(allUrls)
     return allUrls;
   }
@@ -34,6 +36,7 @@ export async function getLinksFromSitemap(
   } catch (error) {
     console.error(`Error processing ${sitemapUrl}: ${error}`);
   }
+  console.log(allUrls)
  return allUrls;
}
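
The two added logs in getLinksFromSitemap sit inside its accumulate-and-return error handling: on a failed request the URLs gathered so far are logged and returned rather than the error being rethrown. A simplified, self-contained sketch of that pattern follows; the naive <loc> regex stands in for the real sitemap parsing, and the function name marks it as a sketch.

// Simplified sketch of the accumulate-and-return pattern around the new logs.
import axios from "axios";

export async function getLinksFromSitemapSketch(sitemapUrl: string): Promise<string[]> {
  const allUrls: string[] = [];
  let content = "";
  try {
    const response = await axios.get(sitemapUrl);
    content = String(response.data); // coerce for the naive regex scan below
  } catch (error) {
    console.error(`Request failed for ${sitemapUrl}: ${error}`);
    console.log(allUrls); // debug log added in this commit: usually []
    return allUrls;       // fail soft: return whatever was gathered
  }
  try {
    for (const match of content.matchAll(/<loc>\s*(.*?)\s*<\/loc>/g)) {
      allUrls.push(match[1]); // stand-in for the real XML parsing
    }
  } catch (error) {
    console.error(`Error processing ${sitemapUrl}: ${error}`);
  }
  console.log(allUrls);     // debug log added in this commit
  return allUrls;
}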