Nick:

parent aafd23fa8a
commit 3091f0134c
@@ -3,7 +3,7 @@ import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import async from "async";
-import { Progress } from "../../lib/entities";
+import { PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
 
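The import hunk above is the only change needed at the top of the file: PageOptions now comes in from ../../lib/entities alongside Progress. The diff does not show its definition; purely as an illustrative assumption, a shape compatible with the { ...pageOptions, includeHtml: true } spread used later in crawl() might look like this:

// Hypothetical sketch only; the real PageOptions lives in ../../lib/entities
// and may carry more (or differently named) fields than shown here.
type PageOptions = {
  onlyMainContent?: boolean; // e.g. drop nav/footer noise from the scraped page
  includeHtml?: boolean;     // ask the scraper to return raw HTML as well
};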
@@ -108,6 +108,7 @@ export class WebCrawler {
 
   public async start(
     inProgress?: (progress: Progress) => void,
+    pageOptions?: PageOptions,
     concurrencyLimit: number = 5,
     limit: number = 10000,
     maxDepth: number = 10
@@ -130,6 +131,7 @@ export class WebCrawler {
 
     const urls = await this.crawlUrls(
       [this.initialUrl],
+      pageOptions,
       concurrencyLimit,
       inProgress
     );
@@ -148,6 +150,7 @@ export class WebCrawler {
 
   private async crawlUrls(
     urls: string[],
+    pageOptions: PageOptions,
     concurrencyLimit: number,
     inProgress?: (progress: Progress) => void,
   ): Promise<{ url: string, html: string }[]> {
@@ -158,7 +161,7 @@ export class WebCrawler {
         }
         return;
       }
-      const newUrls = await this.crawl(task);
+      const newUrls = await this.crawl(task, pageOptions);
       // add the initial url if not already added
       // if (this.visited.size === 1) {
       //   let normalizedInitial = this.initialUrl;
@@ -188,7 +191,7 @@ export class WebCrawler {
           currentDocumentUrl: task,
         });
       }
-      await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
+      await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
       if (callback && typeof callback === "function") {
         callback();
       }
@@ -207,7 +210,7 @@ export class WebCrawler {
     return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
   }
 
-  async crawl(url: string): Promise<{url: string, html: string}[]> {
+  async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
     if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
       return [];
     }
@@ -231,7 +234,8 @@ export class WebCrawler {
       let content : string = "";
       // If it is the first link, fetch with single url
       if (this.visited.size === 1) {
-        const page = await scrapSingleUrl(url, {includeHtml: true});
+        console.log(pageOptions)
+        const page = await scrapSingleUrl(url, {...pageOptions, includeHtml: true});
         content = page.html ?? ""
       } else {
         const response = await axios.get(url);
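Taken together, the WebCrawler hunks above thread a single pageOptions value from start() through crawlUrls() into crawl(), where it is spread into the scrapSingleUrl call for the first page. A minimal sketch of that flow, with everything unrelated to pageOptions stripped out and the types assumed rather than taken from the repository:

// Condensed sketch of the pageOptions flow after this commit. This is not the
// real WebCrawler: queueing, robots checks, sitemap handling and depth limits
// are omitted, and the types below are local stand-ins.
type Progress = { current: number; total: number; status: string; currentDocumentUrl: string };
type PageOptions = { includeHtml?: boolean; onlyMainContent?: boolean };
type PageResult = { url: string; html: string };

// Stand-in for scrapSingleUrl from ./single_url.
async function scrapSingleUrl(url: string, opts: PageOptions): Promise<{ html?: string }> {
  return { html: `<html><!-- ${url} scraped with ${JSON.stringify(opts)} --></html>` };
}

class CrawlerSketch {
  private initialUrl = "https://example.com";
  private visited = new Set<string>();

  // start() gained an optional pageOptions parameter and forwards it untouched.
  public async start(
    inProgress?: (p: Progress) => void,
    pageOptions?: PageOptions,
    concurrencyLimit: number = 5
  ): Promise<PageResult[]> {
    return this.crawlUrls([this.initialUrl], pageOptions ?? {}, concurrencyLimit, inProgress);
  }

  // crawlUrls() takes pageOptions and hands it to crawl() for every task
  // (the real code also recurses into newly discovered links with the same options).
  private async crawlUrls(
    urls: string[],
    pageOptions: PageOptions,
    _concurrencyLimit: number,
    _inProgress?: (p: Progress) => void
  ): Promise<PageResult[]> {
    const results: PageResult[] = [];
    for (const url of urls) {
      results.push(...(await this.crawl(url, pageOptions)));
    }
    return results;
  }

  // crawl() spreads pageOptions into the first-page scrape, forcing includeHtml on.
  async crawl(url: string, pageOptions: PageOptions): Promise<PageResult[]> {
    if (this.visited.has(url)) return [];
    this.visited.add(url);
    const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
    return [{ url, html: page.html ?? "" }];
  }
}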
@@ -173,6 +173,7 @@ export class WebScraperDataProvider {
 
     let links = await crawler.start(
       inProgress,
+      this.pageOptions,
       5,
       this.limit,
       this.maxCrawledDepth
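On the caller side, WebScraperDataProvider now slots this.pageOptions between the progress callback and the concurrency limit. A hedged call-site sketch follows; the helper name and the structural crawler type are illustrative stand-ins, not code from this commit:

// Illustrative call-site only. "crawler" stands in for the WebCrawler instance
// that WebScraperDataProvider builds elsewhere; the option values are placeholders.
type PageOptionsLike = { includeHtml?: boolean; onlyMainContent?: boolean };
type CrawlerLike = {
  start: (
    inProgress: ((p: unknown) => void) | undefined,
    pageOptions: PageOptionsLike | undefined,
    concurrencyLimit?: number,
    limit?: number,
    maxDepth?: number
  ) => Promise<{ url: string; html: string }[]>;
};

async function startCrawlSketch(crawler: CrawlerLike, pageOptions: PageOptionsLike) {
  // Argument order after this commit: inProgress, pageOptions, concurrencyLimit, limit, maxDepth.
  return crawler.start(undefined, pageOptions, 5, 10000, 10);
}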
@@ -12,6 +12,8 @@ export async function getLinksFromSitemap(
       content = response.data;
     } catch (error) {
       console.error(`Request failed for ${sitemapUrl}: ${error}`);
+      console.log(allUrls)
+
       return allUrls;
     }
 
@@ -34,6 +36,7 @@ export async function getLinksFromSitemap(
   } catch (error) {
     console.error(`Error processing ${sitemapUrl}: ${error}`);
   }
+  console.log(allUrls)
 
   return allUrls;
 }