Nicolas 2024-05-14 12:04:36 -07:00
parent 8a72cf556b
commit 7f31959be7
2 changed files with 7 additions and 8 deletions


@@ -15,7 +15,7 @@ export class WebCrawler {
   private maxCrawledLinks: number;
   private maxCrawledDepth: number;
   private visited: Set<string> = new Set();
-  private crawledUrls: { url: string, html: string }[] = [];
+  private crawledUrls: Set<{ url: string, html: string }> = new Set();
   private limit: number;
   private robotsTxtUrl: string;
   private robots: any;
@@ -136,24 +136,24 @@ export class WebCrawler {
     inProgress?: (progress: Progress) => void
   ): Promise<{ url: string, html: string }[]> {
     const queue = async.queue(async (task: string, callback) => {
-      if (this.crawledUrls.length >= this.maxCrawledLinks) {
+      if (this.crawledUrls.size >= this.maxCrawledLinks) {
         if (callback && typeof callback === "function") {
           callback();
         }
         return;
       }
       const newUrls = await this.crawl(task);
-      newUrls.forEach((page) => this.crawledUrls.push(page));
+      newUrls.forEach((page) => this.crawledUrls.add(page));
       if (inProgress && newUrls.length > 0) {
         inProgress({
-          current: this.crawledUrls.length,
+          current: this.crawledUrls.size,
           total: this.maxCrawledLinks,
           status: "SCRAPING",
           currentDocumentUrl: newUrls[newUrls.length - 1].url,
         });
       } else if (inProgress) {
         inProgress({
-          current: this.crawledUrls.length,
+          current: this.crawledUrls.size,
           total: this.maxCrawledLinks,
           status: "SCRAPING",
           currentDocumentUrl: task,
@@ -175,7 +175,7 @@ export class WebCrawler {
       }
     );
     await queue.drain();
-    return this.crawledUrls;
+    return Array.from(this.crawledUrls);
  }

  async crawl(url: string): Promise<{url: string, html: string}[]> {
@@ -311,3 +311,4 @@ export class WebCrawler {
     }
   }
 }
+
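Note on the change above: the crawledUrls array becomes a Set, so the cap check moves from .length to .size, accumulation from .push() to .add(), and the final return converts back with Array.from. Below is a minimal standalone sketch of that container pattern (CrawledPage and PageStore are hypothetical names for illustration, not part of this commit). One property worth knowing: a Set compares object elements by reference, so two distinct objects with the same url are both retained.

// Illustrative sketch only; names are hypothetical, not from the diff.
type CrawledPage = { url: string; html: string };

class PageStore {
  // Mirrors the diff: a Set instead of an array, so the cap check
  // uses .size and accumulation uses .add().
  private crawledUrls: Set<CrawledPage> = new Set();

  add(page: CrawledPage): void {
    // Caveat: Set membership for objects is by reference, so two
    // separate objects with identical urls are both kept.
    this.crawledUrls.add(page);
  }

  get size(): number {
    return this.crawledUrls.size;
  }

  toArray(): CrawledPage[] {
    // Same conversion the diff applies before returning.
    return Array.from(this.crawledUrls);
  }
}

const store = new PageStore();
store.add({ url: "https://example.com", html: "<html></html>" });
store.add({ url: "https://example.com", html: "<html></html>" });
console.log(store.size);            // 2 — reference equality, not value equality
console.log(store.toArray().length); // 2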


@@ -277,8 +277,6 @@ export class WebScraperDataProvider {
   ): Promise<Document[]> {
     await this.setCachedDocuments(documents, links);
     documents = this.removeChildLinks(documents);
-    documents = this.filterDocsExcludeInclude(documents);
-    documents = this.filterDepth(documents);
     return documents.splice(0, this.limit);
   }
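Note on the remaining return path above: documents.splice(0, this.limit) returns the first this.limit documents and also mutates the source array, unlike slice. A small sketch, assuming plain strings stand in for Document values:

// splice(0, n) returns the first n elements AND removes them from
// the original array; slice(0, n) would leave the array untouched.
const documents: string[] = ["doc1", "doc2", "doc3", "doc4"];
const limit = 2;

const returned = documents.splice(0, limit);
console.log(returned);  // ["doc1", "doc2"]
console.log(documents); // ["doc3", "doc4"] — mutated in place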