From 7f31959be7a3333b32bc6b3d2dcc128fa07fb5b6 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 14 May 2024 12:04:36 -0700 Subject: [PATCH] Nick: --- apps/api/src/scraper/WebScraper/crawler.ts | 13 +++++++------ apps/api/src/scraper/WebScraper/index.ts | 2 -- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 4509531..3dc6dc4 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -15,7 +15,7 @@ export class WebCrawler { private maxCrawledLinks: number; private maxCrawledDepth: number; private visited: Set = new Set(); - private crawledUrls: { url: string, html: string }[] = []; + private crawledUrls: Set<{ url: string, html: string }> = new Set(); private limit: number; private robotsTxtUrl: string; private robots: any; @@ -136,24 +136,24 @@ export class WebCrawler { inProgress?: (progress: Progress) => void ): Promise<{ url: string, html: string }[]> { const queue = async.queue(async (task: string, callback) => { - if (this.crawledUrls.length >= this.maxCrawledLinks) { + if (this.crawledUrls.size >= this.maxCrawledLinks) { if (callback && typeof callback === "function") { callback(); } return; } const newUrls = await this.crawl(task); - newUrls.forEach((page) => this.crawledUrls.push(page)); + newUrls.forEach((page) => this.crawledUrls.add(page)); if (inProgress && newUrls.length > 0) { inProgress({ - current: this.crawledUrls.length, + current: this.crawledUrls.size, total: this.maxCrawledLinks, status: "SCRAPING", currentDocumentUrl: newUrls[newUrls.length - 1].url, }); } else if (inProgress) { inProgress({ - current: this.crawledUrls.length, + current: this.crawledUrls.size, total: this.maxCrawledLinks, status: "SCRAPING", currentDocumentUrl: task, @@ -175,7 +175,7 @@ export class WebCrawler { } ); await queue.drain(); - return this.crawledUrls; + return Array.from(this.crawledUrls); } async crawl(url: string): Promise<{url: string, html: string}[]> { @@ -311,3 +311,4 @@ export class WebCrawler { } } + diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 1eeb65f..1f5a785 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -277,8 +277,6 @@ export class WebScraperDataProvider { ): Promise { await this.setCachedDocuments(documents, links); documents = this.removeChildLinks(documents); - documents = this.filterDocsExcludeInclude(documents); - documents = this.filterDepth(documents); return documents.splice(0, this.limit); }