From bc6b929b43ffa0385c3243ae284c95919997e707 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 10 May 2024 12:15:54 -0300 Subject: [PATCH 1/2] [Bug] Fixing /crawl limit --- apps/api/src/scraper/WebScraper/crawler.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 0248df2..1b371fd 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -25,7 +25,7 @@ export class WebCrawler { initialUrl, includes, excludes, - maxCrawledLinks, + maxCrawledLinks = 10000, limit = 10000, generateImgAltText = false, maxCrawledDepth = 10, @@ -136,7 +136,7 @@ export class WebCrawler { inProgress?: (progress: Progress) => void ): Promise { const queue = async.queue(async (task: string, callback) => { - if (this.crawledUrls.size >= this.maxCrawledLinks) { + if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) { if (callback && typeof callback === "function") { callback(); } @@ -147,14 +147,14 @@ export class WebCrawler { if (inProgress && newUrls.length > 0) { inProgress({ current: this.crawledUrls.size, - total: this.maxCrawledLinks, + total: Math.min(this.maxCrawledLinks, this.limit), status: "SCRAPING", currentDocumentUrl: newUrls[newUrls.length - 1], }); } else if (inProgress) { inProgress({ current: this.crawledUrls.size, - total: this.maxCrawledLinks, + total: Math.min(this.maxCrawledLinks, this.limit), status: "SCRAPING", currentDocumentUrl: task, }); From 4dfc371241ede6fc615319f783da655e39d2e1fd Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 22 May 2024 14:38:41 -0300 Subject: [PATCH 2/2] Update index.test.ts --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 331283e..39825c4 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -266,7 +266,7 @@ describe("E2E Tests for API Routes", () => { urls.forEach((url: string) => { expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy(); }); - }, 60000); // 60 seconds + }, 90000); // 90 seconds it("should return a successful response with a valid API key and limit to 3", async () => { const crawlResponse = await request(TEST_URL)