From 24be4866c56d6c660ba170bf5a7088f6e9f9e1f1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 17:16:20 -0700 Subject: [PATCH] Nick: --- apps/api/src/scraper/WebScraper/crawler.ts | 1 - apps/test-suite/data/crawl.json | 16 ++++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 8449efb..9e080d7 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -151,7 +151,6 @@ export class WebCrawler { concurrencyLimit: number, inProgress?: (progress: Progress) => void, ): Promise<{ url: string, html: string }[]> { - console.log("Crawling URLs: ", urls); const queue = async.queue(async (task: string, callback) => { if (this.crawledUrls.size >= this.maxCrawledLinks) { if (callback && typeof callback === "function") { diff --git a/apps/test-suite/data/crawl.json b/apps/test-suite/data/crawl.json index 59cfa9f..8bc28a6 100644 --- a/apps/test-suite/data/crawl.json +++ b/apps/test-suite/data/crawl.json @@ -1,4 +1,10 @@ -[{ +[ + { + "website": "https://www.vellum.ai/llm-leaderboard", + "expected_min_num_of_pages": 1, + "expected_crawled_pages": ["https://www.vellum.ai/llm-leaderboard"] + }, + { "website": "https://openai.com/news", "expected_min_num_of_pages": 4, "expected_crawled_pages": [ @@ -70,8 +76,6 @@ ] }, - - { "website": "https://ycombinator.com/companies", "expected_min_num_of_pages": 20, @@ -115,11 +119,7 @@ ], "notes": "This one should not go backwards, but it does!" }, - { - "website": "https://www.vellum.ai/llm-leaderboard", - "expected_min_num_of_pages": 0, - "expected_crawled_pages": [""] - }, + { "website": "https://www.instructables.com/circuits", "expected_min_num_of_pages": 12,