diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 521b1e1..7cfd1be 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -88,6 +88,10 @@ export class WebCrawler {
           return false;
         }
 
+        if (!this.initialUrl.includes(link)) {
+          return false;
+        }
+
         return true;
       })
       .slice(0, limit);
@@ -109,7 +113,7 @@ export class WebCrawler {
 
     const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
     if (sitemapLinks.length > 0) {
-      const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
+      let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
       return filteredLinks.map(link => ({ url: link, html: "" }));
     }
 
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index c95e889..cf074ec 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -145,12 +145,18 @@ export class WebScraperDataProvider {
 
     let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
 
-    const allLinks = links.map((e) => e.url);
+    let allLinks = links.map((e) => e.url);
     const allHtmls = links.map((e)=> e.html);
 
     if (this.returnOnlyUrls) {
       return this.returnOnlyUrlsResponse(allLinks , inProgress);
     }
+
+    allLinks = allLinks.filter(link => {
+      const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
+      const normalizedLink = link.endsWith('/') ? link : `${link}/`;
+      return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl;
+    });
 
     let documents = [];
     // check if fast mode is enabled and there is html inside the links
@@ -175,6 +181,12 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
     let links = await getLinksFromSitemap(this.urls[0]);
+    links = links.filter(link => {
+      const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
+      const normalizedLink = link.endsWith('/') ? link : `${link}/`;
+      return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl;
+    });
+
     if (this.returnOnlyUrls) {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
diff --git a/apps/test-suite/data/crawl.json b/apps/test-suite/data/crawl.json
index 3a56131..d729644 100644
--- a/apps/test-suite/data/crawl.json
+++ b/apps/test-suite/data/crawl.json
@@ -2,7 +2,7 @@
   {
     "website": "https://mendable.ai/pricing",
     "expected_min_num_of_pages": 29,
-    "expected_crawled_pages": [
+    "expected_not_crawled_pages": [
      "https://mendable.ai/",
      "https://mendable.ai/blog",
      "https://mendable.ai/signin",
@@ -34,7 +34,9 @@
      "https://www.agentops.ai/blog/streamlining-hr-with-saas",
      "https://www.agentops.ai/blog/simplify-hr-with-modern-saas-solutions",
      "https://www.agentops.ai/blog/efficient-hr-operations-with-saas",
-     "https://www.agentops.ai/blog/hr-made-simple-with-saas",
+     "https://www.agentops.ai/blog/hr-made-simple-with-saas"
+    ],
+    "expected_not_crawled_pages": [
      "https://www.agentops.ai/about-us",
      "https://www.agentops.ai/contact-us"
     ]
@@ -69,7 +71,7 @@
   {
     "website": "https://en.wikipedia.org/wiki/T._N._Seshan",
     "expected_min_num_of_pages": 100,
-    "expected_crawled_pages": [
+    "expected_not_crawled_pages": [
      "https://en.wikipedia.org/wiki/Wikipedia:Contents",
      "https://en.wikipedia.org/wiki/Wikipedia:Contact_us",
      "https://en.wikipedia.org/wiki/V._S._Ramadevi",
@@ -79,15 +81,10 @@
      "https://en.wikipedia.org/wiki/Government_of_India",
      "https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg"
     ]
   },
-  {
-    "website": "https://mendable.ai/blog",
-    "expected_min_num_of_pages": 0,
-    "expected_crawled_pages": [""]
-  },
   {
     "website": "https://www.framer.com/pricing",
     "expected_min_num_of_pages": 58,
-    "expected_crawled_pages": [
+    "expected_not_crawled_pages": [
      "https://www.framer.com/features/navigation/",
      "https://www.framer.com/contact/",
      "https://www.framer.com/add-ons/",
@@ -101,7 +98,7 @@
   {
     "website": "https://fly.io/docs/gpus/gpu-quickstart",
     "expected_min_num_of_pages": 39,
-    "expected_crawled_pages": [
+    "expected_not_crawled_pages": [
      "https://fly.io/docs/getting-started/",
      "https://fly.io/docs/hands-on/",
      "https://fly.io/docs/about/support/",
@@ -118,8 +115,8 @@
     "expected_crawled_pages": [""]
   },
   {
-    "website": "https://www.instructables.com",
-    "expected_min_num_of_pages": 78,
+    "website": "https://www.instructables.com/circuits",
+    "expected_min_num_of_pages": 12,
     "expected_crawled_pages": [
      "https://www.instructables.com/circuits/",
      "https://www.instructables.com/circuits/apple/projects/",
diff --git a/apps/test-suite/tests/crawl.test.ts b/apps/test-suite/tests/crawl.test.ts
index 85bcabe..3a4a35e 100644
--- a/apps/test-suite/tests/crawl.test.ts
+++ b/apps/test-suite/tests/crawl.test.ts
@@ -62,6 +62,7 @@ describe("Crawling Checkup (E2E)", () => {
         // fail the test
         console.log('No response');
         continue;
+        // continue;
       }
 
       if (!completedResponse.body || completedResponse.body.status !== "completed") {
@@ -72,7 +73,7 @@ describe("Crawling Checkup (E2E)", () => {
          actual_output: 'FAILURE',
          error: `Crawl job did not complete successfully.`
         });
-        return null;
+        continue;
       }
 
       // check how many webpages were crawled successfully
@@ -85,11 +86,11 @@ describe("Crawling Checkup (E2E)", () => {
          actual_output: `FAILURE: ${completedResponse.body.data.length}`,
          error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}`
         });
-        return null;
+        continue;
       }
 
       // checks if crawled pages contain expected_crawled_pages
-      if (websiteData.expected_crawled_pages.some(page => !completedResponse.body.data.some((d: { url: string }) => d.url === page))) {
+      if (websiteData.expected_crawled_pages && websiteData.expected_crawled_pages.length > 0 && websiteData.expected_crawled_pages.some(page => !completedResponse.body.data?.some((d: { url: string }) => d.url === page))) {
         errorLog.push({
          website: websiteData.website,
          prompt: 'CRAWL',
@@ -97,7 +98,19 @@ describe("Crawling Checkup (E2E)", () => {
          actual_output: `FAILURE: ${completedResponse.body.data}`,
          error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data}`
         });
-        return null;
+        continue;
+      }
+
+      // checks if crawled pages not contain expected_not_crawled_pages
+      if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) {
+        errorLog.push({
+          website: websiteData.website,
+          prompt: 'CRAWL',
+          expected_output: `SUCCESS: ${websiteData.expected_not_crawled_pages}`,
+          actual_output: `FAILURE: ${completedResponse.body.data}`,
+          error: `Expected crawled pages to not contain ${websiteData.expected_not_crawled_pages}, but got ${completedResponse.body.data}`
+        });
+        continue;
       }
 
       passedTests++;
@@ -110,6 +123,7 @@ describe("Crawling Checkup (E2E)", () => {
        actual_output: 'FAILURE',
        error: `Error processing ${websiteData.website}: ${error}`
       });
+      continue;
     }
   }
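For reference, a minimal standalone sketch of the link-filtering predicate this patch adds in apps/api/src/scraper/WebScraper/index.ts; the helper name `filterBelowInitialUrl` and the example URLs are illustrative assumptions, not part of the patch:

```ts
// Sketch of the normalization used in the patch: both the initial URL and each
// candidate link receive a trailing slash before comparison, and a link is
// dropped only when it normalizes to exactly the initial URL.
function filterBelowInitialUrl(initialUrl: string, links: string[]): string[] {
  const normalizedInitialUrl = initialUrl.endsWith('/') ? initialUrl : `${initialUrl}/`;
  return links.filter(link => {
    const normalizedLink = link.endsWith('/') ? link : `${link}/`;
    return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl;
  });
}

// Example with hypothetical URLs:
// filterBelowInitialUrl("https://example.com/docs", [
//   "https://example.com/docs",        // dropped: normalizes to the initial URL itself
//   "https://example.com/docs/intro",  // kept: sub-page of the initial URL
//   "https://example.com/pricing",     // kept: does not start with the initial URL
// ]);
```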
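Likewise, a sketch of the new `expected_not_crawled_pages` check added to apps/test-suite/tests/crawl.test.ts; the helper name `findUnexpectedlyCrawledPages` and the `CrawledDoc` type are assumptions made for illustration:

```ts
// Sketch of the assertion: collect any pages that were expected NOT to be
// crawled but still appear in the completed crawl's results.
interface CrawledDoc {
  url: string;
}

function findUnexpectedlyCrawledPages(
  expectedNotCrawledPages: string[] | undefined,
  data: CrawledDoc[]
): string[] {
  if (!expectedNotCrawledPages || expectedNotCrawledPages.length === 0) {
    return [];
  }
  return expectedNotCrawledPages.filter(page => data.some(d => d.url === page));
}
```

In the test, a non-empty result of this kind corresponds to pushing an "Expected crawled pages to not contain ..." entry onto errorLog and moving on to the next website with `continue`.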