From d91043376ce01b1ef8469bf3037cfe220452c5d4 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 15 May 2024 18:54:40 -0300 Subject: [PATCH] not working yet --- apps/api/src/scraper/WebScraper/index.ts | 16 ++++++++++------ apps/test-suite/tests/crawl.test.ts | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index cf074ec..7e19357 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -133,6 +133,7 @@ export class WebScraperDataProvider { private async handleCrawlMode( inProgress?: (progress: Progress) => void ): Promise { + console.log('??? >>>', this.urls[0]) const crawler = new WebCrawler({ initialUrl: this.urls[0], includes: this.includes, @@ -148,15 +149,16 @@ export class WebScraperDataProvider { let allLinks = links.map((e) => e.url); const allHtmls = links.map((e)=> e.html); - if (this.returnOnlyUrls) { - return this.returnOnlyUrlsResponse(allLinks , inProgress); - } - allLinks = allLinks.filter(link => { const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`; const normalizedLink = link.endsWith('/') ? link : `${link}/`; - return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl; + return normalizedLink.startsWith(normalizedInitialUrl); }); + console.log('>>>>>??>?>?>?>?.', {allLinks}) + + if (this.returnOnlyUrls) { + return this.returnOnlyUrlsResponse(allLinks , inProgress); + } let documents = []; // check if fast mode is enabled and there is html inside the links @@ -184,9 +186,11 @@ export class WebScraperDataProvider { links = links.filter(link => { const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`; const normalizedLink = link.endsWith('/') ? link : `${link}/`; - return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl; + return normalizedLink.startsWith(normalizedInitialUrl); }); + console.log('>>>>>??>?>?>?>?.', {links}) + if (this.returnOnlyUrls) { return this.returnOnlyUrlsResponse(links, inProgress); } diff --git a/apps/test-suite/tests/crawl.test.ts b/apps/test-suite/tests/crawl.test.ts index 3a4a35e..853379b 100644 --- a/apps/test-suite/tests/crawl.test.ts +++ b/apps/test-suite/tests/crawl.test.ts @@ -102,7 +102,7 @@ describe("Crawling Checkup (E2E)", () => { } // checks if crawled pages not contain expected_not_crawled_pages - if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) { + if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && completedResponse.body.data && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) { errorLog.push({ website: websiteData.website, prompt: 'CRAWL',