not working yet

2024-05-15 18:54:40 -03:00 · 2024-05-15 18:54:40 -03:00 · d91043376c
commit d91043376c
parent fa014defc7
2 changed files with 11 additions and 7 deletions
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -133,6 +133,7 @@ export class WebScraperDataProvider {
  private async handleCrawlMode(
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
+    console.log('??? >>>', this.urls[0])
    const crawler = new WebCrawler({
      initialUrl: this.urls[0],
      includes: this.includes,
@@ -148,15 +149,16 @@ export class WebScraperDataProvider {
    let allLinks = links.map((e) => e.url);
    const allHtmls = links.map((e)=> e.html);

-    if (this.returnOnlyUrls) {
-      return this.returnOnlyUrlsResponse(allLinks , inProgress);
-    }
-
    allLinks = allLinks.filter(link => {
      const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
      const normalizedLink = link.endsWith('/') ? link : `${link}/`;
-      return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl;
+      return normalizedLink.startsWith(normalizedInitialUrl);
    });
+    console.log('>>>>>??>?>?>?>?.', {allLinks})
+
+    if (this.returnOnlyUrls) {
+      return this.returnOnlyUrlsResponse(allLinks , inProgress);
+    }
    
    let documents = [];
    // check if fast mode is enabled and there is html inside the links
@@ -184,9 +186,11 @@ export class WebScraperDataProvider {
    links = links.filter(link => {
      const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
      const normalizedLink = link.endsWith('/') ? link : `${link}/`;
-      return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl;
+      return normalizedLink.startsWith(normalizedInitialUrl);
    });

+    console.log('>>>>>??>?>?>?>?.', {links})
+
    if (this.returnOnlyUrls) {
      return this.returnOnlyUrlsResponse(links, inProgress);
    }
--- a/apps/test-suite/tests/crawl.test.ts
+++ b/apps/test-suite/tests/crawl.test.ts
@@ -102,7 +102,7 @@ describe("Crawling Checkup (E2E)", () => {
          }

          // checks if crawled pages not contain expected_not_crawled_pages
-          if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) {
+          if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && completedResponse.body.data && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) {
            errorLog.push({
              website: websiteData.website,
              prompt: 'CRAWL',