From a04610302a7c0812183c240a2644d0c81de86597 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 16 Apr 2024 16:31:33 -0300
Subject: [PATCH 1/7] Splitting relative paths for images

---
 apps/api/src/index.ts                    |  4 +++
 apps/api/src/scraper/WebScraper/index.ts | 37 ++++++++++++++++++------
 2 files changed, 32 insertions(+), 9 deletions(-)
diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts
index 7198988..26fb2a9 100644
--- a/apps/api/src/index.ts
+++ b/apps/api/src/index.ts
@@ -90,6 +90,7 @@ app.post("/v0/scrape", async (req, res) => {
   try {
     // make sure to authenticate user first, Bearer <token>
     const team_id = await authenticateUser(req, res, "scrape");
+    const crawlerOptions = req.body.crawlerOptions ?? {};
 
     try {
       const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@@ -113,6 +114,9 @@ app.post("/v0/scrape", async (req, res) => {
       await a.setOptions({
         mode: "single_urls",
         urls: [url],
+        crawlerOptions: {
+          ...crawlerOptions,
+        },
       });
 
       const docs = await a.getDocuments(false);
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index b54d9e6..8290762 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -74,7 +74,7 @@ export class WebScraperDataProvider {
       throw new Error("Url is required");
     }
 
-    if (!useCaching) {
+    if (true) {//!useCaching) {
       if (this.mode === "crawl") {
         const crawler = new WebCrawler({
           initialUrl: this.urls[0],
@@ -95,7 +95,7 @@ export class WebScraperDataProvider {
         }
         let documents = await this.convertUrlsToDocuments(links, inProgress);
         documents = await this.getSitemapData(this.urls[0], documents);
-        console.log("documents", documents)
+        documents = this.replaceImgPathsWithAbsolutePaths(documents);
         if (this.generateImgAltText) {
           documents = await this.generatesImgAltText(documents);
         }
@@ -122,6 +122,7 @@ export class WebScraperDataProvider {
 
       if (this.mode === "single_urls") {
         let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
+        documents = this.replaceImgPathsWithAbsolutePaths(documents);
         if (this.generateImgAltText) {
           documents = await this.generatesImgAltText(documents);
         }
@@ -138,6 +139,7 @@ export class WebScraperDataProvider {
         let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress);
 
         documents = await this.getSitemapData(this.urls[0], documents);
+        documents = this.replaceImgPathsWithAbsolutePaths(documents);
         if (this.generateImgAltText) {
           documents = await this.generatesImgAltText(documents);
         }
@@ -297,29 +299,46 @@ export class WebScraperDataProvider {
   }
   generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
     await Promise.all(documents.map(async (document) => {
-      const baseUrl = new URL(document.metadata.sourceURL).origin;
-      const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
+      const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];
 
-      await Promise.all(images.map(async (image) => {
+      await Promise.all(images.map(async (image: string) => {
         let imageUrl = image.match(/\(([^)]+)\)/)[1];
         let altText = image.match(/\[(.*?)\]/)[1];
-        let newImageUrl = '';
 
         if (!altText && !imageUrl.startsWith("data:image") && /\.(png|jpeg|gif|webp)$/.test(imageUrl)) {
-          newImageUrl = baseUrl + imageUrl;
           const imageIndex = document.content.indexOf(image);
           const contentLength = document.content.length;
           let backText = document.content.substring(imageIndex + image.length, Math.min(imageIndex + image.length + 1000, contentLength));
           let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
           let frontText = document.content.substring(frontTextStartIndex, imageIndex);
-          altText = await getImageDescription(newImageUrl, backText, frontText);
+          altText = await getImageDescription(imageUrl, backText, frontText);
         }
 
-        document.content = document.content.replace(image, `![${altText}](${newImageUrl})`);
+        document.content = document.content.replace(image, `![${altText}](${imageUrl})`);
       }));
     }));
 
     return documents;
   }
+  
+  replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
+    documents.forEach(document => {
+      const baseUrl = new URL(document.metadata.sourceURL).origin;
+      const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
+
+      images.forEach(image => {
+        let imageUrl = image.match(/\(([^)]+)\)/)[1];
+        let altText = image.match(/\[(.*?)\]/)[1];
+
+        if (!imageUrl.startsWith("data:image")) {
+          imageUrl = baseUrl + imageUrl;
+        }
+
+        document.content = document.content.replace(image, `![${altText}](${imageUrl})`);
+      });
+    });
+
+    return documents;
+  }
 }
 

From d23a7ae591fb21c28ec303bf160c6a51bede2635 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 16 Apr 2024 16:34:01 -0300
Subject: [PATCH 2/7] improving relative paths

---
 apps/api/src/scraper/WebScraper/index.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 8290762..6f368a1 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -74,7 +74,7 @@ export class WebScraperDataProvider {
       throw new Error("Url is required");
     }
 
-    if (true) {//!useCaching) {
+    if (!useCaching) {
       if (this.mode === "crawl") {
         const crawler = new WebCrawler({
           initialUrl: this.urls[0],

From b375ce3e39df3ce0a44bf1778ca389b0fe04bdf2 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 17 Apr 2024 14:54:54 -0300
Subject: [PATCH 3/7] adding unit tests and bugfixing

---
 .../WebScraper/__tests__/index.test.ts        | 97 +++++++++++++++++++
 apps/api/src/scraper/WebScraper/index.ts      | 15 ++-
 2 files changed, 107 insertions(+), 5 deletions(-)
 create mode 100644 apps/api/src/scraper/WebScraper/__tests__/index.test.ts

diff --git a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts
new file mode 100644
index 0000000..e060d16
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts
@@ -0,0 +1,97 @@
+import { WebScraperDataProvider } from '../index';
+
+describe('WebScraperDataProvider', () => {
+  describe('replaceImgPathsWithAbsolutePaths', () => {
+    it('should replace image paths with absolute paths', () => {
+      const webScraperDataProvider = new WebScraperDataProvider();
+      const documents = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![alt text](/image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: '![another alt text](./another-image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/data-image' },
+          content: '![data image](data:image/png;base64,...)',
+        }
+      ];
+
+      const expectedDocuments = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![alt text](https://example.com/image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: '![another alt text](https://example.com/another-image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/data-image' },
+          content: '![data image](data:image/png;base64,...)',
+        }
+      ];
+
+      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+
+    it('should handle absolute URLs without modification', () => {
+      const webScraperDataProvider = new WebScraperDataProvider();
+      const documents = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![alt text](https://example.com/image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: '![another alt text](http://anotherexample.com/another-image.png)',
+        }
+      ];
+
+      const expectedDocuments = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![alt text](https://example.com/image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: '![another alt text](http://anotherexample.com/another-image.png)',
+        }
+      ];
+
+      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+
+    it('should not replace non-image content within the documents', () => {
+      const webScraperDataProvider = new WebScraperDataProvider();
+      const documents = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: 'This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: 'Another test. ![another alt text](./another-image.png) Here is some **bold text**.',
+        }
+      ];
+    
+      const expectedDocuments = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: 'This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: 'Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.',
+        }
+      ];
+    
+      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+  });
+});
\ No newline at end of file
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 6f368a1..727b597 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -325,19 +325,24 @@ export class WebScraperDataProvider {
     documents.forEach(document => {
       const baseUrl = new URL(document.metadata.sourceURL).origin;
       const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
-
+  
       images.forEach(image => {
         let imageUrl = image.match(/\(([^)]+)\)/)[1];
         let altText = image.match(/\[(.*?)\]/)[1];
-
+  
         if (!imageUrl.startsWith("data:image")) {
-          imageUrl = baseUrl + imageUrl;
+          if (!imageUrl.startsWith("http")) {
+            if (imageUrl.startsWith("/")) {
+              imageUrl = imageUrl.substring(1);
+            }
+            imageUrl = new URL(imageUrl, baseUrl).toString();
+          }
         }
-
+  
         document.content = document.content.replace(image, `![${altText}](${imageUrl})`);
       });
     });
-
+  
     return documents;
   }
 }

From 2eb81545fa50e8aee61c855bc00fe3f1625c41e2 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Wed, 17 Apr 2024 11:04:03 -0700
Subject: [PATCH 4/7] Update index.test.ts

---
 .../WebScraper/__tests__/index.test.ts        | 166 +++++++++++++-----
 1 file changed, 120 insertions(+), 46 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts
index e060d16..49b3926 100644
--- a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts
+++ b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts
@@ -1,97 +1,171 @@
-import { WebScraperDataProvider } from '../index';
+import { WebScraperDataProvider } from "../index";
 
-describe('WebScraperDataProvider', () => {
-  describe('replaceImgPathsWithAbsolutePaths', () => {
-    it('should replace image paths with absolute paths', () => {
+describe("WebScraperDataProvider", () => {
+  describe("replaceImgPathsWithAbsolutePaths", () => {
+    it("should replace image paths with absolute paths", () => {
       const webScraperDataProvider = new WebScraperDataProvider();
       const documents = [
         {
-          metadata: { sourceURL: 'https://example.com/page' },
-          content: '![alt text](/image.png)',
+          metadata: { sourceURL: "https://example.com/page" },
+          content: "![alt text](/image.png)",
         },
         {
-          metadata: { sourceURL: 'https://example.com/another-page' },
-          content: '![another alt text](./another-image.png)',
+          metadata: { sourceURL: "https://example.com/another-page" },
+          content: "![another alt text](./another-image.png)",
         },
         {
-          metadata: { sourceURL: 'https://example.com/data-image' },
-          content: '![data image](data:image/png;base64,...)',
-        }
+          metadata: { sourceURL: "https://example.com/data-image" },
+          content: "![data image](data:image/png;base64,...)",
+        },
       ];
 
       const expectedDocuments = [
         {
-          metadata: { sourceURL: 'https://example.com/page' },
-          content: '![alt text](https://example.com/image.png)',
+          metadata: { sourceURL: "https://example.com/page" },
+          content: "![alt text](https://example.com/image.png)",
         },
         {
-          metadata: { sourceURL: 'https://example.com/another-page' },
-          content: '![another alt text](https://example.com/another-image.png)',
+          metadata: { sourceURL: "https://example.com/another-page" },
+          content: "![another alt text](https://example.com/another-image.png)",
         },
         {
-          metadata: { sourceURL: 'https://example.com/data-image' },
-          content: '![data image](data:image/png;base64,...)',
-        }
+          metadata: { sourceURL: "https://example.com/data-image" },
+          content: "![data image](data:image/png;base64,...)",
+        },
       ];
 
-      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+      const result =
+        webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
       expect(result).toEqual(expectedDocuments);
     });
 
-    it('should handle absolute URLs without modification', () => {
+    it("should handle absolute URLs without modification", () => {
       const webScraperDataProvider = new WebScraperDataProvider();
       const documents = [
         {
-          metadata: { sourceURL: 'https://example.com/page' },
-          content: '![alt text](https://example.com/image.png)',
+          metadata: { sourceURL: "https://example.com/page" },
+          content: "![alt text](https://example.com/image.png)",
         },
         {
-          metadata: { sourceURL: 'https://example.com/another-page' },
-          content: '![another alt text](http://anotherexample.com/another-image.png)',
-        }
+          metadata: { sourceURL: "https://example.com/another-page" },
+          content:
+            "![another alt text](http://anotherexample.com/another-image.png)",
+        },
       ];
 
       const expectedDocuments = [
         {
-          metadata: { sourceURL: 'https://example.com/page' },
-          content: '![alt text](https://example.com/image.png)',
+          metadata: { sourceURL: "https://example.com/page" },
+          content: "![alt text](https://example.com/image.png)",
         },
         {
-          metadata: { sourceURL: 'https://example.com/another-page' },
-          content: '![another alt text](http://anotherexample.com/another-image.png)',
-        }
+          metadata: { sourceURL: "https://example.com/another-page" },
+          content:
+            "![another alt text](http://anotherexample.com/another-image.png)",
+        },
       ];
 
-      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+      const result =
+        webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
       expect(result).toEqual(expectedDocuments);
     });
 
-    it('should not replace non-image content within the documents', () => {
+    it("should not replace non-image content within the documents", () => {
       const webScraperDataProvider = new WebScraperDataProvider();
       const documents = [
         {
-          metadata: { sourceURL: 'https://example.com/page' },
-          content: 'This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).',
+          metadata: { sourceURL: "https://example.com/page" },
+          content:
+            "This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).",
         },
         {
-          metadata: { sourceURL: 'https://example.com/another-page' },
-          content: 'Another test. ![another alt text](./another-image.png) Here is some **bold text**.',
-        }
+          metadata: { sourceURL: "https://example.com/another-page" },
+          content:
+            "Another test. ![another alt text](./another-image.png) Here is some **bold text**.",
+        },
       ];
-    
+
       const expectedDocuments = [
         {
-          metadata: { sourceURL: 'https://example.com/page' },
-          content: 'This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).',
+          metadata: { sourceURL: "https://example.com/page" },
+          content:
+            "This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).",
         },
         {
-          metadata: { sourceURL: 'https://example.com/another-page' },
-          content: 'Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.',
-        }
+          metadata: { sourceURL: "https://example.com/another-page" },
+          content:
+            "Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.",
+        },
       ];
-    
-      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+
+      const result =
+        webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+    it("should replace multiple image paths within the documents", () => {
+      const webScraperDataProvider = new WebScraperDataProvider();
+      const documents = [
+        {
+          metadata: { sourceURL: "https://example.com/page" },
+          content:
+            "This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/image2.png)",
+        },
+        {
+          metadata: { sourceURL: "https://example.com/another-page" },
+          content:
+            "Another test. ![another alt text](./another-image1.png) Here is some **bold text**. ![another alt text](./another-image2.png)",
+        },
+      ];
+
+      const expectedDocuments = [
+        {
+          metadata: { sourceURL: "https://example.com/page" },
+          content:
+            "This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/image2.png)",
+        },
+        {
+          metadata: { sourceURL: "https://example.com/another-page" },
+          content:
+            "Another test. ![another alt text](https://example.com/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-image2.png)",
+        },
+      ];
+
+      const result =
+        webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+
+    it("should replace image paths within the documents with complex URLs", () => {
+      const webScraperDataProvider = new WebScraperDataProvider();
+      const documents = [
+        {
+          metadata: { sourceURL: "https://example.com/page/subpage" },
+          content:
+            "This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/sub/image2.png)",
+        },
+        {
+          metadata: { sourceURL: "https://example.com/another-page/subpage" },
+          content:
+            "Another test. ![another alt text](/another-page/another-image1.png) Here is some **bold text**. ![another alt text](/another-page/sub/another-image2.png)",
+        },
+      ];
+
+      const expectedDocuments = [
+        {
+          metadata: { sourceURL: "https://example.com/page/subpage" },
+          content:
+            "This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/sub/image2.png)",
+        },
+        {
+          metadata: { sourceURL: "https://example.com/another-page/subpage" },
+          content:
+            "Another test. ![another alt text](https://example.com/another-page/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-page/sub/another-image2.png)",
+        },
+      ];
+
+      const result =
+        webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
       expect(result).toEqual(expectedDocuments);
     });
   });
-});
\ No newline at end of file
+});

From 871d5d91b0fa0d477425bbdc512edcbaa4a33f56 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Wed, 17 Apr 2024 12:51:12 -0700
Subject: [PATCH 5/7] Update index.ts

---
 apps/api/src/scraper/WebScraper/index.ts | 284 +++++++++++++++--------
 1 file changed, 190 insertions(+), 94 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 727b597..ecb2fff 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -49,19 +49,21 @@ export class WebScraperDataProvider {
     const results: (Document | null)[] = new Array(urls.length).fill(null);
     for (let i = 0; i < urls.length; i += this.concurrentRequests) {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
-      await Promise.all(batchUrls.map(async (url, index) => {
-        const result = await scrapSingleUrl(url, true);
-        processedUrls++;
-        if (inProgress) {
-          inProgress({
-            current: processedUrls,
-            total: totalUrls,
-            status: "SCRAPING",
-            currentDocumentUrl: url,
-          });
-        }
-        results[i + index] = result;
-      }));
+      await Promise.all(
+        batchUrls.map(async (url, index) => {
+          const result = await scrapSingleUrl(url, true);
+          processedUrls++;
+          if (inProgress) {
+            inProgress({
+              current: processedUrls,
+              total: totalUrls,
+              status: "SCRAPING",
+              currentDocumentUrl: url,
+            });
+          }
+          results[i + index] = result;
+        })
+      );
     }
     return results.filter((result) => result !== null) as Document[];
   }
@@ -102,33 +104,58 @@ export class WebScraperDataProvider {
 
         // CACHING DOCUMENTS
         // - parent document
-        const cachedParentDocumentString = await getValue('web-scraper-cache:' + this.normalizeUrl(this.urls[0]));
+        const cachedParentDocumentString = await getValue(
+          "web-scraper-cache:" + this.normalizeUrl(this.urls[0])
+        );
         if (cachedParentDocumentString != null) {
           let cachedParentDocument = JSON.parse(cachedParentDocumentString);
-          if (!cachedParentDocument.childrenLinks || cachedParentDocument.childrenLinks.length < links.length - 1) {
-            cachedParentDocument.childrenLinks = links.filter((link) => link !== this.urls[0]);
-            await setValue('web-scraper-cache:' + this.normalizeUrl(this.urls[0]), JSON.stringify(cachedParentDocument), 60 * 60 * 24 * 10); // 10 days
+          if (
+            !cachedParentDocument.childrenLinks ||
+            cachedParentDocument.childrenLinks.length < links.length - 1
+          ) {
+            cachedParentDocument.childrenLinks = links.filter(
+              (link) => link !== this.urls[0]
+            );
+            await setValue(
+              "web-scraper-cache:" + this.normalizeUrl(this.urls[0]),
+              JSON.stringify(cachedParentDocument),
+              60 * 60 * 24 * 10
+            ); // 10 days
           }
         } else {
-          let parentDocument = documents.filter((document) => this.normalizeUrl(document.metadata.sourceURL) === this.normalizeUrl(this.urls[0]))
+          let parentDocument = documents.filter(
+            (document) =>
+              this.normalizeUrl(document.metadata.sourceURL) ===
+              this.normalizeUrl(this.urls[0])
+          );
           await this.setCachedDocuments(parentDocument, links);
         }
 
-        await this.setCachedDocuments(documents.filter((document) => this.normalizeUrl(document.metadata.sourceURL) !== this.normalizeUrl(this.urls[0])), []);
+        await this.setCachedDocuments(
+          documents.filter(
+            (document) =>
+              this.normalizeUrl(document.metadata.sourceURL) !==
+              this.normalizeUrl(this.urls[0])
+          ),
+          []
+        );
         documents = this.removeChildLinks(documents);
         documents = documents.splice(0, this.limit);
         return documents;
       }
 
       if (this.mode === "single_urls") {
-        let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
+        let documents = await this.convertUrlsToDocuments(
+          this.urls,
+          inProgress
+        );
         documents = this.replaceImgPathsWithAbsolutePaths(documents);
         if (this.generateImgAltText) {
           documents = await this.generatesImgAltText(documents);
         }
         const baseUrl = new URL(this.urls[0]).origin;
         documents = await this.getSitemapData(baseUrl, documents);
-        
+
         await this.setCachedDocuments(documents);
         documents = this.removeChildLinks(documents);
         documents = documents.splice(0, this.limit);
@@ -136,14 +163,17 @@ export class WebScraperDataProvider {
       }
       if (this.mode === "sitemap") {
         const links = await getLinksFromSitemap(this.urls[0]);
-        let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress);
+        let documents = await this.convertUrlsToDocuments(
+          links.slice(0, this.limit),
+          inProgress
+        );
 
         documents = await this.getSitemapData(this.urls[0], documents);
         documents = this.replaceImgPathsWithAbsolutePaths(documents);
         if (this.generateImgAltText) {
           documents = await this.generatesImgAltText(documents);
         }
-        
+
         await this.setCachedDocuments(documents);
         documents = this.removeChildLinks(documents);
         documents = documents.splice(0, this.limit);
@@ -153,11 +183,22 @@ export class WebScraperDataProvider {
       return [];
     }
 
-    let documents = await this.getCachedDocuments(this.urls.slice(0, this.limit));
+    let documents = await this.getCachedDocuments(
+      this.urls.slice(0, this.limit)
+    );
     if (documents.length < this.limit) {
-       const newDocuments: Document[] = await this.getDocuments(false, inProgress);
-      newDocuments.forEach(doc => {
-        if (!documents.some(d => this.normalizeUrl(d.metadata.sourceURL) === this.normalizeUrl(doc.metadata?.sourceURL))) {
+      const newDocuments: Document[] = await this.getDocuments(
+        false,
+        inProgress
+      );
+      newDocuments.forEach((doc) => {
+        if (
+          !documents.some(
+            (d) =>
+              this.normalizeUrl(d.metadata.sourceURL) ===
+              this.normalizeUrl(doc.metadata?.sourceURL)
+          )
+        ) {
           documents.push(doc);
         }
       });
@@ -173,17 +214,23 @@ export class WebScraperDataProvider {
       const url = new URL(document.metadata.sourceURL);
       const path = url.pathname;
 
-      if (this.excludes.length > 0 && this.excludes[0] !== '') {
+      if (this.excludes.length > 0 && this.excludes[0] !== "") {
         // Check if the link should be excluded
-        if (this.excludes.some(excludePattern => new RegExp(excludePattern).test(path))) {
+        if (
+          this.excludes.some((excludePattern) =>
+            new RegExp(excludePattern).test(path)
+          )
+        ) {
           return false;
         }
       }
-      
-      if (this.includes.length > 0 && this.includes[0] !== '') {
+
+      if (this.includes.length > 0 && this.includes[0] !== "") {
         // Check if the link matches the include patterns, if any are specified
         if (this.includes.length > 0) {
-          return this.includes.some(includePattern => new RegExp(includePattern).test(path));
+          return this.includes.some((includePattern) =>
+            new RegExp(includePattern).test(path)
+          );
         }
       }
       return true;
@@ -200,7 +247,7 @@ export class WebScraperDataProvider {
   private removeChildLinks(documents: Document[]): Document[] {
     for (let document of documents) {
       if (document?.childrenLinks) delete document.childrenLinks;
-    };
+    }
     return documents;
   }
 
@@ -210,10 +257,14 @@ export class WebScraperDataProvider {
         continue;
       }
       const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL);
-      await setValue('web-scraper-cache:' + normalizedUrl, JSON.stringify({
-        ...document,
-        childrenLinks: childrenLinks || []
-      }), 60 * 60 * 24 * 10); // 10 days
+      await setValue(
+        "web-scraper-cache:" + normalizedUrl,
+        JSON.stringify({
+          ...document,
+          childrenLinks: childrenLinks || [],
+        }),
+        60 * 60 * 24 * 10
+      ); // 10 days
     }
   }
 
@@ -221,8 +272,12 @@ export class WebScraperDataProvider {
     let documents: Document[] = [];
     for (const url of urls) {
       const normalizedUrl = this.normalizeUrl(url);
-      console.log("Getting cached document for web-scraper-cache:" + normalizedUrl)
-      const cachedDocumentString = await getValue('web-scraper-cache:' + normalizedUrl);
+      console.log(
+        "Getting cached document for web-scraper-cache:" + normalizedUrl
+      );
+      const cachedDocumentString = await getValue(
+        "web-scraper-cache:" + normalizedUrl
+      );
       if (cachedDocumentString) {
         const cachedDocument = JSON.parse(cachedDocumentString);
         documents.push(cachedDocument);
@@ -230,10 +285,18 @@ export class WebScraperDataProvider {
         // get children documents
         for (const childUrl of cachedDocument.childrenLinks) {
           const normalizedChildUrl = this.normalizeUrl(childUrl);
-          const childCachedDocumentString = await getValue('web-scraper-cache:' + normalizedChildUrl);
+          const childCachedDocumentString = await getValue(
+            "web-scraper-cache:" + normalizedChildUrl
+          );
           if (childCachedDocumentString) {
             const childCachedDocument = JSON.parse(childCachedDocumentString);
-            if (!documents.find((doc) => doc.metadata.sourceURL === childCachedDocument.metadata.sourceURL)) {
+            if (
+              !documents.find(
+                (doc) =>
+                  doc.metadata.sourceURL ===
+                  childCachedDocument.metadata.sourceURL
+              )
+            ) {
               documents.push(childCachedDocument);
             }
           }
@@ -248,7 +311,7 @@ export class WebScraperDataProvider {
       throw new Error("Urls are required");
     }
 
-    console.log("options", options.crawlerOptions?.excludes)
+    console.log("options", options.crawlerOptions?.excludes);
     this.urls = options.urls;
     this.mode = options.mode;
     this.concurrentRequests = options.concurrentRequests ?? 20;
@@ -257,13 +320,12 @@ export class WebScraperDataProvider {
     this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
     this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
     this.limit = options.crawlerOptions?.limit ?? 10000;
-    this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false;
-
+    this.generateImgAltText =
+      options.crawlerOptions?.generateImgAltText ?? false;
 
     //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
-    this.excludes = this.excludes.filter(item => item !== '');
-  
-  
+    this.excludes = this.excludes.filter((item) => item !== "");
+
     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {
       if (!url.trim().startsWith("http")) {
@@ -274,10 +336,14 @@ export class WebScraperDataProvider {
   }
 
   private async getSitemapData(baseUrl: string, documents: Document[]) {
-    const sitemapData = await fetchSitemapData(baseUrl)
+    const sitemapData = await fetchSitemapData(baseUrl);
     if (sitemapData) {
       for (let i = 0; i < documents.length; i++) {
-        const docInSitemapData = sitemapData.find((data) => this.normalizeUrl(data.loc) === this.normalizeUrl(documents[i].metadata.sourceURL))
+        const docInSitemapData = sitemapData.find(
+          (data) =>
+            this.normalizeUrl(data.loc) ===
+            this.normalizeUrl(documents[i].metadata.sourceURL)
+        );
         if (docInSitemapData) {
           let sitemapDocData: Partial<SitemapEntry> = {};
           if (docInSitemapData.changefreq) {
@@ -298,52 +364,82 @@ export class WebScraperDataProvider {
     return documents;
   }
   generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
-    await Promise.all(documents.map(async (document) => {
-      const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];
+    await Promise.all(
+      documents.map(async (document) => {
+        const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];
 
-      await Promise.all(images.map(async (image: string) => {
-        let imageUrl = image.match(/\(([^)]+)\)/)[1];
-        let altText = image.match(/\[(.*?)\]/)[1];
+        await Promise.all(
+          images.map(async (image: string) => {
+            let imageUrl = image.match(/\(([^)]+)\)/)[1];
+            let altText = image.match(/\[(.*?)\]/)[1];
 
-        if (!altText && !imageUrl.startsWith("data:image") && /\.(png|jpeg|gif|webp)$/.test(imageUrl)) {
-          const imageIndex = document.content.indexOf(image);
-          const contentLength = document.content.length;
-          let backText = document.content.substring(imageIndex + image.length, Math.min(imageIndex + image.length + 1000, contentLength));
-          let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
-          let frontText = document.content.substring(frontTextStartIndex, imageIndex);
-          altText = await getImageDescription(imageUrl, backText, frontText);
-        }
-
-        document.content = document.content.replace(image, `![${altText}](${imageUrl})`);
-      }));
-    }));
-
-    return documents;
-  }
-  
-  replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
-    documents.forEach(document => {
-      const baseUrl = new URL(document.metadata.sourceURL).origin;
-      const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
-  
-      images.forEach(image => {
-        let imageUrl = image.match(/\(([^)]+)\)/)[1];
-        let altText = image.match(/\[(.*?)\]/)[1];
-  
-        if (!imageUrl.startsWith("data:image")) {
-          if (!imageUrl.startsWith("http")) {
-            if (imageUrl.startsWith("/")) {
-              imageUrl = imageUrl.substring(1);
+            if (
+              !altText &&
+              !imageUrl.startsWith("data:image") &&
+              /\.(png|jpeg|gif|webp)$/.test(imageUrl)
+            ) {
+              const imageIndex = document.content.indexOf(image);
+              const contentLength = document.content.length;
+              let backText = document.content.substring(
+                imageIndex + image.length,
+                Math.min(imageIndex + image.length + 1000, contentLength)
+              );
+              let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
+              let frontText = document.content.substring(
+                frontTextStartIndex,
+                imageIndex
+              );
+              altText = await getImageDescription(
+                imageUrl,
+                backText,
+                frontText
+              );
             }
-            imageUrl = new URL(imageUrl, baseUrl).toString();
-          }
-        }
-  
-        document.content = document.content.replace(image, `![${altText}](${imageUrl})`);
-      });
-    });
-  
-    return documents;
-  }
-}
 
+            document.content = document.content.replace(
+              image,
+              `![${altText}](${imageUrl})`
+            );
+          })
+        );
+      })
+    );
+
+    return documents;
+  };
+
+  replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
+    try {
+      documents.forEach((document) => {
+        const baseUrl = new URL(document.metadata.sourceURL).origin;
+        const images =
+          document.content.match(
+            /!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g
+          ) || [];
+
+        images.forEach((image: string) => {
+          let imageUrl = image.match(/\(([^)]+)\)/)[1];
+          let altText = image.match(/\[(.*?)\]/)[1];
+
+          if (!imageUrl.startsWith("data:image")) {
+            if (!imageUrl.startsWith("http")) {
+              if (imageUrl.startsWith("/")) {
+                imageUrl = imageUrl.substring(1);
+              }
+              imageUrl = new URL(imageUrl, baseUrl).toString();
+            }
+          }
+
+          document.content = document.content.replace(
+            image,
+            `![${altText}](${imageUrl})`
+          );
+        });
+      });
+
+      return documents;
+    } catch (error) {
+      return documents;
+    }
+  };
+}

From de439f6529111b2f839dc8a8ef126310e1a0b31d Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Wed, 17 Apr 2024 12:51:29 -0700
Subject: [PATCH 6/7] Log error when replacing img paths with absolute paths fails

---
 apps/api/src/scraper/WebScraper/index.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index ecb2fff..501dde0 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -439,6 +439,7 @@ export class WebScraperDataProvider {
 
       return documents;
     } catch (error) {
+      console.error("Error replacing img paths with absolute paths", error);
       return documents;
     }
   };

From 52fb28bc1a943d6489f85fb93061f8c01bf6c0f1 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Wed, 17 Apr 2024 12:52:15 -0700
Subject: [PATCH 7/7] Remove debug console.log of crawlerOptions excludes

---
 apps/api/src/scraper/WebScraper/index.ts | 1 -
 1 file changed, 1 deletion(-)

diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 501dde0..e1bd425 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -311,7 +311,6 @@ export class WebScraperDataProvider {
       throw new Error("Urls are required");
     }
 
-    console.log("options", options.crawlerOptions?.excludes);
     this.urls = options.urls;
     this.mode = options.mode;
     this.concurrentRequests = options.concurrentRequests ?? 20;