From 72e1dadccd33214e3a25b92a41c15a680847dd11 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Fri, 19 Apr 2024 11:47:20 -0300
Subject: [PATCH 1/3] adding option to replace all relative paths with absolute
 paths

---
 apps/api/src/lib/entities.ts                  |   1 +
 .../WebScraper/__tests__/index.test.ts        | 179 ------------------
 apps/api/src/scraper/WebScraper/index.ts      |  63 +++---
 .../utils/__tests__/pdfProcessor.test.ts      |  69 ++++---
 .../utils/__tests__/replacePaths.test.ts      | 114 +++++++++++
 .../scraper/WebScraper/utils/replacePaths.ts  |  80 ++++++++
 apps/api/src/services/queue-worker.ts         |   1 -
 7 files changed, 257 insertions(+), 250 deletions(-)
 delete mode 100644 apps/api/src/scraper/WebScraper/__tests__/index.test.ts
 create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts
 create mode 100644 apps/api/src/scraper/WebScraper/utils/replacePaths.ts

diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index d608756..e261dd4 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -22,6 +22,7 @@ export type WebScraperOptions = {
     maxCrawledLinks?: number;
     limit?: number;
     generateImgAltText?: boolean;
+    replaceAllPathsWithAbsolutePaths?: boolean;
   };
   pageOptions?: PageOptions;
   concurrentRequests?: number;
diff --git a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts
deleted file mode 100644
index 42d9513..0000000
--- a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts
+++ /dev/null
@@ -1,179 +0,0 @@
-import { WebScraperDataProvider } from "../index";
-
-describe("WebScraperDataProvider", () => {
-  describe("replaceImgPathsWithAbsolutePaths", () => {
-    it("should replace image paths with absolute paths", () => {
-      const webScraperDataProvider = new WebScraperDataProvider();
-      const documents = [
-        {
-          metadata: { sourceURL: "https://example.com/page" },
-          content: "![alt text](/image.png)",
-        },
-        {
-          metadata: { sourceURL: "https://example.com/another-page" },
-          content: "![another alt text](./another-image.png)",
-        },
-        {
-          metadata: { sourceURL: "https://example.com/another-page" },
-          content: "![another alt text](./another-image.webp)",
-        },
-        {
-          metadata: { sourceURL: "https://example.com/data-image" },
-          content: "![data image](data:image/png;base64,...)",
-        },
-      ];
-
-      const expectedDocuments = [
-        {
-          metadata: { sourceURL: "https://example.com/page" },
-          content: "![alt text](https://example.com/image.png)",
-        },
-        {
-          metadata: { sourceURL: "https://example.com/another-page" },
-          content: "![another alt text](https://example.com/another-image.png)",
-        },
-        {
-          metadata: { sourceURL: "https://example.com/another-page" },
-          content: "![another alt text](https://example.com/another-image.webp)",
-        },
-        {
-          metadata: { sourceURL: "https://example.com/data-image" },
-          content: "![data image](data:image/png;base64,...)",
-        },
-      ];
-
-      const result =
-        webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
-      expect(result).toEqual(expectedDocuments);
-    });
-
-    it("should handle absolute URLs without modification", () => {
-      const webScraperDataProvider = new WebScraperDataProvider();
-      const documents = [
-        {
-          metadata: { sourceURL: "https://example.com/page" },
-          content: "![alt text](https://example.com/image.png)",
-        },
-        {
-          metadata: { sourceURL: "https://example.com/another-page" },
-          content:
-            "![another alt text](http://anotherexample.com/another-image.png)",
-        },
-      ];
-
-      const expectedDocuments = [
-        {
-          metadata: { sourceURL: "https://example.com/page" },
-          content: "![alt text](https://example.com/image.png)",
-        },
-        {
-          metadata: { sourceURL: "https://example.com/another-page" },
-          content:
-            "![another alt text](http://anotherexample.com/another-image.png)",
-        },
-      ];
-
-      const result =
-        webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
-      expect(result).toEqual(expectedDocuments);
-    });
-
-    it("should not replace non-image content within the documents", () => {
-      const webScraperDataProvider = new WebScraperDataProvider();
-      const documents = [
-        {
-          metadata: { sourceURL: "https://example.com/page" },
-          content:
-            "This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).",
-        },
-        {
-          metadata: { sourceURL: "https://example.com/another-page" },
-          content:
-            "Another test. ![another alt text](./another-image.png) Here is some **bold text**.",
-        },
-      ];
-
-      const expectedDocuments = [
-        {
-          metadata: { sourceURL: "https://example.com/page" },
-          content:
-            "This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).",
-        },
-        {
-          metadata: { sourceURL: "https://example.com/another-page" },
-          content:
-            "Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.",
-        },
-      ];
-
-      const result =
-        webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
-      expect(result).toEqual(expectedDocuments);
-    });
-    it("should replace multiple image paths within the documents", () => {
-      const webScraperDataProvider = new WebScraperDataProvider();
-      const documents = [
-        {
-          metadata: { sourceURL: "https://example.com/page" },
-          content:
-            "This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/image2.png)",
-        },
-        {
-          metadata: { sourceURL: "https://example.com/another-page" },
-          content:
-            "Another test. ![another alt text](./another-image1.png) Here is some **bold text**. ![another alt text](./another-image2.png)",
-        },
-      ];
-
-      const expectedDocuments = [
-        {
-          metadata: { sourceURL: "https://example.com/page" },
-          content:
-            "This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/image2.png)",
-        },
-        {
-          metadata: { sourceURL: "https://example.com/another-page" },
-          content:
-            "Another test. ![another alt text](https://example.com/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-image2.png)",
-        },
-      ];
-
-      const result =
-        webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
-      expect(result).toEqual(expectedDocuments);
-    });
-
-    it("should replace image paths within the documents with complex URLs", () => {
-      const webScraperDataProvider = new WebScraperDataProvider();
-      const documents = [
-        {
-          metadata: { sourceURL: "https://example.com/page/subpage" },
-          content:
-            "This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/sub/image2.png)",
-        },
-        {
-          metadata: { sourceURL: "https://example.com/another-page/subpage" },
-          content:
-            "Another test. ![another alt text](/another-page/another-image1.png) Here is some **bold text**. ![another alt text](/another-page/sub/another-image2.png)",
-        },
-      ];
-
-      const expectedDocuments = [
-        {
-          metadata: { sourceURL: "https://example.com/page/subpage" },
-          content:
-            "This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/sub/image2.png)",
-        },
-        {
-          metadata: { sourceURL: "https://example.com/another-page/subpage" },
-          content:
-            "Another test. ![another alt text](https://example.com/another-page/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-page/sub/another-image2.png)",
-        },
-      ];
-
-      const result =
-        webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
-      expect(result).toEqual(expectedDocuments);
-    });
-  });
-});
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 551c8d8..c2146be 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -6,6 +6,7 @@ import { WebCrawler } from "./crawler";
 import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/gptVision";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
+import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
 
 
 export class WebScraperDataProvider {
@@ -19,6 +20,7 @@ export class WebScraperDataProvider {
   private concurrentRequests: number = 20;
   private generateImgAltText: boolean = false;
   private pageOptions?: PageOptions;
+  private replaceAllPathsWithAbsolutePaths?: boolean = false;
 
   authorize(): void {
     throw new Error("Method not implemented.");
@@ -100,7 +102,13 @@ export class WebScraperDataProvider {
 
         let documents = await this.convertUrlsToDocuments(links, inProgress);
         documents = await this.getSitemapData(this.urls[0], documents);
-        documents = this.replaceImgPathsWithAbsolutePaths(documents);
+
+        if (this.replaceAllPathsWithAbsolutePaths) {
+          documents = replacePathsWithAbsolutePaths(documents);
+        } else {
+          documents = replaceImgPathsWithAbsolutePaths(documents);
+        }
+
         if (this.generateImgAltText) {
           documents = await this.generatesImgAltText(documents);
         }
@@ -164,7 +172,13 @@ export class WebScraperDataProvider {
           this.urls.filter((link) => !link.endsWith(".pdf")),
           inProgress
         );
-        documents = this.replaceImgPathsWithAbsolutePaths(documents);
+
+        if (this.replaceAllPathsWithAbsolutePaths) {
+          documents = replacePathsWithAbsolutePaths(documents);
+        } else {
+          documents = replaceImgPathsWithAbsolutePaths(documents);
+        }
+
         if (this.generateImgAltText) {
           documents = await this.generatesImgAltText(documents);
         }
@@ -197,7 +211,13 @@ export class WebScraperDataProvider {
         );
 
         documents = await this.getSitemapData(this.urls[0], documents);
-        documents = this.replaceImgPathsWithAbsolutePaths(documents);
+
+        if (this.replaceAllPathsWithAbsolutePaths) {
+          documents = replacePathsWithAbsolutePaths(documents);
+        } else {
+          documents = replaceImgPathsWithAbsolutePaths(documents);
+        }
+
         if (this.generateImgAltText) {
           documents = await this.generatesImgAltText(documents);
         }
@@ -351,6 +371,7 @@ export class WebScraperDataProvider {
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
     this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
+    this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
 
     //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
@@ -436,40 +457,4 @@ export class WebScraperDataProvider {
 
     return documents;
   };
-
-  replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
-    try {
-      documents.forEach((document) => {
-        const baseUrl = new URL(document.metadata.sourceURL).origin;
-        const images =
-          document.content.match(
-            /!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g
-          ) || [];
-
-        images.forEach((image: string) => {
-          let imageUrl = image.match(/\(([^)]+)\)/)[1];
-          let altText = image.match(/\[(.*?)\]/)[1];
-
-          if (!imageUrl.startsWith("data:image")) {
-            if (!imageUrl.startsWith("http")) {
-              if (imageUrl.startsWith("/")) {
-                imageUrl = imageUrl.substring(1);
-              }
-              imageUrl = new URL(imageUrl, baseUrl).toString();
-            }
-          }
-
-          document.content = document.content.replace(
-            image,
-            `![${altText}](${imageUrl})`
-          );
-        });
-      });
-
-      return documents;
-    } catch (error) {
-      console.error("Error replacing img paths with absolute paths", error);
-      return documents;
-    }
-  };
 }
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
index 7d25aec..f14c8d4 100644
--- a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
@@ -1,40 +1,47 @@
 import * as pdfProcessor from '../pdfProcessor';
 
 describe('PDF Processing Module - Integration Test', () => {
-  it('should download and read a simple PDF file by URL', async () => {
+  it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
+    delete process.env.LLAMAPARSE_API_KEY;
     const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
-    expect(pdfContent).toEqual("Dummy PDF file");
+    expect(pdfContent.trim()).toEqual("Dummy PDF file");
   });
 
-  it('should download and read a complex PDF file by URL', async () => {
-    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf');
+// We're hitting the LLAMAPARSE rate limit 🫠
+//   it('should download and read a simple PDF file by URL', async () => {
+//     const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
+//     expect(pdfContent).toEqual("Dummy PDF file");
+//   });
 
-    const expectedContent = 'A Comprehensive Overview of Large Language Models\n' +
-    '                       a                        a,∗            b,∗                         c,d,∗                   e,f                           e,f                     g,i\n' +
-    '   Humza Naveed         , Asad Ullah Khan          , Shi Qiu     , Muhammad Saqib               , Saeed Anwar        , Muhammad Usman              , Naveed Akhtar         ,\n' +
-    '                                                                     Nick Barnes      h, Ajmal Mian      i\n' +
-    '                                                   aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' +
-    '                                                     bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' +
-    '                                                        cUniversity of Technology Sydney (UTS), Sydney, Australia\n' +
-    '                                       dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' +
-    '                                           eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' +
-    '                                   fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' +
-    '                                                        gThe University of Melbourne (UoM), Melbourne, Australia\n' +
-    '                                                       hAustralian National University (ANU), Canberra, Australia\n' +
-    '                                                       iThe University of Western Australia (UWA), Perth, Australia\n' +
-    '  Abstract\n' +
-    '     Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' +
-    '  beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' +
-    '  topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' +
-    '  robotics, datasets, benchmarking, efficiency, and more. With the rapid development of techniques and regular breakthroughs in\n' +
-    '  LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. Considering\n' +
-    '  the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' +
-    '  yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' +
-    '  on a broad range of LLM-related concepts. Our self-contained comprehensive overview of LLMs discusses relevant background\n' +
-    '  concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' +
-    '  provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' +
-    '  extensive informative summaries of the existing works to advance the LLM research.\n'
-    expect(pdfContent).toContain(expectedContent);
-  }, 60000); 
+//   it('should download and read a complex PDF file by URL', async () => {
+//     const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf');
+
+//     const expectedContent = 'A Comprehensive Overview of Large Language Models\n' +
+//     '                       a                        a,∗            b,∗                         c,d,∗                   e,f                           e,f                     g,i\n' +
+//     '   Humza Naveed         , Asad Ullah Khan          , Shi Qiu     , Muhammad Saqib               , Saeed Anwar        , Muhammad Usman              , Naveed Akhtar         ,\n' +
+//     '                                                                     Nick Barnes      h, Ajmal Mian      i\n' +
+//     '                                                   aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' +
+//     '                                                     bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' +
+//     '                                                        cUniversity of Technology Sydney (UTS), Sydney, Australia\n' +
+//     '                                       dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' +
+//     '                                           eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' +
+//     '                                   fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' +
+//     '                                                        gThe University of Melbourne (UoM), Melbourne, Australia\n' +
+//     '                                                       hAustralian National University (ANU), Canberra, Australia\n' +
+//     '                                                       iThe University of Western Australia (UWA), Perth, Australia\n' +
+//     '  Abstract\n' +
+//     '     Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' +
+//     '  beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' +
+//     '  topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' +
+//     '  robotics, datasets, benchmarking, efficiency, and more. With the rapid development of techniques and regular breakthroughs in\n' +
+//     '  LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. Considering\n' +
+//     '  the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' +
+//     '  yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' +
+//     '  on a broad range of LLM-related concepts. Our self-contained comprehensive overview of LLMs discusses relevant background\n' +
+//     '  concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' +
+//     '  provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' +
+//     '  extensive informative summaries of the existing works to advance the LLM research.\n'
+//     expect(pdfContent).toContain(expectedContent);
+//   }, 60000); 
 
 });
\ No newline at end of file
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts
new file mode 100644
index 0000000..aae567c
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts
@@ -0,0 +1,114 @@
+import { Document } from "../../../../lib/entities";
+import { replacePathsWithAbsolutePaths, replaceImgPathsWithAbsolutePaths } from "../replacePaths";
+
+describe('replacePaths', () => {
+  describe('replacePathsWithAbsolutePaths', () => {
+    it('should replace relative paths with absolute paths', () => {
+      const documents: Document[] = [{
+        metadata: { sourceURL: 'https://example.com' },
+        content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).'
+      }];
+
+      const expectedDocuments: Document[] = [{
+        metadata: { sourceURL: 'https://example.com' },
+        content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).'
+      }];
+
+      const result = replacePathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+
+    it('should not alter absolute URLs', () => {
+      const documents: Document[] = [{
+        metadata: { sourceURL: 'https://example.com' },
+        content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).'
+      }];
+
+      const result = replacePathsWithAbsolutePaths(documents);
+      expect(result).toEqual(documents); // Expect no change
+    });
+
+    it('should not alter data URLs for images', () => {
+      const documents: Document[] = [{
+        metadata: { sourceURL: 'https://example.com' },
+        content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).'
+      }];
+
+      const result = replacePathsWithAbsolutePaths(documents);
+      expect(result).toEqual(documents); // Expect no change
+    });
+
+    it('should handle multiple links and images correctly', () => {
+      const documents: Document[] = [{
+        metadata: { sourceURL: 'https://example.com' },
+        content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).'
+      }];
+
+      const expectedDocuments: Document[] = [{
+        metadata: { sourceURL: 'https://example.com' },
+        content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).'
+      }];
+
+      const result = replacePathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+
+    it('should correctly handle a mix of absolute and relative paths', () => {
+      const documents: Document[] = [{
+        metadata: { sourceURL: 'https://example.com' },
+        content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
+      }];
+
+      const expectedDocuments: Document[] = [{
+        metadata: { sourceURL: 'https://example.com' },
+        content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
+      }];
+
+      const result = replacePathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+    
+  });
+
+  describe('replaceImgPathsWithAbsolutePaths', () => {
+    it('should replace relative image paths with absolute paths', () => {
+      const documents: Document[] = [{
+        metadata: { sourceURL: 'https://example.com' },
+        content: 'Here is an image: ![alt text](/path/to/image.jpg).'
+      }];
+
+      const expectedDocuments: Document[] = [{
+        metadata: { sourceURL: 'https://example.com' },
+        content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).'
+      }];
+
+      const result = replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+
+    it('should not alter data:image URLs', () => {
+      const documents: Document[] = [{
+        metadata: { sourceURL: 'https://example.com' },
+        content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).'
+      }];
+
+      const result = replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(documents); // Expect no change
+    });
+
+    it('should handle multiple images with a mix of data and relative URLs', () => {
+      const documents: Document[] = [{
+        metadata: { sourceURL: 'https://example.com' },
+        content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).'
+      }];
+
+      const expectedDocuments: Document[] = [{
+        metadata: { sourceURL: 'https://example.com' },
+        content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).'
+      }];
+
+      const result = replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+  });
+});
\ No newline at end of file
diff --git a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts
new file mode 100644
index 0000000..d652611
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts
@@ -0,0 +1,80 @@
+import { Document } from "../../../lib/entities";
+
+export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => {
+  try {
+    documents.forEach((document) => {
+      const baseUrl = new URL(document.metadata.sourceURL).origin;
+      const paths =
+        document.content.match(
+          /(!?\[.*?\])\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)|href="([^"]+)"/g
+        ) || [];
+
+      paths.forEach((path: string) => {
+        const isImage = path.startsWith("!");
+        let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
+        let url = matchedUrl[1];
+
+        if (!url.startsWith("data:") && !url.startsWith("http")) {
+          if (url.startsWith("/")) {
+            url = url.substring(1);
+          }
+          url = new URL(url, baseUrl).toString();
+        }
+
+        const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0];
+        if (isImage) {
+          document.content = document.content.replace(
+            path,
+            `${markdownLinkOrImageText}(${url})`
+          );
+        } else {
+          document.content = document.content.replace(
+            path,
+            `${markdownLinkOrImageText}(${url})`
+          );
+        }
+      });
+    });
+
+    return documents;
+  } catch (error) {
+    console.error("Error replacing paths with absolute paths", error);
+    return documents;
+  }
+};
+
+export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
+  try {
+    documents.forEach((document) => {
+      const baseUrl = new URL(document.metadata.sourceURL).origin;
+      const images =
+        document.content.match(
+          /!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g
+        ) || [];
+
+      images.forEach((image: string) => {
+        let imageUrl = image.match(/\(([^)]+)\)/)[1];
+        let altText = image.match(/\[(.*?)\]/)[1];
+
+        if (!imageUrl.startsWith("data:image")) {
+          if (!imageUrl.startsWith("http")) {
+            if (imageUrl.startsWith("/")) {
+              imageUrl = imageUrl.substring(1);
+            }
+            imageUrl = new URL(imageUrl, baseUrl).toString();
+          }
+        }
+
+        document.content = document.content.replace(
+          image,
+          `![${altText}](${imageUrl})`
+        );
+      });
+    });
+
+    return documents;
+  } catch (error) {
+    console.error("Error replacing img paths with absolute paths", error);
+    return documents;
+  }
+};
\ No newline at end of file
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index f3a971a..c9c5f73 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -3,7 +3,6 @@ import { getWebScraperQueue } from "./queue-service";
 import "dotenv/config";
 import { logtail } from "./logtail";
 import { startWebScraperPipeline } from "../main/runWebScraper";
-import { WebScraperDataProvider } from "../scraper/WebScraper";
 import { callWebhook } from "./webhook";
 
 getWebScraperQueue().process(

From 3ddff62a56d8201fb907b09dbb5e41b57f458623 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Fri, 19 Apr 2024 14:49:35 -0300
Subject: [PATCH 2/3] adding better doc and types for js-sdk

---
 apps/js-sdk/firecrawl/src/index.ts | 108 +++++++++++++++++++++++++++--
 1 file changed, 101 insertions(+), 7 deletions(-)

diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 3d105e7..be55066 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -2,17 +2,60 @@ import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios';
 import dotenv from 'dotenv';
 dotenv.config();
 
-interface FirecrawlAppConfig {
+/**
+ * Configuration interface for FirecrawlApp.
+ */
+export interface FirecrawlAppConfig {
   apiKey?: string | null;
 }
 
-interface Params {
+/**
+ * Generic parameter interface.
+ */
+export interface Params {
   [key: string]: any;
 }
 
+/**
+ * Response interface for scraping operations.
+ */
+export interface ScrapeResponse {
+  success: boolean;
+  data?: any;
+  error?: string;
+}
+
+/**
+ * Response interface for crawling operations.
+ */
+export interface CrawlResponse {
+  success: boolean;
+  jobId?: string;
+  data?: any;
+  error?: string;
+}
+
+/**
+ * Response interface for job status checks.
+ */
+export interface JobStatusResponse {
+  success: boolean;
+  status: string;
+  jobId?: string;
+  data?: any;
+  error?: string;
+}
+
+/**
+ * Main class for interacting with the Firecrawl API.
+ */
 export default class FirecrawlApp {
   private apiKey: string;
 
+  /**
+   * Initializes a new instance of the FirecrawlApp class.
+   * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
+   */
   constructor({ apiKey = null }: FirecrawlAppConfig) {
     this.apiKey = apiKey || process.env.FIRECRAWL_API_KEY || '';
     if (!this.apiKey) {
@@ -20,7 +63,13 @@ export default class FirecrawlApp {
     }
   }
 
-  async scrapeUrl(url: string, params: Params | null = null): Promise<any> {
+  /**
+   * Scrapes a URL using the Firecrawl API.
+   * @param {string} url - The URL to scrape.
+   * @param {Params | null} params - Additional parameters for the scrape request.
+   * @returns {Promise<ScrapeResponse>} The response from the scrape operation.
+   */
+  async scrapeUrl(url: string, params: Params | null = null): Promise<ScrapeResponse> {
     const headers: AxiosRequestHeaders = {
       'Content-Type': 'application/json',
       'Authorization': `Bearer ${this.apiKey}`,
@@ -34,7 +83,7 @@ export default class FirecrawlApp {
       if (response.status === 200) {
         const responseData = response.data;
         if (responseData.success) {
-          return responseData.data;
+          return responseData; 
         } else {
           throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
         }
@@ -44,9 +93,18 @@ export default class FirecrawlApp {
     } catch (error: any) {
       throw new Error(error.message);
     }
+    return { success: false, error: 'Internal server error.' };
   }
 
-  async crawlUrl(url: string, params: Params | null = null, waitUntilDone: boolean = true, timeout: number = 2): Promise<any> {
+  /**
+   * Initiates a crawl job for a URL using the Firecrawl API.
+   * @param {string} url - The URL to crawl.
+   * @param {Params | null} params - Additional parameters for the crawl request.
+   * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
+   * @param {number} timeout - Timeout in seconds for job status checks.
+   * @returns {Promise<CrawlResponse>} The response from the crawl operation.
+   */
+  async crawlUrl(url: string, params: Params | null = null, waitUntilDone: boolean = true, timeout: number = 2): Promise<CrawlResponse> {
     const headers = this.prepareHeaders();
     let jsonData: Params = { url };
     if (params) {
@@ -59,7 +117,7 @@ export default class FirecrawlApp {
         if (waitUntilDone) {
           return this.monitorJobStatus(jobId, headers, timeout);
         } else {
-          return { jobId };
+          return { success: true, jobId };
         }
       } else {
         this.handleError(response, 'start crawl job');
@@ -68,9 +126,15 @@ export default class FirecrawlApp {
       console.log(error)
       throw new Error(error.message);
     }
+    return { success: false, error: 'Internal server error.' };
   }
 
-  async checkCrawlStatus(jobId: string): Promise<any> {
+  /**
+   * Checks the status of a crawl job using the Firecrawl API.
+   * @param {string} jobId - The job ID of the crawl operation.
+   * @returns {Promise<JobStatusResponse>} The response containing the job status.
+   */
+  async checkCrawlStatus(jobId: string): Promise<JobStatusResponse> {
     const headers: AxiosRequestHeaders = this.prepareHeaders();
     try {
       const response: AxiosResponse = await this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers);
@@ -82,8 +146,13 @@ export default class FirecrawlApp {
     } catch (error: any) {
       throw new Error(error.message);
     }
+    return { success: false, status: 'unknown', error: 'Internal server error.' };
   }
 
+  /**
+   * Prepares the headers for an API request.
+   * @returns {AxiosRequestHeaders} The prepared headers.
+   */
   prepareHeaders(): AxiosRequestHeaders {
     return {
       'Content-Type': 'application/json',
@@ -91,14 +160,34 @@ export default class FirecrawlApp {
     } as AxiosRequestHeaders;
   }
 
+  /**
+   * Sends a POST request to the specified URL.
+   * @param {string} url - The URL to send the request to.
+   * @param {Params} data - The data to send in the request.
+   * @param {AxiosRequestHeaders} headers - The headers for the request.
+   * @returns {Promise<AxiosResponse>} The response from the POST request.
+   */
   postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
     return axios.post(url, data, { headers });
   }
 
+  /**
+   * Sends a GET request to the specified URL.
+   * @param {string} url - The URL to send the request to.
+   * @param {AxiosRequestHeaders} headers - The headers for the request.
+   * @returns {Promise<AxiosResponse>} The response from the GET request.
+   */
   getRequest(url: string, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
     return axios.get(url, { headers });
   }
 
+  /**
+   * Monitors the status of a crawl job until completion or failure.
+   * @param {string} jobId - The job ID of the crawl operation.
+   * @param {AxiosRequestHeaders} headers - The headers for the request.
+   * @param {number} timeout - Timeout in seconds for job status checks.
+   * @returns {Promise<any>} The final job status or data.
+   */
   async monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, timeout: number): Promise<any> {
     while (true) {
       const statusResponse: AxiosResponse = await this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers);
@@ -124,6 +213,11 @@ export default class FirecrawlApp {
     }
   }
 
+  /**
+   * Handles errors from API responses.
+   * @param {AxiosResponse} response - The response from the API.
+   * @param {string} action - The action being performed when the error occurred.
+   */
   handleError(response: AxiosResponse, action: string): void {
     if ([402, 409, 500].includes(response.status)) {
       const errorMessage: string = response.data.error || 'Unknown error occurred';

From 384fb1db1868bf2e3e2bf9c5c1e105216faa5ae8 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Fri, 19 Apr 2024 15:27:54 -0300
Subject: [PATCH 3/3] updating version

---
 apps/js-sdk/firecrawl/build/index.js | 62 +++++++++++++++++++++++++++-
 apps/js-sdk/firecrawl/package.json   |  2 +-
 2 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/index.js
index be4223f..25ae999 100644
--- a/apps/js-sdk/firecrawl/build/index.js
+++ b/apps/js-sdk/firecrawl/build/index.js
@@ -10,13 +10,26 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 import axios from 'axios';
 import dotenv from 'dotenv';
 dotenv.config();
+/**
+ * Main class for interacting with the Firecrawl API.
+ */
 export default class FirecrawlApp {
+    /**
+     * Initializes a new instance of the FirecrawlApp class.
+     * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
+     */
     constructor({ apiKey = null }) {
         this.apiKey = apiKey || process.env.FIRECRAWL_API_KEY || '';
         if (!this.apiKey) {
             throw new Error('No API key provided');
         }
     }
+    /**
+     * Scrapes a URL using the Firecrawl API.
+     * @param {string} url - The URL to scrape.
+     * @param {Params | null} params - Additional parameters for the scrape request.
+     * @returns {Promise<ScrapeResponse>} The response from the scrape operation.
+     */
     scrapeUrl(url_1) {
         return __awaiter(this, arguments, void 0, function* (url, params = null) {
             const headers = {
@@ -32,7 +45,7 @@ export default class FirecrawlApp {
                 if (response.status === 200) {
                     const responseData = response.data;
                     if (responseData.success) {
-                        return responseData.data;
+                        return responseData;
                     }
                     else {
                         throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
@@ -45,8 +58,17 @@ export default class FirecrawlApp {
             catch (error) {
                 throw new Error(error.message);
             }
+            return { success: false, error: 'Internal server error.' };
         });
     }
+    /**
+     * Initiates a crawl job for a URL using the Firecrawl API.
+     * @param {string} url - The URL to crawl.
+     * @param {Params | null} params - Additional parameters for the crawl request.
+     * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
+     * @param {number} timeout - Timeout in seconds for job status checks.
+     * @returns {Promise<CrawlResponse>} The response from the crawl operation.
+     */
     crawlUrl(url_1) {
         return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, timeout = 2) {
             const headers = this.prepareHeaders();
@@ -62,7 +84,7 @@ export default class FirecrawlApp {
                         return this.monitorJobStatus(jobId, headers, timeout);
                     }
                     else {
-                        return { jobId };
+                        return { success: true, jobId };
                     }
                 }
                 else {
@@ -73,8 +95,14 @@ export default class FirecrawlApp {
                 console.log(error);
                 throw new Error(error.message);
             }
+            return { success: false, error: 'Internal server error.' };
         });
     }
+    /**
+     * Checks the status of a crawl job using the Firecrawl API.
+     * @param {string} jobId - The job ID of the crawl operation.
+     * @returns {Promise<JobStatusResponse>} The response containing the job status.
+     */
     checkCrawlStatus(jobId) {
         return __awaiter(this, void 0, void 0, function* () {
             const headers = this.prepareHeaders();
@@ -90,20 +118,45 @@ export default class FirecrawlApp {
             catch (error) {
                 throw new Error(error.message);
             }
+            return { success: false, status: 'unknown', error: 'Internal server error.' };
         });
     }
+    /**
+     * Prepares the headers for an API request.
+     * @returns {AxiosRequestHeaders} The prepared headers.
+     */
     prepareHeaders() {
         return {
             'Content-Type': 'application/json',
             'Authorization': `Bearer ${this.apiKey}`,
         };
     }
+    /**
+     * Sends a POST request to the specified URL.
+     * @param {string} url - The URL to send the request to.
+     * @param {Params} data - The data to send in the request.
+     * @param {AxiosRequestHeaders} headers - The headers for the request.
+     * @returns {Promise<AxiosResponse>} The response from the POST request.
+     */
     postRequest(url, data, headers) {
         return axios.post(url, data, { headers });
     }
+    /**
+     * Sends a GET request to the specified URL.
+     * @param {string} url - The URL to send the request to.
+     * @param {AxiosRequestHeaders} headers - The headers for the request.
+     * @returns {Promise<AxiosResponse>} The response from the GET request.
+     */
     getRequest(url, headers) {
         return axios.get(url, { headers });
     }
+    /**
+     * Monitors the status of a crawl job until completion or failure.
+     * @param {string} jobId - The job ID of the crawl operation.
+     * @param {AxiosRequestHeaders} headers - The headers for the request.
+     * @param {number} timeout - Timeout in seconds for job status checks.
+     * @returns {Promise<any>} The final job status or data.
+     */
     monitorJobStatus(jobId, headers, timeout) {
         return __awaiter(this, void 0, void 0, function* () {
             while (true) {
@@ -134,6 +187,11 @@ export default class FirecrawlApp {
             }
         });
     }
+    /**
+     * Handles errors from API responses.
+     * @param {AxiosResponse} response - The response from the API.
+     * @param {string} action - The action being performed when the error occurred.
+     */
     handleError(response, action) {
         if ([402, 409, 500].includes(response.status)) {
             const errorMessage = response.data.error || 'Unknown error occurred';
diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json
index 89e6d3f..58aa5ac 100644
--- a/apps/js-sdk/firecrawl/package.json
+++ b/apps/js-sdk/firecrawl/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "0.0.9",
+  "version": "0.0.10",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "build/index.js",
   "type": "module",