From 72e1dadccd33214e3a25b92a41c15a680847dd11 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 19 Apr 2024 11:47:20 -0300 Subject: [PATCH 1/3] adding option to replace all relative paths with absolute paths --- apps/api/src/lib/entities.ts | 1 + .../WebScraper/__tests__/index.test.ts | 179 ------------------ apps/api/src/scraper/WebScraper/index.ts | 63 +++--- .../utils/__tests__/pdfProcessor.test.ts | 69 ++++--- .../utils/__tests__/replacePaths.test.ts | 114 +++++++++++ .../scraper/WebScraper/utils/replacePaths.ts | 80 ++++++++ apps/api/src/services/queue-worker.ts | 1 - 7 files changed, 257 insertions(+), 250 deletions(-) delete mode 100644 apps/api/src/scraper/WebScraper/__tests__/index.test.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/replacePaths.ts diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index d608756..e261dd4 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -22,6 +22,7 @@ export type WebScraperOptions = { maxCrawledLinks?: number; limit?: number; generateImgAltText?: boolean; + replaceAllPathsWithAbsolutePaths?: boolean; }; pageOptions?: PageOptions; concurrentRequests?: number; diff --git a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts deleted file mode 100644 index 42d9513..0000000 --- a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts +++ /dev/null @@ -1,179 +0,0 @@ -import { WebScraperDataProvider } from "../index"; - -describe("WebScraperDataProvider", () => { - describe("replaceImgPathsWithAbsolutePaths", () => { - it("should replace image paths with absolute paths", () => { - const webScraperDataProvider = new WebScraperDataProvider(); - const documents = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: "![alt text](/image.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: "![another alt text](./another-image.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: "![another alt text](./another-image.webp)", - }, - { - metadata: { sourceURL: "https://example.com/data-image" }, - content: "![data image](data:image/png;base64,...)", - }, - ]; - - const expectedDocuments = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: "![alt text](https://example.com/image.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: "![another alt text](https://example.com/another-image.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: "![another alt text](https://example.com/another-image.webp)", - }, - { - metadata: { sourceURL: "https://example.com/data-image" }, - content: "![data image](data:image/png;base64,...)", - }, - ]; - - const result = - webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); - expect(result).toEqual(expectedDocuments); - }); - - it("should handle absolute URLs without modification", () => { - const webScraperDataProvider = new WebScraperDataProvider(); - const documents = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: "![alt text](https://example.com/image.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: - "![another alt text](http://anotherexample.com/another-image.png)", - }, - ]; - - const expectedDocuments = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: "![alt text](https://example.com/image.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: - "![another alt text](http://anotherexample.com/another-image.png)", - }, - ]; - - const result = - webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); - expect(result).toEqual(expectedDocuments); - }); - - it("should not replace non-image content within the documents", () => { - const webScraperDataProvider = new WebScraperDataProvider(); - const documents = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: - "This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: - "Another test. ![another alt text](./another-image.png) Here is some **bold text**.", - }, - ]; - - const expectedDocuments = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: - "This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: - "Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.", - }, - ]; - - const result = - webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); - expect(result).toEqual(expectedDocuments); - }); - it("should replace multiple image paths within the documents", () => { - const webScraperDataProvider = new WebScraperDataProvider(); - const documents = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: - "This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/image2.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: - "Another test. ![another alt text](./another-image1.png) Here is some **bold text**. ![another alt text](./another-image2.png)", - }, - ]; - - const expectedDocuments = [ - { - metadata: { sourceURL: "https://example.com/page" }, - content: - "This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/image2.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page" }, - content: - "Another test. ![another alt text](https://example.com/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-image2.png)", - }, - ]; - - const result = - webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); - expect(result).toEqual(expectedDocuments); - }); - - it("should replace image paths within the documents with complex URLs", () => { - const webScraperDataProvider = new WebScraperDataProvider(); - const documents = [ - { - metadata: { sourceURL: "https://example.com/page/subpage" }, - content: - "This is a test. ![alt text](/image1.png) Here is a link: [Example](https://example.com). ![alt text](/sub/image2.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page/subpage" }, - content: - "Another test. ![another alt text](/another-page/another-image1.png) Here is some **bold text**. ![another alt text](/another-page/sub/another-image2.png)", - }, - ]; - - const expectedDocuments = [ - { - metadata: { sourceURL: "https://example.com/page/subpage" }, - content: - "This is a test. ![alt text](https://example.com/image1.png) Here is a link: [Example](https://example.com). ![alt text](https://example.com/sub/image2.png)", - }, - { - metadata: { sourceURL: "https://example.com/another-page/subpage" }, - content: - "Another test. ![another alt text](https://example.com/another-page/another-image1.png) Here is some **bold text**. ![another alt text](https://example.com/another-page/sub/another-image2.png)", - }, - ]; - - const result = - webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents); - expect(result).toEqual(expectedDocuments); - }); - }); -}); diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 551c8d8..c2146be 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -6,6 +6,7 @@ import { WebCrawler } from "./crawler"; import { getValue, setValue } from "../../services/redis"; import { getImageDescription } from "./utils/gptVision"; import { fetchAndProcessPdf } from "./utils/pdfProcessor"; +import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths"; export class WebScraperDataProvider { @@ -19,6 +20,7 @@ export class WebScraperDataProvider { private concurrentRequests: number = 20; private generateImgAltText: boolean = false; private pageOptions?: PageOptions; + private replaceAllPathsWithAbsolutePaths?: boolean = false; authorize(): void { throw new Error("Method not implemented."); @@ -100,7 +102,13 @@ export class WebScraperDataProvider { let documents = await this.convertUrlsToDocuments(links, inProgress); documents = await this.getSitemapData(this.urls[0], documents); - documents = this.replaceImgPathsWithAbsolutePaths(documents); + + if (this.replaceAllPathsWithAbsolutePaths) { + documents = replacePathsWithAbsolutePaths(documents); + } else { + documents = replaceImgPathsWithAbsolutePaths(documents); + } + if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } @@ -164,7 +172,13 @@ export class WebScraperDataProvider { this.urls.filter((link) => !link.endsWith(".pdf")), inProgress ); - documents = this.replaceImgPathsWithAbsolutePaths(documents); + + if (this.replaceAllPathsWithAbsolutePaths) { + documents = replacePathsWithAbsolutePaths(documents); + } else { + documents = replaceImgPathsWithAbsolutePaths(documents); + } + if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } @@ -197,7 +211,13 @@ export class WebScraperDataProvider { ); documents = await this.getSitemapData(this.urls[0], documents); - documents = this.replaceImgPathsWithAbsolutePaths(documents); + + if (this.replaceAllPathsWithAbsolutePaths) { + documents = replacePathsWithAbsolutePaths(documents); + } else { + documents = replaceImgPathsWithAbsolutePaths(documents); + } + if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } @@ -351,6 +371,7 @@ export class WebScraperDataProvider { this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; this.pageOptions = options.pageOptions ?? {onlyMainContent: false}; + this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); @@ -436,40 +457,4 @@ export class WebScraperDataProvider { return documents; }; - - replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => { - try { - documents.forEach((document) => { - const baseUrl = new URL(document.metadata.sourceURL).origin; - const images = - document.content.match( - /!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g - ) || []; - - images.forEach((image: string) => { - let imageUrl = image.match(/\(([^)]+)\)/)[1]; - let altText = image.match(/\[(.*?)\]/)[1]; - - if (!imageUrl.startsWith("data:image")) { - if (!imageUrl.startsWith("http")) { - if (imageUrl.startsWith("/")) { - imageUrl = imageUrl.substring(1); - } - imageUrl = new URL(imageUrl, baseUrl).toString(); - } - } - - document.content = document.content.replace( - image, - `![${altText}](${imageUrl})` - ); - }); - }); - - return documents; - } catch (error) { - console.error("Error replacing img paths with absolute paths", error); - return documents; - } - }; } diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts index 7d25aec..f14c8d4 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts @@ -1,40 +1,47 @@ import * as pdfProcessor from '../pdfProcessor'; describe('PDF Processing Module - Integration Test', () => { - it('should download and read a simple PDF file by URL', async () => { + it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => { + delete process.env.LLAMAPARSE_API_KEY; const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf'); - expect(pdfContent).toEqual("Dummy PDF file"); + expect(pdfContent.trim()).toEqual("Dummy PDF file"); }); - it('should download and read a complex PDF file by URL', async () => { - const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf'); +// We're hitting the LLAMAPARSE rate limit 🫠 +// it('should download and read a simple PDF file by URL', async () => { +// const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf'); +// expect(pdfContent).toEqual("Dummy PDF file"); +// }); - const expectedContent = 'A Comprehensive Overview of Large Language Models\n' + - ' a a,∗ b,∗ c,d,∗ e,f e,f g,i\n' + - ' Humza Naveed , Asad Ullah Khan , Shi Qiu , Muhammad Saqib , Saeed Anwar , Muhammad Usman , Naveed Akhtar ,\n' + - ' Nick Barnes h, Ajmal Mian i\n' + - ' aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' + - ' bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' + - ' cUniversity of Technology Sydney (UTS), Sydney, Australia\n' + - ' dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' + - ' eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' + - ' fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' + - ' gThe University of Melbourne (UoM), Melbourne, Australia\n' + - ' hAustralian National University (ANU), Canberra, Australia\n' + - ' iThe University of Western Australia (UWA), Perth, Australia\n' + - ' Abstract\n' + - ' Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' + - ' beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' + - ' topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' + - ' robotics, datasets, benchmarking, efficiency, and more. With the rapid development of techniques and regular breakthroughs in\n' + - ' LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. Considering\n' + - ' the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' + - ' yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' + - ' on a broad range of LLM-related concepts. Our self-contained comprehensive overview of LLMs discusses relevant background\n' + - ' concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' + - ' provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' + - ' extensive informative summaries of the existing works to advance the LLM research.\n' - expect(pdfContent).toContain(expectedContent); - }, 60000); +// it('should download and read a complex PDF file by URL', async () => { +// const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf'); + +// const expectedContent = 'A Comprehensive Overview of Large Language Models\n' + +// ' a a,∗ b,∗ c,d,∗ e,f e,f g,i\n' + +// ' Humza Naveed , Asad Ullah Khan , Shi Qiu , Muhammad Saqib , Saeed Anwar , Muhammad Usman , Naveed Akhtar ,\n' + +// ' Nick Barnes h, Ajmal Mian i\n' + +// ' aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' + +// ' bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' + +// ' cUniversity of Technology Sydney (UTS), Sydney, Australia\n' + +// ' dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' + +// ' eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' + +// ' fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' + +// ' gThe University of Melbourne (UoM), Melbourne, Australia\n' + +// ' hAustralian National University (ANU), Canberra, Australia\n' + +// ' iThe University of Western Australia (UWA), Perth, Australia\n' + +// ' Abstract\n' + +// ' Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' + +// ' beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' + +// ' topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' + +// ' robotics, datasets, benchmarking, efficiency, and more. With the rapid development of techniques and regular breakthroughs in\n' + +// ' LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. Considering\n' + +// ' the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' + +// ' yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' + +// ' on a broad range of LLM-related concepts. Our self-contained comprehensive overview of LLMs discusses relevant background\n' + +// ' concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' + +// ' provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' + +// ' extensive informative summaries of the existing works to advance the LLM research.\n' +// expect(pdfContent).toContain(expectedContent); +// }, 60000); }); \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts new file mode 100644 index 0000000..aae567c --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts @@ -0,0 +1,114 @@ +import { Document } from "../../../../lib/entities"; +import { replacePathsWithAbsolutePaths, replaceImgPathsWithAbsolutePaths } from "../replacePaths"; + +describe('replacePaths', () => { + describe('replacePathsWithAbsolutePaths', () => { + it('should replace relative paths with absolute paths', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).' + }]; + + const expectedDocuments: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).' + }]; + + const result = replacePathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + + it('should not alter absolute URLs', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).' + }]; + + const result = replacePathsWithAbsolutePaths(documents); + expect(result).toEqual(documents); // Expect no change + }); + + it('should not alter data URLs for images', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'This is an image: ![alt text]().' + }]; + + const result = replacePathsWithAbsolutePaths(documents); + expect(result).toEqual(documents); // Expect no change + }); + + it('should handle multiple links and images correctly', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).' + }]; + + const expectedDocuments: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).' + }]; + + const result = replacePathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + + it('should correctly handle a mix of absolute and relative paths', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image]().' + }]; + + const expectedDocuments: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image]().' + }]; + + const result = replacePathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + + }); + + describe('replaceImgPathsWithAbsolutePaths', () => { + it('should replace relative image paths with absolute paths', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Here is an image: ![alt text](/path/to/image.jpg).' + }]; + + const expectedDocuments: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' + }]; + + const result = replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + + it('should not alter data:image URLs', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'An image with a data URL: ![alt text]().' + }]; + + const result = replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(documents); // Expect no change + }); + + it('should handle multiple images with a mix of data and relative URLs', () => { + const documents: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Multiple images: ![img1](/img1.jpg) ![img2]() ![img3](/img3.jpg).' + }]; + + const expectedDocuments: Document[] = [{ + metadata: { sourceURL: 'https://example.com' }, + content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2]() ![img3](https://example.com/img3.jpg).' + }]; + + const result = replaceImgPathsWithAbsolutePaths(documents); + expect(result).toEqual(expectedDocuments); + }); + }); +}); \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts new file mode 100644 index 0000000..d652611 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts @@ -0,0 +1,80 @@ +import { Document } from "../../../lib/entities"; + +export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => { + try { + documents.forEach((document) => { + const baseUrl = new URL(document.metadata.sourceURL).origin; + const paths = + document.content.match( + /(!?\[.*?\])\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)|href="([^"]+)"/g + ) || []; + + paths.forEach((path: string) => { + const isImage = path.startsWith("!"); + let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/); + let url = matchedUrl[1]; + + if (!url.startsWith("data:") && !url.startsWith("http")) { + if (url.startsWith("/")) { + url = url.substring(1); + } + url = new URL(url, baseUrl).toString(); + } + + const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0]; + if (isImage) { + document.content = document.content.replace( + path, + `${markdownLinkOrImageText}(${url})` + ); + } else { + document.content = document.content.replace( + path, + `${markdownLinkOrImageText}(${url})` + ); + } + }); + }); + + return documents; + } catch (error) { + console.error("Error replacing paths with absolute paths", error); + return documents; + } +}; + +export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => { + try { + documents.forEach((document) => { + const baseUrl = new URL(document.metadata.sourceURL).origin; + const images = + document.content.match( + /!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g + ) || []; + + images.forEach((image: string) => { + let imageUrl = image.match(/\(([^)]+)\)/)[1]; + let altText = image.match(/\[(.*?)\]/)[1]; + + if (!imageUrl.startsWith("data:image")) { + if (!imageUrl.startsWith("http")) { + if (imageUrl.startsWith("/")) { + imageUrl = imageUrl.substring(1); + } + imageUrl = new URL(imageUrl, baseUrl).toString(); + } + } + + document.content = document.content.replace( + image, + `![${altText}](${imageUrl})` + ); + }); + }); + + return documents; + } catch (error) { + console.error("Error replacing img paths with absolute paths", error); + return documents; + } +}; \ No newline at end of file diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index f3a971a..c9c5f73 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -3,7 +3,6 @@ import { getWebScraperQueue } from "./queue-service"; import "dotenv/config"; import { logtail } from "./logtail"; import { startWebScraperPipeline } from "../main/runWebScraper"; -import { WebScraperDataProvider } from "../scraper/WebScraper"; import { callWebhook } from "./webhook"; getWebScraperQueue().process( From 3ddff62a56d8201fb907b09dbb5e41b57f458623 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 19 Apr 2024 14:49:35 -0300 Subject: [PATCH 2/3] adding better doc and types for js-sdk --- apps/js-sdk/firecrawl/src/index.ts | 108 +++++++++++++++++++++++++++-- 1 file changed, 101 insertions(+), 7 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 3d105e7..be55066 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -2,17 +2,60 @@ import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios'; import dotenv from 'dotenv'; dotenv.config(); -interface FirecrawlAppConfig { +/** + * Configuration interface for FirecrawlApp. + */ +export interface FirecrawlAppConfig { apiKey?: string | null; } -interface Params { +/** + * Generic parameter interface. + */ +export interface Params { [key: string]: any; } +/** + * Response interface for scraping operations. + */ +export interface ScrapeResponse { + success: boolean; + data?: any; + error?: string; +} + +/** + * Response interface for crawling operations. + */ +export interface CrawlResponse { + success: boolean; + jobId?: string; + data?: any; + error?: string; +} + +/** + * Response interface for job status checks. + */ +export interface JobStatusResponse { + success: boolean; + status: string; + jobId?: string; + data?: any; + error?: string; +} + +/** + * Main class for interacting with the Firecrawl API. + */ export default class FirecrawlApp { private apiKey: string; + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ constructor({ apiKey = null }: FirecrawlAppConfig) { this.apiKey = apiKey || process.env.FIRECRAWL_API_KEY || ''; if (!this.apiKey) { @@ -20,7 +63,13 @@ export default class FirecrawlApp { } } - async scrapeUrl(url: string, params: Params | null = null): Promise { + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ + async scrapeUrl(url: string, params: Params | null = null): Promise { const headers: AxiosRequestHeaders = { 'Content-Type': 'application/json', 'Authorization': `Bearer ${this.apiKey}`, @@ -34,7 +83,7 @@ export default class FirecrawlApp { if (response.status === 200) { const responseData = response.data; if (responseData.success) { - return responseData.data; + return responseData; } else { throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); } @@ -44,9 +93,18 @@ export default class FirecrawlApp { } catch (error: any) { throw new Error(error.message); } + return { success: false, error: 'Internal server error.' }; } - async crawlUrl(url: string, params: Params | null = null, waitUntilDone: boolean = true, timeout: number = 2): Promise { + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The response from the crawl operation. + */ + async crawlUrl(url: string, params: Params | null = null, waitUntilDone: boolean = true, timeout: number = 2): Promise { const headers = this.prepareHeaders(); let jsonData: Params = { url }; if (params) { @@ -59,7 +117,7 @@ export default class FirecrawlApp { if (waitUntilDone) { return this.monitorJobStatus(jobId, headers, timeout); } else { - return { jobId }; + return { success: true, jobId }; } } else { this.handleError(response, 'start crawl job'); @@ -68,9 +126,15 @@ export default class FirecrawlApp { console.log(error) throw new Error(error.message); } + return { success: false, error: 'Internal server error.' }; } - async checkCrawlStatus(jobId: string): Promise { + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + async checkCrawlStatus(jobId: string): Promise { const headers: AxiosRequestHeaders = this.prepareHeaders(); try { const response: AxiosResponse = await this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers); @@ -82,8 +146,13 @@ export default class FirecrawlApp { } catch (error: any) { throw new Error(error.message); } + return { success: false, status: 'unknown', error: 'Internal server error.' }; } + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ prepareHeaders(): AxiosRequestHeaders { return { 'Content-Type': 'application/json', @@ -91,14 +160,34 @@ export default class FirecrawlApp { } as AxiosRequestHeaders; } + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise { return axios.post(url, data, { headers }); } + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ getRequest(url: string, headers: AxiosRequestHeaders): Promise { return axios.get(url, { headers }); } + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ async monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, timeout: number): Promise { while (true) { const statusResponse: AxiosResponse = await this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers); @@ -124,6 +213,11 @@ export default class FirecrawlApp { } } + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ handleError(response: AxiosResponse, action: string): void { if ([402, 409, 500].includes(response.status)) { const errorMessage: string = response.data.error || 'Unknown error occurred'; From 384fb1db1868bf2e3e2bf9c5c1e105216faa5ae8 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 19 Apr 2024 15:27:54 -0300 Subject: [PATCH 3/3] updating version --- apps/js-sdk/firecrawl/build/index.js | 62 +++++++++++++++++++++++++++- apps/js-sdk/firecrawl/package.json | 2 +- 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/index.js index be4223f..25ae999 100644 --- a/apps/js-sdk/firecrawl/build/index.js +++ b/apps/js-sdk/firecrawl/build/index.js @@ -10,13 +10,26 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge import axios from 'axios'; import dotenv from 'dotenv'; dotenv.config(); +/** + * Main class for interacting with the Firecrawl API. + */ export default class FirecrawlApp { + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ constructor({ apiKey = null }) { this.apiKey = apiKey || process.env.FIRECRAWL_API_KEY || ''; if (!this.apiKey) { throw new Error('No API key provided'); } } + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ scrapeUrl(url_1) { return __awaiter(this, arguments, void 0, function* (url, params = null) { const headers = { @@ -32,7 +45,7 @@ export default class FirecrawlApp { if (response.status === 200) { const responseData = response.data; if (responseData.success) { - return responseData.data; + return responseData; } else { throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); @@ -45,8 +58,17 @@ export default class FirecrawlApp { catch (error) { throw new Error(error.message); } + return { success: false, error: 'Internal server error.' }; }); } + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The response from the crawl operation. + */ crawlUrl(url_1) { return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, timeout = 2) { const headers = this.prepareHeaders(); @@ -62,7 +84,7 @@ export default class FirecrawlApp { return this.monitorJobStatus(jobId, headers, timeout); } else { - return { jobId }; + return { success: true, jobId }; } } else { @@ -73,8 +95,14 @@ export default class FirecrawlApp { console.log(error); throw new Error(error.message); } + return { success: false, error: 'Internal server error.' }; }); } + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ checkCrawlStatus(jobId) { return __awaiter(this, void 0, void 0, function* () { const headers = this.prepareHeaders(); @@ -90,20 +118,45 @@ export default class FirecrawlApp { catch (error) { throw new Error(error.message); } + return { success: false, status: 'unknown', error: 'Internal server error.' }; }); } + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ prepareHeaders() { return { 'Content-Type': 'application/json', 'Authorization': `Bearer ${this.apiKey}`, }; } + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ postRequest(url, data, headers) { return axios.post(url, data, { headers }); } + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ getRequest(url, headers) { return axios.get(url, { headers }); } + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ monitorJobStatus(jobId, headers, timeout) { return __awaiter(this, void 0, void 0, function* () { while (true) { @@ -134,6 +187,11 @@ export default class FirecrawlApp { } }); } + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ handleError(response, action) { if ([402, 409, 500].includes(response.status)) { const errorMessage = response.data.error || 'Unknown error occurred'; diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 89e6d3f..58aa5ac 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.9", + "version": "0.0.10", "description": "JavaScript SDK for Firecrawl API", "main": "build/index.js", "type": "module",