From 520739c9f44b77d94288f3ea9e0433330ae1bc12 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 12:43:16 -0700 Subject: [PATCH 1/3] Nick: fixed bugs associated with absolute path replacements --- apps/api/openapi.json | 10 +++++----- apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/index.ts | 11 +++++----- .../scraper/WebScraper/utils/replacePaths.ts | 20 +++++++++++-------- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 7147af1..a755e37 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -190,11 +190,6 @@ "description": "Ignore the website sitemap when crawling", "default": false }, - "replaceAllPathsWithAbsolutePaths": { - "type": "boolean", - "description": "Replace all relative paths with absolute paths for images and links", - "default": false - }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl", @@ -223,6 +218,11 @@ "headers": { "type": "object", "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc." 
+ }, + "replaceAllPathsWithAbsolutePaths": { + "type": "boolean", + "description": "Replace all relative paths with absolute paths for images and links", + "default": false } } } diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 744c07b..d5002c7 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -18,6 +18,7 @@ export type PageOptions = { waitFor?: number; screenshot?: boolean; headers?: Record; + replaceAllPathsWithAbsolutePaths?: boolean; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 7dcd175..54897f1 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -302,9 +302,10 @@ export class WebScraperDataProvider { } private applyPathReplacements(documents: Document[]): Document[] { - return this.replaceAllPathsWithAbsolutePaths - ? replacePathsWithAbsolutePaths(documents) - : replaceImgPathsWithAbsolutePaths(documents); + if (this.replaceAllPathsWithAbsolutePaths) { + documents = replacePathsWithAbsolutePaths(documents); + } + return replaceImgPathsWithAbsolutePaths(documents); } private async applyImgAltText(documents: Document[]): Promise { @@ -473,9 +474,9 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false }; + this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false }; this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} - this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; + this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? 
options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); this.crawlerMode = options.crawlerOptions?.mode ?? "default"; diff --git a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts index d652611..788916c 100644 --- a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts +++ b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts @@ -10,7 +10,8 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] ) || []; paths.forEach((path: string) => { - const isImage = path.startsWith("!"); + try { + const isImage = path.startsWith("!"); let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/); let url = matchedUrl[1]; @@ -22,18 +23,18 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] } const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0]; - if (isImage) { - document.content = document.content.replace( - path, - `${markdownLinkOrImageText}(${url})` - ); - } else { + // Image is handled afterwards + if (!isImage) { document.content = document.content.replace( path, `${markdownLinkOrImageText}(${url})` ); + } + } catch (error) { + } }); + document.markdown = document.content; }); return documents; @@ -60,8 +61,10 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen if (!imageUrl.startsWith("http")) { if (imageUrl.startsWith("/")) { imageUrl = imageUrl.substring(1); + imageUrl = new URL(imageUrl, baseUrl).toString(); + } else { + imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString(); } - imageUrl = new URL(imageUrl, baseUrl).toString(); } } @@ -70,6 +73,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen `![${altText}](${imageUrl})` ); }); + 
document.markdown = document.content; }); return documents; From 2239e03269ec8ef3c3dba2596ac8994fa4562b05 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 12:54:02 -0700 Subject: [PATCH 2/3] Update replacePaths.test.ts --- .../WebScraper/utils/__tests__/replacePaths.test.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts index aae567c..6ecd990 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts @@ -6,12 +6,12 @@ describe('replacePaths', () => { it('should replace relative paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).' + content: 'This is a [link](/path/to/resource).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).' + content: 'This is a [link](https://example.com/path/to/resource).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -21,7 +21,7 @@ describe('replacePaths', () => { it('should not alter absolute URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).' + content: 'This is an [external link](https://external.com/path).' 
}]; const result = replacePathsWithAbsolutePaths(documents); @@ -41,12 +41,12 @@ describe('replacePaths', () => { it('should handle multiple links and images correctly', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).' + content: 'Here are two links: [link1](/path1) and [link2](/path2).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).' + content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).' }]; const result = replacePathsWithAbsolutePaths(documents); From 1e3e06a1d57bffdafb7f562ca9fd5a4cb15ad05f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 11 Jun 2024 13:02:39 -0700 Subject: [PATCH 3/3] Update replacePaths.test.ts --- .../utils/__tests__/replacePaths.test.ts | 39 ++++++++++++------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts index 6ecd990..e201926 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts @@ -6,12 +6,14 @@ describe('replacePaths', () => { it('should replace relative paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](/path/to/resource).' + content: 'This is a [link](/path/to/resource).', + markdown: 'This is a [link](/path/to/resource).' 
}]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is a [link](https://example.com/path/to/resource).' + content: 'This is a [link](https://example.com/path/to/resource).', + markdown: 'This is a [link](https://example.com/path/to/resource).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -21,7 +23,8 @@ describe('replacePaths', () => { it('should not alter absolute URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an [external link](https://external.com/path).' + content: 'This is an [external link](https://external.com/path).', + markdown: 'This is an [external link](https://external.com/path).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -31,7 +34,8 @@ describe('replacePaths', () => { it('should not alter data URLs for images', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).' + content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).', + markdown: 'This is an image: ![alt text](data:image/png;base64,ABC123==).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -41,12 +45,14 @@ describe('replacePaths', () => { it('should handle multiple links and images correctly', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](/path1) and [link2](/path2).' + content: 'Here are two links: [link1](/path1) and [link2](/path2).', + markdown: 'Here are two links: [link1](/path1) and [link2](/path2).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).' 
+ content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).', + markdown: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -56,12 +62,14 @@ describe('replacePaths', () => { it('should correctly handle a mix of absolute and relative paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' + content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).', + markdown: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' + content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).', + markdown: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' }]; const result = replacePathsWithAbsolutePaths(documents); @@ -74,12 +82,14 @@ describe('replacePaths', () => { it('should replace relative image paths with absolute paths', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here is an image: ![alt text](/path/to/image.jpg).' + content: 'Here is an image: ![alt text](/path/to/image.jpg).', + markdown: 'Here is an image: ![alt text](/path/to/image.jpg).' 
}]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' + content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).', + markdown: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' }]; const result = replaceImgPathsWithAbsolutePaths(documents); @@ -89,7 +99,8 @@ describe('replacePaths', () => { it('should not alter data:image URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).' + content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).', + markdown: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).' }]; const result = replaceImgPathsWithAbsolutePaths(documents); @@ -99,12 +110,14 @@ describe('replacePaths', () => { it('should handle multiple images with a mix of data and relative URLs', () => { const documents: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).' + content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).', + markdown: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).' }]; const expectedDocuments: Document[] = [{ metadata: { sourceURL: 'https://example.com' }, - content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).' + content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).', + markdown: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).' 
}]; const result = replaceImgPathsWithAbsolutePaths(documents);