From b375ce3e39df3ce0a44bf1778ca389b0fe04bdf2 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 17 Apr 2024 14:54:54 -0300
Subject: [PATCH] adding unit tests and bugfixing

---
Review notes (free text between "---" and the diff is not part of the commit
message):

* This copy was rebuilt from a version whose newlines had been collapsed to
  spaces. Line breaks were restored from the "+"/"-" markers and agree with
  the original hunk sizes (97 added lines in the test file, 19 -> 24 in
  index.ts). Indentation, and the trailing whitespace on the four removed
  blank lines, could not be recovered and are a best guess -- if the
  index.ts hunk does not match, try `git apply --ignore-whitespace`
  (TODO confirm against the real pre-image).
* Revised relative to the original commit:
  - index.ts no longer strips a leading "/" before calling
    new URL(imageUrl, baseUrl). baseUrl is an origin, so "/x" and "x"
    already resolve to the same URL; the strip only had an effect on
    protocol-relative paths, turning "//cdn.host/x.png" into
    "https://<origin>/cdn.host/x.png".
  - The "already absolute" check now requires "http://" or "https://"
    instead of any string starting with "http", so a relative path such as
    "http-assets/logo.png" is still resolved.
  - Added a regression test for protocol-relative image URLs; the test file
    now ends with a newline.
  - Hunk headers and the diffstat below were recomputed for these changes.
    The "index" blob ids are carried over from the original patch and are
    therefore stale.
* Not addressed here because the code sits in unchanged context lines:
  - NOTE(review): relative paths are resolved against the page origin, not
    the page URL, so "./img.png" on https://host/docs/page resolves to
    https://host/img.png -- confirm this is intended.
  - NOTE(review): `image.match(/\(([^)]+)\)/)[1]` throws a TypeError for an
    image with an empty target such as `![alt]()`, which the outer regex
    does match.

 .../WebScraper/__tests__/index.test.ts        | 117 ++++++++++++++++++
 apps/api/src/scraper/WebScraper/index.ts      |  14 ++-
 2 files changed, 126 insertions(+), 5 deletions(-)
 create mode 100644 apps/api/src/scraper/WebScraper/__tests__/index.test.ts

diff --git a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts
new file mode 100644
index 0000000..e060d16
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts
@@ -0,0 +1,117 @@
+import { WebScraperDataProvider } from '../index';
+
+describe('WebScraperDataProvider', () => {
+  describe('replaceImgPathsWithAbsolutePaths', () => {
+    it('should replace image paths with absolute paths', () => {
+      const webScraperDataProvider = new WebScraperDataProvider();
+      const documents = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![alt text](/image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: '![another alt text](./another-image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/data-image' },
+          content: '![data image](data:image/png;base64,...)',
+        }
+      ];
+
+      const expectedDocuments = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![alt text](https://example.com/image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: '![another alt text](https://example.com/another-image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/data-image' },
+          content: '![data image](data:image/png;base64,...)',
+        }
+      ];
+
+      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+
+    it('should handle absolute URLs without modification', () => {
+      const webScraperDataProvider = new WebScraperDataProvider();
+      const documents = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![alt text](https://example.com/image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: '![another alt text](http://anotherexample.com/another-image.png)',
+        }
+      ];
+
+      const expectedDocuments = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![alt text](https://example.com/image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: '![another alt text](http://anotherexample.com/another-image.png)',
+        }
+      ];
+
+      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+
+    it('should not replace non-image content within the documents', () => {
+      const webScraperDataProvider = new WebScraperDataProvider();
+      const documents = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: 'This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: 'Another test. ![another alt text](./another-image.png) Here is some **bold text**.',
+        }
+      ];
+
+      const expectedDocuments = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: 'This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: 'Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.',
+        }
+      ];
+
+      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+
+    it('should resolve protocol-relative image URLs against the page scheme', () => {
+      const webScraperDataProvider = new WebScraperDataProvider();
+      const documents = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![cdn image](//cdn.example.com/image.png)',
+        }
+      ];
+
+      const expectedDocuments = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![cdn image](https://cdn.example.com/image.png)',
+        }
+      ];
+
+      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+  });
+});
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 6f368a1..727b597 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -325,19 +325,23 @@ export class WebScraperDataProvider {
     documents.forEach(document => {
       const baseUrl = new URL(document.metadata.sourceURL).origin;
       const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
-      
+
       images.forEach(image => {
         let imageUrl = image.match(/\(([^)]+)\)/)[1];
         let altText = image.match(/\[(.*?)\]/)[1];
-        
+
         if (!imageUrl.startsWith("data:image")) {
-          imageUrl = baseUrl + imageUrl;
+          if (!/^https?:\/\//i.test(imageUrl)) {
+            // new URL() resolves "/x", "./x", "x" and protocol-relative
+            // "//host/x" against the page origin; no manual trimming needed.
+            imageUrl = new URL(imageUrl, baseUrl).toString();
+          }
         }
-        
+
         document.content = document.content.replace(image, `![${altText}](${imageUrl})`);
       });
     });
-    
+
     return documents;
   }
 }