Reviewer-amended patch for WebScraperDataProvider.replaceImgPathsWithAbsolutePaths:
  * a leading "/" is no longer stripped before URL resolution, so protocol-relative
    targets ("//host/path") keep their own host;
  * only targets with a real http(s):// scheme are treated as already absolute;
  * a regression test covers both cases.
Indentation was reconstructed from a whitespace-collapsed copy, and the post-image
blob ids on the "index" lines predate this amendment; apply with
`git apply --ignore-whitespace` if the context does not match byte-for-byte.

diff --git a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts
new file mode 100644
index 0000000..e060d16
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts
@@ -0,0 +1,129 @@
+import { WebScraperDataProvider } from '../index';
+
+describe('WebScraperDataProvider', () => {
+  // Markdown image targets are rewritten to absolute URLs based on the origin of
+  // each document's metadata.sourceURL; data: URIs and absolute URLs stay as-is.
+  describe('replaceImgPathsWithAbsolutePaths', () => {
+    it('should replace image paths with absolute paths', () => {
+      const webScraperDataProvider = new WebScraperDataProvider();
+      const documents = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![alt text](/image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: '![another alt text](./another-image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/data-image' },
+          content: '![data image](data:image/png;base64,...)',
+        }
+      ];
+
+      const expectedDocuments = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![alt text](https://example.com/image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: '![another alt text](https://example.com/another-image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/data-image' },
+          content: '![data image](data:image/png;base64,...)',
+        }
+      ];
+
+      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+
+    it('should handle absolute URLs without modification', () => {
+      const webScraperDataProvider = new WebScraperDataProvider();
+      const documents = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![alt text](https://example.com/image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: '![another alt text](http://anotherexample.com/another-image.png)',
+        }
+      ];
+
+      const expectedDocuments = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![alt text](https://example.com/image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: '![another alt text](http://anotherexample.com/another-image.png)',
+        }
+      ];
+
+      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+
+    it('should not replace non-image content within the documents', () => {
+      const webScraperDataProvider = new WebScraperDataProvider();
+      const documents = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: 'This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: 'Another test. ![another alt text](./another-image.png) Here is some **bold text**.',
+        }
+      ];
+
+      const expectedDocuments = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: 'This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: 'Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.',
+        }
+      ];
+
+      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+
+    // Regression: "//host/path" must keep its host, and a relative path that
+    // merely begins with the letters "http" must still be resolved.
+    it('should resolve protocol-relative and http-prefixed relative paths', () => {
+      const webScraperDataProvider = new WebScraperDataProvider();
+      const documents = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![cdn image](//cdn.example.com/image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: '![relative image](httpdocs/image.png)',
+        }
+      ];
+
+      const expectedDocuments = [
+        {
+          metadata: { sourceURL: 'https://example.com/page' },
+          content: '![cdn image](https://cdn.example.com/image.png)',
+        },
+        {
+          metadata: { sourceURL: 'https://example.com/another-page' },
+          content: '![relative image](https://example.com/httpdocs/image.png)',
+        }
+      ];
+
+      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
+      expect(result).toEqual(expectedDocuments);
+    });
+  });
+});
\ No newline at end of file
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 6f368a1..727b597 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -325,19 +325,26 @@ export class WebScraperDataProvider {
     documents.forEach(document => {
       const baseUrl = new URL(document.metadata.sourceURL).origin;
       const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
-      
+
       images.forEach(image => {
         let imageUrl = image.match(/\(([^)]+)\)/)[1];
         let altText = image.match(/\[(.*?)\]/)[1];
-        
+
+        // data: URIs are self-contained and URLs that already carry an
+        // http(s) scheme are absolute, so both are left exactly as written.
         if (!imageUrl.startsWith("data:image")) {
-          imageUrl = baseUrl + imageUrl;
+          if (!/^https?:\/\//i.test(imageUrl)) {
+            // Resolve against the page origin. The leading "/" is kept on
+            // purpose: stripping it would turn a protocol-relative
+            // "//cdn.host/img.png" into a path on the page's own host.
+            imageUrl = new URL(imageUrl, baseUrl).toString();
+          }
         }
-        
+
         document.content = document.content.replace(image, `![${altText}](${imageUrl})`);
       });
     });
-    
+
     return documents;
   }
 }