adding unit tests and bugfixing
This commit is contained in:
parent
d23a7ae591
commit
b375ce3e39
97
apps/api/src/scraper/WebScraper/__tests__/index.test.ts
Normal file
97
apps/api/src/scraper/WebScraper/__tests__/index.test.ts
Normal file
@ -0,0 +1,97 @@
|
|||||||
|
import { WebScraperDataProvider } from '../index';
|
||||||
|
|
||||||
|
// Unit tests for WebScraperDataProvider.replaceImgPathsWithAbsolutePaths.
// Each test lists its fixtures as rows holding a source URL plus the markdown
// content before and after the call; the input and expected document arrays
// are derived from those rows so every pair stays visibly aligned.
describe('WebScraperDataProvider', () => {
  describe('replaceImgPathsWithAbsolutePaths', () => {
    it('should replace image paths with absolute paths', () => {
      // Root-relative path, dot-relative path, and an inline data URI
      // (the data URI row expects its content to come back unchanged).
      const rows = [
        {
          sourceURL: 'https://example.com/page',
          before: '![alt text](/image.png)',
          after: '![alt text](https://example.com/image.png)',
        },
        {
          sourceURL: 'https://example.com/another-page',
          before: '![another alt text](./another-image.png)',
          after: '![another alt text](https://example.com/another-image.png)',
        },
        {
          sourceURL: 'https://example.com/data-image',
          before: '![data image](data:image/png;base64,...)',
          after: '![data image](data:image/png;base64,...)',
        },
      ];
      const input = rows.map(({ sourceURL, before }) => ({
        metadata: { sourceURL },
        content: before,
      }));
      const expected = rows.map(({ sourceURL, after }) => ({
        metadata: { sourceURL },
        content: after,
      }));

      const provider = new WebScraperDataProvider();
      const actual = provider.replaceImgPathsWithAbsolutePaths(input);

      expect(actual).toEqual(expected);
    });

    it('should handle absolute URLs without modification', () => {
      // Image URLs that are already absolute (https and http) are expected
      // to be returned exactly as given.
      const rows = [
        {
          sourceURL: 'https://example.com/page',
          before: '![alt text](https://example.com/image.png)',
          after: '![alt text](https://example.com/image.png)',
        },
        {
          sourceURL: 'https://example.com/another-page',
          before: '![another alt text](http://anotherexample.com/another-image.png)',
          after: '![another alt text](http://anotherexample.com/another-image.png)',
        },
      ];
      const input = rows.map(({ sourceURL, before }) => ({
        metadata: { sourceURL },
        content: before,
      }));
      const expected = rows.map(({ sourceURL, after }) => ({
        metadata: { sourceURL },
        content: after,
      }));

      const provider = new WebScraperDataProvider();
      const actual = provider.replaceImgPathsWithAbsolutePaths(input);

      expect(actual).toEqual(expected);
    });

    it('should not replace non-image content within the documents', () => {
      // Surrounding prose, a regular markdown link, and bold text are expected
      // to survive untouched; only the image target changes.
      const rows = [
        {
          sourceURL: 'https://example.com/page',
          before: 'This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).',
          after: 'This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).',
        },
        {
          sourceURL: 'https://example.com/another-page',
          before: 'Another test. ![another alt text](./another-image.png) Here is some **bold text**.',
          after: 'Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.',
        },
      ];
      const input = rows.map(({ sourceURL, before }) => ({
        metadata: { sourceURL },
        content: before,
      }));
      const expected = rows.map(({ sourceURL, after }) => ({
        metadata: { sourceURL },
        content: after,
      }));

      const provider = new WebScraperDataProvider();
      const actual = provider.replaceImgPathsWithAbsolutePaths(input);

      expect(actual).toEqual(expected);
    });
  });
});
|
@ -325,19 +325,24 @@ export class WebScraperDataProvider {
|
|||||||
documents.forEach(document => {
|
documents.forEach(document => {
|
||||||
const baseUrl = new URL(document.metadata.sourceURL).origin;
|
const baseUrl = new URL(document.metadata.sourceURL).origin;
|
||||||
const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
|
const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
|
||||||
|
|
||||||
images.forEach(image => {
|
images.forEach(image => {
|
||||||
let imageUrl = image.match(/\(([^)]+)\)/)[1];
|
let imageUrl = image.match(/\(([^)]+)\)/)[1];
|
||||||
let altText = image.match(/\[(.*?)\]/)[1];
|
let altText = image.match(/\[(.*?)\]/)[1];
|
||||||
|
|
||||||
if (!imageUrl.startsWith("data:image")) {
|
if (!imageUrl.startsWith("data:image")) {
|
||||||
imageUrl = baseUrl + imageUrl;
|
if (!imageUrl.startsWith("http")) {
|
||||||
|
if (imageUrl.startsWith("/")) {
|
||||||
|
imageUrl = imageUrl.substring(1);
|
||||||
|
}
|
||||||
|
imageUrl = new URL(imageUrl, baseUrl).toString();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
document.content = document.content.replace(image, `![${altText}](${imageUrl})`);
|
document.content = document.content.replace(image, `![${altText}](${imageUrl})`);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
return documents;
|
return documents;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user