adding unit tests and bugfixing
This commit is contained in:
parent
d23a7ae591
commit
b375ce3e39
97
apps/api/src/scraper/WebScraper/__tests__/index.test.ts
Normal file
97
apps/api/src/scraper/WebScraper/__tests__/index.test.ts
Normal file
@@ -0,0 +1,97 @@
|
||||
import { WebScraperDataProvider } from '../index';
|
||||
|
||||
// Jest suite for WebScraperDataProvider.replaceImgPathsWithAbsolutePaths.
// Each case builds an array of documents ({ metadata.sourceURL, content }),
// passes it to the method, and deep-compares the return value with a
// hand-written expected array via toEqual.
// NOTE(review): the image markup inside the `content` literals below looks like
// it was rewritten into a placeholder form by whatever captured this page
// (presumably the originals were markdown image tags) — confirm the real
// fixtures against the repository before relying on these values.
describe('WebScraperDataProvider', () => {
|
||||
describe('replaceImgPathsWithAbsolutePaths', () => {
|
||||
// Case 1: three documents. The first two are expected to come back with a
// different image reference than they went in with; the third
// (sourceURL .../data-image) is expected to come back unchanged.
it('should replace image paths with absolute paths', () => {
|
||||
const webScraperDataProvider = new WebScraperDataProvider();
|
||||
const documents = [
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/ec7af/ec7af62cf2ffefaa68508df145b694a439e95886" alt="alt text"',
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/another-page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/656cc/656ccea391b3bf2d7b0fb4037dd474a3a271fbef" alt="another alt text"',
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/data-image' },
|
||||
content: 'data:image/s3,"s3://crabby-images/17cfb/17cfbc2c71e12b66a7f8b84ce611d4a6429b15f7" alt="data image"',
|
||||
}
|
||||
];
|
||||
|
||||
const expectedDocuments = [
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/01a7d/01a7df95a5de36522e05107d2baaaafcfec0f578" alt="alt text"',
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/another-page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/85143/8514392dec916c98e200a7c57307941fbdd303b6" alt="another alt text"',
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/data-image' },
|
||||
content: 'data:image/s3,"s3://crabby-images/17cfb/17cfbc2c71e12b66a7f8b84ce611d4a6429b15f7" alt="data image"',
|
||||
}
|
||||
];
|
||||
|
||||
const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(expectedDocuments);
|
||||
});
|
||||
|
||||
// Case 2: two documents whose expected content is identical to the input
// content, i.e. the method must leave them untouched.
it('should handle absolute URLs without modification', () => {
|
||||
const webScraperDataProvider = new WebScraperDataProvider();
|
||||
const documents = [
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/01a7d/01a7df95a5de36522e05107d2baaaafcfec0f578" alt="alt text"',
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/another-page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/eb36c/eb36cb4d8e1e42cffdefe36b705625ee7923ef3c" alt="another alt text"',
|
||||
}
|
||||
];
|
||||
|
||||
const expectedDocuments = [
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/01a7d/01a7df95a5de36522e05107d2baaaafcfec0f578" alt="alt text"',
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/another-page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/eb36c/eb36cb4d8e1e42cffdefe36b705625ee7923ef3c" alt="another alt text"',
|
||||
}
|
||||
];
|
||||
|
||||
const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(expectedDocuments);
|
||||
});
|
||||
|
||||
// Case 3: the image reference is embedded in surrounding text (a sentence, a
// markdown link, bold text). Only the image segment differs between input and
// expected output; the surrounding text is expected to be preserved verbatim.
it('should not replace non-image content within the documents', () => {
|
||||
const webScraperDataProvider = new WebScraperDataProvider();
|
||||
const documents = [
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/page' },
|
||||
content: 'This is a test. data:image/s3,"s3://crabby-images/ec7af/ec7af62cf2ffefaa68508df145b694a439e95886" alt="alt text" Here is a link: [Example](https://example.com).',
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/another-page' },
|
||||
content: 'Another test. data:image/s3,"s3://crabby-images/656cc/656ccea391b3bf2d7b0fb4037dd474a3a271fbef" alt="another alt text" Here is some **bold text**.',
|
||||
}
|
||||
];
|
||||
|
||||
const expectedDocuments = [
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/page' },
|
||||
content: 'This is a test. data:image/s3,"s3://crabby-images/01a7d/01a7df95a5de36522e05107d2baaaafcfec0f578" alt="alt text" Here is a link: [Example](https://example.com).',
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/another-page' },
|
||||
content: 'Another test. data:image/s3,"s3://crabby-images/85143/8514392dec916c98e200a7c57307941fbdd303b6" alt="another alt text" Here is some **bold text**.',
|
||||
}
|
||||
];
|
||||
|
||||
const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(expectedDocuments);
|
||||
});
|
||||
});
|
||||
});
|
@@ -325,19 +325,24 @@ export class WebScraperDataProvider {
|
||||
documents.forEach(document => {
|
||||
const baseUrl = new URL(document.metadata.sourceURL).origin;
|
||||
const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
|
||||
|
||||
|
||||
images.forEach(image => {
|
||||
let imageUrl = image.match(/\(([^)]+)\)/)[1];
|
||||
let altText = image.match(/\[(.*?)\]/)[1];
|
||||
|
||||
|
||||
if (!imageUrl.startsWith("data:image")) {
|
||||
imageUrl = baseUrl + imageUrl;
|
||||
if (!imageUrl.startsWith("http")) {
|
||||
if (imageUrl.startsWith("/")) {
|
||||
imageUrl = imageUrl.substring(1);
|
||||
}
|
||||
imageUrl = new URL(imageUrl, baseUrl).toString();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
document.content = document.content.replace(image, `data:image/s3,"s3://crabby-images/e8b4e/e8b4ee99a66003a6d3ce2217d76c453d92d078bd" alt="${altText}"`);
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
return documents;
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user