Update index.test.ts
This commit is contained in:
parent
60245343c9
commit
2eb81545fa
@ -1,97 +1,171 @@
|
||||
import { WebScraperDataProvider } from '../index';
|
||||
import { WebScraperDataProvider } from "../index";
|
||||
|
||||
describe('WebScraperDataProvider', () => {
|
||||
describe('replaceImgPathsWithAbsolutePaths', () => {
|
||||
it('should replace image paths with absolute paths', () => {
|
||||
describe("WebScraperDataProvider", () => {
|
||||
describe("replaceImgPathsWithAbsolutePaths", () => {
|
||||
it("should replace image paths with absolute paths", () => {
|
||||
const webScraperDataProvider = new WebScraperDataProvider();
|
||||
const documents = [
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/ec7af/ec7af62cf2ffefaa68508df145b694a439e95886" alt="alt text"',
|
||||
metadata: { sourceURL: "https://example.com/page" },
|
||||
content: "data:image/s3,"s3://crabby-images/ec7af/ec7af62cf2ffefaa68508df145b694a439e95886" alt="alt text"",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/another-page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/656cc/656ccea391b3bf2d7b0fb4037dd474a3a271fbef" alt="another alt text"',
|
||||
metadata: { sourceURL: "https://example.com/another-page" },
|
||||
content: "data:image/s3,"s3://crabby-images/656cc/656ccea391b3bf2d7b0fb4037dd474a3a271fbef" alt="another alt text"",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/data-image' },
|
||||
content: 'data:image/s3,"s3://crabby-images/17cfb/17cfbc2c71e12b66a7f8b84ce611d4a6429b15f7" alt="data image"',
|
||||
}
|
||||
metadata: { sourceURL: "https://example.com/data-image" },
|
||||
content: "data:image/s3,"s3://crabby-images/17cfb/17cfbc2c71e12b66a7f8b84ce611d4a6429b15f7" alt="data image"",
|
||||
},
|
||||
];
|
||||
|
||||
const expectedDocuments = [
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/01a7d/01a7df95a5de36522e05107d2baaaafcfec0f578" alt="alt text"',
|
||||
metadata: { sourceURL: "https://example.com/page" },
|
||||
content: "data:image/s3,"s3://crabby-images/01a7d/01a7df95a5de36522e05107d2baaaafcfec0f578" alt="alt text"",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/another-page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/85143/8514392dec916c98e200a7c57307941fbdd303b6" alt="another alt text"',
|
||||
metadata: { sourceURL: "https://example.com/another-page" },
|
||||
content: "data:image/s3,"s3://crabby-images/85143/8514392dec916c98e200a7c57307941fbdd303b6" alt="another alt text"",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/data-image' },
|
||||
content: 'data:image/s3,"s3://crabby-images/17cfb/17cfbc2c71e12b66a7f8b84ce611d4a6429b15f7" alt="data image"',
|
||||
}
|
||||
metadata: { sourceURL: "https://example.com/data-image" },
|
||||
content: "data:image/s3,"s3://crabby-images/17cfb/17cfbc2c71e12b66a7f8b84ce611d4a6429b15f7" alt="data image"",
|
||||
},
|
||||
];
|
||||
|
||||
const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||
const result =
|
||||
webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(expectedDocuments);
|
||||
});
|
||||
|
||||
it('should handle absolute URLs without modification', () => {
|
||||
it("should handle absolute URLs without modification", () => {
|
||||
const webScraperDataProvider = new WebScraperDataProvider();
|
||||
const documents = [
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/01a7d/01a7df95a5de36522e05107d2baaaafcfec0f578" alt="alt text"',
|
||||
metadata: { sourceURL: "https://example.com/page" },
|
||||
content: "data:image/s3,"s3://crabby-images/01a7d/01a7df95a5de36522e05107d2baaaafcfec0f578" alt="alt text"",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/another-page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/eb36c/eb36cb4d8e1e42cffdefe36b705625ee7923ef3c" alt="another alt text"',
|
||||
}
|
||||
metadata: { sourceURL: "https://example.com/another-page" },
|
||||
content:
|
||||
"data:image/s3,"s3://crabby-images/eb36c/eb36cb4d8e1e42cffdefe36b705625ee7923ef3c" alt="another alt text"",
|
||||
},
|
||||
];
|
||||
|
||||
const expectedDocuments = [
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/01a7d/01a7df95a5de36522e05107d2baaaafcfec0f578" alt="alt text"',
|
||||
metadata: { sourceURL: "https://example.com/page" },
|
||||
content: "data:image/s3,"s3://crabby-images/01a7d/01a7df95a5de36522e05107d2baaaafcfec0f578" alt="alt text"",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/another-page' },
|
||||
content: 'data:image/s3,"s3://crabby-images/eb36c/eb36cb4d8e1e42cffdefe36b705625ee7923ef3c" alt="another alt text"',
|
||||
}
|
||||
metadata: { sourceURL: "https://example.com/another-page" },
|
||||
content:
|
||||
"data:image/s3,"s3://crabby-images/eb36c/eb36cb4d8e1e42cffdefe36b705625ee7923ef3c" alt="another alt text"",
|
||||
},
|
||||
];
|
||||
|
||||
const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||
const result =
|
||||
webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(expectedDocuments);
|
||||
});
|
||||
|
||||
it('should not replace non-image content within the documents', () => {
|
||||
it("should not replace non-image content within the documents", () => {
|
||||
const webScraperDataProvider = new WebScraperDataProvider();
|
||||
const documents = [
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/page' },
|
||||
content: 'This is a test. data:image/s3,"s3://crabby-images/ec7af/ec7af62cf2ffefaa68508df145b694a439e95886" alt="alt text" Here is a link: [Example](https://example.com).',
|
||||
metadata: { sourceURL: "https://example.com/page" },
|
||||
content:
|
||||
"This is a test. data:image/s3,"s3://crabby-images/ec7af/ec7af62cf2ffefaa68508df145b694a439e95886" alt="alt text" Here is a link: [Example](https://example.com).",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/another-page' },
|
||||
content: 'Another test. data:image/s3,"s3://crabby-images/656cc/656ccea391b3bf2d7b0fb4037dd474a3a271fbef" alt="another alt text" Here is some **bold text**.',
|
||||
}
|
||||
metadata: { sourceURL: "https://example.com/another-page" },
|
||||
content:
|
||||
"Another test. data:image/s3,"s3://crabby-images/656cc/656ccea391b3bf2d7b0fb4037dd474a3a271fbef" alt="another alt text" Here is some **bold text**.",
|
||||
},
|
||||
];
|
||||
|
||||
|
||||
const expectedDocuments = [
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/page' },
|
||||
content: 'This is a test. data:image/s3,"s3://crabby-images/01a7d/01a7df95a5de36522e05107d2baaaafcfec0f578" alt="alt text" Here is a link: [Example](https://example.com).',
|
||||
metadata: { sourceURL: "https://example.com/page" },
|
||||
content:
|
||||
"This is a test. data:image/s3,"s3://crabby-images/01a7d/01a7df95a5de36522e05107d2baaaafcfec0f578" alt="alt text" Here is a link: [Example](https://example.com).",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: 'https://example.com/another-page' },
|
||||
content: 'Another test. data:image/s3,"s3://crabby-images/85143/8514392dec916c98e200a7c57307941fbdd303b6" alt="another alt text" Here is some **bold text**.',
|
||||
}
|
||||
metadata: { sourceURL: "https://example.com/another-page" },
|
||||
content:
|
||||
"Another test. data:image/s3,"s3://crabby-images/85143/8514392dec916c98e200a7c57307941fbdd303b6" alt="another alt text" Here is some **bold text**.",
|
||||
},
|
||||
];
|
||||
|
||||
const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||
|
||||
const result =
|
||||
webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(expectedDocuments);
|
||||
});
|
||||
it("should replace multiple image paths within the documents", () => {
|
||||
const webScraperDataProvider = new WebScraperDataProvider();
|
||||
const documents = [
|
||||
{
|
||||
metadata: { sourceURL: "https://example.com/page" },
|
||||
content:
|
||||
"This is a test. data:image/s3,"s3://crabby-images/727ec/727ec38975e036411613f7dc127bbc7fb429284e" alt="alt text" Here is a link: [Example](https://example.com). data:image/s3,"s3://crabby-images/f6e6f/f6e6fe634f71925ef00796c7adc332ec346f0e19" alt="alt text"",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: "https://example.com/another-page" },
|
||||
content:
|
||||
"Another test. data:image/s3,"s3://crabby-images/dc452/dc452c17c9fe9bb589cc5e605d9701b0d9bc992a" alt="another alt text" Here is some **bold text**. data:image/s3,"s3://crabby-images/3cd20/3cd2054a1f0ecd09df489baea7d585307ddaf42a" alt="another alt text"",
|
||||
},
|
||||
];
|
||||
|
||||
const expectedDocuments = [
|
||||
{
|
||||
metadata: { sourceURL: "https://example.com/page" },
|
||||
content:
|
||||
"This is a test. data:image/s3,"s3://crabby-images/b9e9d/b9e9dbf13add6c3a9a7e83ec4cf791ccc52db68e" alt="alt text" Here is a link: [Example](https://example.com). data:image/s3,"s3://crabby-images/5c7fd/5c7fd34962071b945b5e2bd22173beadf4457924" alt="alt text"",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: "https://example.com/another-page" },
|
||||
content:
|
||||
"Another test. data:image/s3,"s3://crabby-images/a1233/a1233981c7b417a79d4dbe66aef9a0ca6ac8e509" alt="another alt text" Here is some **bold text**. data:image/s3,"s3://crabby-images/57451/57451408088912fe31340e76faf1be0710381a73" alt="another alt text"",
|
||||
},
|
||||
];
|
||||
|
||||
const result =
|
||||
webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(expectedDocuments);
|
||||
});
|
||||
|
||||
it("should replace image paths within the documents with complex URLs", () => {
|
||||
const webScraperDataProvider = new WebScraperDataProvider();
|
||||
const documents = [
|
||||
{
|
||||
metadata: { sourceURL: "https://example.com/page/subpage" },
|
||||
content:
|
||||
"This is a test. data:image/s3,"s3://crabby-images/727ec/727ec38975e036411613f7dc127bbc7fb429284e" alt="alt text" Here is a link: [Example](https://example.com). data:image/s3,"s3://crabby-images/533f2/533f2239d9af8ced07ecf13e63339eba36708ca6" alt="alt text"",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: "https://example.com/another-page/subpage" },
|
||||
content:
|
||||
"Another test. data:image/s3,"s3://crabby-images/3dac8/3dac848b6b0f0b6603c425f96255925667dbc9ee" alt="another alt text" Here is some **bold text**. data:image/s3,"s3://crabby-images/add3c/add3c0e9ae828bdc706b7ef5dd541344f06f12a9" alt="another alt text"",
|
||||
},
|
||||
];
|
||||
|
||||
const expectedDocuments = [
|
||||
{
|
||||
metadata: { sourceURL: "https://example.com/page/subpage" },
|
||||
content:
|
||||
"This is a test. data:image/s3,"s3://crabby-images/b9e9d/b9e9dbf13add6c3a9a7e83ec4cf791ccc52db68e" alt="alt text" Here is a link: [Example](https://example.com). data:image/s3,"s3://crabby-images/9a22f/9a22fc0e480f1397365e3a98c9ae9cc591f46922" alt="alt text"",
|
||||
},
|
||||
{
|
||||
metadata: { sourceURL: "https://example.com/another-page/subpage" },
|
||||
content:
|
||||
"Another test. data:image/s3,"s3://crabby-images/026c1/026c121d128513d4ae7a32d3106011fd43d2f91e" alt="another alt text" Here is some **bold text**. data:image/s3,"s3://crabby-images/199fa/199fa8cea1cba62784f2882df69a7fb8f9b2b346" alt="another alt text"",
|
||||
},
|
||||
];
|
||||
|
||||
const result =
|
||||
webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(expectedDocuments);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
Loading…
x
Reference in New Issue
Block a user