0

Merge pull request #268 from mendableai/nsc/abs-path-fix

Fixed bugs associated with absolute path replacements
This commit is contained in:
Nicolas 2024-06-11 13:20:42 -07:00 committed by GitHub
commit c08db830df
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 50 additions and 31 deletions

View File

@@ -190,11 +190,6 @@
"description": "Ignore the website sitemap when crawling", "description": "Ignore the website sitemap when crawling",
"default": false "default": false
}, },
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
},
"limit": { "limit": {
"type": "integer", "type": "integer",
"description": "Maximum number of pages to crawl", "description": "Maximum number of pages to crawl",
@@ -223,6 +218,11 @@
"headers": { "headers": {
"type": "object", "type": "object",
"description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc." "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
} }
} }
} }

View File

@@ -18,6 +18,7 @@ export type PageOptions = {
waitFor?: number; waitFor?: number;
screenshot?: boolean; screenshot?: boolean;
headers?: Record<string, string>; headers?: Record<string, string>;
replaceAllPathsWithAbsolutePaths?: boolean;
}; };
export type ExtractorOptions = { export type ExtractorOptions = {

View File

@@ -302,9 +302,10 @@ export class WebScraperDataProvider {
} }
private applyPathReplacements(documents: Document[]): Document[] { private applyPathReplacements(documents: Document[]): Document[] {
return this.replaceAllPathsWithAbsolutePaths if (this.replaceAllPathsWithAbsolutePaths) {
? replacePathsWithAbsolutePaths(documents) documents = replacePathsWithAbsolutePaths(documents);
: replaceImgPathsWithAbsolutePaths(documents); }
return replaceImgPathsWithAbsolutePaths(documents);
} }
private async applyImgAltText(documents: Document[]): Promise<Document[]> { private async applyImgAltText(documents: Document[]): Promise<Document[]> {
@@ -473,9 +474,9 @@ export class WebScraperDataProvider {
this.limit = options.crawlerOptions?.limit ?? 10000; this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText = this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false; options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false }; this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
this.excludes = this.excludes.filter((item) => item !== ""); this.excludes = this.excludes.filter((item) => item !== "");
this.crawlerMode = options.crawlerOptions?.mode ?? "default"; this.crawlerMode = options.crawlerOptions?.mode ?? "default";

View File

@@ -6,12 +6,14 @@ describe('replacePaths', () => {
it('should replace relative paths with absolute paths', () => { it('should replace relative paths with absolute paths', () => {
const documents: Document[] = [{ const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).' content: 'This is a [link](/path/to/resource).',
markdown: 'This is a [link](/path/to/resource).'
}]; }];
const expectedDocuments: Document[] = [{ const expectedDocuments: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).' content: 'This is a [link](https://example.com/path/to/resource).',
markdown: 'This is a [link](https://example.com/path/to/resource).'
}]; }];
const result = replacePathsWithAbsolutePaths(documents); const result = replacePathsWithAbsolutePaths(documents);
@@ -21,7 +23,8 @@ describe('replacePaths', () => {
it('should not alter absolute URLs', () => { it('should not alter absolute URLs', () => {
const documents: Document[] = [{ const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).' content: 'This is an [external link](https://external.com/path).',
markdown: 'This is an [external link](https://external.com/path).'
}]; }];
const result = replacePathsWithAbsolutePaths(documents); const result = replacePathsWithAbsolutePaths(documents);
@@ -31,7 +34,8 @@ describe('replacePaths', () => {
it('should not alter data URLs for images', () => { it('should not alter data URLs for images', () => {
const documents: Document[] = [{ const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'This is an image: ![alt text]().' content: 'This is an image: ![alt text]().',
markdown: 'This is an image: ![alt text]().'
}]; }];
const result = replacePathsWithAbsolutePaths(documents); const result = replacePathsWithAbsolutePaths(documents);
@@ -41,12 +45,14 @@ describe('replacePaths', () => {
it('should handle multiple links and images correctly', () => { it('should handle multiple links and images correctly', () => {
const documents: Document[] = [{ const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).' content: 'Here are two links: [link1](/path1) and [link2](/path2).',
markdown: 'Here are two links: [link1](/path1) and [link2](/path2).'
}]; }];
const expectedDocuments: Document[] = [{ const expectedDocuments: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).' content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).',
markdown: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).'
}]; }];
const result = replacePathsWithAbsolutePaths(documents); const result = replacePathsWithAbsolutePaths(documents);
@@ -56,12 +62,14 @@ describe('replacePaths', () => {
it('should correctly handle a mix of absolute and relative paths', () => { it('should correctly handle a mix of absolute and relative paths', () => {
const documents: Document[] = [{ const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image]().' content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image]().',
markdown: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image]().'
}]; }];
const expectedDocuments: Document[] = [{ const expectedDocuments: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image]().' content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image]().',
markdown: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image]().'
}]; }];
const result = replacePathsWithAbsolutePaths(documents); const result = replacePathsWithAbsolutePaths(documents);
@@ -74,12 +82,14 @@ describe('replacePaths', () => {
it('should replace relative image paths with absolute paths', () => { it('should replace relative image paths with absolute paths', () => {
const documents: Document[] = [{ const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'Here is an image: ![alt text](/path/to/image.jpg).' content: 'Here is an image: ![alt text](/path/to/image.jpg).',
markdown: 'Here is an image: ![alt text](/path/to/image.jpg).'
}]; }];
const expectedDocuments: Document[] = [{ const expectedDocuments: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).',
markdown: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).'
}]; }];
const result = replaceImgPathsWithAbsolutePaths(documents); const result = replaceImgPathsWithAbsolutePaths(documents);
@@ -89,7 +99,8 @@ describe('replacePaths', () => {
it('should not alter data:image URLs', () => { it('should not alter data:image URLs', () => {
const documents: Document[] = [{ const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'An image with a data URL: ![alt text]().' content: 'An image with a data URL: ![alt text]().',
markdown: 'An image with a data URL: ![alt text](data:image/png;base4,ABC123==).'
}]; }];
const result = replaceImgPathsWithAbsolutePaths(documents); const result = replaceImgPathsWithAbsolutePaths(documents);
@@ -99,12 +110,14 @@ describe('replacePaths', () => {
it('should handle multiple images with a mix of data and relative URLs', () => { it('should handle multiple images with a mix of data and relative URLs', () => {
const documents: Document[] = [{ const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'Multiple images: ![img1](/img1.jpg) ![img2]() ![img3](/img3.jpg).' content: 'Multiple images: ![img1](/img1.jpg) ![img2]() ![img3](/img3.jpg).',
markdown: 'Multiple images: ![img1](/img1.jpg) ![img2]() ![img3](/img3.jpg).'
}]; }];
const expectedDocuments: Document[] = [{ const expectedDocuments: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2]() ![img3](https://example.com/img3.jpg).' content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2]() ![img3](https://example.com/img3.jpg).',
markdown: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2]() ![img3](https://example.com/img3.jpg).'
}]; }];
const result = replaceImgPathsWithAbsolutePaths(documents); const result = replaceImgPathsWithAbsolutePaths(documents);

View File

@@ -10,6 +10,7 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
) || []; ) || [];
paths.forEach((path: string) => { paths.forEach((path: string) => {
try {
const isImage = path.startsWith("!"); const isImage = path.startsWith("!");
let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/); let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
let url = matchedUrl[1]; let url = matchedUrl[1];
@@ -22,18 +23,18 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
} }
const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0]; const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0];
if (isImage) { // Image is handled afterwards
document.content = document.content.replace( if (!isImage) {
path,
`${markdownLinkOrImageText}(${url})`
);
} else {
document.content = document.content.replace( document.content = document.content.replace(
path, path,
`${markdownLinkOrImageText}(${url})` `${markdownLinkOrImageText}(${url})`
); );
}
} catch (error) {
} }
}); });
document.markdown = document.content;
}); });
return documents; return documents;
@@ -60,8 +61,10 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
if (!imageUrl.startsWith("http")) { if (!imageUrl.startsWith("http")) {
if (imageUrl.startsWith("/")) { if (imageUrl.startsWith("/")) {
imageUrl = imageUrl.substring(1); imageUrl = imageUrl.substring(1);
}
imageUrl = new URL(imageUrl, baseUrl).toString(); imageUrl = new URL(imageUrl, baseUrl).toString();
} else {
imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString();
}
} }
} }
@@ -70,6 +73,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
`![${altText}](${imageUrl})` `![${altText}](${imageUrl})`
); );
}); });
document.markdown = document.content;
}); });
return documents; return documents;