Nick: fixed bugs associated with absolute path replacements (link and image paths are now both rewritten; the option is also read from pageOptions)
This commit is contained in:
parent 788abdce6e
commit 520739c9f4
@@ -190,11 +190,6 @@
|
|||||||
"description": "Ignore the website sitemap when crawling",
|
"description": "Ignore the website sitemap when crawling",
|
||||||
"default": false
|
"default": false
|
||||||
},
|
},
|
||||||
"replaceAllPathsWithAbsolutePaths": {
|
|
||||||
"type": "boolean",
|
|
||||||
"description": "Replace all relative paths with absolute paths for images and links",
|
|
||||||
"default": false
|
|
||||||
},
|
|
||||||
"limit": {
|
"limit": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "Maximum number of pages to crawl",
|
"description": "Maximum number of pages to crawl",
|
||||||
@@ -223,6 +218,11 @@
|
|||||||
"headers": {
|
"headers": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
|
"description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
|
||||||
|
},
|
||||||
|
"replaceAllPathsWithAbsolutePaths": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Replace all relative paths with absolute paths for images and links",
|
||||||
|
"default": false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -18,6 +18,7 @@ export type PageOptions = {
|
|||||||
waitFor?: number;
|
waitFor?: number;
|
||||||
screenshot?: boolean;
|
screenshot?: boolean;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
|
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ExtractorOptions = {
|
export type ExtractorOptions = {
|
||||||
|
@@ -302,9 +302,10 @@ export class WebScraperDataProvider {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private applyPathReplacements(documents: Document[]): Document[] {
|
private applyPathReplacements(documents: Document[]): Document[] {
|
||||||
return this.replaceAllPathsWithAbsolutePaths
|
if (this.replaceAllPathsWithAbsolutePaths) {
|
||||||
? replacePathsWithAbsolutePaths(documents)
|
documents = replacePathsWithAbsolutePaths(documents);
|
||||||
: replaceImgPathsWithAbsolutePaths(documents);
|
}
|
||||||
|
return replaceImgPathsWithAbsolutePaths(documents);
|
||||||
}
|
}
|
||||||
|
|
||||||
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
|
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
|
||||||
@@ -473,9 +474,9 @@ export class WebScraperDataProvider {
|
|||||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||||
this.generateImgAltText =
|
this.generateImgAltText =
|
||||||
options.crawlerOptions?.generateImgAltText ?? false;
|
options.crawlerOptions?.generateImgAltText ?? false;
|
||||||
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
|
||||||
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
||||||
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
||||||
this.excludes = this.excludes.filter((item) => item !== "");
|
this.excludes = this.excludes.filter((item) => item !== "");
|
||||||
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
||||||
|
@@ -10,6 +10,7 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
|
|||||||
) || [];
|
) || [];
|
||||||
|
|
||||||
paths.forEach((path: string) => {
|
paths.forEach((path: string) => {
|
||||||
|
try {
|
||||||
const isImage = path.startsWith("!");
|
const isImage = path.startsWith("!");
|
||||||
let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
|
let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
|
||||||
let url = matchedUrl[1];
|
let url = matchedUrl[1];
|
||||||
@@ -22,18 +23,18 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
|
|||||||
}
|
}
|
||||||
|
|
||||||
const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0];
|
const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0];
|
||||||
if (isImage) {
|
// Image is handled afterwards
|
||||||
document.content = document.content.replace(
|
if (!isImage) {
|
||||||
path,
|
|
||||||
`${markdownLinkOrImageText}(${url})`
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
document.content = document.content.replace(
|
document.content = document.content.replace(
|
||||||
path,
|
path,
|
||||||
`${markdownLinkOrImageText}(${url})`
|
`${markdownLinkOrImageText}(${url})`
|
||||||
);
|
);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
document.markdown = document.content;
|
||||||
});
|
});
|
||||||
|
|
||||||
return documents;
|
return documents;
|
||||||
@@ -60,8 +61,10 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
|
|||||||
if (!imageUrl.startsWith("http")) {
|
if (!imageUrl.startsWith("http")) {
|
||||||
if (imageUrl.startsWith("/")) {
|
if (imageUrl.startsWith("/")) {
|
||||||
imageUrl = imageUrl.substring(1);
|
imageUrl = imageUrl.substring(1);
|
||||||
}
|
|
||||||
imageUrl = new URL(imageUrl, baseUrl).toString();
|
imageUrl = new URL(imageUrl, baseUrl).toString();
|
||||||
|
} else {
|
||||||
|
imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -70,6 +73,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
|
|||||||
`![${altText}](${imageUrl})`
|
`![${altText}](${imageUrl})`
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
document.markdown = document.content;
|
||||||
});
|
});
|
||||||
|
|
||||||
return documents;
|
return documents;
|
||||||
|
Loading…
Reference in New Issue
Block a user