Splitting relative paths for images
This commit is contained in:
parent
3e4064bce2
commit
a04610302a
@ -90,6 +90,7 @@ app.post("/v0/scrape", async (req, res) => {
|
|||||||
try {
|
try {
|
||||||
// make sure to authenticate user first, Bearer <token>
|
// make sure to authenticate user first, Bearer <token>
|
||||||
const team_id = await authenticateUser(req, res, "scrape");
|
const team_id = await authenticateUser(req, res, "scrape");
|
||||||
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||||
@ -113,6 +114,9 @@ app.post("/v0/scrape", async (req, res) => {
|
|||||||
await a.setOptions({
|
await a.setOptions({
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
urls: [url],
|
urls: [url],
|
||||||
|
crawlerOptions: {
|
||||||
|
...crawlerOptions,
|
||||||
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
const docs = await a.getDocuments(false);
|
const docs = await a.getDocuments(false);
|
||||||
|
@ -74,7 +74,7 @@ export class WebScraperDataProvider {
|
|||||||
throw new Error("Url is required");
|
throw new Error("Url is required");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!useCaching) {
|
if (true) {//!useCaching) {
|
||||||
if (this.mode === "crawl") {
|
if (this.mode === "crawl") {
|
||||||
const crawler = new WebCrawler({
|
const crawler = new WebCrawler({
|
||||||
initialUrl: this.urls[0],
|
initialUrl: this.urls[0],
|
||||||
@ -95,7 +95,7 @@ export class WebScraperDataProvider {
|
|||||||
}
|
}
|
||||||
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
||||||
documents = await this.getSitemapData(this.urls[0], documents);
|
documents = await this.getSitemapData(this.urls[0], documents);
|
||||||
console.log("documents", documents)
|
documents = this.replaceImgPathsWithAbsolutePaths(documents);
|
||||||
if (this.generateImgAltText) {
|
if (this.generateImgAltText) {
|
||||||
documents = await this.generatesImgAltText(documents);
|
documents = await this.generatesImgAltText(documents);
|
||||||
}
|
}
|
||||||
@ -122,6 +122,7 @@ export class WebScraperDataProvider {
|
|||||||
|
|
||||||
if (this.mode === "single_urls") {
|
if (this.mode === "single_urls") {
|
||||||
let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
|
let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
|
||||||
|
documents = this.replaceImgPathsWithAbsolutePaths(documents);
|
||||||
if (this.generateImgAltText) {
|
if (this.generateImgAltText) {
|
||||||
documents = await this.generatesImgAltText(documents);
|
documents = await this.generatesImgAltText(documents);
|
||||||
}
|
}
|
||||||
@ -138,6 +139,7 @@ export class WebScraperDataProvider {
|
|||||||
let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress);
|
let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress);
|
||||||
|
|
||||||
documents = await this.getSitemapData(this.urls[0], documents);
|
documents = await this.getSitemapData(this.urls[0], documents);
|
||||||
|
documents = this.replaceImgPathsWithAbsolutePaths(documents);
|
||||||
if (this.generateImgAltText) {
|
if (this.generateImgAltText) {
|
||||||
documents = await this.generatesImgAltText(documents);
|
documents = await this.generatesImgAltText(documents);
|
||||||
}
|
}
|
||||||
@ -297,29 +299,46 @@ export class WebScraperDataProvider {
|
|||||||
}
|
}
|
||||||
/**
 * Fills in missing alt text for Markdown images found in each document's content.
 *
 * For every `![alt](url)` occurrence whose alt text is empty, whose URL is not a
 * `data:image` URI and whose URL ends in one of the recognised image extensions,
 * up to 1000 characters of text before and after the image are passed, together
 * with the URL, to `getImageDescription`; the awaited result becomes the new alt
 * text. Documents are mutated in place and the same array is returned.
 *
 * @param documents Documents whose `content` is scanned for Markdown images.
 * @returns The same `documents` array after the in-place updates.
 */
generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
  // Capturing alt text (group 1) and URL (group 2) in a single pass avoids re-parsing
  // each match with a second regex, which returned null for `![]()` (crashing the
  // whole batch) and picked the wrong parentheses when the alt text contained "(...)".
  const imagePattern = /!\[(.*?)\]\((.*?)\)/g;
  // NOTE(review): ".jpg" is not in this list, so such images never get generated
  // alt text — confirm whether that is intentional before widening it.
  const describableExtension = /\.(png|jpeg|gif|webp)$/;
  const contextWindow = 1000;

  await Promise.all(documents.map(async (document) => {
    const images = Array.from(document.content.matchAll(imagePattern));

    await Promise.all(images.map(async ([image, existingAlt, imageUrl]) => {
      const needsAltText =
        !existingAlt &&
        !imageUrl.startsWith("data:image") &&
        describableExtension.test(imageUrl);
      if (!needsAltText) {
        // Rewriting the image with its own alt text and URL would be a no-op.
        return;
      }

      // Locate the image in the current content (earlier replacements may have
      // shifted offsets) and collect the surrounding text as context.
      const imageIndex = document.content.indexOf(image);
      const imageEnd = imageIndex + image.length;
      const contentLength = document.content.length;
      const backText = document.content.substring(
        imageEnd,
        Math.min(imageEnd + contextWindow, contentLength)
      );
      const frontText = document.content.substring(
        Math.max(imageIndex - contextWindow, 0),
        imageIndex
      );

      const altText = await getImageDescription(imageUrl, backText, frontText);

      // A replacer function keeps "$&", "$1", ... inside the generated text literal
      // instead of letting String.prototype.replace expand them.
      document.content = document.content.replace(
        image,
        () => `![${altText}](${imageUrl})`
      );
    }));
  }));

  return documents;
}
|
||||||
|
|
||||||
|
/**
 * Rewrites the URL of every Markdown image in each document to an absolute URL,
 * resolved against the document's `metadata.sourceURL`.
 *
 * - `data:image` URIs and images with an empty URL are left untouched.
 * - URLs that are already absolute are kept (not prefixed with the page origin).
 * - Root-relative, path-relative and protocol-relative URLs are resolved with the
 *   standard `URL` constructor.
 * - Anything following the first whitespace inside the parentheses (for example a
 *   Markdown title) is preserved verbatim after the resolved URL.
 *
 * Documents are mutated in place and the same array is returned.
 *
 * @param documents Documents whose `content` is scanned for Markdown images.
 * @returns The same `documents` array after the in-place updates.
 * @throws TypeError if a document's `metadata.sourceURL` is not a valid URL.
 */
replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
  // Group 1 is the alt text, group 2 the link target (balanced parentheses allowed
  // up to two levels deep). Capturing both here avoids a second-pass regex that
  // returned null for `![]()` and mis-parsed alt text containing "(...)".
  const imagePattern = /!\[(.*?)\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g;

  documents.forEach((document) => {
    // Parsed eagerly so an invalid source URL still fails loudly.
    const pageUrl = new URL(document.metadata.sourceURL);

    document.content = document.content.replace(
      imagePattern,
      (image: string, altText: string, linkTarget: string) => {
        // Separate the URL from an optional trailing title (`url "title"`).
        const splitAt = linkTarget.search(/\s/);
        const imageUrl = splitAt === -1 ? linkTarget : linkTarget.slice(0, splitAt);
        const trailer = splitAt === -1 ? "" : linkTarget.slice(splitAt);

        if (imageUrl === "" || imageUrl.startsWith("data:image")) {
          return image;
        }

        try {
          // URL resolution leaves absolute URLs alone and correctly handles
          // "/root", "relative/path" and "//host/path" forms, unlike plain
          // string concatenation with the origin.
          const absoluteUrl = new URL(imageUrl, pageUrl).href;
          return `![${altText}](${absoluteUrl}${trailer})`;
        } catch {
          // Unparseable image URL: keep the original Markdown rather than abort.
          return image;
        }
      }
    );
  });

  return documents;
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user