diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 7198988..26fb2a9 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -90,6 +90,7 @@ app.post("/v0/scrape", async (req, res) => { try { // make sure to authenticate user first, Bearer const team_id = await authenticateUser(req, res, "scrape"); + const crawlerOptions = req.body.crawlerOptions ?? {}; try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = @@ -113,6 +114,9 @@ app.post("/v0/scrape", async (req, res) => { await a.setOptions({ mode: "single_urls", urls: [url], + crawlerOptions: { + ...crawlerOptions, + }, }); const docs = await a.getDocuments(false); diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index b54d9e6..8290762 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -74,7 +74,7 @@ export class WebScraperDataProvider { throw new Error("Url is required"); } - if (!useCaching) { + if (true) {//!useCaching) { if (this.mode === "crawl") { const crawler = new WebCrawler({ initialUrl: this.urls[0], @@ -95,7 +95,7 @@ export class WebScraperDataProvider { } let documents = await this.convertUrlsToDocuments(links, inProgress); documents = await this.getSitemapData(this.urls[0], documents); - console.log("documents", documents) + documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } @@ -122,6 +122,7 @@ export class WebScraperDataProvider { if (this.mode === "single_urls") { let documents = await this.convertUrlsToDocuments(this.urls, inProgress); + documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } @@ -138,6 +139,7 @@ export class WebScraperDataProvider { let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress); documents = await this.getSitemapData(this.urls[0], documents); + documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } @@ -297,29 +299,46 @@ export class WebScraperDataProvider { } generatesImgAltText = async (documents: Document[]): Promise => { await Promise.all(documents.map(async (document) => { - const baseUrl = new URL(document.metadata.sourceURL).origin; - const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || []; + const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || []; - await Promise.all(images.map(async (image) => { + await Promise.all(images.map(async (image: string) => { let imageUrl = image.match(/\(([^)]+)\)/)[1]; let altText = image.match(/\[(.*?)\]/)[1]; - let newImageUrl = ''; if (!altText && !imageUrl.startsWith("data:image") && /\.(png|jpeg|gif|webp)$/.test(imageUrl)) { - newImageUrl = baseUrl + imageUrl; const imageIndex = document.content.indexOf(image); const contentLength = document.content.length; let backText = document.content.substring(imageIndex + image.length, Math.min(imageIndex + image.length + 1000, contentLength)); let frontTextStartIndex = Math.max(imageIndex - 1000, 0); let frontText = document.content.substring(frontTextStartIndex, imageIndex); - altText = await getImageDescription(newImageUrl, backText, frontText); + altText = await getImageDescription(imageUrl, backText, frontText); } - document.content = document.content.replace(image, `![${altText}](${newImageUrl})`); + document.content = document.content.replace(image, `![${altText}](${imageUrl})`); })); })); return documents; } + + replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => { + documents.forEach(document => { + const baseUrl = new URL(document.metadata.sourceURL).origin; + const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || []; + + images.forEach(image => { + let imageUrl = image.match(/\(([^)]+)\)/)[1]; + let altText = image.match(/\[(.*?)\]/)[1]; + + if (!imageUrl.startsWith("data:image")) { + imageUrl = baseUrl + imageUrl; + } + + document.content = document.content.replace(image, `![${altText}](${imageUrl})`); + }); + }); + + return documents; + } }