Splitting relative paths for images

2024-04-16 16:31:33 -03:00 · 2024-04-16 16:31:33 -03:00 · a04610302a
commit a04610302a
parent 3e4064bce2
2 changed files with 32 additions and 9 deletions
--- a/apps/api/src/index.ts
+++ b/apps/api/src/index.ts
@ -90,6 +90,7 @@ app.post("/v0/scrape", async (req, res) => {
  try {
    // make sure to authenticate user first, Bearer <token>
    const team_id = await authenticateUser(req, res, "scrape");
    const crawlerOptions = req.body.crawlerOptions ?? {};
    try {
      const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@ -113,6 +114,9 @@ app.post("/v0/scrape", async (req, res) => {
      await a.setOptions({
        mode: "single_urls",
        urls: [url],
        crawlerOptions: {
          ...crawlerOptions,
        },
      });
      const docs = await a.getDocuments(false);
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@ -74,7 +74,7 @@ export class WebScraperDataProvider {
      throw new Error("Url is required");
    }
-    if (!useCaching) {
+    if (true) {//!useCaching) {
      if (this.mode === "crawl") {
        const crawler = new WebCrawler({
          initialUrl: this.urls[0],
@ -95,7 +95,7 @@ export class WebScraperDataProvider {
        }
        let documents = await this.convertUrlsToDocuments(links, inProgress);
        documents = await this.getSitemapData(this.urls[0], documents);
-        console.log("documents", documents)
+        documents = this.replaceImgPathsWithAbsolutePaths(documents);
        if (this.generateImgAltText) {
          documents = await this.generatesImgAltText(documents);
        }
@ -122,6 +122,7 @@ export class WebScraperDataProvider {
      if (this.mode === "single_urls") {
        let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
        documents = this.replaceImgPathsWithAbsolutePaths(documents);
        if (this.generateImgAltText) {
          documents = await this.generatesImgAltText(documents);
        }
@ -138,6 +139,7 @@ export class WebScraperDataProvider {
        let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress);
        documents = await this.getSitemapData(this.urls[0], documents);
        documents = this.replaceImgPathsWithAbsolutePaths(documents);
        if (this.generateImgAltText) {
          documents = await this.generatesImgAltText(documents);
        }
@ -297,29 +299,46 @@ export class WebScraperDataProvider {
  }
  generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
    await Promise.all(documents.map(async (document) => {
-      const baseUrl = new URL(document.metadata.sourceURL).origin;
+      const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];
      const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
-      await Promise.all(images.map(async (image) => {
+      await Promise.all(images.map(async (image: string) => {
        let imageUrl = image.match(/\(([^)]+)\)/)[1];
        let altText = image.match(/\[(.*?)\]/)[1];
        let newImageUrl = '';
        if (!altText && !imageUrl.startsWith("data:image") && /\.(png|jpeg|gif|webp)$/.test(imageUrl)) {
          newImageUrl = baseUrl + imageUrl;
          const imageIndex = document.content.indexOf(image);
          const contentLength = document.content.length;
          let backText = document.content.substring(imageIndex + image.length, Math.min(imageIndex + image.length + 1000, contentLength));
          let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
          let frontText = document.content.substring(frontTextStartIndex, imageIndex);
-          altText = await getImageDescription(newImageUrl, backText, frontText);
+          altText = await getImageDescription(imageUrl, backText, frontText);
        }
-        document.content = document.content.replace(image, `![${altText}](${newImageUrl})`);
+        document.content = document.content.replace(image, `![${altText}](${imageUrl})`);
      }));
    }));
    return documents;
  }
  /**
   * Rewrites the Markdown image references (`![alt](target)`) in each document's
   * `content` so that every image target is an absolute URL.
   *
   * Each target is resolved against the document's `metadata.sourceURL` using
   * standard URL resolution, so:
   *  - root-relative paths (`/img/a.png`) resolve against the source origin,
   *  - page-relative paths (`a.png`, `../a.png`) resolve against the source page,
   *  - targets that are already absolute are kept (not prefixed a second time),
   *  - `data:image` targets and empty targets are left untouched.
   * Anything following the first whitespace in the parentheses (e.g. a Markdown
   * title such as `"Title"`) is preserved verbatim after the resolved URL.
   * If a target cannot be resolved (for example the source URL is not a valid
   * URL), that image reference is left exactly as it was.
   *
   * @param documents Documents to process; their `content` is updated in place.
   * @returns The same `documents` array that was passed in.
   */
  replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
    // Group 1 = alt text, group 2 = everything inside the parentheses (up to two
    // levels of nested parentheses are allowed inside the target). Characters are
    // consumed one at a time (no `+` inside the repeated group) so an unclosed
    // "(" cannot trigger nested-quantifier backtracking.
    const imagePattern = /!\[(.*?)\]\(((?:[^()]|\((?:[^()]|\([^()]*\))*\))*)\)/g;
    // Splits the parenthesised part into the URL target and an optional trailing
    // remainder (whitespace + title).
    const targetPattern = /^(\S+)(\s[\s\S]*)?$/;

    documents.forEach(document => {
      const sourceUrl = document.metadata.sourceURL;
      // A replacer callback is used (instead of a replacement string) so that "$"
      // sequences in scraped URLs or alt text are never interpreted by replace().
      document.content = document.content.replace(
        imagePattern,
        (image: string, altText: string, inner: string): string => {
          const parts = targetPattern.exec(inner);
          if (!parts) {
            // Empty target or one starting with whitespace: nothing to resolve.
            return image;
          }
          const [, target = "", trailing = ""] = parts;
          if (target.startsWith("data:image")) {
            return image;
          }
          try {
            const absoluteUrl = new URL(target, sourceUrl).href;
            return `![${altText}](${absoluteUrl}${trailing})`;
          } catch {
            // Unresolvable target or invalid source URL: keep the original text.
            return image;
          }
        }
      );
    });
    return documents;
  }
 }