Spliting relative paths for images

2024-04-16 16:31:33 -03:00 · 2024-04-16 16:31:33 -03:00 · a04610302a
commit a04610302a
parent 3e4064bce2
2 changed files with 32 additions and 9 deletions
--- a/apps/api/src/index.ts
+++ b/apps/api/src/index.ts
@ -90,6 +90,7 @@ app.post("/v0/scrape", async (req, res) => {
  try {
    // make sure to authenticate user first, Bearer <token>
    const team_id = await authenticateUser(req, res, "scrape");
+    const crawlerOptions = req.body.crawlerOptions ?? {};

    try {
      const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@ -113,6 +114,9 @@ app.post("/v0/scrape", async (req, res) => {
      await a.setOptions({
        mode: "single_urls",
        urls: [url],
+        crawlerOptions: {
+          ...crawlerOptions,
+        },
      });

      const docs = await a.getDocuments(false);
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@ -74,7 +74,7 @@ export class WebScraperDataProvider {
      throw new Error("Url is required");
    }

-    if (!useCaching) {
+    if (true) {//!useCaching) {
      if (this.mode === "crawl") {
        const crawler = new WebCrawler({
          initialUrl: this.urls[0],
@ -95,7 +95,7 @@ export class WebScraperDataProvider {
        }
        let documents = await this.convertUrlsToDocuments(links, inProgress);
        documents = await this.getSitemapData(this.urls[0], documents);
-        console.log("documents", documents)
+        documents = this.replaceImgPathsWithAbsolutePaths(documents);
        if (this.generateImgAltText) {
          documents = await this.generatesImgAltText(documents);
        }
@ -122,6 +122,7 @@ export class WebScraperDataProvider {

      if (this.mode === "single_urls") {
        let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
+        documents = this.replaceImgPathsWithAbsolutePaths(documents);
        if (this.generateImgAltText) {
          documents = await this.generatesImgAltText(documents);
        }
@ -138,6 +139,7 @@ export class WebScraperDataProvider {
        let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress);

        documents = await this.getSitemapData(this.urls[0], documents);
+        documents = this.replaceImgPathsWithAbsolutePaths(documents);
        if (this.generateImgAltText) {
          documents = await this.generatesImgAltText(documents);
        }
@ -297,29 +299,46 @@ export class WebScraperDataProvider {
  }
  generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
    await Promise.all(documents.map(async (document) => {
-      const baseUrl = new URL(document.metadata.sourceURL).origin;
-      const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
+      const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];

-      await Promise.all(images.map(async (image) => {
+      await Promise.all(images.map(async (image: string) => {
        let imageUrl = image.match(/\(([^)]+)\)/)[1];
        let altText = image.match(/\[(.*?)\]/)[1];
-        let newImageUrl = '';

        if (!altText && !imageUrl.startsWith("data:image") && /\.(png|jpeg|gif|webp)$/.test(imageUrl)) {
-          newImageUrl = baseUrl + imageUrl;
          const imageIndex = document.content.indexOf(image);
          const contentLength = document.content.length;
          let backText = document.content.substring(imageIndex + image.length, Math.min(imageIndex + image.length + 1000, contentLength));
          let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
          let frontText = document.content.substring(frontTextStartIndex, imageIndex);
-          altText = await getImageDescription(newImageUrl, backText, frontText);
+          altText = await getImageDescription(imageUrl, backText, frontText);
        }

-        document.content = document.content.replace(image, `![${altText}](${newImageUrl})`);
+        document.content = document.content.replace(image, `![${altText}](${imageUrl})`);
      }));
    }));

    return documents;
  }
+  
+  replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
+    documents.forEach(document => {
+      const baseUrl = new URL(document.metadata.sourceURL).origin;
+      const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
+
+      images.forEach(image => {
+        let imageUrl = image.match(/\(([^)]+)\)/)[1];
+        let altText = image.match(/\[(.*?)\]/)[1];
+
+        if (!imageUrl.startsWith("data:image")) {
+          imageUrl = baseUrl + imageUrl;
+        }
+
+        document.content = document.content.replace(image, `![${altText}](${imageUrl})`);
+      });
+    });
+
+    return documents;
+  }
 }