Nick: fixed bugs associated with absolute path replacements

2024-06-11 12:43:16 -07:00 · 2024-06-11 12:43:16 -07:00 · 520739c9f4
commit 520739c9f4
parent 788abdce6e
4 changed files with 24 additions and 18 deletions
--- a/apps/api/openapi.json
+++ b/apps/api/openapi.json
@@ -190,11 +190,6 @@
                        "description": "Ignore the website sitemap when crawling",
                        "default": false
                      },
-                      "replaceAllPathsWithAbsolutePaths": {
-                        "type": "boolean",
-                        "description": "Replace all relative paths with absolute paths for images and links",
-                        "default": false
-                      },
                      "limit": {
                        "type": "integer",
                        "description": "Maximum number of pages to crawl",
@@ -223,6 +218,11 @@
                      "headers": {
                        "type": "object",
                        "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
+                      },
+                      "replaceAllPathsWithAbsolutePaths": {
+                        "type": "boolean",
+                        "description": "Replace all relative paths with absolute paths for images and links",
+                        "default": false
                      }
                    }
                  }
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -18,6 +18,7 @@ export type PageOptions = {
  waitFor?: number;
  screenshot?: boolean;
  headers?: Record<string, string>;
+  replaceAllPathsWithAbsolutePaths?: boolean;
 };

 export type ExtractorOptions = {
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -302,9 +302,10 @@ export class WebScraperDataProvider {
  }

  private applyPathReplacements(documents: Document[]): Document[] {
-    return this.replaceAllPathsWithAbsolutePaths
-      ? replacePathsWithAbsolutePaths(documents)
-      : replaceImgPathsWithAbsolutePaths(documents);
+    if (this.replaceAllPathsWithAbsolutePaths) {
+      documents = replacePathsWithAbsolutePaths(documents);
+    }
+    return replaceImgPathsWithAbsolutePaths(documents);
  }

  private async applyImgAltText(documents: Document[]): Promise<Document[]> {
@@ -473,9 +474,9 @@ export class WebScraperDataProvider {
    this.limit = options.crawlerOptions?.limit ?? 10000;
    this.generateImgAltText =
      options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+    this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
    this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
-    this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
+    this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
    //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
    this.excludes = this.excludes.filter((item) => item !== "");
    this.crawlerMode = options.crawlerOptions?.mode ?? "default";
--- a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts
+++ b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts
@@ -10,7 +10,8 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
        ) || [];

      paths.forEach((path: string) => {
-        const isImage = path.startsWith("!");
+        try {
+          const isImage = path.startsWith("!");
        let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
        let url = matchedUrl[1];

@@ -22,18 +23,18 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
        }

        const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0];
-        if (isImage) {
-          document.content = document.content.replace(
-            path,
-            `${markdownLinkOrImageText}(${url})`
-          );
-        } else {
+        // Image is handled afterwards
+        if (!isImage) {
          document.content = document.content.replace(
            path,
            `${markdownLinkOrImageText}(${url})`
          );
+          }
+        } catch (error) {
+          
        }
      });
+      document.markdown = document.content;
    });

    return documents;
@@ -60,8 +61,10 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
          if (!imageUrl.startsWith("http")) {
            if (imageUrl.startsWith("/")) {
              imageUrl = imageUrl.substring(1);
+              imageUrl = new URL(imageUrl, baseUrl).toString();
+            } else {
+              imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString();
            }
-            imageUrl = new URL(imageUrl, baseUrl).toString();
          }
        }

@@ -70,6 +73,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
          `![${altText}](${imageUrl})`
        );
      });
+      document.markdown = document.content;
    });

    return documents;