From d10f81e7feecf2250b4ca102899dcc33660468bd Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Wed, 15 May 2024 11:28:20 -0700
Subject: [PATCH] Nick: fixes - parse crawler-provided HTML into markdown in scrapSingleUrl

---
 apps/api/src/scraper/WebScraper/index.ts      | 4 ++--
 apps/api/src/scraper/WebScraper/single_url.ts | 8 +++++---
 2 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index bdc7483..0a86a90 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -71,8 +71,8 @@ export class WebScraperDataProvider {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
       await Promise.all(
         batchUrls.map(async (url, index) => {
-          const existingText = allHtmls ? allHtmls[i + index] : "";
-          const result = await scrapSingleUrl(url, this.pageOptions, existingText);
+          const existingHTML = allHtmls ? allHtmls[i + index] : "";
+          const result = await scrapSingleUrl(url, this.pageOptions, existingHTML);
           processedUrls++;
           if (inProgress) {
             inProgress({
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index c41beb5..4bbaee7 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -107,7 +107,7 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
 export async function scrapSingleUrl(
   urlToScrap: string,
   pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
-  existingText: string = ""
+  existingHtml: string = ""
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
 
@@ -199,8 +199,10 @@ export async function scrapSingleUrl(
 
     for (const scraper of scrapersInOrder) {
       // If exists text coming from crawler, use it
-      if (existingText && existingText.trim().length >= 100) {
-        text = existingText;
+      if (existingHtml && existingHtml.trim().length >= 100) {
+        let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
+        text = await parseMarkdown(cleanedHtml);
+        html = existingHtml;
         break;
       }
       [text, html] = await attemptScraping(urlToScrap, scraper);