From d10f81e7feecf2250b4ca102899dcc33660468bd Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 11:28:20 -0700 Subject: [PATCH] Nick: fixes --- apps/api/src/scraper/WebScraper/index.ts | 4 ++-- apps/api/src/scraper/WebScraper/single_url.ts | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index bdc7483..0a86a90 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -71,8 +71,8 @@ export class WebScraperDataProvider { const batchUrls = urls.slice(i, i + this.concurrentRequests); await Promise.all( batchUrls.map(async (url, index) => { - const existingText = allHtmls ? allHtmls[i + index] : ""; - const result = await scrapSingleUrl(url, this.pageOptions, existingText); + const existingHTML = allHtmls ? allHtmls[i + index] : ""; + const result = await scrapSingleUrl(url, this.pageOptions, existingHTML); processedUrls++; if (inProgress) { inProgress({ diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c41beb5..4bbaee7 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -107,7 +107,7 @@ export async function scrapWithPlaywright(url: string): Promise { export async function scrapSingleUrl( urlToScrap: string, pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }, - existingText: string = "" + existingHtml: string = "" ): Promise { urlToScrap = urlToScrap.trim(); @@ -199,8 +199,10 @@ export async function scrapSingleUrl( for (const scraper of scrapersInOrder) { // If exists text coming from crawler, use it - if (existingText && existingText.trim().length >= 100) { - text = existingText; + if (existingHtml && existingHtml.trim().length >= 100) { + let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions); + text = await parseMarkdown(cleanedHtml); + html = existingHtml; break; } [text, html] = await attemptScraping(urlToScrap, scraper);