From 8eb2e95f19b4f5389f8447ccbd961ce53dc1391a Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 13 May 2024 16:13:10 -0300
Subject: [PATCH] Cleaned up

---
 apps/api/src/scraper/WebScraper/index.ts      | 26 +-------------
 apps/api/src/scraper/WebScraper/single_url.ts | 23 +++++++++----
 .../scraper/WebScraper/utils/pdfProcessor.ts  | 34 +------------------
 3 files changed, 18 insertions(+), 65 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index de941e0..1d9656e 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -10,7 +10,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
 import { WebCrawler } from "./crawler";
 import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/imageDescription";
-import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor";
+import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import {
   replaceImgPathsWithAbsolutePaths,
   replacePathsWithAbsolutePaths,
@@ -144,11 +144,7 @@ export class WebScraperDataProvider {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
 
-    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
-    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);    
-
     let documents = await this.processLinks(links, inProgress);
-    // documents.push(...pdfDocuments);
     return this.cacheAndFinalizeDocuments(documents, links);
   }
 
@@ -156,11 +152,8 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
     const links = this.urls;
-    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
-    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
 
     let documents = await this.processLinks(links, inProgress);
-    // documents.push(...pdfDocuments);
     return documents;
   }
 
@@ -172,11 +165,7 @@ export class WebScraperDataProvider {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
 
-    // let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
-    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
-
     let documents = await this.processLinks(links, inProgress);
-    // documents.push(...pdfDocuments);
     return this.cacheAndFinalizeDocuments(documents, links);
   }
 
@@ -233,19 +222,6 @@ export class WebScraperDataProvider {
     );
   }
 
-  private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> {
-    const checks = links.map(async (link) => ({
-      link,
-      isPdf: await isUrlAPdf({ url: link })
-    }));
-  
-    const results = await Promise.all(checks);
-    const pdfLinks = results.filter(result => result.isPdf).map(result => result.link);
-    const notPdfLinks = results.filter(result => !result.isPdf).map(result => result.link);
-  
-    return [pdfLinks, notPdfLinks];
-  }
-
   private applyPathReplacements(documents: Document[]): Document[] {
     return this.replaceAllPathsWithAbsolutePaths
       ? replacePathsWithAbsolutePaths(documents)
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 33d8518..baf465e 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -67,13 +67,11 @@ export async function scrapWithScrapingBee(
       );
       return "";
     }
-    // Check the content type of the response
+    
     const contentType = response.headers['content-type'];
     if (contentType && contentType.includes('application/pdf')) {
-      // Handle PDF content type
       return fetchAndProcessPdf(url);
     } else {
-      // Assume the content is text and decode it
       const decoder = new TextDecoder();
       const text = decoder.decode(response.data);
       return text;
@@ -104,9 +102,14 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
       return "";
     }
 
-    const data = await response.json();
-    const html = data.content;
-    return html ?? "";
+    const contentType = response.headers.get('content-type'); // fetch Headers must be read via get(); bracket lookup is always undefined
+    if (contentType && contentType.includes('application/pdf')) {
+      return fetchAndProcessPdf(url);
+    } else {
+      const data = await response.json();
+      const html = data.content;
+      return html ?? "";
+    }
   } catch (error) {
     console.error(`Error scraping with Puppeteer: ${error}`);
     return "";
@@ -173,7 +176,13 @@ export async function scrapSingleUrl(
             );
             return "";
           }
-          text = await response.text();
+
+          const contentType = response.headers.get('content-type'); // fetch Headers must be read via get(); bracket lookup is always undefined
+          if (contentType && contentType.includes('application/pdf')) {
+            return fetchAndProcessPdf(url);
+          } else {
+            text = await response.text();
+          }
         } catch (error) {
           console.error(`Error scraping URL: ${error}`);
           return "";
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index a72de30..ba92fd4 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -105,36 +105,4 @@ async function processPdf(file: string) {
   const fileContent = fs.readFileSync(file);
   const data = await pdf(fileContent);
   return data.text;
-}
-/**
- * Check if a url is a pdf
- * @param url The url to check
- * @param fastMode If true, the function will return false if the url is does not end with .pdf
- * @returns A promise that resolves to true if the url is a pdf, false otherwise
- */
-export async function isUrlAPdf({
-  url,
-  fastMode = false,
-}: {
-  url: string;
-  fastMode?: boolean;
-}): Promise<boolean> {
-  try {
-    if (url.endsWith(".pdf")) {
-      return true;
-    }
-    // If fast mode is enabled, we skip the HEAD request and return false
-    if (fastMode) {
-      return false;
-    }
-    const before = Date.now();
-    const response = await axios.head(url);
-    const after = Date.now();
-    console.log(`${after - before}ms - HEAD Request for ${url}`);
-    const contentType = response.headers['content-type'];
-    return contentType.includes('application/pdf');
-  } catch (error) {
-    // console.error("Error making HEAD request:", error);
-    return false;
-  }
-}
+}
\ No newline at end of file