From 140529c6090ece93ba60cdb3ce360f9a28b8ffb7 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 19 Apr 2024 13:05:21 -0700
Subject: [PATCH 01/12] Nick: fixes pdfs not found

---
 apps/api/src/scraper/WebScraper/index.ts      | 14 +++++------
 .../scraper/WebScraper/utils/pdfProcessor.ts  | 23 ++++++++++++++++++-
 2 files changed, 29 insertions(+), 8 deletions(-)
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index c2146be..47fa05c 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -5,7 +5,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
 import { WebCrawler } from "./crawler";
 import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/gptVision";
-import { fetchAndProcessPdf } from "./utils/pdfProcessor";
+import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor";
 import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
 
 
@@ -88,7 +88,7 @@ export class WebScraperDataProvider {
           }));
         }
 
-        let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
+        let pdfLinks = links.filter((link) => isUrlAPdf(link));
         let pdfDocuments: Document[] = [];
         for (let pdfLink of pdfLinks) {
           const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -98,7 +98,7 @@ export class WebScraperDataProvider {
             provider: "web-scraper"
           });
         }
-        links = links.filter((link) => !link.endsWith(".pdf"));
+        links = links.filter((link) => !isUrlAPdf(link));
 
         let documents = await this.convertUrlsToDocuments(links, inProgress);
         documents = await this.getSitemapData(this.urls[0], documents);
@@ -157,7 +157,7 @@ export class WebScraperDataProvider {
       }
 
       if (this.mode === "single_urls") {
-        let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf"));
+        let pdfLinks = this.urls.filter((link) => isUrlAPdf(link));
         let pdfDocuments: Document[] = [];
         for (let pdfLink of pdfLinks) {
           const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -169,7 +169,7 @@ export class WebScraperDataProvider {
         }
 
         let documents = await this.convertUrlsToDocuments(
-          this.urls.filter((link) => !link.endsWith(".pdf")),
+          this.urls.filter((link) => !isUrlAPdf(link)),
           inProgress
         );
 
@@ -193,7 +193,7 @@ export class WebScraperDataProvider {
       }
       if (this.mode === "sitemap") {
         let links = await getLinksFromSitemap(this.urls[0]);
-        let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
+        let pdfLinks = links.filter((link) => isUrlAPdf(link));
         let pdfDocuments: Document[] = [];
         for (let pdfLink of pdfLinks) {
           const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -203,7 +203,7 @@ export class WebScraperDataProvider {
             provider: "web-scraper"
           });
         }
-        links = links.filter((link) => !link.endsWith(".pdf"));
+        links = links.filter((link) => !isUrlAPdf(link));
 
         let documents = await this.convertUrlsToDocuments(
           links.slice(0, this.limit),
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index fb08d9c..75f80fb 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -35,6 +35,7 @@ async function downloadPdf(url: string): Promise<string> {
 }
 
 export async function processPdfToText(filePath: string): Promise<string> {
+
   let content = "";
 
   if (process.env.LLAMAPARSE_API_KEY) {
@@ -105,4 +106,24 @@ async function processPdf(file: string){
   const fileContent = fs.readFileSync(file);
   const data = await pdf(fileContent);
   return data.text;
-}
\ No newline at end of file
+}
+
+// fetchAndProcessPdf("https://www.fda.gov/media/167973/download?attachment").then((e)=>{
+//   console.log(e);
+// })
+
+export async function isUrlAPdf(url: string): Promise<boolean> {
+  try {
+    if (url.endsWith('.pdf')) {
+      return true;
+    }
+    const response = await fetch(url, { method: 'HEAD' });
+    const contentType = response.headers.get('Content-Type');
+    return contentType !== null && contentType.includes('application/pdf');
+  } catch (error) {
+    console.error('Error making HEAD request:', error);
+    return false;
+  }
+}
+
+

From 43cfcec326645bda17b04ecd5ec2372d3cb69d8d Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 19 Apr 2024 13:12:08 -0700
Subject: [PATCH 02/12] Nick: disabling in crawl and sitemap for now

---
 apps/api/src/scraper/WebScraper/index.ts             | 12 ++++++------
 .../api/src/scraper/WebScraper/utils/pdfProcessor.ts |  6 +++++-
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 47fa05c..58144ba 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -88,7 +88,7 @@ export class WebScraperDataProvider {
           }));
         }
 
-        let pdfLinks = links.filter((link) => isUrlAPdf(link));
+        let pdfLinks = links.filter((link) => isUrlAPdf({url: link, fastMode: true}));
         let pdfDocuments: Document[] = [];
         for (let pdfLink of pdfLinks) {
           const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -98,7 +98,7 @@ export class WebScraperDataProvider {
             provider: "web-scraper"
           });
         }
-        links = links.filter((link) => !isUrlAPdf(link));
+        links = links.filter((link) => !isUrlAPdf({url: link, fastMode: true}));
 
         let documents = await this.convertUrlsToDocuments(links, inProgress);
         documents = await this.getSitemapData(this.urls[0], documents);
@@ -157,7 +157,7 @@ export class WebScraperDataProvider {
       }
 
       if (this.mode === "single_urls") {
-        let pdfLinks = this.urls.filter((link) => isUrlAPdf(link));
+        let pdfLinks = this.urls.filter((link) => isUrlAPdf({url: link, fastMode: false}));
         let pdfDocuments: Document[] = [];
         for (let pdfLink of pdfLinks) {
           const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -169,7 +169,7 @@ export class WebScraperDataProvider {
         }
 
         let documents = await this.convertUrlsToDocuments(
-          this.urls.filter((link) => !isUrlAPdf(link)),
+          this.urls.filter((link) => !isUrlAPdf({url: link, fastMode: true})),
           inProgress
         );
 
@@ -193,7 +193,7 @@ export class WebScraperDataProvider {
       }
       if (this.mode === "sitemap") {
         let links = await getLinksFromSitemap(this.urls[0]);
-        let pdfLinks = links.filter((link) => isUrlAPdf(link));
+        let pdfLinks = links.filter((link) => isUrlAPdf({url: link, fastMode: true}));
         let pdfDocuments: Document[] = [];
         for (let pdfLink of pdfLinks) {
           const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -203,7 +203,7 @@ export class WebScraperDataProvider {
             provider: "web-scraper"
           });
         }
-        links = links.filter((link) => !isUrlAPdf(link));
+        links = links.filter((link) => !isUrlAPdf({url: link, fastMode: true}));
 
         let documents = await this.convertUrlsToDocuments(
           links.slice(0, this.limit),
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index 75f80fb..2d0203f 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -112,11 +112,15 @@ async function processPdf(file: string){
 //   console.log(e);
 // })
 
-export async function isUrlAPdf(url: string): Promise<boolean> {
+export async function isUrlAPdf({url, fastMode}: {url: string, fastMode: boolean}): Promise<boolean> {
   try {
     if (url.endsWith('.pdf')) {
       return true;
     }
+    // If fast mode is enabled, we skip the HEAD request and return false
+    if (fastMode) {
+      return false;
+    }
     const response = await fetch(url, { method: 'HEAD' });
     const contentType = response.headers.get('Content-Type');
     return contentType !== null && contentType.includes('application/pdf');

From c5cb268b61cd9b1fe7035b8d6a72bc80cfe3d4e2 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 19 Apr 2024 13:13:42 -0700
Subject: [PATCH 03/12] Update pdfProcessor.ts

---
 .../scraper/WebScraper/utils/pdfProcessor.ts  | 42 ++++++++++---------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index 2d0203f..80476e9 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -19,8 +19,8 @@ export async function fetchAndProcessPdf(url: string): Promise<string> {
 async function downloadPdf(url: string): Promise<string> {
   const response = await axios({
     url,
-    method: 'GET',
-    responseType: 'stream',
+    method: "GET",
+    responseType: "stream",
   });
 
   const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`);
@@ -29,13 +29,12 @@ async function downloadPdf(url: string): Promise<string> {
   response.data.pipe(writer);
 
   return new Promise((resolve, reject) => {
-    writer.on('finish', () => resolve(tempFilePath));
-    writer.on('error', reject);
+    writer.on("finish", () => resolve(tempFilePath));
+    writer.on("error", reject);
   });
 }
 
 export async function processPdfToText(filePath: string): Promise<string> {
-
   let content = "";
 
   if (process.env.LLAMAPARSE_API_KEY) {
@@ -102,32 +101,37 @@ export async function processPdfToText(filePath: string): Promise<string> {
   return content;
 }
 
-async function processPdf(file: string){
+async function processPdf(file: string) {
   const fileContent = fs.readFileSync(file);
   const data = await pdf(fileContent);
   return data.text;
 }
-
-// fetchAndProcessPdf("https://www.fda.gov/media/167973/download?attachment").then((e)=>{
-//   console.log(e);
-// })
-
-export async function isUrlAPdf({url, fastMode}: {url: string, fastMode: boolean}): Promise<boolean> {
+/**
+ * Check if a url is a pdf
+ * @param url The url to check
+ * @param fastMode If true, the function will return false if the url does not end with .pdf
+ * @returns A promise that resolves to true if the url is a pdf, false otherwise
+ */
+export async function isUrlAPdf({
+  url,
+  fastMode,
+}: {
+  url: string;
+  fastMode: boolean;
+}): Promise<boolean> {
   try {
-    if (url.endsWith('.pdf')) {
+    if (url.endsWith(".pdf")) {
       return true;
     }
     // If fast mode is enabled, we skip the HEAD request and return false
     if (fastMode) {
       return false;
     }
-    const response = await fetch(url, { method: 'HEAD' });
-    const contentType = response.headers.get('Content-Type');
-    return contentType !== null && contentType.includes('application/pdf');
+    const response = await fetch(url, { method: "HEAD" });
+    const contentType = response.headers.get("Content-Type");
+    return contentType !== null && contentType.includes("application/pdf");
   } catch (error) {
-    console.error('Error making HEAD request:', error);
+    console.error("Error making HEAD request:", error);
     return false;
   }
 }
-
-

From 5b937991491d24ebd0411e08ecd9601dc309a87e Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 19 Apr 2024 15:13:17 -0700
Subject: [PATCH 04/12] Nick: a bit faster

---
 apps/api/src/scraper/WebScraper/index.ts | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 58144ba..0dc68b0 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -157,19 +157,23 @@ export class WebScraperDataProvider {
       }
 
       if (this.mode === "single_urls") {
-        let pdfLinks = this.urls.filter((link) => isUrlAPdf({url: link, fastMode: false}));
         let pdfDocuments: Document[] = [];
-        for (let pdfLink of pdfLinks) {
-          const pdfContent = await fetchAndProcessPdf(pdfLink);
-          pdfDocuments.push({
-            content: pdfContent,
-            metadata: { sourceURL: pdfLink },
-            provider: "web-scraper"
-          });
+        let nonPdfUrls: string[] = [];
+        for (let url of this.urls) {
+          if (isUrlAPdf({url: url, fastMode: false})) {
+            const pdfContent = await fetchAndProcessPdf(url);
+            pdfDocuments.push({
+              content: pdfContent,
+              metadata: { sourceURL: url },
+              provider: "web-scraper"
+            });
+          } else {
+            nonPdfUrls.push(url);
+          }
         }
 
         let documents = await this.convertUrlsToDocuments(
-          this.urls.filter((link) => !isUrlAPdf({url: link, fastMode: true})),
+          nonPdfUrls,
           inProgress
         );
 

From 84cebf618bb316f7494b2c0c350a22d66528f698 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 19 Apr 2024 15:36:00 -0700
Subject: [PATCH 05/12] Nick:

---
 apps/api/src/__tests__/e2e/index.test.ts |  3 +++
 apps/api/src/scraper/WebScraper/index.ts | 13 ++++++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts
index 554453b..9e7a75f 100644
--- a/apps/api/src/__tests__/e2e/index.test.ts
+++ b/apps/api/src/__tests__/e2e/index.test.ts
@@ -42,6 +42,7 @@ describe('E2E Tests for API Routes', () => {
         .set('Authorization', `Bearer this_is_just_a_preview_token`)
         .set('Content-Type', 'application/json')
         .send({ url: 'https://firecrawl.dev' });
+
       expect(response.statusCode).toBe(200);
     }, 10000); // 10 seconds timeout
 
@@ -51,6 +52,8 @@ describe('E2E Tests for API Routes', () => {
         .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
         .set('Content-Type', 'application/json')
         .send({ url: 'https://firecrawl.dev' });
+        await new Promise((r) => setTimeout(r, 2000));
+
       expect(response.statusCode).toBe(200);
       expect(response.body).toHaveProperty('data');
       expect(response.body.data).toHaveProperty('content');
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 0dc68b0..9d9a236 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -88,7 +88,7 @@ export class WebScraperDataProvider {
           }));
         }
 
-        let pdfLinks = links.filter((link) => isUrlAPdf({url: link, fastMode: true}));
+        let pdfLinks = links.filter(async (link) => await isUrlAPdf({url: link, fastMode: true}));
         let pdfDocuments: Document[] = [];
         for (let pdfLink of pdfLinks) {
           const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -98,7 +98,7 @@ export class WebScraperDataProvider {
             provider: "web-scraper"
           });
         }
-        links = links.filter((link) => !isUrlAPdf({url: link, fastMode: true}));
+        links = links.filter(async (link) => !await isUrlAPdf({url: link, fastMode: true}));
 
         let documents = await this.convertUrlsToDocuments(links, inProgress);
         documents = await this.getSitemapData(this.urls[0], documents);
@@ -157,10 +157,12 @@ export class WebScraperDataProvider {
       }
 
       if (this.mode === "single_urls") {
+        console.log("Single urls mode");
         let pdfDocuments: Document[] = [];
         let nonPdfUrls: string[] = [];
         for (let url of this.urls) {
-          if (isUrlAPdf({url: url, fastMode: false})) {
+          console.log("Checking if url is a pdf", url);
+          if (await isUrlAPdf({url: url, fastMode: false})) {
             const pdfContent = await fetchAndProcessPdf(url);
             pdfDocuments.push({
               content: pdfContent,
@@ -169,6 +171,7 @@ export class WebScraperDataProvider {
             });
           } else {
             nonPdfUrls.push(url);
+            console.log("Fetching and processing url", url);
           }
         }
 
@@ -197,7 +200,7 @@ export class WebScraperDataProvider {
       }
       if (this.mode === "sitemap") {
         let links = await getLinksFromSitemap(this.urls[0]);
-        let pdfLinks = links.filter((link) => isUrlAPdf({url: link, fastMode: true}));
+        let pdfLinks = links.filter(async (link) => await isUrlAPdf({url: link, fastMode: true}));
         let pdfDocuments: Document[] = [];
         for (let pdfLink of pdfLinks) {
           const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -207,7 +210,7 @@ export class WebScraperDataProvider {
             provider: "web-scraper"
           });
         }
-        links = links.filter((link) => !isUrlAPdf({url: link, fastMode: true}));
+        links = links.filter(async (link) => !await isUrlAPdf({url: link, fastMode: true}));
 
         let documents = await this.convertUrlsToDocuments(
           links.slice(0, this.limit),

From f1dd97af0f0c98dd46b3355ccd488c420acdb97e Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 19 Apr 2024 15:37:27 -0700
Subject: [PATCH 06/12] Update index.ts

---
 apps/api/src/scraper/WebScraper/index.ts | 40 ++++++++++++++----------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 9d9a236..fe291fb 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -6,8 +6,10 @@ import { WebCrawler } from "./crawler";
 import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/gptVision";
 import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor";
-import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
-
+import {
+  replaceImgPathsWithAbsolutePaths,
+  replacePathsWithAbsolutePaths,
+} from "./utils/replacePaths";
 
 export class WebScraperDataProvider {
   private urls: string[] = [""];
@@ -36,8 +38,6 @@ export class WebScraperDataProvider {
   ): Promise<Document[]> {
     const totalUrls = urls.length;
     let processedUrls = 0;
-    console.log("Converting urls to documents");
-    console.log("Total urls", urls);
     const results: (Document | null)[] = new Array(urls.length).fill(null);
     for (let i = 0; i < urls.length; i += this.concurrentRequests) {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
@@ -88,17 +88,21 @@ export class WebScraperDataProvider {
           }));
         }
 
-        let pdfLinks = links.filter(async (link) => await isUrlAPdf({url: link, fastMode: true}));
+        let pdfLinks = links.filter(
+          async (link) => await isUrlAPdf({ url: link, fastMode: true })
+        );
         let pdfDocuments: Document[] = [];
         for (let pdfLink of pdfLinks) {
           const pdfContent = await fetchAndProcessPdf(pdfLink);
           pdfDocuments.push({
             content: pdfContent,
             metadata: { sourceURL: pdfLink },
-            provider: "web-scraper"
+            provider: "web-scraper",
           });
         }
-        links = links.filter(async (link) => !await isUrlAPdf({url: link, fastMode: true}));
+        links = links.filter(
+          async (link) => !(await isUrlAPdf({ url: link, fastMode: true }))
+        );
 
         let documents = await this.convertUrlsToDocuments(links, inProgress);
         documents = await this.getSitemapData(this.urls[0], documents);
@@ -157,21 +161,18 @@ export class WebScraperDataProvider {
       }
 
       if (this.mode === "single_urls") {
-        console.log("Single urls mode");
         let pdfDocuments: Document[] = [];
         let nonPdfUrls: string[] = [];
         for (let url of this.urls) {
-          console.log("Checking if url is a pdf", url);
-          if (await isUrlAPdf({url: url, fastMode: false})) {
+          if (await isUrlAPdf({ url: url, fastMode: false })) {
             const pdfContent = await fetchAndProcessPdf(url);
             pdfDocuments.push({
               content: pdfContent,
               metadata: { sourceURL: url },
-              provider: "web-scraper"
+              provider: "web-scraper",
             });
           } else {
             nonPdfUrls.push(url);
-            console.log("Fetching and processing url", url);
           }
         }
 
@@ -200,17 +201,21 @@ export class WebScraperDataProvider {
       }
       if (this.mode === "sitemap") {
         let links = await getLinksFromSitemap(this.urls[0]);
-        let pdfLinks = links.filter(async (link) => await isUrlAPdf({url: link, fastMode: true}));
+        let pdfLinks = links.filter(
+          async (link) => await isUrlAPdf({ url: link, fastMode: true })
+        );
         let pdfDocuments: Document[] = [];
         for (let pdfLink of pdfLinks) {
           const pdfContent = await fetchAndProcessPdf(pdfLink);
           pdfDocuments.push({
             content: pdfContent,
             metadata: { sourceURL: pdfLink },
-            provider: "web-scraper"
+            provider: "web-scraper",
           });
         }
-        links = links.filter(async (link) => !await isUrlAPdf({url: link, fastMode: true}));
+        links = links.filter(
+          async (link) => !(await isUrlAPdf({ url: link, fastMode: true }))
+        );
 
         let documents = await this.convertUrlsToDocuments(
           links.slice(0, this.limit),
@@ -377,8 +382,9 @@ export class WebScraperDataProvider {
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
-    this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
+    this.pageOptions = options.pageOptions ?? { onlyMainContent: false };
+    this.replaceAllPathsWithAbsolutePaths =
+      options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
 
     //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");

From f8b207793f6f48d3b974b43e27425477407b28a3 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 29 Apr 2024 15:15:32 -0300
Subject: [PATCH 07/12] changed the request to do a HEAD to check for a PDF
 instead

---
 apps/api/src/__tests__/e2e/index.test.ts      | 39 ++++++++++++++++++
 apps/api/src/scraper/WebScraper/index.ts      | 40 ++++++++++++-------
 .../scraper/WebScraper/utils/pdfProcessor.ts  | 12 +++---
 3 files changed, 70 insertions(+), 21 deletions(-)

diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts
index 9e7a75f..a652619 100644
--- a/apps/api/src/__tests__/e2e/index.test.ts
+++ b/apps/api/src/__tests__/e2e/index.test.ts
@@ -170,6 +170,45 @@ describe('E2E Tests for API Routes', () => {
         expect(completedResponse.body.data[0]).toHaveProperty('metadata');
         expect(completedResponse.body.data[0].content).toContain('🔥 FireCrawl');
     }, 60000); // 60 seconds
+
+    // it('should return a successful response for a valid crawl job with PDF content', async () => {
+
+    // });
+
+    it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post('/v0/crawl')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/abs/astro-ph/9301001', crawlerOptions: { limit: 5 }});
+      expect(crawlResponse.statusCode).toBe(200);
+
+      const response = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
+        expect(response.statusCode).toBe(200);
+        expect(response.body).toHaveProperty('status');
+        expect(response.body.status).toBe('active');
+
+        // wait for 30 seconds
+        await new Promise((r) => setTimeout(r, 60000));
+  
+        const completedResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
+        console.log(completedResponse.body.data)
+        expect(completedResponse.statusCode).toBe(200);
+        expect(completedResponse.body).toHaveProperty('status');
+        expect(completedResponse.body.status).toBe('completed');
+        expect(completedResponse.body).toHaveProperty('data');
+        expect(completedResponse.body.data.length).toBeGreaterThan(1);
+        expect(completedResponse.body.data[0]).toHaveProperty('content');
+        expect(completedResponse.body.data[0]).toHaveProperty('markdown');
+        expect(completedResponse.body.data[0]).toHaveProperty('metadata');
+        expect(completedResponse.body.data[0].content).toContain('The Peculiar Balmer Line Profiles of OQ 208');
+    }, 90000); // 60 seconds
+
+
   });
 
   describe('GET /is-production', () => {
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index fe291fb..2b02076 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -88,9 +88,17 @@ export class WebScraperDataProvider {
           }));
         }
 
-        let pdfLinks = links.filter(
-          async (link) => await isUrlAPdf({ url: link, fastMode: true })
-        );
+        let pdfLinks = [];
+        let notPdfLinks = [];
+        for (let link of links) {
+          if (await isUrlAPdf({ url: link })) {
+            pdfLinks.push(link);
+          } else {
+            notPdfLinks.push(link);
+          }
+        }
+
+        console.log("crawl", {pdfLinks})
         let pdfDocuments: Document[] = [];
         for (let pdfLink of pdfLinks) {
           const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -100,11 +108,8 @@ export class WebScraperDataProvider {
             provider: "web-scraper",
           });
         }
-        links = links.filter(
-          async (link) => !(await isUrlAPdf({ url: link, fastMode: true }))
-        );
 
-        let documents = await this.convertUrlsToDocuments(links, inProgress);
+        let documents = await this.convertUrlsToDocuments(notPdfLinks, inProgress);
         documents = await this.getSitemapData(this.urls[0], documents);
 
         if (this.replaceAllPathsWithAbsolutePaths) {
@@ -164,7 +169,7 @@ export class WebScraperDataProvider {
         let pdfDocuments: Document[] = [];
         let nonPdfUrls: string[] = [];
         for (let url of this.urls) {
-          if (await isUrlAPdf({ url: url, fastMode: false })) {
+          if (await isUrlAPdf({ url: url })) {
             const pdfContent = await fetchAndProcessPdf(url);
             pdfDocuments.push({
               content: pdfContent,
@@ -201,9 +206,17 @@ export class WebScraperDataProvider {
       }
       if (this.mode === "sitemap") {
         let links = await getLinksFromSitemap(this.urls[0]);
-        let pdfLinks = links.filter(
-          async (link) => await isUrlAPdf({ url: link, fastMode: true })
-        );
+
+        let pdfLinks = [];
+        let nonPdfLinks = [];
+        for (let link of links) {
+          if (await isUrlAPdf({ url: link })) {
+            pdfLinks.push(link);
+          } else {
+            nonPdfLinks.push(link);
+          }
+        }
+
         let pdfDocuments: Document[] = [];
         for (let pdfLink of pdfLinks) {
           const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -213,12 +226,9 @@ export class WebScraperDataProvider {
             provider: "web-scraper",
           });
         }
-        links = links.filter(
-          async (link) => !(await isUrlAPdf({ url: link, fastMode: true }))
-        );
 
         let documents = await this.convertUrlsToDocuments(
-          links.slice(0, this.limit),
+          nonPdfLinks.slice(0, this.limit),
           inProgress
         );
 
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index 80476e9..67fb134 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -114,10 +114,10 @@ async function processPdf(file: string) {
  */
 export async function isUrlAPdf({
   url,
-  fastMode,
+  fastMode = false,
 }: {
   url: string;
-  fastMode: boolean;
+  fastMode?: boolean;
 }): Promise<boolean> {
   try {
     if (url.endsWith(".pdf")) {
@@ -127,11 +127,11 @@ export async function isUrlAPdf({
     if (fastMode) {
       return false;
     }
-    const response = await fetch(url, { method: "HEAD" });
-    const contentType = response.headers.get("Content-Type");
-    return contentType !== null && contentType.includes("application/pdf");
+    const response = await axios.head(url);
+    const contentType = response.headers['content-type'];
+    return contentType.includes('application/pdf');
   } catch (error) {
-    console.error("Error making HEAD request:", error);
+    // console.error("Error making HEAD request:", error);
     return false;
   }
 }

From 35480bd2ad4554c64577d01e690922d7d72d974f Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 30 Apr 2024 10:40:32 -0300
Subject: [PATCH 08/12] Update index.test.ts

---
 apps/api/src/__tests__/e2e/index.test.ts | 49 ++++++++++++++++++++----
 1 file changed, 41 insertions(+), 8 deletions(-)

diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts
index a652619..0ceca19 100644
--- a/apps/api/src/__tests__/e2e/index.test.ts
+++ b/apps/api/src/__tests__/e2e/index.test.ts
@@ -61,6 +61,36 @@ describe('E2E Tests for API Routes', () => {
       expect(response.body.data).toHaveProperty('metadata');
       expect(response.body.data.content).toContain('🔥 FireCrawl');
     }, 30000); // 30 seconds timeout
+
+    it('should return a successful response for a valid scrape with PDF file', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' });
+      await new Promise((r) => setTimeout(r, 6000));
+      
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+    }, 30000); // 30 seconds
+
+    it('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' });
+      await new Promise((r) => setTimeout(r, 6000));
+      
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+    }, 30000); // 30 seconds
   });
 
   describe('POST /v0/crawl', () => {
@@ -180,7 +210,7 @@ describe('E2E Tests for API Routes', () => {
         .post('/v0/crawl')
         .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
         .set('Content-Type', 'application/json')
-        .send({ url: 'https://arxiv.org/abs/astro-ph/9301001', crawlerOptions: { limit: 5 }});
+        .send({ url: 'https://arxiv.org/abs/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }});
       expect(crawlResponse.statusCode).toBe(200);
 
       const response = await request(TEST_URL)
@@ -191,22 +221,25 @@ describe('E2E Tests for API Routes', () => {
         expect(response.body.status).toBe('active');
 
         // wait for 30 seconds
-        await new Promise((r) => setTimeout(r, 60000));
+        await new Promise((r) => setTimeout(r, 30000));
   
         const completedResponse = await request(TEST_URL)
         .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
         .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
-        console.log(completedResponse.body.data)
+
         expect(completedResponse.statusCode).toBe(200);
         expect(completedResponse.body).toHaveProperty('status');
         expect(completedResponse.body.status).toBe('completed');
         expect(completedResponse.body).toHaveProperty('data');
         expect(completedResponse.body.data.length).toBeGreaterThan(1);
-        expect(completedResponse.body.data[0]).toHaveProperty('content');
-        expect(completedResponse.body.data[0]).toHaveProperty('markdown');
-        expect(completedResponse.body.data[0]).toHaveProperty('metadata');
-        expect(completedResponse.body.data[0].content).toContain('The Peculiar Balmer Line Profiles of OQ 208');
-    }, 90000); // 60 seconds
+        expect(completedResponse.body.data).toEqual(
+          expect.arrayContaining([
+            expect.objectContaining({
+              content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.')
+            })
+          ])
+        );
+    }, 60000); // 60 seconds
 
 
   });

From f4348024c61e9ce15feeb0928d4d87a91a3f352e Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 13 May 2024 09:13:42 -0300
Subject: [PATCH 09/12] Added check during scraping to deal with pdfs

Checks if the URL is a PDF during the scraping process (single_url.ts).

TODO: Run integration tests - does this strategy affect the running time?

P.S. Some comments need to be removed if we decide to proceed with this strategy.
---
 .../src/__tests__/e2e_withAuth/index.test.ts  | 12 +++-----
 apps/api/src/scraper/WebScraper/index.ts      | 28 ++++++++++++++++++-
 apps/api/src/scraper/WebScraper/single_url.ts | 15 ++++++++--
 .../scraper/WebScraper/utils/pdfProcessor.ts  |  9 ++++--
 4 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index a49b169..d69a70b 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -117,7 +117,7 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty('content');
       expect(response.body.data).toHaveProperty('metadata');
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
-    }, 30000); // 30 seconds
+    }, 60000); // 60 seconds
   
     it('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
       const response = await request(TEST_URL)
@@ -132,7 +132,7 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty('content');
       expect(response.body.data).toHaveProperty('metadata');
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
-    }, 30000); // 30 seconds
+    }, 60000); // 60 seconds
   });
 
   describe("POST /v0/crawl", () => {
@@ -427,10 +427,8 @@ describe("E2E Tests for API Routes", () => {
       .send({ url: "https://jestjs.io" });
     expect(crawlResponse.statusCode).toBe(200);
 
-    
-
     // wait for 30 seconds
-    await new Promise((r) => setTimeout(r, 10000));
+    await new Promise((r) => setTimeout(r, 20000));
 
     const response = await request(TEST_URL)
       .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
@@ -439,7 +437,7 @@ describe("E2E Tests for API Routes", () => {
     expect(response.body).toHaveProperty("status");
     expect(response.body.status).toBe("cancelled");
 
-    await new Promise((r) => setTimeout(r, 20000));
+    await new Promise((r) => setTimeout(r, 10000));
 
     const completedResponse = await request(TEST_URL)
       .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
@@ -456,8 +454,6 @@ describe("E2E Tests for API Routes", () => {
     
   }, 60000); // 60 seconds
 
-  
-
   describe("POST /v0/scrape with LLM Extraction", () => {
     it("should extract data using LLM extraction mode", async () => {
       const response = await request(TEST_URL)
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 73eda44..de941e0 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -144,14 +144,23 @@ export class WebScraperDataProvider {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
 
+    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
+    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);    
+
     let documents = await this.processLinks(links, inProgress);
+    // documents.push(...pdfDocuments);
     return this.cacheAndFinalizeDocuments(documents, links);
   }
 
   private async handleSingleUrlsMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
-    let documents = await this.processLinks(this.urls, inProgress);
+    const links = this.urls;
+    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
+    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
+
+    let documents = await this.processLinks(links, inProgress);
+    // documents.push(...pdfDocuments);
     return documents;
   }
 
@@ -163,7 +172,11 @@ export class WebScraperDataProvider {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
 
+    // let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
+    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
+
     let documents = await this.processLinks(links, inProgress);
+    // documents.push(...pdfDocuments);
     return this.cacheAndFinalizeDocuments(documents, links);
   }
 
@@ -220,6 +233,19 @@ export class WebScraperDataProvider {
     );
   }
 
+  private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> {
+    const checks = links.map(async (link) => ({
+      link,
+      isPdf: await isUrlAPdf({ url: link })
+    }));
+  
+    const results = await Promise.all(checks);
+    const pdfLinks = results.filter(result => result.isPdf).map(result => result.link);
+    const notPdfLinks = results.filter(result => !result.isPdf).map(result => result.link);
+  
+    return [pdfLinks, notPdfLinks];
+  }
+
   private applyPathReplacements(documents: Document[]): Document[] {
     return this.replaceAllPathsWithAbsolutePaths
       ? replacePathsWithAbsolutePaths(documents)
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index c43ea40..33d8518 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -6,6 +6,7 @@ import { Document, PageOptions } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { excludeNonMainTags } from "./utils/excludeTags";
 import { urlSpecificParams } from "./utils/custom/website_params";
+import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 
 dotenv.config();
 
@@ -66,9 +67,17 @@ export async function scrapWithScrapingBee(
       );
       return "";
     }
-    const decoder = new TextDecoder();
-    const text = decoder.decode(response.data);
-    return text;
+    // Check the content type of the response
+    const contentType = response.headers['content-type'];
+    if (contentType && contentType.includes('application/pdf')) {
+      // Handle PDF content type
+      return fetchAndProcessPdf(url);
+    } else {
+      // Assume the content is text and decode it
+      const decoder = new TextDecoder();
+      const text = decoder.decode(response.data);
+      return text;
+    }
   } catch (error) {
     console.error(`Error scraping with Scraping Bee: ${error}`);
     return "";
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index 67fb134..a72de30 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -77,12 +77,12 @@ export async function processPdfToText(filePath: string): Promise<string> {
           } else {
             // If the status code is not 200, increment the attempt counter and wait
             attempt++;
-            await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds
+            await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
           }
         } catch (error) {
-          console.error("Error fetching result:", error);
+          console.error("Error fetching result:", error.data.detail || '');
           attempt++;
-          await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds before retrying
+          await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
           // You may want to handle specific errors differently
         }
       }
@@ -127,7 +127,10 @@ export async function isUrlAPdf({
     if (fastMode) {
       return false;
     }
+    const before = Date.now();
     const response = await axios.head(url);
+    const after = Date.now();
+    console.log(`${after - before}ms - HEAD Request for ${url}`);
     const contentType = response.headers['content-type'];
     return contentType.includes('application/pdf');
   } catch (error) {

From 8eb2e95f19b4f5389f8447ccbd961ce53dc1391a Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 13 May 2024 16:13:10 -0300
Subject: [PATCH 10/12] Cleaned up

---
 apps/api/src/scraper/WebScraper/index.ts      | 26 +-------------
 apps/api/src/scraper/WebScraper/single_url.ts | 23 +++++++++----
 .../scraper/WebScraper/utils/pdfProcessor.ts  | 34 +------------------
 3 files changed, 18 insertions(+), 65 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index de941e0..1d9656e 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -10,7 +10,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
 import { WebCrawler } from "./crawler";
 import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/imageDescription";
-import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor";
+import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import {
   replaceImgPathsWithAbsolutePaths,
   replacePathsWithAbsolutePaths,
@@ -144,11 +144,7 @@ export class WebScraperDataProvider {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
 
-    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
-    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);    
-
     let documents = await this.processLinks(links, inProgress);
-    // documents.push(...pdfDocuments);
     return this.cacheAndFinalizeDocuments(documents, links);
   }
 
@@ -156,11 +152,8 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
     const links = this.urls;
-    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
-    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
 
     let documents = await this.processLinks(links, inProgress);
-    // documents.push(...pdfDocuments);
     return documents;
   }
 
@@ -172,11 +165,7 @@ export class WebScraperDataProvider {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
 
-    // let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
-    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
-
     let documents = await this.processLinks(links, inProgress);
-    // documents.push(...pdfDocuments);
     return this.cacheAndFinalizeDocuments(documents, links);
   }
 
@@ -233,19 +222,6 @@ export class WebScraperDataProvider {
     );
   }
 
-  private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> {
-    const checks = links.map(async (link) => ({
-      link,
-      isPdf: await isUrlAPdf({ url: link })
-    }));
-  
-    const results = await Promise.all(checks);
-    const pdfLinks = results.filter(result => result.isPdf).map(result => result.link);
-    const notPdfLinks = results.filter(result => !result.isPdf).map(result => result.link);
-  
-    return [pdfLinks, notPdfLinks];
-  }
-
   private applyPathReplacements(documents: Document[]): Document[] {
     return this.replaceAllPathsWithAbsolutePaths
       ? replacePathsWithAbsolutePaths(documents)
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 33d8518..baf465e 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -67,13 +67,11 @@ export async function scrapWithScrapingBee(
       );
       return "";
     }
-    // Check the content type of the response
+    
     const contentType = response.headers['content-type'];
     if (contentType && contentType.includes('application/pdf')) {
-      // Handle PDF content type
       return fetchAndProcessPdf(url);
     } else {
-      // Assume the content is text and decode it
       const decoder = new TextDecoder();
       const text = decoder.decode(response.data);
       return text;
@@ -104,9 +102,14 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
       return "";
     }
 
-    const data = await response.json();
-    const html = data.content;
-    return html ?? "";
+    const contentType = response.headers['content-type'];
+    if (contentType && contentType.includes('application/pdf')) {
+      return fetchAndProcessPdf(url);
+    } else {
+      const data = await response.json();
+      const html = data.content;
+      return html ?? "";
+    }
   } catch (error) {
     console.error(`Error scraping with Puppeteer: ${error}`);
     return "";
@@ -173,7 +176,13 @@ export async function scrapSingleUrl(
             );
             return "";
           }
-          text = await response.text();
+
+          const contentType = response.headers['content-type'];
+          if (contentType && contentType.includes('application/pdf')) {
+            return fetchAndProcessPdf(url);
+          } else {
+            text = await response.text();
+          }
         } catch (error) {
           console.error(`Error scraping URL: ${error}`);
           return "";
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index a72de30..ba92fd4 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -105,36 +105,4 @@ async function processPdf(file: string) {
   const fileContent = fs.readFileSync(file);
   const data = await pdf(fileContent);
   return data.text;
-}
-/**
- * Check if a url is a pdf
- * @param url The url to check
- * @param fastMode If true, the function will return false if the url is does not end with .pdf
- * @returns A promise that resolves to true if the url is a pdf, false otherwise
- */
-export async function isUrlAPdf({
-  url,
-  fastMode = false,
-}: {
-  url: string;
-  fastMode?: boolean;
-}): Promise<boolean> {
-  try {
-    if (url.endsWith(".pdf")) {
-      return true;
-    }
-    // If fast mode is enabled, we skip the HEAD request and return false
-    if (fastMode) {
-      return false;
-    }
-    const before = Date.now();
-    const response = await axios.head(url);
-    const after = Date.now();
-    console.log(`${after - before}ms - HEAD Request for ${url}`);
-    const contentType = response.headers['content-type'];
-    return contentType.includes('application/pdf');
-  } catch (error) {
-    // console.error("Error making HEAD request:", error);
-    return false;
-  }
-}
+}
\ No newline at end of file

From eb88447e8b9b1b16739f3b1357622fcfb90c5d53 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 17 May 2024 10:00:05 -0700
Subject: [PATCH 11/12] Update index.test.ts

---
 .../src/__tests__/e2e_withAuth/index.test.ts  | 52 ++++++++++---------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index e6c4c48..3fe1022 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -587,22 +587,23 @@ describe("E2E Tests for API Routes", () => {
         .send({ url: 'https://arxiv.org/abs/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }});
       expect(crawlResponse.statusCode).toBe(200);
 
-      const response = await request(TEST_URL)
-        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
+      let isCompleted = false;
+      let completedResponse;
+
+      while (!isCompleted) {
+        const response = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
         expect(response.statusCode).toBe(200);
         expect(response.body).toHaveProperty('status');
-        expect(response.body.status).toBe('active');
 
-        // wait for 30 seconds
-        await new Promise((r) => setTimeout(r, 30000));
-
-        const completedResponse = await request(TEST_URL)
-        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
-
-        expect(completedResponse.statusCode).toBe(200);
-        expect(completedResponse.body).toHaveProperty('status');
+        if (response.body.status === 'completed') {
+          isCompleted = true;
+          completedResponse = response;
+        } else {
+          await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
+        }
+      }
         expect(completedResponse.body.status).toBe('completed');
         expect(completedResponse.body).toHaveProperty('data');
         expect(completedResponse.body.data.length).toBeGreaterThan(1);
@@ -626,18 +627,21 @@ describe("E2E Tests for API Routes", () => {
         });
       expect(crawlResponse.statusCode).toBe(200);
 
-      const response = await request(TEST_URL)
-        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-      expect(response.statusCode).toBe(200);
-      expect(response.body).toHaveProperty("status");
-      expect(response.body.status).toBe("active");
-      // wait for 60 seconds
-      await new Promise((r) => setTimeout(r, 60000));
-      const completedResponse = await request(TEST_URL)
-        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      let isCompleted = false;
+      let completedResponse;
 
+      while (!isCompleted) {
+        const response = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+        expect(response.statusCode).toBe(200);
+        expect(response.body).toHaveProperty("status");
+
+        if (response.body.status === "completed") {
+          isCompleted = true;
+          completedResponse = response;
+        }
+      }
       expect(completedResponse.statusCode).toBe(200);
       expect(completedResponse.body).toHaveProperty("status");
       expect(completedResponse.body.status).toBe("completed");

From 5be208f5950dec85823bf9c82ab4e9084249aec2 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 17 May 2024 10:40:44 -0700
Subject: [PATCH 12/12] Nick: fixed

---
 apps/api/src/__tests__/e2e_withAuth/index.test.ts     | 4 ++--
 apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 3fe1022..abe5c58 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -584,7 +584,7 @@ describe("E2E Tests for API Routes", () => {
         .post('/v0/crawl')
         .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
         .set('Content-Type', 'application/json')
-        .send({ url: 'https://arxiv.org/abs/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }});
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }});
       expect(crawlResponse.statusCode).toBe(200);
 
       let isCompleted = false;
@@ -606,7 +606,7 @@ describe("E2E Tests for API Routes", () => {
       }
         expect(completedResponse.body.status).toBe('completed');
         expect(completedResponse.body).toHaveProperty('data');
-        expect(completedResponse.body.data.length).toBeGreaterThan(1);
+        expect(completedResponse.body.data.length).toEqual(1);
         expect(completedResponse.body.data).toEqual(
           expect.arrayContaining([
             expect.objectContaining({
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index ba92fd4..7c57007 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -80,7 +80,7 @@ export async function processPdfToText(filePath: string): Promise<string> {
             await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
           }
         } catch (error) {
-          console.error("Error fetching result:", error.data.detail || '');
+          console.error("Error fetching result:", error || '');
           attempt++;
           await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
           // You may want to handle specific errors differently