From e37d15140428c5c2eec3a6126b2a25c86f08e23c Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 12 Jun 2024 15:06:47 -0300
Subject: [PATCH] added parsePDF option to pageOptions

Users can decide whether to let us handle PDF parsing or to parse the
PDF themselves.
---
 .../src/__tests__/e2e_withAuth/index.test.ts  | 15 ++++++++++++
 apps/api/src/controllers/crawl.ts             |  6 ++++-
 apps/api/src/controllers/scrape.ts            |  8 ++++++-
 apps/api/src/lib/entities.ts                  |  1 +
 .../WebScraper/custom/handleCustomScraping.ts |  2 --
 apps/api/src/scraper/WebScraper/index.ts      |  9 ++++++--
 apps/api/src/scraper/WebScraper/single_url.ts | 23 +++++++++++--------
 .../utils/__tests__/pdfProcessor.test.ts      |  2 +-
 .../scraper/WebScraper/utils/pdfProcessor.ts  | 12 ++++++----
 9 files changed, 57 insertions(+), 21 deletions(-)
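Note: the new e2e test below drives this option end to end. For reference,
a request that opts out of server-side parsing looks roughly like the
sketch below (TypeScript; `API_BASE_URL` and `API_KEY` are assumed
placeholders for your deployment, and the payload mirrors the e2e test):

    // With parsePDF: false, data.content carries the raw, unparsed PDF text.
    const res = await fetch(`${API_BASE_URL}/v0/scrape`, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${API_KEY}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
        pageOptions: { parsePDF: false },
      }),
    });
    const { data } = await res.json();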
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 02e4a47..4a1609b 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -136,6 +136,21 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
     }, 60000); // 60 seconds
 
+    it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
+    }, 60000); // 60 seconds
+
   // TODO: add this test back once we nail the waitFor option to be more deterministic
   // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
   //   const startTime = Date.now();
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 58d01e2..fc3fe28 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -56,7 +56,11 @@ export async function crawlController(req: Request, res: Response) {
 
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+    const pageOptions = req.body.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      parsePDF: true
+    };
 
     if (mode === "single_urls" && !url.includes(",")) {
       try {
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index d5ab1de..ed28639 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
       return res.status(status).json({ error });
     }
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
+    const pageOptions = req.body.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      waitFor: 0,
+      screenshot: false,
+      parsePDF: true
+    };
     const extractorOptions = req.body.extractorOptions ?? {
       mode: "markdown"
     }
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 81bf12c..d676584 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -19,6 +19,7 @@ export type PageOptions = {
   screenshot?: boolean;
   headers?: Record<string, string>;
   replaceAllPathsWithAbsolutePaths?: boolean;
+  parsePDF?: boolean
 };
 
 export type ExtractorOptions = {
diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
index 8108a9e..081150b 100644
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@@ -1,5 +1,3 @@
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-
 export async function handleCustomScraping(
   text: string,
   url: string
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index f432f43..f0f423a 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -280,7 +280,7 @@
   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
     return Promise.all(
       pdfLinks.map(async (pdfLink) => {
-        const pdfContent = await fetchAndProcessPdf(pdfLink);
+        const pdfContent = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
         return {
           content: pdfContent,
           metadata: { sourceURL: pdfLink },
@@ -475,7 +475,12 @@
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
+    this.pageOptions = options.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      replaceAllPathsWithAbsolutePaths: false,
+      parsePDF: true
+    };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index c2dcea1..8fa268f 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
   url: string,
   waitFor: number = 0,
   screenshot: boolean = false,
-  pageOptions: { scrollXPaths?: string[] } = {},
+  pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
   headers?: Record<string, string>,
   options?: any
 ): Promise<FireEngineResponse> {
@@ -88,7 +88,7 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return { html: await fetchAndProcessPdf(url), screenshot: "" };
+      return { html: await fetchAndProcessPdf(url, pageOptions?.parsePDF), screenshot: "" };
     } else {
       const data = response.data;
       const html = data.content;
@@ -108,7 +108,8 @@
 export async function scrapWithScrapingBee(
   url: string,
   wait_browser: string = "domcontentloaded",
-  timeout: number = universalTimeout
+  timeout: number = universalTimeout,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
@@ -129,7 +130,7 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const decoder = new TextDecoder();
       const text = decoder.decode(response.data);
@@ -144,7 +145,8 @@
 export async function scrapWithPlaywright(
   url: string,
   waitFor: number = 0,
-  headers?: Record<string, string>
+  headers?: Record<string, string>,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const reqParams = await generateRequestParams(url);
@@ -172,7 +174,7 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const textData = response.data;
       try {
@@ -194,7 +196,10 @@
   }
 }
 
-export async function scrapWithFetch(url: string): Promise<string> {
+export async function scrapWithFetch(
+  url: string,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<string> {
   try {
     const response = await axios.get(url, {
       headers: {
@@ -213,7 +218,7 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const text = response.data;
       return text;
@@ -371,7 +376,7 @@ export async function scrapSingleUrl(
         }
         break;
       case "pdf":
-        customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
+        customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF), screenshot }
         break;
     }
   }
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
index f14c8d4..f4ed3c6 100644
--- a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
@@ -3,7 +3,7 @@ import * as pdfProcessor from '../pdfProcessor';
 describe('PDF Processing Module - Integration Test', () => {
   it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
     delete process.env.LLAMAPARSE_API_KEY;
-    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
+    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
     expect(pdfContent.trim()).toEqual("Dummy PDF file");
   });
 
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index 71984f2..1f0d6e8 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -9,9 +9,9 @@ import os from "os";
 
 dotenv.config();
 
-export async function fetchAndProcessPdf(url: string): Promise<string> {
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<string> {
   const tempFilePath = await downloadPdf(url);
-  const content = await processPdfToText(tempFilePath);
+  const content = await processPdfToText(tempFilePath, parsePDF);
   fs.unlinkSync(tempFilePath); // Clean up the temporary file
   return content;
 }
@@ -34,10 +34,10 @@ async function downloadPdf(url: string): Promise<string> {
   });
 }
 
-export async function processPdfToText(filePath: string): Promise<string> {
+export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
   let content = "";
 
-  if (process.env.LLAMAPARSE_API_KEY) {
+  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
     const apiKey = process.env.LLAMAPARSE_API_KEY;
     const headers = {
       Authorization: `Bearer ${apiKey}`,
@@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
       console.error("Error processing pdf document w/ LlamaIndex(2)");
       content = await processPdf(filePath);
     }
-  } else {
+  } else if (parsePDF) {
     content = await processPdf(filePath);
+  } else {
+    content = fs.readFileSync(filePath, "utf-8");
   }
   return content;
 }
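Note: with `parsePDF: false`, `processPdfToText` now returns the PDF file read
as UTF-8 text instead of extracted content, which is why the new e2e test
asserts on raw PDF syntax ("/Title(...)>>endobj"). A minimal sketch of the
resulting branching (TypeScript; `llamaParse` and `localParse` are
hypothetical stand-ins for the LlamaParse call and the existing processPdf
fallback, not verbatim source):

    import fs from "fs";

    // Stand-ins for the real parsers in pdfProcessor.ts (assumed, not verbatim).
    const llamaParse = async (_path: string): Promise<string> => "text via LlamaParse";
    const localParse = async (_path: string): Promise<string> => "text via local parser";

    async function pdfContent(filePath: string, parsePDF: boolean): Promise<string> {
      if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
        return llamaParse(filePath);              // managed parsing when a key is set
      } else if (parsePDF) {
        return localParse(filePath);              // local fallback parser
      }
      return fs.readFileSync(filePath, "utf-8");  // opt-out: raw PDF bytes as text
    }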