From 57e5b360142087390c6061408ccc1eb448bc6e78 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Thu, 18 Apr 2024 11:43:57 -0300
Subject: [PATCH] [Feat] Adding pdf parser

---
 apps/api/.env.local                           |   2 +-
 apps/api/package.json                         |   2 +
 apps/api/pnpm-lock.yaml                       |  26 ++++-
 apps/api/src/lib/entities.ts                  |   2 +
 apps/api/src/scraper/WebScraper/crawler.ts    |   2 +-
 apps/api/src/scraper/WebScraper/index.ts      |  51 ++++++++-
 .../utils/__tests__/pdfProcessor.test.ts      |  40 +++++++
 .../scraper/WebScraper/utils/pdfProcessor.ts  | 108 ++++++++++++++++++
 8 files changed, 224 insertions(+), 9 deletions(-)
 create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
 create mode 100644 apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts

diff --git a/apps/api/.env.local b/apps/api/.env.local
index 301c64b..852c5ed 100644
--- a/apps/api/.env.local
+++ b/apps/api/.env.local
@@ -10,4 +10,4 @@ OPENAI_API_KEY=
 BULL_AUTH_KEY=
 LOGTAIL_KEY=
 PLAYWRIGHT_MICROSERVICE_URL=
-
+LLAMAPARSE_API_KEY=
diff --git a/apps/api/package.json b/apps/api/package.json
index e8e5e02..1d12a96 100644
--- a/apps/api/package.json
+++ b/apps/api/package.json
@@ -60,6 +60,7 @@
     "date-fns": "^2.29.3",
     "dotenv": "^16.3.1",
     "express-rate-limit": "^6.7.0",
+    "form-data": "^4.0.0",
     "glob": "^10.3.12",
     "gpt3-tokenizer": "^1.1.5",
     "ioredis": "^5.3.2",
@@ -73,6 +74,7 @@
     "mongoose": "^8.0.3",
     "natural": "^6.3.0",
     "openai": "^4.28.4",
+    "pdf-parse": "^1.1.1",
     "pos": "^0.4.2",
     "promptable": "^0.0.9",
     "puppeteer": "^22.6.3",
diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml
index 8142189..fd0ffa0 100644
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
@@ -68,6 +68,9 @@ dependencies:
   express-rate-limit:
     specifier: ^6.7.0
     version: 6.11.2(express@4.18.3)
+  form-data:
+    specifier: ^4.0.0
+    version: 4.0.0
   glob:
     specifier: ^10.3.12
     version: 10.3.12
@@ -82,7 +85,7 @@ dependencies:
     version: 0.0.25
   langchain:
     specifier: ^0.1.25
-    version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2)
+    version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2)
   languagedetect:
     specifier: ^2.0.0
     version: 2.0.0
@@ -107,6 +110,9 @@ dependencies:
   openai:
     specifier: ^4.28.4
     version: 4.28.4
+  pdf-parse:
+    specifier: ^1.1.1
+    version: 1.1.1
   pos:
     specifier: ^0.4.2
     version: 0.4.2
@@ -2498,7 +2504,6 @@ packages:
     dependencies:
       ms: 2.1.3
       supports-color: 5.5.0
-    dev: true
 
   /debug@4.3.4:
     resolution: {integrity: sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==}
@@ -3997,7 +4002,7 @@ packages:
     engines: {node: '>=6'}
     dev: true
 
-  /langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2):
+  /langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2):
     resolution: {integrity: sha512-sfEChvr4H2CklHdSByNBbytwBrFhgtA5kPOnwcBrxuXGg1iOaTzhVxQA0QcNcQucI3hZrsNbZjxGp+Can1ooZQ==}
     engines: {node: '>=18'}
     peerDependencies:
@@ -4174,6 +4179,7 @@ packages:
       ml-distance: 4.0.1
       openapi-types: 12.1.3
       p-retry: 4.6.2
+      pdf-parse: 1.1.1
       puppeteer: 22.6.3(typescript@5.4.2)
       redis: 4.6.13
       uuid: 9.0.1
@@ -4653,6 +4659,10 @@ packages:
     resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==}
     engines: {node: '>=10.5.0'}
 
+  /node-ensure@0.0.0:
+    resolution: {integrity: sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==}
+    dev: false
+
   /node-fetch@2.7.0:
     resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==}
     engines: {node: 4.x || >=6.0.0}
@@ -4951,6 +4961,16 @@ packages:
   /path-to-regexp@0.1.7:
     resolution: {integrity: sha512-5DFkuoqlv1uYQKxy8omFBeJPQcdoE07Kv2sferDCrAq1ohOU+MSDswDIbnx3YAM60qIOnYa53wBhXW0EbMonrQ==}
 
+  /pdf-parse@1.1.1:
+    resolution: {integrity: sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A==}
+    engines: {node: '>=6.8.1'}
+    dependencies:
+      debug: 3.2.7(supports-color@5.5.0)
+      node-ensure: 0.0.0
+    transitivePeerDependencies:
+      - supports-color
+    dev: false
+
   /pend@1.2.0:
     resolution: {integrity: sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==}
     dev: false
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index c332914..d608756 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -39,6 +39,7 @@ export class Document {
     [key: string]: any;
   };
   childrenLinks?: string[];
+  provider?: string;
 
   constructor(data: Partial<Document>) {
     if (!data.content) {
@@ -51,5 +52,6 @@ export class Document {
     this.metadata = data.metadata || { sourceURL: "" };
     this.markdown = data.markdown || "";
     this.childrenLinks = data.childrenLinks || undefined;
+    this.provider = data.provider || undefined;
   }
 }
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 886efab..23cb629 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -257,7 +257,7 @@ export class WebCrawler {
       ".js",
       ".ico",
       ".svg",
-      ".pdf",
+      // ".pdf", 
       ".zip",
       ".exe",
       ".dmg",
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index fbfaa7b..4dda41c 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -5,6 +5,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
 import { WebCrawler } from "./crawler";
 import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/gptVision";
+import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 
 
 export class WebScraperDataProvider {
@@ -65,7 +66,7 @@ export class WebScraperDataProvider {
       throw new Error("Url is required");
     }
 
-    if (!useCaching) {
+    if (true) { // !useCaching) {
       if (this.mode === "crawl") {
         const crawler = new WebCrawler({
           initialUrl: this.urls[0],
@@ -75,7 +76,7 @@ export class WebScraperDataProvider {
           limit: this.limit,
           generateImgAltText: this.generateImgAltText,
         });
-        const links = await crawler.start(inProgress, 5, this.limit);
+        let links = await crawler.start(inProgress, 5, this.limit);
         if (this.returnOnlyUrls) {
           return links.map((url) => ({
             content: "",
@@ -84,12 +85,27 @@ export class WebScraperDataProvider {
             type: "text",
           }));
         }
+
+        let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
+        let pdfDocuments: Document[] = [];
+        for (let pdfLink of pdfLinks) {
+          const pdfContent = await fetchAndProcessPdf(pdfLink);
+          pdfDocuments.push({
+            content: pdfContent,
+            metadata: { sourceURL: pdfLink },
+            provider: "web",
+            type: "text",
+          });
+        }
+        links = links.filter((link) => !link.endsWith(".pdf"));
+
         let documents = await this.convertUrlsToDocuments(links, inProgress);
         documents = await this.getSitemapData(this.urls[0], documents);
         documents = this.replaceImgPathsWithAbsolutePaths(documents);
         if (this.generateImgAltText) {
           documents = await this.generatesImgAltText(documents);
         }
+        documents = documents.concat(pdfDocuments);
 
         // CACHING DOCUMENTS
         // - parent document
@@ -134,8 +150,20 @@ export class WebScraperDataProvider {
       }
 
       if (this.mode === "single_urls") {
+        let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf"));
+        let pdfDocuments: Document[] = [];
+        for (let pdfLink of pdfLinks) {
+          const pdfContent = await fetchAndProcessPdf(pdfLink);
+          pdfDocuments.push({
+            content: pdfContent,
+            metadata: { sourceURL: pdfLink },
+            provider: "web",
+            type: "text",
+          });
+        }
+
         let documents = await this.convertUrlsToDocuments(
-          this.urls,
+          this.urls.filter((link) => !link.endsWith(".pdf")),
           inProgress
         );
         documents = this.replaceImgPathsWithAbsolutePaths(documents);
@@ -144,6 +172,7 @@ export class WebScraperDataProvider {
         }
         const baseUrl = new URL(this.urls[0]).origin;
         documents = await this.getSitemapData(baseUrl, documents);
+        documents = documents.concat(pdfDocuments);
 
         await this.setCachedDocuments(documents);
         documents = this.removeChildLinks(documents);
@@ -151,7 +180,20 @@ export class WebScraperDataProvider {
         return documents;
       }
       if (this.mode === "sitemap") {
-        const links = await getLinksFromSitemap(this.urls[0]);
+        let links = await getLinksFromSitemap(this.urls[0]);
+        let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
+        let pdfDocuments: Document[] = [];
+        for (let pdfLink of pdfLinks) {
+          const pdfContent = await fetchAndProcessPdf(pdfLink);
+          pdfDocuments.push({
+            content: pdfContent,
+            metadata: { sourceURL: pdfLink },
+            provider: "web",
+            type: "text",
+          });
+        }
+        links = links.filter((link) => !link.endsWith(".pdf"));
+
         let documents = await this.convertUrlsToDocuments(
           links.slice(0, this.limit),
           inProgress
@@ -162,6 +204,7 @@ export class WebScraperDataProvider {
         if (this.generateImgAltText) {
           documents = await this.generatesImgAltText(documents);
         }
+        documents = documents.concat(pdfDocuments);
 
         await this.setCachedDocuments(documents);
         documents = this.removeChildLinks(documents);
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
new file mode 100644
index 0000000..7d25aec
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
@@ -0,0 +1,45 @@
+import * as pdfProcessor from '../pdfProcessor';
+
+// Both tests hit the network, so they get a timeout well above the test runner's default.
+const NETWORK_TEST_TIMEOUT_MS = 60000;
+
+describe('PDF Processing Module - Integration Test', () => {
+  it('should download and read a simple PDF file by URL', async () => {
+    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
+    // Compare the trimmed text: the local pdf-parse fallback is expected to pad its output with blank lines (TODO confirm).
+    expect(pdfContent.trim()).toEqual("Dummy PDF file");
+  }, NETWORK_TEST_TIMEOUT_MS);
+
+  it('should download and read a complex PDF file by URL', async () => {
+    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf');
+
+    // NOTE(review): the column spacing below looks like layout-preserving parser output;
+    // confirm which backend (LlamaParse vs. pdf-parse) this test is meant to run against.
+    const expectedContent = 'A Comprehensive Overview of Large Language Models\n' +
+    '                       a                        a,∗            b,∗                         c,d,∗                   e,f                           e,f                     g,i\n' +
+    '   Humza Naveed         , Asad Ullah Khan          , Shi Qiu     , Muhammad Saqib               , Saeed Anwar        , Muhammad Usman              , Naveed Akhtar         ,\n' +
+    '                                                                     Nick Barnes      h, Ajmal Mian      i\n' +
+    '                                                   aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' +
+    '                                                     bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' +
+    '                                                        cUniversity of Technology Sydney (UTS), Sydney, Australia\n' +
+    '                                       dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' +
+    '                                           eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' +
+    '                                   fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' +
+    '                                                        gThe University of Melbourne (UoM), Melbourne, Australia\n' +
+    '                                                       hAustralian National University (ANU), Canberra, Australia\n' +
+    '                                                       iThe University of Western Australia (UWA), Perth, Australia\n' +
+    '  Abstract\n' +
+    '     Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' +
+    '  beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' +
+    '  topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' +
+    '  robotics, datasets, benchmarking, efficiency, and more. With the rapid development of techniques and regular breakthroughs in\n' +
+    '  LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. Considering\n' +
+    '  the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' +
+    '  yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' +
+    '  on a broad range of LLM-related concepts. Our self-contained comprehensive overview of LLMs discusses relevant background\n' +
+    '  concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' +
+    '  provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' +
+    '  extensive informative summaries of the existing works to advance the LLM research.\n'
+    expect(pdfContent).toContain(expectedContent);
+  }, NETWORK_TEST_TIMEOUT_MS);
+});
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
new file mode 100644
index 0000000..fb08d9c
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -0,0 +1,180 @@
+import axios, { AxiosResponse } from "axios";
+import fs from "fs";
+import { createReadStream, createWriteStream } from "node:fs";
+import FormData from "form-data";
+import dotenv from "dotenv";
+import pdf from "pdf-parse";
+import path from "path";
+import os from "os";
+
+dotenv.config();
+
+/** Base URL of the LlamaParse parsing API. */
+const LLAMAPARSE_BASE_URL = "https://api.cloud.llamaindex.ai/api/parsing";
+/** Maximum number of times the LlamaParse result endpoint is polled. */
+const LLAMAPARSE_MAX_ATTEMPTS = 10;
+/** Pause between two polls of the LlamaParse result endpoint, in milliseconds. */
+const LLAMAPARSE_POLL_DELAY_MS = 250;
+
+/**
+ * Downloads the PDF at `url` to a temporary file and extracts its text.
+ *
+ * The temporary file is deleted whether or not extraction succeeds.
+ *
+ * @param url - Location of the PDF to fetch.
+ * @returns The text extracted from the PDF.
+ */
+export async function fetchAndProcessPdf(url: string): Promise<string> {
+  const tempFilePath = await downloadPdf(url);
+  try {
+    return await processPdfToText(tempFilePath);
+  } finally {
+    fs.unlinkSync(tempFilePath); // Clean up the temporary file
+  }
+}
+
+/**
+ * Streams the response for `url` into a uniquely named file in the OS
+ * temporary directory.
+ *
+ * @param url - Location of the PDF to download.
+ * @returns Path of the downloaded file; the caller must delete it.
+ */
+async function downloadPdf(url: string): Promise<string> {
+  const response = await axios({
+    url,
+    method: "GET",
+    responseType: "stream",
+  });
+
+  // The random suffix keeps two downloads started in the same millisecond
+  // from writing to the same file.
+  const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
+  const tempFilePath = path.join(os.tmpdir(), `tempPdf-${uniqueSuffix}.pdf`);
+  const writer = createWriteStream(tempFilePath);
+
+  return new Promise<string>((resolve, reject) => {
+    // pipe() does not forward source errors to the destination, so both
+    // streams are watched; otherwise an aborted download would leave this
+    // promise pending forever.
+    const fail = (error: Error) => {
+      writer.destroy();
+      // Remove the partial download; an unlink failure is ignored on purpose.
+      fs.unlink(tempFilePath, () => reject(error));
+    };
+    response.data.on("error", fail);
+    writer.on("error", fail);
+    writer.on("finish", () => resolve(tempFilePath));
+    response.data.pipe(writer);
+  });
+}
+
+/**
+ * Extracts the text of the PDF stored at `filePath`.
+ *
+ * When `LLAMAPARSE_API_KEY` is set the file is sent to LlamaParse first. If
+ * that fails, or no result is ready within the polling budget, the file is
+ * parsed locally with `pdf-parse` instead.
+ *
+ * @param filePath - Path of a PDF file on disk.
+ * @returns The extracted text.
+ */
+export async function processPdfToText(filePath: string): Promise<string> {
+  const apiKey = process.env.LLAMAPARSE_API_KEY;
+  if (!apiKey) {
+    return processPdf(filePath);
+  }
+
+  let remoteText: string | undefined;
+  try {
+    remoteText = await parseWithLlamaParse(filePath, apiKey);
+  } catch (error) {
+    console.error(
+      "Error processing document:",
+      filePath,
+      describeError(error)
+    );
+  }
+  // Fall back exactly once, outside the try block, so a failure of the local
+  // parser is reported to the caller instead of triggering a second parse.
+  return remoteText ?? processPdf(filePath);
+}
+
+/**
+ * Uploads the PDF to LlamaParse and polls for the plain-text result.
+ *
+ * @param filePath - Path of a PDF file on disk.
+ * @param apiKey - LlamaParse API key sent as a bearer token.
+ * @returns The parsed text, or `undefined` when no result became available
+ *   within `LLAMAPARSE_MAX_ATTEMPTS` polls.
+ */
+async function parseWithLlamaParse(
+  filePath: string,
+  apiKey: string
+): Promise<string | undefined> {
+  const headers = {
+    Authorization: `Bearer ${apiKey}`,
+  };
+  const resultType = "text";
+
+  const formData = new FormData();
+  formData.append("file", createReadStream(filePath), {
+    filename: filePath,
+    contentType: "application/pdf",
+  });
+
+  const uploadResponse = await axios.post(
+    `${LLAMAPARSE_BASE_URL}/upload`,
+    formData,
+    {
+      headers: {
+        ...headers,
+        ...formData.getHeaders(),
+      },
+    }
+  );
+
+  const jobId = uploadResponse.data.id;
+  const resultUrl = `${LLAMAPARSE_BASE_URL}/job/${jobId}/result/${resultType}`;
+
+  for (let attempt = 0; attempt < LLAMAPARSE_MAX_ATTEMPTS; attempt++) {
+    try {
+      const resultResponse: AxiosResponse = await axios.get(resultUrl, {
+        headers,
+      });
+      if (resultResponse.status === 200) {
+        return resultResponse.data[resultType];
+      }
+    } catch (error) {
+      console.error("Error fetching result:", describeError(error));
+    }
+    // The result is not ready yet (or the request failed): pause, then retry.
+    await new Promise((resolve) =>
+      setTimeout(resolve, LLAMAPARSE_POLL_DELAY_MS)
+    );
+  }
+
+  return undefined;
+}
+
+/**
+ * Reduces a caught value to a short message for logging.
+ *
+ * Axios errors carry the request configuration, including the
+ * `Authorization` header, so the error object itself is never logged.
+ */
+function describeError(error: unknown): string {
+  return error instanceof Error ? error.message : String(error);
+}
+
+/**
+ * Parses the PDF at `file` locally with `pdf-parse`.
+ *
+ * @param file - Path of a PDF file on disk.
+ * @returns The text content reported by `pdf-parse`.
+ */
+async function processPdf(file: string): Promise<string> {
+  const fileContent = await fs.promises.readFile(file);
+  const data = await pdf(fileContent);
+  return data.text;
+}