From 57e5b360142087390c6061408ccc1eb448bc6e78 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Apr 2024 11:43:57 -0300 Subject: [PATCH] [Feat] Adding pdf parser --- apps/api/.env.local | 2 +- apps/api/package.json | 2 + apps/api/pnpm-lock.yaml | 26 ++++- apps/api/src/lib/entities.ts | 2 + apps/api/src/scraper/WebScraper/crawler.ts | 2 +- apps/api/src/scraper/WebScraper/index.ts | 51 ++++++++- .../utils/__tests__/pdfProcessor.test.ts | 40 +++++++ .../scraper/WebScraper/utils/pdfProcessor.ts | 108 ++++++++++++++++++ 8 files changed, 224 insertions(+), 9 deletions(-) create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts diff --git a/apps/api/.env.local b/apps/api/.env.local index 301c64b..852c5ed 100644 --- a/apps/api/.env.local +++ b/apps/api/.env.local @@ -10,4 +10,4 @@ OPENAI_API_KEY= BULL_AUTH_KEY= LOGTAIL_KEY= PLAYWRIGHT_MICROSERVICE_URL= - +LLAMAPARSE_API_KEY= diff --git a/apps/api/package.json b/apps/api/package.json index e8e5e02..1d12a96 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -60,6 +60,7 @@ "date-fns": "^2.29.3", "dotenv": "^16.3.1", "express-rate-limit": "^6.7.0", + "form-data": "^4.0.0", "glob": "^10.3.12", "gpt3-tokenizer": "^1.1.5", "ioredis": "^5.3.2", @@ -73,6 +74,7 @@ "mongoose": "^8.0.3", "natural": "^6.3.0", "openai": "^4.28.4", + "pdf-parse": "^1.1.1", "pos": "^0.4.2", "promptable": "^0.0.9", "puppeteer": "^22.6.3", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 8142189..fd0ffa0 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -68,6 +68,9 @@ dependencies: express-rate-limit: specifier: ^6.7.0 version: 6.11.2(express@4.18.3) + form-data: + specifier: ^4.0.0 + version: 4.0.0 glob: specifier: ^10.3.12 version: 10.3.12 @@ -82,7 +85,7 @@ dependencies: version: 0.0.25 langchain: specifier: ^0.1.25 - version: 
0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2) + version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2) languagedetect: specifier: ^2.0.0 version: 2.0.0 @@ -107,6 +110,9 @@ dependencies: openai: specifier: ^4.28.4 version: 4.28.4 + pdf-parse: + specifier: ^1.1.1 + version: 1.1.1 pos: specifier: ^0.4.2 version: 0.4.2 @@ -2498,7 +2504,6 @@ packages: dependencies: ms: 2.1.3 supports-color: 5.5.0 - dev: true /debug@4.3.4: resolution: {integrity: sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==} @@ -3997,7 +4002,7 @@ packages: engines: {node: '>=6'} dev: true - /langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2): + /langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2): resolution: {integrity: sha512-sfEChvr4H2CklHdSByNBbytwBrFhgtA5kPOnwcBrxuXGg1iOaTzhVxQA0QcNcQucI3hZrsNbZjxGp+Can1ooZQ==} engines: {node: '>=18'} peerDependencies: @@ -4174,6 +4179,7 @@ packages: ml-distance: 4.0.1 openapi-types: 12.1.3 p-retry: 4.6.2 + pdf-parse: 1.1.1 puppeteer: 22.6.3(typescript@5.4.2) redis: 4.6.13 uuid: 9.0.1 @@ -4653,6 +4659,10 @@ packages: resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==} engines: {node: '>=10.5.0'} + /node-ensure@0.0.0: + resolution: {integrity: sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==} + dev: false + /node-fetch@2.7.0: resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==} engines: {node: 4.x || >=6.0.0} @@ -4951,6 +4961,16 @@ packages: 
/path-to-regexp@0.1.7: resolution: {integrity: sha512-5DFkuoqlv1uYQKxy8omFBeJPQcdoE07Kv2sferDCrAq1ohOU+MSDswDIbnx3YAM60qIOnYa53wBhXW0EbMonrQ==} + /pdf-parse@1.1.1: + resolution: {integrity: sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A==} + engines: {node: '>=6.8.1'} + dependencies: + debug: 3.2.7(supports-color@5.5.0) + node-ensure: 0.0.0 + transitivePeerDependencies: + - supports-color + dev: false + /pend@1.2.0: resolution: {integrity: sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==} dev: false diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index c332914..d608756 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -39,6 +39,7 @@ export class Document { [key: string]: any; }; childrenLinks?: string[]; + provider?: string; constructor(data: Partial<Document>) { if (!data.content) { @@ -51,5 +52,6 @@ export class Document { this.metadata = data.metadata || { sourceURL: "" }; this.markdown = data.markdown || ""; this.childrenLinks = data.childrenLinks || undefined; + this.provider = data.provider || undefined; } } diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 886efab..23cb629 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -257,7 +257,7 @@ export class WebCrawler { ".js", ".ico", ".svg", - ".pdf", + // ".pdf", ".zip", ".exe", ".dmg", diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index fbfaa7b..4dda41c 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -5,6 +5,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap"; import { WebCrawler } from "./crawler"; import { getValue, setValue } from "../../services/redis"; import { getImageDescription } from "./utils/gptVision"; +import {
fetchAndProcessPdf } from "./utils/pdfProcessor"; export class WebScraperDataProvider { @@ -65,7 +66,7 @@ export class WebScraperDataProvider { throw new Error("Url is required"); } - if (!useCaching) { + if (true) { // !useCaching) { if (this.mode === "crawl") { const crawler = new WebCrawler({ initialUrl: this.urls[0], @@ -75,7 +76,7 @@ export class WebScraperDataProvider { limit: this.limit, generateImgAltText: this.generateImgAltText, }); - const links = await crawler.start(inProgress, 5, this.limit); + let links = await crawler.start(inProgress, 5, this.limit); if (this.returnOnlyUrls) { return links.map((url) => ({ content: "", @@ -84,12 +85,27 @@ export class WebScraperDataProvider { type: "text", })); } + + let pdfLinks = links.filter((link) => link.endsWith(".pdf")); + let pdfDocuments: Document[] = []; + for (let pdfLink of pdfLinks) { + const pdfContent = await fetchAndProcessPdf(pdfLink); + pdfDocuments.push({ + content: pdfContent, + metadata: { sourceURL: pdfLink }, + provider: "web", + type: "text", + }); + } + links = links.filter((link) => !link.endsWith(".pdf")); + let documents = await this.convertUrlsToDocuments(links, inProgress); documents = await this.getSitemapData(this.urls[0], documents); documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } + documents = documents.concat(pdfDocuments); // CACHING DOCUMENTS // - parent document @@ -134,8 +150,20 @@ export class WebScraperDataProvider { } if (this.mode === "single_urls") { + let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf")); + let pdfDocuments: Document[] = []; + for (let pdfLink of pdfLinks) { + const pdfContent = await fetchAndProcessPdf(pdfLink); + pdfDocuments.push({ + content: pdfContent, + metadata: { sourceURL: pdfLink }, + provider: "web", + type: "text", + }); + } + let documents = await this.convertUrlsToDocuments( - this.urls, + this.urls.filter((link) => 
!link.endsWith(".pdf")), inProgress ); documents = this.replaceImgPathsWithAbsolutePaths(documents); @@ -144,6 +172,7 @@ export class WebScraperDataProvider { } const baseUrl = new URL(this.urls[0]).origin; documents = await this.getSitemapData(baseUrl, documents); + documents = documents.concat(pdfDocuments); await this.setCachedDocuments(documents); documents = this.removeChildLinks(documents); @@ -151,7 +180,20 @@ export class WebScraperDataProvider { return documents; } if (this.mode === "sitemap") { - const links = await getLinksFromSitemap(this.urls[0]); + let links = await getLinksFromSitemap(this.urls[0]); + let pdfLinks = links.filter((link) => link.endsWith(".pdf")); + let pdfDocuments: Document[] = []; + for (let pdfLink of pdfLinks) { + const pdfContent = await fetchAndProcessPdf(pdfLink); + pdfDocuments.push({ + content: pdfContent, + metadata: { sourceURL: pdfLink }, + provider: "web", + type: "text", + }); + } + links = links.filter((link) => !link.endsWith(".pdf")); + let documents = await this.convertUrlsToDocuments( links.slice(0, this.limit), inProgress @@ -162,6 +204,7 @@ export class WebScraperDataProvider { if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } + documents = documents.concat(pdfDocuments); await this.setCachedDocuments(documents); documents = this.removeChildLinks(documents); diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts new file mode 100644 index 0000000..7d25aec --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts @@ -0,0 +1,40 @@ +import * as pdfProcessor from '../pdfProcessor'; + +describe('PDF Processing Module - Integration Test', () => { + it('should download and read a simple PDF file by URL', async () => { + const pdfContent = await 
pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf'); + expect(pdfContent).toEqual("Dummy PDF file"); + }); + + it('should download and read a complex PDF file by URL', async () => { + const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf'); + + const expectedContent = 'A Comprehensive Overview of Large Language Models\n' + + ' a a,∗ b,∗ c,d,∗ e,f e,f g,i\n' + + ' Humza Naveed , Asad Ullah Khan , Shi Qiu , Muhammad Saqib , Saeed Anwar , Muhammad Usman , Naveed Akhtar ,\n' + + ' Nick Barnes h, Ajmal Mian i\n' + + ' aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' + + ' bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' + + ' cUniversity of Technology Sydney (UTS), Sydney, Australia\n' + + ' dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' + + ' eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' + + ' fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' + + ' gThe University of Melbourne (UoM), Melbourne, Australia\n' + + ' hAustralian National University (ANU), Canberra, Australia\n' + + ' iThe University of Western Australia (UWA), Perth, Australia\n' + + ' Abstract\n' + + ' Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' + + ' beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' + + ' topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' + + ' robotics, datasets, benchmarking, efficiency, and more. 
With the rapid development of techniques and regular breakthroughs in\n' + + ' LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. Considering\n' + + ' the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' + + ' yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' + + ' on a broad range of LLM-related concepts. Our self-contained comprehensive overview of LLMs discusses relevant background\n' + + ' concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' + + ' provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' + + ' extensive informative summaries of the existing works to advance the LLM research.\n' + expect(pdfContent).toContain(expectedContent); + }, 60000); + +}); \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts new file mode 100644 index 0000000..fb08d9c --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -0,0 +1,108 @@ +import axios, { AxiosResponse } from "axios"; +import fs from "fs"; +import { createReadStream, createWriteStream } from "node:fs"; +import FormData from "form-data"; +import dotenv from "dotenv"; +import pdf from "pdf-parse"; +import path from "path"; +import os from "os"; + +dotenv.config(); + +export async function fetchAndProcessPdf(url: string): Promise<string> { + const tempFilePath = await downloadPdf(url); + const content = await processPdfToText(tempFilePath); + fs.unlinkSync(tempFilePath); // Clean up the temporary file + return content; +} + +async function downloadPdf(url: string): Promise<string> { + const response = await axios({ + url, +
method: 'GET', + responseType: 'stream', + }); + + const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`); + const writer = createWriteStream(tempFilePath); + + response.data.pipe(writer); + + return new Promise((resolve, reject) => { + writer.on('finish', () => resolve(tempFilePath)); + writer.on('error', reject); + }); +} + +export async function processPdfToText(filePath: string): Promise<string> { + let content = ""; + + if (process.env.LLAMAPARSE_API_KEY) { + const apiKey = process.env.LLAMAPARSE_API_KEY; + const headers = { + Authorization: `Bearer ${apiKey}`, + }; + const base_url = "https://api.cloud.llamaindex.ai/api/parsing"; + const fileType2 = "application/pdf"; + + try { + const formData = new FormData(); + formData.append("file", createReadStream(filePath), { + filename: filePath, + contentType: fileType2, + }); + + const uploadUrl = `${base_url}/upload`; + const uploadResponse = await axios.post(uploadUrl, formData, { + headers: { + ...headers, + ...formData.getHeaders(), + }, + }); + + const jobId = uploadResponse.data.id; + const resultType = "text"; + const resultUrl = `${base_url}/job/${jobId}/result/${resultType}`; + + let resultResponse: AxiosResponse; + let attempt = 0; + const maxAttempts = 10; // Maximum number of attempts + let resultAvailable = false; + + while (attempt < maxAttempts && !resultAvailable) { + try { + resultResponse = await axios.get(resultUrl, { headers }); + if (resultResponse.status === 200) { + resultAvailable = true; // Exit condition met + } else { + // If the status code is not 200, increment the attempt counter and wait + attempt++; + await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 250 ms + } + } catch (error) { + console.error("Error fetching result:", error); + attempt++; + await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 250 ms before retrying + // You may want to handle specific errors differently + } + } + + if (!resultAvailable) { + content = await
processPdf(filePath); + } + content = resultResponse.data[resultType]; + } catch (error) { + console.error("Error processing document:", filePath, error); + content = await processPdf(filePath); + } + } else { + content = await processPdf(filePath); + } + return content; +} + +async function processPdf(file: string){ + const fileContent = fs.readFileSync(file); + const data = await pdf(fileContent); + return data.text; +} \ No newline at end of file