[Feat] Adding pdf parser
This commit is contained in:
parent 50cf97c709
commit 57e5b36014
@@ -10,4 +10,4 @@ OPENAI_API_KEY=
 BULL_AUTH_KEY=
 LOGTAIL_KEY=
 PLAYWRIGHT_MICROSERVICE_URL=
+LLAMAPARSE_API_KEY=
@@ -60,6 +60,7 @@
     "date-fns": "^2.29.3",
     "dotenv": "^16.3.1",
     "express-rate-limit": "^6.7.0",
+    "form-data": "^4.0.0",
     "glob": "^10.3.12",
     "gpt3-tokenizer": "^1.1.5",
     "ioredis": "^5.3.2",
@@ -73,6 +74,7 @@
     "mongoose": "^8.0.3",
     "natural": "^6.3.0",
     "openai": "^4.28.4",
+    "pdf-parse": "^1.1.1",
     "pos": "^0.4.2",
     "promptable": "^0.0.9",
     "puppeteer": "^22.6.3",
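For a local checkout, the two new runtime dependencies above amount to running pnpm add pdf-parse form-data inside the api app (pnpm is assumed from the lockfile changes that follow).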
@@ -68,6 +68,9 @@ dependencies:
   express-rate-limit:
     specifier: ^6.7.0
     version: 6.11.2(express@4.18.3)
+  form-data:
+    specifier: ^4.0.0
+    version: 4.0.0
   glob:
     specifier: ^10.3.12
     version: 10.3.12
@@ -82,7 +85,7 @@ dependencies:
     version: 0.0.25
   langchain:
     specifier: ^0.1.25
-    version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2)
+    version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2)
   languagedetect:
     specifier: ^2.0.0
     version: 2.0.0
@@ -107,6 +110,9 @@ dependencies:
   openai:
     specifier: ^4.28.4
     version: 4.28.4
+  pdf-parse:
+    specifier: ^1.1.1
+    version: 1.1.1
   pos:
     specifier: ^0.4.2
     version: 0.4.2
@@ -2498,7 +2504,6 @@ packages:
     dependencies:
       ms: 2.1.3
       supports-color: 5.5.0
-    dev: true
 
   /debug@4.3.4:
     resolution: {integrity: sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==}
@@ -3997,7 +4002,7 @@ packages:
     engines: {node: '>=6'}
     dev: true
 
-  /langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2):
+  /langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2):
     resolution: {integrity: sha512-sfEChvr4H2CklHdSByNBbytwBrFhgtA5kPOnwcBrxuXGg1iOaTzhVxQA0QcNcQucI3hZrsNbZjxGp+Can1ooZQ==}
     engines: {node: '>=18'}
     peerDependencies:
@@ -4174,6 +4179,7 @@ packages:
       ml-distance: 4.0.1
       openapi-types: 12.1.3
       p-retry: 4.6.2
+      pdf-parse: 1.1.1
       puppeteer: 22.6.3(typescript@5.4.2)
       redis: 4.6.13
       uuid: 9.0.1
@@ -4653,6 +4659,10 @@ packages:
     resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==}
     engines: {node: '>=10.5.0'}
 
+  /node-ensure@0.0.0:
+    resolution: {integrity: sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==}
+    dev: false
+
   /node-fetch@2.7.0:
     resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==}
     engines: {node: 4.x || >=6.0.0}
@@ -4951,6 +4961,16 @@ packages:
   /path-to-regexp@0.1.7:
     resolution: {integrity: sha512-5DFkuoqlv1uYQKxy8omFBeJPQcdoE07Kv2sferDCrAq1ohOU+MSDswDIbnx3YAM60qIOnYa53wBhXW0EbMonrQ==}
 
+  /pdf-parse@1.1.1:
+    resolution: {integrity: sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A==}
+    engines: {node: '>=6.8.1'}
+    dependencies:
+      debug: 3.2.7(supports-color@5.5.0)
+      node-ensure: 0.0.0
+    transitivePeerDependencies:
+      - supports-color
+    dev: false
+
   /pend@1.2.0:
     resolution: {integrity: sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==}
     dev: false
@@ -39,6 +39,7 @@ export class Document {
     [key: string]: any;
   };
   childrenLinks?: string[];
+  provider?: string;
 
   constructor(data: Partial<Document>) {
     if (!data.content) {
@@ -51,5 +52,6 @@ export class Document {
     this.metadata = data.metadata || { sourceURL: "" };
     this.markdown = data.markdown || "";
     this.childrenLinks = data.childrenLinks || undefined;
+    this.provider = data.provider || undefined;
   }
 }
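A minimal sketch of how the new optional provider field is meant to be populated when a document is built from a parsed PDF, mirroring the pdfDocuments objects constructed later in this commit (the import path is an assumption):

import { Document } from "./entities"; // path assumed

// Documents produced from PDFs carry their origin in the new field.
const pdfDoc = new Document({
  content: "text extracted from the PDF",
  metadata: { sourceURL: "https://example.com/sample.pdf" }, // placeholder URL
  provider: "web",
});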
@@ -257,7 +257,7 @@ export class WebCrawler {
       ".js",
       ".ico",
       ".svg",
-      ".pdf",
+      // ".pdf",
       ".zip",
       ".exe",
       ".dmg",
@@ -5,6 +5,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
 import { WebCrawler } from "./crawler";
 import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/gptVision";
+import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 
 
 export class WebScraperDataProvider {
@@ -65,7 +66,7 @@
       throw new Error("Url is required");
     }
 
-    if (!useCaching) {
+    if (true) { // !useCaching) {
       if (this.mode === "crawl") {
         const crawler = new WebCrawler({
           initialUrl: this.urls[0],
@@ -75,7 +76,7 @@
           limit: this.limit,
           generateImgAltText: this.generateImgAltText,
         });
-        const links = await crawler.start(inProgress, 5, this.limit);
+        let links = await crawler.start(inProgress, 5, this.limit);
         if (this.returnOnlyUrls) {
           return links.map((url) => ({
             content: "",
@@ -84,12 +85,27 @@
             type: "text",
           }));
         }
 
+        let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
+        let pdfDocuments: Document[] = [];
+        for (let pdfLink of pdfLinks) {
+          const pdfContent = await fetchAndProcessPdf(pdfLink);
+          pdfDocuments.push({
+            content: pdfContent,
+            metadata: { sourceURL: pdfLink },
+            provider: "web",
+            type: "text",
+          });
+        }
+        links = links.filter((link) => !link.endsWith(".pdf"));
+
         let documents = await this.convertUrlsToDocuments(links, inProgress);
         documents = await this.getSitemapData(this.urls[0], documents);
         documents = this.replaceImgPathsWithAbsolutePaths(documents);
         if (this.generateImgAltText) {
           documents = await this.generatesImgAltText(documents);
         }
+        documents = documents.concat(pdfDocuments);
+
         // CACHING DOCUMENTS
         // - parent document
@@ -134,8 +150,20 @@
     }
 
     if (this.mode === "single_urls") {
+      let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf"));
+      let pdfDocuments: Document[] = [];
+      for (let pdfLink of pdfLinks) {
+        const pdfContent = await fetchAndProcessPdf(pdfLink);
+        pdfDocuments.push({
+          content: pdfContent,
+          metadata: { sourceURL: pdfLink },
+          provider: "web",
+          type: "text",
+        });
+      }
+
       let documents = await this.convertUrlsToDocuments(
-        this.urls,
+        this.urls.filter((link) => !link.endsWith(".pdf")),
         inProgress
       );
       documents = this.replaceImgPathsWithAbsolutePaths(documents);
@@ -144,6 +172,7 @@
       }
       const baseUrl = new URL(this.urls[0]).origin;
       documents = await this.getSitemapData(baseUrl, documents);
+      documents = documents.concat(pdfDocuments);
 
       await this.setCachedDocuments(documents);
       documents = this.removeChildLinks(documents);
@@ -151,7 +180,20 @@
       return documents;
     }
     if (this.mode === "sitemap") {
-      const links = await getLinksFromSitemap(this.urls[0]);
+      let links = await getLinksFromSitemap(this.urls[0]);
+      let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
+      let pdfDocuments: Document[] = [];
+      for (let pdfLink of pdfLinks) {
+        const pdfContent = await fetchAndProcessPdf(pdfLink);
+        pdfDocuments.push({
+          content: pdfContent,
+          metadata: { sourceURL: pdfLink },
+          provider: "web",
+          type: "text",
+        });
+      }
+      links = links.filter((link) => !link.endsWith(".pdf"));
+
       let documents = await this.convertUrlsToDocuments(
         links.slice(0, this.limit),
         inProgress
@@ -162,6 +204,7 @@
       if (this.generateImgAltText) {
         documents = await this.generatesImgAltText(documents);
       }
+      documents = documents.concat(pdfDocuments);
 
       await this.setCachedDocuments(documents);
       documents = this.removeChildLinks(documents);
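The filter-then-parse loop above is repeated almost verbatim in the crawl, single_urls, and sitemap branches; a hedged sketch of a private helper that could hold the shared logic (fetchPdfDocuments is a hypothetical name, not part of this commit):

// Hypothetical consolidation of the repeated PDF handling shown above.
private async fetchPdfDocuments(
  links: string[]
): Promise<{ pdfDocuments: Document[]; remaining: string[] }> {
  const pdfLinks = links.filter((link) => link.endsWith(".pdf"));
  const pdfDocuments: Document[] = [];
  for (const pdfLink of pdfLinks) {
    const pdfContent = await fetchAndProcessPdf(pdfLink);
    pdfDocuments.push({
      content: pdfContent,
      metadata: { sourceURL: pdfLink },
      provider: "web",
      type: "text",
    });
  }
  return { pdfDocuments, remaining: links.filter((link) => !link.endsWith(".pdf")) };
}

Each branch would then reduce to const { pdfDocuments, remaining } = await this.fetchPdfDocuments(links); followed by the existing conversion of remaining.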
@@ -0,0 +1,40 @@
+import * as pdfProcessor from '../pdfProcessor';
+
+describe('PDF Processing Module - Integration Test', () => {
+  it('should download and read a simple PDF file by URL', async () => {
+    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
+    expect(pdfContent).toEqual("Dummy PDF file");
+  });
+
+  it('should download and read a complex PDF file by URL', async () => {
+    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf');
+
+    const expectedContent = 'A Comprehensive Overview of Large Language Models\n' +
+      ' a a,∗ b,∗ c,d,∗ e,f e,f g,i\n' +
+      ' Humza Naveed , Asad Ullah Khan , Shi Qiu , Muhammad Saqib , Saeed Anwar , Muhammad Usman , Naveed Akhtar ,\n' +
+      ' Nick Barnes h, Ajmal Mian i\n' +
+      ' aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' +
+      ' bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' +
+      ' cUniversity of Technology Sydney (UTS), Sydney, Australia\n' +
+      ' dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' +
+      ' eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' +
+      ' fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' +
+      ' gThe University of Melbourne (UoM), Melbourne, Australia\n' +
+      ' hAustralian National University (ANU), Canberra, Australia\n' +
+      ' iThe University of Western Australia (UWA), Perth, Australia\n' +
+      ' Abstract\n' +
+      ' Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' +
+      ' beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' +
+      ' topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' +
+      ' robotics, datasets, benchmarking, efficiency, and more. With the rapid development of techniques and regular breakthroughs in\n' +
+      ' LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. Considering\n' +
+      ' the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' +
+      ' yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' +
+      ' on a broad range of LLM-related concepts. Our self-contained comprehensive overview of LLMs discusses relevant background\n' +
+      ' concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' +
+      ' provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' +
+      ' extensive informative summaries of the existing works to advance the LLM research.\n'
+    expect(pdfContent).toContain(expectedContent);
+  }, 60000);
+
+});
108 apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts Normal file
@@ -0,0 +1,108 @@
+import axios, { AxiosResponse } from "axios";
+import fs from "fs";
+import { createReadStream, createWriteStream } from "node:fs";
+import FormData from "form-data";
+import dotenv from "dotenv";
+import pdf from "pdf-parse";
+import path from "path";
+import os from "os";
+
+dotenv.config();
+
+export async function fetchAndProcessPdf(url: string): Promise<string> {
+  const tempFilePath = await downloadPdf(url);
+  const content = await processPdfToText(tempFilePath);
+  fs.unlinkSync(tempFilePath); // Clean up the temporary file
+  return content;
+}
+
+async function downloadPdf(url: string): Promise<string> {
+  const response = await axios({
+    url,
+    method: 'GET',
+    responseType: 'stream',
+  });
+
+  const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`);
+  const writer = createWriteStream(tempFilePath);
+
+  response.data.pipe(writer);
+
+  return new Promise((resolve, reject) => {
+    writer.on('finish', () => resolve(tempFilePath));
+    writer.on('error', reject);
+  });
+}
+
+export async function processPdfToText(filePath: string): Promise<string> {
+  let content = "";
+
+  if (process.env.LLAMAPARSE_API_KEY) {
+    const apiKey = process.env.LLAMAPARSE_API_KEY;
+    const headers = {
+      Authorization: `Bearer ${apiKey}`,
+    };
+    const base_url = "https://api.cloud.llamaindex.ai/api/parsing";
+    const fileType2 = "application/pdf";
+
+    try {
+      const formData = new FormData();
+      formData.append("file", createReadStream(filePath), {
+        filename: filePath,
+        contentType: fileType2,
+      });
+
+      const uploadUrl = `${base_url}/upload`;
+      const uploadResponse = await axios.post(uploadUrl, formData, {
+        headers: {
+          ...headers,
+          ...formData.getHeaders(),
+        },
+      });
+
+      const jobId = uploadResponse.data.id;
+      const resultType = "text";
+      const resultUrl = `${base_url}/job/${jobId}/result/${resultType}`;
+
+      let resultResponse: AxiosResponse;
+      let attempt = 0;
+      const maxAttempts = 10; // Maximum number of polling attempts
+      let resultAvailable = false;
+
+      while (attempt < maxAttempts && !resultAvailable) {
+        try {
+          resultResponse = await axios.get(resultUrl, { headers });
+          if (resultResponse.status === 200) {
+            resultAvailable = true; // Exit condition met
+          } else {
+            // If the status code is not 200, increment the attempt counter and wait
+            attempt++;
+            await new Promise((resolve) => setTimeout(resolve, 250)); // Wait 250 ms before polling again
+          }
+        } catch (error) {
+          console.error("Error fetching result:", error);
+          attempt++;
+          await new Promise((resolve) => setTimeout(resolve, 250)); // Wait 250 ms before retrying
+          // You may want to handle specific errors differently
+        }
+      }
+
+      if (!resultAvailable) {
+        return await processPdf(filePath); // Fall back to local parsing if LlamaParse never returned a result
+      }
+      content = resultResponse.data[resultType];
+    } catch (error) {
+      console.error("Error processing document:", filePath, error);
+      content = await processPdf(filePath);
+    }
+  } else {
+    content = await processPdf(filePath);
+  }
+  return content;
+}
+
+async function processPdf(file: string) {
+  const fileContent = fs.readFileSync(file);
+  const data = await pdf(fileContent);
+  return data.text;
+}
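A minimal usage sketch of the new module's exported entry point (the URL is a placeholder):

import { fetchAndProcessPdf } from "./pdfProcessor";

async function example() {
  // Downloads the PDF to a temp file, extracts its text (LlamaParse when
  // LLAMAPARSE_API_KEY is set, local pdf-parse otherwise), then deletes the temp file.
  const text = await fetchAndProcessPdf("https://example.com/sample.pdf"); // placeholder URL
  console.log(text.slice(0, 200));
}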