0

Update pdfProcessor.ts

This commit is contained in:
Nicolas 2024-04-19 13:13:42 -07:00
parent 43cfcec326
commit c5cb268b61

View File

@ -19,8 +19,8 @@ export async function fetchAndProcessPdf(url: string): Promise<string> {
async function downloadPdf(url: string): Promise<string> { async function downloadPdf(url: string): Promise<string> {
const response = await axios({ const response = await axios({
url, url,
method: 'GET', method: "GET",
responseType: 'stream', responseType: "stream",
}); });
const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`); const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`);
@ -29,13 +29,12 @@ async function downloadPdf(url: string): Promise<string> {
response.data.pipe(writer); response.data.pipe(writer);
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
writer.on('finish', () => resolve(tempFilePath)); writer.on("finish", () => resolve(tempFilePath));
writer.on('error', reject); writer.on("error", reject);
}); });
} }
export async function processPdfToText(filePath: string): Promise<string> { export async function processPdfToText(filePath: string): Promise<string> {
let content = ""; let content = "";
if (process.env.LLAMAPARSE_API_KEY) { if (process.env.LLAMAPARSE_API_KEY) {
@ -107,27 +106,32 @@ async function processPdf(file: string){
const data = await pdf(fileContent); const data = await pdf(fileContent);
return data.text; return data.text;
} }
/**
// fetchAndProcessPdf("https://www.fda.gov/media/167973/download?attachment").then((e)=>{ * Check if a url is a pdf
// console.log(e); * @param url The url to check
// }) * @param fastMode If true, the function will return false if the url does not end with .pdf
* @returns A promise that resolves to true if the url is a pdf, false otherwise
export async function isUrlAPdf({url, fastMode}: {url: string, fastMode: boolean}): Promise<boolean> { */
export async function isUrlAPdf({
url,
fastMode,
}: {
url: string;
fastMode: boolean;
}): Promise<boolean> {
try { try {
if (url.endsWith('.pdf')) { if (url.endsWith(".pdf")) {
return true; return true;
} }
// If fast mode is enabled, we skip the HEAD request and return false // If fast mode is enabled, we skip the HEAD request and return false
if (fastMode) { if (fastMode) {
return false; return false;
} }
const response = await fetch(url, { method: 'HEAD' }); const response = await fetch(url, { method: "HEAD" });
const contentType = response.headers.get('Content-Type'); const contentType = response.headers.get("Content-Type");
return contentType !== null && contentType.includes('application/pdf'); return contentType !== null && contentType.includes("application/pdf");
} catch (error) { } catch (error) {
console.error('Error making HEAD request:', error); console.error("Error making HEAD request:", error);
return false; return false;
} }
} }