From c5cb268b61cd9b1fe7035b8d6a72bc80cfe3d4e2 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 19 Apr 2024 13:13:42 -0700 Subject: [PATCH] Update pdfProcessor.ts --- .../scraper/WebScraper/utils/pdfProcessor.ts | 42 ++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 2d0203f..80476e9 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -19,8 +19,8 @@ export async function fetchAndProcessPdf(url: string): Promise { async function downloadPdf(url: string): Promise { const response = await axios({ url, - method: 'GET', - responseType: 'stream', + method: "GET", + responseType: "stream", }); const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`); @@ -29,13 +29,12 @@ async function downloadPdf(url: string): Promise { response.data.pipe(writer); return new Promise((resolve, reject) => { - writer.on('finish', () => resolve(tempFilePath)); - writer.on('error', reject); + writer.on("finish", () => resolve(tempFilePath)); + writer.on("error", reject); }); } export async function processPdfToText(filePath: string): Promise { - let content = ""; if (process.env.LLAMAPARSE_API_KEY) { @@ -102,32 +101,37 @@ export async function processPdfToText(filePath: string): Promise { return content; } -async function processPdf(file: string){ +async function processPdf(file: string) { const fileContent = fs.readFileSync(file); const data = await pdf(fileContent); return data.text; } - -// fetchAndProcessPdf("https://www.fda.gov/media/167973/download?attachment").then((e)=>{ -// console.log(e); -// }) - -export async function isUrlAPdf({url, fastMode}: {url: string, fastMode: boolean}): Promise { +/** + * Check if a url is a pdf + * @param url The url to check + * @param fastMode If true, the function will return false if the url does not end with 
.pdf + * @returns A promise that resolves to true if the url is a pdf, false otherwise + */ +export async function isUrlAPdf({ + url, + fastMode, +}: { + url: string; + fastMode: boolean; +}): Promise { try { - if (url.endsWith('.pdf')) { + if (url.endsWith(".pdf")) { return true; } // If fast mode is enabled, we skip the HEAD request and return false if (fastMode) { return false; } - const response = await fetch(url, { method: 'HEAD' }); - const contentType = response.headers.get('Content-Type'); - return contentType !== null && contentType.includes('application/pdf'); + const response = await fetch(url, { method: "HEAD" }); + const contentType = response.headers.get("Content-Type"); + return contentType !== null && contentType.includes("application/pdf"); } catch (error) { - console.error('Error making HEAD request:', error); + console.error("Error making HEAD request:", error); return false; } } - -