Update pdfProcessor.ts
This commit is contained in:
parent
43cfcec326
commit
c5cb268b61
@ -19,8 +19,8 @@ export async function fetchAndProcessPdf(url: string): Promise<string> {
|
|||||||
async function downloadPdf(url: string): Promise<string> {
|
async function downloadPdf(url: string): Promise<string> {
|
||||||
const response = await axios({
|
const response = await axios({
|
||||||
url,
|
url,
|
||||||
method: 'GET',
|
method: "GET",
|
||||||
responseType: 'stream',
|
responseType: "stream",
|
||||||
});
|
});
|
||||||
|
|
||||||
const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`);
|
const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`);
|
||||||
@ -29,13 +29,12 @@ async function downloadPdf(url: string): Promise<string> {
|
|||||||
response.data.pipe(writer);
|
response.data.pipe(writer);
|
||||||
|
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
writer.on('finish', () => resolve(tempFilePath));
|
writer.on("finish", () => resolve(tempFilePath));
|
||||||
writer.on('error', reject);
|
writer.on("error", reject);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function processPdfToText(filePath: string): Promise<string> {
|
export async function processPdfToText(filePath: string): Promise<string> {
|
||||||
|
|
||||||
let content = "";
|
let content = "";
|
||||||
|
|
||||||
if (process.env.LLAMAPARSE_API_KEY) {
|
if (process.env.LLAMAPARSE_API_KEY) {
|
||||||
@ -102,32 +101,37 @@ export async function processPdfToText(filePath: string): Promise<string> {
|
|||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function processPdf(file: string){
|
async function processPdf(file: string) {
|
||||||
const fileContent = fs.readFileSync(file);
|
const fileContent = fs.readFileSync(file);
|
||||||
const data = await pdf(fileContent);
|
const data = await pdf(fileContent);
|
||||||
return data.text;
|
return data.text;
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
// fetchAndProcessPdf("https://www.fda.gov/media/167973/download?attachment").then((e)=>{
|
* Check if a url is a pdf
|
||||||
// console.log(e);
|
* @param url The url to check
|
||||||
// })
|
* @param fastMode If true, the HEAD request is skipped and the function returns false when the url does not end with .pdf
|
||||||
|
* @returns A promise that resolves to true if the url is a pdf, false otherwise
|
||||||
export async function isUrlAPdf({url, fastMode}: {url: string, fastMode: boolean}): Promise<boolean> {
|
*/
|
||||||
|
export async function isUrlAPdf({
|
||||||
|
url,
|
||||||
|
fastMode,
|
||||||
|
}: {
|
||||||
|
url: string;
|
||||||
|
fastMode: boolean;
|
||||||
|
}): Promise<boolean> {
|
||||||
try {
|
try {
|
||||||
if (url.endsWith('.pdf')) {
|
if (url.endsWith(".pdf")) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
// If fast mode is enabled, we skip the HEAD request and return false
|
// If fast mode is enabled, we skip the HEAD request and return false
|
||||||
if (fastMode) {
|
if (fastMode) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const response = await fetch(url, { method: 'HEAD' });
|
const response = await fetch(url, { method: "HEAD" });
|
||||||
const contentType = response.headers.get('Content-Type');
|
const contentType = response.headers.get("Content-Type");
|
||||||
return contentType !== null && contentType.includes('application/pdf');
|
return contentType !== null && contentType.includes("application/pdf");
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error making HEAD request:', error);
|
console.error("Error making HEAD request:", error);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user