From 140529c6090ece93ba60cdb3ce360f9a28b8ffb7 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 19 Apr 2024 13:05:21 -0700
Subject: [PATCH] Nick: fixes pdfs not found

---
Review notes:
- isUrlAPdf() is async. Used directly as an Array.prototype.filter predicate
  it yields a Promise, which is always truthy, so every link was classified
  as a PDF and the non-PDF list came out empty. Each call site now awaits all
  checks with Promise.all and filters by the resolved flags.
- Line breaks, indentation and the <...> type arguments in this patch were
  reconstructed, and the post-image blob ids on the "index" lines predate the
  changes above. Regenerate with git format-patch before applying.

 apps/api/src/scraper/WebScraper/index.ts      | 20 ++++++----
 .../scraper/WebScraper/utils/pdfProcessor.ts  | 36 ++++++++++++++++++-
 2 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index c2146be..47fa05c 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -5,7 +5,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
 import { WebCrawler } from "./crawler";
 import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/gptVision";
-import { fetchAndProcessPdf } from "./utils/pdfProcessor";
+import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor";
 import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
 
 
@@ -88,7 +88,9 @@ export class WebScraperDataProvider {
         }));
       }
 
-      let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
+      // isUrlAPdf is async, so resolve every check before filtering.
+      const pdfFlags = await Promise.all(links.map((link) => isUrlAPdf(link)));
+      let pdfLinks = links.filter((_, index) => pdfFlags[index]);
       let pdfDocuments: Document[] = [];
       for (let pdfLink of pdfLinks) {
         const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -98,7 +100,7 @@ export class WebScraperDataProvider {
           provider: "web-scraper"
         });
       }
-      links = links.filter((link) => !link.endsWith(".pdf"));
+      links = links.filter((_, index) => !pdfFlags[index]);
 
       let documents = await this.convertUrlsToDocuments(links, inProgress);
       documents = await this.getSitemapData(this.urls[0], documents);
@@ -157,7 +159,9 @@ export class WebScraperDataProvider {
     }
 
     if (this.mode === "single_urls") {
-      let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf"));
+      // isUrlAPdf is async, so resolve every check before filtering.
+      const pdfFlags = await Promise.all(this.urls.map((link) => isUrlAPdf(link)));
+      let pdfLinks = this.urls.filter((_, index) => pdfFlags[index]);
       let pdfDocuments: Document[] = [];
       for (let pdfLink of pdfLinks) {
         const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -169,7 +173,7 @@ export class WebScraperDataProvider {
       }
 
       let documents = await this.convertUrlsToDocuments(
-        this.urls.filter((link) => !link.endsWith(".pdf")),
+        this.urls.filter((_, index) => !pdfFlags[index]),
         inProgress
       );
 
@@ -193,7 +197,9 @@ export class WebScraperDataProvider {
     }
     if (this.mode === "sitemap") {
       let links = await getLinksFromSitemap(this.urls[0]);
-      let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
+      // isUrlAPdf is async, so resolve every check before filtering.
+      const pdfFlags = await Promise.all(links.map((link) => isUrlAPdf(link)));
+      let pdfLinks = links.filter((_, index) => pdfFlags[index]);
       let pdfDocuments: Document[] = [];
       for (let pdfLink of pdfLinks) {
         const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -203,7 +209,7 @@ export class WebScraperDataProvider {
           provider: "web-scraper"
         });
       }
-      links = links.filter((link) => !link.endsWith(".pdf"));
+      links = links.filter((_, index) => !pdfFlags[index]);
 
       let documents = await this.convertUrlsToDocuments(
         links.slice(0, this.limit),
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index fb08d9c..75f80fb 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -35,6 +35,7 @@ async function downloadPdf(url: string): Promise<string> {
 }
 
 export async function processPdfToText(filePath: string): Promise<string> {
+
   let content = "";
 
   if (process.env.LLAMAPARSE_API_KEY) {
@@ -105,4 +106,37 @@ async function processPdf(file: string){
   const fileContent = fs.readFileSync(file);
   const data = await pdf(fileContent);
   return data.text;
-}
\ No newline at end of file
+}
+
+// fetchAndProcessPdf("https://www.fda.gov/media/167973/download?attachment").then((e)=>{
+//   console.log(e);
+// })
+
+/**
+ * Reports whether a URL points at a PDF document.
+ *
+ * A URL ending in ".pdf" is accepted without any network traffic. Otherwise a
+ * HEAD request is issued and the response's Content-Type header is checked
+ * for "application/pdf".
+ *
+ * This function is async: await the result before using it. Passing it
+ * directly to Array.prototype.filter keeps every element, because the
+ * returned Promise is always truthy.
+ *
+ * @param url - URL to inspect.
+ * @returns true when the URL is a PDF; false otherwise, including when the
+ * HEAD request fails (the error is logged, not rethrown).
+ */
+export async function isUrlAPdf(url: string): Promise<boolean> {
+  try {
+    if (url.endsWith('.pdf')) {
+      return true;
+    }
+    const response = await fetch(url, { method: 'HEAD' });
+    const contentType = response.headers.get('Content-Type');
+    return contentType !== null && contentType.includes('application/pdf');
+  } catch (error) {
+    console.error('Error making HEAD request:', error);
+    return false;
+  }
+}