Nick: fixes pdfs not found
This commit is contained in:
parent 15cfc01f5d
commit 140529c609
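Summary of the change: previously the scraper only recognized PDFs by checking link.endsWith(".pdf"), so PDFs served from URLs without a ".pdf" suffix were missed. The commit replaces those checks with a new isUrlAPdf helper exported from utils/pdfProcessor, which first checks the extension and otherwise falls back to a HEAD request and looks for application/pdf in the Content-Type header (see the pdfProcessor hunks at the end of the diff).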
@@ -5,7 +5,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
 import { WebCrawler } from "./crawler";
 import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/gptVision";
-import { fetchAndProcessPdf } from "./utils/pdfProcessor";
+import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor";
 import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
@@ -88,7 +88,7 @@ export class WebScraperDataProvider {
 }));
 }
 
-let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
+let pdfLinks = links.filter((link) => isUrlAPdf(link));
 let pdfDocuments: Document[] = [];
 for (let pdfLink of pdfLinks) {
 const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -98,7 +98,7 @@ export class WebScraperDataProvider {
 provider: "web-scraper"
 });
 }
-links = links.filter((link) => !link.endsWith(".pdf"));
+links = links.filter((link) => !isUrlAPdf(link));
 
 let documents = await this.convertUrlsToDocuments(links, inProgress);
 documents = await this.getSitemapData(this.urls[0], documents);
@@ -157,7 +157,7 @@ export class WebScraperDataProvider {
 }
 
 if (this.mode === "single_urls") {
-let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf"));
+let pdfLinks = this.urls.filter((link) => isUrlAPdf(link));
 let pdfDocuments: Document[] = [];
 for (let pdfLink of pdfLinks) {
 const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -169,7 +169,7 @@ export class WebScraperDataProvider {
 }
 
 let documents = await this.convertUrlsToDocuments(
-this.urls.filter((link) => !link.endsWith(".pdf")),
+this.urls.filter((link) => !isUrlAPdf(link)),
 inProgress
 );
@@ -193,7 +193,7 @@ export class WebScraperDataProvider {
 }
 if (this.mode === "sitemap") {
 let links = await getLinksFromSitemap(this.urls[0]);
-let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
+let pdfLinks = links.filter((link) => isUrlAPdf(link));
 let pdfDocuments: Document[] = [];
 for (let pdfLink of pdfLinks) {
 const pdfContent = await fetchAndProcessPdf(pdfLink);
@@ -203,7 +203,7 @@ export class WebScraperDataProvider {
 provider: "web-scraper"
 });
 }
-links = links.filter((link) => !link.endsWith(".pdf"));
+links = links.filter((link) => !isUrlAPdf(link));
 
 let documents = await this.convertUrlsToDocuments(
 links.slice(0, this.limit),
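A side note on the filter calls above, not taken from the commit itself: isUrlAPdf is declared async and returns a Promise<boolean>, while Array.prototype.filter does not await its callback, so the returned Promise is always treated as truthy. A minimal sketch of one way to resolve the checks before splitting the list, reusing only the links array and the helper shown in this diff (the checks name is illustrative):

  // Sketch only: resolve the async PDF checks first, then split the URL list.
  const checks = await Promise.all(links.map((link) => isUrlAPdf(link)));
  let pdfLinks = links.filter((_, i) => checks[i]);   // URLs detected as PDFs
  links = links.filter((_, i) => !checks[i]);         // remaining non-PDF URLs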

@@ -35,6 +35,7 @@ async function downloadPdf(url: string): Promise<string> {
 }
 
 export async function processPdfToText(filePath: string): Promise<string> {
   let content = "";
 
   if (process.env.LLAMAPARSE_API_KEY) {
@@ -105,4 +106,24 @@ async function processPdf(file: string){
   const fileContent = fs.readFileSync(file);
   const data = await pdf(fileContent);
   return data.text;
 }
 }
+
+// fetchAndProcessPdf("https://www.fda.gov/media/167973/download?attachment").then((e)=>{
+//   console.log(e);
+// })
+
+export async function isUrlAPdf(url: string): Promise<boolean> {
+  try {
+    if (url.endsWith('.pdf')) {
+      return true;
+    }
+    const response = await fetch(url, { method: 'HEAD' });
+    const contentType = response.headers.get('Content-Type');
+    return contentType !== null && contentType.includes('application/pdf');
+  } catch (error) {
+    console.error('Error making HEAD request:', error);
+    return false;
+  }
+}
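For reference, a usage sketch of the new helper, reusing the URL from the commented-out test above; the variable name is illustrative, and the expected output assumes that endpoint answers a HEAD request with a Content-Type containing application/pdf:

  // Usage sketch: a PDF link without a ".pdf" suffix is detected via the HEAD request.
  const looksLikePdf = await isUrlAPdf("https://www.fda.gov/media/167973/download?attachment");
  console.log(looksLikePdf); // expected: true when the Content-Type reports a PDF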