0

Nick: fixes pdfs not found

This commit is contained in:
Nicolas 2024-04-19 13:05:21 -07:00
parent 15cfc01f5d
commit 140529c609
2 changed files with 29 additions and 8 deletions

View File

@ -5,7 +5,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
import { WebCrawler } from "./crawler"; import { WebCrawler } from "./crawler";
import { getValue, setValue } from "../../services/redis"; import { getValue, setValue } from "../../services/redis";
import { getImageDescription } from "./utils/gptVision"; import { getImageDescription } from "./utils/gptVision";
import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor";
import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths"; import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
@ -88,7 +88,7 @@ export class WebScraperDataProvider {
})); }));
} }
let pdfLinks = links.filter((link) => link.endsWith(".pdf")); let pdfLinks = links.filter((link) => isUrlAPdf(link));
let pdfDocuments: Document[] = []; let pdfDocuments: Document[] = [];
for (let pdfLink of pdfLinks) { for (let pdfLink of pdfLinks) {
const pdfContent = await fetchAndProcessPdf(pdfLink); const pdfContent = await fetchAndProcessPdf(pdfLink);
@ -98,7 +98,7 @@ export class WebScraperDataProvider {
provider: "web-scraper" provider: "web-scraper"
}); });
} }
links = links.filter((link) => !link.endsWith(".pdf")); links = links.filter((link) => !isUrlAPdf(link));
let documents = await this.convertUrlsToDocuments(links, inProgress); let documents = await this.convertUrlsToDocuments(links, inProgress);
documents = await this.getSitemapData(this.urls[0], documents); documents = await this.getSitemapData(this.urls[0], documents);
@ -157,7 +157,7 @@ export class WebScraperDataProvider {
} }
if (this.mode === "single_urls") { if (this.mode === "single_urls") {
let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf")); let pdfLinks = this.urls.filter((link) => isUrlAPdf(link));
let pdfDocuments: Document[] = []; let pdfDocuments: Document[] = [];
for (let pdfLink of pdfLinks) { for (let pdfLink of pdfLinks) {
const pdfContent = await fetchAndProcessPdf(pdfLink); const pdfContent = await fetchAndProcessPdf(pdfLink);
@ -169,7 +169,7 @@ export class WebScraperDataProvider {
} }
let documents = await this.convertUrlsToDocuments( let documents = await this.convertUrlsToDocuments(
this.urls.filter((link) => !link.endsWith(".pdf")), this.urls.filter((link) => !isUrlAPdf(link)),
inProgress inProgress
); );
@ -193,7 +193,7 @@ export class WebScraperDataProvider {
} }
if (this.mode === "sitemap") { if (this.mode === "sitemap") {
let links = await getLinksFromSitemap(this.urls[0]); let links = await getLinksFromSitemap(this.urls[0]);
let pdfLinks = links.filter((link) => link.endsWith(".pdf")); let pdfLinks = links.filter((link) => isUrlAPdf(link));
let pdfDocuments: Document[] = []; let pdfDocuments: Document[] = [];
for (let pdfLink of pdfLinks) { for (let pdfLink of pdfLinks) {
const pdfContent = await fetchAndProcessPdf(pdfLink); const pdfContent = await fetchAndProcessPdf(pdfLink);
@ -203,7 +203,7 @@ export class WebScraperDataProvider {
provider: "web-scraper" provider: "web-scraper"
}); });
} }
links = links.filter((link) => !link.endsWith(".pdf")); links = links.filter((link) => !isUrlAPdf(link));
let documents = await this.convertUrlsToDocuments( let documents = await this.convertUrlsToDocuments(
links.slice(0, this.limit), links.slice(0, this.limit),

View File

@ -35,6 +35,7 @@ async function downloadPdf(url: string): Promise<string> {
} }
export async function processPdfToText(filePath: string): Promise<string> { export async function processPdfToText(filePath: string): Promise<string> {
let content = ""; let content = "";
if (process.env.LLAMAPARSE_API_KEY) { if (process.env.LLAMAPARSE_API_KEY) {
@ -105,4 +106,24 @@ async function processPdf(file: string){
const fileContent = fs.readFileSync(file); const fileContent = fs.readFileSync(file);
const data = await pdf(fileContent); const data = await pdf(fileContent);
return data.text; return data.text;
} }
// fetchAndProcessPdf("https://www.fda.gov/media/167973/download?attachment").then((e)=>{
// console.log(e);
// })
/**
 * Determines whether a URL refers to a PDF document.
 *
 * The check is done in two stages:
 *  1. A cheap, offline test of the URL text for a `.pdf` extension
 *     (case-insensitive, ignoring any query string or fragment).
 *  2. If that is inconclusive, a `HEAD` request is issued and the response's
 *     `Content-Type` header is inspected for `application/pdf`.
 *
 * NOTE: this function is asynchronous. Its result must be awaited before being
 * used as a condition; passing it directly as an `Array.prototype.filter`
 * predicate yields a Promise (always truthy) for every element.
 *
 * @param url - The URL (or path) to classify.
 * @returns `true` when the URL looks like, or is served as, a PDF; `false`
 *          otherwise, including when the `HEAD` request fails.
 */
export async function isUrlAPdf(url: string): Promise<boolean> {
  if (!url) {
    return false;
  }

  // Avoid a network round-trip whenever the URL text already answers the question.
  if (hasPdfExtension(url)) {
    return true;
  }

  try {
    const response = await fetch(url, { method: 'HEAD' });
    const contentType = response.headers.get('Content-Type');
    // Media types are case-insensitive, so normalise before comparing.
    return contentType !== null && contentType.toLowerCase().includes('application/pdf');
  } catch (error: unknown) {
    // Best effort: an unreachable or invalid URL is treated as "not a PDF".
    console.error('Error making HEAD request:', error);
    return false;
  }
}

/** Returns true when the URL text itself ends in `.pdf`, ignoring case, query string and fragment. */
function hasPdfExtension(url: string): boolean {
  const lowered = url.toLowerCase();
  // Raw suffix check also covers relative paths that `new URL` cannot parse.
  if (lowered.endsWith('.pdf')) {
    return true;
  }
  try {
    // `pathname` excludes `?query` and `#fragment`, e.g. "/a.pdf?download=1" -> "/a.pdf".
    return new URL(lowered).pathname.endsWith('.pdf');
  } catch {
    return false;
  }
}