Nick: fixes pdfs not found
This commit is contained in:
parent
15cfc01f5d
commit
140529c609
@ -5,7 +5,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
|||||||
import { WebCrawler } from "./crawler";
|
import { WebCrawler } from "./crawler";
|
||||||
import { getValue, setValue } from "../../services/redis";
|
import { getValue, setValue } from "../../services/redis";
|
||||||
import { getImageDescription } from "./utils/gptVision";
|
import { getImageDescription } from "./utils/gptVision";
|
||||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor";
|
||||||
import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
|
import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
|
||||||
|
|
||||||
|
|
||||||
@ -88,7 +88,7 @@ export class WebScraperDataProvider {
|
|||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
let pdfLinks = links.filter((link) => isUrlAPdf(link));
|
||||||
let pdfDocuments: Document[] = [];
|
let pdfDocuments: Document[] = [];
|
||||||
for (let pdfLink of pdfLinks) {
|
for (let pdfLink of pdfLinks) {
|
||||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||||
@ -98,7 +98,7 @@ export class WebScraperDataProvider {
|
|||||||
provider: "web-scraper"
|
provider: "web-scraper"
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
links = links.filter((link) => !link.endsWith(".pdf"));
|
links = links.filter((link) => !isUrlAPdf(link));
|
||||||
|
|
||||||
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
||||||
documents = await this.getSitemapData(this.urls[0], documents);
|
documents = await this.getSitemapData(this.urls[0], documents);
|
||||||
@ -157,7 +157,7 @@ export class WebScraperDataProvider {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (this.mode === "single_urls") {
|
if (this.mode === "single_urls") {
|
||||||
let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf"));
|
let pdfLinks = this.urls.filter((link) => isUrlAPdf(link));
|
||||||
let pdfDocuments: Document[] = [];
|
let pdfDocuments: Document[] = [];
|
||||||
for (let pdfLink of pdfLinks) {
|
for (let pdfLink of pdfLinks) {
|
||||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||||
@ -169,7 +169,7 @@ export class WebScraperDataProvider {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let documents = await this.convertUrlsToDocuments(
|
let documents = await this.convertUrlsToDocuments(
|
||||||
this.urls.filter((link) => !link.endsWith(".pdf")),
|
this.urls.filter((link) => !isUrlAPdf(link)),
|
||||||
inProgress
|
inProgress
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -193,7 +193,7 @@ export class WebScraperDataProvider {
|
|||||||
}
|
}
|
||||||
if (this.mode === "sitemap") {
|
if (this.mode === "sitemap") {
|
||||||
let links = await getLinksFromSitemap(this.urls[0]);
|
let links = await getLinksFromSitemap(this.urls[0]);
|
||||||
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
let pdfLinks = links.filter((link) => isUrlAPdf(link));
|
||||||
let pdfDocuments: Document[] = [];
|
let pdfDocuments: Document[] = [];
|
||||||
for (let pdfLink of pdfLinks) {
|
for (let pdfLink of pdfLinks) {
|
||||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||||
@ -203,7 +203,7 @@ export class WebScraperDataProvider {
|
|||||||
provider: "web-scraper"
|
provider: "web-scraper"
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
links = links.filter((link) => !link.endsWith(".pdf"));
|
links = links.filter((link) => !isUrlAPdf(link));
|
||||||
|
|
||||||
let documents = await this.convertUrlsToDocuments(
|
let documents = await this.convertUrlsToDocuments(
|
||||||
links.slice(0, this.limit),
|
links.slice(0, this.limit),
|
||||||
|
@ -35,6 +35,7 @@ async function downloadPdf(url: string): Promise<string> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export async function processPdfToText(filePath: string): Promise<string> {
|
export async function processPdfToText(filePath: string): Promise<string> {
|
||||||
|
|
||||||
let content = "";
|
let content = "";
|
||||||
|
|
||||||
if (process.env.LLAMAPARSE_API_KEY) {
|
if (process.env.LLAMAPARSE_API_KEY) {
|
||||||
@ -105,4 +106,24 @@ async function processPdf(file: string){
|
|||||||
const fileContent = fs.readFileSync(file);
|
const fileContent = fs.readFileSync(file);
|
||||||
const data = await pdf(fileContent);
|
const data = await pdf(fileContent);
|
||||||
return data.text;
|
return data.text;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// fetchAndProcessPdf("https://www.fda.gov/media/167973/download?attachment").then((e)=>{
|
||||||
|
// console.log(e);
|
||||||
|
// })
|
||||||
|
|
||||||
|
/**
 * Decides whether a URL refers to a PDF document.
 *
 * The check is done in two stages:
 *  1. A cheap textual test: the raw string, or the path component of the
 *     parsed URL, ends with `.pdf` (case-insensitive). Looking at the path
 *     means a query string or fragment after the file name does not hide the
 *     extension.
 *  2. Otherwise a `HEAD` request is issued and the `Content-Type` response
 *     header is inspected for `application/pdf` (case-insensitive).
 *
 * NOTE: this function is asynchronous. Callers must `await` the result (for
 * example via `Promise.all`) before using it as a condition. Passing it
 * directly as an `Array.prototype.filter` predicate does not work, because
 * the returned Promise object is always truthy.
 *
 * @param url - The URL (or path-like string) to test.
 * @returns `true` when the URL looks like or is served as a PDF; `false`
 *   otherwise, including when the HEAD request fails.
 */
export async function isUrlAPdf(url: string): Promise<boolean> {
  // Stage 1a: raw-string test, kept so every input accepted before is still accepted.
  if (url.toLowerCase().endsWith('.pdf')) {
    return true;
  }

  // Stage 1b: test only the path, so "file.pdf?download=1#page=2" is recognised.
  try {
    if (new URL(url).pathname.toLowerCase().endsWith('.pdf')) {
      return true;
    }
  } catch {
    // Not an absolute URL; fall through and let the HEAD request decide.
  }

  // Stage 2: ask the server what it would send.
  try {
    const response = await fetch(url, { method: 'HEAD' });
    const contentType = response.headers.get('Content-Type');
    // Media types are case-insensitive, so normalise before matching.
    return contentType !== null && contentType.toLowerCase().includes('application/pdf');
  } catch (error) {
    console.error('Error making HEAD request:', error);
    return false;
  }
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user