0

Added a parsePDF option to pageOptions.

Users can decide whether to let us handle the PDF parsing or to parse the PDF themselves.
This commit is contained in:
rafaelsideguide 2024-06-12 15:06:47 -03:00
parent 48f6c19a05
commit e37d151404
9 changed files with 57 additions and 21 deletions

View File

@ -136,6 +136,21 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 60000); // 60 seconds }, 60000); // 60 seconds
it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
const response = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
await new Promise((r) => setTimeout(r, 6000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
}, 60000); // 60 seconds
// TODO: add this test back once we nail the waitFor option to be more deterministic // TODO: add this test back once we nail the waitFor option to be more deterministic
// it.concurrent("should return a successful response with a valid API key and waitFor option", async () => { // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
// const startTime = Date.now(); // const startTime = Date.now();

View File

@ -56,7 +56,11 @@ export async function crawlController(req: Request, res: Response) {
const mode = req.body.mode ?? "crawl"; const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false }; const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; const pageOptions = req.body.pageOptions ?? {
onlyMainContent: false,
includeHtml: false,
parsePDF: true
};
if (mode === "single_urls" && !url.includes(",")) { if (mode === "single_urls" && !url.includes(",")) {
try { try {

View File

@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(status).json({ error }); return res.status(status).json({ error });
} }
const crawlerOptions = req.body.crawlerOptions ?? {}; const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false }; const pageOptions = req.body.pageOptions ?? {
onlyMainContent: false,
includeHtml: false,
waitFor: 0,
screenshot: false,
parsePDF: true
};
const extractorOptions = req.body.extractorOptions ?? { const extractorOptions = req.body.extractorOptions ?? {
mode: "markdown" mode: "markdown"
} }

View File

@ -19,6 +19,7 @@ export type PageOptions = {
screenshot?: boolean; screenshot?: boolean;
headers?: Record<string, string>; headers?: Record<string, string>;
replaceAllPathsWithAbsolutePaths?: boolean; replaceAllPathsWithAbsolutePaths?: boolean;
parsePDF?: boolean
}; };
export type ExtractorOptions = { export type ExtractorOptions = {

View File

@ -1,5 +1,3 @@
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
export async function handleCustomScraping( export async function handleCustomScraping(
text: string, text: string,
url: string url: string

View File

@ -280,7 +280,7 @@ export class WebScraperDataProvider {
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> { private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
return Promise.all( return Promise.all(
pdfLinks.map(async (pdfLink) => { pdfLinks.map(async (pdfLink) => {
const pdfContent = await fetchAndProcessPdf(pdfLink); const pdfContent = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
return { return {
content: pdfContent, content: pdfContent,
metadata: { sourceURL: pdfLink }, metadata: { sourceURL: pdfLink },
@ -475,7 +475,12 @@ export class WebScraperDataProvider {
this.limit = options.crawlerOptions?.limit ?? 10000; this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText = this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false; options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false }; this.pageOptions = options.pageOptions ?? {
onlyMainContent: false,
includeHtml: false,
replaceAllPathsWithAbsolutePaths: false,
parsePDF: true
};
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false; this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check

View File

@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
url: string, url: string,
waitFor: number = 0, waitFor: number = 0,
screenshot: boolean = false, screenshot: boolean = false,
pageOptions: { scrollXPaths?: string[] } = {}, pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
headers?: Record<string, string>, headers?: Record<string, string>,
options?: any options?: any
): Promise<FireEngineResponse> { ): Promise<FireEngineResponse> {
@ -88,7 +88,7 @@ export async function scrapWithFireEngine(
const contentType = response.headers["content-type"]; const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) { if (contentType && contentType.includes("application/pdf")) {
return { html: await fetchAndProcessPdf(url), screenshot: "" }; return { html: await fetchAndProcessPdf(url, pageOptions?.parsePDF), screenshot: "" };
} else { } else {
const data = response.data; const data = response.data;
const html = data.content; const html = data.content;
@ -108,7 +108,8 @@ export async function scrapWithFireEngine(
export async function scrapWithScrapingBee( export async function scrapWithScrapingBee(
url: string, url: string,
wait_browser: string = "domcontentloaded", wait_browser: string = "domcontentloaded",
timeout: number = universalTimeout timeout: number = universalTimeout,
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<string> { ): Promise<string> {
try { try {
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
@ -129,7 +130,7 @@ export async function scrapWithScrapingBee(
const contentType = response.headers["content-type"]; const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) { if (contentType && contentType.includes("application/pdf")) {
return fetchAndProcessPdf(url); return fetchAndProcessPdf(url, pageOptions?.parsePDF);
} else { } else {
const decoder = new TextDecoder(); const decoder = new TextDecoder();
const text = decoder.decode(response.data); const text = decoder.decode(response.data);
@ -144,7 +145,8 @@ export async function scrapWithScrapingBee(
export async function scrapWithPlaywright( export async function scrapWithPlaywright(
url: string, url: string,
waitFor: number = 0, waitFor: number = 0,
headers?: Record<string, string> headers?: Record<string, string>,
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<string> { ): Promise<string> {
try { try {
const reqParams = await generateRequestParams(url); const reqParams = await generateRequestParams(url);
@ -172,7 +174,7 @@ export async function scrapWithPlaywright(
const contentType = response.headers["content-type"]; const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) { if (contentType && contentType.includes("application/pdf")) {
return fetchAndProcessPdf(url); return fetchAndProcessPdf(url, pageOptions?.parsePDF);
} else { } else {
const textData = response.data; const textData = response.data;
try { try {
@ -194,7 +196,10 @@ export async function scrapWithPlaywright(
} }
} }
export async function scrapWithFetch(url: string): Promise<string> { export async function scrapWithFetch(
url: string,
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<string> {
try { try {
const response = await axios.get(url, { const response = await axios.get(url, {
headers: { headers: {
@ -213,7 +218,7 @@ export async function scrapWithFetch(url: string): Promise<string> {
const contentType = response.headers["content-type"]; const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) { if (contentType && contentType.includes("application/pdf")) {
return fetchAndProcessPdf(url); return fetchAndProcessPdf(url, pageOptions?.parsePDF);
} else { } else {
const text = response.data; const text = response.data;
return text; return text;
@ -371,7 +376,7 @@ export async function scrapSingleUrl(
} }
break; break;
case "pdf": case "pdf":
customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot } customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF), screenshot }
break; break;
} }
} }

View File

@ -3,7 +3,7 @@ import * as pdfProcessor from '../pdfProcessor';
describe('PDF Processing Module - Integration Test', () => { describe('PDF Processing Module - Integration Test', () => {
it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => { it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
delete process.env.LLAMAPARSE_API_KEY; delete process.env.LLAMAPARSE_API_KEY;
const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf'); const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
expect(pdfContent.trim()).toEqual("Dummy PDF file"); expect(pdfContent.trim()).toEqual("Dummy PDF file");
}); });

View File

@ -9,9 +9,9 @@ import os from "os";
dotenv.config(); dotenv.config();
export async function fetchAndProcessPdf(url: string): Promise<string> { export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<string> {
const tempFilePath = await downloadPdf(url); const tempFilePath = await downloadPdf(url);
const content = await processPdfToText(tempFilePath); const content = await processPdfToText(tempFilePath, parsePDF);
fs.unlinkSync(tempFilePath); // Clean up the temporary file fs.unlinkSync(tempFilePath); // Clean up the temporary file
return content; return content;
} }
@ -34,10 +34,10 @@ async function downloadPdf(url: string): Promise<string> {
}); });
} }
export async function processPdfToText(filePath: string): Promise<string> { export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
let content = ""; let content = "";
if (process.env.LLAMAPARSE_API_KEY) { if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
const apiKey = process.env.LLAMAPARSE_API_KEY; const apiKey = process.env.LLAMAPARSE_API_KEY;
const headers = { const headers = {
Authorization: `Bearer ${apiKey}`, Authorization: `Bearer ${apiKey}`,
@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
console.error("Error processing pdf document w/ LlamaIndex(2)"); console.error("Error processing pdf document w/ LlamaIndex(2)");
content = await processPdf(filePath); content = await processPdf(filePath);
} }
} else { } else if (parsePDF) {
content = await processPdf(filePath); content = await processPdf(filePath);
} else {
content = fs.readFileSync(filePath, "utf-8");
} }
return content; return content;
} }