From e37d15140428c5c2eec3a6126b2a25c86f08e23c Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 12 Jun 2024 15:06:47 -0300
Subject: [PATCH] added parsePDF option to pageOptions

Users can decide whether to let us handle PDF parsing or to parse the
PDF themselves.
---
 .../src/__tests__/e2e_withAuth/index.test.ts  | 15 ++++++++++++
 apps/api/src/controllers/crawl.ts             |  6 ++++-
 apps/api/src/controllers/scrape.ts            |  8 ++++++-
 apps/api/src/lib/entities.ts                  |  1 +
 .../WebScraper/custom/handleCustomScraping.ts |  2 --
 apps/api/src/scraper/WebScraper/index.ts      |  9 ++++++--
 apps/api/src/scraper/WebScraper/single_url.ts | 23 +++++++++++--------
 .../utils/__tests__/pdfProcessor.test.ts      |  2 +-
 .../scraper/WebScraper/utils/pdfProcessor.ts  | 12 ++++++----
 9 files changed, 57 insertions(+), 21 deletions(-)
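Note: the new e2e test below drives this option end to end. For reference,
a request that opts out of server-side parsing looks roughly like the
sketch below (TypeScript; `API_BASE_URL` and `API_KEY` are assumed
placeholders for your deployment, and the payload mirrors the e2e test):

    // With parsePDF: false, data.content carries the raw, unparsed PDF text.
    const res = await fetch(`${API_BASE_URL}/v0/scrape`, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${API_KEY}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
        pageOptions: { parsePDF: false },
      }),
    });
    const { data } = await res.json();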
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 02e4a47..4a1609b 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -136,6 +136,21 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
     }, 60000); // 60 seconds
 
+    it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
+    }, 60000); // 60 seconds
+
   // TODO: add this test back once we nail the waitFor option to be more deterministic
   // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
   //   const startTime = Date.now();
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 58d01e2..fc3fe28 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -56,7 +56,11 @@ export async function crawlController(req: Request, res: Response) {
 
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+    const pageOptions = req.body.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      parsePDF: true
+    };
 
     if (mode === "single_urls" && !url.includes(",")) {
       try {
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index d5ab1de..ed28639 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
       return res.status(status).json({ error });
     }
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
+    const pageOptions = req.body.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      waitFor: 0,
+      screenshot: false,
+      parsePDF: true
+    };
     const extractorOptions = req.body.extractorOptions ?? {
       mode: "markdown"
     }
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 81bf12c..d676584 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -19,6 +19,7 @@ export type PageOptions = {
   screenshot?: boolean;
   headers?: Record<string, string>;
   replaceAllPathsWithAbsolutePaths?: boolean;
+  parsePDF?: boolean
 };
 
 export type ExtractorOptions = {
diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
index 8108a9e..081150b 100644
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@@ -1,5 +1,3 @@
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-
 export async function handleCustomScraping(
   text: string,
   url: string
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index f432f43..f0f423a 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -280,7 +280,7 @@
   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
     return Promise.all(
       pdfLinks.map(async (pdfLink) => {
-        const pdfContent = await fetchAndProcessPdf(pdfLink);
+        const pdfContent = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
         return {
           content: pdfContent,
           metadata: { sourceURL: pdfLink },
@@ -475,7 +475,12 @@
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
+    this.pageOptions = options.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      replaceAllPathsWithAbsolutePaths: false,
+      parsePDF: true
+    };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index c2dcea1..8fa268f 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
   url: string,
   waitFor: number = 0,
   screenshot: boolean = false,
-  pageOptions: { scrollXPaths?: string[] } = {},
+  pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
   headers?: Record<string, string>,
   options?: any
 ): Promise<FireEngineResponse> {
@@ -88,7 +88,7 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return { html: await fetchAndProcessPdf(url), screenshot: "" };
+      return { html: await fetchAndProcessPdf(url, pageOptions?.parsePDF), screenshot: "" };
     } else {
       const data = response.data;
       const html = data.content;
@@ -108,7 +108,8 @@
 export async function scrapWithScrapingBee(
   url: string,
   wait_browser: string = "domcontentloaded",
-  timeout: number = universalTimeout
+  timeout: number = universalTimeout,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
@@ -129,7 +130,7 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const decoder = new TextDecoder();
       const text = decoder.decode(response.data);
@@ -144,7 +145,8 @@
 export async function scrapWithPlaywright(
   url: string,
   waitFor: number = 0,
-  headers?: Record<string, string>
+  headers?: Record<string, string>,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const reqParams = await generateRequestParams(url);
@@ -172,7 +174,7 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const textData = response.data;
       try {
@@ -194,7 +196,10 @@
   }
 }
 
-export async function scrapWithFetch(url: string): Promise<string> {
+export async function scrapWithFetch(
+  url: string,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<string> {
   try {
     const response = await axios.get(url, {
       headers: {
@@ -213,7 +218,7 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const text = response.data;
       return text;
@@ -371,7 +376,7 @@ export async function scrapSingleUrl(
         }
         break;
       case "pdf":
-        customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
+        customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF), screenshot }
         break;
     }
   }
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
index f14c8d4..f4ed3c6 100644
--- a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
@@ -3,7 +3,7 @@ import * as pdfProcessor from '../pdfProcessor';
 describe('PDF Processing Module - Integration Test', () => {
   it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
     delete process.env.LLAMAPARSE_API_KEY;
-    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
+    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
     expect(pdfContent.trim()).toEqual("Dummy PDF file");
   });
 
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index 71984f2..1f0d6e8 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -9,9 +9,9 @@ import os from "os";
 
 dotenv.config();
 
-export async function fetchAndProcessPdf(url: string): Promise<string> {
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<string> {
   const tempFilePath = await downloadPdf(url);
-  const content = await processPdfToText(tempFilePath);
+  const content = await processPdfToText(tempFilePath, parsePDF);
   fs.unlinkSync(tempFilePath); // Clean up the temporary file
   return content;
 }
@@ -34,10 +34,10 @@ async function downloadPdf(url: string): Promise<string> {
   });
 }
 
-export async function processPdfToText(filePath: string): Promise<string> {
+export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
   let content = "";
 
-  if (process.env.LLAMAPARSE_API_KEY) {
+  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
     const apiKey = process.env.LLAMAPARSE_API_KEY;
     const headers = {
       Authorization: `Bearer ${apiKey}`,
@@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
       console.error("Error processing pdf document w/ LlamaIndex(2)");
       content = await processPdf(filePath);
     }
-  } else {
+  } else if (parsePDF) {
     content = await processPdf(filePath);
+  } else {
+    content = fs.readFileSync(filePath, "utf-8");
   }
   return content;
 }
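Note: with `parsePDF: false`, `processPdfToText` now returns the PDF file read
as UTF-8 text instead of extracted content, which is why the new e2e test
asserts on raw PDF syntax ("/Title(...)>>endobj"). A minimal sketch of the
resulting branching (TypeScript; `llamaParse` and `localParse` are
hypothetical stand-ins for the LlamaParse call and the existing processPdf
fallback, not verbatim source):

    import fs from "fs";

    // Stand-ins for the real parsers in pdfProcessor.ts (assumed, not verbatim).
    const llamaParse = async (_path: string): Promise<string> => "text via LlamaParse";
    const localParse = async (_path: string): Promise<string> => "text via local parser";

    async function pdfContent(filePath: string, parsePDF: boolean): Promise<string> {
      if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
        return llamaParse(filePath);              // managed parsing when a key is set
      } else if (parsePDF) {
        return localParse(filePath);              // local fallback parser
      }
      return fs.readFileSync(filePath, "utf-8");  // opt-out: raw PDF bytes as text
    }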