diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index d6b2b53..780ad39 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -135,6 +135,21 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
     }, 60000); // 60 seconds
 
+    it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
+    }, 60000); // 60 seconds
+
     it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
       const responseWithoutRemoveTags = await request(TEST_URL)
         .post("/v0/scrape")
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 7eab78f..8fd876d 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -55,13 +55,15 @@ export async function crawlController(req: Request, res: Response) {
     }
 
     const mode = req.body.mode ?? "crawl";
 
+    const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
     const pageOptions = req.body.pageOptions ?? {
       onlyMainContent: false,
       includeHtml: false,
-      removeTags: []
+      removeTags: [],
+      parsePDF: true
     };
 
     if (mode === "single_urls" && !url.includes(",")) {
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index d5ab1de..ed28639 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
       return res.status(status).json({ error });
     }
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
+    const pageOptions = req.body.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      waitFor: 0,
+      screenshot: false,
+      parsePDF: true
+    };
     const extractorOptions = req.body.extractorOptions ?? {
       mode: "markdown"
     }
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 92170c1..0dae9ba 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -19,6 +19,7 @@ export type PageOptions = {
   screenshot?: boolean;
   headers?: Record<string, string>;
   replaceAllPathsWithAbsolutePaths?: boolean;
+  parsePDF?: boolean;
   removeTags?: string | string[];
 };
 
diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
index 8108a9e..081150b 100644
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@@ -1,5 +1,3 @@
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-
 export async function handleCustomScraping(
   text: string,
   url: string
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 1a6ffd0..36af58a 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -280,7 +280,7 @@ export class WebScraperDataProvider {
   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
     return Promise.all(
       pdfLinks.map(async (pdfLink) => {
-        const pdfContent = await fetchAndProcessPdf(pdfLink);
+        const pdfContent = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
         return {
           content: pdfContent,
           metadata: { sourceURL: pdfLink },
@@ -479,6 +479,7 @@ export class WebScraperDataProvider {
       onlyMainContent: false,
       includeHtml: false,
       replaceAllPathsWithAbsolutePaths: false,
+      parsePDF: true,
       removeTags: []
     };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index a16f6f0..4723a56 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
   url: string,
   waitFor: number = 0,
   screenshot: boolean = false,
-  pageOptions: { scrollXPaths?: string[] } = {},
+  pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
   headers?: Record<string, string>,
   options?: any
 ): Promise<FireEngineResponse> {
@@ -88,7 +88,7 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return { html: await fetchAndProcessPdf(url), screenshot: "" };
+      return { html: await fetchAndProcessPdf(url, pageOptions?.parsePDF), screenshot: "" };
     } else {
       const data = response.data;
       const html = data.content;
@@ -108,7 +108,8 @@
 export async function scrapWithScrapingBee(
   url: string,
   wait_browser: string = "domcontentloaded",
-  timeout: number = universalTimeout
+  timeout: number = universalTimeout,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
@@ -129,7 +130,7 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const decoder = new TextDecoder();
       const text = decoder.decode(response.data);
@@ -144,7 +145,8 @@
 export async function scrapWithPlaywright(
   url: string,
   waitFor: number = 0,
-  headers?: Record<string, string>
+  headers?: Record<string, string>,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const reqParams = await generateRequestParams(url);
@@ -172,7 +174,7 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const textData = response.data;
       try {
@@ -194,7 +196,10 @@
   }
 }
 
-export async function scrapWithFetch(url: string): Promise<string> {
+export async function scrapWithFetch(
+  url: string,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<string> {
   try {
     const response = await axios.get(url, {
       headers: {
@@ -213,7 +218,7 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const text = response.data;
       return text;
@@ -384,7 +389,7 @@
           }
           break;
         case "pdf":
-          customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
+          customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF), screenshot }
           break;
       }
     }
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
index f14c8d4..f4ed3c6 100644
--- a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
@@ -3,7 +3,7 @@ import * as pdfProcessor from '../pdfProcessor';
 
 describe('PDF Processing Module - Integration Test', () => {
   it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
     delete process.env.LLAMAPARSE_API_KEY;
-    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
+    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
     expect(pdfContent.trim()).toEqual("Dummy PDF file");
   });
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index 71984f2..1f0d6e8 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -9,9 +9,9 @@ import os from "os";
 
 dotenv.config();
 
-export async function fetchAndProcessPdf(url: string): Promise<string> {
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<string> {
   const tempFilePath = await downloadPdf(url);
-  const content = await processPdfToText(tempFilePath);
+  const content = await processPdfToText(tempFilePath, parsePDF);
   fs.unlinkSync(tempFilePath); // Clean up the temporary file
   return content;
 }
@@ -34,10 +34,10 @@
   });
 }
 
-export async function processPdfToText(filePath: string): Promise<string> {
+export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
   let content = "";
 
-  if (process.env.LLAMAPARSE_API_KEY) {
+  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
     const apiKey = process.env.LLAMAPARSE_API_KEY;
     const headers = {
       Authorization: `Bearer ${apiKey}`,
@@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
       console.error("Error processing pdf document w/ LlamaIndex(2)");
       content = await processPdf(filePath);
     }
-  } else {
+  } else if (parsePDF) {
     content = await processPdf(filePath);
+  } else {
+    content = fs.readFileSync(filePath, "utf-8");
   }
   return content;
 }
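
Usage note (illustrative, not part of the diff): `pageOptions.parsePDF` defaults to `true` throughout these changes, so existing behavior is unchanged. Setting it to `false` skips both the LlamaParse path and the local `processPdf` fallback and returns the raw PDF file read as UTF-8 instead of extracted text. A minimal client-side sketch in TypeScript, reusing the route, auth header, and test PDF from the e2e test above; the base URL is an assumption, substitute your own deployment:

// Hypothetical client call exercising the new pageOptions.parsePDF flag.
// BASE_URL is an assumption; route, header, and URL mirror the e2e test in this diff.
const BASE_URL = process.env.BASE_URL ?? "http://localhost:3002";

async function scrapeRawPdf(): Promise<string> {
  const res = await fetch(`${BASE_URL}/v0/scrape`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
    },
    body: JSON.stringify({
      url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
      // false -> raw PDF source, e.g. containing "/Title(arXiv:astro-ph/9301001v1 ..."
      pageOptions: { parsePDF: false },
    }),
  });
  const { data } = await res.json();
  return data.content;
}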