
added parsePDF option to pageOptions

The user can decide whether to let us handle the PDF parsing or to parse the PDF themselves.
rafaelsideguide 2024-06-12 15:06:47 -03:00
parent 48f6c19a05
commit e37d151404
9 changed files with 57 additions and 21 deletions
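
For context, a minimal sketch of how a client might exercise the new option against the /v0/scrape route used in the e2e test below. The base URL and API key are placeholders; parsePDF defaults to true (managed parsing), and setting it to false returns the raw PDF contents instead:

// TypeScript sketch; assumes an ES module on Node 18+ (global fetch, top-level await).
const response = await fetch("https://api.example.com/v0/scrape", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
    pageOptions: { parsePDF: false },
  }),
});
const { data } = await response.json();
console.log(data.content); // raw PDF source, e.g. "/Title(arXiv:...)>>endobj"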

View File

@@ -136,6 +136,21 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
     }, 60000); // 60 seconds

+    it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
+    }, 60000); // 60 seconds
+
     // TODO: add this test back once we nail the waitFor option to be more deterministic
     // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
     //   const startTime = Date.now();

View File

@@ -56,7 +56,11 @@ export async function crawlController(req: Request, res: Response) {
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+    const pageOptions = req.body.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      parsePDF: true
+    };

     if (mode === "single_urls" && !url.includes(",")) {
       try {

View File

@@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
       return res.status(status).json({ error });
     }
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
+    const pageOptions = req.body.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      waitFor: 0,
+      screenshot: false,
+      parsePDF: true
+    };
     const extractorOptions = req.body.extractorOptions ?? {
       mode: "markdown"
     }

View File

@@ -19,6 +19,7 @@ export type PageOptions = {
   screenshot?: boolean;
   headers?: Record<string, string>;
   replaceAllPathsWithAbsolutePaths?: boolean;
+  parsePDF?: boolean
 };

 export type ExtractorOptions = {
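
A minimal sketch of consuming the extended type; the helper name, merge strategy, and import path are illustrative, not part of this commit:

import { PageOptions } from "./entities"; // assumed path

// Hypothetical helper: fill in the new flag so downstream code never
// sees parsePDF as undefined when a caller passes a partial object.
function withPdfDefault(opts: Partial<PageOptions> = {}): Partial<PageOptions> {
  return { parsePDF: true, ...opts };
}

const opts = withPdfDefault({ screenshot: true });
// opts.parsePDF === true unless the caller explicitly sets it to false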

View File

@@ -1,5 +1,3 @@
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-
 export async function handleCustomScraping(
   text: string,
   url: string

View File

@@ -280,7 +280,7 @@ export class WebScraperDataProvider {
   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
     return Promise.all(
       pdfLinks.map(async (pdfLink) => {
-        const pdfContent = await fetchAndProcessPdf(pdfLink);
+        const pdfContent = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
         return {
           content: pdfContent,
           metadata: { sourceURL: pdfLink },
@@ -475,7 +475,12 @@
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
+    this.pageOptions = options.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      replaceAllPathsWithAbsolutePaths: false,
+      parsePDF: true
+    };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
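
One observation on the defaults above, illustrated with a self-contained sketch (the merge helper is hypothetical, not part of this commit): ?? applies the fallback object only when pageOptions is missing entirely, so a caller passing a partial object such as { onlyMainContent: true } leaves parsePDF undefined, which the parsePDF checks in pdfProcessor.ts treat as false. A spread-style merge would preserve the default:

type PageOptions = {
  onlyMainContent?: boolean;
  includeHtml?: boolean;
  replaceAllPathsWithAbsolutePaths?: boolean;
  parsePDF?: boolean;
};

// Explicit caller values win; everything else, including parsePDF: true,
// is filled in even when the caller supplies only some of the fields.
function mergePageOptions(incoming?: PageOptions): PageOptions {
  return {
    onlyMainContent: false,
    includeHtml: false,
    replaceAllPathsWithAbsolutePaths: false,
    parsePDF: true,
    ...incoming,
  };
}

console.log(mergePageOptions({ onlyMainContent: true }).parsePDF); // true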

View File

@@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
   url: string,
   waitFor: number = 0,
   screenshot: boolean = false,
-  pageOptions: { scrollXPaths?: string[] } = {},
+  pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
   headers?: Record<string, string>,
   options?: any
 ): Promise<FireEngineResponse> {
@@ -88,7 +88,7 @@
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return { html: await fetchAndProcessPdf(url), screenshot: "" };
+      return { html: await fetchAndProcessPdf(url, pageOptions?.parsePDF), screenshot: "" };
     } else {
       const data = response.data;
       const html = data.content;
@@ -108,7 +108,8 @@
 export async function scrapWithScrapingBee(
   url: string,
   wait_browser: string = "domcontentloaded",
-  timeout: number = universalTimeout
+  timeout: number = universalTimeout,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
@@ -129,7 +130,7 @@
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const decoder = new TextDecoder();
       const text = decoder.decode(response.data);
@@ -144,7 +145,8 @@
 export async function scrapWithPlaywright(
   url: string,
   waitFor: number = 0,
-  headers?: Record<string, string>
+  headers?: Record<string, string>,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const reqParams = await generateRequestParams(url);
@@ -172,7 +174,7 @@
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const textData = response.data;
       try {
@@ -194,7 +196,10 @@
   }
 }

-export async function scrapWithFetch(url: string): Promise<string> {
+export async function scrapWithFetch(
+  url: string,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<string> {
   try {
     const response = await axios.get(url, {
       headers: {
@@ -213,7 +218,7 @@
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const text = response.data;
       return text;
@@ -371,7 +376,7 @@ export async function scrapSingleUrl(
         }
         break;
       case "pdf":
-        customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
+        customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF), screenshot }
         break;
     }
   }
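
A short usage sketch for the updated signatures, assuming it runs in the same module as the scrapers above (scrapWithFetch shown because it has the fewest parameters). Note that the default parameter { parsePDF: true } applies only when pageOptions is omitted entirely:

// Explicitly request the raw PDF contents.
const rawPdf = await scrapWithFetch("https://arxiv.org/pdf/astro-ph/9301001.pdf", { parsePDF: false });

// Omitting pageOptions falls back to { parsePDF: true }, i.e. managed parsing.
const parsedPdf = await scrapWithFetch("https://arxiv.org/pdf/astro-ph/9301001.pdf");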

View File

@@ -3,7 +3,7 @@ import * as pdfProcessor from '../pdfProcessor';

 describe('PDF Processing Module - Integration Test', () => {
   it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
     delete process.env.LLAMAPARSE_API_KEY;
-    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
+    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
     expect(pdfContent.trim()).toEqual("Dummy PDF file");
   });

View File

@@ -9,9 +9,9 @@ import os from "os";

 dotenv.config();

-export async function fetchAndProcessPdf(url: string): Promise<string> {
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<string> {
   const tempFilePath = await downloadPdf(url);
-  const content = await processPdfToText(tempFilePath);
+  const content = await processPdfToText(tempFilePath, parsePDF);
   fs.unlinkSync(tempFilePath); // Clean up the temporary file
   return content;
 }
@@ -34,10 +34,10 @@ async function downloadPdf(url: string): Promise<string> {
   });
 }

-export async function processPdfToText(filePath: string): Promise<string> {
+export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
   let content = "";

-  if (process.env.LLAMAPARSE_API_KEY) {
+  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
     const apiKey = process.env.LLAMAPARSE_API_KEY;
     const headers = {
       Authorization: `Bearer ${apiKey}`,
@@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
       console.error("Error processing pdf document w/ LlamaIndex(2)");
       content = await processPdf(filePath);
     }
-  } else {
+  } else if (parsePDF) {
     content = await processPdf(filePath);
+  } else {
+    content = fs.readFileSync(filePath, "utf-8");
   }
   return content;
 }
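
To summarize the behavior introduced above (pdfUrl is a placeholder): with parsePDF = true, processPdfToText uses LlamaParse when LLAMAPARSE_API_KEY is set and falls back to the local processPdf otherwise; with parsePDF = false, the downloaded file is returned verbatim as UTF-8 text with no parsing at all:

// Extracted text (LlamaParse or the local processPdf fallback).
const parsed = await fetchAndProcessPdf(pdfUrl, true);

// Raw PDF source read straight from disk, as exercised by the
// parsePDF: false e2e test at the top of this commit.
const raw = await fetchAndProcessPdf(pdfUrl, false);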