
Merge pull request #271 from mendableai/feat/issue-205

[Feat] Added parsePDF option to pageOptions
Rafael Miller 2024-06-14 11:29:26 -03:00 committed by GitHub
commit 2c0a2c742a
9 changed files with 50 additions and 20 deletions


@@ -135,6 +135,21 @@ describe("E2E Tests for API Routes", () => {
      expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
    }, 60000); // 60 seconds

+    it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
+    }, 60000); // 60 seconds
+
    it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
      const responseWithoutRemoveTags = await request(TEST_URL)
        .post("/v0/scrape")


@@ -55,13 +55,15 @@ export async function crawlController(req: Request, res: Response) {
  }

  const mode = req.body.mode ?? "crawl";
  const crawlerOptions = req.body.crawlerOptions ?? {
    allowBackwardCrawling: false
  };
  const pageOptions = req.body.pageOptions ?? {
    onlyMainContent: false,
    includeHtml: false,
-    removeTags: []
+    removeTags: [],
+    parsePDF: true
  };

  if (mode === "single_urls" && !url.includes(",")) {


@@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
    return res.status(status).json({ error });
  }
  const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
+  const pageOptions = req.body.pageOptions ?? {
+    onlyMainContent: false,
+    includeHtml: false,
+    waitFor: 0,
+    screenshot: false,
+    parsePDF: true
+  };
  const extractorOptions = req.body.extractorOptions ?? {
    mode: "markdown"
  }
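One subtlety worth noting: `??` only applies these defaults when `pageOptions` is absent from the request body altogether. A partial object is used as-is, so `parsePDF` is then `undefined` rather than `true`. A quick illustration:

```ts
// ?? falls back only when the left-hand side is null or undefined, so a
// caller who sends a partial pageOptions object skips the defaults entirely.
const defaults = {
  onlyMainContent: false,
  includeHtml: false,
  waitFor: 0,
  screenshot: false,
  parsePDF: true,
};

const body: { pageOptions?: Record<string, unknown> } = {
  pageOptions: { onlyMainContent: true }, // parsePDF omitted by the caller
};

const pageOptions = body.pageOptions ?? defaults;
console.log(pageOptions.parsePDF); // undefined, not true
```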


@@ -19,6 +19,7 @@ export type PageOptions = {
  screenshot?: boolean;
  headers?: Record<string, string>;
  replaceAllPathsWithAbsolutePaths?: boolean;
+  parsePDF?: boolean;
  removeTags?: string | string[];
};
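With the field on the type, callers of the scraper can opt out of PDF parsing in a type-checked way; a minimal sketch (the import path is an assumption):

```ts
import { PageOptions } from "./lib/entities"; // path is an assumption

// Request the raw PDF body instead of parsed text.
const pageOptions: PageOptions = {
  onlyMainContent: false,
  parsePDF: false,
};
```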


@@ -1,5 +1,3 @@
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-
export async function handleCustomScraping(
  text: string,
  url: string


@@ -280,7 +280,7 @@ export class WebScraperDataProvider {
  private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
    return Promise.all(
      pdfLinks.map(async (pdfLink) => {
-        const pdfContent = await fetchAndProcessPdf(pdfLink);
+        const pdfContent = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
        return {
          content: pdfContent,
          metadata: { sourceURL: pdfLink },

@@ -479,6 +479,7 @@ export class WebScraperDataProvider {
      onlyMainContent: false,
      includeHtml: false,
      replaceAllPathsWithAbsolutePaths: false,
+      parsePDF: true,
      removeTags: []
    };
    this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}


@@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
  url: string,
  waitFor: number = 0,
  screenshot: boolean = false,
-  pageOptions: { scrollXPaths?: string[] } = {},
+  pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
  headers?: Record<string, string>,
  options?: any
): Promise<FireEngineResponse> {

@@ -88,7 +88,7 @@ export async function scrapWithFireEngine(
    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return { html: await fetchAndProcessPdf(url), screenshot: "" };
+      return { html: await fetchAndProcessPdf(url, pageOptions?.parsePDF), screenshot: "" };
    } else {
      const data = response.data;
      const html = data.content;

@@ -108,7 +108,8 @@ export async function scrapWithFireEngine(
export async function scrapWithScrapingBee(
  url: string,
  wait_browser: string = "domcontentloaded",
-  timeout: number = universalTimeout
+  timeout: number = universalTimeout,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<string> {
  try {
    const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);

@@ -129,7 +130,7 @@ export async function scrapWithScrapingBee(
    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
    } else {
      const decoder = new TextDecoder();
      const text = decoder.decode(response.data);

@@ -144,7 +145,8 @@ export async function scrapWithScrapingBee(
export async function scrapWithPlaywright(
  url: string,
  waitFor: number = 0,
-  headers?: Record<string, string>
+  headers?: Record<string, string>,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<string> {
  try {
    const reqParams = await generateRequestParams(url);

@@ -172,7 +174,7 @@ export async function scrapWithPlaywright(
    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
    } else {
      const textData = response.data;
      try {

@@ -194,7 +196,10 @@ export async function scrapWithPlaywright(
  }
}

-export async function scrapWithFetch(url: string): Promise<string> {
+export async function scrapWithFetch(
+  url: string,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<string> {
  try {
    const response = await axios.get(url, {
      headers: {

@@ -213,7 +218,7 @@ export async function scrapWithFetch(
    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
    } else {
      const text = response.data;
      return text;

@@ -384,7 +389,7 @@ export async function scrapSingleUrl(
      }
      break;
    case "pdf":
-      customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
+      customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF), screenshot }
      break;
  }
}
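All four scrapers now share the same shape: run the fetch, check the response's Content-Type, and route PDFs through fetchAndProcessPdf with the caller's parsePDF preference. A condensed, self-contained sketch of that pattern (the import path is an assumption):

```ts
import axios from "axios";
import { fetchAndProcessPdf } from "./utils/pdfProcessor"; // path assumed

// Condensed sketch of the pattern repeated in scrapWithFireEngine,
// scrapWithScrapingBee, scrapWithPlaywright, and scrapWithFetch.
async function scrapeWithPdfHandling(
  url: string,
  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<string> {
  const response = await axios.get(url, { responseType: "arraybuffer" });
  const contentType = response.headers["content-type"];
  if (contentType && contentType.includes("application/pdf")) {
    // Route PDFs through the shared processor, honoring the caller's choice
    // (defaulting to true here since fetchAndProcessPdf expects a boolean).
    return fetchAndProcessPdf(url, pageOptions?.parsePDF ?? true);
  }
  // Non-PDF responses: decode and return the body as text.
  return new TextDecoder().decode(response.data);
}
```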


@@ -3,7 +3,7 @@ import * as pdfProcessor from '../pdfProcessor';

describe('PDF Processing Module - Integration Test', () => {
  it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
    delete process.env.LLAMAPARSE_API_KEY;
-    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
+    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
    expect(pdfContent.trim()).toEqual("Dummy PDF file");
  });
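The updated integration test only covers parsePDF: true; a companion test for the false path might look like the following (a sketch, not part of this PR; it relies on the fact that every valid PDF file begins with the "%PDF" magic header):

```ts
// Sketch of a possible companion test (not in this PR): with parsePDF false,
// fetchAndProcessPdf should return the raw file contents, which for any
// valid PDF begin with the "%PDF" magic header.
it('should return raw PDF content when parsePDF is false', async () => {
  const pdfContent = await pdfProcessor.fetchAndProcessPdf(
    'https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf',
    false
  );
  expect(pdfContent).toContain('%PDF');
});
```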


@@ -9,9 +9,9 @@ import os from "os";

dotenv.config();

-export async function fetchAndProcessPdf(url: string): Promise<string> {
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<string> {
  const tempFilePath = await downloadPdf(url);
-  const content = await processPdfToText(tempFilePath);
+  const content = await processPdfToText(tempFilePath, parsePDF);
  fs.unlinkSync(tempFilePath); // Clean up the temporary file
  return content;
}

@@ -34,10 +34,10 @@ async function downloadPdf(url: string): Promise<string> {
  });
}

-export async function processPdfToText(filePath: string): Promise<string> {
+export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
  let content = "";

-  if (process.env.LLAMAPARSE_API_KEY) {
+  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
    const apiKey = process.env.LLAMAPARSE_API_KEY;
    const headers = {
      Authorization: `Bearer ${apiKey}`,

@@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
      console.error("Error processing pdf document w/ LlamaIndex(2)");
      content = await processPdf(filePath);
    }
-  } else {
+  } else if (parsePDF) {
    content = await processPdf(filePath);
+  } else {
+    content = fs.readFileSync(filePath, "utf-8");
  }
  return content;
}
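Put together, processPdfToText now has three outcomes. A stripped-down sketch of the decision logic (llamaParse and processPdf are hypothetical stand-ins for the module's real LlamaParse call and local parser):

```ts
import fs from "fs";

// Hypothetical stand-ins for the module's LlamaParse integration and its
// local pdf parser; only the branch structure below is taken from the PR.
declare function llamaParse(filePath: string): Promise<string>;
declare function processPdf(filePath: string): Promise<string>;

async function decide(filePath: string, parsePDF: boolean): Promise<string> {
  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
    // Key configured and parsing requested: use LlamaParse (the real code
    // falls back to the local parser if the service errors).
    return llamaParse(filePath);
  } else if (parsePDF) {
    // Parsing requested but no key: use the local parser directly.
    return processPdf(filePath);
  } else {
    // parsePDF === false: return the file contents untouched.
    return fs.readFileSync(filePath, "utf-8");
  }
}
```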