
Merge pull request #271 from mendableai/feat/issue-205

[Feat] Added parsePDF option to pageOptions
Rafael Miller, 2024-06-14 11:29:26 -03:00 (committed by GitHub)
commit 2c0a2c742a

9 changed files with 50 additions and 20 deletions
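For context, a minimal client-side sketch of the new option (assumptions: a self-hosted Firecrawl API reachable at API_URL with a valid key; the endpoint and payload shape mirror the E2E test below):

import axios from "axios";

const API_URL = "http://localhost:3002"; // assumption: local instance
const API_KEY = process.env.FIRECRAWL_API_KEY; // assumption: valid key

// With parsePDF: false, the API returns the raw PDF source instead of extracted text.
async function scrapeRawPdf(url: string): Promise<string> {
  const response = await axios.post(
    `${API_URL}/v0/scrape`,
    { url, pageOptions: { parsePDF: false } },
    { headers: { Authorization: `Bearer ${API_KEY}`, "Content-Type": "application/json" } }
  );
  return response.data.data.content;
}

scrapeRawPdf("https://arxiv.org/pdf/astro-ph/9301001.pdf").then(console.log);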

@@ -135,6 +135,21 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
     }, 60000); // 60 seconds
 
+    it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
+    }, 60000); // 60 seconds
+
     it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
       const responseWithoutRemoveTags = await request(TEST_URL)
         .post("/v0/scrape")

@@ -55,13 +55,15 @@ export async function crawlController(req: Request, res: Response) {
     }
 
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {
       allowBackwardCrawling: false
     };
     const pageOptions = req.body.pageOptions ?? {
       onlyMainContent: false,
       includeHtml: false,
-      removeTags: []
+      removeTags: [],
+      parsePDF: true
     };
 
     if (mode === "single_urls" && !url.includes(",")) {

@@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
       return res.status(status).json({ error });
     }
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
+    const pageOptions = req.body.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      waitFor: 0,
+      screenshot: false,
+      parsePDF: true
+    };
     const extractorOptions = req.body.extractorOptions ?? {
       mode: "markdown"
     }

@@ -19,6 +19,7 @@ export type PageOptions = {
   screenshot?: boolean;
   headers?: Record<string, string>;
   replaceAllPathsWithAbsolutePaths?: boolean;
+  parsePDF?: boolean;
   removeTags?: string | string[];
 };

@@ -1,5 +1,3 @@
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-
 export async function handleCustomScraping(
   text: string,
   url: string

@@ -280,7 +280,7 @@ export class WebScraperDataProvider {
   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
     return Promise.all(
       pdfLinks.map(async (pdfLink) => {
-        const pdfContent = await fetchAndProcessPdf(pdfLink);
+        const pdfContent = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
         return {
           content: pdfContent,
           metadata: { sourceURL: pdfLink },
@@ -479,6 +479,7 @@ export class WebScraperDataProvider {
       onlyMainContent: false,
       includeHtml: false,
       replaceAllPathsWithAbsolutePaths: false,
+      parsePDF: true,
       removeTags: []
     };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}

@@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
   url: string,
   waitFor: number = 0,
   screenshot: boolean = false,
-  pageOptions: { scrollXPaths?: string[] } = {},
+  pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
   headers?: Record<string, string>,
   options?: any
 ): Promise<FireEngineResponse> {
@@ -88,7 +88,7 @@ export async function scrapWithFireEngine(
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return { html: await fetchAndProcessPdf(url), screenshot: "" };
+      return { html: await fetchAndProcessPdf(url, pageOptions?.parsePDF), screenshot: "" };
     } else {
       const data = response.data;
       const html = data.content;
@@ -108,7 +108,8 @@ export async function scrapWithFireEngine(
 export async function scrapWithScrapingBee(
   url: string,
   wait_browser: string = "domcontentloaded",
-  timeout: number = universalTimeout
+  timeout: number = universalTimeout,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
@@ -129,7 +130,7 @@ export async function scrapWithScrapingBee(
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const decoder = new TextDecoder();
       const text = decoder.decode(response.data);
@@ -144,7 +145,8 @@ export async function scrapWithScrapingBee(
 export async function scrapWithPlaywright(
   url: string,
   waitFor: number = 0,
-  headers?: Record<string, string>
+  headers?: Record<string, string>,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const reqParams = await generateRequestParams(url);
@@ -172,7 +174,7 @@ export async function scrapWithPlaywright(
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const textData = response.data;
       try {
@@ -194,7 +196,10 @@ export async function scrapWithPlaywright(
   }
 }
 
-export async function scrapWithFetch(url: string): Promise<string> {
+export async function scrapWithFetch(
+  url: string,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<string> {
   try {
     const response = await axios.get(url, {
       headers: {
@@ -213,7 +218,7 @@ export async function scrapWithFetch(url: string): Promise<string> {
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const text = response.data;
       return text;
@@ -384,7 +389,7 @@ export async function scrapSingleUrl(
       }
       break;
     case "pdf":
-      customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
+      customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF), screenshot }
      break;
   }
 }
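Each scraper now threads pageOptions through to fetchAndProcessPdf, defaulting to { parsePDF: true } so existing call sites keep their behavior. A usage sketch against the new signatures (URL illustrative, inside an async context):

// Default: the PDF is parsed to text, as before.
const parsed = await scrapWithFetch("https://arxiv.org/pdf/astro-ph/9301001.pdf");

// Opt out: parsing is skipped and the raw file contents come back.
const raw = await scrapWithFetch("https://arxiv.org/pdf/astro-ph/9301001.pdf", { parsePDF: false });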

@@ -3,7 +3,7 @@ import * as pdfProcessor from '../pdfProcessor';
 
 describe('PDF Processing Module - Integration Test', () => {
   it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
     delete process.env.LLAMAPARSE_API_KEY;
-    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
+    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
     expect(pdfContent.trim()).toEqual("Dummy PDF file");
   });

@@ -9,9 +9,9 @@ import os from "os";
 
 dotenv.config();
 
-export async function fetchAndProcessPdf(url: string): Promise<string> {
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<string> {
   const tempFilePath = await downloadPdf(url);
-  const content = await processPdfToText(tempFilePath);
+  const content = await processPdfToText(tempFilePath, parsePDF);
   fs.unlinkSync(tempFilePath); // Clean up the temporary file
   return content;
 }
@@ -34,10 +34,10 @@ async function downloadPdf(url: string): Promise<string> {
   });
 }
 
-export async function processPdfToText(filePath: string): Promise<string> {
+export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
   let content = "";
-  if (process.env.LLAMAPARSE_API_KEY) {
+  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
     const apiKey = process.env.LLAMAPARSE_API_KEY;
     const headers = {
       Authorization: `Bearer ${apiKey}`,
@@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
       console.error("Error processing pdf document w/ LlamaIndex(2)");
       content = await processPdf(filePath);
     }
-  } else {
+  } else if (parsePDF) {
     content = await processPdf(filePath);
+  } else {
+    content = fs.readFileSync(filePath, "utf-8");
   }
   return content;
 }
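The branching in processPdfToText now has three outcomes. A condensed standalone sketch of the decision (selectPdfStrategy is hypothetical; the real logic is inline above):

type PdfStrategy = "llamaparse" | "local-parse" | "raw-file";

function selectPdfStrategy(hasLlamaParseKey: boolean, parsePDF: boolean): PdfStrategy {
  if (hasLlamaParseKey && parsePDF) return "llamaparse"; // managed parsing, local fallback on error
  if (parsePDF) return "local-parse";                    // processPdf(filePath)
  return "raw-file";                                     // fs.readFileSync(filePath, "utf-8")
}

// e.g. selectPdfStrategy(Boolean(process.env.LLAMAPARSE_API_KEY), false) === "raw-file"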