
Merge pull request #271 from mendableai/feat/issue-205

[Feat] Added parsePDF option to pageOptions
Rafael Miller 2024-06-14 11:29:26 -03:00 committed by GitHub
commit 2c0a2c742a
9 changed files with 50 additions and 20 deletions


@@ -135,6 +135,21 @@ describe("E2E Tests for API Routes", () => {
      expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
    }, 60000); // 60 seconds

+    it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
+    }, 60000); // 60 seconds
+
    it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
      const responseWithoutRemoveTags = await request(TEST_URL)
        .post("/v0/scrape")


@@ -55,13 +55,15 @@ export async function crawlController(req: Request, res: Response) {
  }

  const mode = req.body.mode ?? "crawl";
  const crawlerOptions = req.body.crawlerOptions ?? {
    allowBackwardCrawling: false
  };
  const pageOptions = req.body.pageOptions ?? {
    onlyMainContent: false,
    includeHtml: false,
-    removeTags: []
+    removeTags: [],
+    parsePDF: true
  };

  if (mode === "single_urls" && !url.includes(",")) {


@@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
    return res.status(status).json({ error });
  }
  const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
+  const pageOptions = req.body.pageOptions ?? {
+    onlyMainContent: false,
+    includeHtml: false,
+    waitFor: 0,
+    screenshot: false,
+    parsePDF: true
+  };
  const extractorOptions = req.body.extractorOptions ?? {
    mode: "markdown"
  }
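One subtlety worth noting: `??` only applies these defaults when `pageOptions` is absent from the request body altogether. A partial object is used as-is, so `parsePDF` is then `undefined` rather than `true`. A quick illustration:

```ts
// ?? falls back only when the left-hand side is null or undefined, so a
// caller who sends a partial pageOptions object skips the defaults entirely.
const defaults = {
  onlyMainContent: false,
  includeHtml: false,
  waitFor: 0,
  screenshot: false,
  parsePDF: true,
};

const body: { pageOptions?: Record<string, unknown> } = {
  pageOptions: { onlyMainContent: true }, // parsePDF omitted by the caller
};

const pageOptions = body.pageOptions ?? defaults;
console.log(pageOptions.parsePDF); // undefined, not true
```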


@@ -19,6 +19,7 @@ export type PageOptions = {
  screenshot?: boolean;
  headers?: Record<string, string>;
  replaceAllPathsWithAbsolutePaths?: boolean;
+  parsePDF?: boolean;
  removeTags?: string | string[];
};
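With the field on the type, callers of the scraper can opt out of PDF parsing in a type-checked way; a minimal sketch (the import path is an assumption):

```ts
import { PageOptions } from "./lib/entities"; // path is an assumption

// Request the raw PDF body instead of parsed text.
const pageOptions: PageOptions = {
  onlyMainContent: false,
  parsePDF: false,
};
```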


@@ -1,5 +1,3 @@
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-
export async function handleCustomScraping(
  text: string,
  url: string


@@ -280,7 +280,7 @@ export class WebScraperDataProvider {
  private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
    return Promise.all(
      pdfLinks.map(async (pdfLink) => {
-        const pdfContent = await fetchAndProcessPdf(pdfLink);
+        const pdfContent = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
        return {
          content: pdfContent,
          metadata: { sourceURL: pdfLink },

@@ -479,6 +479,7 @@ export class WebScraperDataProvider {
      onlyMainContent: false,
      includeHtml: false,
      replaceAllPathsWithAbsolutePaths: false,
+      parsePDF: true,
      removeTags: []
    };
    this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}


@@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
  url: string,
  waitFor: number = 0,
  screenshot: boolean = false,
-  pageOptions: { scrollXPaths?: string[] } = {},
+  pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
  headers?: Record<string, string>,
  options?: any
): Promise<FireEngineResponse> {

@@ -88,7 +88,7 @@ export async function scrapWithFireEngine(
    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return { html: await fetchAndProcessPdf(url), screenshot: "" };
+      return { html: await fetchAndProcessPdf(url, pageOptions?.parsePDF), screenshot: "" };
    } else {
      const data = response.data;
      const html = data.content;

@@ -108,7 +108,8 @@ export async function scrapWithFireEngine(
export async function scrapWithScrapingBee(
  url: string,
  wait_browser: string = "domcontentloaded",
-  timeout: number = universalTimeout
+  timeout: number = universalTimeout,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<string> {
  try {
    const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);

@@ -129,7 +130,7 @@ export async function scrapWithScrapingBee(
    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
    } else {
      const decoder = new TextDecoder();
      const text = decoder.decode(response.data);

@@ -144,7 +145,8 @@ export async function scrapWithScrapingBee(
export async function scrapWithPlaywright(
  url: string,
  waitFor: number = 0,
-  headers?: Record<string, string>
+  headers?: Record<string, string>,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<string> {
  try {
    const reqParams = await generateRequestParams(url);

@@ -172,7 +174,7 @@ export async function scrapWithPlaywright(
    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
    } else {
      const textData = response.data;
      try {

@@ -194,7 +196,10 @@ export async function scrapWithPlaywright(
  }
}

-export async function scrapWithFetch(url: string): Promise<string> {
+export async function scrapWithFetch(
+  url: string,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<string> {
  try {
    const response = await axios.get(url, {
      headers: {

@@ -213,7 +218,7 @@ export async function scrapWithFetch(
    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
    } else {
      const text = response.data;
      return text;

@@ -384,7 +389,7 @@ export async function scrapSingleUrl(
      }
      break;
    case "pdf":
-      customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
+      customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF), screenshot }
      break;
  }
}
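All four scrapers now share the same shape: run the fetch, check the response's Content-Type, and route PDFs through fetchAndProcessPdf with the caller's parsePDF preference. A condensed, self-contained sketch of that pattern (the import path is an assumption):

```ts
import axios from "axios";
import { fetchAndProcessPdf } from "./utils/pdfProcessor"; // path assumed

// Condensed sketch of the pattern repeated in scrapWithFireEngine,
// scrapWithScrapingBee, scrapWithPlaywright, and scrapWithFetch.
async function scrapeWithPdfHandling(
  url: string,
  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<string> {
  const response = await axios.get(url, { responseType: "arraybuffer" });
  const contentType = response.headers["content-type"];
  if (contentType && contentType.includes("application/pdf")) {
    // Route PDFs through the shared processor, honoring the caller's choice
    // (defaulting to true here since fetchAndProcessPdf expects a boolean).
    return fetchAndProcessPdf(url, pageOptions?.parsePDF ?? true);
  }
  // Non-PDF responses: decode and return the body as text.
  return new TextDecoder().decode(response.data);
}
```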


@@ -3,7 +3,7 @@ import * as pdfProcessor from '../pdfProcessor';

describe('PDF Processing Module - Integration Test', () => {
  it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
    delete process.env.LLAMAPARSE_API_KEY;
-    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
+    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
    expect(pdfContent.trim()).toEqual("Dummy PDF file");
  });
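The updated integration test only covers parsePDF: true; a companion test for the false path might look like the following (a sketch, not part of this PR; it relies on the fact that every valid PDF file begins with the "%PDF" magic header):

```ts
// Sketch of a possible companion test (not in this PR): with parsePDF false,
// fetchAndProcessPdf should return the raw file contents, which for any
// valid PDF begin with the "%PDF" magic header.
it('should return raw PDF content when parsePDF is false', async () => {
  const pdfContent = await pdfProcessor.fetchAndProcessPdf(
    'https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf',
    false
  );
  expect(pdfContent).toContain('%PDF');
});
```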


@@ -9,9 +9,9 @@ import os from "os";

dotenv.config();

-export async function fetchAndProcessPdf(url: string): Promise<string> {
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<string> {
  const tempFilePath = await downloadPdf(url);
-  const content = await processPdfToText(tempFilePath);
+  const content = await processPdfToText(tempFilePath, parsePDF);
  fs.unlinkSync(tempFilePath); // Clean up the temporary file
  return content;
}

@@ -34,10 +34,10 @@ async function downloadPdf(url: string): Promise<string> {
  });
}

-export async function processPdfToText(filePath: string): Promise<string> {
+export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
  let content = "";

-  if (process.env.LLAMAPARSE_API_KEY) {
+  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
    const apiKey = process.env.LLAMAPARSE_API_KEY;
    const headers = {
      Authorization: `Bearer ${apiKey}`,

@@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
      console.error("Error processing pdf document w/ LlamaIndex(2)");
      content = await processPdf(filePath);
    }
-  } else {
+  } else if (parsePDF) {
    content = await processPdf(filePath);
+  } else {
+    content = fs.readFileSync(filePath, "utf-8");
  }
  return content;
}
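Put together, processPdfToText now has three outcomes. A stripped-down sketch of the decision logic (llamaParse and processPdf are hypothetical stand-ins for the module's real LlamaParse call and local parser):

```ts
import fs from "fs";

// Hypothetical stand-ins for the module's LlamaParse integration and its
// local pdf parser; only the branch structure below is taken from the PR.
declare function llamaParse(filePath: string): Promise<string>;
declare function processPdf(filePath: string): Promise<string>;

async function decide(filePath: string, parsePDF: boolean): Promise<string> {
  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
    // Key configured and parsing requested: use LlamaParse (the real code
    // falls back to the local parser if the service errors).
    return llamaParse(filePath);
  } else if (parsePDF) {
    // Parsing requested but no key: use the local parser directly.
    return processPdf(filePath);
  } else {
    // parsePDF === false: return the file contents untouched.
    return fs.readFileSync(filePath, "utf-8");
  }
}
```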