Merge pull request #271 from mendableai/feat/issue-205
[Feat] Added parsePDF option to pageOptions
Commit 2c0a2c742a
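As a quick illustration of the new option, here is a hedged sketch of how a client could request a PDF without parsing it, mirroring the request shape used in the E2E test below. The endpoint path, the `pageOptions.parsePDF` field, and the sample arXiv URL come from this diff; the base URL and the surrounding client code are assumptions for illustration only.

```ts
// Illustrative client call, not part of the PR. Assumes a locally running API
// (the base URL is a guess) and Node 18+ for the built-in fetch.
async function scrapePdfWithoutParsing(apiKey: string): Promise<string> {
  const res = await fetch("http://localhost:3002/v0/scrape", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
      // parsePDF defaults to true; false returns the raw PDF contents instead of extracted text.
      pageOptions: { parsePDF: false },
    }),
  });
  const body = await res.json();
  return body.data.content;
}
```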
@@ -135,6 +135,21 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
     }, 60000); // 60 seconds

+    it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
+    }, 60000); // 60 seconds
+
     it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
       const responseWithoutRemoveTags = await request(TEST_URL)
         .post("/v0/scrape")
@@ -55,13 +55,15 @@ export async function crawlController(req: Request, res: Response) {
     }

     const mode = req.body.mode ?? "crawl";
+
     const crawlerOptions = req.body.crawlerOptions ?? {
       allowBackwardCrawling: false
     };
     const pageOptions = req.body.pageOptions ?? {
       onlyMainContent: false,
       includeHtml: false,
-      removeTags: []
+      removeTags: [],
+      parsePDF: true
     };

     if (mode === "single_urls" && !url.includes(",")) {
@@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
       return res.status(status).json({ error });
     }
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
+    const pageOptions = req.body.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      waitFor: 0,
+      screenshot: false,
+      parsePDF: true
+    };
     const extractorOptions = req.body.extractorOptions ?? {
       mode: "markdown"
     }
@@ -19,6 +19,7 @@ export type PageOptions = {
   screenshot?: boolean;
   headers?: Record<string, string>;
   replaceAllPathsWithAbsolutePaths?: boolean;
+  parsePDF?: boolean;
   removeTags?: string | string[];
 };

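For reference, a minimal sketch of a fully populated pageOptions value using the new field. The field names mirror the PageOptions type above and the controller defaults elsewhere in this diff; the concrete values are only examples, not taken from the code.

```ts
// Example pageOptions payload (illustrative values; field names taken from this diff).
const examplePageOptions = {
  onlyMainContent: false,
  includeHtml: false,
  waitFor: 0,
  screenshot: false,
  removeTags: [] as string[],
  parsePDF: false, // opt out of PDF-to-text conversion for this request
};
```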
@@ -1,5 +1,3 @@
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-
 export async function handleCustomScraping(
   text: string,
   url: string
@@ -280,7 +280,7 @@ export class WebScraperDataProvider {
   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
     return Promise.all(
       pdfLinks.map(async (pdfLink) => {
-        const pdfContent = await fetchAndProcessPdf(pdfLink);
+        const pdfContent = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
         return {
           content: pdfContent,
           metadata: { sourceURL: pdfLink },
@@ -479,6 +479,7 @@ export class WebScraperDataProvider {
       onlyMainContent: false,
       includeHtml: false,
       replaceAllPathsWithAbsolutePaths: false,
+      parsePDF: true,
       removeTags: []
     };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
@@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
   url: string,
   waitFor: number = 0,
   screenshot: boolean = false,
-  pageOptions: { scrollXPaths?: string[] } = {},
+  pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
   headers?: Record<string, string>,
   options?: any
 ): Promise<FireEngineResponse> {
@@ -88,7 +88,7 @@ export async function scrapWithFireEngine(

     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return { html: await fetchAndProcessPdf(url), screenshot: "" };
+      return { html: await fetchAndProcessPdf(url, pageOptions?.parsePDF), screenshot: "" };
     } else {
       const data = response.data;
       const html = data.content;
@@ -108,7 +108,8 @@ export async function scrapWithFireEngine(
 export async function scrapWithScrapingBee(
   url: string,
   wait_browser: string = "domcontentloaded",
-  timeout: number = universalTimeout
+  timeout: number = universalTimeout,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
@@ -129,7 +130,7 @@ export async function scrapWithScrapingBee(

     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const decoder = new TextDecoder();
       const text = decoder.decode(response.data);
@@ -144,7 +145,8 @@ export async function scrapWithScrapingBee(
 export async function scrapWithPlaywright(
   url: string,
   waitFor: number = 0,
-  headers?: Record<string, string>
+  headers?: Record<string, string>,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const reqParams = await generateRequestParams(url);
@@ -172,7 +174,7 @@ export async function scrapWithPlaywright(

     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const textData = response.data;
       try {
@@ -194,7 +196,10 @@ export async function scrapWithPlaywright(
   }
 }

-export async function scrapWithFetch(url: string): Promise<string> {
+export async function scrapWithFetch(
+  url: string,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<string> {
   try {
     const response = await axios.get(url, {
       headers: {
@@ -213,7 +218,7 @@ export async function scrapWithFetch(url: string): Promise<string> {

     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const text = response.data;
       return text;
@@ -384,7 +389,7 @@ export async function scrapSingleUrl(
         }
         break;
       case "pdf":
-        customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
+        customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF), screenshot }
         break;
     }
   }
@@ -3,7 +3,7 @@ import * as pdfProcessor from '../pdfProcessor';
 describe('PDF Processing Module - Integration Test', () => {
   it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
     delete process.env.LLAMAPARSE_API_KEY;
-    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
+    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
     expect(pdfContent.trim()).toEqual("Dummy PDF file");
   });

@@ -9,9 +9,9 @@ import os from "os";

 dotenv.config();

-export async function fetchAndProcessPdf(url: string): Promise<string> {
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<string> {
   const tempFilePath = await downloadPdf(url);
-  const content = await processPdfToText(tempFilePath);
+  const content = await processPdfToText(tempFilePath, parsePDF);
   fs.unlinkSync(tempFilePath); // Clean up the temporary file
   return content;
 }
@@ -34,10 +34,10 @@ async function downloadPdf(url: string): Promise<string> {
   });
 }

-export async function processPdfToText(filePath: string): Promise<string> {
+export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
   let content = "";

-  if (process.env.LLAMAPARSE_API_KEY) {
+  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
     const apiKey = process.env.LLAMAPARSE_API_KEY;
     const headers = {
       Authorization: `Bearer ${apiKey}`,
@@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
       console.error("Error processing pdf document w/ LlamaIndex(2)");
       content = await processPdf(filePath);
     }
-  } else {
+  } else if (parsePDF) {
     content = await processPdf(filePath);
+  } else {
+    content = fs.readFileSync(filePath, "utf-8");
   }
   return content;
 }
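Taken together with the pdfProcessor changes above, the flag decides whether a downloaded PDF is run through LlamaParse or the local parser, or returned as-is. A hedged usage sketch follows; the relative import path is an assumption, while the function name and signature are the ones introduced in this diff.

```ts
import * as pdfProcessor from "./pdfProcessor"; // module path is an assumption

// Compare parsed vs. raw output for the same PDF (illustrative only).
async function comparePdfModes(url: string): Promise<void> {
  const parsed = await pdfProcessor.fetchAndProcessPdf(url, true);  // extracted text
  const raw = await pdfProcessor.fetchAndProcessPdf(url, false);    // raw file contents
  console.log("parsed preview:", parsed.slice(0, 80));
  console.log("raw preview:", raw.slice(0, 80)); // e.g. PDF syntax such as "/Title(...)"
}
```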