Merge pull request #271 from mendableai/feat/issue-205
[Feat] Added parsePDF option to pageOptions
commit 2c0a2c742a
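The change threads a parsePDF flag (defaulting to true) from the request's pageOptions down to the PDF processor, so callers can receive the raw PDF file contents instead of extracted text. A minimal sketch of a request that disables parsing, reusing the TEST_URL and TEST_API_KEY names from the E2E test below (they are test fixtures, not part of this diff):

// "request" here is supertest's request(), as used in the E2E suite.
const response = await request(TEST_URL)
  .post('/v0/scrape')
  .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
  .set('Content-Type', 'application/json')
  // parsePDF: false -> data.content holds raw PDF markup such as "/Title(...)>>endobj"
  // parsePDF: true (or omitted) -> data.content holds the extracted text
  .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });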
@@ -135,6 +135,21 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
     }, 60000); // 60 seconds
 
+    it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
+    }, 60000); // 60 seconds
+
     it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
       const responseWithoutRemoveTags = await request(TEST_URL)
         .post("/v0/scrape")
@@ -55,13 +55,15 @@ export async function crawlController(req: Request, res: Response) {
     }
 
     const mode = req.body.mode ?? "crawl";
 
     const crawlerOptions = req.body.crawlerOptions ?? {
      allowBackwardCrawling: false
    };
    const pageOptions = req.body.pageOptions ?? {
      onlyMainContent: false,
      includeHtml: false,
-     removeTags: []
+     removeTags: [],
+     parsePDF: true
    };
 
    if (mode === "single_urls" && !url.includes(",")) {
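The crawl endpoint picks up the same default, so PDFs discovered during a crawl are parsed unless the caller opts out. A rough sketch of opting out on /v0/crawl, in the same supertest style as the tests (the target URL is illustrative):

const crawl = await request(TEST_URL)
  .post('/v0/crawl')
  .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
  .set('Content-Type', 'application/json')
  .send({ url: 'https://example.com', pageOptions: { parsePDF: false } }); // any linked PDFs are returned unparsed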
@@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
     return res.status(status).json({ error });
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
+  const pageOptions = req.body.pageOptions ?? {
+    onlyMainContent: false,
+    includeHtml: false,
+    waitFor: 0,
+    screenshot: false,
+    parsePDF: true
+  };
   const extractorOptions = req.body.extractorOptions ?? {
     mode: "markdown"
   }
@@ -19,6 +19,7 @@ export type PageOptions = {
   screenshot?: boolean;
   headers?: Record<string, string>;
   replaceAllPathsWithAbsolutePaths?: boolean;
+  parsePDF?: boolean;
   removeTags?: string | string[];
 };
 
@@ -1,5 +1,3 @@
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-
 export async function handleCustomScraping(
   text: string,
   url: string
@@ -280,7 +280,7 @@ export class WebScraperDataProvider {
   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
     return Promise.all(
       pdfLinks.map(async (pdfLink) => {
-        const pdfContent = await fetchAndProcessPdf(pdfLink);
+        const pdfContent = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
         return {
           content: pdfContent,
           metadata: { sourceURL: pdfLink },
@@ -479,6 +479,7 @@ export class WebScraperDataProvider {
       onlyMainContent: false,
       includeHtml: false,
       replaceAllPathsWithAbsolutePaths: false,
+      parsePDF: true,
       removeTags: []
     };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
@@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
   url: string,
   waitFor: number = 0,
   screenshot: boolean = false,
-  pageOptions: { scrollXPaths?: string[] } = {},
+  pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
   headers?: Record<string, string>,
   options?: any
 ): Promise<FireEngineResponse> {
@@ -88,7 +88,7 @@ export async function scrapWithFireEngine(
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return { html: await fetchAndProcessPdf(url), screenshot: "" };
+      return { html: await fetchAndProcessPdf(url, pageOptions?.parsePDF), screenshot: "" };
     } else {
       const data = response.data;
       const html = data.content;
@@ -108,7 +108,8 @@ export async function scrapWithFireEngine(
 export async function scrapWithScrapingBee(
   url: string,
   wait_browser: string = "domcontentloaded",
-  timeout: number = universalTimeout
+  timeout: number = universalTimeout,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
@@ -129,7 +130,7 @@ export async function scrapWithScrapingBee(
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const decoder = new TextDecoder();
       const text = decoder.decode(response.data);
@@ -144,7 +145,8 @@ export async function scrapWithScrapingBee(
 export async function scrapWithPlaywright(
   url: string,
   waitFor: number = 0,
-  headers?: Record<string, string>
+  headers?: Record<string, string>,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const reqParams = await generateRequestParams(url);
@@ -172,7 +174,7 @@ export async function scrapWithPlaywright(
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const textData = response.data;
       try {
@@ -194,7 +196,10 @@ export async function scrapWithPlaywright(
   }
 }
 
-export async function scrapWithFetch(url: string): Promise<string> {
+export async function scrapWithFetch(
+  url: string,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<string> {
   try {
     const response = await axios.get(url, {
       headers: {
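Because the new parameter defaults to { parsePDF: true }, existing call sites that pass only a URL keep their old behavior. A sketch (the URL is illustrative):

const parsedText = await scrapWithFetch('https://example.com/paper.pdf');                      // PDF parsed to text, as before
const rawContent = await scrapWithFetch('https://example.com/paper.pdf', { parsePDF: false }); // raw file contents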
@@ -213,7 +218,7 @@ export async function scrapWithFetch(url: string): Promise<string> {
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const text = response.data;
       return text;
@@ -384,7 +389,7 @@ export async function scrapSingleUrl(
         }
         break;
       case "pdf":
-        customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
+        customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF), screenshot }
         break;
     }
   }
@@ -3,7 +3,7 @@ import * as pdfProcessor from '../pdfProcessor';
 describe('PDF Processing Module - Integration Test', () => {
   it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
     delete process.env.LLAMAPARSE_API_KEY;
-    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
+    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
     expect(pdfContent.trim()).toEqual("Dummy PDF file");
   });
 
@@ -9,9 +9,9 @@ import os from "os";
 
 dotenv.config();
 
-export async function fetchAndProcessPdf(url: string): Promise<string> {
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<string> {
   const tempFilePath = await downloadPdf(url);
-  const content = await processPdfToText(tempFilePath);
+  const content = await processPdfToText(tempFilePath, parsePDF);
   fs.unlinkSync(tempFilePath); // Clean up the temporary file
   return content;
 }
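fetchAndProcessPdf now takes the flag as an explicit second argument and forwards it to processPdfToText. A sketch of the two modes (URL illustrative):

const text = await fetchAndProcessPdf('https://example.com/doc.pdf', true);  // extracted text (LlamaParse or local parser)
const raw  = await fetchAndProcessPdf('https://example.com/doc.pdf', false); // raw file contents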
@@ -34,10 +34,10 @@ async function downloadPdf(url: string): Promise<string> {
   });
 }
 
-export async function processPdfToText(filePath: string): Promise<string> {
+export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
   let content = "";
 
-  if (process.env.LLAMAPARSE_API_KEY) {
+  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
     const apiKey = process.env.LLAMAPARSE_API_KEY;
     const headers = {
       Authorization: `Bearer ${apiKey}`,
@@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
       console.error("Error processing pdf document w/ LlamaIndex(2)");
       content = await processPdf(filePath);
     }
-  } else {
+  } else if (parsePDF) {
     content = await processPdf(filePath);
+  } else {
+    content = fs.readFileSync(filePath, "utf-8");
   }
   return content;
 }
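Putting the last hunks together, the branching in processPdfToText after this change is roughly (a summary sketch, not the literal code):

// LLAMAPARSE_API_KEY set and parsePDF truthy -> parse via LlamaParse, falling back to processPdf() on error
// no LLAMAPARSE_API_KEY but parsePDF truthy  -> parse locally via processPdf()
// parsePDF falsy                             -> content = fs.readFileSync(filePath, "utf-8") (raw file contents)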