Added parsePDF option to pageOptions
Users can now decide whether to let us parse the PDF for them (the default) or to receive the raw PDF content and parse it themselves.
This commit is contained in:
parent
48f6c19a05
commit
e37d151404
@ -136,6 +136,21 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||||
}, 60000); // 60 seconds
|
}, 60000); // 60 seconds
|
||||||
|
|
||||||
|
it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.post('/v0/scrape')
|
||||||
|
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set('Content-Type', 'application/json')
|
||||||
|
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
|
||||||
|
await new Promise((r) => setTimeout(r, 6000));
|
||||||
|
|
||||||
|
expect(response.statusCode).toBe(200);
|
||||||
|
expect(response.body).toHaveProperty('data');
|
||||||
|
expect(response.body.data).toHaveProperty('content');
|
||||||
|
expect(response.body.data).toHaveProperty('metadata');
|
||||||
|
expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
|
||||||
|
}, 60000); // 60 seconds
|
||||||
|
|
||||||
// TODO: add this test back once we nail the waitFor option to be more deterministic
|
// TODO: add this test back once we nail the waitFor option to be more deterministic
|
||||||
// it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
|
// it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
|
||||||
// const startTime = Date.now();
|
// const startTime = Date.now();
|
||||||
|
@ -56,7 +56,11 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
|
|
||||||
const mode = req.body.mode ?? "crawl";
|
const mode = req.body.mode ?? "crawl";
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
|
const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
|
||||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
const pageOptions = req.body.pageOptions ?? {
|
||||||
|
onlyMainContent: false,
|
||||||
|
includeHtml: false,
|
||||||
|
parsePDF: true
|
||||||
|
};
|
||||||
|
|
||||||
if (mode === "single_urls" && !url.includes(",")) {
|
if (mode === "single_urls" && !url.includes(",")) {
|
||||||
try {
|
try {
|
||||||
|
@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
return res.status(status).json({ error });
|
return res.status(status).json({ error });
|
||||||
}
|
}
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
|
const pageOptions = req.body.pageOptions ?? {
|
||||||
|
onlyMainContent: false,
|
||||||
|
includeHtml: false,
|
||||||
|
waitFor: 0,
|
||||||
|
screenshot: false,
|
||||||
|
parsePDF: true
|
||||||
|
};
|
||||||
const extractorOptions = req.body.extractorOptions ?? {
|
const extractorOptions = req.body.extractorOptions ?? {
|
||||||
mode: "markdown"
|
mode: "markdown"
|
||||||
}
|
}
|
||||||
|
@ -19,6 +19,7 @@ export type PageOptions = {
|
|||||||
screenshot?: boolean;
|
screenshot?: boolean;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||||
|
parsePDF?: boolean
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ExtractorOptions = {
|
export type ExtractorOptions = {
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
|
||||||
|
|
||||||
export async function handleCustomScraping(
|
export async function handleCustomScraping(
|
||||||
text: string,
|
text: string,
|
||||||
url: string
|
url: string
|
||||||
|
@ -280,7 +280,7 @@ export class WebScraperDataProvider {
|
|||||||
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
|
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
|
||||||
return Promise.all(
|
return Promise.all(
|
||||||
pdfLinks.map(async (pdfLink) => {
|
pdfLinks.map(async (pdfLink) => {
|
||||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
const pdfContent = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
|
||||||
return {
|
return {
|
||||||
content: pdfContent,
|
content: pdfContent,
|
||||||
metadata: { sourceURL: pdfLink },
|
metadata: { sourceURL: pdfLink },
|
||||||
@ -475,7 +475,12 @@ export class WebScraperDataProvider {
|
|||||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||||
this.generateImgAltText =
|
this.generateImgAltText =
|
||||||
options.crawlerOptions?.generateImgAltText ?? false;
|
options.crawlerOptions?.generateImgAltText ?? false;
|
||||||
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
|
this.pageOptions = options.pageOptions ?? {
|
||||||
|
onlyMainContent: false,
|
||||||
|
includeHtml: false,
|
||||||
|
replaceAllPathsWithAbsolutePaths: false,
|
||||||
|
parsePDF: true
|
||||||
|
};
|
||||||
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
||||||
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
||||||
|
@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
|
|||||||
url: string,
|
url: string,
|
||||||
waitFor: number = 0,
|
waitFor: number = 0,
|
||||||
screenshot: boolean = false,
|
screenshot: boolean = false,
|
||||||
pageOptions: { scrollXPaths?: string[] } = {},
|
pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
|
||||||
headers?: Record<string, string>,
|
headers?: Record<string, string>,
|
||||||
options?: any
|
options?: any
|
||||||
): Promise<FireEngineResponse> {
|
): Promise<FireEngineResponse> {
|
||||||
@ -88,7 +88,7 @@ export async function scrapWithFireEngine(
|
|||||||
|
|
||||||
const contentType = response.headers["content-type"];
|
const contentType = response.headers["content-type"];
|
||||||
if (contentType && contentType.includes("application/pdf")) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
return { html: await fetchAndProcessPdf(url), screenshot: "" };
|
return { html: await fetchAndProcessPdf(url, pageOptions?.parsePDF), screenshot: "" };
|
||||||
} else {
|
} else {
|
||||||
const data = response.data;
|
const data = response.data;
|
||||||
const html = data.content;
|
const html = data.content;
|
||||||
@ -108,7 +108,8 @@ export async function scrapWithFireEngine(
|
|||||||
export async function scrapWithScrapingBee(
|
export async function scrapWithScrapingBee(
|
||||||
url: string,
|
url: string,
|
||||||
wait_browser: string = "domcontentloaded",
|
wait_browser: string = "domcontentloaded",
|
||||||
timeout: number = universalTimeout
|
timeout: number = universalTimeout,
|
||||||
|
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
|
||||||
): Promise<string> {
|
): Promise<string> {
|
||||||
try {
|
try {
|
||||||
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
|
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
|
||||||
@ -129,7 +130,7 @@ export async function scrapWithScrapingBee(
|
|||||||
|
|
||||||
const contentType = response.headers["content-type"];
|
const contentType = response.headers["content-type"];
|
||||||
if (contentType && contentType.includes("application/pdf")) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
return fetchAndProcessPdf(url);
|
return fetchAndProcessPdf(url, pageOptions?.parsePDF);
|
||||||
} else {
|
} else {
|
||||||
const decoder = new TextDecoder();
|
const decoder = new TextDecoder();
|
||||||
const text = decoder.decode(response.data);
|
const text = decoder.decode(response.data);
|
||||||
@ -144,7 +145,8 @@ export async function scrapWithScrapingBee(
|
|||||||
export async function scrapWithPlaywright(
|
export async function scrapWithPlaywright(
|
||||||
url: string,
|
url: string,
|
||||||
waitFor: number = 0,
|
waitFor: number = 0,
|
||||||
headers?: Record<string, string>
|
headers?: Record<string, string>,
|
||||||
|
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
|
||||||
): Promise<string> {
|
): Promise<string> {
|
||||||
try {
|
try {
|
||||||
const reqParams = await generateRequestParams(url);
|
const reqParams = await generateRequestParams(url);
|
||||||
@ -172,7 +174,7 @@ export async function scrapWithPlaywright(
|
|||||||
|
|
||||||
const contentType = response.headers["content-type"];
|
const contentType = response.headers["content-type"];
|
||||||
if (contentType && contentType.includes("application/pdf")) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
return fetchAndProcessPdf(url);
|
return fetchAndProcessPdf(url, pageOptions?.parsePDF);
|
||||||
} else {
|
} else {
|
||||||
const textData = response.data;
|
const textData = response.data;
|
||||||
try {
|
try {
|
||||||
@ -194,7 +196,10 @@ export async function scrapWithPlaywright(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function scrapWithFetch(url: string): Promise<string> {
|
export async function scrapWithFetch(
|
||||||
|
url: string,
|
||||||
|
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
|
||||||
|
): Promise<string> {
|
||||||
try {
|
try {
|
||||||
const response = await axios.get(url, {
|
const response = await axios.get(url, {
|
||||||
headers: {
|
headers: {
|
||||||
@ -213,7 +218,7 @@ export async function scrapWithFetch(url: string): Promise<string> {
|
|||||||
|
|
||||||
const contentType = response.headers["content-type"];
|
const contentType = response.headers["content-type"];
|
||||||
if (contentType && contentType.includes("application/pdf")) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
return fetchAndProcessPdf(url);
|
return fetchAndProcessPdf(url, pageOptions?.parsePDF);
|
||||||
} else {
|
} else {
|
||||||
const text = response.data;
|
const text = response.data;
|
||||||
return text;
|
return text;
|
||||||
@ -371,7 +376,7 @@ export async function scrapSingleUrl(
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case "pdf":
|
case "pdf":
|
||||||
customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
|
customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF), screenshot }
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,7 +3,7 @@ import * as pdfProcessor from '../pdfProcessor';
|
|||||||
describe('PDF Processing Module - Integration Test', () => {
|
describe('PDF Processing Module - Integration Test', () => {
|
||||||
it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
|
it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
|
||||||
delete process.env.LLAMAPARSE_API_KEY;
|
delete process.env.LLAMAPARSE_API_KEY;
|
||||||
const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
|
const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
|
||||||
expect(pdfContent.trim()).toEqual("Dummy PDF file");
|
expect(pdfContent.trim()).toEqual("Dummy PDF file");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -9,9 +9,9 @@ import os from "os";
|
|||||||
|
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
|
|
||||||
export async function fetchAndProcessPdf(url: string): Promise<string> {
|
export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<string> {
|
||||||
const tempFilePath = await downloadPdf(url);
|
const tempFilePath = await downloadPdf(url);
|
||||||
const content = await processPdfToText(tempFilePath);
|
const content = await processPdfToText(tempFilePath, parsePDF);
|
||||||
fs.unlinkSync(tempFilePath); // Clean up the temporary file
|
fs.unlinkSync(tempFilePath); // Clean up the temporary file
|
||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
@ -34,10 +34,10 @@ async function downloadPdf(url: string): Promise<string> {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function processPdfToText(filePath: string): Promise<string> {
|
export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
|
||||||
let content = "";
|
let content = "";
|
||||||
|
|
||||||
if (process.env.LLAMAPARSE_API_KEY) {
|
if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
|
||||||
const apiKey = process.env.LLAMAPARSE_API_KEY;
|
const apiKey = process.env.LLAMAPARSE_API_KEY;
|
||||||
const headers = {
|
const headers = {
|
||||||
Authorization: `Bearer ${apiKey}`,
|
Authorization: `Bearer ${apiKey}`,
|
||||||
@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
|
|||||||
console.error("Error processing pdf document w/ LlamaIndex(2)");
|
console.error("Error processing pdf document w/ LlamaIndex(2)");
|
||||||
content = await processPdf(filePath);
|
content = await processPdf(filePath);
|
||||||
}
|
}
|
||||||
} else {
|
} else if (parsePDF) {
|
||||||
content = await processPdf(filePath);
|
content = await processPdf(filePath);
|
||||||
|
} else {
|
||||||
|
content = fs.readFileSync(filePath, "utf-8");
|
||||||
}
|
}
|
||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user