diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 8106ae1..abe5c58 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -103,6 +103,36 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.markdown).toContain("🔥 FireCrawl"); expect(response.body.data.html).toContain(" { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' }); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + }, 60000); // 60 seconds + + it('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' }); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + }, 60000); // 60 seconds }); describe("POST /v0/crawl", () => { @@ -548,7 +578,130 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl"); }, 60000); // 60 seconds - }); + + it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => { + const crawlResponse = await request(TEST_URL) + .post('/v0/crawl') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }}); + expect(crawlResponse.statusCode).toBe(200); + + let isCompleted = false; + let completedResponse; + + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('status'); + + if (response.body.status === 'completed') { + isCompleted = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } + expect(completedResponse.body.status).toBe('completed'); + expect(completedResponse.body).toHaveProperty('data'); + expect(completedResponse.body.data.length).toEqual(1); + expect(completedResponse.body.data).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.') + }) + ]) + ); + }, 60000); // 60 seconds + + it("should return a successful response with max depth option for a valid crawl job", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + crawlerOptions: { maxDepth: 2 }, + }); + expect(crawlResponse.statusCode).toBe(200); + + let isCompleted = false; + let completedResponse; + + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + + if (response.body.status === "completed") { + isCompleted = true; + completedResponse = response; + } + } + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(1); + + // Check if all URLs have a maximum depth of 1 + urls.forEach((url) => { + const depth = new URL(url).pathname.split("/").filter(Boolean).length; + expect(depth).toBeLessThanOrEqual(1); + }); + }, 120000); + + it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://firecrawl.dev", + pageOptions: { includeHtml: true }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 30000)); + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + + // 120 seconds + expect(completedResponse.body.data[0]).toHaveProperty("html"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl"); + expect(completedResponse.body.data[0].markdown).toContain("FireCrawl"); + expect(completedResponse.body.data[0].html).toContain(" { const crawlResponse = await request(TEST_URL) @@ -559,7 +712,7 @@ describe("E2E Tests for API Routes", () => { expect(crawlResponse.statusCode).toBe(200); // wait for 30 seconds - await new Promise((r) => setTimeout(r, 10000)); + await new Promise((r) => setTimeout(r, 20000)); const response = await request(TEST_URL) .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) @@ -568,7 +721,7 @@ describe("E2E Tests for API Routes", () => { expect(response.body).toHaveProperty("status"); expect(response.body.status).toBe("cancelled"); - await new Promise((r) => setTimeout(r, 20000)); + await new Promise((r) => setTimeout(r, 10000)); const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) @@ -585,8 +738,6 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 seconds - - describe("POST /v0/scrape with LLM Extraction", () => { it("should extract data using LLM extraction mode", async () => { const response = await request(TEST_URL) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index d244993..0e295ae 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -197,7 +197,9 @@ export class WebScraperDataProvider { private async handleSingleUrlsMode( inProgress?: (progress: Progress) => void ): Promise { - let documents = await this.processLinks(this.urls, inProgress); + const links = this.urls; + + let documents = await this.processLinks(links, inProgress); return documents; } diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 4bbaee7..4c08168 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -6,6 +6,7 @@ import { Document, PageOptions } from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; import { excludeNonMainTags } from "./utils/excludeTags"; import { urlSpecificParams } from "./utils/custom/website_params"; +import { fetchAndProcessPdf } from "./utils/pdfProcessor"; dotenv.config(); @@ -66,9 +67,15 @@ export async function scrapWithScrapingBee( ); return ""; } - const decoder = new TextDecoder(); - const text = decoder.decode(response.data); - return text; + + const contentType = response.headers['content-type']; + if (contentType && contentType.includes('application/pdf')) { + return fetchAndProcessPdf(url); + } else { + const decoder = new TextDecoder(); + const text = decoder.decode(response.data); + return text; + } } catch (error) { console.error(`Error scraping with Scraping Bee: ${error}`); return ""; @@ -95,9 +102,14 @@ export async function scrapWithPlaywright(url: string): Promise { return ""; } - const data = await response.json(); - const html = data.content; - return html ?? ""; + const contentType = response.headers['content-type']; + if (contentType && contentType.includes('application/pdf')) { + return fetchAndProcessPdf(url); + } else { + const data = await response.json(); + const html = data.content; + return html ?? ""; + } } catch (error) { console.error(`Error scraping with Puppeteer: ${error}`); return ""; @@ -165,7 +177,13 @@ export async function scrapSingleUrl( ); return ""; } - text = await response.text(); + + const contentType = response.headers['content-type']; + if (contentType && contentType.includes('application/pdf')) { + return fetchAndProcessPdf(url); + } else { + text = await response.text(); + } } catch (error) { console.error(`Error scraping URL: ${error}`); return ""; diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index fb08d9c..7c57007 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -19,8 +19,8 @@ export async function fetchAndProcessPdf(url: string): Promise { async function downloadPdf(url: string): Promise { const response = await axios({ url, - method: 'GET', - responseType: 'stream', + method: "GET", + responseType: "stream", }); const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`); @@ -29,8 +29,8 @@ async function downloadPdf(url: string): Promise { response.data.pipe(writer); return new Promise((resolve, reject) => { - writer.on('finish', () => resolve(tempFilePath)); - writer.on('error', reject); + writer.on("finish", () => resolve(tempFilePath)); + writer.on("error", reject); }); } @@ -77,12 +77,12 @@ export async function processPdfToText(filePath: string): Promise { } else { // If the status code is not 200, increment the attempt counter and wait attempt++; - await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds + await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds } } catch (error) { - console.error("Error fetching result:", error); + console.error("Error fetching result:", error || ''); attempt++; - await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds before retrying + await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying // You may want to handle specific errors differently } } @@ -101,7 +101,7 @@ export async function processPdfToText(filePath: string): Promise { return content; } -async function processPdf(file: string){ +async function processPdf(file: string) { const fileContent = fs.readFileSync(file); const data = await pdf(fileContent); return data.text;