diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 8106ae1..abe5c58 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -103,6 +103,36 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data.markdown).toContain("🔥 FireCrawl");
expect(response.body.data.html).toContain("<h1");
}, 30000); // 30 seconds timeout

+ it('should return a successful response for a valid scrape with PDF file', async () => {
+ const response = await request(TEST_URL)
+ .post('/v0/scrape')
+ .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+ .set('Content-Type', 'application/json')
+ .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' });
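+ // Note: the request above has already resolved; this fixed wait just gives any asynchronous post-processing time to settle before asserting.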
+ await new Promise((r) => setTimeout(r, 6000));
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty('data');
+ expect(response.body.data).toHaveProperty('content');
+ expect(response.body.data).toHaveProperty('metadata');
+ expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+ }, 60000); // 60 seconds
+
+ it('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
+ const response = await request(TEST_URL)
+ .post('/v0/scrape')
+ .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+ .set('Content-Type', 'application/json')
+ .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' });
+ await new Promise((r) => setTimeout(r, 6000));
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty('data');
+ expect(response.body.data).toHaveProperty('content');
+ expect(response.body.data).toHaveProperty('metadata');
+ expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+ }, 60000); // 60 seconds
});
describe("POST /v0/crawl", () => {
@@ -548,7 +578,130 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
}, 60000); // 60 seconds
- });
+
+ it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
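+ // The excludes keep the crawler from wandering into arXiv listing, login, abstract, and archive pages.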
+ const crawlResponse = await request(TEST_URL)
+ .post('/v0/crawl')
+ .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+ .set('Content-Type', 'application/json')
+ .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }});
+ expect(crawlResponse.statusCode).toBe(200);
+
+ let isCompleted = false;
+ let completedResponse;
+
+ while (!isCompleted) {
+ const response = await request(TEST_URL)
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+ .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty('status');
+
+ if (response.body.status === 'completed') {
+ isCompleted = true;
+ completedResponse = response;
+ } else {
+ await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
+ }
+ }
+ expect(completedResponse.body.status).toBe('completed');
+ expect(completedResponse.body).toHaveProperty('data');
+ expect(completedResponse.body.data.length).toEqual(1);
+ expect(completedResponse.body.data).toEqual(
+ expect.arrayContaining([
+ expect.objectContaining({
+ content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.')
+ })
+ ])
+ );
+ }, 60000); // 60 seconds
+
+ it("should return a successful response with max depth option for a valid crawl job", async () => {
+ const crawlResponse = await request(TEST_URL)
+ .post("/v0/crawl")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({
+ url: "https://www.scrapethissite.com",
+ crawlerOptions: { maxDepth: 2 },
+ });
+ expect(crawlResponse.statusCode).toBe(200);
+
+ let isCompleted = false;
+ let completedResponse;
+
+ while (!isCompleted) {
+ const response = await request(TEST_URL)
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("status");
+
+ if (response.body.status === "completed") {
+ isCompleted = true;
+ completedResponse = response;
+ } else {
+ await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
+ }
+ }
+ expect(completedResponse.statusCode).toBe(200);
+ expect(completedResponse.body).toHaveProperty("status");
+ expect(completedResponse.body.status).toBe("completed");
+ expect(completedResponse.body).toHaveProperty("data");
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+ const urls = completedResponse.body.data.map(
+ (item: any) => item.metadata?.sourceURL
+ );
+ expect(urls.length).toBeGreaterThan(1);
+
+ // Check that every crawled URL is at most one path segment deep (consistent with maxDepth: 2)
+ urls.forEach((url) => {
+ const depth = new URL(url).pathname.split("/").filter(Boolean).length;
+ expect(depth).toBeLessThanOrEqual(1);
+ });
+ }, 120000);
+
+ it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
+ const crawlResponse = await request(TEST_URL)
+ .post("/v0/crawl")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({
+ url: "https://firecrawl.dev",
+ pageOptions: { includeHtml: true },
+ });
+ expect(crawlResponse.statusCode).toBe(200);
+
+ const response = await request(TEST_URL)
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("status");
+ expect(response.body.status).toBe("active");
+
+ // wait for 30 seconds
+ await new Promise((r) => setTimeout(r, 30000));
+
+ const completedResponse = await request(TEST_URL)
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+ expect(completedResponse.statusCode).toBe(200);
+ expect(completedResponse.body).toHaveProperty("status");
+ expect(completedResponse.body.status).toBe("completed");
+ expect(completedResponse.body).toHaveProperty("data");
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+
+ expect(completedResponse.body.data[0]).toHaveProperty("html");
+ expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
+ expect(completedResponse.body.data[0].markdown).toContain("FireCrawl");
+ expect(completedResponse.body.data[0].html).toContain("<h1");
+ }, 60000); // 60 seconds
+ });

it("If someone cancels a crawl job, it should turn into failed status", async () => {
const crawlResponse = await request(TEST_URL)
@@ -559,7 +712,7 @@ describe("E2E Tests for API Routes", () => {
expect(crawlResponse.statusCode).toBe(200);
- // wait for 30 seconds
- await new Promise((r) => setTimeout(r, 10000));
+ // wait for 20 seconds
+ await new Promise((r) => setTimeout(r, 20000));
const response = await request(TEST_URL)
.delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
@@ -568,7 +721,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.body).toHaveProperty("status");
expect(response.body.status).toBe("cancelled");
- await new Promise((r) => setTimeout(r, 20000));
+ await new Promise((r) => setTimeout(r, 10000));
const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
@@ -585,8 +738,6 @@ describe("E2E Tests for API Routes", () => {
}, 60000); // 60 seconds
-
-
describe("POST /v0/scrape with LLM Extraction", () => {
it("should extract data using LLM extraction mode", async () => {
const response = await request(TEST_URL)
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index d244993..0e295ae 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -197,7 +197,9 @@ export class WebScraperDataProvider {
private async handleSingleUrlsMode(
inProgress?: (progress: Progress) => void
): Promise<Document[]> {
- let documents = await this.processLinks(this.urls, inProgress);
+ const links = this.urls;
+
+ let documents = await this.processLinks(links, inProgress);
return documents;
}
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 4bbaee7..4c08168 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -6,6 +6,7 @@ import { Document, PageOptions } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params";
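+// fetchAndProcessPdf downloads a PDF to a temp file and returns its extracted text (see utils/pdfProcessor.ts)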
+import { fetchAndProcessPdf } from "./utils/pdfProcessor";
dotenv.config();
@@ -66,9 +67,15 @@ export async function scrapWithScrapingBee(
);
return "";
}
- const decoder = new TextDecoder();
- const text = decoder.decode(response.data);
- return text;
+
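+ // PDFs are often served from extensionless URLs, so sniff the Content-Type header instead of relying on the URL.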
+ const contentType = response.headers['content-type'];
+ if (contentType && contentType.includes('application/pdf')) {
+ return fetchAndProcessPdf(url);
+ } else {
+ const decoder = new TextDecoder();
+ const text = decoder.decode(response.data);
+ return text;
+ }
} catch (error) {
console.error(`Error scraping with Scraping Bee: ${error}`);
return "";
@@ -95,9 +102,14 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
return "";
}
- const data = await response.json();
- const html = data.content;
- return html ?? "";
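+ // Same Content-Type check as the ScrapingBee path: PDF responses get routed to the PDF processor.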
+ const contentType = response.headers.get('content-type');
+ if (contentType && contentType.includes('application/pdf')) {
+ return fetchAndProcessPdf(url);
+ } else {
+ const data = await response.json();
+ const html = data.content;
+ return html ?? "";
+ }
} catch (error) {
console.error(`Error scraping with Puppeteer: ${error}`);
return "";
@@ -165,7 +177,13 @@ export async function scrapSingleUrl(
);
return "";
}
- text = await response.text();
+
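+ // Check the response Content-Type here as well, since the URL alone may not reveal a PDF.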
+ const contentType = response.headers.get('content-type');
+ if (contentType && contentType.includes('application/pdf')) {
+ return fetchAndProcessPdf(url);
+ } else {
+ text = await response.text();
+ }
} catch (error) {
console.error(`Error scraping URL: ${error}`);
return "";
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index fb08d9c..7c57007 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -19,8 +19,8 @@ export async function fetchAndProcessPdf(url: string): Promise<string> {
async function downloadPdf(url: string): Promise<string> {
const response = await axios({
url,
- method: 'GET',
- responseType: 'stream',
+ method: "GET",
+ responseType: "stream",
});
const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`);
@@ -29,8 +29,8 @@ async function downloadPdf(url: string): Promise {
response.data.pipe(writer);
return new Promise((resolve, reject) => {
- writer.on('finish', () => resolve(tempFilePath));
- writer.on('error', reject);
+ writer.on("finish", () => resolve(tempFilePath));
+ writer.on("error", reject);
});
}
@@ -77,12 +77,12 @@ export async function processPdfToText(filePath: string): Promise<string> {
} else {
// If the status code is not 200, increment the attempt counter and wait
attempt++;
- await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds
+ await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
}
} catch (error) {
- console.error("Error fetching result:", error);
+ console.error("Error fetching result:", error || '');
attempt++;
- await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds before retrying
+ await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
// You may want to handle specific errors differently
}
}
@@ -101,7 +101,7 @@ export async function processPdfToText(filePath: string): Promise<string> {
return content;
}
-async function processPdf(file: string){
+async function processPdf(file: string) {
const fileContent = fs.readFileSync(file);
const data = await pdf(fileContent);
return data.text;