Merge pull request #29 from mendableai/detect-pdfs
Fixes PDFs not being detected when the URL does not end in .pdf
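
In short, each scraper now checks the response's Content-Type header and routes anything served as application/pdf to the PDF processor, instead of relying on the URL ending in .pdf. A minimal sketch of the idea (not the repo's exact code; handlePdf stands in for the fetchAndProcessPdf helper added below):

// Sketch only: detect PDFs by Content-Type instead of by URL extension.
type PdfHandler = (url: string) => Promise<string>;

export async function extractBody(
  url: string,
  response: { headers: Record<string, string>; text: () => Promise<string> },
  handlePdf: PdfHandler
): Promise<string> {
  const contentType = response.headers["content-type"];
  if (contentType && contentType.includes("application/pdf")) {
    // Served as a PDF even though the URL may not end in .pdf
    // (e.g. https://arxiv.org/pdf/astro-ph/9301001)
    return handlePdf(url);
  }
  return response.text(); // fall back to the normal HTML path
}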
Commit 2a1f2e396a
@@ -103,6 +103,36 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data.markdown).toContain("🔥 FireCrawl");
       expect(response.body.data.html).toContain("<h1");
     }, 30000); // 30 seconds timeout
+
+    it('should return a successful response for a valid scrape with PDF file', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+    }, 60000); // 60 seconds
+
+    it('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+    }, 60000); // 60 seconds
   });

   describe("POST /v0/crawl", () => {
@@ -548,7 +578,130 @@ describe("E2E Tests for API Routes", () => {
       expect(completedResponse.body.data[0]).toHaveProperty("metadata");
       expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
     }, 60000); // 60 seconds
-  });
+
+    it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post('/v0/crawl')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }});
+      expect(crawlResponse.statusCode).toBe(200);
+
+      let isCompleted = false;
+      let completedResponse;
+
+      while (!isCompleted) {
+        const response = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
+        expect(response.statusCode).toBe(200);
+        expect(response.body).toHaveProperty('status');
+
+        if (response.body.status === 'completed') {
+          isCompleted = true;
+          completedResponse = response;
+        } else {
+          await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
+        }
+      }
+      expect(completedResponse.body.status).toBe('completed');
+      expect(completedResponse.body).toHaveProperty('data');
+      expect(completedResponse.body.data.length).toEqual(1);
+      expect(completedResponse.body.data).toEqual(
+        expect.arrayContaining([
+          expect.objectContaining({
+            content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.')
+          })
+        ])
+      );
+    }, 60000); // 60 seconds
+
+    it("should return a successful response with max depth option for a valid crawl job", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://www.scrapethissite.com",
+          crawlerOptions: { maxDepth: 2 },
+        });
+      expect(crawlResponse.statusCode).toBe(200);
+
+      let isCompleted = false;
+      let completedResponse;
+
+      while (!isCompleted) {
+        const response = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+        expect(response.statusCode).toBe(200);
+        expect(response.body).toHaveProperty("status");
+
+        if (response.body.status === "completed") {
+          isCompleted = true;
+          completedResponse = response;
+        }
+      }
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      const urls = completedResponse.body.data.map(
+        (item: any) => item.metadata?.sourceURL
+      );
+      expect(urls.length).toBeGreaterThan(1);
+
+      // Check if all URLs have a maximum depth of 1
+      urls.forEach((url) => {
+        const depth = new URL(url).pathname.split("/").filter(Boolean).length;
+        expect(depth).toBeLessThanOrEqual(1);
+      });
+    }, 120000);
+
+    it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://firecrawl.dev",
+          pageOptions: { includeHtml: true },
+        });
+      expect(crawlResponse.statusCode).toBe(200);
+
+      const response = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("status");
+      expect(response.body.status).toBe("active");
+
+      // wait for 30 seconds
+      await new Promise((r) => setTimeout(r, 30000));
+
+      const completedResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+
+      // 120 seconds
+      expect(completedResponse.body.data[0]).toHaveProperty("html");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
+      expect(completedResponse.body.data[0].markdown).toContain("FireCrawl");
+      expect(completedResponse.body.data[0].html).toContain("<h1");
+    }, 60000);
+  }); // 60 seconds

   it("If someone cancels a crawl job, it should turn into failed status", async () => {
     const crawlResponse = await request(TEST_URL)
@@ -559,7 +712,7 @@ describe("E2E Tests for API Routes", () => {
     expect(crawlResponse.statusCode).toBe(200);

     // wait for 30 seconds
-    await new Promise((r) => setTimeout(r, 10000));
+    await new Promise((r) => setTimeout(r, 20000));

     const response = await request(TEST_URL)
       .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
@@ -568,7 +721,7 @@ describe("E2E Tests for API Routes", () => {
     expect(response.body).toHaveProperty("status");
     expect(response.body.status).toBe("cancelled");

-    await new Promise((r) => setTimeout(r, 20000));
+    await new Promise((r) => setTimeout(r, 10000));

     const completedResponse = await request(TEST_URL)
       .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
@@ -585,8 +738,6 @@ describe("E2E Tests for API Routes", () => {

   }, 60000); // 60 seconds

-
-
   describe("POST /v0/scrape with LLM Extraction", () => {
     it("should extract data using LLM extraction mode", async () => {
       const response = await request(TEST_URL)
@@ -197,7 +197,9 @@ export class WebScraperDataProvider {
   private async handleSingleUrlsMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
-    let documents = await this.processLinks(this.urls, inProgress);
+    const links = this.urls;
+
+    let documents = await this.processLinks(links, inProgress);
     return documents;
   }

@@ -6,6 +6,7 @@ import { Document, PageOptions } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { excludeNonMainTags } from "./utils/excludeTags";
 import { urlSpecificParams } from "./utils/custom/website_params";
+import { fetchAndProcessPdf } from "./utils/pdfProcessor";

 dotenv.config();

@@ -66,9 +67,15 @@ export async function scrapWithScrapingBee(
       );
       return "";
     }
-    const decoder = new TextDecoder();
-    const text = decoder.decode(response.data);
-    return text;
+
+    const contentType = response.headers['content-type'];
+    if (contentType && contentType.includes('application/pdf')) {
+      return fetchAndProcessPdf(url);
+    } else {
+      const decoder = new TextDecoder();
+      const text = decoder.decode(response.data);
+      return text;
+    }
   } catch (error) {
     console.error(`Error scraping with Scraping Bee: ${error}`);
     return "";
@@ -95,9 +102,14 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
       return "";
     }

-    const data = await response.json();
-    const html = data.content;
-    return html ?? "";
+    const contentType = response.headers['content-type'];
+    if (contentType && contentType.includes('application/pdf')) {
+      return fetchAndProcessPdf(url);
+    } else {
+      const data = await response.json();
+      const html = data.content;
+      return html ?? "";
+    }
   } catch (error) {
     console.error(`Error scraping with Puppeteer: ${error}`);
     return "";
@@ -165,7 +177,13 @@ export async function scrapSingleUrl(
       );
       return "";
     }
-    text = await response.text();
+
+    const contentType = response.headers['content-type'];
+    if (contentType && contentType.includes('application/pdf')) {
+      return fetchAndProcessPdf(url);
+    } else {
+      text = await response.text();
+    }
   } catch (error) {
     console.error(`Error scraping URL: ${error}`);
     return "";
@@ -19,8 +19,8 @@ export async function fetchAndProcessPdf(url: string): Promise<string> {
 async function downloadPdf(url: string): Promise<string> {
   const response = await axios({
     url,
-    method: 'GET',
-    responseType: 'stream',
+    method: "GET",
+    responseType: "stream",
   });

   const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`);
@@ -29,8 +29,8 @@ async function downloadPdf(url: string): Promise<string> {
   response.data.pipe(writer);

   return new Promise((resolve, reject) => {
-    writer.on('finish', () => resolve(tempFilePath));
-    writer.on('error', reject);
+    writer.on("finish", () => resolve(tempFilePath));
+    writer.on("error", reject);
   });
 }

@@ -77,12 +77,12 @@ export async function processPdfToText(filePath: string): Promise<string> {
       } else {
         // If the status code is not 200, increment the attempt counter and wait
         attempt++;
-        await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds
+        await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
       }
     } catch (error) {
-      console.error("Error fetching result:", error);
+      console.error("Error fetching result:", error || '');
       attempt++;
-      await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds before retrying
+      await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
       // You may want to handle specific errors differently
     }
   }
@@ -101,7 +101,7 @@ export async function processPdfToText(filePath: string): Promise<string> {
   return content;
 }

-async function processPdf(file: string){
+async function processPdf(file: string) {
   const fileContent = fs.readFileSync(file);
   const data = await pdf(fileContent);
   return data.text;
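
fetchAndProcessPdf itself lies outside the hunks shown here; a rough sketch of how the helpers above might compose, assuming it downloads to a temp file, extracts the text, and then deletes the file (the cleanup step and exact flow are assumptions, not taken from this diff):

// Hypothetical composition of the helpers shown above; the real
// fetchAndProcessPdf in pdfProcessor.ts may differ (e.g. it can go through
// processPdfToText instead of processPdf).
async function fetchAndProcessPdfSketch(url: string): Promise<string> {
  const tempFilePath = await downloadPdf(url); // stream the PDF to a temp file
  try {
    return await processPdf(tempFilePath);     // extract text with pdf-parse
  } finally {
    fs.unlinkSync(tempFilePath);               // remove the temp file (assumed)
  }
}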