0

changed the request to do a HEAD to check for a PDF instead

This commit is contained in:
rafaelsideguide 2024-04-29 15:15:32 -03:00
parent f1dd97af0f
commit f8b207793f
3 changed files with 70 additions and 21 deletions

View File

@ -170,6 +170,45 @@ describe('E2E Tests for API Routes', () => {
expect(completedResponse.body.data[0]).toHaveProperty('metadata'); expect(completedResponse.body.data[0]).toHaveProperty('metadata');
expect(completedResponse.body.data[0].content).toContain('🔥 FireCrawl'); expect(completedResponse.body.data[0].content).toContain('🔥 FireCrawl');
}, 60000); // 60 seconds }, 60000); // 60 seconds
// it('should return a successful response for a valid crawl job with PDF content', async () => {
// });
it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
const crawlResponse = await request(TEST_URL)
.post('/v0/crawl')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://arxiv.org/abs/astro-ph/9301001', crawlerOptions: { limit: 5 }});
expect(crawlResponse.statusCode).toBe(200);
const response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('status');
expect(response.body.status).toBe('active');
// wait for 60 seconds
await new Promise((r) => setTimeout(r, 60000));
const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
console.log(completedResponse.body.data)
expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty('status');
expect(completedResponse.body.status).toBe('completed');
expect(completedResponse.body).toHaveProperty('data');
expect(completedResponse.body.data.length).toBeGreaterThan(1);
expect(completedResponse.body.data[0]).toHaveProperty('content');
expect(completedResponse.body.data[0]).toHaveProperty('markdown');
expect(completedResponse.body.data[0]).toHaveProperty('metadata');
expect(completedResponse.body.data[0].content).toContain('The Peculiar Balmer Line Profiles of OQ 208');
}, 90000); // 90 seconds
}); });
describe('GET /is-production', () => { describe('GET /is-production', () => {

View File

@ -88,9 +88,17 @@ export class WebScraperDataProvider {
})); }));
} }
let pdfLinks = links.filter( let pdfLinks = [];
async (link) => await isUrlAPdf({ url: link, fastMode: true }) let notPdfLinks = [];
); for (let link of links) {
if (await isUrlAPdf({ url: link })) {
pdfLinks.push(link);
} else {
notPdfLinks.push(link);
}
}
console.log("crawl", {pdfLinks})
let pdfDocuments: Document[] = []; let pdfDocuments: Document[] = [];
for (let pdfLink of pdfLinks) { for (let pdfLink of pdfLinks) {
const pdfContent = await fetchAndProcessPdf(pdfLink); const pdfContent = await fetchAndProcessPdf(pdfLink);
@ -100,11 +108,8 @@ export class WebScraperDataProvider {
provider: "web-scraper", provider: "web-scraper",
}); });
} }
links = links.filter(
async (link) => !(await isUrlAPdf({ url: link, fastMode: true }))
);
let documents = await this.convertUrlsToDocuments(links, inProgress); let documents = await this.convertUrlsToDocuments(notPdfLinks, inProgress);
documents = await this.getSitemapData(this.urls[0], documents); documents = await this.getSitemapData(this.urls[0], documents);
if (this.replaceAllPathsWithAbsolutePaths) { if (this.replaceAllPathsWithAbsolutePaths) {
@ -164,7 +169,7 @@ export class WebScraperDataProvider {
let pdfDocuments: Document[] = []; let pdfDocuments: Document[] = [];
let nonPdfUrls: string[] = []; let nonPdfUrls: string[] = [];
for (let url of this.urls) { for (let url of this.urls) {
if (await isUrlAPdf({ url: url, fastMode: false })) { if (await isUrlAPdf({ url: url })) {
const pdfContent = await fetchAndProcessPdf(url); const pdfContent = await fetchAndProcessPdf(url);
pdfDocuments.push({ pdfDocuments.push({
content: pdfContent, content: pdfContent,
@ -201,9 +206,17 @@ export class WebScraperDataProvider {
} }
if (this.mode === "sitemap") { if (this.mode === "sitemap") {
let links = await getLinksFromSitemap(this.urls[0]); let links = await getLinksFromSitemap(this.urls[0]);
let pdfLinks = links.filter(
async (link) => await isUrlAPdf({ url: link, fastMode: true }) let pdfLinks = [];
); let nonPdfLinks = [];
for (let link of links) {
if (await isUrlAPdf({ url: link })) {
pdfLinks.push(link);
} else {
nonPdfLinks.push(link);
}
}
let pdfDocuments: Document[] = []; let pdfDocuments: Document[] = [];
for (let pdfLink of pdfLinks) { for (let pdfLink of pdfLinks) {
const pdfContent = await fetchAndProcessPdf(pdfLink); const pdfContent = await fetchAndProcessPdf(pdfLink);
@ -213,12 +226,9 @@ export class WebScraperDataProvider {
provider: "web-scraper", provider: "web-scraper",
}); });
} }
links = links.filter(
async (link) => !(await isUrlAPdf({ url: link, fastMode: true }))
);
let documents = await this.convertUrlsToDocuments( let documents = await this.convertUrlsToDocuments(
links.slice(0, this.limit), nonPdfLinks.slice(0, this.limit),
inProgress inProgress
); );

View File

@ -114,10 +114,10 @@ async function processPdf(file: string) {
*/ */
export async function isUrlAPdf({ export async function isUrlAPdf({
url, url,
fastMode, fastMode = false,
}: { }: {
url: string; url: string;
fastMode: boolean; fastMode?: boolean;
}): Promise<boolean> { }): Promise<boolean> {
try { try {
if (url.endsWith(".pdf")) { if (url.endsWith(".pdf")) {
@ -127,11 +127,11 @@ export async function isUrlAPdf({
if (fastMode) { if (fastMode) {
return false; return false;
} }
const response = await fetch(url, { method: "HEAD" }); const response = await axios.head(url);
const contentType = response.headers.get("Content-Type"); const contentType = response.headers['content-type'];
return contentType !== null && contentType.includes("application/pdf"); return contentType.includes('application/pdf');
} catch (error) { } catch (error) {
console.error("Error making HEAD request:", error); // console.error("Error making HEAD request:", error);
return false; return false;
} }
} }