diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts index 9e7a75f..a652619 100644 --- a/apps/api/src/__tests__/e2e/index.test.ts +++ b/apps/api/src/__tests__/e2e/index.test.ts @@ -170,6 +170,45 @@ describe('E2E Tests for API Routes', () => { expect(completedResponse.body.data[0]).toHaveProperty('metadata'); expect(completedResponse.body.data[0].content).toContain('🔥 FireCrawl'); }, 60000); // 60 seconds + + // it('should return a successful response for a valid crawl job with PDF content', async () => { + + // }); + + it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => { + const crawlResponse = await request(TEST_URL) + .post('/v0/crawl') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://arxiv.org/abs/astro-ph/9301001', crawlerOptions: { limit: 5 }}); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('status'); + expect(response.body.status).toBe('active'); + + // wait for 60 seconds + await new Promise((r) => setTimeout(r, 60000)); + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); + console.log(completedResponse.body.data) + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty('status'); + expect(completedResponse.body.status).toBe('completed'); + expect(completedResponse.body).toHaveProperty('data'); + expect(completedResponse.body.data.length).toBeGreaterThan(1); + expect(completedResponse.body.data[0]).toHaveProperty('content'); + 
expect(completedResponse.body.data[0]).toHaveProperty('markdown'); + expect(completedResponse.body.data[0]).toHaveProperty('metadata'); + expect(completedResponse.body.data[0].content).toContain('The Peculiar Balmer Line Profiles of OQ 208'); + }, 90000); // 90 seconds + + + }); describe('GET /is-production', () => { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index fe291fb..2b02076 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -88,9 +88,17 @@ export class WebScraperDataProvider { })); } - let pdfLinks = links.filter( - async (link) => await isUrlAPdf({ url: link, fastMode: true }) - ); + let pdfLinks = []; + let notPdfLinks = []; + for (let link of links) { + if (await isUrlAPdf({ url: link })) { + pdfLinks.push(link); + } else { + notPdfLinks.push(link); + } + } + + console.log("crawl", {pdfLinks}) let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -100,11 +108,8 @@ export class WebScraperDataProvider { provider: "web-scraper", }); } - links = links.filter( - async (link) => !(await isUrlAPdf({ url: link, fastMode: true })) - ); - let documents = await this.convertUrlsToDocuments(links, inProgress); + let documents = await this.convertUrlsToDocuments(notPdfLinks, inProgress); documents = await this.getSitemapData(this.urls[0], documents); if (this.replaceAllPathsWithAbsolutePaths) { @@ -164,7 +169,7 @@ export class WebScraperDataProvider { let pdfDocuments: Document[] = []; let nonPdfUrls: string[] = []; for (let url of this.urls) { - if (await isUrlAPdf({ url: url, fastMode: false })) { + if (await isUrlAPdf({ url: url })) { const pdfContent = await fetchAndProcessPdf(url); pdfDocuments.push({ content: pdfContent, @@ -201,9 +206,17 @@ export class WebScraperDataProvider { } if (this.mode === "sitemap") { let links = await getLinksFromSitemap(this.urls[0]); - let pdfLinks = 
links.filter( - async (link) => await isUrlAPdf({ url: link, fastMode: true }) - ); + + let pdfLinks = []; + let nonPdfLinks = []; + for (let link of links) { + if (await isUrlAPdf({ url: link })) { + pdfLinks.push(link); + } else { + nonPdfLinks.push(link); + } + } + let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -213,12 +226,9 @@ export class WebScraperDataProvider { provider: "web-scraper", }); } - links = links.filter( - async (link) => !(await isUrlAPdf({ url: link, fastMode: true })) - ); let documents = await this.convertUrlsToDocuments( - links.slice(0, this.limit), + nonPdfLinks.slice(0, this.limit), inProgress ); diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 80476e9..67fb134 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -114,10 +114,10 @@ async function processPdf(file: string) { */ export async function isUrlAPdf({ url, - fastMode, + fastMode = false, }: { url: string; - fastMode: boolean; + fastMode?: boolean; }): Promise { try { if (url.endsWith(".pdf")) { @@ -127,11 +127,11 @@ export async function isUrlAPdf({ if (fastMode) { return false; } - const response = await fetch(url, { method: "HEAD" }); - const contentType = response.headers.get("Content-Type"); - return contentType !== null && contentType.includes("application/pdf"); + const response = await axios.head(url); + const contentType = response.headers['content-type']; + return contentType.includes('application/pdf'); } catch (error) { - console.error("Error making HEAD request:", error); + // console.error("Error making HEAD request:", error); return false; } }