From 84cebf618bb316f7494b2c0c350a22d66528f698 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 19 Apr 2024 15:36:00 -0700 Subject: [PATCH] Nick: --- apps/api/src/__tests__/e2e/index.test.ts | 3 +++ apps/api/src/scraper/WebScraper/index.ts | 13 ++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts index 554453b..9e7a75f 100644 --- a/apps/api/src/__tests__/e2e/index.test.ts +++ b/apps/api/src/__tests__/e2e/index.test.ts @@ -42,6 +42,7 @@ describe('E2E Tests for API Routes', () => { .set('Authorization', `Bearer this_is_just_a_preview_token`) .set('Content-Type', 'application/json') .send({ url: 'https://firecrawl.dev' }); + expect(response.statusCode).toBe(200); }, 10000); // 10 seconds timeout @@ -51,6 +52,8 @@ describe('E2E Tests for API Routes', () => { .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set('Content-Type', 'application/json') .send({ url: 'https://firecrawl.dev' }); + await new Promise((r) => setTimeout(r, 2000)); + expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty('data'); expect(response.body.data).toHaveProperty('content'); diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 0dc68b0..9d9a236 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -88,7 +88,7 @@ export class WebScraperDataProvider { })); } - let pdfLinks = links.filter((link) => isUrlAPdf({url: link, fastMode: true})); + let pdfLinks = links.filter(async (link) => await isUrlAPdf({url: link, fastMode: true})); let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -98,7 +98,7 @@ export class WebScraperDataProvider { provider: "web-scraper" }); } - links = links.filter((link) => !isUrlAPdf({url: link, fastMode: true})); + links = links.filter(async (link) => !await isUrlAPdf({url: link, fastMode: true})); let documents = await this.convertUrlsToDocuments(links, inProgress); documents = await this.getSitemapData(this.urls[0], documents); @@ -157,10 +157,12 @@ export class WebScraperDataProvider { } if (this.mode === "single_urls") { + console.log("Single urls mode"); let pdfDocuments: Document[] = []; let nonPdfUrls: string[] = []; for (let url of this.urls) { - if (isUrlAPdf({url: url, fastMode: false})) { + console.log("Checking if url is a pdf", url); + if (await isUrlAPdf({url: url, fastMode: false})) { const pdfContent = await fetchAndProcessPdf(url); pdfDocuments.push({ content: pdfContent, @@ -169,6 +171,7 @@ export class WebScraperDataProvider { }); } else { nonPdfUrls.push(url); + console.log("Fetching and processing url", url); } } @@ -197,7 +200,7 @@ export class WebScraperDataProvider { } if (this.mode === "sitemap") { let links = await getLinksFromSitemap(this.urls[0]); - let pdfLinks = links.filter((link) => isUrlAPdf({url: link, fastMode: true})); + let pdfLinks = links.filter(async (link) => await isUrlAPdf({url: link, fastMode: true})); let pdfDocuments: Document[] = []; for (let pdfLink of pdfLinks) { const pdfContent = await fetchAndProcessPdf(pdfLink); @@ -207,7 +210,7 @@ export class WebScraperDataProvider { provider: "web-scraper" }); } - links = links.filter((link) => !isUrlAPdf({url: link, fastMode: true})); + links = links.filter(async (link) => !await isUrlAPdf({url: link, fastMode: true})); let documents = await this.convertUrlsToDocuments( links.slice(0, this.limit),