From 7b11ace87dd116b696e30b9d4f13cf38ee266805 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 14 Jun 2024 12:31:42 -0700 Subject: [PATCH 01/12] Create rate-limiter.test.ts --- apps/api/src/services/rate-limiter.test.ts | 87 ++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 apps/api/src/services/rate-limiter.test.ts diff --git a/apps/api/src/services/rate-limiter.test.ts b/apps/api/src/services/rate-limiter.test.ts new file mode 100644 index 0000000..47a24a7 --- /dev/null +++ b/apps/api/src/services/rate-limiter.test.ts @@ -0,0 +1,87 @@ +import { getRateLimiter, serverRateLimiter, testSuiteRateLimiter, redisClient } from "./rate-limiter"; +import { RateLimiterMode } from "../../src/types"; +import { RateLimiterRedis } from "rate-limiter-flexible"; + +describe("Rate Limiter Service", () => { + beforeAll(async () => { + await redisClient.connect(); + }); + + afterAll(async () => { + await redisClient.disconnect(); + }); + + it("should return the testSuiteRateLimiter for specific tokens", () => { + const limiter = getRateLimiter("crawl" as RateLimiterMode, "a01ccae"); + expect(limiter).toBe(testSuiteRateLimiter); + + const limiter2 = getRateLimiter("scrape" as RateLimiterMode, "6254cf9"); + expect(limiter2).toBe(testSuiteRateLimiter); + }); + + it("should return the serverRateLimiter if mode is not found", () => { + const limiter = getRateLimiter("nonexistent" as RateLimiterMode, "someToken"); + expect(limiter).toBe(serverRateLimiter); + }); + + it("should return the correct rate limiter based on mode and plan", () => { + const limiter = getRateLimiter("crawl" as RateLimiterMode, "someToken", "free"); + expect(limiter.points).toBe(2); + + const limiter2 = getRateLimiter("scrape" as RateLimiterMode, "someToken", "standard"); + expect(limiter2.points).toBe(50); + + const limiter3 = getRateLimiter("search" as RateLimiterMode, "someToken", "growth"); + expect(limiter3.points).toBe(500); + + const limiter4 = getRateLimiter("crawlStatus" as RateLimiterMode, "someToken", "growth"); + expect(limiter4.points).toBe(150); + }); + + it("should return the default rate limiter if plan is not provided", () => { + const limiter = getRateLimiter("crawl" as RateLimiterMode, "someToken"); + expect(limiter.points).toBe(3); + + const limiter2 = getRateLimiter("scrape" as RateLimiterMode, "someToken"); + expect(limiter2.points).toBe(20); + }); + + it("should create a new RateLimiterRedis instance with correct parameters", () => { + const keyPrefix = "test-prefix"; + const points = 10; + const limiter = new RateLimiterRedis({ + storeClient: redisClient, + keyPrefix, + points, + duration: 60, + }); + + expect(limiter.keyPrefix).toBe(keyPrefix); + expect(limiter.points).toBe(points); + expect(limiter.duration).toBe(60); + }); + + it("should return the correct rate limiter for 'preview' mode", () => { + const limiter = getRateLimiter("preview" as RateLimiterMode, "someToken", "free"); + expect(limiter.points).toBe(5); + + const limiter2 = getRateLimiter("preview" as RateLimiterMode, "someToken"); + expect(limiter2.points).toBe(5); + }); + + it("should return the correct rate limiter for 'account' mode", () => { + const limiter = getRateLimiter("account" as RateLimiterMode, "someToken", "free"); + expect(limiter.points).toBe(100); + + const limiter2 = getRateLimiter("account" as RateLimiterMode, "someToken"); + expect(limiter2.points).toBe(100); + }); + + it("should return the correct rate limiter for 'crawlStatus' mode", () => { + const limiter = getRateLimiter("crawlStatus" as 
RateLimiterMode, "someToken", "free"); + expect(limiter.points).toBe(150); + + const limiter2 = getRateLimiter("crawlStatus" as RateLimiterMode, "someToken"); + expect(limiter2.points).toBe(150); + }); +}); From b2bd562bb2ea0a51eaba30d0f309058d79241989 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 17 Jun 2024 17:09:44 -0300 Subject: [PATCH 02/12] transcribed from e2e to unit tests for many cases --- apps/api/package.json | 3 +- .../__tests__/e2e_full_withAuth/index.test.ts | 1390 +++++++++++++++++ .../src/__tests__/e2e_withAuth/index.test.ts | 685 +------- .../src/controllers/__tests__/crawl.test.ts | 47 + .../WebScraper/__tests__/crawler.test.ts | 37 +- .../WebScraper/__tests__/single_url.test.ts | 24 + .../utils/__tests__/blocklist.test.ts | 89 ++ .../utils/__tests__/pdfProcessor.test.ts | 44 +- .../src/scraper/WebScraper/utils/blocklist.ts | 23 +- 9 files changed, 1635 insertions(+), 707 deletions(-) create mode 100644 apps/api/src/__tests__/e2e_full_withAuth/index.test.ts create mode 100644 apps/api/src/controllers/__tests__/crawl.test.ts create mode 100644 apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts diff --git a/apps/api/package.json b/apps/api/package.json index c786b17..e114a0f 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -12,8 +12,7 @@ "build": "tsc", "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'", - "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", - "workers": "nodemon --exec ts-node src/services/queue-worker.ts", + "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'", "worker:production": "node dist/src/services/queue-worker.js", "mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest", "mongo-docker-console": "docker exec -it mongodb mongosh", diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts new file mode 100644 index 0000000..9f04093 --- /dev/null +++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts @@ -0,0 +1,1390 @@ +import request from "supertest"; +import dotenv from "dotenv"; +import { v4 as uuidv4 } from "uuid"; + +dotenv.config(); + +// const TEST_URL = 'http://localhost:3002' +const TEST_URL = "http://127.0.0.1:3002"; + +describe("E2E Tests for API Routes", () => { + beforeAll(() => { + process.env.USE_DB_AUTHENTICATION = "true"; + }); + + afterAll(() => { + delete process.env.USE_DB_AUTHENTICATION; + }); + describe("GET /", () => { + it.concurrent("should return Hello, world! message", async () => { + const response = await request(TEST_URL).get("/"); + + expect(response.statusCode).toBe(200); + expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io"); + }); + }); + + describe("GET /test", () => { + it.concurrent("should return Hello, world! 
message", async () => { + const response = await request(TEST_URL).get("/test"); + expect(response.statusCode).toBe(200); + expect(response.text).toContain("Hello, world!"); + }); + }); + + describe("POST /v0/scrape", () => { + it.concurrent("should require authorization", async () => { + const response = await request(TEST_URL).post("/v0/scrape"); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://facebook.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain( + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + ); + }); + + // tested on rate limit test + // it.concurrent("should return a successful response with a valid preview token", async () => { + // const response = await request(TEST_URL) + // .post("/v0/scrape") + // .set("Authorization", `Bearer this_is_just_a_preview_token`) + // .set("Content-Type", "application/json") + // .send({ url: "https://roastmywebsite.ai" }); + // expect(response.statusCode).toBe(200); + // }, 30000); // 30 seconds timeout + + it.concurrent("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://roastmywebsite.ai" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.content).toContain("_Roast_"); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + expect(response.body.data.metadata.pageError).toBeUndefined(); + }, 30000); // 30 seconds timeout + + it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://roastmywebsite.ai", + pageOptions: { includeHtml: true }, + }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("html"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain("_Roast_"); + expect(response.body.data.markdown).toContain("_Roast_"); + expect(response.body.data.html).toContain(" { + const response = await request(TEST_URL) + .post('/v0/scrape') + 
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' }); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + expect(response.body.data.metadata.pageError).toBeUndefined(); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' }); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + expect(response.body.data.metadata.pageError).toBeUndefined(); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } }); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj'); + }, 60000); // 60 seconds + + it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { + const responseWithoutRemoveTags = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com/" }); + expect(responseWithoutRemoveTags.statusCode).toBe(200); + expect(responseWithoutRemoveTags.body).toHaveProperty("data"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("content"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); + expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); + expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site"); + expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer + expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav + expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong + + const response = await 
request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.content).toContain("Scrape This Site"); + expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer + expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav + expect(response.body.data.content).not.toContain("web scraping"); // strong + }, 30000); // 30 seconds timeout + + // TODO: add this test back once we nail the waitFor option to be more deterministic + // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => { + // const startTime = Date.now(); + // const response = await request(TEST_URL) + // .post("/v0/scrape") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: "https://firecrawl.dev", pageOptions: { waitFor: 7000 } }); + // const endTime = Date.now(); + // const duration = endTime - startTime; + + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty("data"); + // expect(response.body.data).toHaveProperty("content"); + // expect(response.body.data).toHaveProperty("markdown"); + // expect(response.body.data).toHaveProperty("metadata"); + // expect(response.body.data).not.toHaveProperty("html"); + // expect(response.body.data.content).toContain("🔥 Firecrawl"); + // expect(duration).toBeGreaterThanOrEqual(7000); + // }, 12000); // 12 seconds timeout + + it.concurrent('should return a successful response for a scrape with 400 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/400' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(400); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 401 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/401' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(401); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized"); + }, 60000); // 60 seconds + + it.concurrent("should return a successful response for a scrape with 403 page", 
async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/403' }); + + await new Promise((r) => setTimeout(r, 5000)); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(403); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 404 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/404' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(404); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 405 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/405' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(405); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("method not allowed"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 500 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/500' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(500); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("internal server error"); + }, 60000); // 60 seconds + }); + + describe("POST /v0/crawl", () => { + it.concurrent("should require authorization", async () => { + const response = await request(TEST_URL).post("/v0/crawl"); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return an error for a blocklisted URL", async () => { + const 
blocklistedUrl = "https://twitter.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain( + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + ); + }); + + it.concurrent("should return a successful response with a valid API key for crawl", async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + }); + it.concurrent('should prevent duplicate requests using the same idempotency key', async () => { + const uniqueIdempotencyKey = uuidv4(); + + // First request with the idempotency key + const firstResponse = await request(TEST_URL) + .post('/v0/crawl') + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .set("x-idempotency-key", uniqueIdempotencyKey) + .send({ url: 'https://mendable.ai' }); + + expect(firstResponse.statusCode).toBe(200); + + // Second request with the same idempotency key + const secondResponse = await request(TEST_URL) + .post('/v0/crawl') + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .set("x-idempotency-key", uniqueIdempotencyKey) + .send({ url: 'https://mendable.ai' }); + + expect(secondResponse.statusCode).toBe(409); + expect(secondResponse.body.error).toBe('Idempotency key already used'); + }); + + it.concurrent("should return a successful response with a valid API key and valid includes option", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + limit: 10, + crawlerOptions: { + includes: ["blog/*"], + }, + }); + + let response; + let isFinished = false; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const completedResponse = response; + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy(); + }); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + 
expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); + }, 60000); // 60 seconds + + it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + limit: 10, + crawlerOptions: { + excludes: ["blog/*"], + }, + }); + + let isFinished = false; + let response; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const completedResponse = response; + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy(); + }); + }, 90000); // 90 seconds + + it.concurrent("should return a successful response with a valid API key and limit to 3", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + crawlerOptions: { limit: 3 }, + }); + + let isFinished = false; + let response; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const completedResponse = response; + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data.length).toBe(3); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); + }, 60000); // 60 seconds + + it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + crawlerOptions: { 
maxDepth: 1 }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(["active", "waiting"]).toContain(response.body.status); + // wait for 60 seconds + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(1); + + // Check if all URLs have a maximum depth of 1 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split('/'); + const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); + expect(depth).toBeLessThanOrEqual(2); + }); + }, 180000); + + it.concurrent("should return a successful response with relative max depth option for a valid crawl job", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com/pages/", + crawlerOptions: { maxDepth: 1 }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(["active", "waiting"]).toContain(response.body.status); + // wait for 60 seconds + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(1); + + // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split('/'); + const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); + expect(depth).toBeLessThanOrEqual(3); + }); + }, 180000); + + it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepth equal to zero", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.mendable.ai", + crawlerOptions: { maxDepth: 0 }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(["active", "waiting"]).toContain(response.body.status); + // wait for 60 seconds + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThanOrEqual(1); + + // Check that no URL goes deeper than the base page, since maxDepth was 0 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split('/'); + const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ?
1 : 0); + expect(depth).toBeLessThanOrEqual(1); + }); + }, 180000); + + + + + + // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => { + // const crawlResponse = await request(TEST_URL) + // .post("/v0/crawl") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ + // url: "https://mendable.ai", + // crawlerOptions: { limit: 10 }, + // }); + + // const response = await request(TEST_URL) + // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty("status"); + // expect(response.body.status).toBe("active"); + + // let isCompleted = false; + // while (!isCompleted) { + // const statusCheckResponse = await request(TEST_URL) + // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + // expect(statusCheckResponse.statusCode).toBe(200); + // isCompleted = statusCheckResponse.body.status === "completed"; + // if (!isCompleted) { + // await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + // } + // } + + // const completedResponse = await request(TEST_URL) + // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + // expect(completedResponse.statusCode).toBe(200); + // expect(completedResponse.body).toHaveProperty("status"); + // expect(completedResponse.body.status).toBe("completed"); + // expect(completedResponse.body).toHaveProperty("data"); + // expect(completedResponse.body.data.length).toBe(10); + // expect(completedResponse.body.data[0]).toHaveProperty("content"); + // expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + // expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + // expect(completedResponse.body.data[0].content).toContain("Mendable"); + // expect(completedResponse.body.data[0].content).not.toContain("main menu"); + // }, 60000); // 60 seconds + + it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://roastmywebsite.ai", + pageOptions: { includeHtml: true }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(["active", "waiting"]).toContain(response.body.status); + + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer 
${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); + + expect(completedResponse.body.data[0]).toHaveProperty("html"); + expect(completedResponse.body.data[0].content).toContain("_Roast_"); + expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); + expect(completedResponse.body.data[0].html).toContain("<h1"); + }, 180000); + }); + + describe("POST /v0/crawlWebsitePreview", () => { + it.concurrent("should require authorization", async () => { + const response = await request(TEST_URL).post("/v0/crawlWebsitePreview"); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + // it.concurrent("should return an error for a blocklisted URL", async () => { + // const blocklistedUrl = "https://instagram.com/fake-test"; + // const response = await request(TEST_URL) + // .post("/v0/crawlWebsitePreview") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: blocklistedUrl }); + // // is returning 429 instead of 403 + // expect(response.statusCode).toBe(403); + // expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. 
We're actively working on building support for it."); + // }); + + it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev", timeout: 1000 }); + + expect(response.statusCode).toBe(408); + }, 3000); + + // it.concurrent("should return a successful response with a valid API key for crawlWebsitePreview", async () => { + // const response = await request(TEST_URL) + // .post("/v0/crawlWebsitePreview") + // .set("Authorization", `Bearer this_is_just_a_preview_token`) + // .set("Content-Type", "application/json") + // .send({ url: "https://firecrawl.dev" }); + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty("jobId"); + // expect(response.body.jobId).toMatch( + // /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + // ); + // }); + }); + + describe("POST /v0/search", () => { + it.concurrent("should require authorization", async () => { + const response = await request(TEST_URL).post("/v0/search"); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return a successful response with a valid API key for search", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success"); + expect(response.body.success).toBe(true); + expect(response.body).toHaveProperty("data"); + }, 30000); // 30 seconds timeout + }); + + describe("GET /v0/crawl/status/:jobId", () => { + it.concurrent("should require authorization", async () => { + const response = await request(TEST_URL).get("/v0/crawl/status/123"); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .get("/v0/crawl/status/123") + .set("Authorization", `Bearer invalid-api-key`); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return Job not found for invalid job ID", async () => { + const response = await request(TEST_URL) + .get("/v0/crawl/status/invalidJobId") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(404); + }); + + it.concurrent("should return a successful crawl status response for a valid crawl job", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://mendable.ai/blog" }); + expect(crawlResponse.statusCode).toBe(200); + + let isCompleted = false; + let completedResponse; + + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + 
expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + + if (response.body.status === "completed") { + isCompleted = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); + + const childrenLinks = completedResponse.body.data.filter(doc => + doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") + ); + + expect(childrenLinks.length).toBe(completedResponse.body.data.length); + }, 180000); // 180 seconds + + it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => { + const crawlResponse = await request(TEST_URL) + .post('/v0/crawl') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }}); + expect(crawlResponse.statusCode).toBe(200); + + let isCompleted = false; + let completedResponse; + + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('status'); + + if (response.body.status === 'completed') { + isCompleted = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } + expect(completedResponse.body.status).toBe('completed'); + expect(completedResponse.body).toHaveProperty('data'); + expect(completedResponse.body.data.length).toEqual(1); + expect(completedResponse.body.data).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.') + }) + ]) + ); + + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); + }, 180000); // 180 seconds + + it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (2)", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://roastmywebsite.ai", + pageOptions: { includeHtml: true }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + 
expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(["active", "waiting"]).toContain(response.body.status); + + let isFinished = false; + let completedResponse; + + while (!isFinished) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + + if (response.body.status === "completed") { + isFinished = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0]).toHaveProperty("html"); + expect(completedResponse.body.data[0].content).toContain("_Roast_"); + expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); + expect(completedResponse.body.data[0].html).toContain("<h1"); + }, 180000); + + it.concurrent("should return a successful response for a valid crawl job with allowBackwardCrawling set to true option", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai/blog", + pageOptions: { includeHtml: true }, + crawlerOptions: { allowBackwardCrawling: true }, + }); + expect(crawlResponse.statusCode).toBe(200); + + let isFinished = false; + let completedResponse; + + while (!isFinished) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + + if (response.body.status === "completed") { + isFinished = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0]).toHaveProperty("html"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].markdown).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); + + const onlyChildrenLinks = completedResponse.body.data.filter(doc => { + return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") + }); + + expect(completedResponse.body.data.length).toBeGreaterThan(onlyChildrenLinks.length); + }, 60000); + + it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + 
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://jestjs.io" }); + + expect(crawlResponse.statusCode).toBe(200); + + await new Promise((r) => setTimeout(r, 20000)); + + const responseCancel = await request(TEST_URL) + .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(responseCancel.statusCode).toBe(200); + expect(responseCancel.body).toHaveProperty("status"); + expect(responseCancel.body.status).toBe("cancelled"); + + await new Promise((r) => setTimeout(r, 10000)); + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("failed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data).toBeNull(); + expect(completedResponse.body).toHaveProperty("partial_data"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined(); + }, 60000); // 60 seconds + + describe("POST /v0/scrape with LLM Extraction", () => { + it.concurrent("should extract data using LLM extraction mode", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + pageOptions: { + onlyMainContent: true, + }, + extractorOptions: { + mode: "llm-extraction", + extractionPrompt: + "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", + extractionSchema: { + type: "object", + properties: { + company_mission: { + type: "string", + }, + supports_sso: { + type: "boolean", + }, + is_open_source: { + type: "boolean", + }, + }, + required: ["company_mission", "supports_sso", "is_open_source"], + }, + }, + }); + + // Ensure that the job was successfully created before proceeding with LLM extraction + expect(response.statusCode).toBe(200); + + // Assuming the LLM extraction object is available in the response body under `data.llm_extraction` + let llmExtraction = response.body.data.llm_extraction; + + // Check if the llm_extraction object has the required properties with correct types and values + expect(llmExtraction).toHaveProperty("company_mission"); + expect(typeof llmExtraction.company_mission).toBe("string"); + expect(llmExtraction).toHaveProperty("supports_sso"); + expect(llmExtraction.supports_sso).toBe(true); + expect(typeof llmExtraction.supports_sso).toBe("boolean"); + expect(llmExtraction).toHaveProperty("is_open_source"); + expect(llmExtraction.is_open_source).toBe(false); + expect(typeof llmExtraction.is_open_source).toBe("boolean"); + }, 60000); // 60 secs + }); + + // describe("POST /v0/scrape for Top 100 Companies", () => { + // it.concurrent("should extract data for the top 100 companies", async () => { + // const response = await request(TEST_URL) + // .post("/v0/scrape") + // 
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ + // url: "https://companiesmarketcap.com/", + // pageOptions: { + // onlyMainContent: true + // }, + // extractorOptions: { + // mode: "llm-extraction", + // extractionPrompt: "Extract the name, market cap, price, and today's change for the top 20 companies listed on the page.", + // extractionSchema: { + // type: "object", + // properties: { + // companies: { + // type: "array", + // items: { + // type: "object", + // properties: { + // rank: { type: "number" }, + // name: { type: "string" }, + // marketCap: { type: "string" }, + // price: { type: "string" }, + // todayChange: { type: "string" } + // }, + // required: ["rank", "name", "marketCap", "price", "todayChange"] + // } + // } + // }, + // required: ["companies"] + // } + // } + // }); + + // // Print the response body to the console for debugging purposes + // console.log("Response companies:", response.body.data.llm_extraction.companies); + + // // Check if the response has the correct structure and data types + // expect(response.status).toBe(200); + // expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true); + // expect(response.body.data.llm_extraction.companies.length).toBe(40); + + // // Sample check for the first company + // const firstCompany = response.body.data.llm_extraction.companies[0]; + // expect(firstCompany).toHaveProperty("name"); + // expect(typeof firstCompany.name).toBe("string"); + // expect(firstCompany).toHaveProperty("marketCap"); + // expect(typeof firstCompany.marketCap).toBe("string"); + // expect(firstCompany).toHaveProperty("price"); + // expect(typeof firstCompany.price).toBe("string"); + // expect(firstCompany).toHaveProperty("todayChange"); + // expect(typeof firstCompany.todayChange).toBe("string"); + // }, 120000); // 120 secs + // }); + + describe("POST /v0/crawl with fast mode", () => { + it.concurrent("should complete the crawl under 20 seconds", async () => { + const startTime = Date.now(); + + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://flutterbricks.com", + crawlerOptions: { + mode: "fast" + } + }); + + expect(crawlResponse.statusCode).toBe(200); + + const jobId = crawlResponse.body.jobId; + let statusResponse; + let isFinished = false; + + while (!isFinished) { + statusResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(statusResponse.statusCode).toBe(200); + isFinished = statusResponse.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + // const endTime = Date.now(); + // const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds + + // console.log(`Time elapsed: ${timeElapsed} seconds`); + + expect(statusResponse.body.status).toBe("completed"); + expect(statusResponse.body).toHaveProperty("data"); + expect(statusResponse.body.data[0]).toHaveProperty("content"); + expect(statusResponse.body.data[0]).toHaveProperty("markdown"); + expect(statusResponse.body.data[0]).toHaveProperty("metadata"); + expect(statusResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(statusResponse.body.data[0].metadata.pageError).toBeUndefined(); + + const results = 
statusResponse.body.data; + // results.forEach((result, i) => { + // console.log(result.metadata.sourceURL); + // }); + expect(results.length).toBeGreaterThanOrEqual(10); + expect(results.length).toBeLessThanOrEqual(15); + + }, 20000); + + // it.concurrent("should complete the crawl in more than 10 seconds", async () => { + // const startTime = Date.now(); + + // const crawlResponse = await request(TEST_URL) + // .post("/v0/crawl") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ + // url: "https://flutterbricks.com", + // }); + + // expect(crawlResponse.statusCode).toBe(200); + + // const jobId = crawlResponse.body.jobId; + // let statusResponse; + // let isFinished = false; + + // while (!isFinished) { + // statusResponse = await request(TEST_URL) + // .get(`/v0/crawl/status/${jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + // expect(statusResponse.statusCode).toBe(200); + // isFinished = statusResponse.body.status === "completed"; + + // if (!isFinished) { + // await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + // } + // } + + // const endTime = Date.now(); + // const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds + + // console.log(`Time elapsed: ${timeElapsed} seconds`); + + // expect(statusResponse.body.status).toBe("completed"); + // expect(statusResponse.body).toHaveProperty("data"); + // expect(statusResponse.body.data[0]).toHaveProperty("content"); + // expect(statusResponse.body.data[0]).toHaveProperty("markdown"); + // const results = statusResponse.body.data; + // // results.forEach((result, i) => { + // // console.log(result.metadata.sourceURL); + // // }); + // expect(results.length).toBeGreaterThanOrEqual(10); + // expect(results.length).toBeLessThanOrEqual(15); + + // }, 50000);// 15 seconds timeout to account for network delays + }); + + describe("GET /is-production", () => { + it.concurrent("should return the production status", async () => { + const response = await request(TEST_URL).get("/is-production"); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("isProduction"); + }); + }); + + describe("Rate Limiter", () => { + it.concurrent("should return 429 when rate limit is exceeded for preview token", async () => { + for (let i = 0; i < 5; i++) { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer this_is_just_a_preview_token`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com" }); + + expect(response.statusCode).toBe(200); + } + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer this_is_just_a_preview_token`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com" }); + + expect(response.statusCode).toBe(429); + }, 90000); + }); + + // it.concurrent("should return 429 when rate limit is exceeded for API key", async () => { + // for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_SCRAPE); i++) { + // const response = await request(TEST_URL) + // .post("/v0/scrape") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: "https://www.scrapethissite.com" }); + + // expect(response.statusCode).toBe(200); + // } + + // const response = await request(TEST_URL) + // .post("/v0/scrape") + // .set("Authorization", `Bearer 
${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: "https://www.scrapethissite.com" }); + + // expect(response.statusCode).toBe(429); + // }, 60000); + + // it.concurrent("should return 429 when rate limit is exceeded for API key", async () => { + // for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_CRAWL); i++) { + // const response = await request(TEST_URL) + // .post("/v0/crawl") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: "https://www.scrapethissite.com" }); + + // expect(response.statusCode).toBe(200); + // } + + // const response = await request(TEST_URL) + // .post("/v0/crawl") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: "https://www.scrapethissite.com" }); + + // expect(response.statusCode).toBe(429); + // }, 60000); +}); diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 9f04093..b1b0cc0 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -1,10 +1,7 @@ import request from "supertest"; import dotenv from "dotenv"; -import { v4 as uuidv4 } from "uuid"; dotenv.config(); - -// const TEST_URL = 'http://localhost:3002' const TEST_URL = "http://127.0.0.1:3002"; describe("E2E Tests for API Routes", () => { @@ -15,20 +12,12 @@ describe("E2E Tests for API Routes", () => { afterAll(() => { delete process.env.USE_DB_AUTHENTICATION; }); - describe("GET /", () => { - it.concurrent("should return Hello, world! message", async () => { - const response = await request(TEST_URL).get("/"); + describe("GET /is-production", () => { + it.concurrent("should return the production status", async () => { + const response = await request(TEST_URL).get("/is-production"); expect(response.statusCode).toBe(200); - expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io"); - }); - }); - - describe("GET /test", () => { - it.concurrent("should return Hello, world! message", async () => { - const response = await request(TEST_URL).get("/test"); - expect(response.statusCode).toBe(200); - expect(response.text).toContain("Hello, world!"); + expect(response.body).toHaveProperty("isProduction"); }); }); @@ -47,29 +36,6 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error for a blocklisted URL", async () => { - const blocklistedUrl = "https://facebook.com/fake-test"; - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: blocklistedUrl }); - expect(response.statusCode).toBe(403); - expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." 
- ); - }); - - // tested on rate limit test - // it.concurrent("should return a successful response with a valid preview token", async () => { - // const response = await request(TEST_URL) - // .post("/v0/scrape") - // .set("Authorization", `Bearer this_is_just_a_preview_token`) - // .set("Content-Type", "application/json") - // .send({ url: "https://roastmywebsite.ai" }); - // expect(response.statusCode).toBe(200); - // }, 30000); // 30 seconds timeout - it.concurrent("should return a successful response with a valid API key", async () => { const response = await request(TEST_URL) .post("/v0/scrape") @@ -143,21 +109,6 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.metadata.pageError).toBeUndefined(); }, 60000); // 60 seconds - it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } }); - await new Promise((r) => setTimeout(r, 6000)); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj'); - }, 60000); // 60 seconds - it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { const responseWithoutRemoveTags = await request(TEST_URL) .post("/v0/scrape") @@ -192,27 +143,6 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.content).not.toContain("web scraping"); // strong }, 30000); // 30 seconds timeout - // TODO: add this test back once we nail the waitFor option to be more deterministic - // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => { - // const startTime = Date.now(); - // const response = await request(TEST_URL) - // .post("/v0/scrape") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ url: "https://firecrawl.dev", pageOptions: { waitFor: 7000 } }); - // const endTime = Date.now(); - // const duration = endTime - startTime; - - // expect(response.statusCode).toBe(200); - // expect(response.body).toHaveProperty("data"); - // expect(response.body.data).toHaveProperty("content"); - // expect(response.body.data).toHaveProperty("markdown"); - // expect(response.body.data).toHaveProperty("metadata"); - // expect(response.body.data).not.toHaveProperty("html"); - // expect(response.body.data.content).toContain("🔥 Firecrawl"); - // expect(duration).toBeGreaterThanOrEqual(7000); - // }, 12000); // 12 seconds timeout - it.concurrent('should return a successful response for a scrape with 400 page', async () => { const response = await request(TEST_URL) .post('/v0/scrape') @@ -325,19 +255,6 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error for a blocklisted URL", async () => { - const blocklistedUrl = "https://twitter.com/fake-test"; - const response = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ 
url: blocklistedUrl }); - expect(response.statusCode).toBe(403); - expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." - ); - }); - it.concurrent("should return a successful response with a valid API key for crawl", async () => { const response = await request(TEST_URL) .post("/v0/crawl") @@ -350,31 +267,7 @@ describe("E2E Tests for API Routes", () => { /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ ); }); - it.concurrent('should prevent duplicate requests using the same idempotency key', async () => { - const uniqueIdempotencyKey = uuidv4(); - - // First request with the idempotency key - const firstResponse = await request(TEST_URL) - .post('/v0/crawl') - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .set("x-idempotency-key", uniqueIdempotencyKey) - .send({ url: 'https://mendable.ai' }); - - expect(firstResponse.statusCode).toBe(200); - - // Second request with the same idempotency key - const secondResponse = await request(TEST_URL) - .post('/v0/crawl') - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .set("x-idempotency-key", uniqueIdempotencyKey) - .send({ url: 'https://mendable.ai' }); - - expect(secondResponse.statusCode).toBe(409); - expect(secondResponse.body.error).toBe('Idempotency key already used'); - }); - + it.concurrent("should return a successful response with a valid API key and valid includes option", async () => { const crawlResponse = await request(TEST_URL) .post("/v0/crawl") @@ -564,223 +457,6 @@ describe("E2E Tests for API Routes", () => { expect(depth).toBeLessThanOrEqual(2); }); }, 180000); - - it.concurrent("should return a successful response with relative max depth option for a valid crawl job", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.scrapethissite.com/pages/", - crawlerOptions: { maxDepth: 1 }, - }); - expect(crawlResponse.statusCode).toBe(200); - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - // wait for 60 seconds - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - 
expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(1); - - // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 - urls.forEach((url: string) => { - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); - expect(depth).toBeLessThanOrEqual(3); - }); - }, 180000); - - it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => { - - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.mendable.ai", - crawlerOptions: { maxDepth: 0 }, - }); - expect(crawlResponse.statusCode).toBe(200); - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - // wait for 60 seconds - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - const testurls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - //console.log(testurls) - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThanOrEqual(1); - - // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 - urls.forEach((url: string) => { - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); - expect(depth).toBeLessThanOrEqual(1); - }); - }, 180000); - - - - - - // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => { - // const crawlResponse = await request(TEST_URL) - // .post("/v0/crawl") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ - // url: "https://mendable.ai", - // crawlerOptions: { limit: 10 }, - // }); - - // const response = await request(TEST_URL) - // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - // expect(response.statusCode).toBe(200); - // expect(response.body).toHaveProperty("status"); - // expect(response.body.status).toBe("active"); - - // let isCompleted = false; - // while (!isCompleted) { - // const statusCheckResponse = await request(TEST_URL) - // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - // expect(statusCheckResponse.statusCode).toBe(200); - // isCompleted = statusCheckResponse.body.status === "completed"; - // if (!isCompleted) { - // await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - // } - // } - - // const completedResponse = await request(TEST_URL) - // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - // expect(completedResponse.statusCode).toBe(200); - // expect(completedResponse.body).toHaveProperty("status"); - // expect(completedResponse.body.status).toBe("completed"); - // expect(completedResponse.body).toHaveProperty("data"); - // expect(completedResponse.body.data.length).toBe(10); - // expect(completedResponse.body.data[0]).toHaveProperty("content"); - // expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - // expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - // expect(completedResponse.body.data[0].content).toContain("Mendable"); - // expect(completedResponse.body.data[0].content).not.toContain("main menu"); - // }, 60000); // 60 seconds - - it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://roastmywebsite.ai", - pageOptions: { includeHtml: true }, - }); - expect(crawlResponse.statusCode).toBe(200); - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer 
${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - - // 120 seconds - expect(completedResponse.body.data[0]).toHaveProperty("html"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("_Roast_"); - expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); - expect(completedResponse.body.data[0].html).toContain(" { @@ -798,18 +474,6 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(401); }); - // it.concurrent("should return an error for a blocklisted URL", async () => { - // const blocklistedUrl = "https://instagram.com/fake-test"; - // const response = await request(TEST_URL) - // .post("/v0/crawlWebsitePreview") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ url: blocklistedUrl }); - // // is returning 429 instead of 403 - // expect(response.statusCode).toBe(403); - // expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); - // }); - it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => { const response = await request(TEST_URL) .post("/v0/scrape") @@ -819,19 +483,6 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(408); }, 3000); - - // it.concurrent("should return a successful response with a valid API key for crawlWebsitePreview", async () => { - // const response = await request(TEST_URL) - // .post("/v0/crawlWebsitePreview") - // .set("Authorization", `Bearer this_is_just_a_preview_token`) - // .set("Content-Type", "application/json") - // .send({ url: "https://firecrawl.dev" }); - // expect(response.statusCode).toBe(200); - // expect(response.body).toHaveProperty("jobId"); - // expect(response.body.jobId).toMatch( - // /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ - // ); - // }); }); describe("POST /v0/search", () => { @@ -965,145 +616,42 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); }, 180000); // 120 seconds - - - it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (2)", async () => { + it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { const crawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ - url: "https://roastmywebsite.ai", - pageOptions: { includeHtml: true }, - }); + .send({ url: "https://jestjs.io" }); + expect(crawlResponse.statusCode).toBe(200); - const response = await request(TEST_URL) + await new Promise((r) => setTimeout(r, 20000)); + + const responseCancel = await 
request(TEST_URL) + .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(responseCancel.statusCode).toBe(200); + expect(responseCancel.body).toHaveProperty("status"); + expect(responseCancel.body.status).toBe("cancelled"); + + await new Promise((r) => setTimeout(r, 10000)); + const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - - let isFinished = false; - let completedResponse; - - while (!isFinished) { - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - - if (response.body.status === "completed") { - isFinished = true; - completedResponse = response; - } else { - await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again - } - } expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body.status).toBe("failed"); expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0]).toHaveProperty("html"); - expect(completedResponse.body.data[0].content).toContain("_Roast_"); - expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); - expect(completedResponse.body.data[0].html).toContain(" { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai/blog", - pageOptions: { includeHtml: true }, - crawlerOptions: { allowBackwardCrawling: true }, - }); - expect(crawlResponse.statusCode).toBe(200); - - let isFinished = false; - let completedResponse; - - while (!isFinished) { - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - - if (response.body.status === "completed") { - isFinished = true; - completedResponse = response; - } else { - await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again - } - } - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0]).toHaveProperty("html"); - expect(completedResponse.body.data[0].content).toContain("Mendable"); - expect(completedResponse.body.data[0].markdown).toContain("Mendable"); - 
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - - const onlyChildrenLinks = completedResponse.body.data.filter(doc => { - return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") - }); - - expect(completedResponse.body.data.length).toBeGreaterThan(onlyChildrenLinks.length); - }, 60000); - - it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://jestjs.io" }); - - expect(crawlResponse.statusCode).toBe(200); - - await new Promise((r) => setTimeout(r, 20000)); - - const responseCancel = await request(TEST_URL) - .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(responseCancel.statusCode).toBe(200); - expect(responseCancel.body).toHaveProperty("status"); - expect(responseCancel.body.status).toBe("cancelled"); - - await new Promise((r) => setTimeout(r, 10000)); - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("failed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data).toBeNull(); - expect(completedResponse.body).toHaveProperty("partial_data"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined(); - }, 60000); // 60 seconds + expect(completedResponse.body.data).toBeNull(); + expect(completedResponse.body).toHaveProperty("partial_data"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined(); + }, 60000); // 60 seconds + }); describe("POST /v0/scrape with LLM Extraction", () => { it.concurrent("should extract data using LLM extraction mode", async () => { @@ -1156,64 +704,6 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 secs }); - // describe("POST /v0/scrape for Top 100 Companies", () => { - // it.concurrent("should extract data for the top 100 companies", async () => { - // const response = await request(TEST_URL) - // .post("/v0/scrape") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ - // url: "https://companiesmarketcap.com/", - // pageOptions: { - // onlyMainContent: true - // }, - // extractorOptions: { - // mode: "llm-extraction", - // extractionPrompt: "Extract the name, market cap, price, and today's change for the top 20 companies listed on the 
page.", - // extractionSchema: { - // type: "object", - // properties: { - // companies: { - // type: "array", - // items: { - // type: "object", - // properties: { - // rank: { type: "number" }, - // name: { type: "string" }, - // marketCap: { type: "string" }, - // price: { type: "string" }, - // todayChange: { type: "string" } - // }, - // required: ["rank", "name", "marketCap", "price", "todayChange"] - // } - // } - // }, - // required: ["companies"] - // } - // } - // }); - - // // Print the response body to the console for debugging purposes - // console.log("Response companies:", response.body.data.llm_extraction.companies); - - // // Check if the response has the correct structure and data types - // expect(response.status).toBe(200); - // expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true); - // expect(response.body.data.llm_extraction.companies.length).toBe(40); - - // // Sample check for the first company - // const firstCompany = response.body.data.llm_extraction.companies[0]; - // expect(firstCompany).toHaveProperty("name"); - // expect(typeof firstCompany.name).toBe("string"); - // expect(firstCompany).toHaveProperty("marketCap"); - // expect(typeof firstCompany.marketCap).toBe("string"); - // expect(firstCompany).toHaveProperty("price"); - // expect(typeof firstCompany.price).toBe("string"); - // expect(firstCompany).toHaveProperty("todayChange"); - // expect(typeof firstCompany.todayChange).toBe("string"); - // }, 120000); // 120 secs - // }); - describe("POST /v0/crawl with fast mode", () => { it.concurrent("should complete the crawl under 20 seconds", async () => { const startTime = Date.now(); @@ -1269,122 +759,5 @@ describe("E2E Tests for API Routes", () => { expect(results.length).toBeLessThanOrEqual(15); }, 20000); - - // it.concurrent("should complete the crawl in more than 10 seconds", async () => { - // const startTime = Date.now(); - - // const crawlResponse = await request(TEST_URL) - // .post("/v0/crawl") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ - // url: "https://flutterbricks.com", - // }); - - // expect(crawlResponse.statusCode).toBe(200); - - // const jobId = crawlResponse.body.jobId; - // let statusResponse; - // let isFinished = false; - - // while (!isFinished) { - // statusResponse = await request(TEST_URL) - // .get(`/v0/crawl/status/${jobId}`) - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - // expect(statusResponse.statusCode).toBe(200); - // isFinished = statusResponse.body.status === "completed"; - - // if (!isFinished) { - // await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - // } - // } - - // const endTime = Date.now(); - // const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds - - // console.log(`Time elapsed: ${timeElapsed} seconds`); - - // expect(statusResponse.body.status).toBe("completed"); - // expect(statusResponse.body).toHaveProperty("data"); - // expect(statusResponse.body.data[0]).toHaveProperty("content"); - // expect(statusResponse.body.data[0]).toHaveProperty("markdown"); - // const results = statusResponse.body.data; - // // results.forEach((result, i) => { - // // console.log(result.metadata.sourceURL); - // // }); - // expect(results.length).toBeGreaterThanOrEqual(10); - // expect(results.length).toBeLessThanOrEqual(15); - - // }, 50000);// 15 seconds timeout to account for network delays }); - - describe("GET /is-production", () 
=> { - it.concurrent("should return the production status", async () => { - const response = await request(TEST_URL).get("/is-production"); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("isProduction"); - }); - }); - - describe("Rate Limiter", () => { - it.concurrent("should return 429 when rate limit is exceeded for preview token", async () => { - for (let i = 0; i < 5; i++) { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer this_is_just_a_preview_token`) - .set("Content-Type", "application/json") - .send({ url: "https://www.scrapethissite.com" }); - - expect(response.statusCode).toBe(200); - } - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer this_is_just_a_preview_token`) - .set("Content-Type", "application/json") - .send({ url: "https://www.scrapethissite.com" }); - - expect(response.statusCode).toBe(429); - }, 90000); - }); - - // it.concurrent("should return 429 when rate limit is exceeded for API key", async () => { - // for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_SCRAPE); i++) { - // const response = await request(TEST_URL) - // .post("/v0/scrape") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ url: "https://www.scrapethissite.com" }); - - // expect(response.statusCode).toBe(200); - // } - - // const response = await request(TEST_URL) - // .post("/v0/scrape") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ url: "https://www.scrapethissite.com" }); - - // expect(response.statusCode).toBe(429); - // }, 60000); - - // it.concurrent("should return 429 when rate limit is exceeded for API key", async () => { - // for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_CRAWL); i++) { - // const response = await request(TEST_URL) - // .post("/v0/crawl") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ url: "https://www.scrapethissite.com" }); - - // expect(response.statusCode).toBe(200); - // } - - // const response = await request(TEST_URL) - // .post("/v0/crawl") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ url: "https://www.scrapethissite.com" }); - - // expect(response.statusCode).toBe(429); - // }, 60000); }); diff --git a/apps/api/src/controllers/__tests__/crawl.test.ts b/apps/api/src/controllers/__tests__/crawl.test.ts new file mode 100644 index 0000000..621c743 --- /dev/null +++ b/apps/api/src/controllers/__tests__/crawl.test.ts @@ -0,0 +1,47 @@ +import { crawlController } from '../crawl' +import { Request, Response } from 'express'; +import { authenticateUser } from '../auth'; // Ensure this import is correct +import { createIdempotencyKey } from '../../services/idempotency/create'; +import { validateIdempotencyKey } from '../../services/idempotency/validate'; +import { v4 as uuidv4 } from 'uuid'; + +jest.mock('../auth', () => ({ + authenticateUser: jest.fn().mockResolvedValue({ + success: true, + team_id: 'team123', + error: null, + status: 200 + }), + reduce: jest.fn() +})); +jest.mock('../../services/idempotency/validate'); + +describe('crawlController', () => { + it('should prevent duplicate requests using the same idempotency key', async () => { + const req = { + headers: { + 'x-idempotency-key': await uuidv4(), + 
'Authorization': `Bearer ${process.env.TEST_API_KEY}` + }, + body: { + url: 'https://mendable.ai' + } + } as unknown as Request; + const res = { + status: jest.fn().mockReturnThis(), + json: jest.fn() + } as unknown as Response; + + // Mock the idempotency key validation to return false for the second call + (validateIdempotencyKey as jest.Mock).mockResolvedValueOnce(true).mockResolvedValueOnce(false); + + // First request should succeed + await crawlController(req, res); + expect(res.status).not.toHaveBeenCalledWith(409); + + // Second request with the same key should fail + await crawlController(req, res); + expect(res.status).toHaveBeenCalledWith(409); + expect(res.json).toHaveBeenCalledWith({ error: 'Idempotency key already used' }); + }); +}); \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts index c7c54aa..6d38370 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts @@ -7,7 +7,7 @@ import { getAdjustedMaxDepth } from '../utils/maxDepthUtils'; jest.mock('axios'); jest.mock('robots-parser'); -describe('WebCrawler maxDepth and filterLinks', () => { +describe('WebCrawler', () => { let crawler: WebCrawler; const mockAxios = axios as jest.Mocked; const mockRobotsParser = robotsParser as jest.MockedFunction; @@ -156,8 +156,37 @@ describe('WebCrawler maxDepth and filterLinks', () => { ]); }); - - - // Add more tests to cover other scenarios, such as checking includes and excludes + it('should handle allowBackwardCrawling option correctly', async () => { + const initialUrl = 'https://mendable.ai/blog'; + + // Setup the crawler with the specific test case options + const crawler = new WebCrawler({ + initialUrl: initialUrl, + includes: [], + excludes: [], + limit: 100, + maxCrawledDepth: 3, // Example depth + allowBackwardCrawling: true + }); + + // Mock the sitemap fetching function to simulate backward crawling + crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([ + initialUrl, + 'https://mendable.ai', // backward link + initialUrl + '/page1', + initialUrl + '/page1/page2' + ]); + + const results = await crawler.start(); + expect(results).toEqual([ + { url: initialUrl, html: '' }, + { url: 'https://mendable.ai', html: '' }, // Expect the backward link to be included + { url: initialUrl + '/page1', html: '' }, + { url: initialUrl + '/page1/page2', html: '' } + ]); + + // Check that the backward link is included if allowBackwardCrawling is true + expect(results.some(r => r.url === 'https://mendable.ai')).toBe(true); + }); }); diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts new file mode 100644 index 0000000..7966648 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -0,0 +1,24 @@ +jest.mock('../single_url', () => { + const originalModule = jest.requireActual('../single_url'); + originalModule.fetchHtmlContent = jest.fn().mockResolvedValue('Test
</title></head><body><h1>Roast</h1></body></html>
'); + + return originalModule; +}); + +import { scrapSingleUrl } from '../single_url'; +import { PageOptions } from '../../../lib/entities'; + +describe('scrapSingleUrl', () => { + it('should handle includeHtml option correctly', async () => { + const url = 'https://roastmywebsite.ai'; + const pageOptionsWithHtml: PageOptions = { includeHtml: true }; + const pageOptionsWithoutHtml: PageOptions = { includeHtml: false }; + + const resultWithHtml = await scrapSingleUrl(url, pageOptionsWithHtml); + const resultWithoutHtml = await scrapSingleUrl(url, pageOptionsWithoutHtml); + + expect(resultWithHtml.html).toBeDefined(); + expect(resultWithoutHtml.html).toBeUndefined(); + }, 10000); +}); + diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts new file mode 100644 index 0000000..4252525 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts @@ -0,0 +1,89 @@ +import { isUrlBlocked } from '../blocklist'; + +describe('Blocklist Functionality', () => { + describe('isUrlBlocked', () => { + test.each([ + 'https://facebook.com/fake-test', + 'https://x.com/user-profile', + 'https://twitter.com/home', + 'https://instagram.com/explore', + 'https://linkedin.com/in/johndoe', + 'https://pinterest.com/pin/create', + 'https://snapchat.com/add/johndoe', + 'https://tiktok.com/@johndoe', + 'https://reddit.com/r/funny', + 'https://tumblr.com/dashboard', + 'https://flickr.com/photos/johndoe', + 'https://whatsapp.com/download', + 'https://wechat.com/features', + 'https://telegram.org/apps' + ])('should return true for blocklisted URL %s', (url) => { + expect(isUrlBlocked(url)).toBe(true); + }); + + test.each([ + 'https://facebook.com/policy', + 'https://twitter.com/tos', + 'https://instagram.com/about/legal/terms', + 'https://linkedin.com/legal/privacy-policy', + 'https://pinterest.com/about/privacy', + 'https://snapchat.com/legal/terms', + 'https://tiktok.com/legal/privacy-policy', + 'https://reddit.com/policies', + 'https://tumblr.com/policy/en/privacy', + 'https://flickr.com/help/terms', + 'https://whatsapp.com/legal', + 'https://wechat.com/en/privacy-policy', + 'https://telegram.org/tos' + ])('should return false for allowed URLs with keywords %s', (url) => { + expect(isUrlBlocked(url)).toBe(false); + }); + + test('should return false for non-blocklisted domain', () => { + const url = 'https://example.com'; + expect(isUrlBlocked(url)).toBe(false); + }); + + test('should handle invalid URLs gracefully', () => { + const url = 'htp://invalid-url'; + expect(isUrlBlocked(url)).toBe(false); + }); + }); + + test.each([ + 'https://subdomain.facebook.com', + 'https://facebook.com.someotherdomain.com', + 'https://www.facebook.com/profile', + 'https://api.twitter.com/info', + 'https://instagram.com/accounts/login' + ])('should return true for URLs with blocklisted domains in subdomains or paths %s', (url) => { + expect(isUrlBlocked(url)).toBe(true); + }); + + test.each([ + 'https://example.com/facebook.com', + 'https://example.com/redirect?url=https://twitter.com', + 'https://facebook.com.policy.example.com' + ])('should return false for URLs where blocklisted domain is part of another domain or path %s', (url) => { + expect(isUrlBlocked(url)).toBe(false); + }); + + test.each([ + 'https://FACEBOOK.com', + 'https://INSTAGRAM.com/@something' + ])('should handle case variations %s', (url) => { + expect(isUrlBlocked(url)).toBe(true); + }); + + test.each([ + 
'https://facebook.com?redirect=https://example.com', + 'https://twitter.com?query=something' + ])('should handle query parameters %s', (url) => { + expect(isUrlBlocked(url)).toBe(true); + }); + + test('should handle internationalized domain names', () => { + const url = 'https://xn--d1acpjx3f.xn--p1ai'; + expect(isUrlBlocked(url)).toBe(false); + }); +}); \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts index 55930f2..1830265 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts @@ -9,41 +9,11 @@ describe('PDF Processing Module - Integration Test', () => { expect(pageError).toBeUndefined(); }); -// We're hitting the LLAMAPARSE rate limit 🫠 -// it('should download and read a simple PDF file by URL', async () => { -// const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf'); -// expect(pdfContent).toEqual("Dummy PDF file"); -// }); + it('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => { + const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/astro-ph/9301001.pdf', false); + expect(pageStatusCode).toBe(200); + expect(pageError).toBeUndefined(); + expect(content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj'); + }, 60000); // 60 seconds -// it('should download and read a complex PDF file by URL', async () => { -// const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf'); - -// const expectedContent = 'A Comprehensive Overview of Large Language Models\n' + -// ' a a,∗ b,∗ c,d,∗ e,f e,f g,i\n' + -// ' Humza Naveed , Asad Ullah Khan , Shi Qiu , Muhammad Saqib , Saeed Anwar , Muhammad Usman , Naveed Akhtar ,\n' + -// ' Nick Barnes h, Ajmal Mian i\n' + -// ' aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' + -// ' bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' + -// ' cUniversity of Technology Sydney (UTS), Sydney, Australia\n' + -// ' dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' + -// ' eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' + -// ' fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' + -// ' gThe University of Melbourne (UoM), Melbourne, Australia\n' + -// ' hAustralian National University (ANU), Canberra, Australia\n' + -// ' iThe University of Western Australia (UWA), Perth, Australia\n' + -// ' Abstract\n' + -// ' Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' + -// ' beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' + -// ' topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' + -// ' robotics, datasets, benchmarking, efficiency, and more. With the rapid development of techniques and regular breakthroughs in\n' + -// ' LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. 
Considering\n' + -// ' the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' + -// ' yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' + -// ' on a broad range of LLM-related concepts. Our self-contained comprehensive overview of LLMs discusses relevant background\n' + -// ' concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' + -// ' provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' + -// ' extensive informative summaries of the existing works to advance the LLM research.\n' -// expect(pdfContent).toContain(expectedContent); -// }, 60000); - -}); \ No newline at end of file +}); diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index 45d1970..7116963 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -23,6 +23,7 @@ const allowedKeywords = [ 'user-agreement', 'legal', 'help', + 'policies', 'support', 'contact', 'about', @@ -30,25 +31,31 @@ const allowedKeywords = [ 'blog', 'press', 'conditions', + 'tos' ]; export function isUrlBlocked(url: string): boolean { - // Check if the URL contains any allowed keywords - if (allowedKeywords.some(keyword => url.includes(keyword))) { + const lowerCaseUrl = url.toLowerCase(); + + // Check if the URL contains any allowed keywords as whole words + if (allowedKeywords.some(keyword => new RegExp(`\\b${keyword}\\b`, 'i').test(lowerCaseUrl))) { return false; } try { + const urlObj = new URL(url); + const hostname = urlObj.hostname.toLowerCase(); + // Check if the URL matches any domain in the blocklist - return socialMediaBlocklist.some(domain => { - // Create a regular expression to match the exact domain - const domainPattern = new RegExp(`(^|\\.)${domain.replace('.', '\\.')}$`); - // Test the hostname of the URL against the pattern - return domainPattern.test(new URL(url).hostname); + const isBlocked = socialMediaBlocklist.some(domain => { + const domainPattern = new RegExp(`(^|\\.)${domain.replace('.', '\\.')}(\\.|$)`, 'i'); + return domainPattern.test(hostname); }); + + return isBlocked; } catch (e) { // If an error occurs (e.g., invalid URL), return false + console.error(`Error processing URL: ${url}`, e); return false; } } - From e5ffda1eeca2715774596b021fe5a8cdab862917 Mon Sep 17 00:00:00 2001 From: neev jewalkar Date: Tue, 18 Jun 2024 05:42:25 +0530 Subject: [PATCH 03/12] Added local host support for the javascript SDK --- apps/js-sdk/firecrawl/README.md | 5 + apps/js-sdk/firecrawl/build/index.js | 4 +- apps/js-sdk/firecrawl/package-lock.json | 4 +- apps/js-sdk/firecrawl/src/index.ts | 577 ++++++++++++------------ apps/js-sdk/firecrawl/types/index.d.ts | 2 +- apps/js-sdk/package-lock.json | 40 -- 6 files changed, 299 insertions(+), 333 deletions(-) diff --git a/apps/js-sdk/firecrawl/README.md b/apps/js-sdk/firecrawl/README.md index 085e865..d916bf7 100644 --- a/apps/js-sdk/firecrawl/README.md +++ b/apps/js-sdk/firecrawl/README.md @@ -176,6 +176,11 @@ async function checkStatusExample(jobId) { checkStatusExample('your_job_id_here'); ``` +## Running Locally +To use the SDK when running Firecrawl locally, you can change the initial Firecrawl app instance to: +```js +const app = new 
FirecrawlApp({ apiKey: "YOUR_API_KEY", apiUrl: "http://localhost:3002" }); +``` ## Error Handling diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/index.js index b418513..e54e532 100644 --- a/apps/js-sdk/firecrawl/build/index.js +++ b/apps/js-sdk/firecrawl/build/index.js @@ -18,9 +18,9 @@ export default class FirecrawlApp { * Initializes a new instance of the FirecrawlApp class. * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. */ - constructor({ apiKey = null }) { - this.apiUrl = "https://api.firecrawl.dev"; + constructor({ apiKey = null, apiUrl = null }) { this.apiKey = apiKey || ""; + this.apiUrl = apiUrl || "https://api.firecrawl.dev"; if (!this.apiKey) { throw new Error("No API key provided"); } diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index b1cebde..7094cc9 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -1,12 +1,12 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.22", + "version": "0.0.26", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mendable/firecrawl-js", - "version": "0.0.22", + "version": "0.0.26", "license": "MIT", "dependencies": { "axios": "^1.6.8", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index f884125..fd4f2ca 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -5,346 +5,347 @@ import { zodToJsonSchema } from "zod-to-json-schema"; * Configuration interface for FirecrawlApp. */ export interface FirecrawlAppConfig { - apiKey?: string | null; - apiUrl?: string | null; + apiKey?: string | null; + apiUrl?: string | null; } /** * Generic parameter interface. */ export interface Params { - [key: string]: any; - extractorOptions?: { - extractionSchema: z.ZodSchema | any; - mode?: "llm-extraction"; - extractionPrompt?: string; - }; + [key: string]: any; + extractorOptions?: { + extractionSchema: z.ZodSchema | any; + mode?: "llm-extraction"; + extractionPrompt?: string; + }; } /** * Response interface for scraping operations. */ export interface ScrapeResponse { - success: boolean; - data?: any; - error?: string; + success: boolean; + data?: any; + error?: string; } /** * Response interface for searching operations. */ export interface SearchResponse { - success: boolean; - data?: any; - error?: string; + success: boolean; + data?: any; + error?: string; } /** * Response interface for crawling operations. */ export interface CrawlResponse { - success: boolean; - jobId?: string; - data?: any; - error?: string; + success: boolean; + jobId?: string; + data?: any; + error?: string; } /** * Response interface for job status checks. */ export interface JobStatusResponse { - success: boolean; - status: string; - jobId?: string; - data?: any; - partial_data?: any, - error?: string; + success: boolean; + status: string; + jobId?: string; + data?: any; + partial_data?: any, + error?: string; } /** * Main class for interacting with the Firecrawl API. */ export default class FirecrawlApp { - private apiKey: string; - private apiUrl: string = "https://api.firecrawl.dev"; + private apiKey: string; + private apiUrl: string; - /** - * Initializes a new instance of the FirecrawlApp class. - * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. 
- */ - constructor({ apiKey = null }: FirecrawlAppConfig) { - this.apiKey = apiKey || ""; - if (!this.apiKey) { - throw new Error("No API key provided"); - } - } - - /** - * Scrapes a URL using the Firecrawl API. - * @param {string} url - The URL to scrape. - * @param {Params | null} params - Additional parameters for the scrape request. - * @returns {Promise} The response from the scrape operation. - */ - async scrapeUrl( - url: string, - params: Params | null = null - ): Promise { - const headers: AxiosRequestHeaders = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - } as AxiosRequestHeaders; - let jsonData: Params = { url, ...params }; - if (params?.extractorOptions?.extractionSchema) { - let schema = params.extractorOptions.extractionSchema; - // Check if schema is an instance of ZodSchema to correctly identify Zod schemas - if (schema instanceof z.ZodSchema) { - schema = zodToJsonSchema(schema); - } - jsonData = { - ...jsonData, - extractorOptions: { - ...params.extractorOptions, - extractionSchema: schema, - mode: params.extractorOptions.mode || "llm-extraction", - }, - }; - } - try { - const response: AxiosResponse = await axios.post( - this.apiUrl + "/v0/scrape", - jsonData, - { headers }, - ); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } else { - throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ + constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) { + this.apiKey = apiKey || ""; + this.apiUrl = apiUrl || "https://api.firecrawl.dev" + if (!this.apiKey) { + throw new Error("No API key provided"); } - } else { - this.handleError(response, "scrape URL"); - } - } catch (error: any) { - throw new Error(error.message); } - return { success: false, error: "Internal server error." }; - } - /** - * Searches for a query using the Firecrawl API. - * @param {string} query - The query to search for. - * @param {Params | null} params - Additional parameters for the search request. - * @returns {Promise} The response from the search operation. - */ - async search( - query: string, - params: Params | null = null - ): Promise { - const headers: AxiosRequestHeaders = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - } as AxiosRequestHeaders; - let jsonData: Params = { query }; - if (params) { - jsonData = { ...jsonData, ...params }; - } - try { - const response: AxiosResponse = await axios.post( - this.apiUrl + "/v0/search", - jsonData, - { headers } - ); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } else { - throw new Error(`Failed to search. Error: ${responseData.error}`); + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. 
+ */ + async scrapeUrl( + url: string, + params: Params | null = null + ): Promise { + const headers: AxiosRequestHeaders = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + } as AxiosRequestHeaders; + let jsonData: Params = { url, ...params }; + if (params?.extractorOptions?.extractionSchema) { + let schema = params.extractorOptions.extractionSchema; + // Check if schema is an instance of ZodSchema to correctly identify Zod schemas + if (schema instanceof z.ZodSchema) { + schema = zodToJsonSchema(schema); + } + jsonData = { + ...jsonData, + extractorOptions: { + ...params.extractorOptions, + extractionSchema: schema, + mode: params.extractorOptions.mode || "llm-extraction", + }, + }; } - } else { - this.handleError(response, "search"); - } - } catch (error: any) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - } - - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param {string} url - The URL to crawl. - * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. - * @returns {Promise} The response from the crawl operation. - */ - async crawlUrl( - url: string, - params: Params | null = null, - waitUntilDone: boolean = true, - pollInterval: number = 2, - idempotencyKey?: string - ): Promise { - const headers = this.prepareHeaders(idempotencyKey); - let jsonData: Params = { url }; - if (params) { - jsonData = { ...jsonData, ...params }; - } - try { - const response: AxiosResponse = await this.postRequest( - this.apiUrl + "/v0/crawl", - jsonData, - headers - ); - if (response.status === 200) { - const jobId: string = response.data.jobId; - if (waitUntilDone) { - return this.monitorJobStatus(jobId, headers, pollInterval); - } else { - return { success: true, jobId }; + try { + const response: AxiosResponse = await axios.post( + this.apiUrl + "/v0/scrape", + jsonData, + { headers }, + ); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } else { + throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); + } + } else { + this.handleError(response, "scrape URL"); + } + } catch (error: any) { + throw new Error(error.message); } - } else { - this.handleError(response, "start crawl job"); - } - } catch (error: any) { - console.log(error); - throw new Error(error.message); + return { success: false, error: "Internal server error." }; } - return { success: false, error: "Internal server error." }; - } - /** - * Checks the status of a crawl job using the Firecrawl API. - * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. - */ - async checkCrawlStatus(jobId: string): Promise { - const headers: AxiosRequestHeaders = this.prepareHeaders(); - try { - const response: AxiosResponse = await this.getRequest( - this.apiUrl + `/v0/crawl/status/${jobId}`, - headers - ); - if (response.status === 200) { + /** + * Searches for a query using the Firecrawl API. + * @param {string} query - The query to search for. + * @param {Params | null} params - Additional parameters for the search request. + * @returns {Promise} The response from the search operation. 
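+ * Throws if the request fails or the API reports an unsuccessful search.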
+ */ + async search( + query: string, + params: Params | null = null + ): Promise { + const headers: AxiosRequestHeaders = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + } as AxiosRequestHeaders; + let jsonData: Params = { query }; + if (params) { + jsonData = { ...jsonData, ...params }; + } + try { + const response: AxiosResponse = await axios.post( + this.apiUrl + "/v0/search", + jsonData, + { headers } + ); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } else { + throw new Error(`Failed to search. Error: ${responseData.error}`); + } + } else { + this.handleError(response, "search"); + } + } catch (error: any) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + } + + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} pollInterval - Time in seconds for job status checks. + * @param {string} idempotencyKey - Optional idempotency key for the request. + * @returns {Promise} The response from the crawl operation. + */ + async crawlUrl( + url: string, + params: Params | null = null, + waitUntilDone: boolean = true, + pollInterval: number = 2, + idempotencyKey?: string + ): Promise { + const headers = this.prepareHeaders(idempotencyKey); + let jsonData: Params = { url }; + if (params) { + jsonData = { ...jsonData, ...params }; + } + try { + const response: AxiosResponse = await this.postRequest( + this.apiUrl + "/v0/crawl", + jsonData, + headers + ); + if (response.status === 200) { + const jobId: string = response.data.jobId; + if (waitUntilDone) { + return this.monitorJobStatus(jobId, headers, pollInterval); + } else { + return { success: true, jobId }; + } + } else { + this.handleError(response, "start crawl job"); + } + } catch (error: any) { + console.log(error); + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + } + + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + async checkCrawlStatus(jobId: string): Promise { + const headers: AxiosRequestHeaders = this.prepareHeaders(); + try { + const response: AxiosResponse = await this.getRequest( + this.apiUrl + `/v0/crawl/status/${jobId}`, + headers + ); + if (response.status === 200) { + return { + success: true, + status: response.data.status, + data: response.data.data, + partial_data: !response.data.data ? response.data.partial_data : undefined, + }; + } else { + this.handleError(response, "check crawl status"); + } + } catch (error: any) { + throw new Error(error.message); + } return { - success: true, - status: response.data.status, - data: response.data.data, - partial_data: !response.data.data ? response.data.partial_data : undefined, + success: false, + status: "unknown", + error: "Internal server error.", }; - } else { - this.handleError(response, "check crawl status"); - } - } catch (error: any) { - throw new Error(error.message); } - return { - success: false, - status: "unknown", - error: "Internal server error.", - }; - } - /** - * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. 
- */ - prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders { - return { - 'Content-Type': 'application/json', - 'Authorization': `Bearer ${this.apiKey}`, - ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {}), - } as AxiosRequestHeaders & { 'x-idempotency-key'?: string }; - } + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ + prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders { + return { + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${this.apiKey}`, + ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {}), + } as AxiosRequestHeaders & { 'x-idempotency-key'?: string }; + } - /** - * Sends a POST request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {Params} data - The data to send in the request. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the POST request. - */ - postRequest( - url: string, - data: Params, - headers: AxiosRequestHeaders - ): Promise { - return axios.post(url, data, { headers }); - } + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ + postRequest( + url: string, + data: Params, + headers: AxiosRequestHeaders + ): Promise { + return axios.post(url, data, { headers }); + } - /** - * Sends a GET request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the GET request. - */ - getRequest( - url: string, - headers: AxiosRequestHeaders - ): Promise { - return axios.get(url, { headers }); - } + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ + getRequest( + url: string, + headers: AxiosRequestHeaders + ): Promise { + return axios.get(url, { headers }); + } - /** - * Monitors the status of a crawl job until completion or failure. - * @param {string} jobId - The job ID of the crawl operation. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The final job status or data. - */ - async monitorJobStatus( - jobId: string, - headers: AxiosRequestHeaders, - checkInterval: number - ): Promise { - while (true) { - const statusResponse: AxiosResponse = await this.getRequest( - this.apiUrl + `/v0/crawl/status/${jobId}`, - headers - ); - if (statusResponse.status === 200) { - const statusData = statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - return statusData.data; - } else { - throw new Error("Crawl job completed but no data was returned"); - } - } else if ( - ["active", "paused", "pending", "queued"].includes(statusData.status) - ) { - if (checkInterval < 2) { - checkInterval = 2; - } - await new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again - } else { - throw new Error( - `Crawl job failed or was stopped. 
Status: ${statusData.status}` - ); + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ + async monitorJobStatus( + jobId: string, + headers: AxiosRequestHeaders, + checkInterval: number + ): Promise { + while (true) { + const statusResponse: AxiosResponse = await this.getRequest( + this.apiUrl + `/v0/crawl/status/${jobId}`, + headers + ); + if (statusResponse.status === 200) { + const statusData = statusResponse.data; + if (statusData.status === "completed") { + if ("data" in statusData) { + return statusData.data; + } else { + throw new Error("Crawl job completed but no data was returned"); + } + } else if ( + ["active", "paused", "pending", "queued"].includes(statusData.status) + ) { + if (checkInterval < 2) { + checkInterval = 2; + } + await new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again + } else { + throw new Error( + `Crawl job failed or was stopped. Status: ${statusData.status}` + ); + } + } else { + this.handleError(statusResponse, "check crawl status"); + } } - } else { - this.handleError(statusResponse, "check crawl status"); - } } - } - /** - * Handles errors from API responses. - * @param {AxiosResponse} response - The response from the API. - * @param {string} action - The action being performed when the error occurred. - */ - handleError(response: AxiosResponse, action: string): void { - if ([402, 408, 409, 500].includes(response.status)) { - const errorMessage: string = - response.data.error || "Unknown error occurred"; - throw new Error( - `Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}` - ); - } else { - throw new Error( - `Unexpected error occurred while trying to ${action}. Status code: ${response.status}` - ); + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ + handleError(response: AxiosResponse, action: string): void { + if ([402, 408, 409, 500].includes(response.status)) { + const errorMessage: string = + response.data.error || "Unknown error occurred"; + throw new Error( + `Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}` + ); + } else { + throw new Error( + `Unexpected error occurred while trying to ${action}. Status code: ${response.status}` + ); + } } - } } diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts index 52a7d1e..bdf698e 100644 --- a/apps/js-sdk/firecrawl/types/index.d.ts +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -64,7 +64,7 @@ export default class FirecrawlApp { * Initializes a new instance of the FirecrawlApp class. * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. */ - constructor({ apiKey }: FirecrawlAppConfig); + constructor({ apiKey, apiUrl }: FirecrawlAppConfig); /** * Scrapes a URL using the Firecrawl API. * @param {string} url - The URL to scrape. 
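For context, a minimal sketch of the widened constructor in use; the localhost URL assumes a self-hosted instance and the key is a placeholder:

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

// Omitting apiUrl falls back to the hosted endpoint, https://api.firecrawl.dev.
const cloud = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

// apiUrl overrides the base URL, e.g. for a self-hosted instance (port assumed).
const selfHosted = new FirecrawlApp({
  apiKey: "fc-YOUR_API_KEY",
  apiUrl: "http://localhost:3002",
});
```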
diff --git a/apps/js-sdk/package-lock.json b/apps/js-sdk/package-lock.json index c59a371..2bf3f00 100644 --- a/apps/js-sdk/package-lock.json +++ b/apps/js-sdk/package-lock.json @@ -11,10 +11,8 @@ "dependencies": { "@mendable/firecrawl-js": "^0.0.19", "axios": "^1.6.8", - "dotenv": "^16.4.5", "ts-node": "^10.9.2", "typescript": "^5.4.5", - "uuid": "^9.0.1", "zod": "^3.23.8" }, "devDependencies": { @@ -452,15 +450,6 @@ "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz", "integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==" }, - "node_modules/@types/node": { - "version": "20.12.11", - "resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.11.tgz", - "integrity": "sha512-vDg9PZ/zi+Nqp6boSOT7plNuthRugEKixDv5sFTIpkE89MmNtEArAShI4mxuX2+UrLEe9pxC1vm2cjm9YlWbJw==", - "peer": true, - "dependencies": { - "undici-types": "~5.26.4" - } - }, "node_modules/acorn": { "version": "8.11.3", "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz", @@ -532,17 +521,6 @@ "node": ">=0.3.1" } }, - "node_modules/dotenv": { - "version": "16.4.5", - "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz", - "integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://dotenvx.com" - } - }, "node_modules/esbuild": { "version": "0.20.2", "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.20.2.tgz", @@ -750,24 +728,6 @@ "node": ">=14.17" } }, - "node_modules/undici-types": { - "version": "5.26.5", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", - "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", - "peer": true - }, - "node_modules/uuid": { - "version": "9.0.1", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", - "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", - "funding": [ - "https://github.com/sponsors/broofa", - "https://github.com/sponsors/ctavan" - ], - "bin": { - "uuid": "dist/bin/uuid" - } - }, "node_modules/v8-compile-cache-lib": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", From 20f14bcf7fcd00997a293238ecffa481ebc0f638 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 18 Jun 2024 10:55:07 -0300 Subject: [PATCH 04/12] Added some types --- .../src/__tests__/e2e_withAuth/index.test.ts | 61 ++++++++++--------- apps/api/src/types.ts | 30 ++++++++- 2 files changed, 60 insertions(+), 31 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index b1b0cc0..54be495 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -1,5 +1,6 @@ import request from "supertest"; import dotenv from "dotenv"; +import { FirecrawlCrawlResponse, FirecrawlCrawlStatusResponse, FirecrawlScrapeResponse } from "../../types"; dotenv.config(); const TEST_URL = "http://127.0.0.1:3002"; @@ -23,12 +24,12 @@ describe("E2E Tests for API Routes", () => { describe("POST /v0/scrape", () => { it.concurrent("should require authorization", async () => { - const response = await request(TEST_URL).post("/v0/scrape"); + const response: FirecrawlScrapeResponse = await 
request(TEST_URL).post("/v0/scrape"); expect(response.statusCode).toBe(401); }); it.concurrent("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) + const response: FirecrawlScrapeResponse = await request(TEST_URL) .post("/v0/scrape") .set("Authorization", `Bearer invalid-api-key`) .set("Content-Type", "application/json") @@ -37,7 +38,7 @@ describe("E2E Tests for API Routes", () => { }); it.concurrent("should return a successful response with a valid API key", async () => { - const response = await request(TEST_URL) + const response: FirecrawlScrapeResponse = await request(TEST_URL) .post("/v0/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") @@ -54,7 +55,7 @@ describe("E2E Tests for API Routes", () => { }, 30000); // 30 seconds timeout it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => { - const response = await request(TEST_URL) + const response: FirecrawlScrapeResponse = await request(TEST_URL) .post("/v0/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") @@ -76,7 +77,7 @@ describe("E2E Tests for API Routes", () => { }, 30000); // 30 seconds timeout it.concurrent('should return a successful response for a valid scrape with PDF file', async () => { - const response = await request(TEST_URL) + const response: FirecrawlScrapeResponse = await request(TEST_URL) .post('/v0/scrape') .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set('Content-Type', 'application/json') @@ -93,7 +94,7 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 seconds it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { - const response = await request(TEST_URL) + const response: FirecrawlScrapeResponse = await request(TEST_URL) .post('/v0/scrape') .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set('Content-Type', 'application/json') @@ -110,7 +111,7 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 seconds it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { - const responseWithoutRemoveTags = await request(TEST_URL) + const responseWithoutRemoveTags: FirecrawlScrapeResponse = await request(TEST_URL) .post("/v0/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") @@ -126,7 +127,7 @@ describe("E2E Tests for API Routes", () => { expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong - const response = await request(TEST_URL) + const response: FirecrawlScrapeResponse = await request(TEST_URL) .post("/v0/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") @@ -144,7 +145,7 @@ describe("E2E Tests for API Routes", () => { }, 30000); // 30 seconds timeout it.concurrent('should return a successful response for a scrape with 400 page', async () => { - const response = await request(TEST_URL) + const response: FirecrawlScrapeResponse = await request(TEST_URL) .post('/v0/scrape') .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set('Content-Type', 'application/json') @@ -160,7 +161,7 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 seconds 
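    // The 401/403/404/500 cases below mirror this 400 test: scraping an error page should still yield a successful scrape response.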
it.concurrent('should return a successful response for a scrape with 401 page', async () => { - const response = await request(TEST_URL) + const response: FirecrawlScrapeResponse = await request(TEST_URL) .post('/v0/scrape') .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set('Content-Type', 'application/json') @@ -176,7 +177,7 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 seconds it.concurrent("should return a successful response for a scrape with 403 page", async () => { - const response = await request(TEST_URL) + const response: FirecrawlScrapeResponse = await request(TEST_URL) .post('/v0/scrape') .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set('Content-Type', 'application/json') @@ -192,7 +193,7 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 seconds it.concurrent('should return a successful response for a scrape with 404 page', async () => { - const response = await request(TEST_URL) + const response: FirecrawlScrapeResponse = await request(TEST_URL) .post('/v0/scrape') .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set('Content-Type', 'application/json') @@ -224,7 +225,7 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 seconds it.concurrent('should return a successful response for a scrape with 500 page', async () => { - const response = await request(TEST_URL) + const response: FirecrawlScrapeResponse = await request(TEST_URL) .post('/v0/scrape') .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set('Content-Type', 'application/json') @@ -242,12 +243,12 @@ describe("E2E Tests for API Routes", () => { describe("POST /v0/crawl", () => { it.concurrent("should require authorization", async () => { - const response = await request(TEST_URL).post("/v0/crawl"); + const response: FirecrawlCrawlResponse = await request(TEST_URL).post("/v0/crawl"); expect(response.statusCode).toBe(401); }); it.concurrent("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) + const response: FirecrawlCrawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer invalid-api-key`) .set("Content-Type", "application/json") @@ -256,7 +257,7 @@ describe("E2E Tests for API Routes", () => { }); it.concurrent("should return a successful response with a valid API key for crawl", async () => { - const response = await request(TEST_URL) + const response: FirecrawlCrawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") @@ -269,7 +270,7 @@ describe("E2E Tests for API Routes", () => { }); it.concurrent("should return a successful response with a valid API key and valid includes option", async () => { - const crawlResponse = await request(TEST_URL) + const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") @@ -281,7 +282,7 @@ describe("E2E Tests for API Routes", () => { }, }); - let response; + let response: FirecrawlCrawlStatusResponse; let isFinished = false; while (!isFinished) { @@ -321,7 +322,7 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 seconds it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => { - const crawlResponse = await request(TEST_URL) + const crawlResponse: FirecrawlCrawlResponse = await 
request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") @@ -334,7 +335,7 @@ describe("E2E Tests for API Routes", () => { }); let isFinished = false; - let response; + let response: FirecrawlCrawlStatusResponse; while (!isFinished) { response = await request(TEST_URL) @@ -350,7 +351,7 @@ describe("E2E Tests for API Routes", () => { } } - const completedResponse = response; + const completedResponse: FirecrawlCrawlStatusResponse = response; const urls = completedResponse.body.data.map( (item: any) => item.metadata?.sourceURL @@ -362,7 +363,7 @@ describe("E2E Tests for API Routes", () => { }, 90000); // 90 seconds it.concurrent("should return a successful response with a valid API key and limit to 3", async () => { - const crawlResponse = await request(TEST_URL) + const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") @@ -372,7 +373,7 @@ describe("E2E Tests for API Routes", () => { }); let isFinished = false; - let response; + let response: FirecrawlCrawlStatusResponse; while (!isFinished) { response = await request(TEST_URL) @@ -388,7 +389,7 @@ describe("E2E Tests for API Routes", () => { } } - const completedResponse = response; + const completedResponse: FirecrawlCrawlStatusResponse = response; expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); @@ -404,7 +405,7 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 seconds it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { - const crawlResponse = await request(TEST_URL) + const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") @@ -414,7 +415,7 @@ describe("E2E Tests for API Routes", () => { }); expect(crawlResponse.statusCode).toBe(200); - const response = await request(TEST_URL) + const response: FirecrawlCrawlStatusResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(200); @@ -432,7 +433,7 @@ describe("E2E Tests for API Routes", () => { await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again } } - const completedResponse = await request(TEST_URL) + const completedResponse: FirecrawlCrawlStatusResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); @@ -461,12 +462,12 @@ describe("E2E Tests for API Routes", () => { describe("POST /v0/crawlWebsitePreview", () => { it.concurrent("should require authorization", async () => { - const response = await request(TEST_URL).post("/v0/crawlWebsitePreview"); + const response: FirecrawlCrawlResponse = await request(TEST_URL).post("/v0/crawlWebsitePreview"); expect(response.statusCode).toBe(401); }); it.concurrent("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) + const response: FirecrawlCrawlResponse = await request(TEST_URL) .post("/v0/crawlWebsitePreview") .set("Authorization", `Bearer invalid-api-key`) .set("Content-Type", "application/json") @@ -475,7 +476,7 @@ describe("E2E Tests for API Routes", () => 
{ }); it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => { - const response = await request(TEST_URL) + const response: FirecrawlCrawlResponse = await request(TEST_URL) .post("/v0/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 0d5be01..971cc18 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -1,4 +1,4 @@ -import { ExtractorOptions } from "./lib/entities"; +import { ExtractorOptions, Document } from "./lib/entities"; export interface CrawlResult { source: string; @@ -43,6 +43,34 @@ export interface FirecrawlJob { num_tokens?: number, } +export interface FirecrawlScrapeResponse { + statusCode: number; + body: { + status: string; + data: Document; + }; + error?: string; +} + +export interface FirecrawlCrawlResponse { + statusCode: number; + body: { + status: string; + jobId: string; + + }; + error?: string; +} + +export interface FirecrawlCrawlStatusResponse { + statusCode: number; + body: { + status: string; + data: Document[]; + }; + error?: string; +} + export enum RateLimiterMode { Crawl = "crawl", CrawlStatus = "crawlStatus", From 3c1af0aa338a58e8ffc68f7c062dd38d0d97a1f5 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 18 Jun 2024 11:03:57 -0300 Subject: [PATCH 05/12] Update ci.yml --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 049aeaf..b2e42e4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,6 +27,7 @@ env: TEST_API_KEY: ${{ secrets.TEST_API_KEY }} HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }} HDX_NODE_BETA_MODE: 1 + FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }} jobs: From 6e32522fa2283a8f30635d60c98a44eaee19c69d Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 18 Jun 2024 11:43:06 -0300 Subject: [PATCH 06/12] Improvements on response document types --- apps/js-sdk/example.ts | 21 ++-- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 120 ++++++++++++++++------ apps/js-sdk/firecrawl/types/index.d.ts | 131 ++++++++++++++++++------- 4 files changed, 198 insertions(+), 76 deletions(-) diff --git a/apps/js-sdk/example.ts b/apps/js-sdk/example.ts index 9fa823a..f314c08 100644 --- a/apps/js-sdk/example.ts +++ b/apps/js-sdk/example.ts @@ -1,11 +1,14 @@ -import FirecrawlApp, { JobStatusResponse } from '@mendable/firecrawl-js'; +import FirecrawlApp, { JobStatusResponse } from './firecrawl/src/index' //'@mendable/firecrawl-js'; import { z } from "zod"; const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"}); // Scrape a website: const scrapeResult = await app.scrapeUrl('firecrawl.dev'); -console.log(scrapeResult.data.content) + +if (scrapeResult.data) { + console.log(scrapeResult.data.content) +} // Crawl a website: const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false); @@ -23,12 +26,13 @@ while (true) { await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second } -console.log(job.data[0].content); +if (job.data) { + console.log(job.data[0].content); +} // Search for a query: const query = 'what is mendable?' 
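// search() resolves to a SearchResponse whose data, when present, is an array of FirecrawlDocument objects.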
const searchResult = await app.search(query) -console.log(searchResult) // LLM Extraction: // Define schema to extract contents into using zod schema @@ -50,7 +54,9 @@ let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { extractorOptions: { extractionSchema: zodSchema }, }); -console.log(llmExtractionResult.data.llm_extraction); +if (llmExtractionResult.data) { + console.log(llmExtractionResult.data.llm_extraction); +} // Define schema to extract contents into using json schema const jsonSchema = { @@ -80,4 +86,7 @@ llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { extractorOptions: { extractionSchema: jsonSchema }, }); -console.log(llmExtractionResult.data.llm_extraction); \ No newline at end of file +if (llmExtractionResult.data) { + console.log(llmExtractionResult.data.llm_extraction); +} + diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 4ab793c..15d8034 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.26", + "version": "0.0.27", "description": "JavaScript SDK for Firecrawl API", "main": "build/index.js", "types": "types/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index f884125..5028a20 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -10,56 +10,112 @@ export interface FirecrawlAppConfig { } /** - * Generic parameter interface. + * Metadata for a Firecrawl document. */ -export interface Params { +export interface FirecrawlDocumentMetadata { + title?: string; + description?: string; + language?: string; + keywords?: string; + robots?: string; + ogTitle?: string; + ogDescription?: string; + ogUrl?: string; + ogImage?: string; + ogAudio?: string; + ogDeterminer?: string; + ogLocale?: string; + ogLocaleAlternate?: string[]; + ogSiteName?: string; + ogVideo?: string; + dctermsCreated?: string; + dcDateCreated?: string; + dcDate?: string; + dctermsType?: string; + dcType?: string; + dctermsAudience?: string; + dctermsSubject?: string; + dcSubject?: string; + dcDescription?: string; + dctermsKeywords?: string; + modifiedTime?: string; + publishedTime?: string; + articleTag?: string; + articleSection?: string; + sourceURL?: string; + pageStatusCode?: number; + pageError?: string; [key: string]: any; - extractorOptions?: { - extractionSchema: z.ZodSchema | any; - mode?: "llm-extraction"; - extractionPrompt?: string; - }; +} + +/** + * Document interface for Firecrawl. + */ +export interface FirecrawlDocument { + id?: string; + url?: string; + content: string; + markdown?: string; + html?: string; + llm_extraction?: Record; + createdAt?: Date; + updatedAt?: Date; + type?: string; + metadata: FirecrawlDocumentMetadata; + childrenLinks?: string[]; + provider?: string; + warning?: string; + + index?: number; } /** * Response interface for scraping operations. */ export interface ScrapeResponse { - success: boolean; - data?: any; - error?: string; + success: boolean; + data?: FirecrawlDocument; + error?: string; } - /** - * Response interface for searching operations. - */ +* Response interface for searching operations. +*/ export interface SearchResponse { - success: boolean; - data?: any; - error?: string; + success: boolean; + data?: FirecrawlDocument[]; + error?: string; } /** - * Response interface for crawling operations. - */ +* Response interface for crawling operations. 
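+* jobId identifies the crawl job and can be passed to checkCrawlStatus later.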
+*/ export interface CrawlResponse { - success: boolean; - jobId?: string; - data?: any; - error?: string; + success: boolean; + jobId?: string; + data?: FirecrawlDocument[]; + error?: string; } - /** - * Response interface for job status checks. - */ +* Response interface for job status checks. +*/ export interface JobStatusResponse { - success: boolean; - status: string; - jobId?: string; - data?: any; - partial_data?: any, - error?: string; + success: boolean; + status: string; + jobId?: string; + data?: FirecrawlDocument[]; + partial_data?: FirecrawlDocument[]; + error?: string; +} +/** + * Generic parameter interface. + */ +export interface Params { + [key: string]: any; + extractorOptions?: { + extractionSchema: z.ZodSchema | any; + mode?: "llm-extraction"; + extractionPrompt?: string; + }; } - /** * Main class for interacting with the Firecrawl API. */ diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts index 52a7d1e..1a7cd1f 100644 --- a/apps/js-sdk/firecrawl/types/index.d.ts +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -8,8 +8,101 @@ export interface FirecrawlAppConfig { apiUrl?: string | null; } /** - * Generic parameter interface. + * Metadata for a Firecrawl document. */ +export interface FirecrawlDocumentMetadata { + title?: string; + description?: string; + language?: string; + keywords?: string; + robots?: string; + ogTitle?: string; + ogDescription?: string; + ogUrl?: string; + ogImage?: string; + ogAudio?: string; + ogDeterminer?: string; + ogLocale?: string; + ogLocaleAlternate?: string[]; + ogSiteName?: string; + ogVideo?: string; + dctermsCreated?: string; + dcDateCreated?: string; + dcDate?: string; + dctermsType?: string; + dcType?: string; + dctermsAudience?: string; + dctermsSubject?: string; + dcSubject?: string; + dcDescription?: string; + dctermsKeywords?: string; + modifiedTime?: string; + publishedTime?: string; + articleTag?: string; + articleSection?: string; + sourceURL?: string; + pageStatusCode?: number; + pageError?: string; + [key: string]: any; +} +/** + * Document interface for Firecrawl. + */ +export interface FirecrawlDocument { + id?: string; + url?: string; + content: string; + markdown?: string; + html?: string; + llm_extraction?: Record; + createdAt?: Date; + updatedAt?: Date; + type?: string; + metadata: FirecrawlDocumentMetadata; + childrenLinks?: string[]; + provider?: string; + warning?: string; + index?: number; +} +/** + * Response interface for scraping operations. + */ +export interface ScrapeResponse { + success: boolean; + data?: FirecrawlDocument; + error?: string; +} +/** +* Response interface for searching operations. +*/ +export interface SearchResponse { + success: boolean; + data?: FirecrawlDocument[]; + error?: string; +} +/** +* Response interface for crawling operations. +*/ +export interface CrawlResponse { + success: boolean; + jobId?: string; + data?: FirecrawlDocument[]; + error?: string; +} +/** +* Response interface for job status checks. +*/ +export interface JobStatusResponse { + success: boolean; + status: string; + jobId?: string; + data?: FirecrawlDocument[]; + partial_data?: FirecrawlDocument[]; + error?: string; +} +/** + * Generic parameter interface. + */ export interface Params { [key: string]: any; extractorOptions?: { @@ -18,42 +111,6 @@ export interface Params { extractionPrompt?: string; }; } -/** - * Response interface for scraping operations. 
- */ -export interface ScrapeResponse { - success: boolean; - data?: any; - error?: string; -} -/** - * Response interface for searching operations. - */ -export interface SearchResponse { - success: boolean; - data?: any; - error?: string; -} -/** - * Response interface for crawling operations. - */ -export interface CrawlResponse { - success: boolean; - jobId?: string; - data?: any; - error?: string; -} -/** - * Response interface for job status checks. - */ -export interface JobStatusResponse { - success: boolean; - status: string; - jobId?: string; - data?: any; - partial_data?: any; - error?: string; -} /** * Main class for interacting with the Firecrawl API. */ From c54e797eb15850c89f3b0de3ee1d4f4dca394970 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 18 Jun 2024 11:51:28 -0300 Subject: [PATCH 07/12] =?UTF-8?q?(=E2=95=AF=C2=B0=E2=96=A1=C2=B0)=E2=95=AF?= =?UTF-8?q?=EF=B8=B5=20=E2=94=BB=E2=94=81=E2=94=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/api/package.json | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/package.json b/apps/api/package.json index e114a0f..407f4c5 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -13,6 +13,7 @@ "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'", "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'", + "workers": "nodemon --exec ts-node src/services/queue-worker.ts", "worker:production": "node dist/src/services/queue-worker.js", "mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest", "mongo-docker-console": "docker exec -it mongodb mongosh", From 727e5de8c562de537d8b3391a0fa8c9f5cd109e2 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 18 Jun 2024 11:54:10 -0300 Subject: [PATCH 08/12] Update index.test.ts --- .../src/__tests__/e2e_withAuth/index.test.ts | 42 ------------------- 1 file changed, 42 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 54be495..3044938 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -361,48 +361,6 @@ describe("E2E Tests for API Routes", () => { expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy(); }); }, 90000); // 90 seconds - - it.concurrent("should return a successful response with a valid API key and limit to 3", async () => { - const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - crawlerOptions: { limit: 3 }, - }); - - let isFinished = false; - let response: FirecrawlCrawlStatusResponse; - - while (!isFinished) { - response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(response.statusCode).toBe(200); - 
expect(response.body).toHaveProperty("status"); - isFinished = response.body.status === "completed"; - - if (!isFinished) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - - const completedResponse: FirecrawlCrawlStatusResponse = response; - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data.length).toBe(3); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("Mendable"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - }, 60000); // 60 seconds it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL) From 90a807c54701b5ae062d3ff220d75b6509da5be1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 18 Jun 2024 12:56:13 -0400 Subject: [PATCH 09/12] Update index.ts --- apps/js-sdk/firecrawl/src/index.ts | 618 +++++++++++++++-------------- 1 file changed, 311 insertions(+), 307 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 4cb9d57..5ee9043 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -73,335 +73,339 @@ export interface FirecrawlDocument { * Response interface for scraping operations. */ export interface ScrapeResponse { - success: boolean; - data?: FirecrawlDocument; - error?: string; + success: boolean; + data?: FirecrawlDocument; + error?: string; } /** -* Response interface for searching operations. -*/ + * Response interface for searching operations. + */ export interface SearchResponse { - success: boolean; - data?: FirecrawlDocument[]; - error?: string; + success: boolean; + data?: FirecrawlDocument[]; + error?: string; } /** -* Response interface for crawling operations. -*/ + * Response interface for crawling operations. + */ export interface CrawlResponse { - success: boolean; - jobId?: string; - data?: FirecrawlDocument[]; - error?: string; + success: boolean; + jobId?: string; + data?: FirecrawlDocument[]; + error?: string; } /** -* Response interface for job status checks. -*/ + * Response interface for job status checks. + */ export interface JobStatusResponse { - success: boolean; - status: string; - jobId?: string; - data?: FirecrawlDocument[]; - partial_data?: FirecrawlDocument[]; - error?: string; + success: boolean; + status: string; + jobId?: string; + data?: FirecrawlDocument[]; + partial_data?: FirecrawlDocument[]; + error?: string; } /** - * Generic parameter interface. - */ + * Generic parameter interface. + */ export interface Params { - [key: string]: any; - extractorOptions?: { - extractionSchema: z.ZodSchema | any; - mode?: "llm-extraction"; - extractionPrompt?: string; - }; + [key: string]: any; + extractorOptions?: { + extractionSchema: z.ZodSchema | any; + mode?: "llm-extraction"; + extractionPrompt?: string; + }; } /** * Main class for interacting with the Firecrawl API. 
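 * Requires an apiKey; apiUrl is optional and defaults to https://api.firecrawl.dev.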
*/ export default class FirecrawlApp { - private apiKey: string; - private apiUrl: string; + private apiKey: string; + private apiUrl: string; - /** - * Initializes a new instance of the FirecrawlApp class. - * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. - */ - constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) { - this.apiKey = apiKey || ""; - this.apiUrl = apiUrl || "https://api.firecrawl.dev" - if (!this.apiKey) { - throw new Error("No API key provided"); - } + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ + constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) { + this.apiKey = apiKey || ""; + this.apiUrl = apiUrl || "https://api.firecrawl.dev"; + if (!this.apiKey) { + throw new Error("No API key provided"); } + } - /** - * Scrapes a URL using the Firecrawl API. - * @param {string} url - The URL to scrape. - * @param {Params | null} params - Additional parameters for the scrape request. - * @returns {Promise} The response from the scrape operation. - */ - async scrapeUrl( - url: string, - params: Params | null = null - ): Promise { - const headers: AxiosRequestHeaders = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - } as AxiosRequestHeaders; - let jsonData: Params = { url, ...params }; - if (params?.extractorOptions?.extractionSchema) { - let schema = params.extractorOptions.extractionSchema; - // Check if schema is an instance of ZodSchema to correctly identify Zod schemas - if (schema instanceof z.ZodSchema) { - schema = zodToJsonSchema(schema); - } - jsonData = { - ...jsonData, - extractorOptions: { - ...params.extractorOptions, - extractionSchema: schema, - mode: params.extractorOptions.mode || "llm-extraction", - }, - }; - } - try { - const response: AxiosResponse = await axios.post( - this.apiUrl + "/v0/scrape", - jsonData, - { headers }, - ); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } else { - throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); - } - } else { - this.handleError(response, "scrape URL"); - } - } catch (error: any) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ + async scrapeUrl( + url: string, + params: Params | null = null + ): Promise { + const headers: AxiosRequestHeaders = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + } as AxiosRequestHeaders; + let jsonData: Params = { url, ...params }; + if (params?.extractorOptions?.extractionSchema) { + let schema = params.extractorOptions.extractionSchema; + // Check if schema is an instance of ZodSchema to correctly identify Zod schemas + if (schema instanceof z.ZodSchema) { + schema = zodToJsonSchema(schema); + } + jsonData = { + ...jsonData, + extractorOptions: { + ...params.extractorOptions, + extractionSchema: schema, + mode: params.extractorOptions.mode || "llm-extraction", + }, + }; } - - /** - * Searches for a query using the Firecrawl API. - * @param {string} query - The query to search for. 
- * @param {Params | null} params - Additional parameters for the search request. - * @returns {Promise} The response from the search operation. - */ - async search( - query: string, - params: Params | null = null - ): Promise { - const headers: AxiosRequestHeaders = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - } as AxiosRequestHeaders; - let jsonData: Params = { query }; - if (params) { - jsonData = { ...jsonData, ...params }; - } - try { - const response: AxiosResponse = await axios.post( - this.apiUrl + "/v0/search", - jsonData, - { headers } - ); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } else { - throw new Error(`Failed to search. Error: ${responseData.error}`); - } - } else { - this.handleError(response, "search"); - } - } catch (error: any) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - } - - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param {string} url - The URL to crawl. - * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. - * @returns {Promise} The response from the crawl operation. - */ - async crawlUrl( - url: string, - params: Params | null = null, - waitUntilDone: boolean = true, - pollInterval: number = 2, - idempotencyKey?: string - ): Promise { - const headers = this.prepareHeaders(idempotencyKey); - let jsonData: Params = { url }; - if (params) { - jsonData = { ...jsonData, ...params }; - } - try { - const response: AxiosResponse = await this.postRequest( - this.apiUrl + "/v0/crawl", - jsonData, - headers - ); - if (response.status === 200) { - const jobId: string = response.data.jobId; - if (waitUntilDone) { - return this.monitorJobStatus(jobId, headers, pollInterval); - } else { - return { success: true, jobId }; - } - } else { - this.handleError(response, "start crawl job"); - } - } catch (error: any) { - console.log(error); - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - } - - /** - * Checks the status of a crawl job using the Firecrawl API. - * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. - */ - async checkCrawlStatus(jobId: string): Promise { - const headers: AxiosRequestHeaders = this.prepareHeaders(); - try { - const response: AxiosResponse = await this.getRequest( - this.apiUrl + `/v0/crawl/status/${jobId}`, - headers - ); - if (response.status === 200) { - return { - success: true, - status: response.data.status, - data: response.data.data, - partial_data: !response.data.data ? response.data.partial_data : undefined, - }; - } else { - this.handleError(response, "check crawl status"); - } - } catch (error: any) { - throw new Error(error.message); - } - return { - success: false, - status: "unknown", - error: "Internal server error.", - }; - } - - /** - * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. - */ - prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders { - return { - 'Content-Type': 'application/json', - 'Authorization': `Bearer ${this.apiKey}`, - ...(idempotencyKey ? 
{ 'x-idempotency-key': idempotencyKey } : {}),
-    } as AxiosRequestHeaders & { 'x-idempotency-key'?: string };
-  }
-
-  /**
-   * Sends a POST request to the specified URL.
-   * @param {string} url - The URL to send the request to.
-   * @param {Params} data - The data to send in the request.
-   * @param {AxiosRequestHeaders} headers - The headers for the request.
-   * @returns {Promise<AxiosResponse>} The response from the POST request.
-   */
-  postRequest(
-    url: string,
-    data: Params,
-    headers: AxiosRequestHeaders
-  ): Promise<AxiosResponse> {
-    return axios.post(url, data, { headers });
-  }
-
-  /**
-   * Sends a GET request to the specified URL.
-   * @param {string} url - The URL to send the request to.
-   * @param {AxiosRequestHeaders} headers - The headers for the request.
-   * @returns {Promise<AxiosResponse>} The response from the GET request.
-   */
-  getRequest(
-    url: string,
-    headers: AxiosRequestHeaders
-  ): Promise<AxiosResponse> {
-    return axios.get(url, { headers });
-  }
-
-  /**
-   * Monitors the status of a crawl job until completion or failure.
-   * @param {string} jobId - The job ID of the crawl operation.
-   * @param {AxiosRequestHeaders} headers - The headers for the request.
-   * @param {number} timeout - Timeout in seconds for job status checks.
-   * @returns {Promise<any>} The final job status or data.
-   */
-  async monitorJobStatus(
-    jobId: string,
-    headers: AxiosRequestHeaders,
-    checkInterval: number
-  ): Promise<any> {
-    while (true) {
-      const statusResponse: AxiosResponse = await this.getRequest(
-        this.apiUrl + `/v0/crawl/status/${jobId}`,
-        headers
-      );
-      if (statusResponse.status === 200) {
-        const statusData = statusResponse.data;
-        if (statusData.status === "completed") {
-          if ("data" in statusData) {
-            return statusData.data;
-          } else {
-            throw new Error("Crawl job completed but no data was returned");
-          }
-        } else if (
-          ["active", "paused", "pending", "queued"].includes(statusData.status)
-        ) {
-          if (checkInterval < 2) {
-            checkInterval = 2;
-          }
-          await new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified interval before checking again
-        } else {
-          throw new Error(
-            `Crawl job failed or was stopped. Status: ${statusData.status}`
-          );
-        }
-      } else {
-        this.handleError(statusResponse, "check crawl status");
-      }
-    }
-  }
-
-  /**
-   * Handles errors from API responses.
-   * @param {AxiosResponse} response - The response from the API.
-   * @param {string} action - The action being performed when the error occurred.
-   */
-  handleError(response: AxiosResponse, action: string): void {
-    if ([402, 408, 409, 500].includes(response.status)) {
-      const errorMessage: string =
-        response.data.error || "Unknown error occurred";
-      throw new Error(
-        `Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`
-      );
+    try {
+      const response: AxiosResponse = await axios.post(
+        this.apiUrl + "/v0/scrape",
+        jsonData,
+        { headers }
+      );
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return responseData;
         } else {
-        throw new Error(
-          `Unexpected error occurred while trying to ${action}. Status code: ${response.status}`
-        );
+          throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
         }
+      } else {
+        this.handleError(response, "scrape URL");
+      }
+    } catch (error: any) {
+      throw new Error(error.message);
     }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Searches for a query using the Firecrawl API.
+   * @param {string} query - The query to search for.
+   * @param {Params | null} params - Additional parameters for the search request.
+   * @returns {Promise<SearchResponse>} The response from the search operation.
+   */
+  async search(
+    query: string,
+    params: Params | null = null
+  ): Promise<SearchResponse> {
+    const headers: AxiosRequestHeaders = {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`,
+    } as AxiosRequestHeaders;
+    let jsonData: Params = { query };
+    if (params) {
+      jsonData = { ...jsonData, ...params };
+    }
+    try {
+      const response: AxiosResponse = await axios.post(
+        this.apiUrl + "/v0/search",
+        jsonData,
+        { headers }
+      );
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return responseData;
+        } else {
+          throw new Error(`Failed to search. Error: ${responseData.error}`);
+        }
+      } else {
+        this.handleError(response, "search");
+      }
+    } catch (error: any) {
+      throw new Error(error.message);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Initiates a crawl job for a URL using the Firecrawl API.
+   * @param {string} url - The URL to crawl.
+   * @param {Params | null} params - Additional parameters for the crawl request.
+   * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
+   * @param {number} pollInterval - Time in seconds between job status checks.
+   * @param {string} idempotencyKey - Optional idempotency key for the request.
+   * @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
+   */
+  async crawlUrl(
+    url: string,
+    params: Params | null = null,
+    waitUntilDone: boolean = true,
+    pollInterval: number = 2,
+    idempotencyKey?: string
+  ): Promise<CrawlResponse | any> {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData: Params = { url };
+    if (params) {
+      jsonData = { ...jsonData, ...params };
+    }
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + "/v0/crawl",
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        const jobId: string = response.data.jobId;
+        if (waitUntilDone) {
+          return this.monitorJobStatus(jobId, headers, pollInterval);
+        } else {
+          return { success: true, jobId };
+        }
+      } else {
+        this.handleError(response, "start crawl job");
+      }
+    } catch (error: any) {
+      console.log(error);
+      throw new Error(error.message);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Checks the status of a crawl job using the Firecrawl API.
+   * @param {string} jobId - The job ID of the crawl operation.
+   * @returns {Promise<JobStatusResponse>} The response containing the job status.
+   */
+  async checkCrawlStatus(jobId: string): Promise<JobStatusResponse> {
+    const headers: AxiosRequestHeaders = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.getRequest(
+        this.apiUrl + `/v0/crawl/status/${jobId}`,
+        headers
+      );
+      if (response.status === 200) {
+        return {
+          success: true,
+          status: response.data.status,
+          data: response.data.data,
+          partial_data: !response.data.data
+            ? response.data.partial_data
+            : undefined,
+        };
+      } else {
+        this.handleError(response, "check crawl status");
+      }
+    } catch (error: any) {
+      throw new Error(error.message);
+    }
+    return {
+      success: false,
+      status: "unknown",
+      error: "Internal server error.",
+    };
+  }
+
+  /**
+   * Prepares the headers for an API request.
+   * @param {string} [idempotencyKey] - Optional idempotency key for the request.
+   * @returns {AxiosRequestHeaders} The prepared headers.
+   */
+  prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
+    return {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`,
+      ...(idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}),
+    } as AxiosRequestHeaders & { "x-idempotency-key"?: string };
+  }
+
+  /**
+   * Sends a POST request to the specified URL.
+   * @param {string} url - The URL to send the request to.
+   * @param {Params} data - The data to send in the request.
+   * @param {AxiosRequestHeaders} headers - The headers for the request.
+   * @returns {Promise<AxiosResponse>} The response from the POST request.
+   */
+  postRequest(
+    url: string,
+    data: Params,
+    headers: AxiosRequestHeaders
+  ): Promise<AxiosResponse> {
+    return axios.post(url, data, { headers });
+  }
+
+  /**
+   * Sends a GET request to the specified URL.
+   * @param {string} url - The URL to send the request to.
+   * @param {AxiosRequestHeaders} headers - The headers for the request.
+   * @returns {Promise<AxiosResponse>} The response from the GET request.
+   */
+  getRequest(
+    url: string,
+    headers: AxiosRequestHeaders
+  ): Promise<AxiosResponse> {
+    return axios.get(url, { headers });
+  }
+
+  /**
+   * Monitors the status of a crawl job until completion or failure.
+   * @param {string} jobId - The job ID of the crawl operation.
+   * @param {AxiosRequestHeaders} headers - The headers for the request.
+   * @param {number} checkInterval - Interval in seconds between job status checks.
+   * @returns {Promise<any>} The final job status or data.
+   */
+  async monitorJobStatus(
+    jobId: string,
+    headers: AxiosRequestHeaders,
+    checkInterval: number
+  ): Promise<any> {
+    while (true) {
+      const statusResponse: AxiosResponse = await this.getRequest(
+        this.apiUrl + `/v0/crawl/status/${jobId}`,
+        headers
+      );
+      if (statusResponse.status === 200) {
+        const statusData = statusResponse.data;
+        if (statusData.status === "completed") {
+          if ("data" in statusData) {
+            return statusData.data;
+          } else {
+            throw new Error("Crawl job completed but no data was returned");
+          }
+        } else if (
+          ["active", "paused", "pending", "queued"].includes(statusData.status)
+        ) {
+          if (checkInterval < 2) {
+            checkInterval = 2;
+          }
+          await new Promise((resolve) =>
+            setTimeout(resolve, checkInterval * 1000)
+          ); // Wait for the specified interval before checking again
+        } else {
+          throw new Error(
+            `Crawl job failed or was stopped. Status: ${statusData.status}`
+          );
+        }
+      } else {
+        this.handleError(statusResponse, "check crawl status");
+      }
+    }
+  }
+
+  /**
+   * Handles errors from API responses.
+   * @param {AxiosResponse} response - The response from the API.
+   * @param {string} action - The action being performed when the error occurred.
+   */
+  handleError(response: AxiosResponse, action: string): void {
+    if ([402, 408, 409, 500].includes(response.status)) {
+      const errorMessage: string =
+        response.data.error || "Unknown error occurred";
+      throw new Error(
+        `Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`
+      );
+    } else {
+      throw new Error(
+        `Unexpected error occurred while trying to ${action}. 
Status code: ${response.status}` + ); + } + } } From 754c9fa08d19398d026fe901b8ef0dbc65842126 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 18 Jun 2024 12:58:57 -0400 Subject: [PATCH 10/12] Update package.json --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 15d8034..b162882 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.27", + "version": "0.0.28", "description": "JavaScript SDK for Firecrawl API", "main": "build/index.js", "types": "types/index.d.ts", From d0c05accf662f0b6f4c96e1bfd1e58e30bb3c21a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 18 Jun 2024 13:21:50 -0400 Subject: [PATCH 11/12] Nick: --- apps/api/src/controllers/search.ts | 1 + apps/js-sdk/firecrawl/package-lock.json | 11 +++++-- apps/js-sdk/firecrawl/package.json | 1 + .../src/__tests__/e2e_withAuth/index.test.ts | 31 ++++++++++--------- .../firecrawl/src/__tests__/index.test.ts | 2 +- 5 files changed, 28 insertions(+), 18 deletions(-) diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index b555197..8cb6d55 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -91,6 +91,7 @@ export async function searchHelper( }); const docs = await a.getDocuments(false); + if (docs.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; } diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index 7094cc9..906ab47 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -1,12 +1,12 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.26", + "version": "0.0.28", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mendable/firecrawl-js", - "version": "0.0.26", + "version": "0.0.28", "license": "MIT", "dependencies": { "axios": "^1.6.8", @@ -20,6 +20,7 @@ "@types/axios": "^0.14.0", "@types/dotenv": "^8.2.0", "@types/jest": "^29.5.12", + "@types/mocha": "^10.0.6", "@types/node": "^20.12.12", "@types/uuid": "^9.0.8", "jest": "^29.7.0", @@ -1071,6 +1072,12 @@ "pretty-format": "^29.0.0" } }, + "node_modules/@types/mocha": { + "version": "10.0.6", + "resolved": "https://registry.npmjs.org/@types/mocha/-/mocha-10.0.6.tgz", + "integrity": "sha512-dJvrYWxP/UcXm36Qn36fxhUKu8A/xMRXVT2cliFF1Z7UA9liG5Psj3ezNSZw+5puH2czDXRLcXQxf8JbJt0ejg==", + "dev": true + }, "node_modules/@types/node": { "version": "20.12.12", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.12.tgz", diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index b162882..71cf91a 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -33,6 +33,7 @@ "@types/axios": "^0.14.0", "@types/dotenv": "^8.2.0", "@types/jest": "^29.5.12", + "@types/mocha": "^10.0.6", "@types/node": "^20.12.12", "@types/uuid": "^9.0.8", "jest": "^29.7.0", diff --git a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts index 2725c23..af6aa84 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts @@ -2,6 +2,7 @@ import FirecrawlApp from '../../index'; import { v4 as uuidv4 } from 'uuid'; import dotenv from 
'dotenv';
+
 dotenv.config();
 
 const TEST_API_KEY = process.env.TEST_API_KEY;
@@ -29,14 +30,14 @@ describe('FirecrawlApp E2E Tests', () => {
     const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
     const response = await app.scrapeUrl('https://roastmywebsite.ai');
     expect(response).not.toBeNull();
-    expect(response.data.content).toContain("_Roast_");
+    expect(response.data?.content).toContain("_Roast_");
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should return successful response for valid scrape', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
     const response = await app.scrapeUrl('https://roastmywebsite.ai');
     expect(response).not.toBeNull();
-    expect(response.data.content).toContain("_Roast_");
+    expect(response.data?.content).toContain("_Roast_");
     expect(response.data).toHaveProperty('markdown');
     expect(response.data).toHaveProperty('metadata');
     expect(response.data).not.toHaveProperty('html');
@@ -46,23 +47,23 @@ describe('FirecrawlApp E2E Tests', () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
     const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } });
     expect(response).not.toBeNull();
-    expect(response.data.content).toContain("_Roast_");
-    expect(response.data.markdown).toContain("_Roast_");
-    expect(response.data.html).toContain("<h1");
+    expect(response.data?.content).toContain("_Roast_");
+    expect(response.data?.markdown).toContain("_Roast_");
+    expect(response.data?.html).toContain("<h1");
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should return successful response for valid scrape with PDF file', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
     const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf');
     expect(response).not.toBeNull();
-    expect(response.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+    expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
     const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001');
     expect(response).not.toBeNull();
-    expect(response.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+    expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should throw error for invalid API key on crawl', async () => {
@@ -112,15 +113,15 @@ describe('FirecrawlApp E2E Tests', () => {
 
     expect(statusResponse).not.toBeNull();
     expect(statusResponse.status).toBe('completed');
-    expect(statusResponse.data.length).toBeGreaterThan(0);
+    expect(statusResponse?.data?.length).toBeGreaterThan(0);
   }, 35000); // 35 seconds timeout
 
   test.concurrent('should return successful response for search', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
     const response = await app.search("test query");
     expect(response).not.toBeNull();
-    expect(response.data[0].content).toBeDefined();
-    expect(response.data.length).toBeGreaterThan(2);
+    expect(response?.data?.[0]?.content).toBeDefined();
+    expect(response?.data?.length).toBeGreaterThan(2);
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should throw error for invalid API key on search', async () => {
@@ -146,10 +147,10 @@ describe('FirecrawlApp E2E Tests', () => {
       }
     });
     expect(response).not.toBeNull();
-    expect(response.data.llm_extraction).toBeDefined();
-    const llmExtraction = response.data.llm_extraction;
-    expect(llmExtraction.company_mission).toBeDefined();
-    expect(typeof llmExtraction.supports_sso).toBe('boolean');
-    expect(typeof llmExtraction.is_open_source).toBe('boolean');
+    expect(response.data?.llm_extraction).toBeDefined();
+    const llmExtraction = response.data?.llm_extraction;
+    expect(llmExtraction?.company_mission).toBeDefined();
+    expect(typeof llmExtraction?.supports_sso).toBe('boolean');
+    expect(typeof llmExtraction?.is_open_source).toBe('boolean');
   }, 30000); // 30 seconds timeout
 });
diff --git a/apps/js-sdk/firecrawl/src/__tests__/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/index.test.ts
index 8c5ed5a..dcda96f 100644
--- a/apps/js-sdk/firecrawl/src/__tests__/index.test.ts
+++ b/apps/js-sdk/firecrawl/src/__tests__/index.test.ts
@@ -43,6 +43,6 @@ describe('the firecrawl JS SDK', () => {
       expect.objectContaining({ headers: expect.objectContaining({'Authorization': `Bearer ${apiKey}`}) }),
     )
     expect(scrapedData.success).toBe(true);
-    expect(scrapedData.data.metadata.title).toEqual('Mendable');
+    expect(scrapedData?.data?.metadata.title).toEqual('Mendable');
   });
 })
\ No newline at end of file

From 8db8997daf8cf6fe688c5d3a7d4ec68f60bfb49e Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Tue, 18 Jun 2024 13:34:44 -0400
Subject: [PATCH 12/12] Nick: test suite + fly

---
 .github/workflows/fly-direct.yml | 2 +-
 .github/workflows/fly.yml        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/fly-direct.yml b/.github/workflows/fly-direct.yml
index a049143..944e399 100644
--- a/.github/workflows/fly-direct.yml
+++ b/.github/workflows/fly-direct.yml
@@ -1,7 +1,7 @@
 name: Fly Deploy Direct
 on:
   schedule:
-    - cron: '0 * * * *'
+    - cron: '0 */2 * * *'
 
 env:
   ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml
index 84017b1..627409e 100644
--- a/.github/workflows/fly.yml
+++ b/.github/workflows/fly.yml
@@ -93,7 +93,7 @@ jobs:
         working-directory: ./apps/test-suite
       - name: Run E2E tests
         run: |
-          npm run test
+          npm run test:suite
         working-directory: ./apps/test-suite
 
   python-sdk-tests:
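
---
Usage note: the crawl flow changed in the SDK patches above can be driven without
blocking inside monitorJobStatus. Below is a minimal sketch, assuming a published
@mendable/firecrawl-js install; the API key, target URL, crawlerOptions.limit, and
the array shape of partial_data are placeholder assumptions for illustration, not
values confirmed by this patch series.

import FirecrawlApp from '@mendable/firecrawl-js';

const app = new FirecrawlApp({ apiKey: 'fc-YOUR-KEY' }); // placeholder key

async function run() {
  // waitUntilDone = false: crawlUrl returns { success: true, jobId } immediately
  // instead of polling through monitorJobStatus.
  const started = await app.crawlUrl(
    'https://example.com',            // placeholder URL
    { crawlerOptions: { limit: 5 } }, // assumed option shape
    false
  );

  while (true) {
    const status = await app.checkCrawlStatus(started.jobId);
    if (status.status === 'completed') {
      console.log(status.data);
      break;
    }
    // Per the checkCrawlStatus diff, partial_data is only populated while
    // data is still empty, i.e. while the job is in flight.
    if (status.partial_data) {
      console.log(`pages so far: ${status.partial_data.length}`);
    }
    // monitorJobStatus clamps its interval to a 2 second minimum; mirror that here.
    await new Promise((resolve) => setTimeout(resolve, 2000));
  }
}

run().catch(console.error);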
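A companion sketch of the error paths, reusing the app instance above: handleError
throws for status codes 402, 408, 409, and 500 using the API's error field, and a
generic message otherwise, and scrapeUrl's catch block rethrows with the same
message, so callers see a plain Error. The messages in the comments are
illustrative shapes, not actual API output.

try {
  const result = await app.scrapeUrl('https://example.com'); // placeholder URL
  if (result.success) {
    console.log(result.data?.markdown);
  }
} catch (err: any) {
  // e.g. "Failed to scrape URL. Status code: 402. Error: ..." (error field from the API)
  // or   "Unexpected error occurred while trying to scrape URL. Status code: 404"
  console.error(err.message);
}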