diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b9a5b79..69a8a24 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -54,5 +54,5 @@ jobs:
         id: start_workers
       - name: Run E2E tests
         run: |
-          npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false
+          npm run test:prod
         working-directory: ./apps/api
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index cbfb076..9029012 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@ dump.rdb
 /mongo-data
 apps/js-sdk/node_modules/
+
+apps/api/.env.local
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 224eb57..733c787 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,8 +1,114 @@
-# Contributing
+# Contributors' guide
-We love contributions! Our contribution guide will be coming soon!
+Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to get the project running locally, so you can run it on your own (and contribute).
-
+If you're contributing, note that the process is similar to other open source repos, i.e. fork Firecrawl, make changes, run tests, and open a PR. If you have any questions or would like help getting on board, reach out to hello@mendable.ai or submit an issue!
+
+## Running the project locally
+
+First, install the dependencies:
+1. node.js [instructions](https://nodejs.org/en/learn/getting-started/how-to-install-nodejs)
+2. pnpm [instructions](https://pnpm.io/installation)
+3. redis [instructions](https://redis.io/docs/latest/operate/oss_and_stack/install/install-redis/)
+
+
+Set environment variables in a .env file in the /apps/api/ directory; you can copy over the template in .env.example.
+
+To start, we won't set up authentication or any optional sub-services (PDF parsing, JS blocking support, AI features).
+
+.env:
+```
+# ===== Required ENVS ======
+NUM_WORKERS_PER_QUEUE=8
+PORT=3002
+HOST=0.0.0.0
+REDIS_URL=redis://localhost:6379
+
+## To turn on DB authentication, you need to set up Supabase.
+USE_DB_AUTHENTICATION=false
+
+# ===== Optional ENVS ======
+
+# Supabase Setup (used to support DB authentication, advanced logging, etc.)
+SUPABASE_ANON_TOKEN=
+SUPABASE_URL=
+SUPABASE_SERVICE_TOKEN=
+
+# Other Optionals
+TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
+SCRAPING_BEE_API_KEY= # set if you'd like to use ScrapingBee to handle JS blocking
+OPENAI_API_KEY= # add for LLM-dependent features (image alt generation, etc.)
+BULL_AUTH_KEY= #
+LOGTAIL_KEY= # use if you're configuring basic logging with Logtail
+PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a Playwright fallback
+LLAMAPARSE_API_KEY= # set if you have a LlamaParse key you'd like to use to parse PDFs
+
+```
+
+### Installing dependencies
+
+First, install the dependencies using pnpm.
+
+```bash
+pnpm install
+```
+
+### Running the project
+
+You're going to need to open 3 terminals.
+
+### Terminal 1 - setting up redis
+
+Run this command anywhere within your project:
+
+```bash
+redis-server
+```
+
+### Terminal 2 - setting up workers
+
+Now, navigate to the apps/api/ directory and run:
+```bash
+pnpm run workers
+```
+
+This will start the workers that are responsible for processing crawl jobs.
+
+### Terminal 3 - setting up the main server
+
+
+Navigate to the apps/api/ directory (if you don't have pnpm installed already, install it here: https://pnpm.io/installation).
+Next, run your server with:
+
+```bash
+pnpm run start
+```
+
+### Sending our first request
+
+Alright, now let's send our first request.
+
+```curl
+curl -X GET http://localhost:3002/test
+```
+This should return the response Hello, world!
+
+
+If you'd like to test the crawl endpoint, you can run this:
+
+```curl
+curl -X POST http://localhost:3002/v0/crawl \
+    -H 'Content-Type: application/json' \
+    -d '{
+      "url": "https://mendable.ai"
+    }'
+```
+
+## Tests
+
+The best way to make sure everything works is to run the tests. If you'd like to run them without authentication, use `npm run test:local-no-auth`.
+
+If you'd like to run the tests with authentication, run `npm run test:prod`.
+
diff --git a/README.md b/README.md
index 56f8c5c..f6b67b7 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ We provide an easy to use API with our hosted version. You can find the playgrou
 - [ ] LangchainJS - Coming Soon
-Self-host. To self-host refer to guide [here](https://github.com/mendableai/firecrawl/blob/main/SELF_HOST.md).
+To run locally, refer to guide [here](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md).
 ### API Key
diff --git a/SELF_HOST.md b/SELF_HOST.md
index ba0ae23..8d1d490 100644
--- a/SELF_HOST.md
+++ b/SELF_HOST.md
@@ -1,6 +1,6 @@
 # Self-hosting Firecrawl
-Guide coming soon.
+Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally.
 *This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not ready for full self-host yet - we're working on it*
diff --git a/apps/api/.env.example b/apps/api/.env.example
new file mode 100644
index 0000000..34e24b1
--- /dev/null
+++ b/apps/api/.env.example
@@ -0,0 +1,24 @@
+# ===== Required ENVS ======
+NUM_WORKERS_PER_QUEUE=8
+PORT=3002
+HOST=0.0.0.0
+REDIS_URL=redis://localhost:6379
+
+## To turn on DB authentication, you need to set up Supabase.
+USE_DB_AUTHENTICATION=true
+
+# ===== Optional ENVS ======
+
+# Supabase Setup (used to support DB authentication, advanced logging, etc.)
+SUPABASE_ANON_TOKEN=
+SUPABASE_URL=
+SUPABASE_SERVICE_TOKEN=
+
+# Other Optionals
+TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
+SCRAPING_BEE_API_KEY= # set if you'd like to use ScrapingBee to handle JS blocking
+OPENAI_API_KEY= # add for LLM-dependent features (image alt generation, etc.)
+BULL_AUTH_KEY= # +LOGTAIL_KEY= # Use if you're configuring basic logging with logtail +PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback +LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs \ No newline at end of file diff --git a/apps/api/.env.local b/apps/api/.env.local deleted file mode 100644 index 6c58f19..0000000 --- a/apps/api/.env.local +++ /dev/null @@ -1,15 +0,0 @@ -ENV= -NUM_WORKERS_PER_QUEUE=8 -PORT= -HOST= -SUPABASE_ANON_TOKEN= -SUPABASE_URL= -SUPABASE_SERVICE_TOKEN= -REDIS_URL= -SCRAPING_BEE_API_KEY= -OPENAI_API_KEY= -BULL_AUTH_KEY= -LOGTAIL_KEY= -PLAYWRIGHT_MICROSERVICE_URL= -LLAMAPARSE_API_KEY= -TEST_API_KEY= \ No newline at end of file diff --git a/apps/api/package.json b/apps/api/package.json index cbce4be..0b533f9 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -11,6 +11,8 @@ "start:dev": "nodemon --exec ts-node src/index.ts", "build": "tsc", "test": "jest --verbose", + "test:local-no-auth":"npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'", + "test:prod":"npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", "workers": "nodemon --exec ts-node src/services/queue-worker.ts", "worker:production": "node dist/src/services/queue-worker.js", "mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest", diff --git a/apps/api/src/__tests__/e2e/index.test.ts b/apps/api/src/__tests__/e2e/index.test.ts deleted file mode 100644 index 554453b..0000000 --- a/apps/api/src/__tests__/e2e/index.test.ts +++ /dev/null @@ -1,179 +0,0 @@ -import request from 'supertest'; -import { app } from '../../index'; -import dotenv from 'dotenv'; - -dotenv.config(); -const TEST_URL = 'http://localhost:3002' - -describe('E2E Tests for API Routes', () => { - describe('GET /', () => { - it('should return Hello, world! message', async () => { - const response = await request(TEST_URL).get('/'); - expect(response.statusCode).toBe(200); - expect(response.text).toContain('SCRAPERS-JS: Hello, world! Fly.io'); - }); - }); - - describe('GET /test', () => { - it('should return Hello, world! 
message', async () => { - const response = await request(TEST_URL).get('/test'); - expect(response.statusCode).toBe(200); - expect(response.text).toContain('Hello, world!'); - }); - }); - - describe('POST /v0/scrape', () => { - it('should require authorization', async () => { - const response = await request(app).post('/v0/scrape'); - expect(response.statusCode).toBe(401); - }); - - it('should return an error response with an invalid API key', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer invalid-api-key`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(response.statusCode).toBe(401); - }); - it('should return a successful response with a valid preview token', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer this_is_just_a_preview_token`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(response.statusCode).toBe(200); - }, 10000); // 10 seconds timeout - - it('should return a successful response with a valid API key', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('markdown'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.content).toContain('🔥 FireCrawl'); - }, 30000); // 30 seconds timeout - }); - - describe('POST /v0/crawl', () => { - it('should require authorization', async () => { - const response = await request(TEST_URL).post('/v0/crawl'); - expect(response.statusCode).toBe(401); - }); - - it('should return an error response with an invalid API key', async () => { - const response = await request(TEST_URL) - .post('/v0/crawl') - .set('Authorization', `Bearer invalid-api-key`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(response.statusCode).toBe(401); - }); - - it('should return a successful response with a valid API key', async () => { - const response = await request(TEST_URL) - .post('/v0/crawl') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('jobId'); - expect(response.body.jobId).toMatch(/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/); - }); - - // Additional tests for insufficient credits? 
- }); - - describe('POST /v0/crawlWebsitePreview', () => { - it('should require authorization', async () => { - const response = await request(TEST_URL).post('/v0/crawlWebsitePreview'); - expect(response.statusCode).toBe(401); - }); - - it('should return an error response with an invalid API key', async () => { - const response = await request(TEST_URL) - .post('/v0/crawlWebsitePreview') - .set('Authorization', `Bearer invalid-api-key`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(response.statusCode).toBe(401); - }); - - it('should return a successful response with a valid API key', async () => { - const response = await request(TEST_URL) - .post('/v0/crawlWebsitePreview') - .set('Authorization', `Bearer this_is_just_a_preview_token`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('jobId'); - expect(response.body.jobId).toMatch(/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/); - }); - }); - - describe('GET /v0/crawl/status/:jobId', () => { - it('should require authorization', async () => { - const response = await request(TEST_URL).get('/v0/crawl/status/123'); - expect(response.statusCode).toBe(401); - }); - - it('should return an error response with an invalid API key', async () => { - const response = await request(TEST_URL) - .get('/v0/crawl/status/123') - .set('Authorization', `Bearer invalid-api-key`); - expect(response.statusCode).toBe(401); - }); - - it('should return Job not found for invalid job ID', async () => { - const response = await request(TEST_URL) - .get('/v0/crawl/status/invalidJobId') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(404); - }); - - it('should return a successful response for a valid crawl job', async () => { - const crawlResponse = await request(TEST_URL) - .post('/v0/crawl') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://firecrawl.dev' }); - expect(crawlResponse.statusCode).toBe(200); - - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('status'); - expect(response.body.status).toBe('active'); - - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 30000)); - - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty('status'); - expect(completedResponse.body.status).toBe('completed'); - expect(completedResponse.body).toHaveProperty('data'); - expect(completedResponse.body.data[0]).toHaveProperty('content'); - expect(completedResponse.body.data[0]).toHaveProperty('markdown'); - expect(completedResponse.body.data[0]).toHaveProperty('metadata'); - expect(completedResponse.body.data[0].content).toContain('🔥 FireCrawl'); - }, 60000); // 60 seconds - }); - - describe('GET /is-production', () => { - it('should return the production status', async () => { - const response = await request(TEST_URL).get('/is-production'); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('isProduction'); 
- }); - }); -}); \ No newline at end of file diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts new file mode 100644 index 0000000..e0aca36 --- /dev/null +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -0,0 +1,156 @@ +import request from "supertest"; +import { app } from "../../index"; +import dotenv from "dotenv"; +const fs = require("fs"); +const path = require("path"); + +dotenv.config(); + +const TEST_URL = "http://127.0.0.1:3002"; + +describe("E2E Tests for API Routes with No Authentication", () => { + let originalEnv: NodeJS.ProcessEnv; + + // save original process.env + beforeAll(() => { + originalEnv = { ...process.env }; + process.env.USE_DB_AUTHENTICATION = "false"; + process.env.SUPABASE_ANON_TOKEN = ""; + process.env.SUPABASE_URL = ""; + process.env.SUPABASE_SERVICE_TOKEN = ""; + process.env.SCRAPING_BEE_API_KEY = ""; + process.env.OPENAI_API_KEY = ""; + process.env.BULL_AUTH_KEY = ""; + process.env.LOGTAIL_KEY = ""; + process.env.PLAYWRIGHT_MICROSERVICE_URL = ""; + process.env.LLAMAPARSE_API_KEY = ""; + process.env.TEST_API_KEY = ""; + }); + + // restore original process.env + afterAll(() => { + process.env = originalEnv; + }); + + + describe("GET /", () => { + it("should return Hello, world! message", async () => { + const response = await request(TEST_URL).get("/"); + expect(response.statusCode).toBe(200); + expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io"); + }); + }); + + describe("GET /test", () => { + it("should return Hello, world! message", async () => { + const response = await request(TEST_URL).get("/test"); + expect(response.statusCode).toBe(200); + expect(response.text).toContain("Hello, world!"); + }); + }); + + describe("POST /v0/scrape", () => { + it("should not require authorization", async () => { + const response = await request(TEST_URL).post("/v0/scrape"); + expect(response.statusCode).not.toBe(401); + }); + + it("should return a successful response", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + }, 10000); // 10 seconds timeout + }); + + describe("POST /v0/crawl", () => { + it("should not require authorization", async () => { + const response = await request(TEST_URL).post("/v0/crawl"); + expect(response.statusCode).not.toBe(401); + }); + + it("should return a successful response", async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + }); + }); + + describe("POST /v0/crawlWebsitePreview", () => { + it("should not require authorization", async () => { + const response = await request(TEST_URL).post("/v0/crawlWebsitePreview"); + expect(response.statusCode).not.toBe(401); + }); + + it("should return a successful response", async () => { + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + 
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + }); + }); + + describe("GET /v0/crawl/status/:jobId", () => { + it("should not require authorization", async () => { + const response = await request(TEST_URL).get("/v0/crawl/status/123"); + expect(response.statusCode).not.toBe(401); + }); + + it("should return Job not found for invalid job ID", async () => { + const response = await request(TEST_URL).get( + "/v0/crawl/status/invalidJobId" + ); + expect(response.statusCode).toBe(404); + }); + + it("should return a successful response for a valid crawl job", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL).get( + `/v0/crawl/status/${crawlResponse.body.jobId}` + ); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 30000)); + + const completedResponse = await request(TEST_URL).get( + `/v0/crawl/status/${crawlResponse.body.jobId}` + ); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl"); + }, 60000); // 60 seconds + }); + + describe("GET /is-production", () => { + it("should return the production status", async () => { + const response = await request(TEST_URL).get("/is-production"); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("isProduction"); + }); + }); +}); diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts new file mode 100644 index 0000000..ba01a7c --- /dev/null +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -0,0 +1,197 @@ +import request from "supertest"; +import { app } from "../../index"; +import dotenv from "dotenv"; + +dotenv.config(); + +// const TEST_URL = 'http://localhost:3002' +const TEST_URL = "http://127.0.0.1:3002"; + + + describe("E2E Tests for API Routes", () => { + beforeAll(() => { + process.env.USE_DB_AUTHENTICATION = "true"; + }); + + afterAll(() => { + delete process.env.USE_DB_AUTHENTICATION; + }); + describe("GET /", () => { + it("should return Hello, world! message", async () => { + const response = await request(TEST_URL).get("/"); + + expect(response.statusCode).toBe(200); + expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io"); + }); + }); + + describe("GET /test", () => { + it("should return Hello, world! 
message", async () => { + const response = await request(TEST_URL).get("/test"); + expect(response.statusCode).toBe(200); + expect(response.text).toContain("Hello, world!"); + }); + }); + + describe("POST /v0/scrape", () => { + it("should require authorization", async () => { + const response = await request(app).post("/v0/scrape"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + it("should return a successful response with a valid preview token", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer this_is_just_a_preview_token`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + }, 10000); // 10 seconds timeout + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain("🔥 FireCrawl"); + }, 30000); // 30 seconds timeout + }); + + describe("POST /v0/crawl", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).post("/v0/crawl"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + }); + + // Additional tests for insufficient credits? 
+ }); + + describe("POST /v0/crawlWebsitePreview", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).post( + "/v0/crawlWebsitePreview" + ); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer this_is_just_a_preview_token`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + }); + }); + + describe("GET /v0/crawl/status/:jobId", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).get("/v0/crawl/status/123"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .get("/v0/crawl/status/123") + .set("Authorization", `Bearer invalid-api-key`); + expect(response.statusCode).toBe(401); + }); + + it("should return Job not found for invalid job ID", async () => { + const response = await request(TEST_URL) + .get("/v0/crawl/status/invalidJobId") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(404); + }); + + it("should return a successful response for a valid crawl job", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 30000)); + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain( + "🔥 FireCrawl" + ); + }, 60000); // 60 seconds + }); + + describe("GET /is-production", () => { + it("should return the production status", async () => { + const response = await request(TEST_URL).get("/is-production"); + expect(response.statusCode).toBe(200); + 
expect(response.body).toHaveProperty("isProduction"); + }); + }); + }); diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 76bacbe..49b2146 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -1,9 +1,15 @@ import { parseApi } from "../../src/lib/parseApi"; import { getRateLimiter } from "../../src/services/rate-limiter"; -import { RateLimiterMode } from "../../src/types"; +import { AuthResponse, RateLimiterMode } from "../../src/types"; import { supabase_service } from "../../src/services/supabase"; +import { withAuth } from "../../src/lib/withAuth"; -export async function authenticateUser( + +export async function authenticateUser(req, res, mode?: RateLimiterMode) : Promise { + return withAuth(supaAuthenticateUser)(req, res, mode); +} + +export async function supaAuthenticateUser( req, res, mode?: RateLimiterMode @@ -13,6 +19,7 @@ export async function authenticateUser( error?: string; status?: number; }> { + const authHeader = req.headers.authorization; if (!authHeader) { return { success: false, error: "Unauthorized", status: 401 }; diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 17cfa62..bd3feca 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -23,7 +23,6 @@ export async function crawlController(req: Request, res: Response) { return res.status(402).json({ error: "Insufficient credits" }); } - // authenticate on supabase const url = req.body.url; if (!url) { return res.status(400).json({ error: "Url is required" }); @@ -42,7 +41,6 @@ export async function crawlController(req: Request, res: Response) { returnOnlyUrls: true, }, pageOptions: pageOptions, - }); const docs = await a.getDocuments(false, (progress) => { diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 632fff5..be70800 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -40,18 +40,20 @@ export async function scrapeHelper( if (filteredDocs.length === 0) { return { success: true, error: "No page found", returnCode: 200 }; } - const { success, credit_usage } = await billTeam( - team_id, - filteredDocs.length - ); - if (!success) { - return { - success: false, - error: - "Failed to bill team. Insufficient credits or subscription not found.", - returnCode: 402, - }; - } + + const { success, credit_usage } = await billTeam( + team_id, + filteredDocs.length + ); + if (!success) { + return { + success: false, + error: + "Failed to bill team. Insufficient credits or subscription not found.", + returnCode: 402, + }; + } + return { success: true, data: filteredDocs[0], diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 1a42eb4..a2e5c51 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -5,7 +5,6 @@ import "dotenv/config"; import { getWebScraperQueue } from "./services/queue-service"; import { redisClient } from "./services/rate-limiter"; import { v0Router } from "./routes/v0"; - const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { ExpressAdapter } = require("@bull-board/express"); @@ -48,6 +47,7 @@ const DEFAULT_PORT = process.env.PORT ?? 3002; const HOST = process.env.HOST ?? 
"localhost"; redisClient.connect(); + export function startServer(port = DEFAULT_PORT) { const server = app.listen(Number(port), HOST, () => { console.log(`Server listening on port ${port}`); diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts new file mode 100644 index 0000000..ea5aa4d --- /dev/null +++ b/apps/api/src/lib/withAuth.ts @@ -0,0 +1,24 @@ +import { AuthResponse } from "../../src/types"; + +let warningCount = 0; + +export function withAuth( + originalFunction: (...args: U) => Promise +) { + return async function (...args: U): Promise { + if (process.env.USE_DB_AUTHENTICATION === "false") { + if (warningCount < 5) { + console.warn("WARNING - You're bypassing authentication"); + warningCount++; + } + return { success: true } as T; + } else { + try { + return await originalFunction(...args); + } catch (error) { + console.error("Error in withAuth function: ", error); + return { success: false, error: error.message } as T; + } + } + }; +} diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 6ac0843..bf5be60 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -1,7 +1,12 @@ +import { withAuth } from "../../lib/withAuth"; import { supabase_service } from "../supabase"; const FREE_CREDITS = 100; + export async function billTeam(team_id: string, credits: number) { + return withAuth(supaBillTeam)(team_id, credits); +} +export async function supaBillTeam(team_id: string, credits: number) { if (team_id === "preview") { return { success: true, message: "Preview team, no credits used" }; } @@ -52,8 +57,11 @@ export async function billTeam(team_id: string, credits: number) { return { success: true, credit_usage }; } -// if team has enough credits for the operation, return true, else return false export async function checkTeamCredits(team_id: string, credits: number) { + return withAuth(supaCheckTeamCredits)(team_id, credits); +} +// if team has enough credits for the operation, return true, else return false +export async function supaCheckTeamCredits(team_id: string, credits: number) { if (team_id === "preview") { return { success: true, message: "Preview team, no credits used" }; } diff --git a/apps/api/src/services/logtail.ts b/apps/api/src/services/logtail.ts index 19ab773..8b86a6b 100644 --- a/apps/api/src/services/logtail.ts +++ b/apps/api/src/services/logtail.ts @@ -1,4 +1,19 @@ -const { Logtail } = require("@logtail/node"); -//dot env -require("dotenv").config(); -export const logtail = new Logtail(process.env.LOGTAIL_KEY); +import { Logtail } from "@logtail/node"; +import "dotenv/config"; + +// A mock Logtail class to handle cases where LOGTAIL_KEY is not provided +class MockLogtail { + info(message: string, context?: Record): void { + console.log(message, context); + } + error(message: string, context: Record = {}): void { + console.error(message, context); + } +} + +// Using the actual Logtail class if LOGTAIL_KEY exists, otherwise using the mock class +// Additionally, print a warning to the terminal if LOGTAIL_KEY is not provided +export const logtail = process.env.LOGTAIL_KEY ? new Logtail(process.env.LOGTAIL_KEY) : (() => { + console.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. 
see logtail.ts for more."); + return new MockLogtail(); +})(); diff --git a/apps/api/src/services/supabase.ts b/apps/api/src/services/supabase.ts index 49121fa..fa6404d 100644 --- a/apps/api/src/services/supabase.ts +++ b/apps/api/src/services/supabase.ts @@ -1,6 +1,56 @@ -import { createClient } from "@supabase/supabase-js"; +import { createClient, SupabaseClient } from "@supabase/supabase-js"; -export const supabase_service = createClient( - process.env.SUPABASE_URL, - process.env.SUPABASE_SERVICE_TOKEN, -); +// SupabaseService class initializes the Supabase client conditionally based on environment variables. +class SupabaseService { + private client: SupabaseClient | null = null; + + constructor() { + const supabaseUrl = process.env.SUPABASE_URL; + const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN; + // Only initialize the Supabase client if both URL and Service Token are provided. + if (process.env.USE_DB_AUTHENTICATION === "false") { + // Warn the user that Authentication is disabled by setting the client to null + console.warn( + "\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m" + ); + this.client = null; + } else if (!supabaseUrl || !supabaseServiceToken) { + console.error( + "\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m" + ); + } else { + this.client = createClient(supabaseUrl, supabaseServiceToken); + } + } + + // Provides access to the initialized Supabase client, if available. + getClient(): SupabaseClient | null { + return this.client; + } +} + +// Using a Proxy to handle dynamic access to the Supabase client or service methods. +// This approach ensures that if Supabase is not configured, any attempt to use it will result in a clear error. +export const supabase_service: SupabaseClient = new Proxy( + new SupabaseService(), + { + get: function (target, prop, receiver) { + const client = target.getClient(); + // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback. + if (client === null) { + console.error( + "Attempted to access Supabase client when it's not configured." + ); + return () => { + throw new Error("Supabase client is not configured."); + }; + } + // Direct access to SupabaseService properties takes precedence. + if (prop in target) { + return Reflect.get(target, prop, receiver); + } + // Otherwise, delegate access to the Supabase client. + return Reflect.get(client, prop, receiver); + }, + } +) as unknown as SupabaseClient; diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index f9e5c73..7f527fb 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -25,7 +25,6 @@ export interface WebScraperOptions { origin?: string; } - export interface FirecrawlJob { success: boolean; message: string; @@ -40,8 +39,6 @@ export interface FirecrawlJob { origin: string; } - - export enum RateLimiterMode { Crawl = "crawl", CrawlStatus = "crawl-status", @@ -49,4 +46,9 @@ export enum RateLimiterMode { Preview = "preview", } - +export interface AuthResponse { + success: boolean; + team_id?: string; + error?: string; + status?: number; +}
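
The thread that ties these changes together is the `withAuth` wrapper: it is what lets the new `test:local-no-auth` suite and the no-auth setup described in CONTRIBUTING.md run without Supabase. The sketch below is a simplified restatement of that behavior, not the exact code from the diff: the generic signature and the single-argument `supaAuthenticateUser` stand-in are assumptions for illustration (the real function takes `req`, `res`, and a `RateLimiterMode`), and the real wrapper also caps its console warning at five occurrences.

```typescript
// Sketch only: AuthResponse mirrors the interface added in apps/api/src/types.ts.
interface AuthResponse {
  success: boolean;
  team_id?: string;
  error?: string;
  status?: number;
}

// Assumed generic signature for the wrapper in apps/api/src/lib/withAuth.ts.
function withAuth<T extends { success: boolean }, U extends any[]>(
  originalFunction: (...args: U) => Promise<T>
) {
  return async function (...args: U): Promise<T> {
    if (process.env.USE_DB_AUTHENTICATION === "false") {
      // Auth is disabled: skip the Supabase-backed implementation and report success.
      return { success: true } as T;
    }
    try {
      return await originalFunction(...args);
    } catch (error) {
      return { success: false, error: (error as Error).message } as T;
    }
  };
}

// Hypothetical, simplified stand-in for the Supabase-backed check.
async function supaAuthenticateUser(apiKey: string): Promise<AuthResponse> {
  return apiKey
    ? { success: true, team_id: "example-team" }
    : { success: false, error: "Unauthorized", status: 401 };
}

// Same pattern as controllers/auth.ts and services/billing/credit_billing.ts:
// the exported symbol is just the wrapped Supabase implementation.
export const authenticateUser = withAuth(supaAuthenticateUser);
```

Because `billTeam` and `checkTeamCredits` go through the same wrapper, a local instance started with `USE_DB_AUTHENTICATION=false` skips credit checks as well, which is why the no-auth E2E tests can hit `/v0/scrape` and `/v0/crawl` without a `TEST_API_KEY`.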