diff --git a/SELF_HOST.md b/SELF_HOST.md index 8d1d490..bbce267 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -1,6 +1,30 @@ # Self-hosting Firecrawl +*We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version.* Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally. -*This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not ready for full self-host yet - we're working on it* +## Getting Started +First, clone this repository and copy the example env file from the API folder (`apps/api/.env.example`) to `.env` in the repository root. +```bash +git clone https://github.com/mendableai/firecrawl.git +cd firecrawl +cp ./apps/api/.env.example ./.env +``` + +To run the simplest version of Firecrawl, set `USE_DB_AUTHENTICATION` to `false` in `.env` to disable database authentication. +```bash +USE_DB_AUTHENTICATION=false +``` + +Update the Redis URL in the `.env` file to align with the Docker configuration: +```bash +REDIS_URL=redis://redis:6379 +``` + +Once that's complete, you can simply run the following command to get started: +```bash +docker compose up +``` + +This will run a local instance of Firecrawl, which can be accessed at `http://localhost:3002`. diff --git a/apps/api/.env.example b/apps/api/.env.example index b025326..55271ec 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -3,6 +3,7 @@ NUM_WORKERS_PER_QUEUE=8 PORT=3002 HOST=0.0.0.0 REDIS_URL=redis://localhost:6379 +PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000 ## To turn on DB authentication, you need to set up supabase. 
USE_DB_AUTHENTICATION=true
@@ -20,7 +21,6 @@ SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blockin
 OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
 BULL_AUTH_KEY= #
 LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
-PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
 LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
 SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
 SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..049672d
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,71 @@
+# Self-hosted Firecrawl stack: API server, worker, Playwright scraping
+# service, and Redis, all attached to the `backend` bridge network.
+name: firecrawl
+version: '3.9'
+
+# Environment shared verbatim by the api and worker services (both are built
+# from apps/api). ${VAR} is interpolated by Compose from the shell or the .env
+# file; ${VAR:-default} falls back to the default when VAR is unset or empty.
+x-common-env: &common-env
+  - REDIS_URL=${REDIS_URL:-redis://redis:6379}
+  - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
+  - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION}
+  - PORT=${PORT:-3002}
+  - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
+  - OPENAI_API_KEY=${OPENAI_API_KEY}
+  - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
+  - SERPER_API_KEY=${SERPER_API_KEY}
+  - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
+  - LOGTAIL_KEY=${LOGTAIL_KEY}
+  - BULL_AUTH_KEY=${BULL_AUTH_KEY}
+  - TEST_API_KEY=${TEST_API_KEY}
+  - POSTHOG_API_KEY=${POSTHOG_API_KEY}
+  - POSTHOG_HOST=${POSTHOG_HOST}
+  - SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN}
+  - SUPABASE_URL=${SUPABASE_URL}
+  - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN}
+  - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
+  - HOST=${HOST:-0.0.0.0}
+
+services:
+  playwright-service:
+    build: apps/playwright-service
+    environment:
+      - PORT=3000
+    networks:
+      - backend
+
+  api:
+    build: apps/api
+    environment: *common-env
+    depends_on:
+      - redis
+      - playwright-service
+    ports:
+      # Publish the same port number that is passed to the container as PORT
+      # (presumably the API's listen port), so overriding PORT in .env does
+      # not leave the published port pointing at nothing.
+      - "${PORT:-3002}:${PORT:-3002}"
+    command: [ "pnpm", "run", "start:production" ]
+    networks:
+      - backend
+
+  worker:
+    build: apps/api
+    environment: *common-env
+    depends_on:
+      - redis
+      - playwright-service
+      - api
+    networks:
+      - backend
+
+  redis:
+    image: redis:alpine
+    networks:
+      - backend
+    # Bind to all IPv4 interfaces; presumably so the other containers on the
+    # backend network can reach Redis at redis:6379.
+    command: redis-server --bind 0.0.0.0
+
+networks:
+  backend:
+    driver: bridge