From 07012ca19c0e33033d30494aecefed5bfde2fd02 Mon Sep 17 00:00:00 2001 From: chand1012 Date: Sat, 4 May 2024 15:12:17 -0400 Subject: [PATCH 01/10] Add docker compose file for self hosting --- .env.example | 16 ++++++++++++ docker-compose.yaml | 64 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 .env.example create mode 100644 docker-compose.yaml diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..e95ead0 --- /dev/null +++ b/.env.example @@ -0,0 +1,16 @@ +NUM_WORKERS_PER_QUEUE=8 +OPENAI_API_KEY= +SLACK_WEBHOOK_URL= +SERPER_API_KEY= +LLAMAPARSE_API_KEY= +LOGTAIL_KEY= +BULL_AUTH_KEY= +TEST_API_KEY= +POSTHOG_API_KEY= +POSTHOG_HOST= +SUPABASE_ANON_TOKEN= +SUPABASE_URL= +SUPABASE_SERVICE_TOKEN= +SCRAPING_BEE_API_KEY= +USE_DB_AUTHENTICATION=false +SELFHOST_API_KEY= diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..c65de3f --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,64 @@ +name: firecrawl +version: '3.9' +services: + redis: + image: redis:alpine + + playwright-service: + build: apps/playwright-service + environment: + - PORT=3000 + + api: + build: apps/api + environment: + - REDIS_URL=redis://redis:6379 + - PLAYWRIGHT_SERVICE_URL=http://playwright-service:3000 + - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} + - PORT=3002 + - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} + - OPENAI_API_KEY=${OPENAI_API_KEY} + - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} + - SERPER_API_KEY=${SERPER_API_KEY} + - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY} + - LOGTAIL_KEY=${LOGTAIL_KEY} + - BULL_AUTH_KEY=${BULL_AUTH_KEY} + - TEST_API_KEY=${TEST_API_KEY} + - POSTHOG_API_KEY=${POSTHOG_API_KEY} + - POSTHOG_HOST=${POSTHOG_HOST} + - SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN} + - SUPABASE_URL=${SUPABASE_URL} + - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN} + - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY} + - HOST=0.0.0.0 + depends_on: + - redis + - playwright-service + ports: + - "3002:3002" + command: [ "pnpm", "run", "start:production" ] + + worker: + build: apps/api + environment: + - REDIS_URL=redis://redis:6379 + - PLAYWRIGHT_SERVICE_URL=http://playwright-service:3000 + - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} + - PORT=3002 + - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} + - OPENAI_API_KEY=${OPENAI_API_KEY} + - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} + - SERPER_API_KEY=${SERPER_API_KEY} + - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY} + - LOGTAIL_KEY=${LOGTAIL_KEY} + - BULL_AUTH_KEY=${BULL_AUTH_KEY} + - TEST_API_KEY=${TEST_API_KEY} + - POSTHOG_API_KEY=${POSTHOG_API_KEY} + - POSTHOG_HOST=${POSTHOG_HOST} + - SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN} + - SUPABASE_URL=${SUPABASE_URL} + - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN} + - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY} + depends_on: + - redis + - playwright-service From 5a352b2b4f008b2d70178b57ebe5f771b5cc30e4 Mon Sep 17 00:00:00 2001 From: chand1012 Date: Sat, 4 May 2024 15:13:51 -0400 Subject: [PATCH 02/10] Remove selfhost api key --- .env.example | 1 - 1 file changed, 1 deletion(-) diff --git a/.env.example b/.env.example index e95ead0..e7ddc9b 100644 --- a/.env.example +++ b/.env.example @@ -13,4 +13,3 @@ SUPABASE_URL= SUPABASE_SERVICE_TOKEN= SCRAPING_BEE_API_KEY= USE_DB_AUTHENTICATION=false -SELFHOST_API_KEY= From b32057ec890dc5b79ea7b05323820cf337afe68f Mon Sep 17 00:00:00 2001 From: chand1012 Date: Sun, 5 May 2024 12:03:42 -0400 Subject: [PATCH 03/10] Update SELF_HOST.md --- SELF_HOST.md | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/SELF_HOST.md b/SELF_HOST.md index 8d1d490..0deb543 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -1,6 +1,41 @@ # Self-hosting Firecrawl -Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally. +First, clone this repository and copy `.env.example` to `.env`. +```bash +git clone https://github.com/mendableai/firecrawl.git +cd firecrawl +cp .env.example .env +``` -*This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not ready for full self-host yet - we're working on it* +Then, edit the .env.example to have the correct values for your environment. +``` +## To turn on DB authentication, you need to set up supabase. +USE_DB_AUTHENTICATION=false +# ===== Optional ENVS ====== + +# Supabase Setup (used to support DB authentication, advanced logging, etc.) +SUPABASE_ANON_TOKEN= +SUPABASE_URL= +SUPABASE_SERVICE_TOKEN= + +# Other Optionals +TEST_API_KEY= # use if you've set up authentication and want to test with a real API key +SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking +OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) +BULL_AUTH_KEY= # +LOGTAIL_KEY= # Use if you're configuring basic logging with logtail +PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback +LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs +SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api +SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages +POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs +POSTHOG_HOST= # set if you'd like to send posthog events like job logs +``` + +Once that's complete, you can simply run the following commands to get started: +```bash +docker compose up +``` + +This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`. From 18480b2005dcc669762294f9cf40cf8bb57f17ce Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 10 May 2024 11:38:17 -0300 Subject: [PATCH 04/10] Removed .env.example, improved docs and docker compose envs --- .env.example | 15 --------------- SELF_HOST.md | 31 ++++++------------------------- apps/api/.env.example | 2 +- docker-compose.yaml | 16 +++++++++------- 4 files changed, 16 insertions(+), 48 deletions(-) delete mode 100644 .env.example diff --git a/.env.example b/.env.example deleted file mode 100644 index e7ddc9b..0000000 --- a/.env.example +++ /dev/null @@ -1,15 +0,0 @@ -NUM_WORKERS_PER_QUEUE=8 -OPENAI_API_KEY= -SLACK_WEBHOOK_URL= -SERPER_API_KEY= -LLAMAPARSE_API_KEY= -LOGTAIL_KEY= -BULL_AUTH_KEY= -TEST_API_KEY= -POSTHOG_API_KEY= -POSTHOG_HOST= -SUPABASE_ANON_TOKEN= -SUPABASE_URL= -SUPABASE_SERVICE_TOKEN= -SCRAPING_BEE_API_KEY= -USE_DB_AUTHENTICATION=false diff --git a/SELF_HOST.md b/SELF_HOST.md index 0deb543..a695f84 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -1,36 +1,17 @@ # Self-hosting Firecrawl -First, clone this repository and copy `.env.example` to `.env`. +## Getting Started + +First, clone this repository and copy the example env file from api folder `.env.example` to `.env`. ```bash git clone https://github.com/mendableai/firecrawl.git cd firecrawl -cp .env.example .env +cp ./apps/api/.env.example ./.env ``` -Then, edit the .env.example to have the correct values for your environment. -``` -## To turn on DB authentication, you need to set up supabase. +For running the simplest version of FireCrawl, edit the `USE_DB_AUTHENTICATION` on `.env` to not use the database authentication. +```yml USE_DB_AUTHENTICATION=false - -# ===== Optional ENVS ====== - -# Supabase Setup (used to support DB authentication, advanced logging, etc.) -SUPABASE_ANON_TOKEN= -SUPABASE_URL= -SUPABASE_SERVICE_TOKEN= - -# Other Optionals -TEST_API_KEY= # use if you've set up authentication and want to test with a real API key -SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking -OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) -BULL_AUTH_KEY= # -LOGTAIL_KEY= # Use if you're configuring basic logging with logtail -PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback -LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs -SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api -SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages -POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs -POSTHOG_HOST= # set if you'd like to send posthog events like job logs ``` Once that's complete, you can simply run the following commands to get started: diff --git a/apps/api/.env.example b/apps/api/.env.example index b025326..55271ec 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -3,6 +3,7 @@ NUM_WORKERS_PER_QUEUE=8 PORT=3002 HOST=0.0.0.0 REDIS_URL=redis://localhost:6379 +PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000 ## To turn on DB authentication, you need to set up supabase. USE_DB_AUTHENTICATION=true @@ -20,7 +21,6 @@ SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blockin OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) BULL_AUTH_KEY= # LOGTAIL_KEY= # Use if you're configuring basic logging with logtail -PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages diff --git a/docker-compose.yaml b/docker-compose.yaml index c65de3f..af6921c 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -12,10 +12,10 @@ services: api: build: apps/api environment: - - REDIS_URL=redis://redis:6379 - - PLAYWRIGHT_SERVICE_URL=http://playwright-service:3000 + - REDIS_URL=${REDIS_URL} + - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL} - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} - - PORT=3002 + - PORT=${PORT} - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} - OPENAI_API_KEY=${OPENAI_API_KEY} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} @@ -30,7 +30,7 @@ services: - SUPABASE_URL=${SUPABASE_URL} - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN} - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY} - - HOST=0.0.0.0 + - HOST=${HOST} depends_on: - redis - playwright-service @@ -41,10 +41,10 @@ services: worker: build: apps/api environment: - - REDIS_URL=redis://redis:6379 - - PLAYWRIGHT_SERVICE_URL=http://playwright-service:3000 + - REDIS_URL=${REDIS_URL} + - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL} - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} - - PORT=3002 + - PORT=${PORT} - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} - OPENAI_API_KEY=${OPENAI_API_KEY} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} @@ -59,6 +59,8 @@ services: - SUPABASE_URL=${SUPABASE_URL} - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN} - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY} + - HOST=${HOST} depends_on: - redis - playwright-service + - api From 02450660092e402671b36f03d5b77d349bd4b403 Mon Sep 17 00:00:00 2001 From: chand1012 Date: Fri, 10 May 2024 17:15:32 -0400 Subject: [PATCH 05/10] chore: Update docker-compose.yaml with default values for REDIS_URL and PLAYWRIGHT_MICROSERVICE_URL --- docker-compose.yaml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index af6921c..9128042 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -12,8 +12,8 @@ services: api: build: apps/api environment: - - REDIS_URL=${REDIS_URL} - - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL} + - REDIS_URL=${REDIS_URL:-redis://redis:6379} + - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000} - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} - PORT=${PORT} - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} @@ -41,8 +41,8 @@ services: worker: build: apps/api environment: - - REDIS_URL=${REDIS_URL} - - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL} + - REDIS_URL=${REDIS_URL:-redis://redis:6379} + - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000} - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} - PORT=${PORT} - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} @@ -64,3 +64,7 @@ services: - redis - playwright-service - api + +networks: + default: + name: firecrawl From 2021a822ffccde73e9cefbcb2a2467179db3cb0e Mon Sep 17 00:00:00 2001 From: chand1012 Date: Fri, 10 May 2024 17:20:33 -0400 Subject: [PATCH 06/10] chore: Add firecrawl network to docker-compose.yaml --- docker-compose.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docker-compose.yaml b/docker-compose.yaml index 9128042..2daabec 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -3,11 +3,15 @@ version: '3.9' services: redis: image: redis:alpine + networks: + - firecrawl playwright-service: build: apps/playwright-service environment: - PORT=3000 + networks: + - firecrawl api: build: apps/api @@ -37,6 +41,8 @@ services: ports: - "3002:3002" command: [ "pnpm", "run", "start:production" ] + networks: + - firecrawl worker: build: apps/api @@ -64,6 +70,8 @@ services: - redis - playwright-service - api + networks: + - firecrawl networks: default: From b498e9881c5bcaf7ddad6a4d8d1e540e24d316f5 Mon Sep 17 00:00:00 2001 From: chand1012 Date: Fri, 10 May 2024 17:23:22 -0400 Subject: [PATCH 07/10] chore: Update docker-compose.yaml network configuration --- docker-compose.yaml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index 2daabec..12a8219 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -4,14 +4,14 @@ services: redis: image: redis:alpine networks: - - firecrawl + - default playwright-service: build: apps/playwright-service environment: - PORT=3000 networks: - - firecrawl + - default api: build: apps/api @@ -42,7 +42,7 @@ services: - "3002:3002" command: [ "pnpm", "run", "start:production" ] networks: - - firecrawl + - default worker: build: apps/api @@ -71,8 +71,7 @@ services: - playwright-service - api networks: - - firecrawl + - default networks: default: - name: firecrawl From 5cbce060edee4cd860f21b3a6c2d7660defda604 Mon Sep 17 00:00:00 2001 From: chand1012 Date: Fri, 10 May 2024 17:26:00 -0400 Subject: [PATCH 08/10] chore: Update docker-compose.yaml with default values for PORT and HOST --- docker-compose.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index 12a8219..0cc9d43 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -19,7 +19,7 @@ services: - REDIS_URL=${REDIS_URL:-redis://redis:6379} - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000} - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} - - PORT=${PORT} + - PORT=${PORT:-3002} - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} - OPENAI_API_KEY=${OPENAI_API_KEY} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} @@ -34,7 +34,7 @@ services: - SUPABASE_URL=${SUPABASE_URL} - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN} - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY} - - HOST=${HOST} + - HOST=${HOST:-0.0.0.0} depends_on: - redis - playwright-service @@ -50,7 +50,7 @@ services: - REDIS_URL=${REDIS_URL:-redis://redis:6379} - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000} - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} - - PORT=${PORT} + - PORT=${PORT:-3002} - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} - OPENAI_API_KEY=${OPENAI_API_KEY} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} @@ -65,7 +65,7 @@ services: - SUPABASE_URL=${SUPABASE_URL} - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN} - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY} - - HOST=${HOST} + - HOST=${HOST:-0.0.0.0} depends_on: - redis - playwright-service From 4737fe871127764f2d868fa434a4249e7a8939ef Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 13 May 2024 13:47:49 -0300 Subject: [PATCH 09/10] Added missing instruction --- SELF_HOST.md | 5 +++++ docker-compose.yaml | 19 ++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/SELF_HOST.md b/SELF_HOST.md index a695f84..8c3c0aa 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -14,6 +14,11 @@ For running the simplest version of FireCrawl, edit the `USE_DB_AUTHENTICATION` USE_DB_AUTHENTICATION=false ``` +Update the Redis URL in the .env file to align with the Docker configuration: +```yml +REDIS_URL=redis://redis:6379 +``` + Once that's complete, you can simply run the following commands to get started: ```bash docker compose up diff --git a/docker-compose.yaml b/docker-compose.yaml index 0cc9d43..049672d 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,17 +1,12 @@ name: firecrawl version: '3.9' services: - redis: - image: redis:alpine - networks: - - default - playwright-service: build: apps/playwright-service environment: - PORT=3000 networks: - - default + - backend api: build: apps/api @@ -42,7 +37,7 @@ services: - "3002:3002" command: [ "pnpm", "run", "start:production" ] networks: - - default + - backend worker: build: apps/api @@ -71,7 +66,13 @@ services: - playwright-service - api networks: - - default + - backend + redis: + image: redis:alpine + networks: + - backend + command: redis-server --bind 0.0.0.0 networks: - default: + backend: + driver: bridge From eb36d4b3bdcaa2475c846af7ca5a217070993963 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 13:25:39 -0700 Subject: [PATCH 10/10] Update SELF_HOST.md --- SELF_HOST.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/SELF_HOST.md b/SELF_HOST.md index 8c3c0aa..bbce267 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -1,4 +1,7 @@ # Self-hosting Firecrawl +*We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version.* + +Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally. ## Getting Started