Merge branch 'main' into test/crawl-options
This commit is contained in:
commit
6ca368327f
26
SELF_HOST.md
26
SELF_HOST.md
@ -1,6 +1,30 @@
|
|||||||
# Self-hosting Firecrawl
|
# Self-hosting Firecrawl
|
||||||
|
*We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version.*
|
||||||
|
|
||||||
Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally.
|
Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally.
|
||||||
|
|
||||||
*This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not ready for full self-host yet - we're working on it*
|
## Getting Started
|
||||||
|
|
||||||
|
First, clone this repository and copy the example env file `.env.example` from the `apps/api` folder to `.env` in the repository root.
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/mendableai/firecrawl.git
|
||||||
|
cd firecrawl
|
||||||
|
cp ./apps/api/.env.example ./.env
|
||||||
|
```
|
||||||
|
|
||||||
|
To run the simplest version of Firecrawl, set `USE_DB_AUTHENTICATION` in `.env` to `false` so that database authentication is not used.
|
||||||
|
```yml
|
||||||
|
USE_DB_AUTHENTICATION=false
|
||||||
|
```
|
||||||
|
|
||||||
|
Update the Redis URL in the `.env` file to align with the Docker configuration:
|
||||||
|
```yml
|
||||||
|
REDIS_URL=redis://redis:6379
|
||||||
|
```
|
||||||
|
|
||||||
|
Once that's complete, you can simply run the following command to get started:
|
||||||
|
```bash
|
||||||
|
docker compose up
|
||||||
|
```
|
||||||
|
|
||||||
|
This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`.
|
||||||
|
@ -3,6 +3,7 @@ NUM_WORKERS_PER_QUEUE=8
|
|||||||
PORT=3002
|
PORT=3002
|
||||||
HOST=0.0.0.0
|
HOST=0.0.0.0
|
||||||
REDIS_URL=redis://localhost:6379
|
REDIS_URL=redis://localhost:6379
|
||||||
|
PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000
|
||||||
|
|
||||||
## To turn on DB authentication, you need to set up supabase.
|
## To turn on DB authentication, you need to set up supabase.
|
||||||
USE_DB_AUTHENTICATION=true
|
USE_DB_AUTHENTICATION=true
|
||||||
@ -20,7 +21,6 @@ SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blockin
|
|||||||
OPENAI_API_KEY= # add for LLM dependent features (image alt generation, etc.)
|
OPENAI_API_KEY= # add for LLM dependent features (image alt generation, etc.)
|
||||||
BULL_AUTH_KEY= #
|
BULL_AUTH_KEY= #
|
||||||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
||||||
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
|
||||||
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
||||||
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
|
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
|
||||||
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
||||||
|
@ -445,14 +445,10 @@ export class WebScraperDataProvider {
|
|||||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||||
this.generateImgAltText =
|
this.generateImgAltText =
|
||||||
options.crawlerOptions?.generateImgAltText ?? false;
|
options.crawlerOptions?.generateImgAltText ?? false;
|
||||||
this.pageOptions = options.pageOptions ?? {
|
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
||||||
onlyMainContent: false,
|
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
||||||
includeHtml: false,
|
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||||
};
|
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
||||||
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
|
||||||
this.replaceAllPathsWithAbsolutePaths =
|
|
||||||
options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
|
||||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
|
||||||
this.excludes = this.excludes.filter((item) => item !== "");
|
this.excludes = this.excludes.filter((item) => item !== "");
|
||||||
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
||||||
|
|
||||||
|
78
docker-compose.yaml
Normal file
78
docker-compose.yaml
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
name: firecrawl
|
||||||
|
version: '3.9'
|
||||||
|
services:
|
||||||
|
playwright-service:
|
||||||
|
build: apps/playwright-service
|
||||||
|
environment:
|
||||||
|
- PORT=3000
|
||||||
|
networks:
|
||||||
|
- backend
|
||||||
|
|
||||||
|
api:
|
||||||
|
build: apps/api
|
||||||
|
environment:
|
||||||
|
- REDIS_URL=${REDIS_URL:-redis://redis:6379}
|
||||||
|
- PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
|
||||||
|
- USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION}
|
||||||
|
- PORT=${PORT:-3002}
|
||||||
|
- NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
|
||||||
|
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||||
|
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
|
||||||
|
- SERPER_API_KEY=${SERPER_API_KEY}
|
||||||
|
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
|
||||||
|
- LOGTAIL_KEY=${LOGTAIL_KEY}
|
||||||
|
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
|
||||||
|
- TEST_API_KEY=${TEST_API_KEY}
|
||||||
|
- POSTHOG_API_KEY=${POSTHOG_API_KEY}
|
||||||
|
- POSTHOG_HOST=${POSTHOG_HOST}
|
||||||
|
- SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN}
|
||||||
|
- SUPABASE_URL=${SUPABASE_URL}
|
||||||
|
- SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN}
|
||||||
|
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
|
||||||
|
- HOST=${HOST:-0.0.0.0}
|
||||||
|
depends_on:
|
||||||
|
- redis
|
||||||
|
- playwright-service
|
||||||
|
ports:
|
||||||
|
- "3002:3002"
|
||||||
|
command: [ "pnpm", "run", "start:production" ]
|
||||||
|
networks:
|
||||||
|
- backend
|
||||||
|
|
||||||
|
worker:
|
||||||
|
build: apps/api
|
||||||
|
environment:
|
||||||
|
- REDIS_URL=${REDIS_URL:-redis://redis:6379}
|
||||||
|
- PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
|
||||||
|
- USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION}
|
||||||
|
- PORT=${PORT:-3002}
|
||||||
|
- NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
|
||||||
|
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||||
|
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
|
||||||
|
- SERPER_API_KEY=${SERPER_API_KEY}
|
||||||
|
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
|
||||||
|
- LOGTAIL_KEY=${LOGTAIL_KEY}
|
||||||
|
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
|
||||||
|
- TEST_API_KEY=${TEST_API_KEY}
|
||||||
|
- POSTHOG_API_KEY=${POSTHOG_API_KEY}
|
||||||
|
- POSTHOG_HOST=${POSTHOG_HOST}
|
||||||
|
- SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN}
|
||||||
|
- SUPABASE_URL=${SUPABASE_URL}
|
||||||
|
- SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN}
|
||||||
|
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
|
||||||
|
- HOST=${HOST:-0.0.0.0}
|
||||||
|
depends_on:
|
||||||
|
- redis
|
||||||
|
- playwright-service
|
||||||
|
- api
|
||||||
|
networks:
|
||||||
|
- backend
|
||||||
|
redis:
|
||||||
|
image: redis:alpine
|
||||||
|
networks:
|
||||||
|
- backend
|
||||||
|
command: redis-server --bind 0.0.0.0
|
||||||
|
|
||||||
|
networks:
|
||||||
|
backend:
|
||||||
|
driver: bridge
|
Loading…
Reference in New Issue
Block a user