0

Merge branch 'main' into test/crawl-options

This commit is contained in:
Nicolas 2024-05-15 17:18:25 -07:00
commit 6ca368327f
4 changed files with 108 additions and 10 deletions

View File

@ -1,6 +1,30 @@
# Self-hosting Firecrawl
*We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version.*
Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally.
*This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not ready for full self-host yet - we're working on it*
## Getting Started
First, clone this repository and copy the example env file from api folder `.env.example` to `.env`.
```bash
git clone https://github.com/mendableai/firecrawl.git
cd firecrawl
cp ./apps/api/.env.example ./.env
```
For running the simplest version of FireCrawl, edit the `USE_DB_AUTHENTICATION` on `.env` to not use the database authentication.
```yml
USE_DB_AUTHENTICATION=false
```
Update the Redis URL in the .env file to align with the Docker configuration:
```yml
REDIS_URL=redis://redis:6379
```
Once that's complete, you can simply run the following commands to get started:
```bash
docker compose up
```
This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`.

View File

@ -3,6 +3,7 @@ NUM_WORKERS_PER_QUEUE=8
PORT=3002
HOST=0.0.0.0
REDIS_URL=redis://localhost:6379
PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000
## To turn on DB authentication, you need to set up supabase.
USE_DB_AUTHENTICATION=true
@ -20,7 +21,6 @@ SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blockin
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
BULL_AUTH_KEY= #
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages

View File

@ -445,14 +445,10 @@ export class WebScraperDataProvider {
this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? {
onlyMainContent: false,
includeHtml: false,
};
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
this.replaceAllPathsWithAbsolutePaths =
options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
this.excludes = this.excludes.filter((item) => item !== "");
this.crawlerMode = options.crawlerOptions?.mode ?? "default";

78
docker-compose.yaml Normal file
View File

@ -0,0 +1,78 @@
name: firecrawl
version: '3.9'
services:
playwright-service:
build: apps/playwright-service
environment:
- PORT=3000
networks:
- backend
api:
build: apps/api
environment:
- REDIS_URL=${REDIS_URL:-redis://redis:6379}
- PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
- USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION}
- PORT=${PORT:-3002}
- NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
- OPENAI_API_KEY=${OPENAI_API_KEY}
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
- SERPER_API_KEY=${SERPER_API_KEY}
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
- LOGTAIL_KEY=${LOGTAIL_KEY}
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
- TEST_API_KEY=${TEST_API_KEY}
- POSTHOG_API_KEY=${POSTHOG_API_KEY}
- POSTHOG_HOST=${POSTHOG_HOST}
- SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN}
- SUPABASE_URL=${SUPABASE_URL}
- SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN}
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
- HOST=${HOST:-0.0.0.0}
depends_on:
- redis
- playwright-service
ports:
- "3002:3002"
command: [ "pnpm", "run", "start:production" ]
networks:
- backend
worker:
build: apps/api
environment:
- REDIS_URL=${REDIS_URL:-redis://redis:6379}
- PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
- USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION}
- PORT=${PORT:-3002}
- NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
- OPENAI_API_KEY=${OPENAI_API_KEY}
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
- SERPER_API_KEY=${SERPER_API_KEY}
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
- LOGTAIL_KEY=${LOGTAIL_KEY}
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
- TEST_API_KEY=${TEST_API_KEY}
- POSTHOG_API_KEY=${POSTHOG_API_KEY}
- POSTHOG_HOST=${POSTHOG_HOST}
- SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN}
- SUPABASE_URL=${SUPABASE_URL}
- SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN}
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
- HOST=${HOST:-0.0.0.0}
depends_on:
- redis
- playwright-service
- api
networks:
- backend
redis:
image: redis:alpine
networks:
- backend
command: redis-server --bind 0.0.0.0
networks:
backend:
driver: bridge