0

Merge branch 'main' into nsc/improvemnts-fixes-misc

This commit is contained in:
Nicolas 2024-06-03 16:46:02 -07:00
commit 918059ee9e
117 changed files with 13061 additions and 433 deletions

35
.github/ISSUE_TEMPLATE/bug_report.md vendored Normal file
View File

@ -0,0 +1,35 @@
---
name: Bug report
about: Create a report to help us improve
title: "[BUG]"
labels: bug
assignees: ''
---
**Describe the Bug**
Provide a clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the issue:
1. Configure the environment or settings with '...'
2. Run the command '...'
3. Observe the error or unexpected output at '...'
4. Log output/error message
**Expected Behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots or copies of the command line output to help explain the issue.
**Environment (please complete the following information):**
- OS: [e.g. macOS, Linux, Windows]
- Firecrawl Version: [e.g. 1.2.3]
- Node.js Version: [e.g. 14.x]
**Logs**
If applicable, include detailed logs to help understand the problem.
**Additional Context**
Add any other context about the problem here, such as configuration specifics, network conditions, data volumes, etc.

View File

@ -0,0 +1,26 @@
---
name: Feature request
about: Suggest an idea for this project
title: "[Feat]"
labels: ''
assignees: ''
---
**Problem Description**
Describe the issue you're experiencing that has prompted this feature request. For example, "I find it difficult when..."
**Proposed Feature**
Provide a clear and concise description of the feature you would like implemented.
**Alternatives Considered**
Discuss any alternative solutions or features you've considered. Why were these alternatives not suitable?
**Implementation Suggestions**
If you have ideas on how the feature could be implemented, share them here. This could include technical details, API changes, or interaction mechanisms.
**Use Case**
Explain how this feature would be used and what benefits it would bring. Include specific examples to illustrate how this would improve functionality or user experience.
**Additional Context**
Add any other context such as comparisons with similar features in other products, or links to prototypes or mockups.

View File

@ -25,6 +25,9 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }} TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
jobs: jobs:
pre-deploy: pre-deploy:

View File

@ -94,6 +94,25 @@ jobs:
run: | run: |
npm run test npm run test
working-directory: ./apps/test-suite working-directory: ./apps/test-suite
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
working-directory: ./apps/python-sdk
- name: Run E2E tests for Python SDK
run: |
pytest firecrawl/__tests__/e2e_withAuth/test.py
working-directory: ./apps/python-sdk
- name: Install dependencies for JavaScript SDK
run: pnpm install
working-directory: ./apps/js-sdk/firecrawl
- name: Run E2E tests for JavaScript SDK
run: npm run test
working-directory: ./apps/js-sdk/firecrawl
deploy: deploy:
name: Deploy app name: Deploy app

60
.github/workflows/js-sdk.yml vendored Normal file
View File

@ -0,0 +1,60 @@
name: Run JavaScript SDK E2E Tests
on:
pull_request:
branches:
- main
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
PORT: ${{ secrets.PORT }}
REDIS_URL: ${{ secrets.REDIS_URL }}
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
jobs:
build:
runs-on: ubuntu-latest
services:
redis:
image: redis
ports:
- 6379:6379
steps:
- uses: actions/checkout@v3
- name: Set up Node.js
uses: actions/setup-node@v3
with:
node-version: "20"
- name: Install pnpm
run: npm install -g pnpm
- name: Install dependencies for API
run: pnpm install
working-directory: ./apps/api
- name: Start the application
run: npm start &
working-directory: ./apps/api
- name: Start workers
run: npm run workers &
working-directory: ./apps/api
- name: Install dependencies for JavaScript SDK
run: pnpm install
working-directory: ./apps/js-sdk/firecrawl
- name: Run E2E tests for JavaScript SDK
run: npm run test
working-directory: ./apps/js-sdk/firecrawl

72
.github/workflows/python-sdk.yml vendored Normal file
View File

@ -0,0 +1,72 @@
name: Run Python SDK E2E Tests
on:
pull_request:
branches:
- main
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
PORT: ${{ secrets.PORT }}
REDIS_URL: ${{ secrets.REDIS_URL }}
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10"]
services:
redis:
image: redis
ports:
- 6379:6379
steps:
- uses: actions/checkout@v3
- name: Set up Node.js
uses: actions/setup-node@v3
with:
node-version: "20"
- name: Install pnpm
run: npm install -g pnpm
- name: Install dependencies for API
run: pnpm install
working-directory: ./apps/api
- name: Start the application
run: npm start &
working-directory: ./apps/api
id: start_app
- name: Start workers
run: npm run workers &
working-directory: ./apps/api
id: start_workers
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
working-directory: ./apps/python-sdk
- name: Run E2E tests for Python SDK
run: |
pytest firecrawl/__tests__/e2e_withAuth/test.py
working-directory: ./apps/python-sdk

View File

@ -39,7 +39,7 @@ SUPABASE_SERVICE_TOKEN=
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
OPENAI_API_KEY= # add for LLM dependent features (image alt generation, etc.) OPENAI_API_KEY= # add for LLM dependent features (image alt generation, etc.)
BULL_AUTH_KEY= # BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs

View File

@ -6,7 +6,7 @@ _This repository is in its early development stages. We are still merging custom
## What is Firecrawl? ## What is Firecrawl?
[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown. We crawl all accessible subpages and give you clean markdown for each. No sitemap required. [Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required.
_Pst. hey, you, join our stargazers :)_ _Pst. hey, you, join our stargazers :)_
@ -114,7 +114,7 @@ Response:
### Search (Beta) ### Search (Beta)
Used to search the web, get the most relevant results, scrap each page and return the markdown. Used to search the web, get the most relevant results, scrape each page and return the markdown.
```bash ```bash
curl -X POST https://api.firecrawl.dev/v0/search \ curl -X POST https://api.firecrawl.dev/v0/search \
@ -296,7 +296,6 @@ npm install @mendable/firecrawl-js
1. Get an API key from [firecrawl.dev](https://firecrawl.dev) 1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class. 2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
### Scraping a URL ### Scraping a URL
To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as a dictionary. To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
@ -403,7 +402,6 @@ const searchResults = await app.search(query, {
``` ```
## Contributing ## Contributing
We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request. We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.

View File

@ -1,6 +1,31 @@
# Self-hosting Firecrawl # Self-hosting Firecrawl
*We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version.*
Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally. Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally.
*This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not ready for full self-host yet - we're working on it* ## Getting Started
First, clone this repository and copy the example env file from the API folder (`apps/api/.env.example`) to `.env` in the repository root.
```bash
git clone https://github.com/mendableai/firecrawl.git
cd firecrawl
cp ./apps/api/.env.example ./.env
```
To run the simplest version of Firecrawl, set `USE_DB_AUTHENTICATION` to `false` in `.env` so that database authentication is not used.
```yml
USE_DB_AUTHENTICATION=false
```
Update the Redis URL in the .env file to align with the Docker configuration:
```yml
REDIS_URL=redis://redis:6379
```
Once that's complete, you can simply run the following commands to get started:
```bash
docker compose up
```
This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`.

View File

@ -3,6 +3,7 @@ NUM_WORKERS_PER_QUEUE=8
PORT=3002 PORT=3002
HOST=0.0.0.0 HOST=0.0.0.0
REDIS_URL=redis://localhost:6379 REDIS_URL=redis://localhost:6379
PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000
## To turn on DB authentication, you need to set up supabase. ## To turn on DB authentication, you need to set up supabase.
USE_DB_AUTHENTICATION=true USE_DB_AUTHENTICATION=true
@ -16,14 +17,36 @@ SUPABASE_SERVICE_TOKEN=
# Other Optionals # Other Optionals
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
RATE_LIMIT_TEST_API_KEY_SCRAPE= # set if you'd like to test the scraping rate limit
RATE_LIMIT_TEST_API_KEY_CRAWL= # set if you'd like to test the crawling rate limit
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
OPENAI_API_KEY= # add for LLM dependent features (image alt generation, etc.) OPENAI_API_KEY= # add for LLM dependent features (image alt generation, etc.)
BULL_AUTH_KEY= # BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs POSTHOG_HOST= # set if you'd like to send posthog events like job logs
STRIPE_PRICE_ID_STANDARD=
STRIPE_PRICE_ID_SCALE=
STRIPE_PRICE_ID_STARTER=
STRIPE_PRICE_ID_HOBBY=
STRIPE_PRICE_ID_HOBBY_YEARLY=
STRIPE_PRICE_ID_STANDARD_NEW=
STRIPE_PRICE_ID_STANDARD_NEW_YEARLY=
STRIPE_PRICE_ID_GROWTH=
STRIPE_PRICE_ID_GROWTH_YEARLY=
HYPERDX_API_KEY=
HDX_NODE_BETA_MODE=1
FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta
# Proxy Settings for Playwright (Alternatively, you can use a proxy service like oxylabs, which rotates IPs for you on every request)
PROXY_SERVER=
PROXY_USERNAME=
PROXY_PASSWORD=
# set if you'd like to block media requests to save proxy bandwidth
BLOCK_MEDIA=

View File

@ -27,6 +27,13 @@ kill_timeout = '5s'
hard_limit = 200 hard_limit = 200
soft_limit = 100 soft_limit = 100
[[http_service.checks]]
grace_period = "10s"
interval = "30s"
method = "GET"
timeout = "5s"
path = "/"
[[services]] [[services]]
protocol = 'tcp' protocol = 'tcp'
internal_port = 8080 internal_port = 8080

View File

@ -18,8 +18,8 @@
"paths": { "paths": {
"/scrape": { "/scrape": {
"post": { "post": {
"summary": "Scrape a single URL", "summary": "Scrape a single URL and optionally extract information using an LLM",
"operationId": "scrapeSingleUrl", "operationId": "scrapeAndExtractFromUrl",
"tags": ["Scraping"], "tags": ["Scraping"],
"security": [ "security": [
{ {
@ -45,8 +45,48 @@
"type": "boolean", "type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.", "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false "default": false
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
},
"waitFor": {
"type": "integer",
"description": "Number of milliseconds to wait for the page to load before fetching its content",
"default": 0
} }
} }
},
"extractorOptions": {
"type": "object",
"description": "Options for LLM-based extraction of structured information from the page content",
"properties": {
"mode": {
"type": "string",
"enum": ["llm-extraction"],
"description": "The extraction mode to use, currently supports 'llm-extraction'"
},
"extractionPrompt": {
"type": "string",
"description": "A prompt describing what information to extract from the page"
},
"extractionSchema": {
"type": "object",
"additionalProperties": true,
"description": "The schema for the data to be extracted",
"required": [
"company_mission",
"supports_sso",
"is_open_source"
]
}
}
},
"timeout": {
"type": "integer",
"description": "Timeout in milliseconds for the request",
"default": 30000
} }
}, },
"required": ["url"] "required": ["url"]
@ -126,6 +166,16 @@
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.", "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
"default": false "default": false
}, },
"maxDepth": {
"type": "integer",
"description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on."
},
"mode": {
"type": "string",
"enum": ["default", "fast"],
"description": "The crawling mode to use. Fast mode crawls websites without a sitemap 4x faster, but may not be as accurate and shouldn't be used on heavily JS-rendered websites.",
"default": "default"
},
"limit": { "limit": {
"type": "integer", "type": "integer",
"description": "Maximum number of pages to crawl", "description": "Maximum number of pages to crawl",
@ -140,6 +190,11 @@
"type": "boolean", "type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.", "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false "default": false
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
} }
} }
} }
@ -192,7 +247,7 @@
"query": { "query": {
"type": "string", "type": "string",
"format": "uri", "format": "uri",
"description": "The URL to scrape" "description": "The query to search for"
}, },
"pageOptions": { "pageOptions": {
"type": "object", "type": "object",
@ -206,6 +261,11 @@
"type": "boolean", "type": "boolean",
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.", "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
"default": true "default": true
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
} }
} }
}, },
@ -299,9 +359,66 @@
"data": { "data": {
"type": "array", "type": "array",
"items": { "items": {
"$ref": "#/components/schemas/ScrapeResponse" "$ref": "#/components/schemas/CrawlStatusResponseObj"
}, },
"description": "Data returned from the job (null when it is in progress)" "description": "Data returned from the job (null when it is in progress)"
},
"partial_data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CrawlStatusResponseObj"
},
"description": "Partial documents returned while the website is being crawled (streaming). When a page is ready it is appended to the partial_data array, so there is no need to wait for the whole website to be crawled."
}
}
}
}
}
},
"402": {
"description": "Payment required"
},
"429": {
"description": "Too many requests"
},
"500": {
"description": "Server error"
}
}
}
},
"/crawl/cancel/{jobId}": {
"delete": {
"tags": ["Crawl"],
"summary": "Cancel a crawl job",
"operationId": "cancelCrawlJob",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"status": {
"type": "string",
"description": "Returns cancelled."
} }
} }
} }
@ -344,6 +461,11 @@
"content": { "content": {
"type": "string" "type": "string"
}, },
"html": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeHtml` is true"
},
"metadata": { "metadata": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -362,6 +484,51 @@
"format": "uri" "format": "uri"
} }
} }
},
"llm_extraction": {
"type": "object",
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
"nullable": true
},
"warning": {
"type": "string",
"nullable": true,
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
}
}
}
}
},
"CrawlStatusResponseObj": {
"type": "object",
"properties": {
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"html": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeHtml` is true"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
} }
} }
} }

View File

@ -33,6 +33,7 @@
"express": "^4.18.2", "express": "^4.18.2",
"jest": "^29.6.3", "jest": "^29.6.3",
"jest-fetch-mock": "^3.0.3", "jest-fetch-mock": "^3.0.3",
"mammoth": "^1.7.2",
"nodemon": "^2.0.20", "nodemon": "^2.0.20",
"supabase": "^1.77.9", "supabase": "^1.77.9",
"supertest": "^6.3.3", "supertest": "^6.3.3",
@ -47,6 +48,7 @@
"@bull-board/express": "^5.8.0", "@bull-board/express": "^5.8.0",
"@devil7softwares/pos": "^1.0.2", "@devil7softwares/pos": "^1.0.2",
"@dqbd/tiktoken": "^1.0.13", "@dqbd/tiktoken": "^1.0.13",
"@hyperdx/node-opentelemetry": "^0.7.0",
"@logtail/node": "^0.4.12", "@logtail/node": "^0.4.12",
"@nangohq/node": "^0.36.33", "@nangohq/node": "^0.36.33",
"@sentry/node": "^7.48.0", "@sentry/node": "^7.48.0",

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,7 @@
import request from "supertest"; import request from "supertest";
import { app } from "../../index"; import { app } from "../../index";
import dotenv from "dotenv"; import dotenv from "dotenv";
import { v4 as uuidv4 } from "uuid";
dotenv.config(); dotenv.config();
@ -67,7 +68,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" }); .send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(200); expect(response.statusCode).toBe(200);
}, 10000); // 10 seconds timeout }, 30000); // 30 seconds timeout
it("should return a successful response with a valid API key", async () => { it("should return a successful response with a valid API key", async () => {
const response = await request(TEST_URL) const response = await request(TEST_URL)
@ -81,7 +82,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty("markdown"); expect(response.body.data).toHaveProperty("markdown");
expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data).not.toHaveProperty("html"); expect(response.body.data).not.toHaveProperty("html");
expect(response.body.data.content).toContain("🔥 FireCrawl"); expect(response.body.data.content).toContain("🔥 Firecrawl");
}, 30000); // 30 seconds timeout }, 30000); // 30 seconds timeout
it("should return a successful response with a valid API key and includeHtml set to true", async () => { it("should return a successful response with a valid API key and includeHtml set to true", async () => {
@ -99,10 +100,61 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty("markdown"); expect(response.body.data).toHaveProperty("markdown");
expect(response.body.data).toHaveProperty("html"); expect(response.body.data).toHaveProperty("html");
expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.content).toContain("🔥 FireCrawl"); expect(response.body.data.content).toContain("🔥 Firecrawl");
expect(response.body.data.markdown).toContain("🔥 FireCrawl"); expect(response.body.data.markdown).toContain("🔥 Firecrawl");
expect(response.body.data.html).toContain("<h1"); expect(response.body.data.html).toContain("<h1");
}, 30000); // 30 seconds timeout }, 30000); // 30 seconds timeout
it('should return a successful response for a valid scrape with PDF file', async () => {
const response = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' });
await new Promise((r) => setTimeout(r, 6000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 60000); // 60 seconds
it('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
const response = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' });
await new Promise((r) => setTimeout(r, 6000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 60000); // 60 seconds
// TODO: add this test back once we nail the waitFor option to be more deterministic
// it("should return a successful response with a valid API key and waitFor option", async () => {
// const startTime = Date.now();
// const response = await request(TEST_URL)
// .post("/v0/scrape")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({ url: "https://firecrawl.dev", pageOptions: { waitFor: 7000 } });
// const endTime = Date.now();
// const duration = endTime - startTime;
// expect(response.statusCode).toBe(200);
// expect(response.body).toHaveProperty("data");
// expect(response.body.data).toHaveProperty("content");
// expect(response.body.data).toHaveProperty("markdown");
// expect(response.body.data).toHaveProperty("metadata");
// expect(response.body.data).not.toHaveProperty("html");
// expect(response.body.data.content).toContain("🔥 Firecrawl");
// expect(duration).toBeGreaterThanOrEqual(7000);
// }, 12000); // 12 seconds timeout
}); });
describe("POST /v0/crawl", () => { describe("POST /v0/crawl", () => {
@ -145,8 +197,299 @@ describe("E2E Tests for API Routes", () => {
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
); );
}); });
it('should prevent duplicate requests using the same idempotency key', async () => {
const uniqueIdempotencyKey = uuidv4();
// First request with the idempotency key
const firstResponse = await request(TEST_URL)
.post('/v0/crawl')
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.set("x-idempotency-key", uniqueIdempotencyKey)
.send({ url: 'https://mendable.ai' });
expect(firstResponse.statusCode).toBe(200);
// Second request with the same idempotency key
const secondResponse = await request(TEST_URL)
.post('/v0/crawl')
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.set("x-idempotency-key", uniqueIdempotencyKey)
.send({ url: 'https://mendable.ai' });
expect(secondResponse.statusCode).toBe(409);
expect(secondResponse.body.error).toBe('Idempotency key already used');
});
// Additional tests for insufficient credits? it("should return a successful response with a valid API key and valid includes option", async () => {
const crawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://mendable.ai",
limit: 10,
crawlerOptions: {
includes: ["blog/*"],
},
});
let response;
let isFinished = false;
while (!isFinished) {
response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
isFinished = response.body.status === "completed";
if (!isFinished) {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
const completedResponse = response;
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
expect(urls.length).toBeGreaterThan(5);
urls.forEach((url: string) => {
console.log({url})
expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy();
});
expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed");
expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Mendable");
}, 60000); // 60 seconds
// Verifies that pages matching the `excludes` patterns are left out of a crawl's results.
it("should return a successful response with a valid API key and valid excludes option", async () => {
  // Start a crawl that excludes everything under blog/.
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://mendable.ai",
      limit: 10,
      crawlerOptions: {
        excludes: ["blog/*"],
      },
    });

  let isFinished = false;
  let response;

  // Poll the status endpoint until the job reports "completed"; the 90-second
  // timeout passed to `it` fails the test if that never happens.
  while (!isFinished) {
    response = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("status");
    isFinished = response.body.status === "completed";

    if (!isFinished) {
      await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
    }
  }

  const completedResponse = response;

  const urls = completedResponse.body.data.map(
    (item: any) => item.metadata?.sourceURL
  );
  expect(urls.length).toBeGreaterThan(5);
  urls.forEach((url: string) => {
    // The host must be "www": a "wwww" host never matches any crawled URL,
    // which would make this assertion pass no matter what was crawled.
    expect(url.startsWith("https://www.mendable.ai/blog/")).toBeFalsy();
  });
}, 90000); // 90 seconds
// Crawls mendable.ai with `crawlerOptions.limit` set to 3, waits for the job
// to complete and checks that exactly three documents come back.
it("should return a successful response with a valid API key and limit to 3", async () => {
  const authHeader = `Bearer ${process.env.TEST_API_KEY}`;

  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", authHeader)
    .set("Content-Type", "application/json")
    .send({
      url: "https://mendable.ai",
      crawlerOptions: { limit: 3 },
    });

  // Keep asking for the job status, one second apart, until it is completed.
  let latest;
  for (;;) {
    latest = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", authHeader);
    expect(latest.statusCode).toBe(200);
    expect(latest.body).toHaveProperty("status");
    if (latest.body.status === "completed") {
      break;
    }
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }

  const completedResponse = latest;
  expect(completedResponse.statusCode).toBe(200);
  expect(completedResponse.body).toHaveProperty("status");
  expect(completedResponse.body.status).toBe("completed");
  expect(completedResponse.body).toHaveProperty("data");
  expect(completedResponse.body.data.length).toBe(3);

  const firstDocument = completedResponse.body.data[0];
  expect(firstDocument).toHaveProperty("content");
  expect(firstDocument).toHaveProperty("markdown");
  expect(firstDocument).toHaveProperty("metadata");
  expect(firstDocument.content).toContain("Mendable");
}, 60000); // 60 seconds
// Crawls scrapethissite.com with `crawlerOptions.maxDepth` set to 2 and
// verifies the shape of the result and the path depth of every returned URL.
it("should return a successful response with max depth option for a valid crawl job", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://www.scrapethissite.com",
      crawlerOptions: { maxDepth: 2 },
    });
  expect(crawlResponse.statusCode).toBe(200);

  // Poll until the job reports "completed" instead of asserting an "active"
  // status and then sleeping a fixed 60 seconds: both were timing-dependent
  // (a fast job fails the first check, a slow one fails the second). The
  // Jest timeout below bounds the total wait.
  let isCompleted = false;
  let completedResponse;
  while (!isCompleted) {
    const response = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("status");
    if (response.body.status === "completed") {
      isCompleted = true;
      completedResponse = response;
    } else {
      await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
    }
  }

  expect(completedResponse.statusCode).toBe(200);
  expect(completedResponse.body).toHaveProperty("status");
  expect(completedResponse.body.status).toBe("completed");
  expect(completedResponse.body).toHaveProperty("data");
  expect(completedResponse.body.data[0]).toHaveProperty("content");
  expect(completedResponse.body.data[0]).toHaveProperty("markdown");
  expect(completedResponse.body.data[0]).toHaveProperty("metadata");

  const urls = completedResponse.body.data.map(
    (item: any) => item.metadata?.sourceURL
  );
  expect(urls.length).toBeGreaterThan(1);

  // Every crawled URL may have at most one non-empty path segment.
  urls.forEach((url: string) => {
    const depth = new URL(url).pathname.split("/").filter(Boolean).length;
    expect(depth).toBeLessThanOrEqual(1);
  });
}, 120000);
// it("should return a successful response with a valid API key and valid limit option", async () => {
// const crawlResponse = await request(TEST_URL)
// .post("/v0/crawl")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({
// url: "https://mendable.ai",
// crawlerOptions: { limit: 10 },
// });
// const response = await request(TEST_URL)
// .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
// expect(response.statusCode).toBe(200);
// expect(response.body).toHaveProperty("status");
// expect(response.body.status).toBe("active");
// let isCompleted = false;
// while (!isCompleted) {
// const statusCheckResponse = await request(TEST_URL)
// .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
// expect(statusCheckResponse.statusCode).toBe(200);
// isCompleted = statusCheckResponse.body.status === "completed";
// if (!isCompleted) {
// await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
// }
// }
// const completedResponse = await request(TEST_URL)
// .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
// expect(completedResponse.statusCode).toBe(200);
// expect(completedResponse.body).toHaveProperty("status");
// expect(completedResponse.body.status).toBe("completed");
// expect(completedResponse.body).toHaveProperty("data");
// expect(completedResponse.body.data.length).toBe(10);
// expect(completedResponse.body.data[0]).toHaveProperty("content");
// expect(completedResponse.body.data[0]).toHaveProperty("markdown");
// expect(completedResponse.body.data[0]).toHaveProperty("metadata");
// expect(completedResponse.body.data[0].content).toContain("Mendable");
// expect(completedResponse.body.data[0].content).not.toContain("main menu");
// }, 60000); // 60 seconds
// Crawls firecrawl.dev with `pageOptions.includeHtml: true` and verifies that
// the completed job returns an `html` field next to content/markdown/metadata.
it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
const crawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
pageOptions: { includeHtml: true },
});
expect(crawlResponse.statusCode).toBe(200);
// First status check right after submission: the job is expected to still be active.
const response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
expect(response.body.status).toBe("active");
// Poll the status endpoint until the job reports "completed".
let isCompleted = false;
while (!isCompleted) {
const statusCheckResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(statusCheckResponse.statusCode).toBe(200);
isCompleted = statusCheckResponse.body.status === "completed";
if (!isCompleted) {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
// Fetch the status once more to run the assertions on the final payload.
const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed");
expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
// With includeHtml enabled the first document must also carry an `html` field.
expect(completedResponse.body.data[0]).toHaveProperty("html");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("🔥 Firecrawl");
expect(completedResponse.body.data[0].markdown).toContain("Firecrawl");
expect(completedResponse.body.data[0].html).toContain("<h1");
}, 60000); // 60 seconds
}); });
describe("POST /v0/crawlWebsitePreview", () => { describe("POST /v0/crawlWebsitePreview", () => {
@ -248,7 +591,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.statusCode).toBe(404); expect(response.statusCode).toBe(404);
}); });
it("should return a successful response for a valid crawl job", async () => { it("should return a successful crawl status response for a valid crawl job", async () => {
const crawlResponse = await request(TEST_URL) const crawlResponse = await request(TEST_URL)
.post("/v0/crawl") .post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -256,27 +599,67 @@ describe("E2E Tests for API Routes", () => {
.send({ url: "https://firecrawl.dev" }); .send({ url: "https://firecrawl.dev" });
expect(crawlResponse.statusCode).toBe(200); expect(crawlResponse.statusCode).toBe(200);
const response = await request(TEST_URL) let isCompleted = false;
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`) let completedResponse;
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
expect(response.body.status).toBe("active");
// wait for 30 seconds while (!isCompleted) {
await new Promise((r) => setTimeout(r, 30000)); const response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
const completedResponse = await request(TEST_URL) if (response.body.status === "completed") {
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`) isCompleted = true;
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); completedResponse = response;
expect(completedResponse.statusCode).toBe(200); } else {
await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
}
}
expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed"); expect(completedResponse.body.status).toBe("completed");
expect(completedResponse.body).toHaveProperty("data"); expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl"); expect(completedResponse.body.data[0].content).toContain("🔥 Firecrawl");
}, 60000); // 60 seconds
it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
const crawlResponse = await request(TEST_URL)
.post('/v0/crawl')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }});
expect(crawlResponse.statusCode).toBe(200);
let isCompleted = false;
let completedResponse;
while (!isCompleted) {
const response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('status');
if (response.body.status === 'completed') {
isCompleted = true;
completedResponse = response;
} else {
await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
}
}
expect(completedResponse.body.status).toBe('completed');
expect(completedResponse.body).toHaveProperty('data');
expect(completedResponse.body.data.length).toEqual(1);
expect(completedResponse.body.data).toEqual(
expect.arrayContaining([
expect.objectContaining({
content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.')
})
])
);
}, 60000); // 60 seconds }, 60000); // 60 seconds
it("should return a successful response with max depth option for a valid crawl job", async () => { it("should return a successful response with max depth option for a valid crawl job", async () => {
@ -290,18 +673,21 @@ describe("E2E Tests for API Routes", () => {
}); });
expect(crawlResponse.statusCode).toBe(200); expect(crawlResponse.statusCode).toBe(200);
const response = await request(TEST_URL) let isCompleted = false;
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`) let completedResponse;
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
expect(response.body.status).toBe("active");
// wait for 60 seconds
await new Promise((r) => setTimeout(r, 60000));
const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
while (!isCompleted) {
const response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
if (response.body.status === "completed") {
isCompleted = true;
completedResponse = response;
}
}
expect(completedResponse.statusCode).toBe(200); expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed"); expect(completedResponse.body.status).toBe("completed");
@ -357,8 +743,8 @@ describe("E2E Tests for API Routes", () => {
// 120 seconds // 120 seconds
expect(completedResponse.body.data[0]).toHaveProperty("html"); expect(completedResponse.body.data[0]).toHaveProperty("html");
expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl"); expect(completedResponse.body.data[0].content).toContain("🔥 Firecrawl");
expect(completedResponse.body.data[0].markdown).toContain("FireCrawl"); expect(completedResponse.body.data[0].markdown).toContain("Firecrawl");
expect(completedResponse.body.data[0].html).toContain("<h1"); expect(completedResponse.body.data[0].html).toContain("<h1");
}, 60000); }, 60000);
}); // 60 seconds }); // 60 seconds
@ -371,10 +757,8 @@ describe("E2E Tests for API Routes", () => {
.send({ url: "https://jestjs.io" }); .send({ url: "https://jestjs.io" });
expect(crawlResponse.statusCode).toBe(200); expect(crawlResponse.statusCode).toBe(200);
// wait for 30 seconds // wait for 30 seconds
await new Promise((r) => setTimeout(r, 10000)); await new Promise((r) => setTimeout(r, 20000));
const response = await request(TEST_URL) const response = await request(TEST_URL)
.delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
@ -383,7 +767,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.body).toHaveProperty("status"); expect(response.body).toHaveProperty("status");
expect(response.body.status).toBe("cancelled"); expect(response.body.status).toBe("cancelled");
await new Promise((r) => setTimeout(r, 20000)); await new Promise((r) => setTimeout(r, 10000));
const completedResponse = await request(TEST_URL) const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
@ -400,8 +784,6 @@ describe("E2E Tests for API Routes", () => {
}, 60000); // 60 seconds }, 60000); // 60 seconds
describe("POST /v0/scrape with LLM Extraction", () => { describe("POST /v0/scrape with LLM Extraction", () => {
it("should extract data using LLM extraction mode", async () => { it("should extract data using LLM extraction mode", async () => {
const response = await request(TEST_URL) const response = await request(TEST_URL)
@ -511,6 +893,107 @@ describe("E2E Tests for API Routes", () => {
// }, 120000); // 120 secs // }, 120000); // 120 secs
// }); // });
describe("POST /v0/crawl with fast mode", () => {
// Runs a fast-mode crawl of flutterbricks.com, polls until it is completed,
// logs the elapsed time and checks the number of returned documents. The
// 20-second Jest timeout is what enforces the time budget in the test name.
it("should complete the crawl under 20 seconds", async () => {
  const startTime = Date.now();
  const authHeader = `Bearer ${process.env.TEST_API_KEY}`;

  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", authHeader)
    .set("Content-Type", "application/json")
    .send({
      url: "https://flutterbricks.com",
      crawlerOptions: { mode: "fast" },
    });
  expect(crawlResponse.statusCode).toBe(200);

  const statusPath = `/v0/crawl/status/${crawlResponse.body.jobId}`;

  // Ask for the job status once per second until it reports "completed".
  let statusResponse;
  for (;;) {
    statusResponse = await request(TEST_URL)
      .get(statusPath)
      .set("Authorization", authHeader);
    expect(statusResponse.statusCode).toBe(200);
    if (statusResponse.body.status === "completed") {
      break;
    }
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }

  const timeElapsed = (Date.now() - startTime) / 1000; // milliseconds -> seconds
  console.log(`Time elapsed: ${timeElapsed} seconds`);

  const payload = statusResponse.body;
  expect(payload.status).toBe("completed");
  expect(payload).toHaveProperty("data");
  expect(payload.data[0]).toHaveProperty("content");
  expect(payload.data[0]).toHaveProperty("markdown");

  const pageCount = payload.data.length;
  expect(pageCount).toBeGreaterThanOrEqual(10);
  expect(pageCount).toBeLessThanOrEqual(15);
}, 20000);
// it("should complete the crawl in more than 10 seconds", async () => {
// const startTime = Date.now();
// const crawlResponse = await request(TEST_URL)
// .post("/v0/crawl")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({
// url: "https://flutterbricks.com",
// });
// expect(crawlResponse.statusCode).toBe(200);
// const jobId = crawlResponse.body.jobId;
// let statusResponse;
// let isFinished = false;
// while (!isFinished) {
// statusResponse = await request(TEST_URL)
// .get(`/v0/crawl/status/${jobId}`)
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
// expect(statusResponse.statusCode).toBe(200);
// isFinished = statusResponse.body.status === "completed";
// if (!isFinished) {
// await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
// }
// }
// const endTime = Date.now();
// const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
// console.log(`Time elapsed: ${timeElapsed} seconds`);
// expect(statusResponse.body.status).toBe("completed");
// expect(statusResponse.body).toHaveProperty("data");
// expect(statusResponse.body.data[0]).toHaveProperty("content");
// expect(statusResponse.body.data[0]).toHaveProperty("markdown");
// const results = statusResponse.body.data;
// // results.forEach((result, i) => {
// // console.log(result.metadata.sourceURL);
// // });
// expect(results.length).toBeGreaterThanOrEqual(10);
// expect(results.length).toBeLessThanOrEqual(15);
// }, 50000);// 15 seconds timeout to account for network delays
});
describe("GET /is-production", () => { describe("GET /is-production", () => {
it("should return the production status", async () => { it("should return the production status", async () => {
const response = await request(TEST_URL).get("/is-production"); const response = await request(TEST_URL).get("/is-production");
@ -518,4 +1001,65 @@ describe("E2E Tests for API Routes", () => {
expect(response.body).toHaveProperty("isProduction"); expect(response.body).toHaveProperty("isProduction");
}); });
}); });
// Verifies the preview-token rate limit on /v0/scrape: four requests are
// expected to succeed and the fifth one to be rejected with HTTP 429.
describe("Rate Limiter", () => {
  it("should return 429 when rate limit is exceeded for preview token", async () => {
    // One scrape request authenticated with the public preview token.
    const scrapeWithPreviewToken = () =>
      request(TEST_URL)
        .post("/v0/scrape")
        .set("Authorization", `Bearer this_is_just_a_preview_token`)
        .set("Content-Type", "application/json")
        .send({ url: "https://www.scrapethissite.com" });

    let accepted = 0;
    while (accepted < 4) {
      const allowedResponse = await scrapeWithPreviewToken();
      expect(allowedResponse.statusCode).toBe(200);
      accepted += 1;
    }

    const throttledResponse = await scrapeWithPreviewToken();
    expect(throttledResponse.statusCode).toBe(429);
  }, 60000);
});
// it("should return 429 when rate limit is exceeded for API key", async () => {
// for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_SCRAPE); i++) {
// const response = await request(TEST_URL)
// .post("/v0/scrape")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({ url: "https://www.scrapethissite.com" });
// expect(response.statusCode).toBe(200);
// }
// const response = await request(TEST_URL)
// .post("/v0/scrape")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({ url: "https://www.scrapethissite.com" });
// expect(response.statusCode).toBe(429);
// }, 60000);
// it("should return 429 when rate limit is exceeded for API key", async () => {
// for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_CRAWL); i++) {
// const response = await request(TEST_URL)
// .post("/v0/crawl")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({ url: "https://www.scrapethissite.com" });
// expect(response.statusCode).toBe(200);
// }
// const response = await request(TEST_URL)
// .post("/v0/crawl")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({ url: "https://www.scrapethissite.com" });
// expect(response.statusCode).toBe(429);
// }, 60000);
}); });

View File

@ -1,14 +1,25 @@
import { parseApi } from "../../src/lib/parseApi"; import { parseApi } from "../../src/lib/parseApi";
import { getRateLimiter } from "../../src/services/rate-limiter"; import { getRateLimiter, } from "../../src/services/rate-limiter";
import { AuthResponse, RateLimiterMode } from "../../src/types"; import { AuthResponse, RateLimiterMode } from "../../src/types";
import { supabase_service } from "../../src/services/supabase"; import { supabase_service } from "../../src/services/supabase";
import { withAuth } from "../../src/lib/withAuth"; import { withAuth } from "../../src/lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
export async function authenticateUser(req, res, mode?: RateLimiterMode) : Promise<AuthResponse> {
return withAuth(supaAuthenticateUser)(req, res, mode); return withAuth(supaAuthenticateUser)(req, res, mode);
} }
/**
 * Passes the team id and API key to `setTraceAttributes`.
 *
 * Any error raised while doing so is logged and swallowed, so a failure here
 * never propagates to the caller.
 *
 * @param team_id - Team identifier to record.
 * @param api_key - API key to record.
 */
function setTrace(team_id: string, api_key: string) {
  const attributes = { team_id, api_key };
  try {
    setTraceAttributes(attributes);
  } catch (error) {
    console.error('Error setting trace attributes:', error);
  }
}
export async function supaAuthenticateUser( export async function supaAuthenticateUser(
req, req,
res, res,
@ -18,8 +29,8 @@ export async function supaAuthenticateUser(
team_id?: string; team_id?: string;
error?: string; error?: string;
status?: number; status?: number;
plan?: string;
}> { }> {
const authHeader = req.headers.authorization; const authHeader = req.headers.authorization;
if (!authHeader) { if (!authHeader) {
return { success: false, error: "Unauthorized", status: 401 }; return { success: false, error: "Unauthorized", status: 401 };
@ -33,18 +44,95 @@ export async function supaAuthenticateUser(
}; };
} }
const incomingIP = (req.headers["x-forwarded-for"] ||
req.socket.remoteAddress) as string;
const iptoken = incomingIP + token;
let rateLimiter: RateLimiterRedis;
let subscriptionData: { team_id: string, plan: string } | null = null;
let normalizedApi: string;
if (token == "this_is_just_a_preview_token") {
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
} else {
normalizedApi = parseApi(token);
const { data, error } = await supabase_service.rpc(
'get_key_and_price_id_2', { api_key: normalizedApi }
);
// get_key_and_price_id_2 rpc definition:
// create or replace function get_key_and_price_id_2(api_key uuid)
// returns table(key uuid, team_id uuid, price_id text) as $$
// begin
// if api_key is null then
// return query
// select null::uuid as key, null::uuid as team_id, null::text as price_id;
// end if;
// return query
// select ak.key, ak.team_id, s.price_id
// from api_keys ak
// left join subscriptions s on ak.team_id = s.team_id
// where ak.key = api_key;
// end;
// $$ language plpgsql;
if (error) {
console.error('Error fetching key and price_id:', error);
} else {
// console.log('Key and Price ID:', data);
}
if (error || !data || data.length === 0) {
return {
success: false,
error: "Unauthorized: Invalid token",
status: 401,
};
}
const team_id = data[0].team_id;
const plan = getPlanByPriceId(data[0].price_id);
// HyperDX Logging
setTrace(team_id, normalizedApi);
subscriptionData = {
team_id: team_id,
plan: plan
}
switch (mode) {
case RateLimiterMode.Crawl:
rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token, subscriptionData.plan);
break;
case RateLimiterMode.Scrape:
rateLimiter = getRateLimiter(RateLimiterMode.Scrape, token, subscriptionData.plan);
break;
case RateLimiterMode.Search:
rateLimiter = getRateLimiter(RateLimiterMode.Search, token, subscriptionData.plan);
break;
case RateLimiterMode.CrawlStatus:
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
break;
case RateLimiterMode.Preview:
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
break;
default:
rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token);
break;
// case RateLimiterMode.Search:
// rateLimiter = await searchRateLimiter(RateLimiterMode.Search, token);
// break;
}
}
try { try {
const incomingIP = (req.headers["x-forwarded-for"] || await rateLimiter.consume(iptoken);
req.socket.remoteAddress) as string;
const iptoken = incomingIP + token;
await getRateLimiter(
token === "this_is_just_a_preview_token" ? RateLimiterMode.Preview : mode, token
).consume(iptoken);
} catch (rateLimiterRes) { } catch (rateLimiterRes) {
console.error(rateLimiterRes); console.error(rateLimiterRes);
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
return { return {
success: false, success: false,
error: "Rate limit exceeded. Too many requests, try again in 1 minute.", error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
status: 429, status: 429,
}; };
} }
@ -66,19 +154,44 @@ export async function supaAuthenticateUser(
// return { success: false, error: "Unauthorized: Invalid token", status: 401 }; // return { success: false, error: "Unauthorized: Invalid token", status: 401 };
} }
const normalizedApi = parseApi(token);
// make sure api key is valid, based on the api_keys table in supabase // make sure api key is valid, based on the api_keys table in supabase
const { data, error } = await supabase_service if (!subscriptionData) {
.from("api_keys") normalizedApi = parseApi(token);
.select("*")
.eq("key", normalizedApi); const { data, error } = await supabase_service
if (error || !data || data.length === 0) { .from("api_keys")
return { .select("*")
success: false, .eq("key", normalizedApi);
error: "Unauthorized: Invalid token",
status: 401, if (error || !data || data.length === 0) {
}; return {
success: false,
error: "Unauthorized: Invalid token",
status: 401,
};
}
subscriptionData = data[0];
} }
return { success: true, team_id: data[0].team_id }; return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""};
} }
/**
 * Maps a Stripe price id to the internal plan name.
 *
 * The price ids are read from the `STRIPE_PRICE_ID_*` environment variables.
 * Monthly and yearly price ids of the same plan map to the same plan name.
 *
 * @param price_id - Price id of the team's subscription; may be missing when
 *   the team has no subscription.
 * @returns The plan name, or `'free'` when the id is missing or unknown.
 */
function getPlanByPriceId(price_id: string | null | undefined) {
  // A missing price id must never match a `case` whose environment variable
  // is unset (`undefined === undefined`), which would report a paid plan.
  if (!price_id) {
    return 'free';
  }

  switch (price_id) {
    case process.env.STRIPE_PRICE_ID_STARTER:
      return 'starter';
    case process.env.STRIPE_PRICE_ID_STANDARD:
      return 'standard';
    case process.env.STRIPE_PRICE_ID_SCALE:
      return 'scale';
    // Separate labels instead of `case A || B`: the `||` form evaluates to a
    // single value (the first truthy one), so the yearly id was never matched
    // while the monthly id was configured.
    case process.env.STRIPE_PRICE_ID_HOBBY:
    case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY:
      return 'hobby';
    case process.env.STRIPE_PRICE_ID_STANDARD_NEW:
    case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY:
      return 'standard-new';
    case process.env.STRIPE_PRICE_ID_GROWTH:
    case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
      return 'growth';
    default:
      return 'free';
  }
}

View File

@ -7,6 +7,8 @@ import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs"; import { addWebScraperJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../src/services/logging/crawl_log"; import { logCrawl } from "../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../src/services/idempotency/create";
export async function crawlController(req: Request, res: Response) { export async function crawlController(req: Request, res: Response) {
try { try {
@ -19,6 +21,19 @@ export async function crawlController(req: Request, res: Response) {
return res.status(status).json({ error }); return res.status(status).json({ error });
} }
// Optional idempotency handling: only runs when the client sends an
// "x-idempotency-key" header. A key that fails validation is answered with
// 409; otherwise the key is recorded before the crawl continues.
if (req.headers["x-idempotency-key"]) {
  const isIdempotencyValid = await validateIdempotencyKey(req);
  if (!isIdempotencyValid) {
    return res.status(409).json({ error: "Idempotency key already used" });
  }
  try {
    // Await the call so that a rejection (if createIdempotencyKey returns a
    // promise) is handled by this catch and answered with a 500 instead of
    // escaping as an unhandled rejection. Awaiting a plain value is harmless.
    await createIdempotencyKey(req);
  } catch (error) {
    console.error(error);
    return res.status(500).json({ error: error.message });
  }
}
const { success: creditsCheckSuccess, message: creditsCheckMessage } = const { success: creditsCheckSuccess, message: creditsCheckMessage } =
await checkTeamCredits(team_id, 1); await checkTeamCredits(team_id, 1);
if (!creditsCheckSuccess) { if (!creditsCheckSuccess) {

View File

@ -15,7 +15,8 @@ export async function scrapeHelper(
crawlerOptions: any, crawlerOptions: any,
pageOptions: PageOptions, pageOptions: PageOptions,
extractorOptions: ExtractorOptions, extractorOptions: ExtractorOptions,
timeout: number timeout: number,
plan?: string
): Promise<{ ): Promise<{
success: boolean; success: boolean;
error?: string; error?: string;
@ -64,7 +65,9 @@ export async function scrapeHelper(
} }
let creditsToBeBilled = filteredDocs.length; let creditsToBeBilled = filteredDocs.length;
const creditsPerLLMExtract = 5; const creditsPerLLMExtract = plan === "starter" ? 5 : 50;
if (extractorOptions.mode === "llm-extraction") { if (extractorOptions.mode === "llm-extraction") {
creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length); creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
@ -93,7 +96,7 @@ export async function scrapeHelper(
export async function scrapeController(req: Request, res: Response) { export async function scrapeController(req: Request, res: Response) {
try { try {
// make sure to authenticate user first, Bearer <token> // make sure to authenticate user first, Bearer <token>
const { success, team_id, error, status } = await authenticateUser( const { success, team_id, error, status, plan } = await authenticateUser(
req, req,
res, res,
RateLimiterMode.Scrape RateLimiterMode.Scrape
@ -102,10 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(status).json({ error }); return res.status(status).json({ error });
} }
const crawlerOptions = req.body.crawlerOptions ?? {}; const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
const extractorOptions = req.body.extractorOptions ?? { const extractorOptions = req.body.extractorOptions ?? {
mode: "markdown" mode: "markdown"
} }
if (extractorOptions.mode === "llm-extraction") {
pageOptions.onlyMainContent = true;
}
const origin = req.body.origin ?? "api"; const origin = req.body.origin ?? "api";
const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds
@ -126,7 +132,8 @@ export async function scrapeController(req: Request, res: Response) {
crawlerOptions, crawlerOptions,
pageOptions, pageOptions,
extractorOptions, extractorOptions,
timeout timeout,
plan
); );
const endTime = new Date().getTime(); const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000; const timeTakenInSeconds = (endTime - startTime) / 1000;

View File

@ -28,11 +28,13 @@ export async function searchHelper(
const tbs = searchOptions.tbs ?? null; const tbs = searchOptions.tbs ?? null;
const filter = searchOptions.filter ?? null; const filter = searchOptions.filter ?? null;
const num_results = searchOptions.limit ?? 7;
const num_results_buffer = Math.floor(num_results * 1.5);
let res = await search({ let res = await search({
query: query, query: query,
advanced: advanced, advanced: advanced,
num_results: searchOptions.limit ?? 7, num_results: num_results_buffer,
tbs: tbs, tbs: tbs,
filter: filter, filter: filter,
lang: searchOptions.lang ?? "en", lang: searchOptions.lang ?? "en",
@ -47,6 +49,9 @@ export async function searchHelper(
} }
res = res.filter((r) => !isUrlBlocked(r.url)); res = res.filter((r) => !isUrlBlocked(r.url));
if (res.length > num_results) {
res = res.slice(0, num_results);
}
if (res.length === 0) { if (res.length === 0) {
return { success: true, error: "No search results found", returnCode: 200 }; return { success: true, error: "No search results found", returnCode: 200 };

View File

@ -5,6 +5,8 @@ import "dotenv/config";
import { getWebScraperQueue } from "./services/queue-service"; import { getWebScraperQueue } from "./services/queue-service";
import { redisClient } from "./services/rate-limiter"; import { redisClient } from "./services/rate-limiter";
import { v0Router } from "./routes/v0"; import { v0Router } from "./routes/v0";
import { initSDK } from '@hyperdx/node-opentelemetry';
const { createBullBoard } = require("@bull-board/api"); const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { BullAdapter } = require("@bull-board/api/bullAdapter");
const { ExpressAdapter } = require("@bull-board/express"); const { ExpressAdapter } = require("@bull-board/express");
@ -47,6 +49,11 @@ const DEFAULT_PORT = process.env.PORT ?? 3002;
const HOST = process.env.HOST ?? "localhost"; const HOST = process.env.HOST ?? "localhost";
redisClient.connect(); redisClient.connect();
// HyperDX OpenTelemetry
if(process.env.ENV === 'production') {
initSDK({ consoleCapture: true, additionalInstrumentations: []});
}
export function startServer(port = DEFAULT_PORT) { export function startServer(port = DEFAULT_PORT) {
const server = app.listen(Number(port), HOST, () => { const server = app.listen(Number(port), HOST, () => {
@ -161,3 +168,6 @@ app.get('/serverHealthCheck/notify', async (req, res) => {
app.get("/is-production", (req, res) => { app.get("/is-production", (req, res) => {
res.send({ isProduction: global.isProduction }); res.send({ isProduction: global.isProduction });
}); });
// /workers health check: can't act as a load balancer; it just has to be a pre-deploy check

View File

@ -1,25 +1,38 @@
import OpenAI from "openai"; import OpenAI from "openai";
import { Document } from "../../lib/entities"; import { Document } from "../../lib/entities";
import { numTokensFromString } from "./helpers";
export type ScraperCompletionResult = { export type ScraperCompletionResult = {
data: any | null; data: any | null;
url: string; url: string;
}; };
const maxTokens = 32000;
const modifier = 4;
const defaultPrompt = const defaultPrompt =
"You are a professional web scraper. Extract the contents of the webpage"; "You are a professional web scraper. Extract the contents of the webpage";
/**
 * Builds the chat-completion content parts for a scraped document.
 *
 * The markdown is measured with `numTokensFromString` (encoding name "gpt-4").
 * When the count exceeds `maxTokens`, the text is cut to
 * `maxTokens * modifier` characters.
 *
 * @param document - Scraped document; its `markdown` must be non-empty.
 * @returns A tuple of `[content parts, token count of the untrimmed markdown]`.
 *   The count is returned untrimmed so the caller can compare it against
 *   `maxTokens` and report that trimming happened.
 * @throws Error when the document carries no markdown.
 */
function prepareOpenAIDoc(
  document: Document
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
  const markdown = document.markdown;

  // Check if the markdown content exists in the document
  if (!markdown) {
    throw new Error(
      "Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
    );
  }

  // Count on the validated local: it is narrowed to `string` here, whereas
  // `document.markdown` is still typed as possibly undefined.
  const numTokens = numTokensFromString(markdown, "gpt-4");

  // Tokens != characters: `modifier` is used as a rough characters-per-token
  // factor, so the cut is an approximation of the token limit.
  const text =
    numTokens > maxTokens ? markdown.slice(0, maxTokens * modifier) : markdown;

  return [[{ type: "text", text }], numTokens];
}
export async function generateOpenAICompletions({ export async function generateOpenAICompletions({
@ -38,7 +51,7 @@ export async function generateOpenAICompletions({
temperature?: number; temperature?: number;
}): Promise<Document> { }): Promise<Document> {
const openai = client as OpenAI; const openai = client as OpenAI;
const content = prepareOpenAIDoc(document); const [content, numTokens] = prepareOpenAIDoc(document);
const completion = await openai.chat.completions.create({ const completion = await openai.chat.completions.create({
model, model,
@ -72,6 +85,7 @@ export async function generateOpenAICompletions({
return { return {
...document, ...document,
llm_extraction: llmExtraction, llm_extraction: llmExtraction,
warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined,
}; };
} }

View File

@ -15,6 +15,9 @@ export type PageOptions = {
includeHtml?: boolean; includeHtml?: boolean;
fallback?: boolean; fallback?: boolean;
fetchPageContent?: boolean; fetchPageContent?: boolean;
waitFor?: number;
screenshot?: boolean;
headers?: Record<string, string>;
}; };
export type ExtractorOptions = { export type ExtractorOptions = {
@ -44,6 +47,7 @@ export type WebScraperOptions = {
limit?: number; limit?: number;
generateImgAltText?: boolean; generateImgAltText?: boolean;
replaceAllPathsWithAbsolutePaths?: boolean; replaceAllPathsWithAbsolutePaths?: boolean;
mode?: "default" | "fast"; // have a mode of some sort
}; };
pageOptions?: PageOptions; pageOptions?: PageOptions;
extractorOptions?: ExtractorOptions; extractorOptions?: ExtractorOptions;
@ -71,6 +75,7 @@ export class Document {
}; };
childrenLinks?: string[]; childrenLinks?: string[];
provider?: string; provider?: string;
warning?: string;
constructor(data: Partial<Document>) { constructor(data: Partial<Document>) {
if (!data.content) { if (!data.content) {
@ -102,4 +107,9 @@ export class SearchResult {
toString(): string { toString(): string {
return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`; return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`;
} }
}
export interface FireEngineResponse {
html: string;
screenshot: string;
} }

View File

@ -0,0 +1,42 @@
// import { scrapWithFireEngine } from "../../src/scraper/WebScraper/single_url";
// const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
// const scrapInBatches = async (
// urls: string[],
// batchSize: number,
// delayMs: number
// ) => {
// let successCount = 0;
// let errorCount = 0;
// for (let i = 0; i < urls.length; i += batchSize) {
// const batch = urls
// .slice(i, i + batchSize)
// .map((url) => scrapWithFireEngine(url));
// try {
// const results = await Promise.all(batch);
// results.forEach((data, index) => {
// if (data.trim() === "") {
// errorCount++;
// } else {
// successCount++;
// console.log(
// `Scraping result ${i + index + 1}:`,
// data.trim().substring(0, 20) + "..."
// );
// }
// });
// } catch (error) {
// console.error("Error during scraping:", error);
// }
// await delay(delayMs);
// }
// console.log(`Total successful scrapes: ${successCount}`);
// console.log(`Total errored scrapes: ${errorCount}`);
// };
// function run() {
// const urls = Array.from({ length: 200 }, () => "https://scrapethissite.com");
// scrapInBatches(urls, 10, 1000);
// }

View File

@ -15,7 +15,7 @@ export class WebCrawler {
private maxCrawledLinks: number; private maxCrawledLinks: number;
private maxCrawledDepth: number; private maxCrawledDepth: number;
private visited: Set<string> = new Set(); private visited: Set<string> = new Set();
private crawledUrls: Set<string> = new Set(); private crawledUrls: Map<string, string> = new Map();
private limit: number; private limit: number;
private robotsTxtUrl: string; private robotsTxtUrl: string;
private robots: any; private robots: any;
@ -25,7 +25,7 @@ export class WebCrawler {
initialUrl, initialUrl,
includes, includes,
excludes, excludes,
maxCrawledLinks, maxCrawledLinks = 10000,
limit = 10000, limit = 10000,
generateImgAltText = false, generateImgAltText = false,
maxCrawledDepth = 10, maxCrawledDepth = 10,
@ -51,7 +51,6 @@ export class WebCrawler {
this.generateImgAltText = generateImgAltText ?? false; this.generateImgAltText = generateImgAltText ?? false;
} }
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
return sitemapLinks return sitemapLinks
.filter((link) => { .filter((link) => {
@ -77,9 +76,22 @@ export class WebCrawler {
// Check if the link matches the include patterns, if any are specified // Check if the link matches the include patterns, if any are specified
if (this.includes.length > 0 && this.includes[0] !== "") { if (this.includes.length > 0 && this.includes[0] !== "") {
return this.includes.some((includePattern) => if (!this.includes.some((includePattern) =>
new RegExp(includePattern).test(path) new RegExp(includePattern).test(path)
); )) {
return false;
}
}
// Normalize the initial URL and the link to account for www and non-www versions
const normalizedInitialUrl = new URL(this.initialUrl);
const normalizedLink = new URL(link);
const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, '');
const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
// Ensure the protocol and hostname match, and the path starts with the initial URL's path
if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
return false;
} }
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true; const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
@ -99,19 +111,21 @@ export class WebCrawler {
concurrencyLimit: number = 5, concurrencyLimit: number = 5,
limit: number = 10000, limit: number = 10000,
maxDepth: number = 10 maxDepth: number = 10
): Promise<string[]> { ): Promise<{ url: string, html: string }[]> {
// Fetch and parse robots.txt // Fetch and parse robots.txt
try { try {
const response = await axios.get(this.robotsTxtUrl); const response = await axios.get(this.robotsTxtUrl);
this.robots = robotsParser(this.robotsTxtUrl, response.data); this.robots = robotsParser(this.robotsTxtUrl, response.data);
} catch (error) { } catch (error) {
console.error(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
} }
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (sitemapLinks.length > 0) { if (sitemapLinks.length > 0) {
const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
return filteredLinks; return filteredLinks.map(link => ({ url: link, html: "" }));
} }
const urls = await this.crawlUrls( const urls = await this.crawlUrls(
@ -123,43 +137,58 @@ export class WebCrawler {
urls.length === 0 && urls.length === 0 &&
this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0 this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
) { ) {
return [this.initialUrl]; return [{ url: this.initialUrl, html: "" }];
} }
// make sure to run include exclude here again // make sure to run include exclude here again
return this.filterLinks(urls, limit, this.maxCrawledDepth); const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
} }
private async crawlUrls( private async crawlUrls(
urls: string[], urls: string[],
concurrencyLimit: number, concurrencyLimit: number,
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void,
): Promise<string[]> { ): Promise<{ url: string, html: string }[]> {
const queue = async.queue(async (task: string, callback) => { const queue = async.queue(async (task: string, callback) => {
if (this.crawledUrls.size >= this.maxCrawledLinks) { if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
if (callback && typeof callback === "function") { if (callback && typeof callback === "function") {
callback(); callback();
} }
return; return;
} }
const newUrls = await this.crawl(task); const newUrls = await this.crawl(task);
newUrls.forEach((url) => this.crawledUrls.add(url)); // add the initial url if not already added
// if (this.visited.size === 1) {
// let normalizedInitial = this.initialUrl;
// if (!normalizedInitial.endsWith("/")) {
// normalizedInitial = normalizedInitial + "/";
// }
// if (!newUrls.some(page => page.url === this.initialUrl)) {
// newUrls.push({ url: this.initialUrl, html: "" });
// }
// }
newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
if (inProgress && newUrls.length > 0) { if (inProgress && newUrls.length > 0) {
inProgress({ inProgress({
current: this.crawledUrls.size, current: this.crawledUrls.size,
total: this.maxCrawledLinks, total: Math.min(this.maxCrawledLinks, this.limit),
status: "SCRAPING", status: "SCRAPING",
currentDocumentUrl: newUrls[newUrls.length - 1], currentDocumentUrl: newUrls[newUrls.length - 1].url,
}); });
} else if (inProgress) { } else if (inProgress) {
inProgress({ inProgress({
current: this.crawledUrls.size, current: this.crawledUrls.size,
total: this.maxCrawledLinks, total: Math.min(this.maxCrawledLinks, this.limit),
status: "SCRAPING", status: "SCRAPING",
currentDocumentUrl: task, currentDocumentUrl: task,
}); });
} }
await this.crawlUrls(newUrls, concurrencyLimit, inProgress); await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
if (callback && typeof callback === "function") { if (callback && typeof callback === "function") {
callback(); callback();
} }
@ -175,35 +204,48 @@ export class WebCrawler {
} }
); );
await queue.drain(); await queue.drain();
return Array.from(this.crawledUrls); return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
} }
async crawl(url: string): Promise<string[]> { async crawl(url: string): Promise<{url: string, html: string}[]> {
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
return []; return [];
}
this.visited.add(url); this.visited.add(url);
if (!url.startsWith("http")) { if (!url.startsWith("http")) {
url = "https://" + url; url = "https://" + url;
} }
if (url.endsWith("/")) { if (url.endsWith("/")) {
url = url.slice(0, -1); url = url.slice(0, -1);
} }
if (this.isFile(url) || this.isSocialMediaOrEmail(url)) { if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
return []; return [];
} }
try { try {
let content; let content : string = "";
// If it is the first link, fetch with scrapingbee // If it is the first link, fetch with single url
if (this.visited.size === 1) { if (this.visited.size === 1) {
const page = await scrapSingleUrl(url, {includeHtml: true}); const page = await scrapSingleUrl(url, {includeHtml: true});
content = page.html; content = page.html ?? ""
} else { } else {
const response = await axios.get(url); const response = await axios.get(url);
content = response.data; content = response.data ?? "";
} }
const $ = load(content); const $ = load(content);
let links: string[] = []; let links: {url: string, html: string}[] = [];
// Add the initial URL to the list of links
if(this.visited.size === 1)
{
links.push({url, html: content});
}
$("a").each((_, element) => { $("a").each((_, element) => {
const href = $(element).attr("href"); const href = $(element).attr("href");
@ -216,7 +258,6 @@ export class WebCrawler {
const path = url.pathname; const path = url.pathname;
if ( if (
// fullUrl.startsWith(this.initialUrl) && // this condition makes it stop crawling back the url
this.isInternalLink(fullUrl) && this.isInternalLink(fullUrl) &&
this.matchesPattern(fullUrl) && this.matchesPattern(fullUrl) &&
this.noSections(fullUrl) && this.noSections(fullUrl) &&
@ -224,12 +265,16 @@ export class WebCrawler {
!this.matchesExcludes(path) && !this.matchesExcludes(path) &&
this.robots.isAllowed(fullUrl, "FireCrawlAgent") this.robots.isAllowed(fullUrl, "FireCrawlAgent")
) { ) {
links.push(fullUrl); links.push({url: fullUrl, html: content});
} }
} }
}); });
return links.filter((link) => !this.visited.has(link)); if(this.visited.size === 1){
return links;
}
// Create a new list to return to avoid modifying the visited list
return links.filter((link) => !this.visited.has(link.url));
} catch (error) { } catch (error) {
return []; return [];
} }
@ -276,9 +321,15 @@ export class WebCrawler {
".mp4", ".mp4",
".mp3", ".mp3",
".pptx", ".pptx",
".docx", // ".docx",
".xlsx", ".xlsx",
".xml", ".xml",
".avi",
".flv",
".woff",
".ttf",
".woff2",
".webp"
]; ];
return fileExtensions.some((ext) => url.endsWith(ext)); return fileExtensions.some((ext) => url.endsWith(ext));
} }
@ -295,18 +346,57 @@ export class WebCrawler {
return socialMediaOrEmail.some((ext) => url.includes(ext)); return socialMediaOrEmail.some((ext) => url.includes(ext));
} }
//
/**
 * Best-effort sitemap discovery for the crawl root.
 *
 * Tries `<url>/sitemap.xml` first (or `url` itself when it already points at
 * a sitemap) and falls back to `<baseUrl>/sitemap.xml` when the first attempt
 * yields no links. Network and parsing failures are swallowed on purpose: a
 * missing sitemap simply means the caller has to crawl instead.
 *
 * @param url - The URL the crawl was started from.
 * @returns The sitemap links, plus `url` itself when a non-empty sitemap does
 *   not already list it; an empty array when no sitemap could be read.
 */
private async tryFetchSitemapLinks(url: string): Promise<string[]> {
  // Scheme, leading "www." and one trailing slash are ignored when comparing.
  const normalizeUrl = (raw: string): string =>
    raw
      .replace(/^https?:\/\//, "")
      .replace(/^www\./, "")
      .replace(/\/$/, "");

  // Strip trailing slashes before appending so "https://site/" does not
  // become "https://site//sitemap.xml".
  const primarySitemap = url.endsWith("/sitemap.xml")
    ? url
    : `${url.replace(/\/+$/, "")}/sitemap.xml`;
  const candidates = [primarySitemap, `${this.baseUrl}/sitemap.xml`];

  let sitemapLinks: string[] = [];
  const attempted = new Set<string>();

  for (const candidate of candidates) {
    // Stop at the first sitemap that produced links; never request the same
    // sitemap URL twice.
    if (sitemapLinks.length > 0) {
      break;
    }
    if (attempted.has(candidate)) {
      continue;
    }
    attempted.add(candidate);

    try {
      const response = await axios.get(candidate);
      if (response.status === 200) {
        sitemapLinks = await getLinksFromSitemap(candidate);
      }
    } catch {
      // Deliberately ignored: an unreachable sitemap is not an error here.
    }
  }

  // Only add the start URL to a non-empty list; adding it to an empty one
  // would make the caller believe a sitemap exists and skip crawling.
  if (sitemapLinks.length > 0) {
    const normalizedUrl = normalizeUrl(url);
    const alreadyListed = sitemapLinks.some(
      (link) => normalizeUrl(link) === normalizedUrl
    );
    if (!alreadyListed) {
      // Push the URL as given, not its normalized form.
      sitemapLinks.push(url);
    }
  }

  return sitemapLinks;
}
} }

View File

@ -17,6 +17,7 @@ import {
} from "./utils/replacePaths"; } from "./utils/replacePaths";
import { generateCompletions } from "../../lib/LLM-extraction"; import { generateCompletions } from "../../lib/LLM-extraction";
import { getWebScraperQueue } from "../../../src/services/queue-service"; import { getWebScraperQueue } from "../../../src/services/queue-service";
import { fetchAndProcessDocx } from "./utils/docxProcessor";
export class WebScraperDataProvider { export class WebScraperDataProvider {
private bullJobId: string; private bullJobId: string;
@ -35,6 +36,7 @@ export class WebScraperDataProvider {
private replaceAllPathsWithAbsolutePaths?: boolean = false; private replaceAllPathsWithAbsolutePaths?: boolean = false;
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
"gpt-4-turbo"; "gpt-4-turbo";
private crawlerMode: string = "default";
authorize(): void { authorize(): void {
throw new Error("Method not implemented."); throw new Error("Method not implemented.");
@ -46,7 +48,8 @@ export class WebScraperDataProvider {
private async convertUrlsToDocuments( private async convertUrlsToDocuments(
urls: string[], urls: string[],
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void,
allHtmls?: string[]
): Promise<Document[]> { ): Promise<Document[]> {
const totalUrls = urls.length; const totalUrls = urls.length;
let processedUrls = 0; let processedUrls = 0;
@ -56,7 +59,12 @@ export class WebScraperDataProvider {
const batchUrls = urls.slice(i, i + this.concurrentRequests); const batchUrls = urls.slice(i, i + this.concurrentRequests);
await Promise.all( await Promise.all(
batchUrls.map(async (url, index) => { batchUrls.map(async (url, index) => {
const result = await scrapSingleUrl(url, this.pageOptions); const existingHTML = allHtmls ? allHtmls[i + index] : "";
const result = await scrapSingleUrl(
url,
this.pageOptions,
existingHTML
);
processedUrls++; processedUrls++;
if (inProgress) { if (inProgress) {
inProgress({ inProgress({
@ -127,9 +135,30 @@ export class WebScraperDataProvider {
} }
} }
/**
 * Keeps only the links that live under the first configured URL.
 *
 * A link is kept when its hostname equals the initial URL's hostname (a
 * leading "www." is ignored on both sides) and its pathname starts with the
 * initial URL's pathname. The protocol is not compared.
 *
 * @param links - Candidate absolute URLs.
 * @returns The links that pass the check, in their original order. Links
 *   that cannot be parsed as URLs are dropped instead of failing the batch.
 */
private async cleanIrrelevantPath(links: string[]): Promise<string[]> {
  if (links.length === 0) {
    return [];
  }

  // Parsed once: the initial URL does not change between links.
  const initialUrl = new URL(this.urls[0]);
  const initialHostname = initialUrl.hostname.replace(/^www\./, "");

  return links.filter((link) => {
    let candidate: URL;
    try {
      candidate = new URL(link);
    } catch {
      // A malformed entry must not reject every other link.
      return false;
    }

    const linkHostname = candidate.hostname.replace(/^www\./, "");
    return (
      linkHostname === initialHostname &&
      candidate.pathname.startsWith(initialUrl.pathname)
    );
  });
}
private async handleCrawlMode( private async handleCrawlMode(
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void
): Promise<Document[]> { ): Promise<Document[]> {
const crawler = new WebCrawler({ const crawler = new WebCrawler({
initialUrl: this.urls[0], initialUrl: this.urls[0],
includes: this.includes, includes: this.includes,
@ -139,20 +168,38 @@ export class WebScraperDataProvider {
limit: this.limit, limit: this.limit,
generateImgAltText: this.generateImgAltText, generateImgAltText: this.generateImgAltText,
}); });
let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
let links = await crawler.start(
inProgress,
5,
this.limit,
this.maxCrawledDepth
);
let allLinks = links.map((e) => e.url);
const allHtmls = links.map((e) => e.html);
if (this.returnOnlyUrls) { if (this.returnOnlyUrls) {
return this.returnOnlyUrlsResponse(links, inProgress); return this.returnOnlyUrlsResponse(allLinks, inProgress);
} }
let documents = await this.processLinks(links, inProgress); let documents = [];
return this.cacheAndFinalizeDocuments(documents, links); // check if fast mode is enabled and there is html inside the links
if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
documents = await this.processLinks(allLinks, inProgress, allHtmls);
} else {
documents = await this.processLinks(allLinks, inProgress);
}
return this.cacheAndFinalizeDocuments(documents, allLinks);
} }
/**
 * Processes exactly the configured URLs; this method does no link discovery
 * of its own and hands `this.urls` straight to `processLinks`.
 *
 * @param inProgress - Optional progress callback forwarded to `processLinks`.
 * @returns The documents produced by `processLinks`.
 */
private async handleSingleUrlsMode(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  return await this.processLinks(this.urls, inProgress);
}
@ -160,6 +207,8 @@ export class WebScraperDataProvider {
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void
): Promise<Document[]> { ): Promise<Document[]> {
let links = await getLinksFromSitemap(this.urls[0]); let links = await getLinksFromSitemap(this.urls[0]);
links = await this.cleanIrrelevantPath(links);
if (this.returnOnlyUrls) { if (this.returnOnlyUrls) {
return this.returnOnlyUrlsResponse(links, inProgress); return this.returnOnlyUrlsResponse(links, inProgress);
} }
@ -189,14 +238,24 @@ export class WebScraperDataProvider {
private async processLinks( private async processLinks(
links: string[], links: string[],
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void,
allHtmls?: string[]
): Promise<Document[]> { ): Promise<Document[]> {
let pdfLinks = links.filter((link) => link.endsWith(".pdf")); const pdfLinks = links.filter(link => link.endsWith(".pdf"));
let pdfDocuments = await this.fetchPdfDocuments(pdfLinks); const docLinks = links.filter(link => link.endsWith(".doc") || link.endsWith(".docx"));
links = links.filter((link) => !link.endsWith(".pdf"));
let documents = await this.convertUrlsToDocuments(links, inProgress); const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
const docxDocuments = await this.fetchDocxDocuments(docLinks);
links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));
let documents = await this.convertUrlsToDocuments(
links,
inProgress,
allHtmls
);
documents = await this.getSitemapData(this.urls[0], documents); documents = await this.getSitemapData(this.urls[0], documents);
documents = this.applyPathReplacements(documents); documents = this.applyPathReplacements(documents);
// documents = await this.applyImgAltText(documents); // documents = await this.applyImgAltText(documents);
@ -206,7 +265,7 @@ export class WebScraperDataProvider {
) { ) {
documents = await generateCompletions(documents, this.extractorOptions); documents = await generateCompletions(documents, this.extractorOptions);
} }
return documents.concat(pdfDocuments); return documents.concat(pdfDocuments).concat(docxDocuments);
} }
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> { private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
@ -221,6 +280,18 @@ export class WebScraperDataProvider {
}) })
); );
} }
/**
 * Converts every .doc/.docx link into a document, all links concurrently.
 *
 * @param docxLinks - URLs to hand to `fetchAndProcessDocx`.
 * @returns One document per link, in input order; the promise rejects if any
 *   single conversion rejects.
 */
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
  const toDocument = async (link: string) => ({
    content: await fetchAndProcessDocx(link),
    metadata: { sourceURL: link },
    provider: "web-scraper",
  });

  return Promise.all(docxLinks.map((link) => toDocument(link)));
}
private applyPathReplacements(documents: Document[]): Document[] { private applyPathReplacements(documents: Document[]): Document[] {
return this.replaceAllPathsWithAbsolutePaths return this.replaceAllPathsWithAbsolutePaths
@ -397,8 +468,9 @@ export class WebScraperDataProvider {
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false }; this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
this.excludes = this.excludes.filter((item) => item !== ""); this.excludes = this.excludes.filter((item) => item !== "");
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
// make sure all urls start with https:// // make sure all urls start with https://
this.urls = this.urls.map((url) => { this.urls = this.urls.map((url) => {

View File

@ -2,13 +2,22 @@ import * as cheerio from "cheerio";
import { ScrapingBeeClient } from "scrapingbee"; import { ScrapingBeeClient } from "scrapingbee";
import { extractMetadata } from "./utils/metadata"; import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv"; import dotenv from "dotenv";
import { Document, PageOptions } from "../../lib/entities"; import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown"; import { parseMarkdown } from "../../lib/html-to-markdown";
import { excludeNonMainTags } from "./utils/excludeTags"; import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params"; import { urlSpecificParams } from "./utils/custom/website_params";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
dotenv.config(); dotenv.config();
/**
 * Names of the scraping backends known to this module.
 *
 * Declared `as const`, so the value is a readonly tuple of string literals
 * from which a union type of scraper names can be derived.
 *
 * NOTE(review): the order presumably reflects the fallback order in which
 * scrapers are attempted — confirm where this list is consumed.
 */
const baseScrapers = [
"fire-engine",
"scrapingBee",
"playwright",
"scrapingBeeLoad",
"fetch",
] as const;
export async function generateRequestParams( export async function generateRequestParams(
url: string, url: string,
wait_browser: string = "domcontentloaded", wait_browser: string = "domcontentloaded",
@ -32,16 +41,54 @@ export async function generateRequestParams(
return defaultParams; return defaultParams;
} }
} }
/**
 * Scrapes a URL through the Fire-Engine service
 * (`FIRE_ENGINE_BETA_URL + "/scrape"`).
 *
 * This function never throws: any failure is logged and reported as an empty
 * result.
 *
 * @param url - Page to scrape.
 * @param waitFor - Wait value used when no site-specific `wait` is
 *   configured for this URL.
 * @param screenshot - Screenshot flag used when no site-specific `screenshot`
 *   is configured for this URL.
 * @param headers - Extra headers forwarded to the service in the request body.
 * @param options - Currently unused; kept for interface compatibility.
 * @returns The page HTML (or the processed PDF text when the service answers
 *   with a PDF) and the screenshot; both are empty strings on failure.
 */
export async function scrapWithFireEngine(
  url: string,
  waitFor: number = 0,
  screenshot: boolean = false,
  headers?: Record<string, string>,
  options?: any
): Promise<FireEngineResponse> {
  const emptyResult: FireEngineResponse = { html: "", screenshot: "" };

  try {
    const reqParams = await generateRequestParams(url);
    // Site-specific parameters take precedence; the caller-supplied values
    // are only the fallback.
    const waitParam = reqParams["params"]?.wait ?? waitFor;
    const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
    console.log(
      `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`
    );

    const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        url: url,
        wait: waitParam,
        screenshot: screenshotParam,
        headers: headers,
      }),
    });

    if (!response.ok) {
      console.error(
        `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
      );
      return emptyResult;
    }

    // `Headers` is not a plain object: bracket indexing always yields
    // undefined, so the value has to be read with `.get()`.
    const contentType = response.headers.get("content-type");
    if (contentType && contentType.includes("application/pdf")) {
      return { html: await fetchAndProcessPdf(url), screenshot: "" };
    }

    const data = await response.json();
    return { html: data.content ?? "", screenshot: data.screenshot ?? "" };
  } catch (error) {
    console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
    return emptyResult;
  }
}
@ -62,51 +109,168 @@ export async function scrapWithScrapingBee(
if (response.status !== 200 && response.status !== 404) { if (response.status !== 200 && response.status !== 404) {
console.error( console.error(
`Scraping bee error in ${url} with status code ${response.status}` `[ScrapingBee] Error fetching url: ${url} with status code ${response.status}`
); );
return ""; return "";
} }
const decoder = new TextDecoder();
const text = decoder.decode(response.data); const contentType = response.headers["content-type"];
return text; if (contentType && contentType.includes("application/pdf")) {
return fetchAndProcessPdf(url);
} else {
const decoder = new TextDecoder();
const text = decoder.decode(response.data);
return text;
}
} catch (error) { } catch (error) {
console.error(`Error scraping with Scraping Bee: ${error}`); console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
return ""; return "";
} }
} }
export async function scrapWithPlaywright(url: string): Promise<string> { export async function scrapWithPlaywright(
url: string,
waitFor: number = 0,
headers?: Record<string, string>
): Promise<string> {
try { try {
const reqParams = await generateRequestParams(url); const reqParams = await generateRequestParams(url);
const wait_playwright = reqParams["params"]?.wait ?? 0; // If the user has passed a wait parameter in the request, use that
const waitParam = reqParams["params"]?.wait ?? waitFor;
const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, { const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
method: "POST", method: "POST",
headers: { headers: {
"Content-Type": "application/json", "Content-Type": "application/json",
}, },
body: JSON.stringify({ url: url, wait: wait_playwright }), body: JSON.stringify({ url: url, wait: waitParam, headers: headers }),
}); });
if (!response.ok) { if (!response.ok) {
console.error( console.error(
`Error fetching w/ playwright server -> URL: ${url} with status: ${response.status}` `[Playwright] Error fetching url: ${url} with status: ${response.status}`
); );
return ""; return "";
} }
const data = await response.json(); const contentType = response.headers["content-type"];
const html = data.content; if (contentType && contentType.includes("application/pdf")) {
return html ?? ""; return fetchAndProcessPdf(url);
} else {
const data = await response.json();
const html = data.content;
return html ?? "";
}
} catch (error) { } catch (error) {
console.error(`Error scraping with Puppeteer: ${error}`); console.error(`[Playwright][c] Error fetching url: ${url} -> ${error}`);
return ""; return "";
} }
} }
/**
 * Scrapes a URL with a plain HTTP GET through `fetch`.
 *
 * Responses whose `Content-Type` header contains `application/pdf` are handed
 * to `fetchAndProcessPdf`; any other response body is returned as text.
 * Failures are logged and reported as an empty string instead of being thrown.
 *
 * @param url The URL to fetch.
 * @returns The response text (or the processed PDF content), or "" when the
 *          response status is not OK or an error occurs.
 */
export async function scrapWithFetch(url: string): Promise<string> {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      console.error(
        `[Fetch] Error fetching url: ${url} with status: ${response.status}`
      );
      return "";
    }

    // A fetch Response exposes its headers as a `Headers` instance, which must
    // be read with `.get()`. Bracket access always yields undefined, so the
    // PDF branch could never be taken.
    const contentType = response.headers.get("content-type");
    if (contentType && contentType.includes("application/pdf")) {
      // Awaited so that a PDF processing failure is handled by the catch below
      // like every other failure in this function.
      return await fetchAndProcessPdf(url);
    }

    return await response.text();
  } catch (error) {
    console.error(`[Fetch][c] Error fetching url: ${url} -> ${error}`);
    return "";
  }
}
/**
 * Get the order of scrapers to be used for scraping a URL.
 *
 * If the user doesn't have envs set for a specific scraper, it will be removed
 * from the order. This also applies to `defaultScraper`: it is only placed
 * first when it is one of the available scrapers.
 *
 * When a wait time, a screenshot or custom headers are requested, "fire-engine"
 * and "playwright" are moved to the front of the default order.
 *
 * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
 * @param isWaitPresent Whether the request specifies a wait time
 * @param isScreenshotPresent Whether the request asks for a screenshot
 * @param isHeadersPresent Whether the request supplies custom headers
 * @returns The order of scrapers to be used for scraping a URL
 */
function getScrapingFallbackOrder(
  defaultScraper?: string,
  isWaitPresent: boolean = false,
  isScreenshotPresent: boolean = false,
  isHeadersPresent: boolean = false
) {
  // Keep only the scrapers whose backing service is configured.
  const availableScrapers = baseScrapers.filter((scraper) => {
    switch (scraper) {
      case "scrapingBee":
      case "scrapingBeeLoad":
        return !!process.env.SCRAPING_BEE_API_KEY;
      case "fire-engine":
        return !!process.env.FIRE_ENGINE_BETA_URL;
      case "playwright":
        return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
      default:
        return true;
    }
  });

  let defaultOrder = [
    "scrapingBee",
    "fire-engine",
    "playwright",
    "scrapingBeeLoad",
    "fetch",
  ];

  if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
    defaultOrder = [
      "fire-engine",
      "playwright",
      ...defaultOrder.filter(
        (scraper) => scraper !== "fire-engine" && scraper !== "playwright"
      ),
    ];
  }

  const filteredDefaultOrder = defaultOrder.filter(
    (scraper: (typeof baseScrapers)[number]) =>
      availableScrapers.includes(scraper)
  );

  // Only honour the preferred scraper when it can actually run; otherwise the
  // first attempt would be a guaranteed no-op followed by a fallback.
  const preferredScrapers: string[] = [];
  if (
    defaultScraper &&
    availableScrapers.some((scraper) => scraper === defaultScraper)
  ) {
    preferredScrapers.push(defaultScraper);
  }

  // A Set keeps first-seen order, so this de-duplicates while preserving priority.
  const uniqueScrapers = new Set([
    ...preferredScrapers,
    ...filteredDefaultOrder,
    ...availableScrapers,
  ]);

  const scrapersInOrder = Array.from(uniqueScrapers);
  console.log(`Scrapers in order: ${scrapersInOrder}`);
  return scrapersInOrder as (typeof baseScrapers)[number][];
}
/**
 * Re-scrapes a page with Fire Engine when the already fetched HTML matches a
 * known special case.
 *
 * Currently the only special case is a page containing the `readme-deploy`
 * meta tag, which is re-scraped with a 1000ms wait.
 *
 * @param text The HTML obtained by the regular scraper.
 * @param url The URL that was scraped.
 * @returns The Fire Engine result when the special case applies and the
 *          re-scrape produced HTML; otherwise null, meaning the caller should
 *          keep the content it already has.
 */
async function handleCustomScraping(
  text: string,
  url: string
): Promise<FireEngineResponse | null> {
  if (!text.includes('<meta name="readme-deploy"')) {
    return null;
  }

  // Fire Engine is optional; without its URL the request cannot succeed, so
  // skip it the same way the scraper selection does.
  if (!process.env.FIRE_ENGINE_BETA_URL) {
    return null;
  }

  console.log(
    `Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
  );
  const response = await scrapWithFireEngine(url, 1000);

  // In the visible failure paths scrapWithFireEngine returns an object with an
  // empty `html`. Returning that object would make the caller replace usable
  // content with an empty string, so report "no custom content" instead.
  return response && response.html ? response : null;
}
export async function scrapSingleUrl( export async function scrapSingleUrl(
urlToScrap: string, urlToScrap: string,
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false } pageOptions: PageOptions = {
onlyMainContent: true,
includeHtml: false,
waitFor: 0,
screenshot: false,
headers: {}
},
existingHtml: string = ""
): Promise<Document> { ): Promise<Document> {
urlToScrap = urlToScrap.trim(); urlToScrap = urlToScrap.trim();
@ -124,17 +288,23 @@ export async function scrapSingleUrl(
const attemptScraping = async ( const attemptScraping = async (
url: string, url: string,
method: method: (typeof baseScrapers)[number]
| "firecrawl-scraper"
| "scrapingBee"
| "playwright"
| "scrapingBeeLoad"
| "fetch"
) => { ) => {
let text = ""; let text = "";
let screenshot = "";
switch (method) { switch (method) {
case "firecrawl-scraper": case "fire-engine":
text = await scrapWithCustomFirecrawl(url); if (process.env.FIRE_ENGINE_BETA_URL) {
console.log(`Scraping ${url} with Fire Engine`);
const response = await scrapWithFireEngine(
url,
pageOptions.waitFor,
pageOptions.screenshot,
pageOptions.headers
);
text = response.html;
screenshot = response.screenshot;
}
break; break;
case "scrapingBee": case "scrapingBee":
if (process.env.SCRAPING_BEE_API_KEY) { if (process.env.SCRAPING_BEE_API_KEY) {
@ -147,7 +317,7 @@ export async function scrapSingleUrl(
break; break;
case "playwright": case "playwright":
if (process.env.PLAYWRIGHT_MICROSERVICE_URL) { if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
text = await scrapWithPlaywright(url); text = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
} }
break; break;
case "scrapingBeeLoad": case "scrapingBeeLoad":
@ -156,29 +326,24 @@ export async function scrapSingleUrl(
} }
break; break;
case "fetch": case "fetch":
try { text = await scrapWithFetch(url);
const response = await fetch(url);
if (!response.ok) {
console.error(
`Error fetching URL: ${url} with status: ${response.status}`
);
return "";
}
text = await response.text();
} catch (error) {
console.error(`Error scraping URL: ${error}`);
return "";
}
break; break;
} }
// Check for custom scraping conditions
const customScrapedContent = await handleCustomScraping(text, url);
if (customScrapedContent) {
text = customScrapedContent.html;
screenshot = customScrapedContent.screenshot;
}
//* TODO: add an optional to return markdown or structured/extracted content //* TODO: add an optional to return markdown or structured/extracted content
let cleanedHtml = removeUnwantedElements(text, pageOptions); let cleanedHtml = removeUnwantedElements(text, pageOptions);
return [await parseMarkdown(cleanedHtml), text]; return [await parseMarkdown(cleanedHtml), text, screenshot];
}; };
try { try {
let [text, html] = ["", ""]; let [text, html, screenshot] = ["", "", ""];
let urlKey = urlToScrap; let urlKey = urlToScrap;
try { try {
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, ""); urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
@ -186,20 +351,27 @@ export async function scrapSingleUrl(
console.error(`Invalid URL key, trying: ${urlToScrap}`); console.error(`Invalid URL key, trying: ${urlToScrap}`);
} }
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? ""; const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
const scrapersInOrder = defaultScraper const scrapersInOrder = getScrapingFallbackOrder(
? [ defaultScraper,
defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
"scrapingBee", pageOptions && pageOptions.screenshot && pageOptions.screenshot === true,
"playwright", pageOptions && pageOptions.headers && pageOptions.headers !== undefined
"scrapingBeeLoad", );
"fetch",
]
: ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
for (const scraper of scrapersInOrder) { for (const scraper of scrapersInOrder) {
[text, html] = await attemptScraping(urlToScrap, scraper); // If exists text coming from crawler, use it
if (text && text.length >= 100) break; if (existingHtml && existingHtml.trim().length >= 100) {
console.log(`Falling back to ${scraper}`); let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
text = await parseMarkdown(cleanedHtml);
html = existingHtml;
break;
}
[text, html, screenshot] = await attemptScraping(urlToScrap, scraper);
if (text && text.trim().length >= 100) break;
const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
if (nextScraperIndex < scrapersInOrder.length) {
console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
}
} }
if (!text) { if (!text) {
@ -208,12 +380,27 @@ export async function scrapSingleUrl(
const soup = cheerio.load(html); const soup = cheerio.load(html);
const metadata = extractMetadata(soup, urlToScrap); const metadata = extractMetadata(soup, urlToScrap);
const document: Document = {
content: text, let document: Document;
markdown: text, if (screenshot && screenshot.length > 0) {
html: pageOptions.includeHtml ? html : undefined, document = {
metadata: { ...metadata, sourceURL: urlToScrap }, content: text,
}; markdown: text,
html: pageOptions.includeHtml ? html : undefined,
metadata: {
...metadata,
screenshot: screenshot,
sourceURL: urlToScrap,
},
};
} else {
document = {
content: text,
markdown: text,
html: pageOptions.includeHtml ? html : undefined,
metadata: { ...metadata, sourceURL: urlToScrap },
};
}
return document; return document;
} catch (error) { } catch (error) {

View File

@ -0,0 +1,13 @@
import * as docxProcessor from "../docxProcessor";
describe("DOCX Processing Module - Integration Test", () => {
  // Remote sample document; this test downloads it over the network.
  const sampleDocxUrl =
    "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx";
  // Heading expected to appear in the extracted text.
  const expectedHeading = "SERIES A PREFERRED STOCK PURCHASE AGREEMENT";

  it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
    // Ensure the key is absent for this run.
    delete process.env.LLAMAPARSE_API_KEY;

    const extractedText = await docxProcessor.fetchAndProcessDocx(sampleDocxUrl);
    const trimmedText = extractedText.trim();

    expect(trimmedText).toContain(expectedHeading);
  });
});

View File

@ -0,0 +1,66 @@
import { isUrlBlocked } from '../blocklist';
describe('isUrlBlocked', () => {
  // Bare social media profile/home URLs must be rejected.
  it('should return true for blocked social media URLs', () => {
    const shouldBeBlocked = [
      'https://www.facebook.com',
      'https://twitter.com/someuser',
      'https://instagram.com/someuser',
      'https://www.linkedin.com/in/someuser',
      'https://pinterest.com/someuser',
      'https://snapchat.com/someuser',
      'https://tiktok.com/@someuser',
      'https://reddit.com/r/somesubreddit',
      'https://flickr.com/photos/someuser',
      'https://whatsapp.com/someuser',
      'https://wechat.com/someuser',
      'https://telegram.org/someuser',
    ];

    for (const candidate of shouldBeBlocked) {
      // Log the offending URL first so a failing assertion is easy to trace.
      if (!isUrlBlocked(candidate)) {
        console.log(`URL not blocked: ${candidate}`);
      }
      expect(isUrlBlocked(candidate)).toBe(true);
    }
  });

  // URLs containing an allowed keyword pass even on a blocklisted domain.
  it('should return false for URLs containing allowed keywords', () => {
    const shouldBeAllowed = [
      'https://www.facebook.com/privacy',
      'https://twitter.com/terms',
      'https://instagram.com/legal',
      'https://www.linkedin.com/help',
      'https://pinterest.com/about',
      'https://snapchat.com/support',
      'https://tiktok.com/contact',
      'https://reddit.com/user-agreement',
      'https://tumblr.com/policy',
      'https://flickr.com/blog',
      'https://whatsapp.com/press',
      'https://wechat.com/careers',
      'https://telegram.org/conditions',
      'https://wix.com/careers',
    ];

    for (const candidate of shouldBeAllowed) {
      expect(isUrlBlocked(candidate)).toBe(false);
    }
  });

  // Domains outside the blocklist, with or without a scheme, are never blocked.
  it('should return false for non-blocked URLs', () => {
    const notOnBlocklist = [
      'https://www.example.com',
      'https://www.somewebsite.org',
      'https://subdomain.example.com',
      'firecrawl.dev',
      'amazon.com',
      'wix.com',
      'https://wix.com'
    ];

    for (const candidate of notOnBlocklist) {
      expect(isUrlBlocked(candidate)).toBe(false);
    }
  });
});

View File

@ -1,5 +1,6 @@
const socialMediaBlocklist = [ const socialMediaBlocklist = [
'facebook.com', 'facebook.com',
'x.com',
'twitter.com', 'twitter.com',
'instagram.com', 'instagram.com',
'linkedin.com', 'linkedin.com',
@ -14,14 +15,40 @@ const socialMediaBlocklist = [
'telegram.org', 'telegram.org',
]; ];
const allowedUrls = [ const allowedKeywords = [
'linkedin.com/pulse' 'pulse',
'privacy',
'terms',
'policy',
'user-agreement',
'legal',
'help',
'support',
'contact',
'about',
'careers',
'blog',
'press',
'conditions',
]; ];
export function isUrlBlocked(url: string): boolean { export function isUrlBlocked(url: string): boolean {
if (allowedUrls.some(allowedUrl => url.includes(allowedUrl))) { // Check if the URL contains any allowed keywords
if (allowedKeywords.some(keyword => url.includes(keyword))) {
return false; return false;
} }
return socialMediaBlocklist.some(domain => url.includes(domain)); try {
// Check if the URL matches any domain in the blocklist
return socialMediaBlocklist.some(domain => {
// Create a regular expression to match the exact domain
const domainPattern = new RegExp(`(^|\\.)${domain.replace('.', '\\.')}$`);
// Test the hostname of the URL against the pattern
return domainPattern.test(new URL(url).hostname);
});
} catch (e) {
// If an error occurs (e.g., invalid URL), return false
return false;
}
} }

View File

@ -63,7 +63,7 @@ export const urlSpecificParams = {
}, },
}, },
"ycombinator.com":{ "ycombinator.com":{
defaultScraper: "playwright", defaultScraper: "fire-engine",
params: { params: {
wait_browser: "networkidle2", wait_browser: "networkidle2",
block_resources: false, block_resources: false,
@ -121,5 +121,43 @@ export const urlSpecificParams = {
accept: accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
}, },
},
"help.salesforce.com":{
defaultScraper: "playwright",
params: {
wait_browser: "networkidle2",
block_resources: false,
wait: 2000,
},
headers: {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"sec-fetch-site": "same-origin",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
referer: "https://www.google.com/",
"accept-language": "en-US,en;q=0.9",
"accept-encoding": "gzip, deflate, br",
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
},
},
"firecrawl.dev":{
defaultScraper: "fire-engine",
params: {
},
headers: {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"sec-fetch-site": "same-origin",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
referer: "https://www.google.com/",
"accept-language": "en-US,en;q=0.9",
"accept-encoding": "gzip, deflate, br",
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
},
} }
}; };

View File

@ -0,0 +1,41 @@
import axios from "axios";
import fs from "fs";
import { createWriteStream } from "node:fs";
import path from "path";
import os from "os";
import mammoth from "mammoth";
/**
 * Downloads a DOCX document to a temporary file and extracts its text.
 *
 * @param url Location of the DOCX document.
 * @returns The text extracted from the document.
 * @throws Propagates any error raised while downloading or processing.
 */
export async function fetchAndProcessDocx(url: string): Promise<string> {
  const tempFilePath = await downloadDocx(url);
  try {
    return await processDocxToText(tempFilePath);
  } finally {
    // Clean up the temporary file even when processing fails, so failed
    // conversions do not accumulate files in the temp directory.
    fs.unlinkSync(tempFilePath);
  }
}
/**
 * Streams the document at `url` into a uniquely named file in the OS temp
 * directory.
 *
 * @param url Location of the DOCX document.
 * @returns Path of the temporary file once it has been completely written.
 * @throws Rejects when the request fails, or when either the download stream
 *         or the file write stream reports an error.
 */
async function downloadDocx(url: string): Promise<string> {
  const response = await axios({
    url,
    method: "GET",
    responseType: "stream",
  });

  // A timestamp alone can collide when several downloads start within the
  // same millisecond, so add a random component.
  const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
  const tempFilePath = path.join(os.tmpdir(), `tempDocx-${uniqueSuffix}.docx`);
  const writer = createWriteStream(tempFilePath);

  return new Promise((resolve, reject) => {
    const fail = (error: unknown) => {
      writer.destroy();
      // Best-effort removal of the partial file; an unlink failure is ignored
      // so the original error is the one reported.
      fs.unlink(tempFilePath, () => reject(error));
    };

    // `pipe` does not forward source errors to the destination, so without
    // this listener an aborted download would leave the promise pending forever.
    response.data.on("error", fail);
    writer.on("error", fail);
    writer.on("finish", () => resolve(tempFilePath));

    response.data.pipe(writer);
  });
}
/**
 * Converts the DOCX file at `filePath` to plain text.
 *
 * @param filePath Path of a DOCX file on disk.
 * @returns The text extracted from the file.
 */
export async function processDocxToText(filePath: string): Promise<string> {
  return extractTextFromDocx(filePath);
}
/**
 * Extracts the raw text of a DOCX file using mammoth.
 *
 * @param filePath Path of a DOCX file on disk.
 * @returns The `value` field of mammoth's raw-text extraction result.
 */
async function extractTextFromDocx(filePath: string): Promise<string> {
  const { value: rawText } = await mammoth.extractRawText({ path: filePath });
  return rawText;
}

View File

@ -34,8 +34,6 @@ export const excludeNonMainTags = [
"#nav", "#nav",
".breadcrumbs", ".breadcrumbs",
"#breadcrumbs", "#breadcrumbs",
".form",
"form",
"#search-form", "#search-form",
".search", ".search",
"#search", "#search",
@ -51,10 +49,6 @@ export const excludeNonMainTags = [
"#tag", "#tag",
".category", ".category",
"#category", "#category",
".comment", ".cookie",
"#comment", "#cookie"
".reply",
"#reply",
".author",
"#author",
]; ];

View File

@ -19,8 +19,8 @@ export async function fetchAndProcessPdf(url: string): Promise<string> {
async function downloadPdf(url: string): Promise<string> { async function downloadPdf(url: string): Promise<string> {
const response = await axios({ const response = await axios({
url, url,
method: 'GET', method: "GET",
responseType: 'stream', responseType: "stream",
}); });
const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`); const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`);
@ -29,8 +29,8 @@ async function downloadPdf(url: string): Promise<string> {
response.data.pipe(writer); response.data.pipe(writer);
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
writer.on('finish', () => resolve(tempFilePath)); writer.on("finish", () => resolve(tempFilePath));
writer.on('error', reject); writer.on("error", reject);
}); });
} }
@ -77,12 +77,12 @@ export async function processPdfToText(filePath: string): Promise<string> {
} else { } else {
// If the status code is not 200, increment the attempt counter and wait // If the status code is not 200, increment the attempt counter and wait
attempt++; attempt++;
await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
} }
} catch (error) { } catch (error) {
console.error("Error fetching result:", error); console.error("Error fetching result:", error || '');
attempt++; attempt++;
await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds before retrying await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
// You may want to handle specific errors differently // You may want to handle specific errors differently
} }
} }
@ -101,7 +101,7 @@ export async function processPdfToText(filePath: string): Promise<string> {
return content; return content;
} }
async function processPdf(file: string){ async function processPdf(file: string) {
const fileContent = fs.readFileSync(file); const fileContent = fs.readFileSync(file);
const data = await pdf(fileContent); const data = await pdf(fileContent);
return data.text; return data.text;

View File

@ -1,7 +1,7 @@
import { withAuth } from "../../lib/withAuth"; import { withAuth } from "../../lib/withAuth";
import { supabase_service } from "../supabase"; import { supabase_service } from "../supabase";
const FREE_CREDITS = 300; const FREE_CREDITS = 500;
export async function billTeam(team_id: string, credits: number) { export async function billTeam(team_id: string, credits: number) {
return withAuth(supaBillTeam)(team_id, credits); return withAuth(supaBillTeam)(team_id, credits);
@ -227,10 +227,11 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
if (creditUsages && creditUsages.length > 0) { if (creditUsages && creditUsages.length > 0) {
totalCreditsUsed = creditUsages[0].total_credits_used; totalCreditsUsed = creditUsages[0].total_credits_used;
console.log("Total Credits Used:", totalCreditsUsed); // console.log("Total Credits Used:", totalCreditsUsed);
} }
} catch (error) { } catch (error) {
console.error("Error calculating credit usage:", error); console.error("Error calculating credit usage:", error);
} }
// Adjust total credits used by subtracting coupon value // Adjust total credits used by subtracting coupon value
const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits); const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);

View File

@ -0,0 +1,22 @@
import { Request } from "express";
import { supabase_service } from "../supabase";
/**
 * Stores the request's `x-idempotency-key` header in the `idempotency_keys`
 * table.
 *
 * @param req Incoming Express request.
 * @returns The idempotency key that was stored.
 * @throws Error when the header is missing or is not a single string value;
 *         rethrows the error reported by the insert when it fails.
 */
export async function createIdempotencyKey(
  req: Request,
): Promise<string> {
  // Header values are typed as string | string[] | undefined; narrow with a
  // runtime check instead of asserting the type.
  const idempotencyKey = req.headers['x-idempotency-key'];
  if (typeof idempotencyKey !== "string" || idempotencyKey.length === 0) {
    throw new Error("No idempotency key provided in the request headers.");
  }

  const { error } = await supabase_service
    .from("idempotency_keys")
    .insert({ key: idempotencyKey });

  if (error) {
    console.error("Failed to create idempotency key:", error);
    throw error;
  }

  return idempotencyKey;
}

View File

@ -0,0 +1,32 @@
import { Request } from "express";
import { supabase_service } from "../supabase";
import { validate as isUuid } from 'uuid';
/**
 * Checks whether the request's `x-idempotency-key` header may be used.
 *
 * @param req Incoming Express request.
 * @returns true when no key was sent, or when the key is a UUID for which the
 *          lookup in `idempotency_keys` returned no row; false when the key is
 *          not a valid UUID or a row with that key was found.
 */
export async function validateIdempotencyKey(
  req: Request,
): Promise<boolean> {
  const idempotencyKey = req.headers['x-idempotency-key'];
  if (!idempotencyKey) {
    // A missing idempotency key is not rejected for now.
    return true;
  }

  // Header values are typed as string | string[]; only a single string can be
  // a UUID, so narrow before validating and querying.
  if (typeof idempotencyKey !== "string" || !isUuid(idempotencyKey)) {
    console.error("Invalid idempotency key provided in the request headers.");
    return false;
  }

  const { data, error } = await supabase_service
    .from("idempotency_keys")
    .select("key")
    .eq("key", idempotencyKey);

  if (error) {
    console.error(error);
  }

  // NOTE(review): when the lookup fails, `data` is presumably empty, so the key
  // is accepted (fail-open) — confirm this is intended.
  return !data || data.length === 0;
}

View File

@ -5,6 +5,11 @@ import { logtail } from "./logtail";
import { startWebScraperPipeline } from "../main/runWebScraper"; import { startWebScraperPipeline } from "../main/runWebScraper";
import { callWebhook } from "./webhook"; import { callWebhook } from "./webhook";
import { logJob } from "./logging/log_job"; import { logJob } from "./logging/log_job";
import { initSDK } from '@hyperdx/node-opentelemetry';
if(process.env.ENV === 'production') {
initSDK({ consoleCapture: true, additionalInstrumentations: []});
}
getWebScraperQueue().process( getWebScraperQueue().process(
Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)), Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
@ -26,7 +31,7 @@ getWebScraperQueue().process(
success: success, success: success,
result: { result: {
links: docs.map((doc) => { links: docs.map((doc) => {
return { content: doc, source: doc.metadata.sourceURL }; return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" };
}), }),
}, },
project_id: job.data.project_id, project_id: job.data.project_id,

View File

@ -2,91 +2,68 @@ import { RateLimiterRedis } from "rate-limiter-flexible";
import * as redis from "redis"; import * as redis from "redis";
import { RateLimiterMode } from "../../src/types"; import { RateLimiterMode } from "../../src/types";
const MAX_REQUESTS_PER_MINUTE_PREVIEW = 5; const RATE_LIMITS = {
const MAX_CRAWLS_PER_MINUTE_STARTER = 2; crawl: {
const MAX_CRAWLS_PER_MINUTE_STANDARD = 4; free: 1,
const MAX_CRAWLS_PER_MINUTE_SCALE = 20; starter: 3,
standard: 5,
const MAX_REQUESTS_PER_MINUTE_ACCOUNT = 20; scale: 20,
hobby: 3,
const MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS = 120; standardNew: 10,
growth: 50,
},
scrape: {
free: 5,
starter: 20,
standardOld: 40,
scale: 50,
hobby: 10,
standardNew: 50,
growth: 500,
},
search: {
free: 5,
starter: 20,
standard: 40,
scale: 50,
hobby: 10,
standardNew: 50,
growth: 500,
},
preview: 5,
account: 20,
crawlStatus: 150,
testSuite: 10000,
};
export const redisClient = redis.createClient({ export const redisClient = redis.createClient({
url: process.env.REDIS_URL, url: process.env.REDIS_URL,
legacyMode: true, legacyMode: true,
}); });
export const previewRateLimiter = new RateLimiterRedis({ const createRateLimiter = (keyPrefix, points) => new RateLimiterRedis({
storeClient: redisClient, storeClient: redisClient,
keyPrefix: "middleware", keyPrefix,
points: MAX_REQUESTS_PER_MINUTE_PREVIEW, points,
duration: 60, // Duration in seconds duration: 60, // Duration in seconds
}); });
export const serverRateLimiter = new RateLimiterRedis({ export const previewRateLimiter = createRateLimiter("preview", RATE_LIMITS.preview);
storeClient: redisClient, export const serverRateLimiter = createRateLimiter("server", RATE_LIMITS.account);
keyPrefix: "middleware", export const crawlStatusRateLimiter = createRateLimiter("crawl-status", RATE_LIMITS.crawlStatus);
points: MAX_REQUESTS_PER_MINUTE_ACCOUNT, export const testSuiteRateLimiter = createRateLimiter("test-suite", RATE_LIMITS.testSuite);
duration: 60, // Duration in seconds
});
export const crawlStatusRateLimiter = new RateLimiterRedis({ export function getRateLimiter(mode: RateLimiterMode, token: string, plan?: string) {
storeClient: redisClient, if (token.includes("5089cefa58") || token.includes("6254cf9")) {
keyPrefix: "middleware",
points: MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS,
duration: 60, // Duration in seconds
});
export const testSuiteRateLimiter = new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "middleware",
points: 1000,
duration: 60, // Duration in seconds
});
export function crawlRateLimit(plan: string){
if(plan === "standard"){
return new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "middleware",
points: MAX_CRAWLS_PER_MINUTE_STANDARD,
duration: 60, // Duration in seconds
});
}else if(plan === "scale"){
return new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "middleware",
points: MAX_CRAWLS_PER_MINUTE_SCALE,
duration: 60, // Duration in seconds
});
}
return new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "middleware",
points: MAX_CRAWLS_PER_MINUTE_STARTER,
duration: 60, // Duration in seconds
});
}
export function getRateLimiter(mode: RateLimiterMode, token: string){
// Special test suite case. TODO: Change this later.
if(token.includes("5089cefa58")){
return testSuiteRateLimiter; return testSuiteRateLimiter;
} }
switch(mode) {
case RateLimiterMode.Preview:
return previewRateLimiter; const rateLimitConfig = RATE_LIMITS[mode];
case RateLimiterMode.CrawlStatus: if (!rateLimitConfig) return serverRateLimiter;
return crawlStatusRateLimiter;
default: const planKey = plan ? plan.replace("-", "") : "starter";
return serverRateLimiter; const points = rateLimitConfig[planKey] || rateLimitConfig.preview;
}
return createRateLimiter(`${mode}-${planKey}`, points);
} }

View File

@ -57,6 +57,7 @@ export interface AuthResponse {
team_id?: string; team_id?: string;
error?: string; error?: string;
status?: number; status?: number;
plan?: string;
} }

View File

@ -1,3 +1,4 @@
import { v4 as uuidv4 } from 'uuid';
import FirecrawlApp from '@mendable/firecrawl-js'; import FirecrawlApp from '@mendable/firecrawl-js';
import { z } from "zod"; import { z } from "zod";
@ -8,7 +9,8 @@ const scrapeResult = await app.scrapeUrl('firecrawl.dev');
console.log(scrapeResult.data.content) console.log(scrapeResult.data.content)
// Crawl a website: // Crawl a website:
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false); const idempotencyKey = uuidv4(); // optional
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey);
console.log(crawlResult) console.log(crawlResult)
const jobId = await crawlResult['jobId']; const jobId = await crawlResult['jobId'];

View File

@ -0,0 +1,3 @@
API_URL=http://localhost:3002
TEST_API_KEY=fc-YOUR_API_KEY

View File

@ -110,11 +110,12 @@ export default class FirecrawlApp {
* @param {Params | null} params - Additional parameters for the crawl request. * @param {Params | null} params - Additional parameters for the crawl request.
* @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
* @param {number} timeout - Timeout in seconds for job status checks. * @param {number} timeout - Timeout in seconds for job status checks.
* @param {string} idempotencyKey - Optional idempotency key for the request.
* @returns {Promise<CrawlResponse | any>} The response from the crawl operation. * @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
*/ */
crawlUrl(url_1) { crawlUrl(url_1) {
return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, timeout = 2) { return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, timeout = 2, idempotencyKey) {
const headers = this.prepareHeaders(); const headers = this.prepareHeaders(idempotencyKey);
let jsonData = { url }; let jsonData = { url };
if (params) { if (params) {
jsonData = Object.assign(Object.assign({}, jsonData), params); jsonData = Object.assign(Object.assign({}, jsonData), params);
@ -172,11 +173,8 @@ export default class FirecrawlApp {
* Prepares the headers for an API request. * Prepares the headers for an API request.
* @returns {AxiosRequestHeaders} The prepared headers. * @returns {AxiosRequestHeaders} The prepared headers.
*/ */
prepareHeaders() { prepareHeaders(idempotencyKey) {
return { return Object.assign({ 'Content-Type': 'application/json', 'Authorization': `Bearer ${this.apiKey}` }, (idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {}));
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
};
} }
/** /**
* Sends a POST request to the specified URL. * Sends a POST request to the specified URL.

View File

@ -1,22 +1,27 @@
{ {
"name": "@mendable/firecrawl-js", "name": "@mendable/firecrawl-js",
"version": "0.0.17-beta.8", "version": "0.0.22",
"lockfileVersion": 3, "lockfileVersion": 3,
"requires": true, "requires": true,
"packages": { "packages": {
"": { "": {
"name": "@mendable/firecrawl-js", "name": "@mendable/firecrawl-js",
"version": "0.0.17-beta.8", "version": "0.0.22",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"axios": "^1.6.8", "axios": "^1.6.8",
"dotenv": "^16.4.5",
"uuid": "^9.0.1",
"zod": "^3.23.8", "zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0" "zod-to-json-schema": "^3.23.0"
}, },
"devDependencies": { "devDependencies": {
"@jest/globals": "^29.7.0", "@jest/globals": "^29.7.0",
"@types/axios": "^0.14.0", "@types/axios": "^0.14.0",
"@types/node": "^20.12.7", "@types/dotenv": "^8.2.0",
"@types/jest": "^29.5.12",
"@types/node": "^20.12.12",
"@types/uuid": "^9.0.8",
"jest": "^29.7.0", "jest": "^29.7.0",
"ts-jest": "^29.1.2", "ts-jest": "^29.1.2",
"typescript": "^5.4.5" "typescript": "^5.4.5"
@ -1013,6 +1018,16 @@
"@babel/types": "^7.20.7" "@babel/types": "^7.20.7"
} }
}, },
"node_modules/@types/dotenv": {
"version": "8.2.0",
"resolved": "https://registry.npmjs.org/@types/dotenv/-/dotenv-8.2.0.tgz",
"integrity": "sha512-ylSC9GhfRH7m1EUXBXofhgx4lUWmFeQDINW5oLuS+gxWdfUeW4zJdeVTYVkexEW+e2VUvlZR2kGnGGipAWR7kw==",
"deprecated": "This is a stub types definition. dotenv provides its own type definitions, so you do not need this installed.",
"dev": true,
"dependencies": {
"dotenv": "*"
}
},
"node_modules/@types/graceful-fs": { "node_modules/@types/graceful-fs": {
"version": "4.1.9", "version": "4.1.9",
"resolved": "https://registry.npmjs.org/@types/graceful-fs/-/graceful-fs-4.1.9.tgz", "resolved": "https://registry.npmjs.org/@types/graceful-fs/-/graceful-fs-4.1.9.tgz",
@ -1046,10 +1061,20 @@
"@types/istanbul-lib-report": "*" "@types/istanbul-lib-report": "*"
} }
}, },
"node_modules/@types/jest": {
"version": "29.5.12",
"resolved": "https://registry.npmjs.org/@types/jest/-/jest-29.5.12.tgz",
"integrity": "sha512-eDC8bTvT/QhYdxJAulQikueigY5AsdBRH2yDKW3yveW7svY3+DzN84/2NUgkw10RTiJbWqZrTtoGVdYlvFJdLw==",
"dev": true,
"dependencies": {
"expect": "^29.0.0",
"pretty-format": "^29.0.0"
}
},
"node_modules/@types/node": { "node_modules/@types/node": {
"version": "20.12.7", "version": "20.12.12",
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.7.tgz", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.12.tgz",
"integrity": "sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg==", "integrity": "sha512-eWLDGF/FOSPtAvEqeRAQ4C8LSA7M1I7i0ky1I8U7kD1J5ITyW3AsRhQrKVoWf5pFKZ2kILsEGJhsI9r93PYnOw==",
"dev": true, "dev": true,
"dependencies": { "dependencies": {
"undici-types": "~5.26.4" "undici-types": "~5.26.4"
@ -1061,6 +1086,12 @@
"integrity": "sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==", "integrity": "sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==",
"dev": true "dev": true
}, },
"node_modules/@types/uuid": {
"version": "9.0.8",
"resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.8.tgz",
"integrity": "sha512-jg+97EGIcY9AGHJJRaaPVgetKDsrTgbRjQ5Msgjh/DQKEFl0DtyRr/VCOyD1T2R1MNeWPK/u7JoGhlDZnKBAfA==",
"dev": true
},
"node_modules/@types/yargs": { "node_modules/@types/yargs": {
"version": "17.0.32", "version": "17.0.32",
"resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-17.0.32.tgz", "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-17.0.32.tgz",
@ -1602,6 +1633,17 @@
"node": "^14.15.0 || ^16.10.0 || >=18.0.0" "node": "^14.15.0 || ^16.10.0 || >=18.0.0"
} }
}, },
"node_modules/dotenv": {
"version": "16.4.5",
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz",
"integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==",
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://dotenvx.com"
}
},
"node_modules/electron-to-chromium": { "node_modules/electron-to-chromium": {
"version": "1.4.748", "version": "1.4.748",
"resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.748.tgz", "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.748.tgz",
@ -3641,6 +3683,18 @@
"browserslist": ">= 4.21.0" "browserslist": ">= 4.21.0"
} }
}, },
"node_modules/uuid": {
"version": "9.0.1",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
"funding": [
"https://github.com/sponsors/broofa",
"https://github.com/sponsors/ctavan"
],
"bin": {
"uuid": "dist/bin/uuid"
}
},
"node_modules/v8-to-istanbul": { "node_modules/v8-to-istanbul": {
"version": "9.2.0", "version": "9.2.0",
"resolved": "https://registry.npmjs.org/v8-to-istanbul/-/v8-to-istanbul-9.2.0.tgz", "resolved": "https://registry.npmjs.org/v8-to-istanbul/-/v8-to-istanbul-9.2.0.tgz",

View File

@ -1,6 +1,6 @@
{ {
"name": "@mendable/firecrawl-js", "name": "@mendable/firecrawl-js",
"version": "0.0.21", "version": "0.0.22",
"description": "JavaScript SDK for Firecrawl API", "description": "JavaScript SDK for Firecrawl API",
"main": "build/index.js", "main": "build/index.js",
"types": "types/index.d.ts", "types": "types/index.d.ts",
@ -9,7 +9,7 @@
"build": "tsc", "build": "tsc",
"publish": "npm run build && npm publish --access public", "publish": "npm run build && npm publish --access public",
"publish-beta": "npm run build && npm publish --access public --tag beta", "publish-beta": "npm run build && npm publish --access public --tag beta",
"test": "jest src/**/*.test.ts" "test": "jest src/__tests__/**/*.test.ts"
}, },
"repository": { "repository": {
"type": "git", "type": "git",
@ -19,6 +19,8 @@
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"axios": "^1.6.8", "axios": "^1.6.8",
"dotenv": "^16.4.5",
"uuid": "^9.0.1",
"zod": "^3.23.8", "zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0" "zod-to-json-schema": "^3.23.0"
}, },
@ -29,7 +31,10 @@
"devDependencies": { "devDependencies": {
"@jest/globals": "^29.7.0", "@jest/globals": "^29.7.0",
"@types/axios": "^0.14.0", "@types/axios": "^0.14.0",
"@types/node": "^20.12.7", "@types/dotenv": "^8.2.0",
"@types/jest": "^29.5.12",
"@types/node": "^20.12.12",
"@types/uuid": "^9.0.8",
"jest": "^29.7.0", "jest": "^29.7.0",
"ts-jest": "^29.1.2", "ts-jest": "^29.1.2",
"typescript": "^5.4.5" "typescript": "^5.4.5"

View File

@ -0,0 +1,146 @@
import FirecrawlApp from '../../index';
import { v4 as uuidv4 } from 'uuid';
import dotenv from 'dotenv';
// Load variables from a local .env file (if any) into process.env before reading them.
dotenv.config();

// API key used by the authenticated tests and the base URL every FirecrawlApp instance is pointed at.
const TEST_API_KEY = process.env.TEST_API_KEY;
const API_URL = process.env.API_URL;

// Every test below performs real HTTP requests. Jest's default per-test timeout (5 s) is too
// tight for that, so each network-bound test is given this explicit timeout instead.
const NETWORK_TIMEOUT_MS = 30000;

/**
 * End-to-end tests for the FirecrawlApp SDK.
 *
 * The suite exercises scrape, crawl, crawl-status, search and LLM-extraction calls against the
 * API at API_URL, covering both failure paths (missing/invalid key, blocklisted URLs, reused
 * idempotency key) and successful responses.
 */
describe('FirecrawlApp E2E Tests', () => {
  // --- construction -------------------------------------------------------------------------

  test('should throw error for no API key', () => {
    expect(() => {
      new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
    }).toThrow("No API key provided");
  });

  // --- scrape -------------------------------------------------------------------------------

  test('should throw error for invalid API key on scrape', async () => {
    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
    await expect(invalidApp.scrapeUrl('https://firecrawl.dev')).rejects.toThrow("Request failed with status code 401");
  }, NETWORK_TIMEOUT_MS);

  test('should throw error for blocklisted URL on scrape', async () => {
    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
    const blocklistedUrl = "https://facebook.com/fake-test";
    await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
  }, NETWORK_TIMEOUT_MS);

  test('should return successful response with valid preview token', async () => {
    const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
    const response = await app.scrapeUrl('https://firecrawl.dev');
    expect(response).not.toBeNull();
    expect(response.data.content).toContain("🔥 Firecrawl");
  }, 30000); // 30 seconds timeout

  test('should return successful response for valid scrape', async () => {
    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
    const response = await app.scrapeUrl('https://firecrawl.dev');
    expect(response).not.toBeNull();
    expect(response.data.content).toContain("🔥 Firecrawl");
    expect(response.data).toHaveProperty('markdown');
    expect(response.data).toHaveProperty('metadata');
    // HTML is only expected when explicitly requested via pageOptions.includeHtml.
    expect(response.data).not.toHaveProperty('html');
  }, 30000); // 30 seconds timeout

  test('should return successful response with valid API key and include HTML', async () => {
    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
    const response = await app.scrapeUrl('https://firecrawl.dev', { pageOptions: { includeHtml: true } });
    expect(response).not.toBeNull();
    expect(response.data.content).toContain("🔥 Firecrawl");
    expect(response.data.markdown).toContain("🔥 Firecrawl");
    expect(response.data.html).toContain("<h1");
  }, 30000); // 30 seconds timeout

  test('should return successful response for valid scrape with PDF file', async () => {
    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf');
    expect(response).not.toBeNull();
    expect(response.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
  }, 30000); // 30 seconds timeout

  test('should return successful response for valid scrape with PDF file without explicit extension', async () => {
    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001');
    expect(response).not.toBeNull();
    expect(response.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
  }, 30000); // 30 seconds timeout

  // --- crawl --------------------------------------------------------------------------------

  test('should throw error for invalid API key on crawl', async () => {
    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
    await expect(invalidApp.crawlUrl('https://firecrawl.dev')).rejects.toThrow("Request failed with status code 401");
  }, NETWORK_TIMEOUT_MS);

  test('should throw error for blocklisted URL on crawl', async () => {
    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
    const blocklistedUrl = "https://twitter.com/fake-test";
    await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
  }, NETWORK_TIMEOUT_MS);

  test('should return successful response for crawl and wait for completion', async () => {
    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
    const response = await app.crawlUrl('https://firecrawl.dev', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30);
    expect(response).not.toBeNull();
    expect(response[0].content).toContain("🔥 Firecrawl");
  }, 60000); // 60 seconds timeout

  test('should handle idempotency key for crawl', async () => {
    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
    const uniqueIdempotencyKey = uuidv4();
    // First request with a fresh key is accepted and yields a job id ...
    const response = await app.crawlUrl('https://firecrawl.dev', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey);
    expect(response).not.toBeNull();
    expect(response.jobId).toBeDefined();
    // ... while re-using the same key must be rejected with a 409.
    await expect(app.crawlUrl('https://firecrawl.dev', { crawlerOptions: { excludes: ['blog/*'] } }, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
  }, NETWORK_TIMEOUT_MS);

  test('should check crawl status', async () => {
    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
    const response = await app.crawlUrl('https://firecrawl.dev', { crawlerOptions: { excludes: ['blog/*'] } }, false);
    expect(response).not.toBeNull();
    expect(response.jobId).toBeDefined();

    // Poll until the job reports completion or the deadline passes. A single fixed 30 s sleep
    // inside a 35 s test timeout left almost no room for the HTTP round trips themselves and
    // always cost the full 30 s even when the crawl finished early.
    const pollIntervalMs = 2000;
    const deadline = Date.now() + 50000;
    let statusResponse = await app.checkCrawlStatus(response.jobId);
    while (statusResponse.status !== 'completed' && Date.now() < deadline) {
      await new Promise(resolve => setTimeout(resolve, pollIntervalMs));
      statusResponse = await app.checkCrawlStatus(response.jobId);
    }

    expect(statusResponse).not.toBeNull();
    expect(statusResponse.status).toBe('completed');
    expect(statusResponse.data.length).toBeGreaterThan(0);
  }, 60000); // 60 seconds timeout: 50 s polling budget plus headroom for the requests

  // --- search -------------------------------------------------------------------------------

  test('should return successful response for search', async () => {
    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
    const response = await app.search("test query");
    expect(response).not.toBeNull();
    expect(response.data[0].content).toBeDefined();
    expect(response.data.length).toBeGreaterThan(2);
  }, 30000); // 30 seconds timeout

  test('should throw error for invalid API key on search', async () => {
    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
    await expect(invalidApp.search("test query")).rejects.toThrow("Request failed with status code 401");
  }, NETWORK_TIMEOUT_MS);

  // --- LLM extraction -----------------------------------------------------------------------

  test('should perform LLM extraction', async () => {
    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
    const response = await app.scrapeUrl("https://mendable.ai", {
      extractorOptions: {
        mode: 'llm-extraction',
        extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
        extractionSchema: {
          type: 'object',
          properties: {
            company_mission: { type: 'string' },
            supports_sso: { type: 'boolean' },
            is_open_source: { type: 'boolean' }
          },
          required: ['company_mission', 'supports_sso', 'is_open_source']
        }
      }
    });
    expect(response).not.toBeNull();
    expect(response.data.llm_extraction).toBeDefined();
    // The extracted object should follow the schema requested above.
    const llmExtraction = response.data.llm_extraction;
    expect(llmExtraction.company_mission).toBeDefined();
    expect(typeof llmExtraction.supports_sso).toBe('boolean');
    expect(typeof llmExtraction.is_open_source).toBe('boolean');
  }, 30000); // 30 seconds timeout
});

View File

@ -6,6 +6,7 @@ import { zodToJsonSchema } from "zod-to-json-schema";
*/ */
export interface FirecrawlAppConfig { export interface FirecrawlAppConfig {
apiKey?: string | null; apiKey?: string | null;
apiUrl?: string | null;
} }
/** /**
@ -63,6 +64,7 @@ export interface JobStatusResponse {
*/ */
export default class FirecrawlApp { export default class FirecrawlApp {
private apiKey: string; private apiKey: string;
private apiUrl: string = "https://api.firecrawl.dev";
/** /**
* Initializes a new instance of the FirecrawlApp class. * Initializes a new instance of the FirecrawlApp class.
@ -107,7 +109,7 @@ export default class FirecrawlApp {
} }
try { try {
const response: AxiosResponse = await axios.post( const response: AxiosResponse = await axios.post(
"https://api.firecrawl.dev/v0/scrape", this.apiUrl + "/v0/scrape",
jsonData, jsonData,
{ headers }, { headers },
); );
@ -147,7 +149,7 @@ export default class FirecrawlApp {
} }
try { try {
const response: AxiosResponse = await axios.post( const response: AxiosResponse = await axios.post(
"https://api.firecrawl.dev/v0/search", this.apiUrl + "/v0/search",
jsonData, jsonData,
{ headers } { headers }
); );
@ -173,22 +175,24 @@ export default class FirecrawlApp {
* @param {Params | null} params - Additional parameters for the crawl request. * @param {Params | null} params - Additional parameters for the crawl request.
* @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
* @param {number} timeout - Timeout in seconds for job status checks. * @param {number} timeout - Timeout in seconds for job status checks.
* @param {string} idempotencyKey - Optional idempotency key for the request.
* @returns {Promise<CrawlResponse | any>} The response from the crawl operation. * @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
*/ */
async crawlUrl( async crawlUrl(
url: string, url: string,
params: Params | null = null, params: Params | null = null,
waitUntilDone: boolean = true, waitUntilDone: boolean = true,
timeout: number = 2 timeout: number = 2,
idempotencyKey?: string
): Promise<CrawlResponse | any> { ): Promise<CrawlResponse | any> {
const headers = this.prepareHeaders(); const headers = this.prepareHeaders(idempotencyKey);
let jsonData: Params = { url }; let jsonData: Params = { url };
if (params) { if (params) {
jsonData = { ...jsonData, ...params }; jsonData = { ...jsonData, ...params };
} }
try { try {
const response: AxiosResponse = await this.postRequest( const response: AxiosResponse = await this.postRequest(
"https://api.firecrawl.dev/v0/crawl", this.apiUrl + "/v0/crawl",
jsonData, jsonData,
headers headers
); );
@ -218,7 +222,7 @@ export default class FirecrawlApp {
const headers: AxiosRequestHeaders = this.prepareHeaders(); const headers: AxiosRequestHeaders = this.prepareHeaders();
try { try {
const response: AxiosResponse = await this.getRequest( const response: AxiosResponse = await this.getRequest(
`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, this.apiUrl + `/v0/crawl/status/${jobId}`,
headers headers
); );
if (response.status === 200) { if (response.status === 200) {
@ -240,11 +244,12 @@ export default class FirecrawlApp {
* Prepares the headers for an API request. * Prepares the headers for an API request.
* @returns {AxiosRequestHeaders} The prepared headers. * @returns {AxiosRequestHeaders} The prepared headers.
*/ */
prepareHeaders(): AxiosRequestHeaders { prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
return { return {
"Content-Type": "application/json", 'Content-Type': 'application/json',
Authorization: `Bearer ${this.apiKey}`, 'Authorization': `Bearer ${this.apiKey}`,
} as AxiosRequestHeaders; ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {}),
} as AxiosRequestHeaders & { 'x-idempotency-key'?: string };
} }
/** /**
@ -289,7 +294,7 @@ export default class FirecrawlApp {
): Promise<any> { ): Promise<any> {
while (true) { while (true) {
const statusResponse: AxiosResponse = await this.getRequest( const statusResponse: AxiosResponse = await this.getRequest(
`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, this.apiUrl + `/v0/crawl/status/${jobId}`,
headers headers
); );
if (statusResponse.status === 200) { if (statusResponse.status === 200) {

View File

@ -82,9 +82,10 @@ export default class FirecrawlApp {
* @param {Params | null} params - Additional parameters for the crawl request. * @param {Params | null} params - Additional parameters for the crawl request.
* @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
* @param {number} timeout - Timeout in seconds for job status checks. * @param {number} timeout - Timeout in seconds for job status checks.
* @param {string} idempotencyKey - Optional idempotency key for the request.
* @returns {Promise<CrawlResponse | any>} The response from the crawl operation. * @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
*/ */
crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, timeout?: number): Promise<CrawlResponse | any>; crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, timeout?: number, idempotencyKey?: string): Promise<CrawlResponse | any>;
/** /**
* Checks the status of a crawl job using the Firecrawl API. * Checks the status of a crawl job using the Firecrawl API.
* @param {string} jobId - The job ID of the crawl operation. * @param {string} jobId - The job ID of the crawl operation.
@ -95,7 +96,7 @@ export default class FirecrawlApp {
* Prepares the headers for an API request. * Prepares the headers for an API request.
* @returns {AxiosRequestHeaders} The prepared headers. * @returns {AxiosRequestHeaders} The prepared headers.
*/ */
prepareHeaders(): AxiosRequestHeaders; prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders;
/** /**
* Sends a POST request to the specified URL. * Sends a POST request to the specified URL.
* @param {string} url - The URL to send the request to. * @param {string} url - The URL to send the request to.

View File

@ -11,8 +11,10 @@
"dependencies": { "dependencies": {
"@mendable/firecrawl-js": "^0.0.19", "@mendable/firecrawl-js": "^0.0.19",
"axios": "^1.6.8", "axios": "^1.6.8",
"dotenv": "^16.4.5",
"ts-node": "^10.9.2", "ts-node": "^10.9.2",
"typescript": "^5.4.5", "typescript": "^5.4.5",
"uuid": "^9.0.1",
"zod": "^3.23.8" "zod": "^3.23.8"
}, },
"devDependencies": { "devDependencies": {
@ -530,6 +532,17 @@
"node": ">=0.3.1" "node": ">=0.3.1"
} }
}, },
"node_modules/dotenv": {
"version": "16.4.5",
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz",
"integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==",
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://dotenvx.com"
}
},
"node_modules/esbuild": { "node_modules/esbuild": {
"version": "0.20.2", "version": "0.20.2",
"resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.20.2.tgz", "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.20.2.tgz",
@ -743,6 +756,18 @@
"integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==",
"peer": true "peer": true
}, },
"node_modules/uuid": {
"version": "9.0.1",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
"funding": [
"https://github.com/sponsors/broofa",
"https://github.com/sponsors/ctavan"
],
"bin": {
"uuid": "dist/bin/uuid"
}
},
"node_modules/v8-compile-cache-lib": { "node_modules/v8-compile-cache-lib": {
"version": "3.0.1", "version": "3.0.1",
"resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",

View File

@ -2,12 +2,21 @@ from fastapi import FastAPI
from playwright.async_api import async_playwright, Browser from playwright.async_api import async_playwright, Browser
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from pydantic import BaseModel from pydantic import BaseModel
from os import environ
PROXY_SERVER = environ.get("PROXY_SERVER", None)
PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
PROXY_PASSWORD = environ.get("PROXY_PASSWORD", None)
BLOCK_MEDIA = environ.get("BLOCK_MEDIA", "False").upper() == "TRUE"
app = FastAPI() app = FastAPI()
class UrlModel(BaseModel): class UrlModel(BaseModel):
url: str url: str
wait: int = None wait: int = None
wait_until: str = "load"
headers: dict = None
browser: Browser = None browser: Browser = None
@ -27,11 +36,39 @@ async def shutdown_event():
@app.post("/html") @app.post("/html")
async def root(body: UrlModel): async def root(body: UrlModel):
context = await browser.new_context() context = None
if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
context = await browser.new_context(
proxy={
"server": PROXY_SERVER,
"username": PROXY_USERNAME,
"password": PROXY_PASSWORD,
}
)
else:
context = await browser.new_context()
if BLOCK_MEDIA:
await context.route(
"**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}",
handler=lambda route, request: route.abort(),
)
page = await context.new_page() page = await context.new_page()
await page.goto(body.url, timeout=15000) # Set max timeout to 15s
# Set headers if provided
if body.headers:
await page.set_extra_http_headers(body.headers)
await page.goto(
body.url,
timeout=15000,
wait_until=body.wait_until if body.wait_until else "load",
) # Set max timeout to 15s
if body.wait: # Check if wait parameter is provided in the request body if body.wait: # Check if wait parameter is provided in the request body
await page.wait_for_timeout(body.wait) # Convert seconds to milliseconds for playwright await page.wait_for_timeout(
body.wait
) # Convert seconds to milliseconds for playwright
page_content = await page.content() page_content = await page.content()
await context.close() await context.close()
json_compatible_item_data = {"content": page_content} json_compatible_item_data = {"content": page_content}

View File

@ -0,0 +1,2 @@
[FORMAT]
max-line-length = 120

View File

@ -117,6 +117,25 @@ status = app.check_crawl_status(job_id)
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
## Running the Tests with Pytest
To verify that the Firecrawl Python SDK works as expected, the repository includes end-to-end tests written with `pytest`. These tests cover the main features of the SDK, including URL scraping, web searching, and website crawling.
### Running the Tests
To run the tests, execute the following commands:
Install pytest:
```bash
pip install pytest
```
Run:
```bash
pytest firecrawl/__tests__/e2e_withAuth/test.py
```
## Contributing ## Contributing
Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.

View File

@ -1,24 +1,57 @@
"""
FirecrawlApp Module
This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
and check the status of these jobs. The module uses requests for HTTP communication
and handles retries for certain HTTP status codes.
Classes:
- FirecrawlApp: Main class for interacting with the Firecrawl API.
"""
import os import os
from typing import Any, Dict, Optional
import requests
import time import time
from typing import Any, Dict, Optional
import requests
class FirecrawlApp: class FirecrawlApp:
def __init__(self, api_key=None): """
Initialize the FirecrawlApp instance.
Args:
api_key (Optional[str]): API key for authenticating with the Firecrawl API.
api_url (Optional[str]): Base URL for the Firecrawl API.
"""
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
if self.api_key is None: if self.api_key is None:
raise ValueError('No API key provided') raise ValueError('No API key provided')
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
"""
Scrape the specified URL using the Firecrawl API.
Args:
url (str): The URL to scrape.
params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
Returns:
Any: The scraped data if the request is successful.
Raises:
Exception: If the scrape request fails.
"""
headers = { headers = {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}' 'Authorization': f'Bearer {self.api_key}'
} }
# Prepare the base scrape parameters with the URL # Prepare the base scrape parameters with the URL
scrape_params = {'url': url} scrape_params = {'url': url}
# If there are additional params, process them # If there are additional params, process them
if params: if params:
# Initialize extractorOptions if present # Initialize extractorOptions if present
@ -31,30 +64,43 @@ class FirecrawlApp:
extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction') extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
# Update the scrape_params with the processed extractorOptions # Update the scrape_params with the processed extractorOptions
scrape_params['extractorOptions'] = extractor_options scrape_params['extractorOptions'] = extractor_options
# Include any other params directly at the top level of scrape_params # Include any other params directly at the top level of scrape_params
for key, value in params.items(): for key, value in params.items():
if key != 'extractorOptions': if key != 'extractorOptions':
scrape_params[key] = value scrape_params[key] = value
# Make the POST request with the prepared headers and JSON data # Make the POST request with the prepared headers and JSON data
response = requests.post( response = requests.post(
'https://api.firecrawl.dev/v0/scrape', f'{self.api_url}/v0/scrape',
headers=headers, headers=headers,
json=scrape_params json=scrape_params,
) )
if response.status_code == 200: if response.status_code == 200:
response = response.json() response = response.json()
if response['success']: if response['success'] and 'data' in response:
return response['data'] return response['data']
else: else:
raise Exception(f'Failed to scrape URL. Error: {response["error"]}') raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
elif response.status_code in [402, 409, 500]: elif response.status_code in [402, 408, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred') error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
else: else:
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}') raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
def search(self, query, params=None): def search(self, query, params=None):
"""
Perform a search using the Firecrawl API.
Args:
query (str): The search query.
params (Optional[Dict[str, Any]]): Additional parameters for the search request.
Returns:
Any: The search results if the request is successful.
Raises:
Exception: If the search request fails.
"""
headers = { headers = {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}' 'Authorization': f'Bearer {self.api_key}'
@ -63,29 +109,46 @@ class FirecrawlApp:
if params: if params:
json_data.update(params) json_data.update(params)
response = requests.post( response = requests.post(
'https://api.firecrawl.dev/v0/search', f'{self.api_url}/v0/search',
headers=headers, headers=headers,
json=json_data json=json_data
) )
if response.status_code == 200: if response.status_code == 200:
response = response.json() response = response.json()
if response['success'] == True:
if response['success'] and 'data' in response:
return response['data'] return response['data']
else: else:
raise Exception(f'Failed to search. Error: {response["error"]}') raise Exception(f'Failed to search. Error: {response["error"]}')
elif response.status_code in [402, 409, 500]: elif response.status_code in [402, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred') error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}') raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
else: else:
raise Exception(f'Failed to search. Status code: {response.status_code}') raise Exception(f'Failed to search. Status code: {response.status_code}')
def crawl_url(self, url, params=None, wait_until_done=True, timeout=2): def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None):
headers = self._prepare_headers() """
Initiate a crawl job for the specified URL using the Firecrawl API.
Args:
url (str): The URL to crawl.
params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
wait_until_done (bool): Whether to wait until the crawl job is completed.
timeout (int): Timeout between status checks when waiting for job completion.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
Returns:
Any: The crawl job ID or the crawl results if waiting until completion.
Raises:
Exception: If the crawl job initiation or monitoring fails.
"""
headers = self._prepare_headers(idempotency_key)
json_data = {'url': url} json_data = {'url': url}
if params: if params:
json_data.update(params) json_data.update(params)
response = self._post_request('https://api.firecrawl.dev/v0/crawl', json_data, headers) response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
if response.status_code == 200: if response.status_code == 200:
job_id = response.json().get('jobId') job_id = response.json().get('jobId')
if wait_until_done: if wait_until_done:
@ -96,20 +159,64 @@ class FirecrawlApp:
self._handle_error(response, 'start crawl job') self._handle_error(response, 'start crawl job')
def check_crawl_status(self, job_id): def check_crawl_status(self, job_id):
"""
Check the status of a crawl job using the Firecrawl API.
Args:
job_id (str): The ID of the crawl job.
Returns:
Any: The status of the crawl job.
Raises:
Exception: If the status check request fails.
"""
headers = self._prepare_headers() headers = self._prepare_headers()
response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
if response.status_code == 200: if response.status_code == 200:
return response.json() return response.json()
else: else:
self._handle_error(response, 'check crawl status') self._handle_error(response, 'check crawl status')
def _prepare_headers(self): def _prepare_headers(self, idempotency_key=None):
"""
Prepare the headers for API requests.
Args:
idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
Returns:
Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
"""
if idempotency_key:
return {
'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}',
'x-idempotency-key': idempotency_key
}
return { return {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}' 'Authorization': f'Bearer {self.api_key}',
} }
def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5): def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
"""
Make a POST request with retries.
Args:
url (str): The URL to send the POST request to.
data (Dict[str, Any]): The JSON data to include in the POST request.
headers (Dict[str, str]): The headers to include in the POST request.
retries (int): Number of retries for the request.
backoff_factor (float): Backoff factor for retries.
Returns:
requests.Response: The response from the POST request.
Raises:
requests.RequestException: If the request fails after the specified retries.
"""
for attempt in range(retries): for attempt in range(retries):
response = requests.post(url, headers=headers, json=data) response = requests.post(url, headers=headers, json=data)
if response.status_code == 502: if response.status_code == 502:
@ -119,6 +226,21 @@ class FirecrawlApp:
return response return response
def _get_request(self, url, headers, retries=3, backoff_factor=0.5): def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
"""
Make a GET request with retries.
Args:
url (str): The URL to send the GET request to.
headers (Dict[str, str]): The headers to include in the GET request.
retries (int): Number of retries for the request.
backoff_factor (float): Backoff factor for retries.
Returns:
requests.Response: The response from the GET request.
Raises:
requests.RequestException: If the request fails after the specified retries.
"""
for attempt in range(retries): for attempt in range(retries):
response = requests.get(url, headers=headers) response = requests.get(url, headers=headers)
if response.status_code == 502: if response.status_code == 502:
@ -128,9 +250,22 @@ class FirecrawlApp:
return response return response
def _monitor_job_status(self, job_id, headers, timeout): def _monitor_job_status(self, job_id, headers, timeout):
import time """
Monitor the status of a crawl job until completion.
Args:
job_id (str): The ID of the crawl job.
headers (Dict[str, str]): The headers to include in the status check requests.
timeout (int): Timeout between status checks.
Returns:
Any: The crawl results if the job is completed successfully.
Raises:
Exception: If the job fails or an error occurs during status checks.
"""
while True: while True:
status_response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
if status_response.status_code == 200: if status_response.status_code == 200:
status_data = status_response.json() status_data = status_response.json()
if status_data['status'] == 'completed': if status_data['status'] == 'completed':
@ -138,9 +273,8 @@ class FirecrawlApp:
return status_data['data'] return status_data['data']
else: else:
raise Exception('Crawl job completed but no data was returned') raise Exception('Crawl job completed but no data was returned')
elif status_data['status'] in ['active', 'paused', 'pending', 'queued']: elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']:
if timeout < 2: timeout=max(timeout,2)
timeout = 2
time.sleep(timeout) # Wait for the specified timeout before checking again time.sleep(timeout) # Wait for the specified timeout before checking again
else: else:
raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}') raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
@ -148,7 +282,17 @@ class FirecrawlApp:
self._handle_error(status_response, 'check crawl status') self._handle_error(status_response, 'check crawl status')
def _handle_error(self, response, action): def _handle_error(self, response, action):
if response.status_code in [402, 409, 500]: """
Handle errors from API responses.
Args:
response (requests.Response): The response object from the API request.
action (str): Description of the action that was being performed.
Raises:
Exception: An exception with a message containing the status code and error details from the response.
"""
if response.status_code in [402, 408, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred') error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
else: else:

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,4 +1,5 @@
from firecrawl import FirecrawlApp import uuid
from firecrawl.firecrawl import FirecrawlApp
app = FirecrawlApp(api_key="fc-YOUR_API_KEY") app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
@ -7,7 +8,8 @@ scrape_result = app.scrape_url('firecrawl.dev')
print(scrape_result['markdown']) print(scrape_result['markdown'])
# Crawl a website: # Crawl a website:
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}) idempotency_key = str(uuid.uuid4()) # optional idempotency key
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
print(crawl_result) print(crawl_result)
# LLM Extraction: # LLM Extraction:

View File

@ -0,0 +1,3 @@
API_URL=http://localhost:3002
ABSOLUTE_FIRECRAWL_PATH=/Users/user/firecrawl/apps/python-sdk/firecrawl/firecrawl.py
TEST_API_KEY=fc-YOUR_API_KEY

View File

@ -0,0 +1,168 @@
import importlib.util
import pytest
import time
import os
from uuid import uuid4
from dotenv import load_dotenv
load_dotenv()
# Test configuration. The endpoint and the SDK location can be overridden
# through the environment (a .env file is loaded by load_dotenv() above); the
# fallbacks are the values that used to be hard-coded here.
API_URL = os.getenv('API_URL', "http://127.0.0.1:3002")
ABSOLUTE_FIRECRAWL_PATH = os.getenv('ABSOLUTE_FIRECRAWL_PATH', "firecrawl/firecrawl.py")
TEST_API_KEY = os.getenv('TEST_API_KEY')

print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")

# Load the SDK module directly from its file path so the tests exercise that
# exact file, then expose its FirecrawlApp class to the tests below.
spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
firecrawl = importlib.util.module_from_spec(spec)
spec.loader.exec_module(firecrawl)
FirecrawlApp = firecrawl.FirecrawlApp
def test_no_api_key(monkeypatch):
    """Constructing FirecrawlApp without any API key must raise."""
    # FirecrawlApp falls back to the FIRECRAWL_API_KEY environment variable when
    # no key is passed, so clear it for this test; otherwise a key present in
    # the environment would let the constructor succeed and the test would fail.
    monkeypatch.delenv('FIRECRAWL_API_KEY', raising=False)
    with pytest.raises(Exception) as excinfo:
        FirecrawlApp(api_url=API_URL)
    assert "No API key provided" in str(excinfo.value)
def test_scrape_url_invalid_api_key():
    """Scraping with an invalid API key must fail with a 401 error message."""
    client = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
    with pytest.raises(Exception) as raised:
        client.scrape_url('https://firecrawl.dev')
    message = str(raised.value)
    assert "Failed to scrape URL. Status code: 401" in message
def test_blocklisted_url():
    """Scraping a blocklisted URL must fail with a 403 error message."""
    client = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
    target = "https://facebook.com/fake-test"
    with pytest.raises(Exception) as raised:
        client.scrape_url(target)
    message = str(raised.value)
    assert "Failed to scrape URL. Status code: 403" in message
def test_successful_response_with_valid_preview_token():
    """A scrape using the preview token returns the page content."""
    client = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL)
    result = client.scrape_url('https://firecrawl.dev')
    assert result is not None
    assert 'content' in result
    page_text = result['content']
    assert "🔥 Firecrawl" in page_text
def test_scrape_url_e2e():
    """A default scrape returns content, markdown and metadata, but no html."""
    client = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
    result = client.scrape_url('https://firecrawl.dev')
    assert result is not None
    for expected_key in ('content', 'markdown', 'metadata'):
        assert expected_key in result
    # No includeHtml option was passed, so no 'html' entry is expected.
    assert 'html' not in result
    assert "🔥 Firecrawl" in result['content']
def test_successful_response_with_valid_api_key_and_include_html():
    """A scrape with includeHtml enabled additionally returns the page html."""
    client = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
    scrape_options = {'pageOptions': {'includeHtml': True}}
    result = client.scrape_url('https://firecrawl.dev', scrape_options)
    assert result is not None
    for expected_key in ('content', 'markdown', 'html', 'metadata'):
        assert expected_key in result
    for text_key in ('content', 'markdown'):
        assert "🔥 Firecrawl" in result[text_key]
    assert "<h1" in result['html']
def test_successful_response_for_valid_scrape_with_pdf_file():
    """Scraping a URL ending in .pdf returns the text of the document."""
    client = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
    pdf_url = 'https://arxiv.org/pdf/astro-ph/9301001.pdf'
    result = client.scrape_url(pdf_url)
    assert result is not None
    for expected_key in ('content', 'metadata'):
        assert expected_key in result
    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in result['content']
def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
    """Scraping a PDF URL that has no .pdf suffix still returns the document text."""
    client = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
    pdf_url = 'https://arxiv.org/pdf/astro-ph/9301001'
    result = client.scrape_url(pdf_url)
    # NOTE(review): the scrape call has already returned at this point, so this
    # pause does not look necessary for the assertions below - confirm before removing.
    time.sleep(6)
    assert result is not None
    for expected_key in ('content', 'metadata'):
        assert expected_key in result
    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in result['content']
def test_crawl_url_invalid_api_key():
    """Starting a crawl with an invalid API key must fail with a 401 error message."""
    client = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
    with pytest.raises(Exception) as raised:
        client.crawl_url('https://firecrawl.dev')
    message = str(raised.value)
    assert "Unexpected error occurred while trying to start crawl job. Status code: 401" in message
def test_should_return_error_for_blocklisted_url():
    """Starting a crawl on a blocklisted URL must fail with a 403 error message."""
    client = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
    target = "https://twitter.com/fake-test"
    with pytest.raises(Exception) as raised:
        client.crawl_url(target)
    message = str(raised.value)
    assert "Unexpected error occurred while trying to start crawl job. Status code: 403" in message
def test_crawl_url_wait_for_completion_e2e():
    """A crawl that waits for completion returns a non-empty list of pages."""
    client = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
    crawl_params = {'crawlerOptions': {'excludes': ['blog/*']}}
    pages = client.crawl_url('https://firecrawl.dev', crawl_params, True)
    assert pages is not None
    assert len(pages) > 0
    first_page = pages[0]
    assert 'content' in first_page
    assert "🔥 Firecrawl" in first_page['content']
def test_crawl_url_with_idempotency_key_e2e():
    """A crawl with a fresh idempotency key succeeds; reusing the key fails with a 409."""
    client = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
    idempotency_key = str(uuid4())
    crawl_params = {'crawlerOptions': {'excludes': ['blog/*']}}

    # First request with the key: expected to run to completion and return pages.
    pages = client.crawl_url('https://firecrawl.dev', crawl_params, True, 2, idempotency_key)
    assert pages is not None
    assert len(pages) > 0
    first_page = pages[0]
    assert 'content' in first_page
    assert "🔥 Firecrawl" in first_page['content']

    # Second request with the same key: expected to be rejected.
    with pytest.raises(Exception) as raised:
        client.crawl_url('https://firecrawl.dev', crawl_params, True, 2, idempotency_key)
    message = str(raised.value)
    assert "Failed to start crawl job. Status code: 409. Error: Idempotency key already used" in message
def test_check_crawl_status_e2e():
    """A crawl started without waiting can be polled until it reports completion with data."""
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
    assert response is not None
    assert 'jobId' in response

    # Poll instead of sleeping a fixed 30 seconds: the test finishes as soon as
    # the job completes, and still allows up to 30 seconds for it to do so.
    deadline = time.monotonic() + 30
    while True:
        status_response = app.check_crawl_status(response['jobId'])
        assert status_response is not None
        assert 'status' in status_response
        if status_response['status'] == 'completed' or time.monotonic() >= deadline:
            break
        time.sleep(2)

    assert status_response['status'] == 'completed'
    assert 'data' in status_response
    assert len(status_response['data']) > 0
def test_search_e2e():
    """A search returns more than two results and the first one carries content."""
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.search("test query")
    assert response is not None
    # Check the size before indexing so that an empty result list fails on this
    # assertion instead of raising IndexError on response[0].
    assert len(response) > 2
    assert 'content' in response[0]
def test_search_invalid_api_key():
    """Searching with an invalid API key must fail with a 401 error message."""
    client = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
    with pytest.raises(Exception) as raised:
        client.search("test query")
    message = str(raised.value)
    assert "Failed to search. Status code: 401" in message
def test_llm_extraction():
    """An LLM-extraction scrape returns the fields requested by the extraction schema."""
    client = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)

    # JSON schema describing the structured data requested from the page.
    extraction_schema = {
        'type': 'object',
        'properties': {
            'company_mission': {'type': 'string'},
            'supports_sso': {'type': 'boolean'},
            'is_open_source': {'type': 'boolean'}
        },
        'required': ['company_mission', 'supports_sso', 'is_open_source']
    }
    extractor_options = {
        'mode': 'llm-extraction',
        'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
        'extractionSchema': extraction_schema
    }

    result = client.scrape_url("https://mendable.ai", {'extractorOptions': extractor_options})
    assert result is not None
    assert 'llm_extraction' in result

    extracted = result['llm_extraction']
    assert 'company_mission' in extracted
    for flag_name in ('supports_sso', 'is_open_source'):
        assert isinstance(extracted[flag_name], bool)

View File

@ -1,24 +1,57 @@
"""
FirecrawlApp Module
This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
and check the status of these jobs. The module uses requests for HTTP communication
and handles retries for certain HTTP status codes.
Classes:
- FirecrawlApp: Main class for interacting with the Firecrawl API.
"""
import os import os
from typing import Any, Dict, Optional
import requests
import time import time
from typing import Any, Dict, Optional
import requests
class FirecrawlApp: class FirecrawlApp:
def __init__(self, api_key=None): """
Initialize the FirecrawlApp instance.
Args:
api_key (Optional[str]): API key for authenticating with the Firecrawl API.
api_url (Optional[str]): Base URL for the Firecrawl API.
"""
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
if self.api_key is None: if self.api_key is None:
raise ValueError('No API key provided') raise ValueError('No API key provided')
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
"""
Scrape the specified URL using the Firecrawl API.
Args:
url (str): The URL to scrape.
params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
Returns:
Any: The scraped data if the request is successful.
Raises:
Exception: If the scrape request fails.
"""
headers = { headers = {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}' 'Authorization': f'Bearer {self.api_key}'
} }
# Prepare the base scrape parameters with the URL # Prepare the base scrape parameters with the URL
scrape_params = {'url': url} scrape_params = {'url': url}
# If there are additional params, process them # If there are additional params, process them
if params: if params:
# Initialize extractorOptions if present # Initialize extractorOptions if present
@ -31,30 +64,43 @@ class FirecrawlApp:
extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction') extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
# Update the scrape_params with the processed extractorOptions # Update the scrape_params with the processed extractorOptions
scrape_params['extractorOptions'] = extractor_options scrape_params['extractorOptions'] = extractor_options
# Include any other params directly at the top level of scrape_params # Include any other params directly at the top level of scrape_params
for key, value in params.items(): for key, value in params.items():
if key != 'extractorOptions': if key != 'extractorOptions':
scrape_params[key] = value scrape_params[key] = value
# Make the POST request with the prepared headers and JSON data # Make the POST request with the prepared headers and JSON data
response = requests.post( response = requests.post(
'https://api.firecrawl.dev/v0/scrape', f'{self.api_url}/v0/scrape',
headers=headers, headers=headers,
json=scrape_params json=scrape_params,
) )
if response.status_code == 200: if response.status_code == 200:
response = response.json() response = response.json()
if response['success']: if response['success'] and 'data' in response:
return response['data'] return response['data']
else: else:
raise Exception(f'Failed to scrape URL. Error: {response["error"]}') raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
elif response.status_code in [402, 409, 500]: elif response.status_code in [402, 408, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred') error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
else: else:
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}') raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
def search(self, query, params=None): def search(self, query, params=None):
"""
Perform a search using the Firecrawl API.
Args:
query (str): The search query.
params (Optional[Dict[str, Any]]): Additional parameters for the search request.
Returns:
Any: The search results if the request is successful.
Raises:
Exception: If the search request fails.
"""
headers = { headers = {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}' 'Authorization': f'Bearer {self.api_key}'
@ -63,29 +109,46 @@ class FirecrawlApp:
if params: if params:
json_data.update(params) json_data.update(params)
response = requests.post( response = requests.post(
'https://api.firecrawl.dev/v0/search', f'{self.api_url}/v0/search',
headers=headers, headers=headers,
json=json_data json=json_data
) )
if response.status_code == 200: if response.status_code == 200:
response = response.json() response = response.json()
if response['success'] == True:
if response['success'] and 'data' in response:
return response['data'] return response['data']
else: else:
raise Exception(f'Failed to search. Error: {response["error"]}') raise Exception(f'Failed to search. Error: {response["error"]}')
elif response.status_code in [402, 409, 500]: elif response.status_code in [402, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred') error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}') raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
else: else:
raise Exception(f'Failed to search. Status code: {response.status_code}') raise Exception(f'Failed to search. Status code: {response.status_code}')
def crawl_url(self, url, params=None, wait_until_done=True, timeout=2): def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None):
headers = self._prepare_headers() """
Initiate a crawl job for the specified URL using the Firecrawl API.
Args:
url (str): The URL to crawl.
params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
wait_until_done (bool): Whether to wait until the crawl job is completed.
timeout (int): Timeout between status checks when waiting for job completion.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
Returns:
Any: The crawl job ID or the crawl results if waiting until completion.
Raises:
Exception: If the crawl job initiation or monitoring fails.
"""
headers = self._prepare_headers(idempotency_key)
json_data = {'url': url} json_data = {'url': url}
if params: if params:
json_data.update(params) json_data.update(params)
response = self._post_request('https://api.firecrawl.dev/v0/crawl', json_data, headers) response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
if response.status_code == 200: if response.status_code == 200:
job_id = response.json().get('jobId') job_id = response.json().get('jobId')
if wait_until_done: if wait_until_done:
@ -96,20 +159,64 @@ class FirecrawlApp:
self._handle_error(response, 'start crawl job') self._handle_error(response, 'start crawl job')
def check_crawl_status(self, job_id): def check_crawl_status(self, job_id):
"""
Check the status of a crawl job using the Firecrawl API.
Args:
job_id (str): The ID of the crawl job.
Returns:
Any: The status of the crawl job.
Raises:
Exception: If the status check request fails.
"""
headers = self._prepare_headers() headers = self._prepare_headers()
response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
if response.status_code == 200: if response.status_code == 200:
return response.json() return response.json()
else: else:
self._handle_error(response, 'check crawl status') self._handle_error(response, 'check crawl status')
def _prepare_headers(self): def _prepare_headers(self, idempotency_key=None):
"""
Prepare the headers for API requests.
Args:
idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
Returns:
Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
"""
if idempotency_key:
return {
'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}',
'x-idempotency-key': idempotency_key
}
return { return {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}' 'Authorization': f'Bearer {self.api_key}',
} }
def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5): def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
"""
Make a POST request with retries.
Args:
url (str): The URL to send the POST request to.
data (Dict[str, Any]): The JSON data to include in the POST request.
headers (Dict[str, str]): The headers to include in the POST request.
retries (int): Number of retries for the request.
backoff_factor (float): Backoff factor for retries.
Returns:
requests.Response: The response from the POST request.
Raises:
requests.RequestException: If the request fails after the specified retries.
"""
for attempt in range(retries): for attempt in range(retries):
response = requests.post(url, headers=headers, json=data) response = requests.post(url, headers=headers, json=data)
if response.status_code == 502: if response.status_code == 502:
@ -119,6 +226,21 @@ class FirecrawlApp:
return response return response
def _get_request(self, url, headers, retries=3, backoff_factor=0.5): def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
"""
Make a GET request with retries.
Args:
url (str): The URL to send the GET request to.
headers (Dict[str, str]): The headers to include in the GET request.
retries (int): Number of retries for the request.
backoff_factor (float): Backoff factor for retries.
Returns:
requests.Response: The response from the GET request.
Raises:
requests.RequestException: If the request fails after the specified retries.
"""
for attempt in range(retries): for attempt in range(retries):
response = requests.get(url, headers=headers) response = requests.get(url, headers=headers)
if response.status_code == 502: if response.status_code == 502:
@ -128,9 +250,22 @@ class FirecrawlApp:
return response return response
def _monitor_job_status(self, job_id, headers, timeout): def _monitor_job_status(self, job_id, headers, timeout):
import time """
Monitor the status of a crawl job until completion.
Args:
job_id (str): The ID of the crawl job.
headers (Dict[str, str]): The headers to include in the status check requests.
timeout (int): Timeout between status checks.
Returns:
Any: The crawl results if the job is completed successfully.
Raises:
Exception: If the job fails or an error occurs during status checks.
"""
while True: while True:
status_response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
if status_response.status_code == 200: if status_response.status_code == 200:
status_data = status_response.json() status_data = status_response.json()
if status_data['status'] == 'completed': if status_data['status'] == 'completed':
@ -138,9 +273,8 @@ class FirecrawlApp:
return status_data['data'] return status_data['data']
else: else:
raise Exception('Crawl job completed but no data was returned') raise Exception('Crawl job completed but no data was returned')
elif status_data['status'] in ['active', 'paused', 'pending', 'queued']: elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']:
if timeout < 2: timeout=max(timeout,2)
timeout = 2
time.sleep(timeout) # Wait for the specified timeout before checking again time.sleep(timeout) # Wait for the specified timeout before checking again
else: else:
raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}') raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
@ -148,7 +282,17 @@ class FirecrawlApp:
self._handle_error(status_response, 'check crawl status') self._handle_error(status_response, 'check crawl status')
def _handle_error(self, response, action): def _handle_error(self, response, action):
if response.status_code in [402, 409, 500]: """
Handle errors from API responses.
Args:
response (requests.Response): The response object from the API request.
action (str): Description of the action that was being performed.
Raises:
Exception: An exception with a message containing the status code and error details from the response.
"""
if response.status_code in [402, 408, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred') error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
else: else:

View File

@ -1,7 +1,179 @@
Metadata-Version: 2.1 Metadata-Version: 2.1
Name: firecrawl-py Name: firecrawl-py
Version: 0.0.8 Version: 0.0.12
Summary: Python SDK for Firecrawl API Summary: Python SDK for Firecrawl API
Home-page: https://github.com/mendableai/firecrawl Home-page: https://github.com/mendableai/firecrawl
Author: Mendable.ai Author: Mendable.ai
Author-email: nick@mendable.ai Author-email: nick@mendable.ai
License: GNU General Public License v3 (GPLv3)
Project-URL: Documentation, https://docs.firecrawl.dev
Project-URL: Source, https://github.com/mendableai/firecrawl
Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues
Keywords: SDK API firecrawl
Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Web Environment
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Natural Language :: English
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Topic :: Internet
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
Classifier: Topic :: Software Development
Classifier: Topic :: Software Development :: Libraries
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Text Processing
Classifier: Topic :: Text Processing :: Indexing
Requires-Python: >=3.8
Description-Content-Type: text/markdown
# Firecrawl Python SDK
The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
## Installation
To install the Firecrawl Python SDK, you can use pip:
```bash
pip install firecrawl-py
```
## Usage
1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
Here's an example of how to use the SDK:
```python
from firecrawl import FirecrawlApp
# Initialize the FirecrawlApp with your API key
app = FirecrawlApp(api_key='your_api_key')
# Scrape a single URL
url = 'https://mendable.ai'
scraped_data = app.scrape_url(url)
# Crawl a website
crawl_url = 'https://mendable.ai'
params = {
'pageOptions': {
'onlyMainContent': True
}
}
crawl_result = app.crawl_url(crawl_url, params=params)
```
### Scraping a URL
To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
```python
url = 'https://example.com'
scraped_data = app.scrape_url(url)
```
### Extracting structured data from a URL
With LLM extraction, you can easily extract structured data from any URL. We also support pydantic schemas to make this easier. Here is how to use it:
```python
class ArticleSchema(BaseModel):
title: str
points: int
by: str
commentsURL: str
class TopArticlesSchema(BaseModel):
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
data = app.scrape_url('https://news.ycombinator.com', {
'extractorOptions': {
'extractionSchema': TopArticlesSchema.model_json_schema(),
'mode': 'llm-extraction'
},
'pageOptions':{
'onlyMainContent': True
}
})
print(data["llm_extraction"])
```
### Search for a query
Use the `search` method to search the web, get the most relevant results, scrape each page, and return the markdown.
```python
query = 'what is mendable?'
search_result = app.search(query)
```
### Crawling a Website
To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job until it completes, waiting `timeout` seconds between status checks (values below 2 are treated as 2). If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.
```python
crawl_url = 'https://example.com'
params = {
'crawlerOptions': {
'excludes': ['blog/*'],
'includes': [], # leave empty for all pages
'limit': 1000,
},
'pageOptions': {
'onlyMainContent': True
}
}
crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5)
```
If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
### Checking Crawl Status
To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
```python
job_id = crawl_result['jobId']
status = app.check_crawl_status(job_id)
```
## Error Handling
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
## Running the Tests with Pytest
To ensure the functionality of the Firecrawl Python SDK, we have included end-to-end tests using `pytest`. These tests cover various aspects of the SDK, including URL scraping, web searching, and website crawling.
### Running the Tests
To run the tests, execute the following commands:
Install pytest:
```bash
pip install pytest
```
Run:
```bash
pytest firecrawl/__tests__/e2e_withAuth/test.py
```
## Contributing
Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
## License
The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).

View File

@ -1 +1,3 @@
requests requests
pytest
python-dotenv

View File

@ -0,0 +1,3 @@
requests
pytest
python-dotenv

View File

@ -1,14 +1,52 @@
from setuptools import setup, find_packages from pathlib import Path
from setuptools import find_packages, setup
this_directory = Path(__file__).parent
long_description_content = (this_directory / "README.md").read_text()
setup( setup(
name='firecrawl-py', name="firecrawl-py",
version='0.0.8', version="0.0.12",
url='https://github.com/mendableai/firecrawl', url="https://github.com/mendableai/firecrawl",
author='Mendable.ai', author="Mendable.ai",
author_email='nick@mendable.ai', author_email="nick@mendable.ai",
description='Python SDK for Firecrawl API', description="Python SDK for Firecrawl API",
long_description=long_description_content,
long_description_content_type="text/markdown",
packages=find_packages(), packages=find_packages(),
install_requires=[ install_requires=[
'requests', 'requests',
'pytest',
'python-dotenv',
], ],
python_requires='>=3.8',
classifiers=[
"Development Status :: 5 - Production/Stable",
"Environment :: Web Environment",
"Intended Audience :: Developers",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Topic :: Internet",
"Topic :: Internet :: WWW/HTTP",
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
"Topic :: Software Development",
"Topic :: Software Development :: Libraries",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Text Processing",
"Topic :: Text Processing :: Indexing",
],
keywords="SDK API firecrawl",
project_urls={
"Documentation": "https://docs.firecrawl.dev",
"Source": "https://github.com/mendableai/firecrawl",
"Tracker": "https://github.com/mendableai/firecrawl/issues",
},
license="GNU General Public License v3 (GPLv3)",
) )

View File

@ -0,0 +1,178 @@
[
{
"website": "https://www.vellum.ai/llm-leaderboard",
"expected_min_num_of_pages": 1,
"expected_crawled_pages": ["https://www.vellum.ai/llm-leaderboard"]
},
{
"website": "https://openai.com/news",
"expected_min_num_of_pages": 4,
"expected_crawled_pages": [
"https://openai.com/news/company/",
"https://openai.com/news/research/",
"https://openai.com/news/safety-and-alignment/",
"https://openai.com/news/stories/"
]
},
{
"website": "https://www.framer.com/pricing",
"expected_min_num_of_pages": 1,
"expected_not_crawled_pages": [
"https://www.framer.com/features/navigation/",
"https://www.framer.com/contact/",
"https://www.framer.com/add-ons/",
"https://www.framer.com/free-saas-ui-kit/",
"https://www.framer.com/help/",
"https://www.framer.com/features/effects/",
"https://www.framer.com/enterprise/",
"https://www.framer.com/templates/"
]
},
{
"website": "https://mendable.ai/pricing",
"expected_min_num_of_pages": 1,
"expected_not_crawled_pages": [
"https://mendable.ai/",
"https://mendable.ai/blog",
"https://mendable.ai/signin",
"https://mendable.ai/signup",
"https://mendable.ai",
"https://mendable.ai/usecases/sales-enablement",
"https://mendable.ai/usecases/documentation",
"https://mendable.ai/usecases/cs-enablement",
"https://mendable.ai/usecases/productcopilot",
"https://mendable.ai/security"
],
"notes": "This one should not go backwards, but it does!"
},
{
"website": "https://agentops.ai/blog",
"expected_min_num_of_pages": 6,
"expected_crawled_pages": [
"https://www.agentops.ai/blog/effortless-hr-management-with-saas",
"https://www.agentops.ai/blog/streamlining-hr-with-saas",
"https://www.agentops.ai/blog/simplify-hr-with-modern-saas-solutions",
"https://www.agentops.ai/blog/efficient-hr-operations-with-saas",
"https://www.agentops.ai/blog/hr-made-simple-with-saas",
"https://agentops.ai/blog"
],
"expected_not_crawled_pages": [
"https://agentops.ai/about-us",
"https://agentops.ai/contact-us"
]
},
{
"website": "https://en.wikipedia.org/wiki/T._N._Seshan",
"expected_min_num_of_pages": 1,
"expected_not_crawled_pages": [
"https://en.wikipedia.org/wiki/Wikipedia:Contents",
"https://en.wikipedia.org/wiki/Wikipedia:Contact_us",
"https://en.wikipedia.org/wiki/V._S._Ramadevi",
"https://en.wikipedia.org/wiki/Wikipedia:About",
"https://en.wikipedia.org/wiki/Help:Introduction",
"https://en.wikipedia.org/wiki/H._D._Deve_Gowda",
"https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg"
]
},
{
"website": "https://ycombinator.com/companies",
"expected_min_num_of_pages": 20,
"expected_crawled_pages": [
"https://www.ycombinator.com/companies/industry/elearning",
"https://www.ycombinator.com/companies/industry/computer-vision",
"https://www.ycombinator.com/companies/industry/health-tech",
"https://www.ycombinator.com/companies/industry/education",
"https://www.ycombinator.com/companies/industry/robotics",
"https://www.ycombinator.com/companies/industry/hardware",
"https://www.ycombinator.com/companies/industry/saas",
"https://www.ycombinator.com/companies/industry/hard-tech",
"https://www.ycombinator.com/companies/industry/developer-tools",
"https://www.ycombinator.com/companies/industry/entertainment",
"https://www.ycombinator.com/companies/industry/finance",
"https://www.ycombinator.com/companies/industry/generative-ai",
"https://www.ycombinator.com/companies/industry/machine-learning"
]
},
{
"website": "https://firecrawl.dev",
"expected_min_num_of_pages": 2,
"expected_crawled_pages": [
"https://firecrawl.dev/",
"https://firecrawl.dev/pricing"
]
},
{
"website": "https://fly.io/docs/gpus/gpu-quickstart",
"expected_min_num_of_pages": 1,
"expected_not_crawled_pages": [
"https://fly.io/docs/getting-started/",
"https://fly.io/docs/hands-on/",
"https://fly.io/docs/about/support/",
"https://fly.io/docs/blueprints/going-to-production-with-healthcare-apps/",
"https://fly.io/docs/machines/flyctl/fly-machine-update/",
"https://fly.io/docs/blueprints/review-apps-guide/",
"https://fly.io/docs/blueprints/supercronic/"
],
"notes": "This one should not go backwards, but it does!"
},
{
"website": "https://www.instructables.com/circuits",
"expected_min_num_of_pages": 12,
"expected_crawled_pages": [
"https://www.instructables.com/circuits/",
"https://www.instructables.com/circuits/apple/projects/",
"https://www.instructables.com/circuits/art/projects/",
"https://www.instructables.com/circuits/electronics/projects/",
"https://www.instructables.com/circuits/microsoft/projects/",
"https://www.instructables.com/circuits/microcontrollers/projects/",
"https://www.instructables.com/circuits/community/",
"https://www.instructables.com/circuits/leds/projects/",
"https://www.instructables.com/circuits/gadgets/projects/",
"https://www.instructables.com/circuits/arduino/projects/",
"https://www.instructables.com/circuits/lasers/projects/",
"https://www.instructables.com/circuits/clocks/projects/"
]
},
{
"website": "https://richmondconfidential.org",
"expected_min_num_of_pages": 20,
"expected_crawled_pages": [
"https://richmondconfidential.org/2009/10/13/salesians-star-guard-has-a-big-impact/",
"https://richmondconfidential.org/2009/10/13/on-team-of-beginners-oilers-old-hand-stands-out/",
"https://richmondconfidential.org/2009/10/19/point-richmond-clockmaker-turns-clutter-into-crafts/",
"https://richmondconfidential.org/2009/10/13/profile-maurice-cathy/",
"https://richmondconfidential.org/2009/10/13/soul-food-rescue-mission-rebuilds-diets-and-lives/",
"https://richmondconfidential.org/2009/10/21/in-tough-economy-pain-trickles-to-the-bottom/",
"https://richmondconfidential.org/2009/10/19/richmond-homicide-map/",
"https://richmondconfidential.org/2009/10/13/rough-roads-for-richmonds-cab-drivers/",
"https://richmondconfidential.org/2009/10/13/before-napa-there-was-winehaven/",
"https://richmondconfidential.org/2009/10/13/family-calls-for-end-to-violence-at-memorial-for-slain-woman-friend/"
]
},
{
"website": "https://www.boardgamegeek.com",
"expected_min_num_of_pages": 15,
"expected_crawled_pages": [
"https://www.boardgamegeek.com/browse/boardgameartist",
"https://www.boardgamegeek.com/browse/boardgamehonor",
"https://www.boardgamegeek.com/browse/boardgamepublisher",
"https://www.boardgamegeek.com/browse/boardgamepodcast",
"https://www.boardgamegeek.com/wiki/page/Index",
"https://www.boardgamegeek.com/browse/boardgamecategory",
"https://www.boardgamegeek.com/boardgame/random",
"https://www.boardgamegeek.com/browse/boardgamemechanic",
"https://www.boardgamegeek.com/forums",
"https://www.boardgamegeek.com/gonecardboard",
"https://www.boardgamegeek.com/browse/boardgameaccessory",
"https://www.boardgamegeek.com/browse/boardgamedesigner",
"https://www.boardgamegeek.com/",
"https://www.boardgamegeek.com/previews",
"https://www.boardgamegeek.com/browse/boardgame"
]
}
]

View File

@ -3,7 +3,9 @@
"version": "1.0.0", "version": "1.0.0",
"description": "", "description": "",
"scripts": { "scripts": {
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false" "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false",
"test:scrape": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/scrape.test.ts",
"test:crawl": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/crawl.test.ts"
}, },
"author": "", "author": "",
"license": "ISC", "license": "ISC",

View File

@ -0,0 +1,150 @@
import request from "supertest";
import dotenv from "dotenv";
import { WebsiteScrapeError } from "../utils/types";
import { logErrors } from "../utils/log";
import websitesData from "../data/crawl.json";
import "dotenv/config";
import fs from 'fs';
dotenv.config();
/**
 * Shape of one entry in the crawl dataset (`../data/crawl.json`).
 *
 * Only `website` and `expected_min_num_of_pages` are present on every entry;
 * the page lists and the free-form note are optional because individual
 * entries specify either pages that must be crawled, pages that must not be
 * crawled, both, or neither.
 */
interface WebsiteData {
  /** Start URL handed to the crawl endpoint. */
  website: string;
  /** Minimum number of pages the crawl has to return to count as a pass. */
  expected_min_num_of_pages: number;
  /** URLs that must all appear in the crawl result. */
  expected_crawled_pages?: string[];
  /** URLs that must not appear in the crawl result. */
  expected_not_crawled_pages?: string[];
  /** Human-readable remark about the entry; not used by the assertions. */
  notes?: string;
}
// Base URL that the crawl and status requests in this suite are sent to (loopback address, port 3002).
const TEST_URL = "http://127.0.0.1:3002";
/**
 * End-to-end crawl checkup.
 *
 * For every entry of the crawl dataset the test starts a crawl job through
 * `POST /v0/crawl`, polls `GET /v0/crawl/status/:jobId` until the job reaches
 * a terminal state, and then checks the returned URLs against the entry's
 * expectations (minimum page count, pages that must be present, pages that
 * must be absent). The share of passing entries is the score; the test fails
 * when the score drops below 90%.
 */
describe("Crawling Checkup (E2E)", () => {
  beforeAll(() => {
    if (!process.env.TEST_API_KEY) {
      throw new Error("TEST_API_KEY is not set");
    }
  });

  describe("Crawling website tests with a dataset", () => {
    it("Should crawl the website and verify the response", async () => {
      // Polling is bounded so that a job which never completes is recorded as
      // a failure for that website instead of hanging the whole run until the
      // jest timeout fires.
      const POLL_INTERVAL_MS = 1000;
      const MAX_POLL_ATTEMPTS = 120;

      let passedTests = 0;
      const startTime = new Date().getTime();
      const date = new Date();
      const logsDir = `logs/${date.getMonth() + 1}-${date.getDate()}-${date.getFullYear()}`;
      const errorLogFileName = `${logsDir}/run.log_${new Date().toTimeString().split(' ')[0]}`;
      const errorLog: WebsiteScrapeError[] = [];

      // Appends one failure entry for the current website and echoes the log.
      const recordFailure = (
        website: string,
        expected_output: string,
        actual_output: string,
        error: string
      ): void => {
        errorLog.push({ website, prompt: 'CRAWL', expected_output, actual_output, error });
        console.log('Error: ', errorLog);
      };

      for (const websiteData of websitesData) {
        try {
          const crawlResponse = await request(TEST_URL)
            .post("/v0/crawl")
            .set("Content-Type", "application/json")
            .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
            .send({
              url: websiteData.website,
              pageOptions: { onlyMainContent: true },
              crawlerOptions: { limit: 100, returnOnlyUrls: true },
            });

          // Without a job id there is nothing to poll; previously this case
          // polled `/status/undefined` forever.
          const jobId = crawlResponse.body.jobId;
          if (!jobId) {
            recordFailure(
              websiteData.website,
              'SUCCESS',
              'FAILURE',
              `Crawl request did not return a job id (HTTP ${crawlResponse.status}).`
            );
            continue;
          }

          // supertest response; left untyped as in the rest of this suite.
          let completedResponse: any;
          for (let attempt = 0; attempt < MAX_POLL_ATTEMPTS; attempt++) {
            completedResponse = await request(TEST_URL)
              .get(`/v0/crawl/status/${jobId}`)
              .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
            const status = completedResponse.body?.status;
            // "failed" is terminal as well: stop polling and let the check
            // below record the failure.
            if (status === "completed" || status === "failed") {
              break;
            }
            await new Promise(resolve => setTimeout(resolve, POLL_INTERVAL_MS)); // Wait before checking again
          }

          if (!completedResponse?.body || completedResponse.body.status !== "completed") {
            recordFailure(
              websiteData.website,
              'SUCCESS',
              'FAILURE',
              `Crawl job did not complete successfully.`
            );
            continue;
          }

          // The crawl is requested with `returnOnlyUrls`, so each data item is
          // read as `{ url }`. Reduce to plain URL strings once, so the checks
          // are set lookups and the failure messages show URLs instead of
          // "[object Object]".
          const crawledPages: { url: string }[] = completedResponse.body.data ?? [];
          const crawledUrls = crawledPages.map(page => page.url);
          const crawledUrlSet = new Set(crawledUrls);
          const crawledUrlList = crawledUrls.join(', ');

          // check how many webpages were crawled successfully
          const expectedMin = websiteData.expected_min_num_of_pages;
          if (crawledUrls.length < expectedMin) {
            recordFailure(
              websiteData.website,
              `SUCCESS: ${expectedMin}`,
              `FAILURE: ${crawledUrls.length}`,
              `Expected at least ${expectedMin} webpages, but got ${crawledUrls.length}`
            );
            continue;
          }

          // every page listed in expected_crawled_pages has to be present
          const expectedPages: string[] = websiteData.expected_crawled_pages ?? [];
          if (expectedPages.some(page => !crawledUrlSet.has(page))) {
            recordFailure(
              websiteData.website,
              `SUCCESS: ${expectedPages}`,
              `FAILURE: ${crawledUrlList}`,
              `Expected crawled pages to contain ${expectedPages}, but got ${crawledUrlList}`
            );
            continue;
          }

          // no page listed in expected_not_crawled_pages may be present
          const forbiddenPages: string[] = websiteData.expected_not_crawled_pages ?? [];
          if (forbiddenPages.some(page => crawledUrlSet.has(page))) {
            recordFailure(
              websiteData.website,
              `SUCCESS: ${forbiddenPages}`,
              `FAILURE: ${crawledUrlList}`,
              `Expected crawled pages to not contain ${forbiddenPages}, but got ${crawledUrlList}`
            );
            continue;
          }

          passedTests++;
        } catch (error) {
          console.error(`Error processing ${websiteData.website}: ${error}`);
          errorLog.push({
            website: websiteData.website,
            prompt: 'CRAWL',
            expected_output: 'SUCCESS',
            actual_output: 'FAILURE',
            error: `Error processing ${websiteData.website}: ${error}`
          });
          continue;
        }
      }

      const score = (passedTests / websitesData.length) * 100;
      const endTime = new Date().getTime();
      const timeTaken = (endTime - startTime) / 1000;
      console.log(`Score: ${score}%`);

      await logErrors(errorLog, timeTaken, 0, score, websitesData.length);

      // Keep a local copy of the failures when running on a developer machine.
      if (process.env.ENV === "local" && errorLog.length > 0) {
        if (!fs.existsSync(logsDir)) {
          fs.mkdirSync(logsDir, { recursive: true });
        }
        fs.writeFileSync(errorLogFileName, JSON.stringify(errorLog, null, 2));
      }

      expect(score).toBeGreaterThanOrEqual(90);
    }, 350000); // 350 seconds timeout
  });
});

View File

@ -1,16 +1,14 @@
import request from "supertest"; import request from "supertest";
import dotenv from "dotenv"; import dotenv from "dotenv";
import Anthropic from "@anthropic-ai/sdk"; import { numTokensFromString } from "../utils/tokens";
import { numTokensFromString } from "./utils/tokens";
import OpenAI from "openai"; import OpenAI from "openai";
import { WebsiteScrapeError } from "./utils/types"; import { WebsiteScrapeError } from "../utils/types";
import { logErrors } from "./utils/log"; import { logErrors } from "../utils/log";
const websitesData = require("./data/websites.json"); import websitesData from "../data/scrape.json";
import "dotenv/config"; import "dotenv/config";
const fs = require('fs'); import fs from 'fs';
dotenv.config(); dotenv.config();
interface WebsiteData { interface WebsiteData {
@ -21,8 +19,7 @@ interface WebsiteData {
const TEST_URL = "http://127.0.0.1:3002"; const TEST_URL = "http://127.0.0.1:3002";
describe("Scraping Checkup (E2E)", () => {
describe("Scraping/Crawling Checkup (E2E)", () => {
beforeAll(() => { beforeAll(() => {
if (!process.env.TEST_API_KEY) { if (!process.env.TEST_API_KEY) {
throw new Error("TEST_API_KEY is not set"); throw new Error("TEST_API_KEY is not set");
@ -72,10 +69,6 @@ describe("Scraping/Crawling Checkup (E2E)", () => {
return null; return null;
} }
const anthropic = new Anthropic({
apiKey: process.env.ANTHROPIC_API_KEY,
});
const openai = new OpenAI({ const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY, apiKey: process.env.OPENAI_API_KEY,
}); });
@ -183,7 +176,7 @@ describe("Scraping/Crawling Checkup (E2E)", () => {
} }
expect(score).toBeGreaterThanOrEqual(75); expect(score).toBeGreaterThanOrEqual(70);
}, 350000); // 150 seconds timeout }, 350000); // 150 seconds timeout
}); });
}); });

View File

@ -39,7 +39,7 @@
// "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */ // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */ // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */ // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
// "resolveJsonModule": true, /* Enable importing .json files. */ "resolveJsonModule": true, /* Enable importing .json files. */
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */ // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */ // "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */

82
docker-compose.yaml Normal file
View File

@ -0,0 +1,82 @@
name: firecrawl
version: '3.9'
services:
playwright-service:
build: apps/playwright-service
environment:
- PORT=3000
- PROXY_SERVER=${PROXY_SERVER}
- PROXY_USERNAME=${PROXY_USERNAME}
- PROXY_PASSWORD=${PROXY_PASSWORD}
- BLOCK_MEDIA=${BLOCK_MEDIA}
networks:
- backend
api:
build: apps/api
environment:
- REDIS_URL=${REDIS_URL:-redis://redis:6379}
- PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
- USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION}
- PORT=${PORT:-3002}
- NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
- OPENAI_API_KEY=${OPENAI_API_KEY}
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
- SERPER_API_KEY=${SERPER_API_KEY}
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
- LOGTAIL_KEY=${LOGTAIL_KEY}
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
- TEST_API_KEY=${TEST_API_KEY}
- POSTHOG_API_KEY=${POSTHOG_API_KEY}
- POSTHOG_HOST=${POSTHOG_HOST}
- SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN}
- SUPABASE_URL=${SUPABASE_URL}
- SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN}
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
- HOST=${HOST:-0.0.0.0}
depends_on:
- redis
- playwright-service
ports:
- "3002:3002"
command: [ "pnpm", "run", "start:production" ]
networks:
- backend
worker:
build: apps/api
environment:
- REDIS_URL=${REDIS_URL:-redis://redis:6379}
- PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
- USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION}
- PORT=${PORT:-3002}
- NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
- OPENAI_API_KEY=${OPENAI_API_KEY}
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
- SERPER_API_KEY=${SERPER_API_KEY}
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
- LOGTAIL_KEY=${LOGTAIL_KEY}
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
- TEST_API_KEY=${TEST_API_KEY}
- POSTHOG_API_KEY=${POSTHOG_API_KEY}
- POSTHOG_HOST=${POSTHOG_HOST}
- SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN}
- SUPABASE_URL=${SUPABASE_URL}
- SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN}
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
- HOST=${HOST:-0.0.0.0}
depends_on:
- redis
- playwright-service
- api
networks:
- backend
redis:
image: redis:alpine
networks:
- backend
command: redis-server --bind 0.0.0.0
networks:
backend:
driver: bridge

View File

@ -0,0 +1,3 @@
{
"extends": "next/core-web-vitals"
}

38
examples/roastmywebsite/.gitignore vendored Normal file
View File

@ -0,0 +1,38 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
# dependencies
/node_modules
/.pnp
.pnp.js
.yarn/install-state.gz
# testing
/coverage
# next.js
/.next/
/out/
# production
/build
# misc
.DS_Store
*.pem
# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*
# local env files
.env*.local
# vercel
.vercel
# typescript
*.tsbuildinfo
next-env.d.ts
.env
node_modules

View File

@ -0,0 +1,5 @@
# Roast My Website 🔥
Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them.
Check it out at roastmywebsite.ai 😈

View File

@ -0,0 +1,17 @@
{
"$schema": "https://ui.shadcn.com/schema.json",
"style": "default",
"rsc": true,
"tsx": true,
"tailwind": {
"config": "tailwind.config.ts",
"css": "src/app/globals.css",
"baseColor": "zinc",
"cssVariables": false,
"prefix": ""
},
"aliases": {
"components": "@/components",
"utils": "@/lib/utils"
}
}

View File

@ -0,0 +1,11 @@
// Next.js configuration for this example app.
// The `env` entry copies the G1–G4 variables from the environment of the Node process into the config.
// NOTE(review): what G1–G4 contain is not visible in this file — confirm against the code that reads them.
/** @type {import('next').NextConfig} */
const nextConfig = {
env: {
G1: process.env.G1,
G2: process.env.G2,
G3: process.env.G3,
G4: process.env.G4,
},
};
export default nextConfig;

6617
examples/roastmywebsite/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,53 @@
{
"name": "roastmywebsite",
"version": "0.1.0",
"private": true,
"scripts": {
"dev": "next dev",
"build": "next build",
"start": "next start",
"lint": "next lint"
},
"dependencies": {
"@dqbd/tiktoken": "^1.0.15",
"@headlessui/react": "^2.0.4",
"@headlessui/tailwindcss": "^0.2.0",
"@mendable/firecrawl-js": "^0.0.21",
"@radix-ui/react-dialog": "^1.0.5",
"@radix-ui/react-dropdown-menu": "^2.0.6",
"@radix-ui/react-select": "^2.0.0",
"@radix-ui/react-slot": "^1.0.2",
"@radix-ui/react-switch": "^1.0.3",
"@remixicon/react": "^4.2.0",
"@tremor/react": "^3.17.2",
"@vercel/analytics": "^1.3.1",
"axios": "^1.7.2",
"class-variance-authority": "^0.7.0",
"clsx": "^2.1.1",
"cubic-spline": "^3.0.3",
"html2canvas": "^1.4.1",
"image-size": "^1.1.1",
"lucide": "^0.379.0",
"lucide-react": "^0.379.0",
"next": "14.2.3",
"next-themes": "^0.3.0",
"openai": "^4.47.3",
"react": "^18",
"react-dom": "^18",
"sonner": "^1.4.41",
"tailwind-merge": "^2.3.0",
"tailwindcss-animate": "^1.0.7",
"tiktoken": "^1.0.15"
},
"devDependencies": {
"@tailwindcss/forms": "^0.5.7",
"@types/node": "^20",
"@types/react": "^18",
"@types/react-dom": "^18",
"eslint": "^8",
"eslint-config-next": "14.2.3",
"postcss": "^8",
"tailwindcss": "^3.4.3",
"typescript": "^5"
}
}

View File

@ -0,0 +1,8 @@
// PostCSS configuration: registers the `tailwindcss` plugin with an empty options object and no other plugins.
/** @type {import('postcss-load-config').Config} */
const config = {
plugins: {
tailwindcss: {},
},
};
export default config;

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 444 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 492 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 997 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

View File

@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 394 80"><path fill="#000" d="M262 0h68.5v12.7h-27.2v66.6h-13.6V12.7H262V0ZM149 0v12.7H94v20.4h44.3v12.6H94v21h55v12.6H80.5V0h68.7zm34.3 0h-17.8l63.8 79.4h17.9l-32-39.7 32-39.6h-17.9l-23 28.6-23-28.6zm18.3 56.7-9-11-27.1 33.7h17.8l18.3-22.7z"/><path fill="#000" d="M81 79.3 17 0H0v79.3h13.6V17l50.2 62.3H81Zm252.6-.4c-1 0-1.8-.4-2.5-1s-1.1-1.6-1.1-2.6.3-1.8 1-2.5 1.6-1 2.6-1 1.8.3 2.5 1a3.4 3.4 0 0 1 .6 4.3 3.7 3.7 0 0 1-3 1.8zm23.2-33.5h6v23.3c0 2.1-.4 4-1.3 5.5a9.1 9.1 0 0 1-3.8 3.5c-1.6.8-3.5 1.3-5.7 1.3-2 0-3.7-.4-5.3-1s-2.8-1.8-3.7-3.2c-.9-1.3-1.4-3-1.4-5h6c.1.8.3 1.6.7 2.2s1 1.2 1.6 1.5c.7.4 1.5.5 2.4.5 1 0 1.8-.2 2.4-.6a4 4 0 0 0 1.6-1.8c.3-.8.5-1.8.5-3V45.5zm30.9 9.1a4.4 4.4 0 0 0-2-3.3 7.5 7.5 0 0 0-4.3-1.1c-1.3 0-2.4.2-3.3.5-.9.4-1.6 1-2 1.6a3.5 3.5 0 0 0-.3 4c.3.5.7.9 1.3 1.2l1.8 1 2 .5 3.2.8c1.3.3 2.5.7 3.7 1.2a13 13 0 0 1 3.2 1.8 8.1 8.1 0 0 1 3 6.5c0 2-.5 3.7-1.5 5.1a10 10 0 0 1-4.4 3.5c-1.8.8-4.1 1.2-6.8 1.2-2.6 0-4.9-.4-6.8-1.2-2-.8-3.4-2-4.5-3.5a10 10 0 0 1-1.7-5.6h6a5 5 0 0 0 3.5 4.6c1 .4 2.2.6 3.4.6 1.3 0 2.5-.2 3.5-.6 1-.4 1.8-1 2.4-1.7a4 4 0 0 0 .8-2.4c0-.9-.2-1.6-.7-2.2a11 11 0 0 0-2.1-1.4l-3.2-1-3.8-1c-2.8-.7-5-1.7-6.6-3.2a7.2 7.2 0 0 1-2.4-5.7 8 8 0 0 1 1.7-5 10 10 0 0 1 4.3-3.5c2-.8 4-1.2 6.4-1.2 2.3 0 4.4.4 6.2 1.2 1.8.8 3.2 2 4.3 3.4 1 1.4 1.5 3 1.5 5h-5.8z"/></svg>

After

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 262 KiB

View File

@ -0,0 +1 @@
{"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"}

View File

@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 283 64"><path fill="black" d="M141 16c-11 0-19 7-19 18s9 18 20 18c7 0 13-3 16-7l-7-5c-2 3-6 4-9 4-5 0-9-3-10-7h28v-3c0-11-8-18-19-18zm-9 15c1-4 4-7 9-7s8 3 9 7h-18zm117-15c-11 0-19 7-19 18s9 18 20 18c6 0 12-3 16-7l-8-5c-2 3-5 4-8 4-5 0-9-3-11-7h28l1-3c0-11-8-18-19-18zm-10 15c2-4 5-7 10-7s8 3 9 7h-19zm-39 3c0 6 4 10 10 10 4 0 7-2 9-5l8 5c-3 5-9 8-17 8-11 0-19-7-19-18s8-18 19-18c8 0 14 3 17 8l-8 5c-2-3-5-5-9-5-6 0-10 4-10 10zm83-29v46h-9V5h9zM37 0l37 64H0L37 0zm92 5-27 48L74 5h10l18 30 17-30h10zm59 12v10l-3-1c-6 0-10 4-10 10v15h-9V17h9v9c0-5 6-9 13-9z"/></svg>

After

Width:  |  Height:  |  Size: 629 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

View File

@ -0,0 +1,10 @@
@tailwind base;
@tailwind components;
@tailwind utilities;
/* Forces a fixed gray fill (rgb 113 113 122) on elements carrying this class; `!important` is set so this
   declaration takes precedence over other fill declarations for the same element.
   NOTE(review): the class name looks like one emitted by the Tremor chart library — confirm which component uses it. */
.fill-tremor-content-emphasis {
fill: rgb(113 113 122) !important;
}

View File

@ -0,0 +1,5 @@
/**
 * Fetches the repository record for mendableai/firecrawl from the GitHub REST
 * API and resolves with its `stargazers_count` field.
 *
 * Note: despite the `use` prefix this is a plain async function; it calls no
 * React hooks. The HTTP status is not inspected, so a response body without
 * `stargazers_count` resolves to `undefined`.
 */
export async function useGithubStars() {
  const response = await fetch("https://api.github.com/repos/mendableai/firecrawl");
  const { stargazers_count } = await response.json();
  return stargazers_count;
}

View File

@ -0,0 +1,68 @@
import type { Metadata } from "next";
import { Gloria_Hallelujah } from "next/font/google";
import "./globals.css";
import { Toaster } from "sonner";
import { Analytics } from "@vercel/analytics/react";
import { useEffect, useState } from "react";
import Head from "next/head";
// Font used for the whole document: Gloria Hallelujah, weight 400, latin subset, loaded via next/font/google.
// NOTE(review): the binding is still named `inter`, apparently left over from the commented-out Inter font below.
const inter = Gloria_Hallelujah({ weight: "400", subsets: ["latin"] });
// const inter = Inter({ subsets: ["latin"] });
// Static site information (title, description, share image, robots directive, favicon, canonical URL).
// generateMetadata() in this file reads these values to build the Next.js metadata object.
const meta = {
title: "Roast My Website",
description:
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 😈",
cardImage: "/og.png",
robots: "follow, index",
favicon: "/favicon.ico",
url: "https://www.roastmywebsite.ai/",
};
/**
 * Builds the metadata object for the app from the module-level `meta`
 * constants: general fields (title, description, keywords, authorship,
 * robots, icon, base URL) plus Open Graph and Twitter card sections that
 * reuse the same title, description and share image.
 *
 * @returns the assembled `Metadata` object.
 */
export async function generateMetadata(): Promise<Metadata> {
  const { title, description, cardImage, robots, favicon, url } = meta;
  const brand = "Roast My Website";
  const twitterHandle = "@Vercel";

  // Open Graph section (link previews).
  const openGraph: Metadata["openGraph"] = {
    url,
    title,
    description,
    images: [cardImage],
    type: "website",
    siteName: title,
  };

  // Twitter card section.
  const twitter: Metadata["twitter"] = {
    card: "summary_large_image",
    site: twitterHandle,
    creator: twitterHandle,
    title,
    description,
    images: [cardImage],
  };

  return {
    title,
    description,
    referrer: "origin-when-cross-origin",
    keywords: ["Roast My Website", "Roast", "Website", "GitHub", "Firecrawl"],
    authors: [{ name: brand, url: "https://www.roastmywebsite.ai/" }],
    creator: brand,
    publisher: brand,
    robots,
    icons: { icon: favicon },
    metadataBase: new URL(url),
    openGraph,
    twitter,
  };
}
/**
 * Root layout of the app: renders the `<html>`/`<body>` shell, applies the
 * site font class to `<body>`, and mounts the page content together with the
 * Vercel Analytics component and the sonner toast container.
 *
 * `<Analytics />` and `<Toaster />` are placed inside `<body>`: `<html>` may
 * only contain `<head>` and `<body>`, so rendering them as direct children of
 * `<html>` produces an invalid document.
 *
 * @param children - the page (or nested layout) to render inside the body.
 */
export default function RootLayout({
  children,
}: Readonly<{
  children: React.ReactNode;
}>) {
  return (
    <html lang="en">
      <body className={inter.className}>
        {children}
        <Analytics />
        <Toaster />
      </body>
    </html>
  );
}

Some files were not shown because too many files have changed in this diff Show More