Merge branch 'main' into nsc/improvemnts-fixes-misc
35
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
@ -0,0 +1,35 @@
---
name: Bug report
about: Create a report to help us improve
title: "[BUG]"
labels: bug
assignees: ''

---

**Describe the Bug**
Provide a clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the issue:
1. Configure the environment or settings with '...'
2. Run the command '...'
3. Observe the error or unexpected output at '...'
4. Log output/error message

**Expected Behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots or copies of the command line output to help explain the issue.

**Environment (please complete the following information):**
- OS: [e.g. macOS, Linux, Windows]
- Firecrawl Version: [e.g. 1.2.3]
- Node.js Version: [e.g. 14.x]

**Logs**
If applicable, include detailed logs to help understand the problem.

**Additional Context**
Add any other context about the problem here, such as configuration specifics, network conditions, data volumes, etc.
26
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
@ -0,0 +1,26 @@
---
name: Feature request
about: Suggest an idea for this project
title: "[Feat]"
labels: ''
assignees: ''

---

**Problem Description**
Describe the issue you're experiencing that has prompted this feature request. For example, "I find it difficult when..."

**Proposed Feature**
Provide a clear and concise description of the feature you would like implemented.

**Alternatives Considered**
Discuss any alternative solutions or features you've considered. Why were these alternatives not suitable?

**Implementation Suggestions**
If you have ideas on how the feature could be implemented, share them here. This could include technical details, API changes, or interaction mechanisms.

**Use Case**
Explain how this feature would be used and what benefits it would bring. Include specific examples to illustrate how this would improve functionality or user experience.

**Additional Context**
Add any other context such as comparisons with similar features in other products, or links to prototypes or mockups.
3
.github/workflows/ci.yml
vendored
@ -25,6 +25,9 @@ env:
  SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
  SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
  TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
  HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
  HDX_NODE_BETA_MODE: 1


jobs:
  pre-deploy:
19
.github/workflows/fly.yml
vendored
@ -94,6 +94,25 @@ jobs:
        run: |
          npm run test
        working-directory: ./apps/test-suite
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
        working-directory: ./apps/python-sdk
      - name: Run E2E tests for Python SDK
        run: |
          pytest firecrawl/__tests__/e2e_withAuth/test.py
        working-directory: ./apps/python-sdk
      - name: Install dependencies for JavaScript SDK
        run: pnpm install
        working-directory: ./apps/js-sdk/firecrawl
      - name: Run E2E tests for JavaScript SDK
        run: npm run test
        working-directory: ./apps/js-sdk/firecrawl

  deploy:
    name: Deploy app
60
.github/workflows/js-sdk.yml
vendored
Normal file
@ -0,0 +1,60 @@
name: Run JavaScript SDK E2E Tests

on:
  pull_request:
    branches:
      - main
env:
  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
  FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  HOST: ${{ secrets.HOST }}
  LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
  LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
  POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
  POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
  NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
  PORT: ${{ secrets.PORT }}
  REDIS_URL: ${{ secrets.REDIS_URL }}
  SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
  SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
  SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
  SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
  TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
  HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
  HDX_NODE_BETA_MODE: 1

jobs:
  build:
    runs-on: ubuntu-latest
    services:
      redis:
        image: redis
        ports:
          - 6379:6379

    steps:
      - uses: actions/checkout@v3
      - name: Set up Node.js
        uses: actions/setup-node@v3
        with:
          node-version: "20"
      - name: Install pnpm
        run: npm install -g pnpm
      - name: Install dependencies for API
        run: pnpm install
        working-directory: ./apps/api
      - name: Start the application
        run: npm start &
        working-directory: ./apps/api
      - name: Start workers
        run: npm run workers &
        working-directory: ./apps/api
      - name: Install dependencies for JavaScript SDK
        run: pnpm install
        working-directory: ./apps/js-sdk/firecrawl
      - name: Run E2E tests for JavaScript SDK
        run: npm run test
        working-directory: ./apps/js-sdk/firecrawl
72
.github/workflows/python-sdk.yml
vendored
Normal file
@ -0,0 +1,72 @@
name: Run Python SDK E2E Tests

on:
  pull_request:
    branches:
      - main
env:
  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
  FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  HOST: ${{ secrets.HOST }}
  LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
  LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
  POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
  POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
  NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
  PORT: ${{ secrets.PORT }}
  REDIS_URL: ${{ secrets.REDIS_URL }}
  SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
  SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
  SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
  SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
  TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
  HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
  HDX_NODE_BETA_MODE: 1

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10"]
    services:
      redis:
        image: redis
        ports:
          - 6379:6379

    steps:
      - uses: actions/checkout@v3
      - name: Set up Node.js
        uses: actions/setup-node@v3
        with:
          node-version: "20"
      - name: Install pnpm
        run: npm install -g pnpm
      - name: Install dependencies for API
        run: pnpm install
        working-directory: ./apps/api
      - name: Start the application
        run: npm start &
        working-directory: ./apps/api
        id: start_app
      - name: Start workers
        run: npm run workers &
        working-directory: ./apps/api
        id: start_workers
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
        working-directory: ./apps/python-sdk
      - name: Run E2E tests for Python SDK
        run: |
          pytest firecrawl/__tests__/e2e_withAuth/test.py
        working-directory: ./apps/python-sdk
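The workflow above reduces to a handful of shell commands, so a rough local equivalent looks like the following sketch (it assumes the same repository layout, a reachable Redis instance, and that the variables from the `env:` block are already exported):

```bash
# Approximate local run of the Python SDK E2E workflow above.
cd apps/api
pnpm install
npm start &            # start the API
npm run workers &      # start the background workers
cd ../python-sdk
pip install -r requirements.txt
pytest firecrawl/__tests__/e2e_withAuth/test.py
```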
@ -39,7 +39,7 @@ SUPABASE_SERVICE_TOKEN=
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
SCRAPING_BEE_API_KEY= # set if you'd like to use Scraping Bee to handle JS blocking
OPENAI_API_KEY= # add for LLM-dependent features (image alt generation, etc.)
BULL_AUTH_KEY= #
BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= # set if you have a LlamaParse key you'd like to use to parse PDFs
@ -6,7 +6,7 @@ _This repository is in its early development stages. We are still merging custom

## What is Firecrawl?

[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown. We crawl all accessible subpages and give you clean markdown for each. No sitemap required.
[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required.

_Pst. hey, you, join our stargazers :)_

@ -114,7 +114,7 @@ Response:

### Search (Beta)

Used to search the web, get the most relevant results, scrap each page and return the markdown.
Used to search the web, get the most relevant results, scrape each page and return the markdown.

```bash
curl -X POST https://api.firecrawl.dev/v0/search \
@ -296,7 +296,6 @@ npm install @mendable/firecrawl-js
1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.


### Scraping a URL

To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
@ -403,7 +402,6 @@ const searchResults = await app.search(query, {

```


## Contributing

We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.
27
SELF_HOST.md
@ -1,6 +1,31 @@
# Self-hosting Firecrawl
*We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version.*

Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally.

*This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not ready for full self-host yet - we're working on it*
## Getting Started

First, clone this repository and copy the example env file from the api folder (`.env.example`) to `.env`.
```bash
git clone https://github.com/mendableai/firecrawl.git
cd firecrawl
cp ./apps/api/.env.example ./.env
```

To run the simplest version of FireCrawl, edit `USE_DB_AUTHENTICATION` in `.env` so that database authentication is disabled.
```yml
USE_DB_AUTHENTICATION=false
```

Update the Redis URL in the .env file to align with the Docker configuration:
```yml
REDIS_URL=redis://redis:6379
```

Once that's complete, you can simply run the following commands to get started:
```bash
docker compose up
```


This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`.
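As a quick smoke test of the local instance, a single scrape request can be sent to it. This is only a sketch that reuses the `/v0/scrape` endpoint exercised elsewhere in this diff; the bearer token is a placeholder, and with `USE_DB_AUTHENTICATION=false` a real API key should not be needed:

```bash
curl -X POST http://localhost:3002/v0/scrape \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer test' \
  -d '{"url": "https://firecrawl.dev"}'
```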
@ -3,6 +3,7 @@ NUM_WORKERS_PER_QUEUE=8
PORT=3002
HOST=0.0.0.0
REDIS_URL=redis://localhost:6379
PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000

## To turn on DB authentication, you need to set up supabase.
USE_DB_AUTHENTICATION=true
@ -16,14 +17,36 @@ SUPABASE_SERVICE_TOKEN=

# Other Optionals
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
RATE_LIMIT_TEST_API_KEY_SCRAPE= # set if you'd like to test the scraping rate limit
RATE_LIMIT_TEST_API_KEY_CRAWL= # set if you'd like to test the crawling rate limit
SCRAPING_BEE_API_KEY= # set if you'd like to use Scraping Bee to handle JS blocking
OPENAI_API_KEY= # add for LLM-dependent features (image alt generation, etc.)
BULL_AUTH_KEY= #
BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= # set if you have a LlamaParse key you'd like to use to parse PDFs
SERPER_API_KEY= # set if you have a Serper key you'd like to use as a search API
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs

STRIPE_PRICE_ID_STANDARD=
STRIPE_PRICE_ID_SCALE=
STRIPE_PRICE_ID_STARTER=
STRIPE_PRICE_ID_HOBBY=
STRIPE_PRICE_ID_HOBBY_YEARLY=
STRIPE_PRICE_ID_STANDARD_NEW=
STRIPE_PRICE_ID_STANDARD_NEW_YEARLY=
STRIPE_PRICE_ID_GROWTH=
STRIPE_PRICE_ID_GROWTH_YEARLY=

HYPERDX_API_KEY=
HDX_NODE_BETA_MODE=1

FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta

# Proxy Settings for Playwright (alternatively, you can use a proxy service like Oxylabs, which rotates IPs for you on every request)
PROXY_SERVER=
PROXY_USERNAME=
PROXY_PASSWORD=
# set if you'd like to block media requests to save proxy bandwidth
BLOCK_MEDIA=
@ -27,6 +27,13 @@ kill_timeout = '5s'
    hard_limit = 200
    soft_limit = 100

  [[http_service.checks]]
    grace_period = "10s"
    interval = "30s"
    method = "GET"
    timeout = "5s"
    path = "/"

[[services]]
  protocol = 'tcp'
  internal_port = 8080
@ -18,8 +18,8 @@
|
||||
"paths": {
|
||||
"/scrape": {
|
||||
"post": {
|
||||
"summary": "Scrape a single URL",
|
||||
"operationId": "scrapeSingleUrl",
|
||||
"summary": "Scrape a single URL and optionally extract information using an LLM",
|
||||
"operationId": "scrapeAndExtractFromUrl",
|
||||
"tags": ["Scraping"],
|
||||
"security": [
|
||||
{
|
||||
@ -45,8 +45,48 @@
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
"description": "Include the raw HTML content of the page. Will output an html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
"description": "Wait the specified number of milliseconds for the page to load before fetching content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"extractorOptions": {
|
||||
"type": "object",
|
||||
"description": "Options for LLM-based extraction of structured information from the page content",
|
||||
"properties": {
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["llm-extraction"],
|
||||
"description": "The extraction mode to use, currently supports 'llm-extraction'"
|
||||
},
|
||||
"extractionPrompt": {
|
||||
"type": "string",
|
||||
"description": "A prompt describing what information to extract from the page"
|
||||
},
|
||||
"extractionSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"description": "The schema for the data to be extracted",
|
||||
"required": [
|
||||
"company_mission",
|
||||
"supports_sso",
|
||||
"is_open_source"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"timeout": {
|
||||
"type": "integer",
|
||||
"description": "Timeout in milliseconds for the request",
|
||||
"default": 30000
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
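Putting the request-body schema above together, a scrape request that uses LLM extraction could look roughly like this (a sketch: the API key is a placeholder, and the schema properties simply mirror the required fields listed above):

```bash
curl -X POST https://api.firecrawl.dev/v0/scrape \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -d '{
    "url": "https://firecrawl.dev",
    "extractorOptions": {
      "mode": "llm-extraction",
      "extractionPrompt": "Extract the company mission and whether the product supports SSO and is open source.",
      "extractionSchema": {
        "type": "object",
        "properties": {
          "company_mission": {"type": "string"},
          "supports_sso": {"type": "boolean"},
          "is_open_source": {"type": "boolean"}
        },
        "required": ["company_mission", "supports_sso", "is_open_source"]
      }
    }
  }'
```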
@ -126,6 +166,16 @@
|
||||
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
|
||||
"default": false
|
||||
},
|
||||
"maxDepth": {
|
||||
"type": "integer",
|
||||
"description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on."
|
||||
},
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["default", "fast"],
|
"description": "The crawling mode to use. Fast mode crawls websites without a sitemap up to 4x faster, but may be less accurate and shouldn't be used on heavily JS-rendered websites.",
|
||||
"default": "default"
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of pages to crawl",
|
||||
@ -140,6 +190,11 @@
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
"description": "Include the raw HTML content of the page. Will output an html key in the response.",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
}
|
||||
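For reference, a crawl request combining the options documented above (`maxDepth`, the `fast` mode, `limit`, and the `includeHtml` page option) might be issued like this (a sketch with placeholder values):

```bash
curl -X POST https://api.firecrawl.dev/v0/crawl \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -d '{
    "url": "https://mendable.ai",
    "crawlerOptions": {"maxDepth": 2, "mode": "fast", "limit": 10},
    "pageOptions": {"includeHtml": true}
  }'
```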
@ -192,7 +247,7 @@
|
||||
"query": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The URL to scrape"
|
||||
"description": "The query to search for"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
@ -206,6 +261,11 @@
|
||||
"type": "boolean",
|
||||
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
|
||||
"default": true
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
"description": "Include the raw HTML content of the page. Will output an html key in the response.",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
},
|
||||
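A corresponding search request would then look something like the sketch below; the key is a placeholder, and the `fetchPageContent` property name is assumed from the description above rather than spelled out in this excerpt:

```bash
curl -X POST https://api.firecrawl.dev/v0/search \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -d '{
    "query": "firecrawl web scraping",
    "pageOptions": {"fetchPageContent": true, "includeHtml": false}
  }'
```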
@ -299,9 +359,66 @@
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/ScrapeResponse"
|
||||
"$ref": "#/components/schemas/CrawlStatusResponseObj"
|
||||
},
|
||||
"description": "Data returned from the job (null when it is in progress)"
|
||||
},
|
||||
"partial_data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/CrawlStatusResponseObj"
|
||||
},
|
"description": "Partial documents returned as the crawl progresses (streaming). When a page is ready, it is appended to the partial_data array, so there is no need to wait for the entire website to be crawled."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required"
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests"
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl/cancel/{jobId}": {
|
||||
"delete": {
|
||||
"tags": ["Crawl"],
|
||||
"summary": "Cancel a crawl job",
|
||||
"operationId": "cancelCrawlJob",
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "jobId",
|
||||
"in": "path",
|
||||
"description": "ID of the crawl job",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": {
|
||||
"type": "string",
|
||||
"description": "Returns cancelled."
|
||||
}
|
||||
}
|
||||
}
|
||||
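The two job-management endpoints pair naturally: one polls a crawl's status (including `partial_data` while it runs) and the other cancels it. A minimal sketch, with `JOB_ID` and the key as placeholders:

```bash
# Check the status of a crawl job
curl -X GET https://api.firecrawl.dev/v0/crawl/status/JOB_ID \
  -H 'Authorization: Bearer YOUR_API_KEY'

# Cancel the same job; the response should report {"status": "cancelled"}
curl -X DELETE https://api.firecrawl.dev/v0/crawl/cancel/JOB_ID \
  -H 'Authorization: Bearer YOUR_API_KEY'
```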
@ -344,6 +461,11 @@
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeHtml` is true"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
@ -362,6 +484,51 @@
|
||||
"format": "uri"
|
||||
}
|
||||
}
|
||||
},
|
||||
"llm_extraction": {
|
||||
"type": "object",
|
||||
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
|
||||
"nullable": true
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"CrawlStatusResponseObj": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeHtml` is true"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -33,6 +33,7 @@
    "express": "^4.18.2",
    "jest": "^29.6.3",
    "jest-fetch-mock": "^3.0.3",
    "mammoth": "^1.7.2",
    "nodemon": "^2.0.20",
    "supabase": "^1.77.9",
    "supertest": "^6.3.3",
@ -47,6 +48,7 @@
    "@bull-board/express": "^5.8.0",
    "@devil7softwares/pos": "^1.0.2",
    "@dqbd/tiktoken": "^1.0.13",
    "@hyperdx/node-opentelemetry": "^0.7.0",
    "@logtail/node": "^0.4.12",
    "@nangohq/node": "^0.36.33",
    "@sentry/node": "^7.48.0",
@ -1,6 +1,7 @@
|
||||
import request from "supertest";
|
||||
import { app } from "../../index";
|
||||
import dotenv from "dotenv";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
@ -67,7 +68,7 @@ describe("E2E Tests for API Routes", () => {
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(200);
|
||||
}, 10000); // 10 seconds timeout
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
it("should return a successful response with a valid API key", async () => {
|
||||
const response = await request(TEST_URL)
|
||||
@ -81,7 +82,7 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(response.body.data).toHaveProperty("markdown");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data).not.toHaveProperty("html");
|
||||
expect(response.body.data.content).toContain("🔥 FireCrawl");
|
||||
expect(response.body.data.content).toContain("🔥 Firecrawl");
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
it("should return a successful response with a valid API key and includeHtml set to true", async () => {
|
||||
@ -99,10 +100,61 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(response.body.data).toHaveProperty("markdown");
|
||||
expect(response.body.data).toHaveProperty("html");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.content).toContain("🔥 FireCrawl");
|
||||
expect(response.body.data.markdown).toContain("🔥 FireCrawl");
|
||||
expect(response.body.data.content).toContain("🔥 Firecrawl");
|
||||
expect(response.body.data.markdown).toContain("🔥 Firecrawl");
|
||||
expect(response.body.data.html).toContain("<h1");
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
it('should return a successful response for a valid scrape with PDF file', async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' });
|
||||
await new Promise((r) => setTimeout(r, 6000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
it('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' });
|
||||
await new Promise((r) => setTimeout(r, 6000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
}, 60000); // 60 seconds
|
||||
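The two PDF tests above hit the same endpoint a user would; the equivalent manual request is simply a scrape call pointed at a PDF URL (a sketch; the key is a placeholder):

```bash
curl -X POST https://api.firecrawl.dev/v0/scrape \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -d '{"url": "https://arxiv.org/pdf/astro-ph/9301001.pdf"}'
```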
|
||||
// TODO: add this test back once we nail the waitFor option to be more deterministic
|
||||
// it("should return a successful response with a valid API key and waitFor option", async () => {
|
||||
// const startTime = Date.now();
|
||||
// const response = await request(TEST_URL)
|
||||
// .post("/v0/scrape")
|
||||
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
// .set("Content-Type", "application/json")
|
||||
// .send({ url: "https://firecrawl.dev", pageOptions: { waitFor: 7000 } });
|
||||
// const endTime = Date.now();
|
||||
// const duration = endTime - startTime;
|
||||
|
||||
// expect(response.statusCode).toBe(200);
|
||||
// expect(response.body).toHaveProperty("data");
|
||||
// expect(response.body.data).toHaveProperty("content");
|
||||
// expect(response.body.data).toHaveProperty("markdown");
|
||||
// expect(response.body.data).toHaveProperty("metadata");
|
||||
// expect(response.body.data).not.toHaveProperty("html");
|
||||
// expect(response.body.data.content).toContain("🔥 Firecrawl");
|
||||
// expect(duration).toBeGreaterThanOrEqual(7000);
|
||||
// }, 12000); // 12 seconds timeout
|
||||
});
|
||||
|
||||
describe("POST /v0/crawl", () => {
|
||||
@ -145,8 +197,299 @@ describe("E2E Tests for API Routes", () => {
|
||||
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
|
||||
);
|
||||
});
|
||||
it('should prevent duplicate requests using the same idempotency key', async () => {
|
||||
const uniqueIdempotencyKey = uuidv4();
|
||||
|
||||
// Additional tests for insufficient credits?
|
||||
// First request with the idempotency key
|
||||
const firstResponse = await request(TEST_URL)
|
||||
.post('/v0/crawl')
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.set("x-idempotency-key", uniqueIdempotencyKey)
|
||||
.send({ url: 'https://mendable.ai' });
|
||||
|
||||
expect(firstResponse.statusCode).toBe(200);
|
||||
|
||||
// Second request with the same idempotency key
|
||||
const secondResponse = await request(TEST_URL)
|
||||
.post('/v0/crawl')
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.set("x-idempotency-key", uniqueIdempotencyKey)
|
||||
.send({ url: 'https://mendable.ai' });
|
||||
|
||||
expect(secondResponse.statusCode).toBe(409);
|
||||
expect(secondResponse.body.error).toBe('Idempotency key already used');
|
||||
});
|
||||
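The idempotency behaviour tested above can be reproduced from a shell: sending the same `x-idempotency-key` twice should produce a 200 followed by a 409. A sketch with placeholder values (`uuidgen` is just one way to produce a unique key):

```bash
KEY=$(uuidgen)
for i in 1 2; do
  curl -s -o /dev/null -w "request $i: %{http_code}\n" \
    -X POST https://api.firecrawl.dev/v0/crawl \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer YOUR_API_KEY' \
    -H "x-idempotency-key: $KEY" \
    -d '{"url": "https://mendable.ai"}'
done
# Expected: 200 for the first request, 409 ("Idempotency key already used") for the second
```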
|
||||
it("should return a successful response with a valid API key and valid includes option", async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://mendable.ai",
|
||||
limit: 10,
|
||||
crawlerOptions: {
|
||||
includes: ["blog/*"],
|
||||
},
|
||||
});
|
||||
|
||||
let response;
|
||||
let isFinished = false;
|
||||
|
||||
while (!isFinished) {
|
||||
response = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
isFinished = response.body.status === "completed";
|
||||
|
||||
if (!isFinished) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
const completedResponse = response;
|
||||
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
expect(urls.length).toBeGreaterThan(5);
|
||||
urls.forEach((url: string) => {
|
||||
console.log({url})
|
||||
expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy();
|
||||
});
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
it("should return a successful response with a valid API key and valid excludes option", async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://mendable.ai",
|
||||
limit: 10,
|
||||
crawlerOptions: {
|
||||
excludes: ["blog/*"],
|
||||
},
|
||||
});
|
||||
|
||||
let isFinished = false;
|
||||
let response;
|
||||
|
||||
while (!isFinished) {
|
||||
response = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
isFinished = response.body.status === "completed";
|
||||
|
||||
if (!isFinished) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
const completedResponse = response;
|
||||
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
expect(urls.length).toBeGreaterThan(5);
|
||||
urls.forEach((url: string) => {
|
||||
expect(url.startsWith("https://www.mendable.ai/blog/")).toBeFalsy();
|
||||
});
|
||||
}, 90000); // 90 seconds
|
||||
|
||||
it("should return a successful response with a valid API key and limit to 3", async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://mendable.ai",
|
||||
crawlerOptions: { limit: 3 },
|
||||
});
|
||||
|
||||
let isFinished = false;
|
||||
let response;
|
||||
|
||||
while (!isFinished) {
|
||||
response = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
isFinished = response.body.status === "completed";
|
||||
|
||||
if (!isFinished) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
const completedResponse = response;
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data.length).toBe(3);
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
it("should return a successful response with max depth option for a valid crawl job", async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://www.scrapethissite.com",
|
||||
crawlerOptions: { maxDepth: 2 },
|
||||
});
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
expect(response.body.status).toBe("active");
|
||||
// wait for 60 seconds
|
||||
await new Promise((r) => setTimeout(r, 60000));
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
expect(urls.length).toBeGreaterThan(1);
|
||||
|
||||
// Check if all URLs have a maximum depth of 1
|
||||
urls.forEach((url: string) => {
|
||||
const depth = new URL(url).pathname.split("/").filter(Boolean).length;
|
||||
expect(depth).toBeLessThanOrEqual(1);
|
||||
});
|
||||
}, 120000);
|
||||
|
||||
// it("should return a successful response with a valid API key and valid limit option", async () => {
|
||||
// const crawlResponse = await request(TEST_URL)
|
||||
// .post("/v0/crawl")
|
||||
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
// .set("Content-Type", "application/json")
|
||||
// .send({
|
||||
// url: "https://mendable.ai",
|
||||
// crawlerOptions: { limit: 10 },
|
||||
// });
|
||||
|
||||
// const response = await request(TEST_URL)
|
||||
// .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
// expect(response.statusCode).toBe(200);
|
||||
// expect(response.body).toHaveProperty("status");
|
||||
// expect(response.body.status).toBe("active");
|
||||
|
||||
// let isCompleted = false;
|
||||
// while (!isCompleted) {
|
||||
// const statusCheckResponse = await request(TEST_URL)
|
||||
// .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
// expect(statusCheckResponse.statusCode).toBe(200);
|
||||
// isCompleted = statusCheckResponse.body.status === "completed";
|
||||
// if (!isCompleted) {
|
||||
// await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
// }
|
||||
// }
|
||||
|
||||
// const completedResponse = await request(TEST_URL)
|
||||
// .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
// expect(completedResponse.statusCode).toBe(200);
|
||||
// expect(completedResponse.body).toHaveProperty("status");
|
||||
// expect(completedResponse.body.status).toBe("completed");
|
||||
// expect(completedResponse.body).toHaveProperty("data");
|
||||
// expect(completedResponse.body.data.length).toBe(10);
|
||||
// expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||
// expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
// expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
// expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||
// expect(completedResponse.body.data[0].content).not.toContain("main menu");
|
||||
// }, 60000); // 60 seconds
|
||||
|
||||
it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://firecrawl.dev",
|
||||
pageOptions: { includeHtml: true },
|
||||
});
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
expect(response.body.status).toBe("active");
|
||||
|
||||
let isCompleted = false;
|
||||
while (!isCompleted) {
|
||||
const statusCheckResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(statusCheckResponse.statusCode).toBe(200);
|
||||
isCompleted = statusCheckResponse.body.status === "completed";
|
||||
if (!isCompleted) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
|
||||
// 120 seconds
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("html");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].content).toContain("🔥 Firecrawl");
|
||||
expect(completedResponse.body.data[0].markdown).toContain("Firecrawl");
|
||||
expect(completedResponse.body.data[0].html).toContain("<h1");
|
||||
}, 60000);
|
||||
});
|
||||
|
||||
describe("POST /v0/crawlWebsitePreview", () => {
|
||||
@ -248,7 +591,7 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(response.statusCode).toBe(404);
|
||||
});
|
||||
|
||||
it("should return a successful response for a valid crawl job", async () => {
|
||||
it("should return a successful crawl status response for a valid crawl job", async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
@ -256,27 +599,67 @@ describe("E2E Tests for API Routes", () => {
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
let isCompleted = false;
|
||||
let completedResponse;
|
||||
|
||||
while (!isCompleted) {
|
||||
const response = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
expect(response.body.status).toBe("active");
|
||||
|
||||
// wait for 30 seconds
|
||||
await new Promise((r) => setTimeout(r, 30000));
|
||||
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
if (response.body.status === "completed") {
|
||||
isCompleted = true;
|
||||
completedResponse = response;
|
||||
} else {
|
||||
await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
|
||||
expect(completedResponse.body.data[0].content).toContain("🔥 Firecrawl");
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post('/v0/crawl')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }});
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
let isCompleted = false;
|
||||
let completedResponse;
|
||||
|
||||
while (!isCompleted) {
|
||||
const response = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('status');
|
||||
|
||||
if (response.body.status === 'completed') {
|
||||
isCompleted = true;
|
||||
completedResponse = response;
|
||||
} else {
|
||||
await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
expect(completedResponse.body.status).toBe('completed');
|
||||
expect(completedResponse.body).toHaveProperty('data');
|
||||
expect(completedResponse.body.data.length).toEqual(1);
|
||||
expect(completedResponse.body.data).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.')
|
||||
})
|
||||
])
|
||||
);
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
it("should return a successful response with max depth option for a valid crawl job", async () => {
|
||||
@ -290,18 +673,21 @@ describe("E2E Tests for API Routes", () => {
|
||||
});
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
let isCompleted = false;
|
||||
let completedResponse;
|
||||
|
||||
while (!isCompleted) {
|
||||
const response = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
expect(response.body.status).toBe("active");
|
||||
// wait for 60 seconds
|
||||
await new Promise((r) => setTimeout(r, 60000));
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
if (response.body.status === "completed") {
|
||||
isCompleted = true;
|
||||
completedResponse = response;
|
||||
}
|
||||
}
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
@ -357,8 +743,8 @@ describe("E2E Tests for API Routes", () => {
|
||||
// 120 seconds
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("html");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
|
||||
expect(completedResponse.body.data[0].markdown).toContain("FireCrawl");
|
||||
expect(completedResponse.body.data[0].content).toContain("🔥 Firecrawl");
|
||||
expect(completedResponse.body.data[0].markdown).toContain("Firecrawl");
|
||||
expect(completedResponse.body.data[0].html).toContain("<h1");
|
||||
}, 60000);
|
||||
}); // 60 seconds
|
||||
@ -371,10 +757,8 @@ describe("E2E Tests for API Routes", () => {
|
||||
.send({ url: "https://jestjs.io" });
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
|
||||
|
||||
// wait for 20 seconds
|
||||
await new Promise((r) => setTimeout(r, 10000));
|
||||
await new Promise((r) => setTimeout(r, 20000));
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
|
||||
@ -383,7 +767,7 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(response.body).toHaveProperty("status");
|
||||
expect(response.body.status).toBe("cancelled");
|
||||
|
||||
await new Promise((r) => setTimeout(r, 20000));
|
||||
await new Promise((r) => setTimeout(r, 10000));
|
||||
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
@ -400,8 +784,6 @@ describe("E2E Tests for API Routes", () => {
|
||||
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
|
||||
|
||||
describe("POST /v0/scrape with LLM Extraction", () => {
|
||||
it("should extract data using LLM extraction mode", async () => {
|
||||
const response = await request(TEST_URL)
|
||||
@ -511,6 +893,107 @@ describe("E2E Tests for API Routes", () => {
|
||||
// }, 120000); // 120 secs
|
||||
// });
|
||||
|
||||
describe("POST /v0/crawl with fast mode", () => {
|
||||
it("should complete the crawl under 20 seconds", async () => {
|
||||
const startTime = Date.now();
|
||||
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://flutterbricks.com",
|
||||
crawlerOptions: {
|
||||
mode: "fast"
|
||||
}
|
||||
});
|
||||
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
const jobId = crawlResponse.body.jobId;
|
||||
let statusResponse;
|
||||
let isFinished = false;
|
||||
|
||||
while (!isFinished) {
|
||||
statusResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(statusResponse.statusCode).toBe(200);
|
||||
isFinished = statusResponse.body.status === "completed";
|
||||
|
||||
if (!isFinished) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
const endTime = Date.now();
|
||||
const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
|
||||
|
||||
console.log(`Time elapsed: ${timeElapsed} seconds`);
|
||||
|
||||
expect(statusResponse.body.status).toBe("completed");
|
||||
expect(statusResponse.body).toHaveProperty("data");
|
||||
expect(statusResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(statusResponse.body.data[0]).toHaveProperty("markdown");
|
||||
const results = statusResponse.body.data;
|
||||
// results.forEach((result, i) => {
|
||||
// console.log(result.metadata.sourceURL);
|
||||
// });
|
||||
expect(results.length).toBeGreaterThanOrEqual(10);
|
||||
expect(results.length).toBeLessThanOrEqual(15);
|
||||
|
||||
}, 20000);
|
||||
|
||||
// it("should complete the crawl in more than 10 seconds", async () => {
|
||||
// const startTime = Date.now();
|
||||
|
||||
// const crawlResponse = await request(TEST_URL)
|
||||
// .post("/v0/crawl")
|
||||
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
// .set("Content-Type", "application/json")
|
||||
// .send({
|
||||
// url: "https://flutterbricks.com",
|
||||
// });
|
||||
|
||||
// expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
// const jobId = crawlResponse.body.jobId;
|
||||
// let statusResponse;
|
||||
// let isFinished = false;
|
||||
|
||||
// while (!isFinished) {
|
||||
// statusResponse = await request(TEST_URL)
|
||||
// .get(`/v0/crawl/status/${jobId}`)
|
||||
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
// expect(statusResponse.statusCode).toBe(200);
|
||||
// isFinished = statusResponse.body.status === "completed";
|
||||
|
||||
// if (!isFinished) {
|
||||
// await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
// }
|
||||
// }
|
||||
|
||||
// const endTime = Date.now();
|
||||
// const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
|
||||
|
||||
// console.log(`Time elapsed: ${timeElapsed} seconds`);
|
||||
|
||||
// expect(statusResponse.body.status).toBe("completed");
|
||||
// expect(statusResponse.body).toHaveProperty("data");
|
||||
// expect(statusResponse.body.data[0]).toHaveProperty("content");
|
||||
// expect(statusResponse.body.data[0]).toHaveProperty("markdown");
|
||||
// const results = statusResponse.body.data;
|
||||
// // results.forEach((result, i) => {
|
||||
// // console.log(result.metadata.sourceURL);
|
||||
// // });
|
||||
// expect(results.length).toBeGreaterThanOrEqual(10);
|
||||
// expect(results.length).toBeLessThanOrEqual(15);
|
||||
|
||||
// }, 50000);// 15 seconds timeout to account for network delays
|
||||
});
|
||||
|
||||
describe("GET /is-production", () => {
|
||||
it("should return the production status", async () => {
|
||||
const response = await request(TEST_URL).get("/is-production");
|
||||
@ -518,4 +1001,65 @@ describe("E2E Tests for API Routes", () => {
|
||||
expect(response.body).toHaveProperty("isProduction");
|
||||
});
|
||||
});
|
||||
|
||||
describe("Rate Limiter", () => {
|
||||
it("should return 429 when rate limit is exceeded for preview token", async () => {
|
||||
for (let i = 0; i < 4; i++) {
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer this_is_just_a_preview_token`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://www.scrapethissite.com" });
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
}
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer this_is_just_a_preview_token`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://www.scrapethissite.com" });
|
||||
|
||||
expect(response.statusCode).toBe(429);
|
||||
}, 60000);
|
||||
});
|
||||
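The same behaviour can be observed from a shell: repeated scrapes with the preview token should begin returning 429 once the limit is reached (a sketch; the exact threshold depends on the configured limiter):

```bash
for i in $(seq 1 5); do
  curl -s -o /dev/null -w "request $i: %{http_code}\n" \
    -X POST https://api.firecrawl.dev/v0/scrape \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer this_is_just_a_preview_token' \
    -d '{"url": "https://www.scrapethissite.com"}'
done
# Early requests should return 200; once the preview limit is exceeded, 429
```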
|
||||
// it("should return 429 when rate limit is exceeded for API key", async () => {
|
||||
// for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_SCRAPE); i++) {
|
||||
// const response = await request(TEST_URL)
|
||||
// .post("/v0/scrape")
|
||||
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
// .set("Content-Type", "application/json")
|
||||
// .send({ url: "https://www.scrapethissite.com" });
|
||||
|
||||
// expect(response.statusCode).toBe(200);
|
||||
// }
|
||||
|
||||
// const response = await request(TEST_URL)
|
||||
// .post("/v0/scrape")
|
||||
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
// .set("Content-Type", "application/json")
|
||||
// .send({ url: "https://www.scrapethissite.com" });
|
||||
|
||||
// expect(response.statusCode).toBe(429);
|
||||
// }, 60000);
|
||||
|
||||
// it("should return 429 when rate limit is exceeded for API key", async () => {
|
||||
// for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_CRAWL); i++) {
|
||||
// const response = await request(TEST_URL)
|
||||
// .post("/v0/crawl")
|
||||
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
// .set("Content-Type", "application/json")
|
||||
// .send({ url: "https://www.scrapethissite.com" });
|
||||
|
||||
// expect(response.statusCode).toBe(200);
|
||||
// }
|
||||
|
||||
// const response = await request(TEST_URL)
|
||||
// .post("/v0/crawl")
|
||||
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
// .set("Content-Type", "application/json")
|
||||
// .send({ url: "https://www.scrapethissite.com" });
|
||||
|
||||
// expect(response.statusCode).toBe(429);
|
||||
// }, 60000);
|
||||
});
|
||||
|
@ -1,14 +1,25 @@
import { parseApi } from "../../src/lib/parseApi";
import { getRateLimiter } from "../../src/services/rate-limiter";
import { getRateLimiter, } from "../../src/services/rate-limiter";
import { AuthResponse, RateLimiterMode } from "../../src/types";
import { supabase_service } from "../../src/services/supabase";
import { withAuth } from "../../src/lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';


export async function authenticateUser(req, res, mode?: RateLimiterMode) : Promise<AuthResponse> {
export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
  return withAuth(supaAuthenticateUser)(req, res, mode);
}
function setTrace(team_id: string, api_key: string) {
  try {
    setTraceAttributes({
      team_id,
      api_key
    });
  } catch (error) {
    console.error('Error setting trace attributes:', error);
  }

}
export async function supaAuthenticateUser(
  req,
  res,
@ -18,8 +29,8 @@ export async function supaAuthenticateUser(
  team_id?: string;
  error?: string;
  status?: number;
  plan?: string;
}> {

  const authHeader = req.headers.authorization;
  if (!authHeader) {
    return { success: false, error: "Unauthorized", status: 401 };
@ -33,18 +44,95 @@
    };
  }

  try {
    const incomingIP = (req.headers["x-forwarded-for"] ||
      req.socket.remoteAddress) as string;
    const iptoken = incomingIP + token;
    await getRateLimiter(
      token === "this_is_just_a_preview_token" ? RateLimiterMode.Preview : mode, token
    ).consume(iptoken);
  } catch (rateLimiterRes) {
    console.error(rateLimiterRes);

  let rateLimiter: RateLimiterRedis;
  let subscriptionData: { team_id: string, plan: string } | null = null;
  let normalizedApi: string;

  if (token == "this_is_just_a_preview_token") {
    rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
  } else {
    normalizedApi = parseApi(token);

    const { data, error } = await supabase_service.rpc(
      'get_key_and_price_id_2', { api_key: normalizedApi }
    );
    // get_key_and_price_id_2 rpc definition:
    // create or replace function get_key_and_price_id_2(api_key uuid)
    // returns table(key uuid, team_id uuid, price_id text) as $$
    // begin
    //   if api_key is null then
    //     return query
    //     select null::uuid as key, null::uuid as team_id, null::text as price_id;
    //   end if;

    //   return query
    //   select ak.key, ak.team_id, s.price_id
    //   from api_keys ak
    //   left join subscriptions s on ak.team_id = s.team_id
    //   where ak.key = api_key;
    // end;
    // $$ language plpgsql;

    if (error) {
      console.error('Error fetching key and price_id:', error);
    } else {
      // console.log('Key and Price ID:', data);
    }

    if (error || !data || data.length === 0) {
      return {
        success: false,
        error: "Rate limit exceeded. Too many requests, try again in 1 minute.",
        error: "Unauthorized: Invalid token",
        status: 401,
      };
    }
    const team_id = data[0].team_id;
    const plan = getPlanByPriceId(data[0].price_id);
    // HyperDX Logging
    setTrace(team_id, normalizedApi);
    subscriptionData = {
      team_id: team_id,
      plan: plan
    }
    switch (mode) {
      case RateLimiterMode.Crawl:
        rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token, subscriptionData.plan);
        break;
      case RateLimiterMode.Scrape:
        rateLimiter = getRateLimiter(RateLimiterMode.Scrape, token, subscriptionData.plan);
        break;
      case RateLimiterMode.Search:
        rateLimiter = getRateLimiter(RateLimiterMode.Search, token, subscriptionData.plan);
        break;
      case RateLimiterMode.CrawlStatus:
        rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
        break;

      case RateLimiterMode.Preview:
        rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
        break;
      default:
        rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token);
        break;
      // case RateLimiterMode.Search:
      //   rateLimiter = await searchRateLimiter(RateLimiterMode.Search, token);
      //   break;
    }
  }

  try {
    await rateLimiter.consume(iptoken);
  } catch (rateLimiterRes) {
    console.error(rateLimiterRes);
    const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
    const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
    return {
      success: false,
      error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
      status: 429,
    };
  }
@ -66,12 +154,15 @@ export async function supaAuthenticateUser(
  //   return { success: false, error: "Unauthorized: Invalid token", status: 401 };
  }

  const normalizedApi = parseApi(token);
  // make sure api key is valid, based on the api_keys table in supabase
  if (!subscriptionData) {
    normalizedApi = parseApi(token);

    const { data, error } = await supabase_service
      .from("api_keys")
      .select("*")
      .eq("key", normalizedApi);

    if (error || !data || data.length === 0) {
      return {
        success: false,
@ -80,5 +171,27 @@ export async function supaAuthenticateUser(
|
||||
};
|
||||
}
|
||||
|
||||
return { success: true, team_id: data[0].team_id };
|
||||
subscriptionData = data[0];
|
||||
}
|
||||
|
||||
return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""};
|
||||
}
|
||||
|
||||
function getPlanByPriceId(price_id: string) {
|
||||
switch (price_id) {
|
||||
case process.env.STRIPE_PRICE_ID_STARTER:
|
||||
return 'starter';
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD:
|
||||
return 'standard';
|
||||
case process.env.STRIPE_PRICE_ID_SCALE:
|
||||
return 'scale';
|
||||
case process.env.STRIPE_PRICE_ID_HOBBY:
case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY:
return 'hobby';
case process.env.STRIPE_PRICE_ID_STANDARD_NEW:
case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY:
return 'standard-new';
case process.env.STRIPE_PRICE_ID_GROWTH:
case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
return 'growth';
|
||||
default:
|
||||
return 'free';
|
||||
}
|
||||
}
|
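For context, a minimal sketch of how a controller might consume the authentication result above. The route wiring, import paths, and response shape are illustrative assumptions, not part of this diff; the destructured fields mirror the AuthResponse returned by authenticateUser.

import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";

// Hypothetical handler: reject on auth/rate-limit failure, then branch on the plan.
export async function examplePlanAwareController(req: Request, res: Response) {
  const { success, team_id, error, status, plan } = await authenticateUser(
    req,
    res,
    RateLimiterMode.Scrape
  );
  if (!success) {
    return res.status(status ?? 401).json({ error });
  }
  // Plan-dependent behaviour, e.g. pricing LLM extraction differently (see scrapeHelper below).
  const creditsPerLLMExtract = plan === "starter" ? 5 : 50;
  return res.json({ team_id, plan, creditsPerLLMExtract });
}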
@ -7,6 +7,8 @@ import { RateLimiterMode } from "../../src/types";
|
||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { logCrawl } from "../../src/services/logging/crawl_log";
|
||||
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
|
||||
import { createIdempotencyKey } from "../../src/services/idempotency/create";
|
||||
|
||||
export async function crawlController(req: Request, res: Response) {
|
||||
try {
|
||||
@ -19,6 +21,19 @@ export async function crawlController(req: Request, res: Response) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
if (req.headers["x-idempotency-key"]) {
|
||||
const isIdempotencyValid = await validateIdempotencyKey(req);
|
||||
if (!isIdempotencyValid) {
|
||||
return res.status(409).json({ error: "Idempotency key already used" });
|
||||
}
|
||||
try {
|
||||
createIdempotencyKey(req);
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||
await checkTeamCredits(team_id, 1);
|
||||
if (!creditsCheckSuccess) {
|
||||
|
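A brief client-side sketch of the idempotency behaviour added above: re-sending the same x-idempotency-key yields a 409 on the second attempt. Only the header name and the 409 status come from the diff; the endpoint URL and payload are assumptions for illustration.

// Hypothetical caller exercising the idempotency check.
async function submitCrawlOnce(apiKey: string, idempotencyKey: string) {
  const response = await fetch("https://api.firecrawl.dev/v0/crawl", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
      "x-idempotency-key": idempotencyKey,
    },
    body: JSON.stringify({ url: "https://example.com" }),
  });
  if (response.status === 409) {
    console.log("Idempotency key already used; skipping duplicate crawl.");
    return null;
  }
  return response.json();
}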
@ -15,7 +15,8 @@ export async function scrapeHelper(
|
||||
crawlerOptions: any,
|
||||
pageOptions: PageOptions,
|
||||
extractorOptions: ExtractorOptions,
|
||||
timeout: number
|
||||
timeout: number,
|
||||
plan?: string
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
error?: string;
|
||||
@ -64,7 +65,9 @@ export async function scrapeHelper(
|
||||
}
|
||||
|
||||
let creditsToBeBilled = filteredDocs.length;
|
||||
const creditsPerLLMExtract = 5;
|
||||
const creditsPerLLMExtract = plan === "starter" ? 5 : 50;
|
||||
|
||||
|
||||
|
||||
if (extractorOptions.mode === "llm-extraction") {
|
||||
creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
|
||||
@ -93,7 +96,7 @@ export async function scrapeHelper(
|
||||
export async function scrapeController(req: Request, res: Response) {
|
||||
try {
|
||||
// make sure to authenticate user first, Bearer <token>
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Scrape
|
||||
@ -102,10 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
|
||||
const extractorOptions = req.body.extractorOptions ?? {
|
||||
mode: "markdown"
|
||||
}
|
||||
if (extractorOptions.mode === "llm-extraction") {
|
||||
pageOptions.onlyMainContent = true;
|
||||
}
|
||||
const origin = req.body.origin ?? "api";
|
||||
const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds
|
||||
|
||||
@ -126,7 +132,8 @@ export async function scrapeController(req: Request, res: Response) {
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
timeout
|
||||
timeout,
|
||||
plan
|
||||
);
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
|
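To make the new page options concrete, a hedged example request body for the scrape endpoint; the field names follow the defaults above, the values are illustrative.

// Illustrative request body exercising waitFor, screenshot and headers.
const scrapeRequestBody = {
  url: "https://example.com",
  pageOptions: {
    onlyMainContent: false,
    includeHtml: true,
    waitFor: 2000,      // wait 2s before capturing the page
    screenshot: true,   // ask the fire-engine scraper for a screenshot
    headers: { "accept-language": "en-US,en;q=0.9" },
  },
  extractorOptions: { mode: "markdown" },
  timeout: 30000,       // matches the 30s default above
};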
@ -28,11 +28,13 @@ export async function searchHelper(
|
||||
|
||||
const tbs = searchOptions.tbs ?? null;
|
||||
const filter = searchOptions.filter ?? null;
|
||||
const num_results = searchOptions.limit ?? 7;
|
||||
const num_results_buffer = Math.floor(num_results * 1.5);
|
||||
|
||||
let res = await search({
|
||||
query: query,
|
||||
advanced: advanced,
|
||||
num_results: searchOptions.limit ?? 7,
|
||||
num_results: num_results_buffer,
|
||||
tbs: tbs,
|
||||
filter: filter,
|
||||
lang: searchOptions.lang ?? "en",
|
||||
@ -47,6 +49,9 @@ export async function searchHelper(
|
||||
}
|
||||
|
||||
res = res.filter((r) => !isUrlBlocked(r.url));
|
||||
if (res.length > num_results) {
|
||||
res = res.slice(0, num_results);
|
||||
}
|
||||
|
||||
if (res.length === 0) {
|
||||
return { success: true, error: "No search results found", returnCode: 200 };
|
||||
|
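The over-fetch-then-filter pattern above can be summarised in isolation; this is a sketch with a simplified result type, not code from the repository.

// Fetch ~1.5x the requested results so blocked URLs can be dropped
// while still returning up to `limit` items.
function trimSearchResults<T extends { url: string }>(
  results: T[],
  limit: number,
  isUrlBlocked: (url: string) => boolean
): T[] {
  const filtered = results.filter((r) => !isUrlBlocked(r.url));
  return filtered.length > limit ? filtered.slice(0, limit) : filtered;
}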
@ -5,6 +5,8 @@ import "dotenv/config";
|
||||
import { getWebScraperQueue } from "./services/queue-service";
|
||||
import { redisClient } from "./services/rate-limiter";
|
||||
import { v0Router } from "./routes/v0";
|
||||
import { initSDK } from '@hyperdx/node-opentelemetry';
|
||||
|
||||
const { createBullBoard } = require("@bull-board/api");
|
||||
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
||||
const { ExpressAdapter } = require("@bull-board/express");
|
||||
@ -47,6 +49,11 @@ const DEFAULT_PORT = process.env.PORT ?? 3002;
|
||||
const HOST = process.env.HOST ?? "localhost";
|
||||
redisClient.connect();
|
||||
|
||||
// HyperDX OpenTelemetry
|
||||
if (process.env.ENV === 'production') {
|
||||
initSDK({ consoleCapture: true, additionalInstrumentations: []});
|
||||
}
|
||||
|
||||
|
||||
export function startServer(port = DEFAULT_PORT) {
|
||||
const server = app.listen(Number(port), HOST, () => {
|
||||
@ -161,3 +168,6 @@ app.get('/serverHealthCheck/notify', async (req, res) => {
|
||||
app.get("/is-production", (req, res) => {
|
||||
res.send({ isProduction: global.isProduction });
|
||||
});
|
||||
|
||||
|
||||
// /workers health check, can't act as a load balancer, just has to be a pre-deploy thing
|
@ -1,25 +1,38 @@
|
||||
import OpenAI from "openai";
|
||||
import { Document } from "../../lib/entities";
|
||||
import { numTokensFromString } from "./helpers";
|
||||
|
||||
export type ScraperCompletionResult = {
|
||||
data: any | null;
|
||||
url: string;
|
||||
};
|
||||
|
||||
const maxTokens = 32000;
|
||||
const modifier = 4;
|
||||
const defaultPrompt =
|
||||
"You are a professional web scraper. Extract the contents of the webpage";
|
||||
|
||||
function prepareOpenAIDoc(
|
||||
document: Document
|
||||
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
|
||||
// Check if the markdown content exists in the document
|
||||
if (!document.markdown) {
|
||||
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
|
||||
let markdown = document.markdown;
|
||||
|
||||
// Check if the markdown content exists in the document
|
||||
if (!markdown) {
|
||||
throw new Error(
|
||||
"Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
|
||||
);
|
||||
}
|
||||
|
||||
return [{ type: "text", text: document.markdown }];
|
||||
// count number of tokens
|
||||
const numTokens = numTokensFromString(document.markdown, "gpt-4");
|
||||
|
||||
if (numTokens > maxTokens) {
|
||||
// trim the document to the maximum number of tokens, tokens != characters
|
||||
markdown = markdown.slice(0, (maxTokens * modifier));
|
||||
}
|
||||
|
||||
return [[{ type: "text", text: markdown }], numTokens];
|
||||
}
|
||||
|
||||
export async function generateOpenAICompletions({
|
||||
@ -38,7 +51,7 @@ export async function generateOpenAICompletions({
|
||||
temperature?: number;
|
||||
}): Promise<Document> {
|
||||
const openai = client as OpenAI;
|
||||
const content = prepareOpenAIDoc(document);
|
||||
const [content, numTokens] = prepareOpenAIDoc(document);
|
||||
|
||||
const completion = await openai.chat.completions.create({
|
||||
model,
|
||||
@ -72,6 +85,7 @@ export async function generateOpenAICompletions({
|
||||
return {
|
||||
...document,
|
||||
llm_extraction: llmExtraction,
|
||||
warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attempted: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined,
|
||||
};
|
||||
}
|
||||
|
||||
|
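A standalone sketch of the trimming rule used above. The 4-characters-per-token modifier is a rough heuristic carried over from the diff, not an exact tokenizer.

const MAX_TOKENS = 32000;
const CHARS_PER_TOKEN = 4; // rough heuristic, mirrors `modifier` above

// Trim markdown so its estimated size fits the model context window,
// and report whether trimming happened (used to set the `warning` field).
function trimToTokenBudget(markdown: string, numTokens: number): { text: string; trimmed: boolean } {
  if (numTokens <= MAX_TOKENS) {
    return { text: markdown, trimmed: false };
  }
  return { text: markdown.slice(0, MAX_TOKENS * CHARS_PER_TOKEN), trimmed: true };
}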
@ -15,6 +15,9 @@ export type PageOptions = {
|
||||
includeHtml?: boolean;
|
||||
fallback?: boolean;
|
||||
fetchPageContent?: boolean;
|
||||
waitFor?: number;
|
||||
screenshot?: boolean;
|
||||
headers?: Record<string, string>;
|
||||
};
|
||||
|
||||
export type ExtractorOptions = {
|
||||
@ -44,6 +47,7 @@ export type WebScraperOptions = {
|
||||
limit?: number;
|
||||
generateImgAltText?: boolean;
|
||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||
mode?: "default" | "fast"; // have a mode of some sort
|
||||
};
|
||||
pageOptions?: PageOptions;
|
||||
extractorOptions?: ExtractorOptions;
|
||||
@ -71,6 +75,7 @@ export class Document {
|
||||
};
|
||||
childrenLinks?: string[];
|
||||
provider?: string;
|
||||
warning?: string;
|
||||
|
||||
constructor(data: Partial<Document>) {
|
||||
if (!data.content) {
|
||||
@ -103,3 +108,8 @@ export class SearchResult {
|
||||
return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`;
|
||||
}
|
||||
}
|
||||
|
||||
export interface FireEngineResponse {
|
||||
html: string;
|
||||
screenshot: string;
|
||||
}
|
42
apps/api/src/lib/load-testing-example.ts
Normal file
@ -0,0 +1,42 @@
|
||||
// import { scrapWithFireEngine } from "../../src/scraper/WebScraper/single_url";
|
||||
|
||||
// const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
|
||||
|
||||
// const scrapInBatches = async (
|
||||
// urls: string[],
|
||||
// batchSize: number,
|
||||
// delayMs: number
|
||||
// ) => {
|
||||
// let successCount = 0;
|
||||
// let errorCount = 0;
|
||||
|
||||
// for (let i = 0; i < urls.length; i += batchSize) {
|
||||
// const batch = urls
|
||||
// .slice(i, i + batchSize)
|
||||
// .map((url) => scrapWithFireEngine(url));
|
||||
// try {
|
||||
// const results = await Promise.all(batch);
|
||||
// results.forEach((data, index) => {
|
||||
// if (data.trim() === "") {
|
||||
// errorCount++;
|
||||
// } else {
|
||||
// successCount++;
|
||||
// console.log(
|
||||
// `Scraping result ${i + index + 1}:`,
|
||||
// data.trim().substring(0, 20) + "..."
|
||||
// );
|
||||
// }
|
||||
// });
|
||||
// } catch (error) {
|
||||
// console.error("Error during scraping:", error);
|
||||
// }
|
||||
// await delay(delayMs);
|
||||
// }
|
||||
|
||||
// console.log(`Total successful scrapes: ${successCount}`);
|
||||
// console.log(`Total errored scrapes: ${errorCount}`);
|
||||
// };
|
||||
// function run() {
|
||||
// const urls = Array.from({ length: 200 }, () => "https://scrapethissite.com");
|
||||
// scrapInBatches(urls, 10, 1000);
|
||||
// }
|
@ -15,7 +15,7 @@ export class WebCrawler {
|
||||
private maxCrawledLinks: number;
|
||||
private maxCrawledDepth: number;
|
||||
private visited: Set<string> = new Set();
|
||||
private crawledUrls: Set<string> = new Set();
|
||||
private crawledUrls: Map<string, string> = new Map();
|
||||
private limit: number;
|
||||
private robotsTxtUrl: string;
|
||||
private robots: any;
|
||||
@ -25,7 +25,7 @@ export class WebCrawler {
|
||||
initialUrl,
|
||||
includes,
|
||||
excludes,
|
||||
maxCrawledLinks,
|
||||
maxCrawledLinks = 10000,
|
||||
limit = 10000,
|
||||
generateImgAltText = false,
|
||||
maxCrawledDepth = 10,
|
||||
@ -51,7 +51,6 @@ export class WebCrawler {
|
||||
this.generateImgAltText = generateImgAltText ?? false;
|
||||
}
|
||||
|
||||
|
||||
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
||||
return sitemapLinks
|
||||
.filter((link) => {
|
||||
@ -77,9 +76,22 @@ export class WebCrawler {
|
||||
|
||||
// Check if the link matches the include patterns, if any are specified
|
||||
if (this.includes.length > 0 && this.includes[0] !== "") {
|
||||
return this.includes.some((includePattern) =>
|
||||
if (!this.includes.some((includePattern) =>
|
||||
new RegExp(includePattern).test(path)
|
||||
);
|
||||
)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Normalize the initial URL and the link to account for www and non-www versions
|
||||
const normalizedInitialUrl = new URL(this.initialUrl);
|
||||
const normalizedLink = new URL(link);
|
||||
const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, '');
|
||||
const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
|
||||
|
||||
// Ensure the protocol and hostname match, and the path starts with the initial URL's path
|
||||
if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
|
||||
@ -99,19 +111,21 @@ export class WebCrawler {
|
||||
concurrencyLimit: number = 5,
|
||||
limit: number = 10000,
|
||||
maxDepth: number = 10
|
||||
): Promise<string[]> {
|
||||
): Promise<{ url: string, html: string }[]> {
|
||||
// Fetch and parse robots.txt
|
||||
try {
|
||||
const response = await axios.get(this.robotsTxtUrl);
|
||||
this.robots = robotsParser(this.robotsTxtUrl, response.data);
|
||||
} catch (error) {
|
||||
console.error(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
|
||||
console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
|
||||
|
||||
}
|
||||
|
||||
|
||||
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
||||
if (sitemapLinks.length > 0) {
|
||||
const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
|
||||
return filteredLinks;
|
||||
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
|
||||
return filteredLinks.map(link => ({ url: link, html: "" }));
|
||||
}
|
||||
|
||||
const urls = await this.crawlUrls(
|
||||
@ -123,43 +137,58 @@ export class WebCrawler {
|
||||
urls.length === 0 &&
|
||||
this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
|
||||
) {
|
||||
return [this.initialUrl];
|
||||
return [{ url: this.initialUrl, html: "" }];
|
||||
}
|
||||
|
||||
|
||||
// make sure to run include exclude here again
|
||||
return this.filterLinks(urls, limit, this.maxCrawledDepth);
|
||||
const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
|
||||
return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
|
||||
}
|
||||
|
||||
private async crawlUrls(
|
||||
urls: string[],
|
||||
concurrencyLimit: number,
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<string[]> {
|
||||
inProgress?: (progress: Progress) => void,
|
||||
): Promise<{ url: string, html: string }[]> {
|
||||
const queue = async.queue(async (task: string, callback) => {
|
||||
if (this.crawledUrls.size >= this.maxCrawledLinks) {
|
||||
if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
|
||||
if (callback && typeof callback === "function") {
|
||||
callback();
|
||||
}
|
||||
return;
|
||||
}
|
||||
const newUrls = await this.crawl(task);
|
||||
newUrls.forEach((url) => this.crawledUrls.add(url));
|
||||
// add the initial url if not already added
|
||||
// if (this.visited.size === 1) {
|
||||
// let normalizedInitial = this.initialUrl;
|
||||
// if (!normalizedInitial.endsWith("/")) {
|
||||
// normalizedInitial = normalizedInitial + "/";
|
||||
// }
|
||||
// if (!newUrls.some(page => page.url === this.initialUrl)) {
|
||||
// newUrls.push({ url: this.initialUrl, html: "" });
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
|
||||
|
||||
if (inProgress && newUrls.length > 0) {
|
||||
inProgress({
|
||||
current: this.crawledUrls.size,
|
||||
total: this.maxCrawledLinks,
|
||||
total: Math.min(this.maxCrawledLinks, this.limit),
|
||||
status: "SCRAPING",
|
||||
currentDocumentUrl: newUrls[newUrls.length - 1],
|
||||
currentDocumentUrl: newUrls[newUrls.length - 1].url,
|
||||
});
|
||||
} else if (inProgress) {
|
||||
inProgress({
|
||||
current: this.crawledUrls.size,
|
||||
total: this.maxCrawledLinks,
|
||||
total: Math.min(this.maxCrawledLinks, this.limit),
|
||||
status: "SCRAPING",
|
||||
currentDocumentUrl: task,
|
||||
});
|
||||
}
|
||||
await this.crawlUrls(newUrls, concurrencyLimit, inProgress);
|
||||
await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
|
||||
if (callback && typeof callback === "function") {
|
||||
callback();
|
||||
}
|
||||
@ -175,35 +204,48 @@ export class WebCrawler {
|
||||
}
|
||||
);
|
||||
await queue.drain();
|
||||
return Array.from(this.crawledUrls);
|
||||
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
|
||||
}
|
||||
|
||||
async crawl(url: string): Promise<string[]> {
|
||||
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent"))
|
||||
async crawl(url: string): Promise<{url: string, html: string}[]> {
|
||||
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
|
||||
return [];
|
||||
}
|
||||
this.visited.add(url);
|
||||
|
||||
|
||||
if (!url.startsWith("http")) {
|
||||
url = "https://" + url;
|
||||
|
||||
}
|
||||
if (url.endsWith("/")) {
|
||||
url = url.slice(0, -1);
|
||||
|
||||
}
|
||||
|
||||
if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
|
||||
return [];
|
||||
}
|
||||
|
||||
try {
|
||||
let content;
|
||||
// If it is the first link, fetch with scrapingbee
|
||||
let content : string = "";
|
||||
// If it is the first link, fetch with single url
|
||||
if (this.visited.size === 1) {
|
||||
const page = await scrapSingleUrl(url, {includeHtml: true});
|
||||
content = page.html;
|
||||
content = page.html ?? ""
|
||||
} else {
|
||||
const response = await axios.get(url);
|
||||
content = response.data;
|
||||
content = response.data ?? "";
|
||||
}
|
||||
const $ = load(content);
|
||||
let links: string[] = [];
|
||||
let links: {url: string, html: string}[] = [];
|
||||
|
||||
// Add the initial URL to the list of links
|
||||
if(this.visited.size === 1)
|
||||
{
|
||||
links.push({url, html: content});
|
||||
}
|
||||
|
||||
|
||||
$("a").each((_, element) => {
|
||||
const href = $(element).attr("href");
|
||||
@ -216,7 +258,6 @@ export class WebCrawler {
|
||||
const path = url.pathname;
|
||||
|
||||
if (
|
||||
// fullUrl.startsWith(this.initialUrl) && // this condition makes it stop crawling back the url
|
||||
this.isInternalLink(fullUrl) &&
|
||||
this.matchesPattern(fullUrl) &&
|
||||
this.noSections(fullUrl) &&
|
||||
@ -224,12 +265,16 @@ export class WebCrawler {
|
||||
!this.matchesExcludes(path) &&
|
||||
this.robots.isAllowed(fullUrl, "FireCrawlAgent")
|
||||
) {
|
||||
links.push(fullUrl);
|
||||
links.push({url: fullUrl, html: content});
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return links.filter((link) => !this.visited.has(link));
|
||||
if(this.visited.size === 1){
|
||||
return links;
|
||||
}
|
||||
// Create a new list to return to avoid modifying the visited list
|
||||
return links.filter((link) => !this.visited.has(link.url));
|
||||
} catch (error) {
|
||||
return [];
|
||||
}
|
||||
@ -276,9 +321,15 @@ export class WebCrawler {
|
||||
".mp4",
|
||||
".mp3",
|
||||
".pptx",
|
||||
".docx",
|
||||
// ".docx",
|
||||
".xlsx",
|
||||
".xml",
|
||||
".avi",
|
||||
".flv",
|
||||
".woff",
|
||||
".ttf",
|
||||
".woff2",
|
||||
".webp"
|
||||
];
|
||||
return fileExtensions.some((ext) => url.endsWith(ext));
|
||||
}
|
||||
@ -295,18 +346,57 @@ export class WebCrawler {
|
||||
return socialMediaOrEmail.some((ext) => url.includes(ext));
|
||||
}
|
||||
|
||||
//
|
||||
private async tryFetchSitemapLinks(url: string): Promise<string[]> {
|
||||
const normalizeUrl = (url: string) => {
|
||||
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
|
||||
if (url.endsWith("/")) {
|
||||
url = url.slice(0, -1);
|
||||
}
|
||||
return url;
|
||||
};
|
||||
|
||||
const sitemapUrl = url.endsWith("/sitemap.xml")
|
||||
? url
|
||||
: `${url}/sitemap.xml`;
|
||||
|
||||
let sitemapLinks: string[] = [];
|
||||
|
||||
try {
|
||||
const response = await axios.get(sitemapUrl);
|
||||
if (response.status === 200) {
|
||||
return await getLinksFromSitemap(sitemapUrl);
|
||||
sitemapLinks = await getLinksFromSitemap(sitemapUrl);
|
||||
}
|
||||
} catch (error) {
|
||||
// Error handling for failed sitemap fetch
|
||||
// console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
|
||||
}
|
||||
return [];
|
||||
|
||||
if (sitemapLinks.length === 0) {
|
||||
// If the first one doesn't work, try the base URL
|
||||
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
|
||||
try {
|
||||
const response = await axios.get(baseUrlSitemap);
|
||||
if (response.status === 200) {
|
||||
sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
|
||||
}
|
||||
} catch (error) {
|
||||
// Error handling for failed base URL sitemap fetch
|
||||
// console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Normalize and check if the URL is present in any of the sitemaps
|
||||
const normalizedUrl = normalizeUrl(url);
|
||||
|
||||
const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
|
||||
|
||||
// sitemapLinks must be non-empty here: if the initial URL were added when no sitemap exists, the caller would treat it as a sitemap result and skip crawling
|
||||
if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {
|
||||
// do not push the normalized url
|
||||
sitemapLinks.push(url);
|
||||
}
|
||||
|
||||
return sitemapLinks;
|
||||
}
|
||||
}
|
||||
|
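As a small illustration of the new { url, html } return shape, a consumer-side sketch; only the shape and the start() signature come from the diff, the limits are illustrative.

// (inside an async function) crawler.start now yields page HTML alongside each URL,
// so "fast" mode can reuse it instead of re-fetching.
const pages: { url: string; html: string }[] = await crawler.start(undefined, 5, 100, 10);
const allLinks = pages.map((p) => p.url);
const allHtmls = pages.map((p) => p.html);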
@ -17,6 +17,7 @@ import {
|
||||
} from "./utils/replacePaths";
|
||||
import { generateCompletions } from "../../lib/LLM-extraction";
|
||||
import { getWebScraperQueue } from "../../../src/services/queue-service";
|
||||
import { fetchAndProcessDocx } from "./utils/docxProcessor";
|
||||
|
||||
export class WebScraperDataProvider {
|
||||
private bullJobId: string;
|
||||
@ -35,6 +36,7 @@ export class WebScraperDataProvider {
|
||||
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
||||
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
|
||||
"gpt-4-turbo";
|
||||
private crawlerMode: string = "default";
|
||||
|
||||
authorize(): void {
|
||||
throw new Error("Method not implemented.");
|
||||
@ -46,7 +48,8 @@ export class WebScraperDataProvider {
|
||||
|
||||
private async convertUrlsToDocuments(
|
||||
urls: string[],
|
||||
inProgress?: (progress: Progress) => void
|
||||
inProgress?: (progress: Progress) => void,
|
||||
allHtmls?: string[]
|
||||
): Promise<Document[]> {
|
||||
const totalUrls = urls.length;
|
||||
let processedUrls = 0;
|
||||
@ -56,7 +59,12 @@ export class WebScraperDataProvider {
|
||||
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
||||
await Promise.all(
|
||||
batchUrls.map(async (url, index) => {
|
||||
const result = await scrapSingleUrl(url, this.pageOptions);
|
||||
const existingHTML = allHtmls ? allHtmls[i + index] : "";
|
||||
const result = await scrapSingleUrl(
|
||||
url,
|
||||
this.pageOptions,
|
||||
existingHTML
|
||||
);
|
||||
processedUrls++;
|
||||
if (inProgress) {
|
||||
inProgress({
|
||||
@ -127,9 +135,30 @@ export class WebScraperDataProvider {
|
||||
}
|
||||
}
|
||||
|
||||
private async cleanIrrelevantPath(links: string[]) {
|
||||
return links.filter((link) => {
|
||||
const normalizedInitialUrl = new URL(this.urls[0]);
|
||||
const normalizedLink = new URL(link);
|
||||
|
||||
// Normalize the hostname to account for www and non-www versions
|
||||
const initialHostname = normalizedInitialUrl.hostname.replace(
|
||||
/^www\./,
|
||||
""
|
||||
);
|
||||
const linkHostname = normalizedLink.hostname.replace(/^www\./, "");
|
||||
|
||||
// Ensure the protocol and hostname match, and the path starts with the initial URL's path
|
||||
return (
|
||||
linkHostname === initialHostname &&
|
||||
normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
private async handleCrawlMode(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
|
||||
const crawler = new WebCrawler({
|
||||
initialUrl: this.urls[0],
|
||||
includes: this.includes,
|
||||
@ -139,20 +168,38 @@ export class WebScraperDataProvider {
|
||||
limit: this.limit,
|
||||
generateImgAltText: this.generateImgAltText,
|
||||
});
|
||||
let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
|
||||
|
||||
let links = await crawler.start(
|
||||
inProgress,
|
||||
5,
|
||||
this.limit,
|
||||
this.maxCrawledDepth
|
||||
);
|
||||
|
||||
let allLinks = links.map((e) => e.url);
|
||||
const allHtmls = links.map((e) => e.html);
|
||||
|
||||
if (this.returnOnlyUrls) {
|
||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||
return this.returnOnlyUrlsResponse(allLinks, inProgress);
|
||||
}
|
||||
|
||||
let documents = await this.processLinks(links, inProgress);
|
||||
return this.cacheAndFinalizeDocuments(documents, links);
|
||||
let documents = [];
|
||||
// check if fast mode is enabled and there is html inside the links
|
||||
if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
|
||||
documents = await this.processLinks(allLinks, inProgress, allHtmls);
|
||||
} else {
|
||||
documents = await this.processLinks(allLinks, inProgress);
|
||||
}
|
||||
|
||||
return this.cacheAndFinalizeDocuments(documents, allLinks);
|
||||
}
|
||||
|
||||
private async handleSingleUrlsMode(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
let documents = await this.processLinks(this.urls, inProgress);
|
||||
const links = this.urls;
|
||||
|
||||
let documents = await this.processLinks(links, inProgress);
|
||||
return documents;
|
||||
}
|
||||
|
||||
@ -160,6 +207,8 @@ export class WebScraperDataProvider {
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
let links = await getLinksFromSitemap(this.urls[0]);
|
||||
links = await this.cleanIrrelevantPath(links);
|
||||
|
||||
if (this.returnOnlyUrls) {
|
||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||
}
|
||||
@ -189,14 +238,24 @@ export class WebScraperDataProvider {
|
||||
|
||||
private async processLinks(
|
||||
links: string[],
|
||||
inProgress?: (progress: Progress) => void
|
||||
inProgress?: (progress: Progress) => void,
|
||||
allHtmls?: string[]
|
||||
): Promise<Document[]> {
|
||||
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
||||
let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
|
||||
links = links.filter((link) => !link.endsWith(".pdf"));
|
||||
const pdfLinks = links.filter(link => link.endsWith(".pdf"));
|
||||
const docLinks = links.filter(link => link.endsWith(".doc") || link.endsWith(".docx"));
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
||||
const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
|
||||
const docxDocuments = await this.fetchDocxDocuments(docLinks);
|
||||
|
||||
links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(
|
||||
links,
|
||||
inProgress,
|
||||
allHtmls
|
||||
);
|
||||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
|
||||
documents = this.applyPathReplacements(documents);
|
||||
// documents = await this.applyImgAltText(documents);
|
||||
|
||||
@ -206,7 +265,7 @@ export class WebScraperDataProvider {
|
||||
) {
|
||||
documents = await generateCompletions(documents, this.extractorOptions);
|
||||
}
|
||||
return documents.concat(pdfDocuments);
|
||||
return documents.concat(pdfDocuments).concat(docxDocuments);
|
||||
}
|
||||
|
||||
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
|
||||
@ -221,6 +280,18 @@ export class WebScraperDataProvider {
|
||||
})
|
||||
);
|
||||
}
|
||||
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
|
||||
return Promise.all(
|
||||
docxLinks.map(async (p) => {
|
||||
const docXDocument = await fetchAndProcessDocx(p);
|
||||
return {
|
||||
content: docXDocument,
|
||||
metadata: { sourceURL: p },
|
||||
provider: "web-scraper",
|
||||
};
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
private applyPathReplacements(documents: Document[]): Document[] {
|
||||
return this.replaceAllPathsWithAbsolutePaths
|
||||
@ -397,8 +468,9 @@ export class WebScraperDataProvider {
|
||||
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
||||
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
||||
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||
//! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
|
||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
||||
this.excludes = this.excludes.filter((item) => item !== "");
|
||||
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
||||
|
||||
// make sure all urls start with https://
|
||||
this.urls = this.urls.map((url) => {
|
||||
|
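For reference, an illustrative crawlerOptions object using the new "fast" mode; the field names come from WebScraperOptions in the diff, the values are made up.

// "fast" reuses the HTML captured during the crawl instead of re-scraping each page.
const crawlerOptions = {
  maxCrawledLinks: 100,
  limit: 100,
  maxCrawledDepth: 10,
  mode: "fast" as const,
  replaceAllPathsWithAbsolutePaths: false,
};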
@ -2,13 +2,22 @@ import * as cheerio from "cheerio";
|
||||
import { ScrapingBeeClient } from "scrapingbee";
|
||||
import { extractMetadata } from "./utils/metadata";
|
||||
import dotenv from "dotenv";
|
||||
import { Document, PageOptions } from "../../lib/entities";
|
||||
import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
|
||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||
import { excludeNonMainTags } from "./utils/excludeTags";
|
||||
import { urlSpecificParams } from "./utils/custom/website_params";
|
||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
const baseScrapers = [
|
||||
"fire-engine",
|
||||
"scrapingBee",
|
||||
"playwright",
|
||||
"scrapingBeeLoad",
|
||||
"fetch",
|
||||
] as const;
|
||||
|
||||
export async function generateRequestParams(
|
||||
url: string,
|
||||
wait_browser: string = "domcontentloaded",
|
||||
@ -32,16 +41,54 @@ export async function generateRequestParams(
|
||||
return defaultParams;
|
||||
}
|
||||
}
|
||||
export async function scrapWithCustomFirecrawl(
|
||||
export async function scrapWithFireEngine(
|
||||
url: string,
|
||||
waitFor: number = 0,
|
||||
screenshot: boolean = false,
|
||||
headers?: Record<string, string>,
|
||||
options?: any
|
||||
): Promise<string> {
|
||||
): Promise<FireEngineResponse> {
|
||||
try {
|
||||
// TODO: merge the custom firecrawl scraper into mono-repo when ready
|
||||
return null;
|
||||
const reqParams = await generateRequestParams(url);
|
||||
// If the user has passed a wait parameter in the request, use that
|
||||
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||
console.log(
|
||||
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`
|
||||
);
|
||||
|
||||
const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
url: url,
|
||||
wait: waitParam,
|
||||
screenshot: screenshotParam,
|
||||
headers: headers,
|
||||
}),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
console.error(
|
||||
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
|
||||
);
|
||||
return { html: "", screenshot: "" };
|
||||
}
|
||||
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
return { html: await fetchAndProcessPdf(url), screenshot: "" };
|
||||
} else {
|
||||
const data = await response.json();
|
||||
const html = data.content;
|
||||
const screenshot = data.screenshot;
|
||||
return { html: html ?? "", screenshot: screenshot ?? "" };
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error scraping with custom firecrawl-scraper: ${error}`);
|
||||
return "";
|
||||
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
|
||||
return { html: "", screenshot: "" };
|
||||
}
|
||||
}
|
||||
|
||||
@ -62,51 +109,168 @@ export async function scrapWithScrapingBee(
|
||||
|
||||
if (response.status !== 200 && response.status !== 404) {
|
||||
console.error(
|
||||
`Scraping bee error in ${url} with status code ${response.status}`
|
||||
`[ScrapingBee] Error fetching url: ${url} with status code ${response.status}`
|
||||
);
|
||||
return "";
|
||||
}
|
||||
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
return fetchAndProcessPdf(url);
|
||||
} else {
|
||||
const decoder = new TextDecoder();
|
||||
const text = decoder.decode(response.data);
|
||||
return text;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error scraping with Scraping Bee: ${error}`);
|
||||
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
export async function scrapWithPlaywright(url: string): Promise<string> {
|
||||
export async function scrapWithPlaywright(
|
||||
url: string,
|
||||
waitFor: number = 0,
|
||||
headers?: Record<string, string>
|
||||
): Promise<string> {
|
||||
try {
|
||||
const reqParams = await generateRequestParams(url);
|
||||
const wait_playwright = reqParams["params"]?.wait ?? 0;
|
||||
// If the user has passed a wait parameter in the request, use that
|
||||
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||
|
||||
const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({ url: url, wait: wait_playwright }),
|
||||
body: JSON.stringify({ url: url, wait: waitParam, headers: headers }),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
console.error(
|
||||
`Error fetching w/ playwright server -> URL: ${url} with status: ${response.status}`
|
||||
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
|
||||
);
|
||||
return "";
|
||||
}
|
||||
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
return fetchAndProcessPdf(url);
|
||||
} else {
|
||||
const data = await response.json();
|
||||
const html = data.content;
|
||||
return html ?? "";
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error scraping with Puppeteer: ${error}`);
|
||||
console.error(`[Playwright][c] Error fetching url: ${url} -> ${error}`);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
export async function scrapWithFetch(url: string): Promise<string> {
|
||||
try {
|
||||
const response = await fetch(url);
|
||||
if (!response.ok) {
|
||||
console.error(
|
||||
`[Fetch] Error fetching url: ${url} with status: ${response.status}`
|
||||
);
|
||||
return "";
|
||||
}
|
||||
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
return fetchAndProcessPdf(url);
|
||||
} else {
|
||||
const text = await response.text();
|
||||
return text;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`[Fetch][c] Error fetching url: ${url} -> ${error}`);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the order of scrapers to be used for scraping a URL
|
||||
* If the user doesn't have envs set for a specific scraper, it will be removed from the order.
|
||||
* @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
|
||||
* @returns The order of scrapers to be used for scraping a URL
|
||||
*/
|
||||
function getScrapingFallbackOrder(
|
||||
defaultScraper?: string,
|
||||
isWaitPresent: boolean = false,
|
||||
isScreenshotPresent: boolean = false,
|
||||
isHeadersPresent: boolean = false
|
||||
) {
|
||||
const availableScrapers = baseScrapers.filter((scraper) => {
|
||||
switch (scraper) {
|
||||
case "scrapingBee":
|
||||
case "scrapingBeeLoad":
|
||||
return !!process.env.SCRAPING_BEE_API_KEY;
|
||||
case "fire-engine":
|
||||
return !!process.env.FIRE_ENGINE_BETA_URL;
|
||||
case "playwright":
|
||||
return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
});
|
||||
|
||||
let defaultOrder = [
|
||||
"scrapingBee",
|
||||
"fire-engine",
|
||||
"playwright",
|
||||
"scrapingBeeLoad",
|
||||
"fetch",
|
||||
];
|
||||
|
||||
if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
|
||||
defaultOrder = [
|
||||
"fire-engine",
|
||||
"playwright",
|
||||
...defaultOrder.filter(
|
||||
(scraper) => scraper !== "fire-engine" && scraper !== "playwright"
|
||||
),
|
||||
];
|
||||
}
|
||||
|
||||
const filteredDefaultOrder = defaultOrder.filter(
|
||||
(scraper: (typeof baseScrapers)[number]) =>
|
||||
availableScrapers.includes(scraper)
|
||||
);
|
||||
const uniqueScrapers = new Set(
|
||||
defaultScraper
|
||||
? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers]
|
||||
: [...filteredDefaultOrder, ...availableScrapers]
|
||||
);
|
||||
const scrapersInOrder = Array.from(uniqueScrapers);
|
||||
console.log(`Scrapers in order: ${scrapersInOrder}`);
|
||||
return scrapersInOrder as (typeof baseScrapers)[number][];
|
||||
}
|
||||
|
||||
async function handleCustomScraping(
|
||||
text: string,
|
||||
url: string
|
||||
): Promise<FireEngineResponse | null> {
|
||||
if (text.includes('<meta name="readme-deploy"')) {
|
||||
console.log(
|
||||
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
|
||||
);
|
||||
return await scrapWithFireEngine(url, 1000);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
export async function scrapSingleUrl(
|
||||
urlToScrap: string,
|
||||
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }
|
||||
pageOptions: PageOptions = {
|
||||
onlyMainContent: true,
|
||||
includeHtml: false,
|
||||
waitFor: 0,
|
||||
screenshot: false,
|
||||
headers: {}
|
||||
},
|
||||
existingHtml: string = ""
|
||||
): Promise<Document> {
|
||||
urlToScrap = urlToScrap.trim();
|
||||
|
||||
@ -124,17 +288,23 @@ export async function scrapSingleUrl(
|
||||
|
||||
const attemptScraping = async (
|
||||
url: string,
|
||||
method:
|
||||
| "firecrawl-scraper"
|
||||
| "scrapingBee"
|
||||
| "playwright"
|
||||
| "scrapingBeeLoad"
|
||||
| "fetch"
|
||||
method: (typeof baseScrapers)[number]
|
||||
) => {
|
||||
let text = "";
|
||||
let screenshot = "";
|
||||
switch (method) {
|
||||
case "firecrawl-scraper":
|
||||
text = await scrapWithCustomFirecrawl(url);
|
||||
case "fire-engine":
|
||||
if (process.env.FIRE_ENGINE_BETA_URL) {
|
||||
console.log(`Scraping ${url} with Fire Engine`);
|
||||
const response = await scrapWithFireEngine(
|
||||
url,
|
||||
pageOptions.waitFor,
|
||||
pageOptions.screenshot,
|
||||
pageOptions.headers
|
||||
);
|
||||
text = response.html;
|
||||
screenshot = response.screenshot;
|
||||
}
|
||||
break;
|
||||
case "scrapingBee":
|
||||
if (process.env.SCRAPING_BEE_API_KEY) {
|
||||
@ -147,7 +317,7 @@ export async function scrapSingleUrl(
|
||||
break;
|
||||
case "playwright":
|
||||
if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
|
||||
text = await scrapWithPlaywright(url);
|
||||
text = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
|
||||
}
|
||||
break;
|
||||
case "scrapingBeeLoad":
|
||||
@ -156,29 +326,24 @@ export async function scrapSingleUrl(
|
||||
}
|
||||
break;
|
||||
case "fetch":
|
||||
try {
|
||||
const response = await fetch(url);
|
||||
if (!response.ok) {
|
||||
console.error(
|
||||
`Error fetching URL: ${url} with status: ${response.status}`
|
||||
);
|
||||
return "";
|
||||
}
|
||||
text = await response.text();
|
||||
} catch (error) {
|
||||
console.error(`Error scraping URL: ${error}`);
|
||||
return "";
|
||||
}
|
||||
text = await scrapWithFetch(url);
|
||||
break;
|
||||
}
|
||||
|
||||
// Check for custom scraping conditions
|
||||
const customScrapedContent = await handleCustomScraping(text, url);
|
||||
if (customScrapedContent) {
|
||||
text = customScrapedContent.html;
|
||||
screenshot = customScrapedContent.screenshot;
|
||||
}
|
||||
|
||||
//* TODO: add an optional to return markdown or structured/extracted content
|
||||
let cleanedHtml = removeUnwantedElements(text, pageOptions);
|
||||
|
||||
return [await parseMarkdown(cleanedHtml), text];
|
||||
return [await parseMarkdown(cleanedHtml), text, screenshot];
|
||||
};
|
||||
try {
|
||||
let [text, html] = ["", ""];
|
||||
let [text, html, screenshot] = ["", "", ""];
|
||||
let urlKey = urlToScrap;
|
||||
try {
|
||||
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
|
||||
@ -186,20 +351,27 @@ export async function scrapSingleUrl(
|
||||
console.error(`Invalid URL key, trying: ${urlToScrap}`);
|
||||
}
|
||||
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
|
||||
const scrapersInOrder = defaultScraper
|
||||
? [
|
||||
const scrapersInOrder = getScrapingFallbackOrder(
|
||||
defaultScraper,
|
||||
"scrapingBee",
|
||||
"playwright",
|
||||
"scrapingBeeLoad",
|
||||
"fetch",
|
||||
]
|
||||
: ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
|
||||
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
||||
pageOptions && pageOptions.screenshot && pageOptions.screenshot === true,
|
||||
pageOptions && pageOptions.headers && pageOptions.headers !== undefined
|
||||
);
|
||||
|
||||
for (const scraper of scrapersInOrder) {
|
||||
[text, html] = await attemptScraping(urlToScrap, scraper);
|
||||
if (text && text.length >= 100) break;
|
||||
console.log(`Falling back to ${scraper}`);
|
||||
// If exists text coming from crawler, use it
|
||||
if (existingHtml && existingHtml.trim().length >= 100) {
|
||||
let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
|
||||
text = await parseMarkdown(cleanedHtml);
|
||||
html = existingHtml;
|
||||
break;
|
||||
}
|
||||
[text, html, screenshot] = await attemptScraping(urlToScrap, scraper);
|
||||
if (text && text.trim().length >= 100) break;
|
||||
const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
|
||||
if (nextScraperIndex < scrapersInOrder.length) {
|
||||
console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (!text) {
|
||||
@ -208,12 +380,27 @@ export async function scrapSingleUrl(
|
||||
|
||||
const soup = cheerio.load(html);
|
||||
const metadata = extractMetadata(soup, urlToScrap);
|
||||
const document: Document = {
|
||||
|
||||
let document: Document;
|
||||
if (screenshot && screenshot.length > 0) {
|
||||
document = {
|
||||
content: text,
|
||||
markdown: text,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
metadata: {
|
||||
...metadata,
|
||||
screenshot: screenshot,
|
||||
sourceURL: urlToScrap,
|
||||
},
|
||||
};
|
||||
} else {
|
||||
document = {
|
||||
content: text,
|
||||
markdown: text,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
metadata: { ...metadata, sourceURL: urlToScrap },
|
||||
};
|
||||
}
|
||||
|
||||
return document;
|
||||
} catch (error) {
|
||||
|
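A hedged illustration of how the fallback order responds to page options. The environment variable names and the function signature come from the diff; the values set here are placeholders, and PLAYWRIGHT_MICROSERVICE_URL is assumed to be unset.

// With fire-engine configured and playwright not, a screenshot request promotes
// fire-engine to the front and playwright is dropped from the candidates.
process.env.FIRE_ENGINE_BETA_URL = "http://localhost:3000"; // placeholder
process.env.SCRAPING_BEE_API_KEY = "test-key";              // placeholder
const order = getScrapingFallbackOrder("", false, true, false);
// => ["fire-engine", "scrapingBee", "scrapingBeeLoad", "fetch"]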
@ -0,0 +1,13 @@
|
||||
import * as docxProcessor from "../docxProcessor";
|
||||
|
||||
describe("DOCX Processing Module - Integration Test", () => {
|
||||
it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
|
||||
delete process.env.LLAMAPARSE_API_KEY;
|
||||
const docxContent = await docxProcessor.fetchAndProcessDocx(
|
||||
"https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
|
||||
);
|
||||
expect(docxContent.trim()).toContain(
|
||||
"SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
|
||||
);
|
||||
});
|
||||
});
|
@ -0,0 +1,66 @@
|
||||
import { isUrlBlocked } from '../blocklist';
|
||||
|
||||
describe('isUrlBlocked', () => {
|
||||
it('should return true for blocked social media URLs', () => {
|
||||
const blockedUrls = [
|
||||
'https://www.facebook.com',
|
||||
'https://twitter.com/someuser',
|
||||
'https://instagram.com/someuser',
|
||||
'https://www.linkedin.com/in/someuser',
|
||||
'https://pinterest.com/someuser',
|
||||
'https://snapchat.com/someuser',
|
||||
'https://tiktok.com/@someuser',
|
||||
'https://reddit.com/r/somesubreddit',
|
||||
'https://flickr.com/photos/someuser',
|
||||
'https://whatsapp.com/someuser',
|
||||
'https://wechat.com/someuser',
|
||||
'https://telegram.org/someuser',
|
||||
];
|
||||
|
||||
blockedUrls.forEach(url => {
|
||||
if (!isUrlBlocked(url)) {
|
||||
console.log(`URL not blocked: ${url}`);
|
||||
}
|
||||
expect(isUrlBlocked(url)).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
it('should return false for URLs containing allowed keywords', () => {
|
||||
const allowedUrls = [
|
||||
'https://www.facebook.com/privacy',
|
||||
'https://twitter.com/terms',
|
||||
'https://instagram.com/legal',
|
||||
'https://www.linkedin.com/help',
|
||||
'https://pinterest.com/about',
|
||||
'https://snapchat.com/support',
|
||||
'https://tiktok.com/contact',
|
||||
'https://reddit.com/user-agreement',
|
||||
'https://tumblr.com/policy',
|
||||
'https://flickr.com/blog',
|
||||
'https://whatsapp.com/press',
|
||||
'https://wechat.com/careers',
|
||||
'https://telegram.org/conditions',
|
||||
'https://wix.com/careers',
|
||||
];
|
||||
|
||||
allowedUrls.forEach(url => {
|
||||
expect(isUrlBlocked(url)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
it('should return false for non-blocked URLs', () => {
|
||||
const nonBlockedUrls = [
|
||||
'https://www.example.com',
|
||||
'https://www.somewebsite.org',
|
||||
'https://subdomain.example.com',
|
||||
'firecrawl.dev',
|
||||
'amazon.com',
|
||||
'wix.com',
|
||||
'https://wix.com'
|
||||
];
|
||||
|
||||
nonBlockedUrls.forEach(url => {
|
||||
expect(isUrlBlocked(url)).toBe(false);
|
||||
});
|
||||
});
|
||||
});
|
@ -1,5 +1,6 @@
|
||||
const socialMediaBlocklist = [
|
||||
'facebook.com',
|
||||
'x.com',
|
||||
'twitter.com',
|
||||
'instagram.com',
|
||||
'linkedin.com',
|
||||
@ -14,14 +15,40 @@ const socialMediaBlocklist = [
|
||||
'telegram.org',
|
||||
];
|
||||
|
||||
const allowedUrls = [
|
||||
'linkedin.com/pulse'
|
||||
const allowedKeywords = [
|
||||
'pulse',
|
||||
'privacy',
|
||||
'terms',
|
||||
'policy',
|
||||
'user-agreement',
|
||||
'legal',
|
||||
'help',
|
||||
'support',
|
||||
'contact',
|
||||
'about',
|
||||
'careers',
|
||||
'blog',
|
||||
'press',
|
||||
'conditions',
|
||||
];
|
||||
|
||||
export function isUrlBlocked(url: string): boolean {
|
||||
if (allowedUrls.some(allowedUrl => url.includes(allowedUrl))) {
|
||||
// Check if the URL contains any allowed keywords
|
||||
if (allowedKeywords.some(keyword => url.includes(keyword))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return socialMediaBlocklist.some(domain => url.includes(domain));
|
||||
try {
|
||||
// Check if the URL matches any domain in the blocklist
|
||||
return socialMediaBlocklist.some(domain => {
|
||||
// Create a regular expression to match the exact domain
|
||||
const domainPattern = new RegExp(`(^|\\.)${domain.replace('.', '\\.')}$`);
|
||||
// Test the hostname of the URL against the pattern
|
||||
return domainPattern.test(new URL(url).hostname);
|
||||
});
|
||||
} catch (e) {
|
||||
// If an error occurs (e.g., invalid URL), return false
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
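To illustrate the stricter hostname matching introduced above, a few example calls (the URLs are illustrative):

isUrlBlocked("https://www.facebook.com/somepage");    // true  — hostname ends with facebook.com
isUrlBlocked("https://m.facebook.com/profile");       // true  — subdomains match (^|\.)facebook\.com$
isUrlBlocked("https://notfacebook.com.example.org");  // false — lookalike hosts no longer match
isUrlBlocked("https://twitter.com/terms");            // false — the allowed keyword check short-circuits
isUrlBlocked("not a url");                            // false — invalid URLs hit the catch branch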
@ -63,7 +63,7 @@ export const urlSpecificParams = {
|
||||
},
|
||||
},
|
||||
"ycombinator.com":{
|
||||
defaultScraper: "playwright",
|
||||
defaultScraper: "fire-engine",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
@ -121,5 +121,43 @@ export const urlSpecificParams = {
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
},
|
||||
"help.salesforce.com":{
|
||||
defaultScraper: "playwright",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
wait: 2000,
|
||||
},
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
referer: "https://www.google.com/",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"accept-encoding": "gzip, deflate, br",
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
},
|
||||
"firecrawl.dev":{
|
||||
defaultScraper: "fire-engine",
|
||||
params: {
|
||||
|
||||
},
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
referer: "https://www.google.com/",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"accept-encoding": "gzip, deflate, br",
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
}
|
||||
};
|
||||
|
41
apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
Normal file
@ -0,0 +1,41 @@
|
||||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import { createWriteStream } from "node:fs";
|
||||
import path from "path";
|
||||
import os from "os";
|
||||
import mammoth from "mammoth";
|
||||
|
||||
export async function fetchAndProcessDocx(url: string): Promise<string> {
|
||||
const tempFilePath = await downloadDocx(url);
|
||||
const content = await processDocxToText(tempFilePath);
|
||||
fs.unlinkSync(tempFilePath); // Clean up the temporary file
|
||||
return content;
|
||||
}
|
||||
|
||||
async function downloadDocx(url: string): Promise<string> {
|
||||
const response = await axios({
|
||||
url,
|
||||
method: "GET",
|
||||
responseType: "stream",
|
||||
});
|
||||
|
||||
const tempFilePath = path.join(os.tmpdir(), `tempDocx-${Date.now()}.docx`);
|
||||
const writer = createWriteStream(tempFilePath);
|
||||
|
||||
response.data.pipe(writer);
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
writer.on("finish", () => resolve(tempFilePath));
|
||||
writer.on("error", reject);
|
||||
});
|
||||
}
|
||||
|
||||
export async function processDocxToText(filePath: string): Promise<string> {
|
||||
const content = await extractTextFromDocx(filePath);
|
||||
return content;
|
||||
}
|
||||
|
||||
async function extractTextFromDocx(filePath: string): Promise<string> {
|
||||
const result = await mammoth.extractRawText({ path: filePath });
|
||||
return result.value;
|
||||
}
|
@ -34,8 +34,6 @@ export const excludeNonMainTags = [
|
||||
"#nav",
|
||||
".breadcrumbs",
|
||||
"#breadcrumbs",
|
||||
".form",
|
||||
"form",
|
||||
"#search-form",
|
||||
".search",
|
||||
"#search",
|
||||
@ -51,10 +49,6 @@ export const excludeNonMainTags = [
|
||||
"#tag",
|
||||
".category",
|
||||
"#category",
|
||||
".comment",
|
||||
"#comment",
|
||||
".reply",
|
||||
"#reply",
|
||||
".author",
|
||||
"#author",
|
||||
".cookie",
|
||||
"#cookie"
|
||||
];
|
||||
|
@ -19,8 +19,8 @@ export async function fetchAndProcessPdf(url: string): Promise<string> {
async function downloadPdf(url: string): Promise<string> {
  const response = await axios({
    url,
    method: 'GET',
    responseType: 'stream',
    method: "GET",
    responseType: "stream",
  });

  const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`);
@ -29,8 +29,8 @@ async function downloadPdf(url: string): Promise<string> {
  response.data.pipe(writer);

  return new Promise((resolve, reject) => {
    writer.on('finish', () => resolve(tempFilePath));
    writer.on('error', reject);
    writer.on("finish", () => resolve(tempFilePath));
    writer.on("error", reject);
  });
}

@ -77,12 +77,12 @@ export async function processPdfToText(filePath: string): Promise<string> {
      } else {
        // If the status code is not 200, increment the attempt counter and wait
        attempt++;
        await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds
        await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
      }
    } catch (error) {
      console.error("Error fetching result:", error);
      console.error("Error fetching result:", error || '');
      attempt++;
      await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds before retrying
      await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
      // You may want to handle specific errors differently
    }
  }
@ -101,7 +101,7 @@ export async function processPdfToText(filePath: string): Promise<string> {
  return content;
}

async function processPdf(file: string){
async function processPdf(file: string) {
  const fileContent = fs.readFileSync(file);
  const data = await pdf(fileContent);
  return data.text;
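The retry loop above now waits 500 ms between attempts (the old lines' inline comments said "2 seconds" for a 250 ms wait, which the new comments correct). As a rough sketch of the same fixed-delay pattern in isolation, with the helper name and limits invented here:

```typescript
// Illustrative only: a generic fixed-delay polling helper mirroring the loop above.
async function pollWithDelay<T>(
  fetchOnce: () => Promise<T | null>, // resolves to null while the result is not ready yet
  maxAttempts = 10,
  delayMs = 500,
): Promise<T> {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const result = await fetchOnce();
    if (result !== null) return result;
    await new Promise((resolve) => setTimeout(resolve, delayMs)); // wait 0.5 s between attempts
  }
  throw new Error(`No result after ${maxAttempts} attempts`);
}
```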
@ -1,7 +1,7 @@
import { withAuth } from "../../lib/withAuth";
import { supabase_service } from "../supabase";

const FREE_CREDITS = 300;
const FREE_CREDITS = 500;

export async function billTeam(team_id: string, credits: number) {
  return withAuth(supaBillTeam)(team_id, credits);
@ -227,10 +227,11 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {

    if (creditUsages && creditUsages.length > 0) {
      totalCreditsUsed = creditUsages[0].total_credits_used;
      console.log("Total Credits Used:", totalCreditsUsed);
      // console.log("Total Credits Used:", totalCreditsUsed);
    }
  } catch (error) {
    console.error("Error calculating credit usage:", error);

  }
  // Adjust total credits used by subtracting coupon value
  const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);
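To make the adjustment concrete, with invented numbers: a team that has used 520 credits and holds a 100-credit coupon is billed against 420, and the `Math.max` guard keeps the figure from going negative when coupons exceed usage:

```typescript
// Example values only; not taken from the diff.
const totalCreditsUsed = 520;
const couponCredits = 100;
const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits); // 420
console.log(adjustedCreditsUsed);
```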
22
apps/api/src/services/idempotency/create.ts
Normal file
@ -0,0 +1,22 @@
import { Request } from "express";
import { supabase_service } from "../supabase";

export async function createIdempotencyKey(
  req: Request,
): Promise<string> {
  const idempotencyKey = req.headers['x-idempotency-key'] as string;
  if (!idempotencyKey) {
    throw new Error("No idempotency key provided in the request headers.");
  }

  const { data, error } = await supabase_service
    .from("idempotency_keys")
    .insert({ key: idempotencyKey });

  if (error) {
    console.error("Failed to create idempotency key:", error);
    throw error;
  }

  return idempotencyKey;
}
32
apps/api/src/services/idempotency/validate.ts
Normal file
@ -0,0 +1,32 @@
import { Request } from "express";
import { supabase_service } from "../supabase";
import { validate as isUuid } from 'uuid';

export async function validateIdempotencyKey(
  req: Request,
): Promise<boolean> {
  const idempotencyKey = req.headers['x-idempotency-key'];
  if (!idempotencyKey) {
    // not returning for missing idempotency key for now
    return true;
  }
  if (!isUuid(idempotencyKey)) {
    console.error("Invalid idempotency key provided in the request headers.");
    return false;
  }

  const { data, error } = await supabase_service
    .from("idempotency_keys")
    .select("key")
    .eq("key", idempotencyKey);

  if (error) {
    console.error(error);
  }

  if (!data || data.length === 0) {
    return true;
  }

  return false;
}
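Taken together, the two new helpers suggest a request flow along these lines; the route path, status code handling, and wiring below are a sketch under assumptions, not code from this diff:

```typescript
import express, { Request, Response } from "express";
// Import paths assume the new files added above; adjust to the real module layout.
import { validateIdempotencyKey } from "./services/idempotency/validate";
import { createIdempotencyKey } from "./services/idempotency/create";

const app = express();

app.post("/v0/crawl", async (req: Request, res: Response) => {
  // Reject keys that are malformed or have already been used.
  const isValid = await validateIdempotencyKey(req);
  if (!isValid) {
    return res.status(409).json({ error: "Idempotency key already used" });
  }
  // Persist the key (when one was sent) before starting the job.
  if (req.headers["x-idempotency-key"]) {
    await createIdempotencyKey(req);
  }
  // ... start the crawl job here ...
  res.json({ jobId: "example-job-id" });
});
```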
@ -5,6 +5,11 @@ import { logtail } from "./logtail";
import { startWebScraperPipeline } from "../main/runWebScraper";
import { callWebhook } from "./webhook";
import { logJob } from "./logging/log_job";
import { initSDK } from '@hyperdx/node-opentelemetry';

if(process.env.ENV === 'production') {
  initSDK({ consoleCapture: true, additionalInstrumentations: []});
}

getWebScraperQueue().process(
  Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
@ -26,7 +31,7 @@ getWebScraperQueue().process(
      success: success,
      result: {
        links: docs.map((doc) => {
          return { content: doc, source: doc.metadata.sourceURL };
          return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" };
        }),
      },
      project_id: job.data.project_id,
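The changed mapping falls back from `doc?.metadata?.sourceURL` to `doc?.url` and finally to an empty string, so a document with partial metadata no longer throws. With invented documents:

```typescript
// Invented example documents showing the fallback order used in the worker above.
const docs: Array<{ metadata?: { sourceURL?: string }; url?: string }> = [
  { metadata: { sourceURL: "https://a.example" } }, // -> "https://a.example"
  { url: "https://b.example" },                     // -> "https://b.example"
  {},                                               // -> ""
];
const links = docs.map((doc) => ({
  content: doc,
  source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
}));
console.log(links.map((link) => link.source));
```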
@ -2,91 +2,68 @@ import { RateLimiterRedis } from "rate-limiter-flexible";
import * as redis from "redis";
import { RateLimiterMode } from "../../src/types";

const MAX_REQUESTS_PER_MINUTE_PREVIEW = 5;
const MAX_CRAWLS_PER_MINUTE_STARTER = 2;
const MAX_CRAWLS_PER_MINUTE_STANDARD = 4;
const MAX_CRAWLS_PER_MINUTE_SCALE = 20;

const MAX_REQUESTS_PER_MINUTE_ACCOUNT = 20;

const MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS = 120;

const RATE_LIMITS = {
  crawl: {
    free: 1,
    starter: 3,
    standard: 5,
    scale: 20,
    hobby: 3,
    standardNew: 10,
    growth: 50,
  },
  scrape: {
    free: 5,
    starter: 20,
    standardOld: 40,
    scale: 50,
    hobby: 10,
    standardNew: 50,
    growth: 500,
  },
  search: {
    free: 5,
    starter: 20,
    standard: 40,
    scale: 50,
    hobby: 10,
    standardNew: 50,
    growth: 500,
  },
  preview: 5,
  account: 20,
  crawlStatus: 150,
  testSuite: 10000,
};

export const redisClient = redis.createClient({
  url: process.env.REDIS_URL,
  legacyMode: true,
});

export const previewRateLimiter = new RateLimiterRedis({
const createRateLimiter = (keyPrefix, points) => new RateLimiterRedis({
  storeClient: redisClient,
  keyPrefix: "middleware",
  points: MAX_REQUESTS_PER_MINUTE_PREVIEW,
  keyPrefix,
  points,
  duration: 60, // Duration in seconds
});

export const serverRateLimiter = new RateLimiterRedis({
  storeClient: redisClient,
  keyPrefix: "middleware",
  points: MAX_REQUESTS_PER_MINUTE_ACCOUNT,
  duration: 60, // Duration in seconds
});
export const previewRateLimiter = createRateLimiter("preview", RATE_LIMITS.preview);
export const serverRateLimiter = createRateLimiter("server", RATE_LIMITS.account);
export const crawlStatusRateLimiter = createRateLimiter("crawl-status", RATE_LIMITS.crawlStatus);
export const testSuiteRateLimiter = createRateLimiter("test-suite", RATE_LIMITS.testSuite);

export const crawlStatusRateLimiter = new RateLimiterRedis({
  storeClient: redisClient,
  keyPrefix: "middleware",
  points: MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS,
  duration: 60, // Duration in seconds
});

export const testSuiteRateLimiter = new RateLimiterRedis({
  storeClient: redisClient,
  keyPrefix: "middleware",
  points: 1000,
  duration: 60, // Duration in seconds
});

export function crawlRateLimit(plan: string){
  if(plan === "standard"){
    return new RateLimiterRedis({
      storeClient: redisClient,
      keyPrefix: "middleware",
      points: MAX_CRAWLS_PER_MINUTE_STANDARD,
      duration: 60, // Duration in seconds
    });
  }else if(plan === "scale"){
    return new RateLimiterRedis({
      storeClient: redisClient,
      keyPrefix: "middleware",
      points: MAX_CRAWLS_PER_MINUTE_SCALE,
      duration: 60, // Duration in seconds
    });
  }
  return new RateLimiterRedis({
    storeClient: redisClient,
    keyPrefix: "middleware",
    points: MAX_CRAWLS_PER_MINUTE_STARTER,
    duration: 60, // Duration in seconds
  });
}

export function getRateLimiter(mode: RateLimiterMode, token: string){
  // Special test suite case. TODO: Change this later.
  if(token.includes("5089cefa58")){
export function getRateLimiter(mode: RateLimiterMode, token: string, plan?: string) {
  if (token.includes("5089cefa58") || token.includes("6254cf9")) {
    return testSuiteRateLimiter;
  }
  switch(mode) {
    case RateLimiterMode.Preview:
      return previewRateLimiter;
    case RateLimiterMode.CrawlStatus:
      return crawlStatusRateLimiter;
    default:
      return serverRateLimiter;
  }

  const rateLimitConfig = RATE_LIMITS[mode];
  if (!rateLimitConfig) return serverRateLimiter;

  const planKey = plan ? plan.replace("-", "") : "starter";
  const points = rateLimitConfig[planKey] || rateLimitConfig.preview;

  return createRateLimiter(`${mode}-${planKey}`, points);
}
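A sketch of how the refactored `getRateLimiter` might be consumed in Express middleware; the middleware shape, the 429 payload, and the way `plan` reaches the request are assumptions for illustration:

```typescript
import { Request, Response, NextFunction } from "express";
// Import paths are assumptions matching the files touched in this diff.
import { getRateLimiter } from "./services/rate-limiter";
import { RateLimiterMode } from "./types";

export function rateLimit(mode: RateLimiterMode) {
  return async (req: Request, res: Response, next: NextFunction) => {
    const token = (req.headers.authorization ?? "").replace("Bearer ", "");
    const plan = (req as any).plan as string | undefined; // assumed to be set by auth earlier
    try {
      // rate-limiter-flexible: consume one point per request, keyed by token.
      await getRateLimiter(mode, token, plan).consume(token);
      next();
    } catch {
      res.status(429).json({ error: "Rate limit exceeded" });
    }
  };
}
```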
@ -57,6 +57,7 @@ export interface AuthResponse {
  team_id?: string;
  error?: string;
  status?: number;
  plan?: string;
}
@ -1,3 +1,4 @@
import { v4 as uuidv4 } from 'uuid';
import FirecrawlApp from '@mendable/firecrawl-js';
import { z } from "zod";

@ -8,7 +9,8 @@ const scrapeResult = await app.scrapeUrl('firecrawl.dev');
console.log(scrapeResult.data.content)

// Crawl a website:
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
const idempotencyKey = uuidv4(); // optional
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey);
console.log(crawlResult)

const jobId = await crawlResult['jobId'];
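Because the example starts the crawl with `waitUntilDone` set to `false`, the returned job id has to be polled; a small follow-up sketch (the interval and attempt cap are arbitrary choices for illustration):

```typescript
// Poll the crawl started above until it reports completion; limits are illustrative.
async function waitForCrawl(jobId: string, intervalMs = 5000, maxChecks = 60) {
  for (let i = 0; i < maxChecks; i++) {
    const status = await app.checkCrawlStatus(jobId);
    if (status.status === "completed") return status.data;
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
  throw new Error("Timed out waiting for crawl to complete");
}

const docs = await waitForCrawl(jobId);
console.log(docs.length);
```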
3
apps/js-sdk/firecrawl/.env.example
Normal file
@ -0,0 +1,3 @@
API_URL=http://localhost:3002
TEST_API_KEY=fc-YOUR_API_KEY
@ -110,11 +110,12 @@ export default class FirecrawlApp {
     * @param {Params | null} params - Additional parameters for the crawl request.
     * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
     * @param {number} timeout - Timeout in seconds for job status checks.
     * @param {string} idempotencyKey - Optional idempotency key for the request.
     * @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
     */
    crawlUrl(url_1) {
        return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, timeout = 2) {
            const headers = this.prepareHeaders();
        return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, timeout = 2, idempotencyKey) {
            const headers = this.prepareHeaders(idempotencyKey);
            let jsonData = { url };
            if (params) {
                jsonData = Object.assign(Object.assign({}, jsonData), params);
@ -172,11 +173,8 @@ export default class FirecrawlApp {
     * Prepares the headers for an API request.
     * @returns {AxiosRequestHeaders} The prepared headers.
     */
    prepareHeaders() {
        return {
            "Content-Type": "application/json",
            Authorization: `Bearer ${this.apiKey}`,
        };
    prepareHeaders(idempotencyKey) {
        return Object.assign({ 'Content-Type': 'application/json', 'Authorization': `Bearer ${this.apiKey}` }, (idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {}));
    }
    /**
     * Sends a POST request to the specified URL.
66
apps/js-sdk/firecrawl/package-lock.json
generated
@ -1,22 +1,27 @@
|
||||
{
|
||||
"name": "@mendable/firecrawl-js",
|
||||
"version": "0.0.17-beta.8",
|
||||
"version": "0.0.22",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@mendable/firecrawl-js",
|
||||
"version": "0.0.17-beta.8",
|
||||
"version": "0.0.22",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"axios": "^1.6.8",
|
||||
"dotenv": "^16.4.5",
|
||||
"uuid": "^9.0.1",
|
||||
"zod": "^3.23.8",
|
||||
"zod-to-json-schema": "^3.23.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@jest/globals": "^29.7.0",
|
||||
"@types/axios": "^0.14.0",
|
||||
"@types/node": "^20.12.7",
|
||||
"@types/dotenv": "^8.2.0",
|
||||
"@types/jest": "^29.5.12",
|
||||
"@types/node": "^20.12.12",
|
||||
"@types/uuid": "^9.0.8",
|
||||
"jest": "^29.7.0",
|
||||
"ts-jest": "^29.1.2",
|
||||
"typescript": "^5.4.5"
|
||||
@ -1013,6 +1018,16 @@
|
||||
"@babel/types": "^7.20.7"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/dotenv": {
|
||||
"version": "8.2.0",
|
||||
"resolved": "https://registry.npmjs.org/@types/dotenv/-/dotenv-8.2.0.tgz",
|
||||
"integrity": "sha512-ylSC9GhfRH7m1EUXBXofhgx4lUWmFeQDINW5oLuS+gxWdfUeW4zJdeVTYVkexEW+e2VUvlZR2kGnGGipAWR7kw==",
|
||||
"deprecated": "This is a stub types definition. dotenv provides its own type definitions, so you do not need this installed.",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"dotenv": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/graceful-fs": {
|
||||
"version": "4.1.9",
|
||||
"resolved": "https://registry.npmjs.org/@types/graceful-fs/-/graceful-fs-4.1.9.tgz",
|
||||
@ -1046,10 +1061,20 @@
|
||||
"@types/istanbul-lib-report": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/jest": {
|
||||
"version": "29.5.12",
|
||||
"resolved": "https://registry.npmjs.org/@types/jest/-/jest-29.5.12.tgz",
|
||||
"integrity": "sha512-eDC8bTvT/QhYdxJAulQikueigY5AsdBRH2yDKW3yveW7svY3+DzN84/2NUgkw10RTiJbWqZrTtoGVdYlvFJdLw==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"expect": "^29.0.0",
|
||||
"pretty-format": "^29.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/node": {
|
||||
"version": "20.12.7",
|
||||
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.7.tgz",
|
||||
"integrity": "sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg==",
|
||||
"version": "20.12.12",
|
||||
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.12.tgz",
|
||||
"integrity": "sha512-eWLDGF/FOSPtAvEqeRAQ4C8LSA7M1I7i0ky1I8U7kD1J5ITyW3AsRhQrKVoWf5pFKZ2kILsEGJhsI9r93PYnOw==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"undici-types": "~5.26.4"
|
||||
@ -1061,6 +1086,12 @@
|
||||
"integrity": "sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/@types/uuid": {
|
||||
"version": "9.0.8",
|
||||
"resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.8.tgz",
|
||||
"integrity": "sha512-jg+97EGIcY9AGHJJRaaPVgetKDsrTgbRjQ5Msgjh/DQKEFl0DtyRr/VCOyD1T2R1MNeWPK/u7JoGhlDZnKBAfA==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/@types/yargs": {
|
||||
"version": "17.0.32",
|
||||
"resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-17.0.32.tgz",
|
||||
@ -1602,6 +1633,17 @@
|
||||
"node": "^14.15.0 || ^16.10.0 || >=18.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/dotenv": {
|
||||
"version": "16.4.5",
|
||||
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz",
|
||||
"integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==",
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://dotenvx.com"
|
||||
}
|
||||
},
|
||||
"node_modules/electron-to-chromium": {
|
||||
"version": "1.4.748",
|
||||
"resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.748.tgz",
|
||||
@ -3641,6 +3683,18 @@
|
||||
"browserslist": ">= 4.21.0"
|
||||
}
|
||||
},
|
||||
"node_modules/uuid": {
|
||||
"version": "9.0.1",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
|
||||
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
|
||||
"funding": [
|
||||
"https://github.com/sponsors/broofa",
|
||||
"https://github.com/sponsors/ctavan"
|
||||
],
|
||||
"bin": {
|
||||
"uuid": "dist/bin/uuid"
|
||||
}
|
||||
},
|
||||
"node_modules/v8-to-istanbul": {
|
||||
"version": "9.2.0",
|
||||
"resolved": "https://registry.npmjs.org/v8-to-istanbul/-/v8-to-istanbul-9.2.0.tgz",
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@mendable/firecrawl-js",
|
||||
"version": "0.0.21",
|
||||
"version": "0.0.22",
|
||||
"description": "JavaScript SDK for Firecrawl API",
|
||||
"main": "build/index.js",
|
||||
"types": "types/index.d.ts",
|
||||
@ -9,7 +9,7 @@
|
||||
"build": "tsc",
|
||||
"publish": "npm run build && npm publish --access public",
|
||||
"publish-beta": "npm run build && npm publish --access public --tag beta",
|
||||
"test": "jest src/**/*.test.ts"
|
||||
"test": "jest src/__tests__/**/*.test.ts"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
@ -19,6 +19,8 @@
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"axios": "^1.6.8",
|
||||
"dotenv": "^16.4.5",
|
||||
"uuid": "^9.0.1",
|
||||
"zod": "^3.23.8",
|
||||
"zod-to-json-schema": "^3.23.0"
|
||||
},
|
||||
@ -29,7 +31,10 @@
|
||||
"devDependencies": {
|
||||
"@jest/globals": "^29.7.0",
|
||||
"@types/axios": "^0.14.0",
|
||||
"@types/node": "^20.12.7",
|
||||
"@types/dotenv": "^8.2.0",
|
||||
"@types/jest": "^29.5.12",
|
||||
"@types/node": "^20.12.12",
|
||||
"@types/uuid": "^9.0.8",
|
||||
"jest": "^29.7.0",
|
||||
"ts-jest": "^29.1.2",
|
||||
"typescript": "^5.4.5"
|
||||
|
146
apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts
Normal file
@ -0,0 +1,146 @@
|
||||
import FirecrawlApp from '../../index';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import dotenv from 'dotenv';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
const TEST_API_KEY = process.env.TEST_API_KEY;
|
||||
const API_URL = process.env.API_URL;
|
||||
|
||||
describe('FirecrawlApp E2E Tests', () => {
|
||||
test('should throw error for no API key', () => {
|
||||
expect(() => {
|
||||
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
|
||||
}).toThrow("No API key provided");
|
||||
});
|
||||
|
||||
test('should throw error for invalid API key on scrape', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
await expect(invalidApp.scrapeUrl('https://firecrawl.dev')).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test('should throw error for blocklisted URL on scrape', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const blocklistedUrl = "https://facebook.com/fake-test";
|
||||
await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
|
||||
});
|
||||
|
||||
test('should return successful response with valid preview token', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://firecrawl.dev');
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data.content).toContain("🔥 Firecrawl");
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test('should return successful response for valid scrape', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://firecrawl.dev');
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data.content).toContain("🔥 Firecrawl");
|
||||
expect(response.data).toHaveProperty('markdown');
|
||||
expect(response.data).toHaveProperty('metadata');
|
||||
expect(response.data).not.toHaveProperty('html');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test('should return successful response with valid API key and include HTML', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://firecrawl.dev', { pageOptions: { includeHtml: true } });
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data.content).toContain("🔥 Firecrawl");
|
||||
expect(response.data.markdown).toContain("🔥 Firecrawl");
|
||||
expect(response.data.html).toContain("<h1");
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test('should return successful response for valid scrape with PDF file', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf');
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test('should return successful response for valid scrape with PDF file without explicit extension', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001');
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test('should throw error for invalid API key on crawl', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
await expect(invalidApp.crawlUrl('https://firecrawl.dev')).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test('should throw error for blocklisted URL on crawl', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const blocklistedUrl = "https://twitter.com/fake-test";
|
||||
await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
|
||||
});
|
||||
|
||||
test('should return successful response for crawl and wait for completion', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.crawlUrl('https://firecrawl.dev', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30);
|
||||
expect(response).not.toBeNull();
|
||||
expect(response[0].content).toContain("🔥 Firecrawl");
|
||||
}, 60000); // 60 seconds timeout
|
||||
|
||||
test('should handle idempotency key for crawl', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const uniqueIdempotencyKey = uuidv4();
|
||||
const response = await app.crawlUrl('https://firecrawl.dev', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey);
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.jobId).toBeDefined();
|
||||
|
||||
await expect(app.crawlUrl('https://firecrawl.dev', { crawlerOptions: { excludes: ['blog/*'] } }, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
|
||||
});
|
||||
|
||||
test('should check crawl status', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.crawlUrl('https://firecrawl.dev', { crawlerOptions: { excludes: ['blog/*'] } }, false);
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.jobId).toBeDefined();
|
||||
|
||||
await new Promise(resolve => setTimeout(resolve, 30000)); // wait for 30 seconds
|
||||
const statusResponse = await app.checkCrawlStatus(response.jobId);
|
||||
expect(statusResponse).not.toBeNull();
|
||||
expect(statusResponse.status).toBe('completed');
|
||||
expect(statusResponse.data.length).toBeGreaterThan(0);
|
||||
}, 35000); // 35 seconds timeout
|
||||
|
||||
test('should return successful response for search', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.search("test query");
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data[0].content).toBeDefined();
|
||||
expect(response.data.length).toBeGreaterThan(2);
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test('should throw error for invalid API key on search', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
await expect(invalidApp.search("test query")).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test('should perform LLM extraction', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl("https://mendable.ai", {
|
||||
extractorOptions: {
|
||||
mode: 'llm-extraction',
|
||||
extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
|
||||
extractionSchema: {
|
||||
type: 'object',
|
||||
properties: {
|
||||
company_mission: { type: 'string' },
|
||||
supports_sso: { type: 'boolean' },
|
||||
is_open_source: { type: 'boolean' }
|
||||
},
|
||||
required: ['company_mission', 'supports_sso', 'is_open_source']
|
||||
}
|
||||
}
|
||||
});
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data.llm_extraction).toBeDefined();
|
||||
const llmExtraction = response.data.llm_extraction;
|
||||
expect(llmExtraction.company_mission).toBeDefined();
|
||||
expect(typeof llmExtraction.supports_sso).toBe('boolean');
|
||||
expect(typeof llmExtraction.is_open_source).toBe('boolean');
|
||||
}, 30000); // 30 seconds timeout
|
||||
});
|
@ -6,6 +6,7 @@ import { zodToJsonSchema } from "zod-to-json-schema";
 */
export interface FirecrawlAppConfig {
  apiKey?: string | null;
  apiUrl?: string | null;
}

/**
@ -63,6 +64,7 @@ export interface JobStatusResponse {
 */
export default class FirecrawlApp {
  private apiKey: string;
  private apiUrl: string = "https://api.firecrawl.dev";

  /**
   * Initializes a new instance of the FirecrawlApp class.
@ -107,7 +109,7 @@ export default class FirecrawlApp {
    }
    try {
      const response: AxiosResponse = await axios.post(
        "https://api.firecrawl.dev/v0/scrape",
        this.apiUrl + "/v0/scrape",
        jsonData,
        { headers },
      );
@ -147,7 +149,7 @@ export default class FirecrawlApp {
    }
    try {
      const response: AxiosResponse = await axios.post(
        "https://api.firecrawl.dev/v0/search",
        this.apiUrl + "/v0/search",
        jsonData,
        { headers }
      );
@ -173,22 +175,24 @@ export default class FirecrawlApp {
   * @param {Params | null} params - Additional parameters for the crawl request.
   * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
   * @param {number} timeout - Timeout in seconds for job status checks.
   * @param {string} idempotencyKey - Optional idempotency key for the request.
   * @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
   */
  async crawlUrl(
    url: string,
    params: Params | null = null,
    waitUntilDone: boolean = true,
    timeout: number = 2
    timeout: number = 2,
    idempotencyKey?: string
  ): Promise<CrawlResponse | any> {
    const headers = this.prepareHeaders();
    const headers = this.prepareHeaders(idempotencyKey);
    let jsonData: Params = { url };
    if (params) {
      jsonData = { ...jsonData, ...params };
    }
    try {
      const response: AxiosResponse = await this.postRequest(
        "https://api.firecrawl.dev/v0/crawl",
        this.apiUrl + "/v0/crawl",
        jsonData,
        headers
      );
@ -218,7 +222,7 @@ export default class FirecrawlApp {
    const headers: AxiosRequestHeaders = this.prepareHeaders();
    try {
      const response: AxiosResponse = await this.getRequest(
        `https://api.firecrawl.dev/v0/crawl/status/${jobId}`,
        this.apiUrl + `/v0/crawl/status/${jobId}`,
        headers
      );
      if (response.status === 200) {
@ -240,11 +244,12 @@ export default class FirecrawlApp {
   * Prepares the headers for an API request.
   * @returns {AxiosRequestHeaders} The prepared headers.
   */
  prepareHeaders(): AxiosRequestHeaders {
  prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
    return {
      "Content-Type": "application/json",
      Authorization: `Bearer ${this.apiKey}`,
    } as AxiosRequestHeaders;
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${this.apiKey}`,
      ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {}),
    } as AxiosRequestHeaders & { 'x-idempotency-key'?: string };
  }

  /**
@ -289,7 +294,7 @@ export default class FirecrawlApp {
  ): Promise<any> {
    while (true) {
      const statusResponse: AxiosResponse = await this.getRequest(
        `https://api.firecrawl.dev/v0/crawl/status/${jobId}`,
        this.apiUrl + `/v0/crawl/status/${jobId}`,
        headers
      );
      if (statusResponse.status === 200) {
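With the hard-coded endpoints replaced by `this.apiUrl`, the client can be pointed at a self-hosted API. A short usage sketch (the local URL and environment variable names mirror the new `.env.example` but are otherwise illustrative):

```typescript
import FirecrawlApp from '@mendable/firecrawl-js';

// apiUrl is optional; when omitted the SDK falls back to https://api.firecrawl.dev.
const app = new FirecrawlApp({
  apiKey: process.env.TEST_API_KEY ?? null,
  apiUrl: process.env.API_URL ?? 'http://localhost:3002',
});

const result = await app.scrapeUrl('https://firecrawl.dev');
console.log(result.data.content);
```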
5
apps/js-sdk/firecrawl/types/index.d.ts
vendored
@ -82,9 +82,10 @@ export default class FirecrawlApp {
 * @param {Params | null} params - Additional parameters for the crawl request.
 * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
 * @param {number} timeout - Timeout in seconds for job status checks.
 * @param {string} idempotencyKey - Optional idempotency key for the request.
 * @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
 */
crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, timeout?: number): Promise<CrawlResponse | any>;
crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, timeout?: number, idempotencyKey?: string): Promise<CrawlResponse | any>;
/**
 * Checks the status of a crawl job using the Firecrawl API.
 * @param {string} jobId - The job ID of the crawl operation.
@ -95,7 +96,7 @@ export default class FirecrawlApp {
 * Prepares the headers for an API request.
 * @returns {AxiosRequestHeaders} The prepared headers.
 */
prepareHeaders(): AxiosRequestHeaders;
prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders;
/**
 * Sends a POST request to the specified URL.
 * @param {string} url - The URL to send the request to.
25
apps/js-sdk/package-lock.json
generated
@ -11,8 +11,10 @@
|
||||
"dependencies": {
|
||||
"@mendable/firecrawl-js": "^0.0.19",
|
||||
"axios": "^1.6.8",
|
||||
"dotenv": "^16.4.5",
|
||||
"ts-node": "^10.9.2",
|
||||
"typescript": "^5.4.5",
|
||||
"uuid": "^9.0.1",
|
||||
"zod": "^3.23.8"
|
||||
},
|
||||
"devDependencies": {
|
||||
@ -530,6 +532,17 @@
|
||||
"node": ">=0.3.1"
|
||||
}
|
||||
},
|
||||
"node_modules/dotenv": {
|
||||
"version": "16.4.5",
|
||||
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz",
|
||||
"integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==",
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://dotenvx.com"
|
||||
}
|
||||
},
|
||||
"node_modules/esbuild": {
|
||||
"version": "0.20.2",
|
||||
"resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.20.2.tgz",
|
||||
@ -743,6 +756,18 @@
|
||||
"integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==",
|
||||
"peer": true
|
||||
},
|
||||
"node_modules/uuid": {
|
||||
"version": "9.0.1",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
|
||||
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
|
||||
"funding": [
|
||||
"https://github.com/sponsors/broofa",
|
||||
"https://github.com/sponsors/ctavan"
|
||||
],
|
||||
"bin": {
|
||||
"uuid": "dist/bin/uuid"
|
||||
}
|
||||
},
|
||||
"node_modules/v8-compile-cache-lib": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",
|
||||
|
@ -2,12 +2,21 @@ from fastapi import FastAPI
from playwright.async_api import async_playwright, Browser
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from os import environ

PROXY_SERVER = environ.get("PROXY_SERVER", None)
PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
PROXY_PASSWORD = environ.get("PROXY_PASSWORD", None)
BLOCK_MEDIA = environ.get("BLOCK_MEDIA", "False").upper() == "TRUE"

app = FastAPI()


class UrlModel(BaseModel):
    url: str
    wait: int = None
    wait_until: str = "load"
    headers: dict = None


browser: Browser = None
@ -27,11 +36,39 @@ async def shutdown_event():

@app.post("/html")
async def root(body: UrlModel):
    context = None
    if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
        context = await browser.new_context(
            proxy={
                "server": PROXY_SERVER,
                "username": PROXY_USERNAME,
                "password": PROXY_PASSWORD,
            }
        )
    else:
        context = await browser.new_context()

    if BLOCK_MEDIA:
        await context.route(
            "**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}",
            handler=lambda route, request: route.abort(),
        )

    page = await context.new_page()
    await page.goto(body.url, timeout=15000)  # Set max timeout to 15s

    # Set headers if provided
    if body.headers:
        await page.set_extra_http_headers(body.headers)

    await page.goto(
        body.url,
        timeout=15000,
        wait_until=body.wait_until if body.wait_until else "load",
    )  # Set max timeout to 15s
    if body.wait:  # Check if wait parameter is provided in the request body
        await page.wait_for_timeout(body.wait)  # Convert seconds to milliseconds for playwright
        await page.wait_for_timeout(
            body.wait
        )  # Convert seconds to milliseconds for playwright
    page_content = await page.content()
    await context.close()
    json_compatible_item_data = {"content": page_content}
2
apps/python-sdk/.pylintrc
Normal file
@ -0,0 +1,2 @@
[FORMAT]
max-line-length = 120
@ -117,6 +117,25 @@ status = app.check_crawl_status(job_id)

The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.

## Running the Tests with Pytest

To ensure the functionality of the Firecrawl Python SDK, we have included end-to-end tests using `pytest`. These tests cover various aspects of the SDK, including URL scraping, web searching, and website crawling.

### Running the Tests

To run the tests, execute the following commands:

Install pytest:
```bash
pip install pytest
```

Run:
```bash
pytest firecrawl/__tests__/e2e_withAuth/test.py
```

## Contributing

Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
@ -1,17 +1,50 @@
|
||||
"""
|
||||
FirecrawlApp Module
|
||||
|
||||
This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
|
||||
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
|
||||
and check the status of these jobs. The module uses requests for HTTP communication
|
||||
and handles retries for certain HTTP status codes.
|
||||
|
||||
Classes:
|
||||
- FirecrawlApp: Main class for interacting with the Firecrawl API.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Any, Dict, Optional
|
||||
import requests
|
||||
import time
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class FirecrawlApp:
|
||||
def __init__(self, api_key=None):
|
||||
"""
|
||||
Initialize the FirecrawlApp instance.
|
||||
|
||||
Args:
|
||||
api_key (Optional[str]): API key for authenticating with the Firecrawl API.
|
||||
api_url (Optional[str]): Base URL for the Firecrawl API.
|
||||
"""
|
||||
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
|
||||
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
|
||||
if self.api_key is None:
|
||||
raise ValueError('No API key provided')
|
||||
|
||||
|
||||
|
||||
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
|
||||
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
|
||||
"""
|
||||
Scrape the specified URL using the Firecrawl API.
|
||||
|
||||
Args:
|
||||
url (str): The URL to scrape.
|
||||
params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
|
||||
|
||||
Returns:
|
||||
Any: The scraped data if the request is successful.
|
||||
|
||||
Raises:
|
||||
Exception: If the scrape request fails.
|
||||
"""
|
||||
|
||||
headers = {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {self.api_key}'
|
||||
@ -38,23 +71,36 @@ class FirecrawlApp:
|
||||
scrape_params[key] = value
|
||||
# Make the POST request with the prepared headers and JSON data
|
||||
response = requests.post(
|
||||
'https://api.firecrawl.dev/v0/scrape',
|
||||
f'{self.api_url}/v0/scrape',
|
||||
headers=headers,
|
||||
json=scrape_params
|
||||
json=scrape_params,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
response = response.json()
|
||||
if response['success']:
|
||||
if response['success'] and 'data' in response:
|
||||
return response['data']
|
||||
else:
|
||||
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
|
||||
elif response.status_code in [402, 409, 500]:
|
||||
elif response.status_code in [402, 408, 409, 500]:
|
||||
error_message = response.json().get('error', 'Unknown error occurred')
|
||||
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
|
||||
else:
|
||||
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
|
||||
|
||||
def search(self, query, params=None):
|
||||
"""
|
||||
Perform a search using the Firecrawl API.
|
||||
|
||||
Args:
|
||||
query (str): The search query.
|
||||
params (Optional[Dict[str, Any]]): Additional parameters for the search request.
|
||||
|
||||
Returns:
|
||||
Any: The search results if the request is successful.
|
||||
|
||||
Raises:
|
||||
Exception: If the search request fails.
|
||||
"""
|
||||
headers = {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {self.api_key}'
|
||||
@ -63,13 +109,14 @@ class FirecrawlApp:
|
||||
if params:
|
||||
json_data.update(params)
|
||||
response = requests.post(
|
||||
'https://api.firecrawl.dev/v0/search',
|
||||
f'{self.api_url}/v0/search',
|
||||
headers=headers,
|
||||
json=json_data
|
||||
)
|
||||
if response.status_code == 200:
|
||||
response = response.json()
|
||||
if response['success'] == True:
|
||||
|
||||
if response['success'] and 'data' in response:
|
||||
return response['data']
|
||||
else:
|
||||
raise Exception(f'Failed to search. Error: {response["error"]}')
|
||||
@ -80,12 +127,28 @@ class FirecrawlApp:
|
||||
else:
|
||||
raise Exception(f'Failed to search. Status code: {response.status_code}')
|
||||
|
||||
def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
|
||||
headers = self._prepare_headers()
|
||||
def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None):
|
||||
"""
|
||||
Initiate a crawl job for the specified URL using the Firecrawl API.
|
||||
|
||||
Args:
|
||||
url (str): The URL to crawl.
|
||||
params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
|
||||
wait_until_done (bool): Whether to wait until the crawl job is completed.
|
||||
timeout (int): Timeout between status checks when waiting for job completion.
|
||||
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
|
||||
|
||||
Returns:
|
||||
Any: The crawl job ID or the crawl results if waiting until completion.
|
||||
|
||||
Raises:
|
||||
Exception: If the crawl job initiation or monitoring fails.
|
||||
"""
|
||||
headers = self._prepare_headers(idempotency_key)
|
||||
json_data = {'url': url}
|
||||
if params:
|
||||
json_data.update(params)
|
||||
response = self._post_request('https://api.firecrawl.dev/v0/crawl', json_data, headers)
|
||||
response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
|
||||
if response.status_code == 200:
|
||||
job_id = response.json().get('jobId')
|
||||
if wait_until_done:
|
||||
@ -96,20 +159,64 @@ class FirecrawlApp:
|
||||
self._handle_error(response, 'start crawl job')
|
||||
|
||||
def check_crawl_status(self, job_id):
|
||||
"""
|
||||
Check the status of a crawl job using the Firecrawl API.
|
||||
|
||||
Args:
|
||||
job_id (str): The ID of the crawl job.
|
||||
|
||||
Returns:
|
||||
Any: The status of the crawl job.
|
||||
|
||||
Raises:
|
||||
Exception: If the status check request fails.
|
||||
"""
|
||||
headers = self._prepare_headers()
|
||||
response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers)
|
||||
response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
|
||||
if response.status_code == 200:
|
||||
return response.json()
|
||||
else:
|
||||
self._handle_error(response, 'check crawl status')
|
||||
|
||||
def _prepare_headers(self):
|
||||
def _prepare_headers(self, idempotency_key=None):
|
||||
"""
|
||||
Prepare the headers for API requests.
|
||||
|
||||
Args:
|
||||
idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
|
||||
|
||||
Returns:
|
||||
Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
|
||||
"""
|
||||
if idempotency_key:
|
||||
return {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {self.api_key}'
|
||||
'Authorization': f'Bearer {self.api_key}',
|
||||
'x-idempotency-key': idempotency_key
|
||||
}
|
||||
|
||||
return {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {self.api_key}',
|
||||
}
|
||||
|
||||
def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
|
||||
"""
|
||||
Make a POST request with retries.
|
||||
|
||||
Args:
|
||||
url (str): The URL to send the POST request to.
|
||||
data (Dict[str, Any]): The JSON data to include in the POST request.
|
||||
headers (Dict[str, str]): The headers to include in the POST request.
|
||||
retries (int): Number of retries for the request.
|
||||
backoff_factor (float): Backoff factor for retries.
|
||||
|
||||
Returns:
|
||||
requests.Response: The response from the POST request.
|
||||
|
||||
Raises:
|
||||
requests.RequestException: If the request fails after the specified retries.
|
||||
"""
|
||||
for attempt in range(retries):
|
||||
response = requests.post(url, headers=headers, json=data)
|
||||
if response.status_code == 502:
|
||||
@ -119,6 +226,21 @@ class FirecrawlApp:
|
||||
return response
|
||||
|
||||
def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
|
||||
"""
|
||||
Make a GET request with retries.
|
||||
|
||||
Args:
|
||||
url (str): The URL to send the GET request to.
|
||||
headers (Dict[str, str]): The headers to include in the GET request.
|
||||
retries (int): Number of retries for the request.
|
||||
backoff_factor (float): Backoff factor for retries.
|
||||
|
||||
Returns:
|
||||
requests.Response: The response from the GET request.
|
||||
|
||||
Raises:
|
||||
requests.RequestException: If the request fails after the specified retries.
|
||||
"""
|
||||
for attempt in range(retries):
|
||||
response = requests.get(url, headers=headers)
|
||||
if response.status_code == 502:
|
||||
@ -128,9 +250,22 @@ class FirecrawlApp:
|
||||
return response
|
||||
|
||||
def _monitor_job_status(self, job_id, headers, timeout):
|
||||
import time
|
||||
"""
|
||||
Monitor the status of a crawl job until completion.
|
||||
|
||||
Args:
|
||||
job_id (str): The ID of the crawl job.
|
||||
headers (Dict[str, str]): The headers to include in the status check requests.
|
||||
timeout (int): Timeout between status checks.
|
||||
|
||||
Returns:
|
||||
Any: The crawl results if the job is completed successfully.
|
||||
|
||||
Raises:
|
||||
Exception: If the job fails or an error occurs during status checks.
|
||||
"""
|
||||
while True:
|
||||
status_response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers)
|
||||
status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
|
||||
if status_response.status_code == 200:
|
||||
status_data = status_response.json()
|
||||
if status_data['status'] == 'completed':
|
||||
@ -138,9 +273,8 @@ class FirecrawlApp:
|
||||
return status_data['data']
|
||||
else:
|
||||
raise Exception('Crawl job completed but no data was returned')
|
||||
elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
|
||||
if timeout < 2:
|
||||
timeout = 2
|
||||
elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']:
|
||||
timeout=max(timeout,2)
|
||||
time.sleep(timeout) # Wait for the specified timeout before checking again
|
||||
else:
|
||||
raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
|
||||
@ -148,7 +282,17 @@ class FirecrawlApp:
|
||||
self._handle_error(status_response, 'check crawl status')
|
||||
|
||||
def _handle_error(self, response, action):
|
||||
if response.status_code in [402, 409, 500]:
|
||||
"""
|
||||
Handle errors from API responses.
|
||||
|
||||
Args:
|
||||
response (requests.Response): The response object from the API request.
|
||||
action (str): Description of the action that was being performed.
|
||||
|
||||
Raises:
|
||||
Exception: An exception with a message containing the status code and error details from the response.
|
||||
"""
|
||||
if response.status_code in [402, 408, 409, 500]:
|
||||
error_message = response.json().get('error', 'Unknown error occurred')
|
||||
raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
|
||||
else:
|
||||
|
BIN
apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz
vendored
Normal file
BIN
apps/python-sdk/dist/firecrawl-py-0.0.8.tar.gz
vendored
BIN
apps/python-sdk/dist/firecrawl_py-0.0.12-py3-none-any.whl
vendored
Normal file
@ -1,4 +1,5 @@
from firecrawl import FirecrawlApp
import uuid
from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

@ -7,7 +8,8 @@ scrape_result = app.scrape_url('firecrawl.dev')
print(scrape_result['markdown'])

# Crawl a website:
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
idempotency_key = str(uuid.uuid4())  # optional idempotency key
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
print(crawl_result)

# LLM Extraction:
@ -0,0 +1,3 @@
API_URL=http://localhost:3002
ABSOLUTE_FIRECRAWL_PATH=/Users/user/firecrawl/apps/python-sdk/firecrawl/firecrawl.py
TEST_API_KEY=fc-YOUR_API_KEY
168
apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
Normal file
@ -0,0 +1,168 @@
|
||||
import importlib.util
|
||||
import pytest
|
||||
import time
|
||||
import os
|
||||
from uuid import uuid4
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
API_URL = "http://127.0.0.1:3002";
|
||||
ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
|
||||
TEST_API_KEY = os.getenv('TEST_API_KEY')
|
||||
|
||||
print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")
|
||||
|
||||
spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
|
||||
firecrawl = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(firecrawl)
|
||||
FirecrawlApp = firecrawl.FirecrawlApp
|
||||
|
||||
def test_no_api_key():
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app = FirecrawlApp(api_url=API_URL)
|
||||
assert "No API key provided" in str(excinfo.value)
|
||||
|
||||
def test_scrape_url_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.scrape_url('https://firecrawl.dev')
|
||||
assert "Failed to scrape URL. Status code: 401" in str(excinfo.value)
|
||||
|
||||
def test_blocklisted_url():
|
||||
blocklisted_url = "https://facebook.com/fake-test"
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.scrape_url(blocklisted_url)
|
||||
assert "Failed to scrape URL. Status code: 403" in str(excinfo.value)
|
||||
|
||||
def test_successful_response_with_valid_preview_token():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
|
||||
response = app.scrape_url('https://firecrawl.dev')
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
assert "🔥 Firecrawl" in response['content']
|
||||
|
||||
def test_scrape_url_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.scrape_url('https://firecrawl.dev')
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
assert 'markdown' in response
|
||||
assert 'metadata' in response
|
||||
assert 'html' not in response
|
||||
assert "🔥 Firecrawl" in response['content']
|
||||
|
||||
def test_successful_response_with_valid_api_key_and_include_html():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.scrape_url('https://firecrawl.dev', {'pageOptions': {'includeHtml': True}})
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
assert 'markdown' in response
|
||||
assert 'html' in response
|
||||
assert 'metadata' in response
|
||||
assert "🔥 Firecrawl" in response['content']
|
||||
assert "🔥 Firecrawl" in response['markdown']
|
||||
assert "<h1" in response['html']
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
assert 'metadata' in response
|
||||
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
|
||||
time.sleep(6) # wait for 6 seconds
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
assert 'metadata' in response
|
||||
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
|
||||
|
||||
def test_crawl_url_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.crawl_url('https://firecrawl.dev')
|
||||
assert "Unexpected error occurred while trying to start crawl job. Status code: 401" in str(excinfo.value)
|
||||
|
||||
def test_should_return_error_for_blocklisted_url():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
blocklisted_url = "https://twitter.com/fake-test"
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.crawl_url(blocklisted_url)
|
||||
assert "Unexpected error occurred while trying to start crawl job. Status code: 403" in str(excinfo.value)
|
||||
|
||||
def test_crawl_url_wait_for_completion_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
|
||||
assert response is not None
|
||||
assert len(response) > 0
|
||||
assert 'content' in response[0]
|
||||
assert "🔥 Firecrawl" in response[0]['content']
|
||||
|
||||
def test_crawl_url_with_idempotency_key_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
uniqueIdempotencyKey = str(uuid4())
|
||||
response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
|
||||
assert response is not None
|
||||
assert len(response) > 0
|
||||
assert 'content' in response[0]
|
||||
assert "🔥 Firecrawl" in response[0]['content']
|
||||
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
|
||||
assert "Failed to start crawl job. Status code: 409. Error: Idempotency key already used" in str(excinfo.value)
|
||||
|
||||
def test_check_crawl_status_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
|
||||
assert response is not None
|
||||
assert 'jobId' in response
|
||||
|
||||
time.sleep(30) # wait for 30 seconds
|
||||
status_response = app.check_crawl_status(response['jobId'])
|
||||
assert status_response is not None
|
||||
assert 'status' in status_response
|
||||
assert status_response['status'] == 'completed'
|
||||
assert 'data' in status_response
|
||||
assert len(status_response['data']) > 0
|
||||
|
||||
def test_search_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.search("test query")
|
||||
assert response is not None
|
||||
assert 'content' in response[0]
|
||||
assert len(response) > 2
|
||||
|
||||
def test_search_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.search("test query")
|
||||
assert "Failed to search. Status code: 401" in str(excinfo.value)
|
||||
|
||||
def test_llm_extraction():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.scrape_url("https://mendable.ai", {
|
||||
'extractorOptions': {
|
||||
'mode': 'llm-extraction',
|
||||
'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
|
||||
'extractionSchema': {
|
||||
'type': 'object',
|
||||
'properties': {
|
||||
'company_mission': {'type': 'string'},
|
||||
'supports_sso': {'type': 'boolean'},
|
||||
'is_open_source': {'type': 'boolean'}
|
||||
},
|
||||
'required': ['company_mission', 'supports_sso', 'is_open_source']
|
||||
}
|
||||
}
|
||||
})
|
||||
assert response is not None
|
||||
assert 'llm_extraction' in response
|
||||
llm_extraction = response['llm_extraction']
|
||||
assert 'company_mission' in llm_extraction
|
||||
assert isinstance(llm_extraction['supports_sso'], bool)
|
||||
assert isinstance(llm_extraction['is_open_source'], bool)
|
@ -1,17 +1,50 @@
|
||||
"""
|
||||
FirecrawlApp Module
|
||||
|
||||
This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
|
||||
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
|
||||
and check the status of these jobs. The module uses requests for HTTP communication
|
||||
and handles retries for certain HTTP status codes.
|
||||
|
||||
Classes:
|
||||
- FirecrawlApp: Main class for interacting with the Firecrawl API.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Any, Dict, Optional
|
||||
import requests
|
||||
import time
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class FirecrawlApp:
|
||||
def __init__(self, api_key=None):
|
||||
"""
|
||||
Initialize the FirecrawlApp instance.
|
||||
|
||||
Args:
|
||||
api_key (Optional[str]): API key for authenticating with the Firecrawl API.
|
||||
api_url (Optional[str]): Base URL for the Firecrawl API.
|
||||
"""
|
||||
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
|
||||
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
|
||||
if self.api_key is None:
|
||||
raise ValueError('No API key provided')
|
||||
|
||||
|
||||
|
||||
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
|
||||
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
|
||||
"""
|
||||
Scrape the specified URL using the Firecrawl API.
|
||||
|
||||
Args:
|
||||
url (str): The URL to scrape.
|
||||
params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
|
||||
|
||||
Returns:
|
||||
Any: The scraped data if the request is successful.
|
||||
|
||||
Raises:
|
||||
Exception: If the scrape request fails.
|
||||
"""
|
||||
|
||||
headers = {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {self.api_key}'
|
||||
@ -38,23 +71,36 @@ class FirecrawlApp:
|
||||
scrape_params[key] = value
|
||||
# Make the POST request with the prepared headers and JSON data
|
||||
response = requests.post(
|
||||
'https://api.firecrawl.dev/v0/scrape',
|
||||
f'{self.api_url}/v0/scrape',
|
||||
headers=headers,
|
||||
json=scrape_params
|
||||
json=scrape_params,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
response = response.json()
|
||||
if response['success']:
|
||||
if response['success'] and 'data' in response:
|
||||
return response['data']
|
||||
else:
|
||||
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
|
||||
elif response.status_code in [402, 409, 500]:
|
||||
elif response.status_code in [402, 408, 409, 500]:
|
||||
error_message = response.json().get('error', 'Unknown error occurred')
|
||||
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
|
||||
else:
|
||||
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
|
||||
|
||||
def search(self, query, params=None):
|
||||
"""
|
||||
Perform a search using the Firecrawl API.
|
||||
|
||||
Args:
|
||||
query (str): The search query.
|
||||
params (Optional[Dict[str, Any]]): Additional parameters for the search request.
|
||||
|
||||
Returns:
|
||||
Any: The search results if the request is successful.
|
||||
|
||||
Raises:
|
||||
Exception: If the search request fails.
|
||||
"""
|
||||
headers = {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {self.api_key}'
|
||||
@ -63,13 +109,14 @@ class FirecrawlApp:
|
||||
if params:
|
||||
json_data.update(params)
|
||||
response = requests.post(
|
||||
'https://api.firecrawl.dev/v0/search',
|
||||
f'{self.api_url}/v0/search',
|
||||
headers=headers,
|
||||
json=json_data
|
||||
)
|
||||
if response.status_code == 200:
|
||||
response = response.json()
|
||||
if response['success'] == True:
|
||||
|
||||
if response['success'] and 'data' in response:
|
||||
return response['data']
|
||||
else:
|
||||
raise Exception(f'Failed to search. Error: {response["error"]}')
|
||||
@ -80,12 +127,28 @@ class FirecrawlApp:
|
||||
else:
|
||||
raise Exception(f'Failed to search. Status code: {response.status_code}')
|
||||
|
||||
def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
|
||||
headers = self._prepare_headers()
|
||||
def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None):
|
||||
"""
|
||||
Initiate a crawl job for the specified URL using the Firecrawl API.
|
||||
|
||||
Args:
|
||||
url (str): The URL to crawl.
|
||||
params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
|
||||
wait_until_done (bool): Whether to wait until the crawl job is completed.
|
||||
timeout (int): Timeout between status checks when waiting for job completion.
|
||||
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
|
||||
|
||||
Returns:
|
||||
Any: The crawl job ID or the crawl results if waiting until completion.
|
||||
|
||||
Raises:
|
||||
Exception: If the crawl job initiation or monitoring fails.
|
||||
"""
|
||||
headers = self._prepare_headers(idempotency_key)
|
||||
json_data = {'url': url}
|
||||
if params:
|
||||
json_data.update(params)
|
||||
response = self._post_request('https://api.firecrawl.dev/v0/crawl', json_data, headers)
|
||||
response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
|
||||
if response.status_code == 200:
|
||||
job_id = response.json().get('jobId')
|
||||
if wait_until_done:
|
||||
@ -96,20 +159,64 @@ class FirecrawlApp:
|
||||
self._handle_error(response, 'start crawl job')
|
||||
|
||||
def check_crawl_status(self, job_id):
|
||||
"""
|
||||
Check the status of a crawl job using the Firecrawl API.
|
||||
|
||||
Args:
|
||||
job_id (str): The ID of the crawl job.
|
||||
|
||||
Returns:
|
||||
Any: The status of the crawl job.
|
||||
|
||||
Raises:
|
||||
Exception: If the status check request fails.
|
||||
"""
|
||||
headers = self._prepare_headers()
|
||||
response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers)
|
||||
response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
|
||||
if response.status_code == 200:
|
||||
return response.json()
|
||||
else:
|
||||
self._handle_error(response, 'check crawl status')
|
||||
|
||||
def _prepare_headers(self):
|
||||
def _prepare_headers(self, idempotency_key=None):
|
||||
"""
|
||||
Prepare the headers for API requests.
|
||||
|
||||
Args:
|
||||
idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
|
||||
|
||||
Returns:
|
||||
Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
|
||||
"""
|
||||
if idempotency_key:
|
||||
return {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {self.api_key}'
|
||||
'Authorization': f'Bearer {self.api_key}',
|
||||
'x-idempotency-key': idempotency_key
|
||||
}
|
||||
|
||||
return {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {self.api_key}',
|
||||
}
|
||||
|
||||
def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
|
||||
"""
|
||||
Make a POST request with retries.
|
||||
|
||||
Args:
|
||||
url (str): The URL to send the POST request to.
|
||||
data (Dict[str, Any]): The JSON data to include in the POST request.
|
||||
headers (Dict[str, str]): The headers to include in the POST request.
|
||||
retries (int): Number of retries for the request.
|
||||
backoff_factor (float): Backoff factor for retries.
|
||||
|
||||
Returns:
|
||||
requests.Response: The response from the POST request.
|
||||
|
||||
Raises:
|
||||
requests.RequestException: If the request fails after the specified retries.
|
||||
"""
|
||||
for attempt in range(retries):
|
||||
response = requests.post(url, headers=headers, json=data)
|
||||
if response.status_code == 502:
|
||||
@ -119,6 +226,21 @@ class FirecrawlApp:
|
||||
return response
|
||||
|
||||
def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
|
||||
"""
|
||||
Make a GET request with retries.
|
||||
|
||||
Args:
|
||||
url (str): The URL to send the GET request to.
|
||||
headers (Dict[str, str]): The headers to include in the GET request.
|
||||
retries (int): Number of retries for the request.
|
||||
backoff_factor (float): Backoff factor for retries.
|
||||
|
||||
Returns:
|
||||
requests.Response: The response from the GET request.
|
||||
|
||||
Raises:
|
||||
requests.RequestException: If the request fails after the specified retries.
|
||||
"""
|
||||
for attempt in range(retries):
|
||||
response = requests.get(url, headers=headers)
|
||||
if response.status_code == 502:
|
||||
@ -128,9 +250,22 @@ class FirecrawlApp:
|
||||
return response
|
||||
|
||||
def _monitor_job_status(self, job_id, headers, timeout):
|
||||
import time
|
||||
"""
|
||||
Monitor the status of a crawl job until completion.
|
||||
|
||||
Args:
|
||||
job_id (str): The ID of the crawl job.
|
||||
headers (Dict[str, str]): The headers to include in the status check requests.
|
||||
timeout (int): Timeout between status checks.
|
||||
|
||||
Returns:
|
||||
Any: The crawl results if the job is completed successfully.
|
||||
|
||||
Raises:
|
||||
Exception: If the job fails or an error occurs during status checks.
|
||||
"""
|
||||
while True:
|
||||
status_response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers)
|
||||
status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
|
||||
if status_response.status_code == 200:
|
||||
status_data = status_response.json()
|
||||
if status_data['status'] == 'completed':
|
||||
@ -138,9 +273,8 @@ class FirecrawlApp:
|
||||
return status_data['data']
|
||||
else:
|
||||
raise Exception('Crawl job completed but no data was returned')
|
||||
elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
|
||||
if timeout < 2:
|
||||
timeout = 2
|
||||
elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']:
|
||||
timeout = max(timeout, 2)
|
||||
time.sleep(timeout) # Wait for the specified timeout before checking again
|
||||
else:
|
||||
raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
|
||||
@ -148,7 +282,17 @@ class FirecrawlApp:
|
||||
self._handle_error(status_response, 'check crawl status')
|
||||
|
||||
def _handle_error(self, response, action):
|
||||
if response.status_code in [402, 409, 500]:
|
||||
"""
|
||||
Handle errors from API responses.
|
||||
|
||||
Args:
|
||||
response (requests.Response): The response object from the API request.
|
||||
action (str): Description of the action that was being performed.
|
||||
|
||||
Raises:
|
||||
Exception: An exception with a message containing the status code and error details from the response.
|
||||
"""
|
||||
if response.status_code in [402, 408, 409, 500]:
|
||||
error_message = response.json().get('error', 'Unknown error occurred')
|
||||
raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
|
||||
else:
|
||||
|
@ -1,7 +1,179 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: firecrawl-py
|
||||
Version: 0.0.8
|
||||
Version: 0.0.12
|
||||
Summary: Python SDK for Firecrawl API
|
||||
Home-page: https://github.com/mendableai/firecrawl
|
||||
Author: Mendable.ai
|
||||
Author-email: nick@mendable.ai
|
||||
License: GNU General Public License v3 (GPLv3)
|
||||
Project-URL: Documentation, https://docs.firecrawl.dev
|
||||
Project-URL: Source, https://github.com/mendableai/firecrawl
|
||||
Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues
|
||||
Keywords: SDK API firecrawl
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Environment :: Web Environment
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
||||
Classifier: Natural Language :: English
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Topic :: Internet
|
||||
Classifier: Topic :: Internet :: WWW/HTTP
|
||||
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
||||
Classifier: Topic :: Software Development
|
||||
Classifier: Topic :: Software Development :: Libraries
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Topic :: Text Processing
|
||||
Classifier: Topic :: Text Processing :: Indexing
|
||||
Requires-Python: >=3.8
|
||||
Description-Content-Type: text/markdown
|
||||
|
||||
# Firecrawl Python SDK
|
||||
|
||||
The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
|
||||
|
||||
## Installation
|
||||
|
||||
To install the Firecrawl Python SDK, you can use pip:
|
||||
|
||||
```bash
|
||||
pip install firecrawl-py
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
|
||||
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
|
||||
|
||||
|
||||
Here's an example of how to use the SDK:
|
||||
|
||||
```python
|
||||
from firecrawl import FirecrawlApp
|
||||
|
||||
# Initialize the FirecrawlApp with your API key
|
||||
app = FirecrawlApp(api_key='your_api_key')
|
||||
|
||||
# Scrape a single URL
|
||||
url = 'https://mendable.ai'
|
||||
scraped_data = app.scrape_url(url)
|
||||
|
||||
# Crawl a website
|
||||
crawl_url = 'https://mendable.ai'
|
||||
params = {
|
||||
'pageOptions': {
|
||||
'onlyMainContent': True
|
||||
}
|
||||
}
|
||||
crawl_result = app.crawl_url(crawl_url, params=params)
|
||||
```
|
||||
|
||||
### Scraping a URL
|
||||
|
||||
To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
|
||||
|
||||
```python
|
||||
url = 'https://example.com'
|
||||
scraped_data = app.scrape_url(url)
|
||||
```
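
Like the crawl example further down, `scrape_url` accepts an optional `params` dictionary. Here is a minimal sketch reusing the `pageOptions` shown elsewhere in this document; the full set of supported options is documented in the API reference:

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='your_api_key')

# Ask the API to drop navigation, footers, etc. and keep only the main content
scraped_data = app.scrape_url('https://example.com', {
    'pageOptions': {
        'onlyMainContent': True
    }
})
print(scraped_data['content'])  # scraped markdown content
```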
|
||||
### Extracting structured data from a URL
|
||||
|
||||
With LLM extraction, you can easily extract structured data from any URL. We support Pydantic schemas to make it even easier for you. Here is how to use it:
|
||||
|
||||
```python
from typing import List

from pydantic import BaseModel, Field

class ArticleSchema(BaseModel):
    title: str
    points: int
    by: str
    commentsURL: str

class TopArticlesSchema(BaseModel):
    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

data = app.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {
        'extractionSchema': TopArticlesSchema.model_json_schema(),
        'mode': 'llm-extraction'
    },
    'pageOptions': {
        'onlyMainContent': True
    }
})
print(data["llm_extraction"])
```
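
If you would rather not define a Pydantic model, you can pass a plain JSON schema and, optionally, an extraction prompt. The sketch below mirrors the SDK's end-to-end tests; the prompt wording is only an example:

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='your_api_key')

data = app.scrape_url('https://mendable.ai', {
    'extractorOptions': {
        'mode': 'llm-extraction',
        'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
        'extractionSchema': {
            'type': 'object',
            'properties': {
                'company_mission': {'type': 'string'},
                'supports_sso': {'type': 'boolean'},
                'is_open_source': {'type': 'boolean'}
            },
            'required': ['company_mission', 'supports_sso', 'is_open_source']
        }
    }
})
print(data['llm_extraction'])
```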
|
||||
|
||||
### Search for a query
|
||||
|
||||
Use the `search` method to search the web, get the most relevant results, scrape each page, and return the markdown.
|
||||
|
||||
```python
|
||||
query = 'what is mendable?'
|
||||
search_result = app.search(query)
|
||||
```
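
The `search` method also takes an optional `params` dictionary that is merged into the request body. Assuming the search endpoint honours the same `pageOptions` used by `scrape_url` and `crawl_url` (an assumption based on the rest of this SDK, not something this document guarantees), a sketch looks like:

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='your_api_key')

# 'pageOptions' here mirrors the scrape/crawl examples; check the API docs
# for the options the search endpoint actually supports.
search_result = app.search('what is mendable?', {
    'pageOptions': {
        'onlyMainContent': True
    }
})

for result in search_result:
    print(result['content'][:80])  # each result contains the scraped page content
```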
|
||||
|
||||
### Crawling a Website
|
||||
|
||||
To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, URL patterns to include or exclude, and page options like `onlyMainContent`.
|
||||
|
||||
The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job, waiting `timeout` seconds between checks, until the job completes or fails. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.
|
||||
|
||||
```python
|
||||
crawl_url = 'https://example.com'
|
||||
params = {
|
||||
'crawlerOptions': {
|
||||
'excludes': ['blog/*'],
|
||||
'includes': [], # leave empty for all pages
|
||||
'limit': 1000,
|
||||
},
|
||||
'pageOptions': {
|
||||
'onlyMainContent': True
|
||||
}
|
||||
}
|
||||
crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5)
|
||||
```
|
||||
|
||||
If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
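
### Using an Idempotency Key

`crawl_url` also accepts an optional `idempotency_key` argument (the fifth parameter, after `timeout`). Reusing a key causes the API to reject the duplicate request with a 409, which protects you from accidentally starting the same crawl job twice. A minimal sketch, based on the SDK's end-to-end tests:

```python
from uuid import uuid4

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='your_api_key')

idempotency_key = str(uuid4())

# First call starts the crawl job
crawl_result = app.crawl_url(
    'https://firecrawl.dev',
    {'crawlerOptions': {'excludes': ['blog/*']}},
    True,              # wait_until_done
    2,                 # seconds between status checks
    idempotency_key,
)

# Reusing the same key raises an exception such as:
# "Failed to start crawl job. Status code: 409. Error: Idempotency key already used"
```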
|
||||
|
||||
### Checking Crawl Status
|
||||
|
||||
To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
|
||||
|
||||
```python
|
||||
job_id = crawl_result['jobId']
|
||||
status = app.check_crawl_status(job_id)
|
||||
```
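
If you start the crawl with `wait_until_done=False`, you can poll the job yourself. The sketch below follows the SDK's end-to-end tests: the status response carries a `status` field and, once the job is `completed`, a `data` field with the crawled documents.

```python
import time

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='your_api_key')

job = app.crawl_url('https://firecrawl.dev',
                    {'crawlerOptions': {'excludes': ['blog/*']}},
                    wait_until_done=False)
job_id = job['jobId']

while True:
    status = app.check_crawl_status(job_id)
    if status['status'] == 'completed':
        documents = status['data']
        break
    if status['status'] not in ['active', 'paused', 'pending', 'queued', 'waiting']:
        raise RuntimeError(f"Crawl failed or was stopped: {status['status']}")
    time.sleep(5)  # wait a few seconds between checks
```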
|
||||
|
||||
## Error Handling
|
||||
|
||||
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
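
Because failures surface as exceptions (currently plain `Exception` instances whose message includes the HTTP status code and, where available, the error returned by the API), a `try`/`except` around your calls is enough to handle them:

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='your_api_key')

try:
    scraped_data = app.scrape_url('https://example.com')
except Exception as e:
    # e.g. "Failed to scrape URL. Status code: 402. Error: ..."
    print(f'Scrape failed: {e}')
```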
|
||||
|
||||
## Running the Tests with Pytest
|
||||
|
||||
To ensure the functionality of the Firecrawl Python SDK, we have included end-to-end tests using `pytest`. These tests cover various aspects of the SDK, including URL scraping, web searching, and website crawling.
|
||||
|
||||
### Running the Tests
|
||||
|
||||
To run the tests, execute the following commands:
|
||||
|
||||
Install pytest:
|
||||
```bash
|
||||
pip install pytest
|
||||
```
|
||||
|
||||
Run:
|
||||
```bash
|
||||
pytest firecrawl/__tests__/e2e_withAuth/test.py
|
||||
```
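
The end-to-end tests read their configuration from environment variables (the test file constructs its client with `FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)`), and `python-dotenv` is part of the requirements, so the usual approach is to keep those values in a local `.env` file. The snippet below is a sketch of how such a file might be loaded; the variable names come from the test code, but the default URL is only an assumption:

```python
import os

from dotenv import load_dotenv

load_dotenv()  # read API_URL and TEST_API_KEY from a local .env file

# The fallback URL is an assumption (it matches the local API used by the other test suites)
API_URL = os.getenv('API_URL', 'http://127.0.0.1:3002')
TEST_API_KEY = os.getenv('TEST_API_KEY')
```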
|
||||
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
|
||||
|
||||
## License
|
||||
|
||||
The Firecrawl Python SDK is open source and released under the [GNU General Public License v3.0 (GPLv3)](https://www.gnu.org/licenses/gpl-3.0.en.html), as declared in the package metadata.
|
||||
|
@ -1 +1,3 @@
|
||||
requests
|
||||
pytest
|
||||
python-dotenv
|
||||
|
3
apps/python-sdk/requirements.txt
Normal file
@ -0,0 +1,3 @@
|
||||
requests
|
||||
pytest
|
||||
python-dotenv
|
@ -1,14 +1,52 @@
|
||||
from setuptools import setup, find_packages
|
||||
from pathlib import Path
|
||||
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
this_directory = Path(__file__).parent
|
||||
long_description_content = (this_directory / "README.md").read_text()
|
||||
|
||||
setup(
|
||||
name='firecrawl-py',
|
||||
version='0.0.8',
|
||||
url='https://github.com/mendableai/firecrawl',
|
||||
author='Mendable.ai',
|
||||
author_email='nick@mendable.ai',
|
||||
description='Python SDK for Firecrawl API',
|
||||
name="firecrawl-py",
|
||||
version="0.0.12",
|
||||
url="https://github.com/mendableai/firecrawl",
|
||||
author="Mendable.ai",
|
||||
author_email="nick@mendable.ai",
|
||||
description="Python SDK for Firecrawl API",
|
||||
long_description=long_description_content,
|
||||
long_description_content_type="text/markdown",
|
||||
packages=find_packages(),
|
||||
install_requires=[
|
||||
'requests',
|
||||
'pytest',
|
||||
'python-dotenv',
|
||||
],
|
||||
python_requires='>=3.8',
|
||||
classifiers=[
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Environment :: Web Environment",
|
||||
"Intended Audience :: Developers",
|
||||
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
|
||||
"Natural Language :: English",
|
||||
"Operating System :: OS Independent",
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Topic :: Internet",
|
||||
"Topic :: Internet :: WWW/HTTP",
|
||||
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
|
||||
"Topic :: Software Development",
|
||||
"Topic :: Software Development :: Libraries",
|
||||
"Topic :: Software Development :: Libraries :: Python Modules",
|
||||
"Topic :: Text Processing",
|
||||
"Topic :: Text Processing :: Indexing",
|
||||
],
|
||||
keywords="SDK API firecrawl",
|
||||
project_urls={
|
||||
"Documentation": "https://docs.firecrawl.dev",
|
||||
"Source": "https://github.com/mendableai/firecrawl",
|
||||
"Tracker": "https://github.com/mendableai/firecrawl/issues",
|
||||
},
|
||||
license="GNU General Public License v3 (GPLv3)",
|
||||
)
|
||||
|
178
apps/test-suite/data/crawl.json
Normal file
@ -0,0 +1,178 @@
|
||||
[
|
||||
{
|
||||
"website": "https://www.vellum.ai/llm-leaderboard",
|
||||
"expected_min_num_of_pages": 1,
|
||||
"expected_crawled_pages": ["https://www.vellum.ai/llm-leaderboard"]
|
||||
},
|
||||
{
|
||||
"website": "https://openai.com/news",
|
||||
"expected_min_num_of_pages": 4,
|
||||
"expected_crawled_pages": [
|
||||
"https://openai.com/news/company/",
|
||||
"https://openai.com/news/research/",
|
||||
"https://openai.com/news/safety-and-alignment/",
|
||||
"https://openai.com/news/stories/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"website": "https://www.framer.com/pricing",
|
||||
"expected_min_num_of_pages": 1,
|
||||
"expected_not_crawled_pages": [
|
||||
"https://www.framer.com/features/navigation/",
|
||||
"https://www.framer.com/contact/",
|
||||
"https://www.framer.com/add-ons/",
|
||||
"https://www.framer.com/free-saas-ui-kit/",
|
||||
"https://www.framer.com/help/",
|
||||
"https://www.framer.com/features/effects/",
|
||||
"https://www.framer.com/enterprise/",
|
||||
"https://www.framer.com/templates/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"website": "https://mendable.ai/pricing",
|
||||
"expected_min_num_of_pages": 1,
|
||||
"expected_not_crawled_pages": [
|
||||
"https://mendable.ai/",
|
||||
"https://mendable.ai/blog",
|
||||
"https://mendable.ai/signin",
|
||||
"https://mendable.ai/signup",
|
||||
"https://mendable.ai",
|
||||
"https://mendable.ai/usecases/sales-enablement",
|
||||
"https://mendable.ai/usecases/documentation",
|
||||
"https://mendable.ai/usecases/cs-enablement",
|
||||
"https://mendable.ai/usecases/productcopilot",
|
||||
"https://mendable.ai/security"
|
||||
],
|
||||
"notes": "This one should not go backwards, but it does!"
|
||||
},
|
||||
|
||||
{
|
||||
"website": "https://agentops.ai/blog",
|
||||
"expected_min_num_of_pages": 6,
|
||||
"expected_crawled_pages": [
|
||||
"https://www.agentops.ai/blog/effortless-hr-management-with-saas",
|
||||
"https://www.agentops.ai/blog/streamlining-hr-with-saas",
|
||||
"https://www.agentops.ai/blog/simplify-hr-with-modern-saas-solutions",
|
||||
"https://www.agentops.ai/blog/efficient-hr-operations-with-saas",
|
||||
"https://www.agentops.ai/blog/hr-made-simple-with-saas",
|
||||
"https://agentops.ai/blog"
|
||||
],
|
||||
"expected_not_crawled_pages": [
|
||||
"https://agentops.ai/about-us",
|
||||
"https://agentops.ai/contact-us"
|
||||
]
|
||||
},
|
||||
{
|
||||
"website": "https://en.wikipedia.org/wiki/T._N._Seshan",
|
||||
"expected_min_num_of_pages": 1,
|
||||
"expected_not_crawled_pages": [
|
||||
"https://en.wikipedia.org/wiki/Wikipedia:Contents",
|
||||
"https://en.wikipedia.org/wiki/Wikipedia:Contact_us",
|
||||
"https://en.wikipedia.org/wiki/V._S._Ramadevi",
|
||||
"https://en.wikipedia.org/wiki/Wikipedia:About",
|
||||
"https://en.wikipedia.org/wiki/Help:Introduction",
|
||||
"https://en.wikipedia.org/wiki/H._D._Deve_Gowda",
|
||||
"https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg"
|
||||
]
|
||||
},
|
||||
|
||||
{
|
||||
"website": "https://ycombinator.com/companies",
|
||||
"expected_min_num_of_pages": 20,
|
||||
"expected_crawled_pages": [
|
||||
"https://www.ycombinator.com/companies/industry/elearning",
|
||||
"https://www.ycombinator.com/companies/industry/computer-vision",
|
||||
"https://www.ycombinator.com/companies/industry/health-tech",
|
||||
"https://www.ycombinator.com/companies/industry/education",
|
||||
"https://www.ycombinator.com/companies/industry/robotics",
|
||||
"https://www.ycombinator.com/companies/industry/hardware",
|
||||
"https://www.ycombinator.com/companies/industry/saas",
|
||||
"https://www.ycombinator.com/companies/industry/hard-tech",
|
||||
"https://www.ycombinator.com/companies/industry/developer-tools",
|
||||
"https://www.ycombinator.com/companies/industry/entertainment",
|
||||
"https://www.ycombinator.com/companies/industry/finance",
|
||||
"https://www.ycombinator.com/companies/industry/generative-ai",
|
||||
"https://www.ycombinator.com/companies/industry/machine-learning"
|
||||
]
|
||||
},
|
||||
{
|
||||
"website": "https://firecrawl.dev",
|
||||
"expected_min_num_of_pages": 2,
|
||||
"expected_crawled_pages": [
|
||||
"https://firecrawl.dev/",
|
||||
"https://firecrawl.dev/pricing"
|
||||
]
|
||||
},
|
||||
|
||||
|
||||
{
|
||||
"website": "https://fly.io/docs/gpus/gpu-quickstart",
|
||||
"expected_min_num_of_pages": 1,
|
||||
"expected_not_crawled_pages": [
|
||||
"https://fly.io/docs/getting-started/",
|
||||
"https://fly.io/docs/hands-on/",
|
||||
"https://fly.io/docs/about/support/",
|
||||
"https://fly.io/docs/blueprints/going-to-production-with-healthcare-apps/",
|
||||
"https://fly.io/docs/machines/flyctl/fly-machine-update/",
|
||||
"https://fly.io/docs/blueprints/review-apps-guide/",
|
||||
"https://fly.io/docs/blueprints/supercronic/"
|
||||
],
|
||||
"notes": "This one should not go backwards, but it does!"
|
||||
},
|
||||
|
||||
{
|
||||
"website": "https://www.instructables.com/circuits",
|
||||
"expected_min_num_of_pages": 12,
|
||||
"expected_crawled_pages": [
|
||||
"https://www.instructables.com/circuits/",
|
||||
"https://www.instructables.com/circuits/apple/projects/",
|
||||
"https://www.instructables.com/circuits/art/projects/",
|
||||
"https://www.instructables.com/circuits/electronics/projects/",
|
||||
"https://www.instructables.com/circuits/microsoft/projects/",
|
||||
"https://www.instructables.com/circuits/microcontrollers/projects/",
|
||||
"https://www.instructables.com/circuits/community/",
|
||||
"https://www.instructables.com/circuits/leds/projects/",
|
||||
"https://www.instructables.com/circuits/gadgets/projects/",
|
||||
"https://www.instructables.com/circuits/arduino/projects/",
|
||||
"https://www.instructables.com/circuits/lasers/projects/",
|
||||
"https://www.instructables.com/circuits/clocks/projects/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"website": "https://richmondconfidential.org",
|
||||
"expected_min_num_of_pages": 20,
|
||||
"expected_crawled_pages": [
|
||||
"https://richmondconfidential.org/2009/10/13/salesians-star-guard-has-a-big-impact/",
|
||||
"https://richmondconfidential.org/2009/10/13/on-team-of-beginners-oilers-old-hand-stands-out/",
|
||||
"https://richmondconfidential.org/2009/10/19/point-richmond-clockmaker-turns-clutter-into-crafts/",
|
||||
"https://richmondconfidential.org/2009/10/13/profile-maurice-cathy/",
|
||||
"https://richmondconfidential.org/2009/10/13/soul-food-rescue-mission-rebuilds-diets-and-lives/",
|
||||
"https://richmondconfidential.org/2009/10/21/in-tough-economy-pain-trickles-to-the-bottom/",
|
||||
"https://richmondconfidential.org/2009/10/19/richmond-homicide-map/",
|
||||
"https://richmondconfidential.org/2009/10/13/rough-roads-for-richmonds-cab-drivers/",
|
||||
"https://richmondconfidential.org/2009/10/13/before-napa-there-was-winehaven/",
|
||||
"https://richmondconfidential.org/2009/10/13/family-calls-for-end-to-violence-at-memorial-for-slain-woman-friend/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"website": "https://www.boardgamegeek.com",
|
||||
"expected_min_num_of_pages": 15,
|
||||
"expected_crawled_pages": [
|
||||
"https://www.boardgamegeek.com/browse/boardgameartist",
|
||||
"https://www.boardgamegeek.com/browse/boardgamehonor",
|
||||
"https://www.boardgamegeek.com/browse/boardgamepublisher",
|
||||
"https://www.boardgamegeek.com/browse/boardgamepodcast",
|
||||
"https://www.boardgamegeek.com/wiki/page/Index",
|
||||
"https://www.boardgamegeek.com/browse/boardgamecategory",
|
||||
"https://www.boardgamegeek.com/boardgame/random",
|
||||
"https://www.boardgamegeek.com/browse/boardgamemechanic",
|
||||
"https://www.boardgamegeek.com/forums",
|
||||
"https://www.boardgamegeek.com/gonecardboard",
|
||||
"https://www.boardgamegeek.com/browse/boardgameaccessory",
|
||||
"https://www.boardgamegeek.com/browse/boardgamedesigner",
|
||||
"https://www.boardgamegeek.com/",
|
||||
"https://www.boardgamegeek.com/previews",
|
||||
"https://www.boardgamegeek.com/browse/boardgame"
|
||||
]
|
||||
}
|
||||
]
|
@ -3,7 +3,9 @@
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"scripts": {
|
||||
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false"
|
||||
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false",
|
||||
"test:scrape": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/scrape.test.ts",
|
||||
"test:crawl": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/crawl.test.ts"
|
||||
},
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
|
150
apps/test-suite/tests/crawl.test.ts
Normal file
@ -0,0 +1,150 @@
|
||||
import request from "supertest";
|
||||
import dotenv from "dotenv";
|
||||
import { WebsiteScrapeError } from "../utils/types";
|
||||
import { logErrors } from "../utils/log";
|
||||
|
||||
import websitesData from "../data/crawl.json";
|
||||
import "dotenv/config";
|
||||
|
||||
import fs from 'fs';
|
||||
dotenv.config();
|
||||
|
||||
interface WebsiteData {
|
||||
website: string;
|
||||
expected_min_num_of_pages: number;
|
||||
expected_crawled_pages: string[];
|
||||
}
|
||||
|
||||
const TEST_URL = "http://127.0.0.1:3002";
|
||||
|
||||
describe("Crawling Checkup (E2E)", () => {
|
||||
beforeAll(() => {
|
||||
if (!process.env.TEST_API_KEY) {
|
||||
throw new Error("TEST_API_KEY is not set");
|
||||
}
|
||||
});
|
||||
|
||||
describe("Crawling website tests with a dataset", () => {
|
||||
it("Should crawl the website and verify the response", async () => {
|
||||
let passedTests = 0;
|
||||
const startTime = new Date().getTime();
|
||||
const date = new Date();
|
||||
const logsDir = `logs/${date.getMonth() + 1}-${date.getDate()}-${date.getFullYear()}`;
|
||||
|
||||
let errorLogFileName = `${logsDir}/run.log_${new Date().toTimeString().split(' ')[0]}`;
|
||||
const errorLog: WebsiteScrapeError[] = [];
|
||||
|
||||
for (const websiteData of websitesData) {
|
||||
try {
|
||||
const crawlResponse = await request(TEST_URL || "")
|
||||
.post("/v0/crawl")
|
||||
.set("Content-Type", "application/json")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.send({ url: websiteData.website, pageOptions: { onlyMainContent: true }, crawlerOptions: { limit: 100, returnOnlyUrls: true }});
|
||||
|
||||
const jobId = crawlResponse.body.jobId;
|
||||
let completedResponse: any;
|
||||
let isFinished = false;
|
||||
|
||||
while (!isFinished) {
|
||||
completedResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
isFinished = completedResponse.body.status === "completed";
|
||||
|
||||
if (!isFinished) {
|
||||
await new Promise(resolve => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
if (!completedResponse) {
  // No status response was received for this website; log it and move on to the next entry
  console.log('No response');
  continue;
}
|
||||
|
||||
if (!completedResponse.body || completedResponse.body.status !== "completed") {
|
||||
errorLog.push({
|
||||
website: websiteData.website,
|
||||
prompt: 'CRAWL',
|
||||
expected_output: 'SUCCESS',
|
||||
actual_output: 'FAILURE',
|
||||
error: `Crawl job did not complete successfully.`
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
// check how many webpages were crawled successfully
|
||||
// compares with expected_min_num_of_pages
|
||||
if (completedResponse.body.data.length < websiteData.expected_min_num_of_pages) {
|
||||
errorLog.push({
|
||||
website: websiteData.website,
|
||||
prompt: 'CRAWL',
|
||||
expected_output: `SUCCESS: ${websiteData.expected_min_num_of_pages}`,
|
||||
actual_output: `FAILURE: ${completedResponse.body.data.length}`,
|
||||
error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}`
|
||||
});
|
||||
console.log('Error: ', errorLog);
|
||||
continue;
|
||||
}
|
||||
|
||||
// checks if crawled pages contain expected_crawled_pages
|
||||
if (websiteData.expected_crawled_pages && websiteData.expected_crawled_pages.length > 0 && websiteData.expected_crawled_pages.some(page => !completedResponse.body.data?.some((d: { url: string }) => d.url === page))) {
|
||||
errorLog.push({
|
||||
website: websiteData.website,
|
||||
prompt: 'CRAWL',
|
||||
expected_output: `SUCCESS: ${websiteData.expected_crawled_pages}`,
|
||||
actual_output: `FAILURE: ${completedResponse.body.data}`,
|
||||
error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data}`
|
||||
});
|
||||
console.log('Error: ', errorLog);
|
||||
continue;
|
||||
}
|
||||
|
||||
// checks that crawled pages do not contain expected_not_crawled_pages
|
||||
if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && completedResponse.body.data && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) {
|
||||
errorLog.push({
|
||||
website: websiteData.website,
|
||||
prompt: 'CRAWL',
|
||||
expected_output: `SUCCESS: ${websiteData.expected_not_crawled_pages}`,
|
||||
actual_output: `FAILURE: ${completedResponse.body.data}`,
|
||||
error: `Expected crawled pages to not contain ${websiteData.expected_not_crawled_pages}, but got ${completedResponse.body.data}`
|
||||
});
|
||||
console.log('Error: ', errorLog);
|
||||
continue;
|
||||
}
|
||||
|
||||
passedTests++;
|
||||
} catch (error) {
|
||||
console.error(`Error processing ${websiteData.website}: ${error}`);
|
||||
errorLog.push({
|
||||
website: websiteData.website,
|
||||
prompt: 'CRAWL',
|
||||
expected_output: 'SUCCESS',
|
||||
actual_output: 'FAILURE',
|
||||
error: `Error processing ${websiteData.website}: ${error}`
|
||||
});
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const score = (passedTests / websitesData.length) * 100;
|
||||
const endTime = new Date().getTime();
|
||||
const timeTaken = (endTime - startTime) / 1000;
|
||||
console.log(`Score: ${score}%`);
|
||||
|
||||
await logErrors(errorLog, timeTaken, 0, score, websitesData.length);
|
||||
|
||||
if (process.env.ENV === "local" && errorLog.length > 0) {
|
||||
if (!fs.existsSync(logsDir)){
|
||||
fs.mkdirSync(logsDir, { recursive: true });
|
||||
}
|
||||
fs.writeFileSync(errorLogFileName, JSON.stringify(errorLog, null, 2));
|
||||
}
|
||||
|
||||
expect(score).toBeGreaterThanOrEqual(90);
|
||||
}, 350000); // 350 seconds timeout
|
||||
});
|
||||
});
|
@ -1,16 +1,14 @@
|
||||
import request from "supertest";
|
||||
import dotenv from "dotenv";
|
||||
import Anthropic from "@anthropic-ai/sdk";
|
||||
import { numTokensFromString } from "./utils/tokens";
|
||||
import { numTokensFromString } from "../utils/tokens";
|
||||
import OpenAI from "openai";
|
||||
import { WebsiteScrapeError } from "./utils/types";
|
||||
import { logErrors } from "./utils/log";
|
||||
import { WebsiteScrapeError } from "../utils/types";
|
||||
import { logErrors } from "../utils/log";
|
||||
|
||||
const websitesData = require("./data/websites.json");
|
||||
import websitesData from "../data/scrape.json";
|
||||
import "dotenv/config";
|
||||
|
||||
const fs = require('fs');
|
||||
|
||||
import fs from 'fs';
|
||||
dotenv.config();
|
||||
|
||||
interface WebsiteData {
|
||||
@ -21,8 +19,7 @@ interface WebsiteData {
|
||||
|
||||
const TEST_URL = "http://127.0.0.1:3002";
|
||||
|
||||
|
||||
describe("Scraping/Crawling Checkup (E2E)", () => {
|
||||
describe("Scraping Checkup (E2E)", () => {
|
||||
beforeAll(() => {
|
||||
if (!process.env.TEST_API_KEY) {
|
||||
throw new Error("TEST_API_KEY is not set");
|
||||
@ -72,10 +69,6 @@ describe("Scraping/Crawling Checkup (E2E)", () => {
|
||||
return null;
|
||||
}
|
||||
|
||||
const anthropic = new Anthropic({
|
||||
apiKey: process.env.ANTHROPIC_API_KEY,
|
||||
});
|
||||
|
||||
const openai = new OpenAI({
|
||||
apiKey: process.env.OPENAI_API_KEY,
|
||||
});
|
||||
@ -183,7 +176,7 @@ describe("Scraping/Crawling Checkup (E2E)", () => {
|
||||
}
|
||||
|
||||
|
||||
expect(score).toBeGreaterThanOrEqual(75);
|
||||
expect(score).toBeGreaterThanOrEqual(70);
|
||||
}, 350000); // 350 seconds timeout
|
||||
});
|
||||
});
|
@ -39,7 +39,7 @@
|
||||
// "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
|
||||
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
|
||||
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
|
||||
// "resolveJsonModule": true, /* Enable importing .json files. */
|
||||
"resolveJsonModule": true, /* Enable importing .json files. */
|
||||
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
|
||||
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
|
||||
|
||||
|
82
docker-compose.yaml
Normal file
@ -0,0 +1,82 @@
|
||||
name: firecrawl
|
||||
version: '3.9'
|
||||
services:
|
||||
playwright-service:
|
||||
build: apps/playwright-service
|
||||
environment:
|
||||
- PORT=3000
|
||||
- PROXY_SERVER=${PROXY_SERVER}
|
||||
- PROXY_USERNAME=${PROXY_USERNAME}
|
||||
- PROXY_PASSWORD=${PROXY_PASSWORD}
|
||||
- BLOCK_MEDIA=${BLOCK_MEDIA}
|
||||
networks:
|
||||
- backend
|
||||
|
||||
api:
|
||||
build: apps/api
|
||||
environment:
|
||||
- REDIS_URL=${REDIS_URL:-redis://redis:6379}
|
||||
- PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
|
||||
- USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION}
|
||||
- PORT=${PORT:-3002}
|
||||
- NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
|
||||
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
|
||||
- SERPER_API_KEY=${SERPER_API_KEY}
|
||||
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
|
||||
- LOGTAIL_KEY=${LOGTAIL_KEY}
|
||||
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
|
||||
- TEST_API_KEY=${TEST_API_KEY}
|
||||
- POSTHOG_API_KEY=${POSTHOG_API_KEY}
|
||||
- POSTHOG_HOST=${POSTHOG_HOST}
|
||||
- SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN}
|
||||
- SUPABASE_URL=${SUPABASE_URL}
|
||||
- SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN}
|
||||
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
|
||||
- HOST=${HOST:-0.0.0.0}
|
||||
depends_on:
|
||||
- redis
|
||||
- playwright-service
|
||||
ports:
|
||||
- "3002:3002"
|
||||
command: [ "pnpm", "run", "start:production" ]
|
||||
networks:
|
||||
- backend
|
||||
|
||||
worker:
|
||||
build: apps/api
|
||||
environment:
|
||||
- REDIS_URL=${REDIS_URL:-redis://redis:6379}
|
||||
- PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
|
||||
- USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION}
|
||||
- PORT=${PORT:-3002}
|
||||
- NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
|
||||
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
|
||||
- SERPER_API_KEY=${SERPER_API_KEY}
|
||||
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
|
||||
- LOGTAIL_KEY=${LOGTAIL_KEY}
|
||||
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
|
||||
- TEST_API_KEY=${TEST_API_KEY}
|
||||
- POSTHOG_API_KEY=${POSTHOG_API_KEY}
|
||||
- POSTHOG_HOST=${POSTHOG_HOST}
|
||||
- SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN}
|
||||
- SUPABASE_URL=${SUPABASE_URL}
|
||||
- SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN}
|
||||
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
|
||||
- HOST=${HOST:-0.0.0.0}
|
||||
depends_on:
|
||||
- redis
|
||||
- playwright-service
|
||||
- api
|
||||
networks:
|
||||
- backend
|
||||
redis:
|
||||
image: redis:alpine
|
||||
networks:
|
||||
- backend
|
||||
command: redis-server --bind 0.0.0.0
|
||||
|
||||
networks:
|
||||
backend:
|
||||
driver: bridge
|
3
examples/roastmywebsite/.eslintrc.json
Normal file
@ -0,0 +1,3 @@
|
||||
{
|
||||
"extends": "next/core-web-vitals"
|
||||
}
|
38
examples/roastmywebsite/.gitignore
vendored
Normal file
@ -0,0 +1,38 @@
|
||||
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
|
||||
|
||||
# dependencies
|
||||
/node_modules
|
||||
/.pnp
|
||||
.pnp.js
|
||||
.yarn/install-state.gz
|
||||
|
||||
# testing
|
||||
/coverage
|
||||
|
||||
# next.js
|
||||
/.next/
|
||||
/out/
|
||||
|
||||
# production
|
||||
/build
|
||||
|
||||
# misc
|
||||
.DS_Store
|
||||
*.pem
|
||||
|
||||
# debug
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
|
||||
# local env files
|
||||
.env*.local
|
||||
|
||||
# vercel
|
||||
.vercel
|
||||
|
||||
# typescript
|
||||
*.tsbuildinfo
|
||||
next-env.d.ts
|
||||
.env
|
||||
node_modules
|
5
examples/roastmywebsite/README.md
Normal file
@ -0,0 +1,5 @@
|
||||
# Roast My Website 🔥
|
||||
|
||||
Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them.
|
||||
|
||||
Check it out at roastmywebsite.ai 😈
|
17
examples/roastmywebsite/components.json
Normal file
@ -0,0 +1,17 @@
|
||||
{
|
||||
"$schema": "https://ui.shadcn.com/schema.json",
|
||||
"style": "default",
|
||||
"rsc": true,
|
||||
"tsx": true,
|
||||
"tailwind": {
|
||||
"config": "tailwind.config.ts",
|
||||
"css": "src/app/globals.css",
|
||||
"baseColor": "zinc",
|
||||
"cssVariables": false,
|
||||
"prefix": ""
|
||||
},
|
||||
"aliases": {
|
||||
"components": "@/components",
|
||||
"utils": "@/lib/utils"
|
||||
}
|
||||
}
|
11
examples/roastmywebsite/next.config.mjs
Normal file
@ -0,0 +1,11 @@
|
||||
/** @type {import('next').NextConfig} */
|
||||
const nextConfig = {
|
||||
env: {
|
||||
G1: process.env.G1,
|
||||
G2: process.env.G2,
|
||||
G3: process.env.G3,
|
||||
G4: process.env.G4,
|
||||
},
|
||||
};
|
||||
|
||||
export default nextConfig;
|
6617
examples/roastmywebsite/package-lock.json
generated
Normal file
53
examples/roastmywebsite/package.json
Normal file
@ -0,0 +1,53 @@
|
||||
{
|
||||
"name": "roastmywebsite",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"scripts": {
|
||||
"dev": "next dev",
|
||||
"build": "next build",
|
||||
"start": "next start",
|
||||
"lint": "next lint"
|
||||
},
|
||||
"dependencies": {
|
||||
"@dqbd/tiktoken": "^1.0.15",
|
||||
"@headlessui/react": "^2.0.4",
|
||||
"@headlessui/tailwindcss": "^0.2.0",
|
||||
"@mendable/firecrawl-js": "^0.0.21",
|
||||
"@radix-ui/react-dialog": "^1.0.5",
|
||||
"@radix-ui/react-dropdown-menu": "^2.0.6",
|
||||
"@radix-ui/react-select": "^2.0.0",
|
||||
"@radix-ui/react-slot": "^1.0.2",
|
||||
"@radix-ui/react-switch": "^1.0.3",
|
||||
"@remixicon/react": "^4.2.0",
|
||||
"@tremor/react": "^3.17.2",
|
||||
"@vercel/analytics": "^1.3.1",
|
||||
"axios": "^1.7.2",
|
||||
"class-variance-authority": "^0.7.0",
|
||||
"clsx": "^2.1.1",
|
||||
"cubic-spline": "^3.0.3",
|
||||
"html2canvas": "^1.4.1",
|
||||
"image-size": "^1.1.1",
|
||||
"lucide": "^0.379.0",
|
||||
"lucide-react": "^0.379.0",
|
||||
"next": "14.2.3",
|
||||
"next-themes": "^0.3.0",
|
||||
"openai": "^4.47.3",
|
||||
"react": "^18",
|
||||
"react-dom": "^18",
|
||||
"sonner": "^1.4.41",
|
||||
"tailwind-merge": "^2.3.0",
|
||||
"tailwindcss-animate": "^1.0.7",
|
||||
"tiktoken": "^1.0.15"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@tailwindcss/forms": "^0.5.7",
|
||||
"@types/node": "^20",
|
||||
"@types/react": "^18",
|
||||
"@types/react-dom": "^18",
|
||||
"eslint": "^8",
|
||||
"eslint-config-next": "14.2.3",
|
||||
"postcss": "^8",
|
||||
"tailwindcss": "^3.4.3",
|
||||
"typescript": "^5"
|
||||
}
|
||||
}
|
8
examples/roastmywebsite/postcss.config.mjs
Normal file
@ -0,0 +1,8 @@
|
||||
/** @type {import('postcss-load-config').Config} */
|
||||
const config = {
|
||||
plugins: {
|
||||
tailwindcss: {},
|
||||
},
|
||||
};
|
||||
|
||||
export default config;
|
BIN
examples/roastmywebsite/public/android-chrome-192x192.png
Normal file
After Width: | Height: | Size: 7.8 KiB |
BIN
examples/roastmywebsite/public/android-chrome-512x512.png
Normal file
After Width: | Height: | Size: 23 KiB |
BIN
examples/roastmywebsite/public/apple-touch-icon.png
Normal file
After Width: | Height: | Size: 7.0 KiB |
BIN
examples/roastmywebsite/public/bgd.png
Normal file
After Width: | Height: | Size: 444 KiB |
BIN
examples/roastmywebsite/public/favicon-16x16.png
Normal file
After Width: | Height: | Size: 492 B |
BIN
examples/roastmywebsite/public/favicon-32x32.png
Normal file
After Width: | Height: | Size: 997 B |
BIN
examples/roastmywebsite/public/favicon.ico
Normal file
After Width: | Height: | Size: 15 KiB |
1
examples/roastmywebsite/public/next.svg
Normal file
@ -0,0 +1 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 394 80"><path fill="#000" d="M262 0h68.5v12.7h-27.2v66.6h-13.6V12.7H262V0ZM149 0v12.7H94v20.4h44.3v12.6H94v21h55v12.6H80.5V0h68.7zm34.3 0h-17.8l63.8 79.4h17.9l-32-39.7 32-39.6h-17.9l-23 28.6-23-28.6zm18.3 56.7-9-11-27.1 33.7h17.8l18.3-22.7z"/><path fill="#000" d="M81 79.3 17 0H0v79.3h13.6V17l50.2 62.3H81Zm252.6-.4c-1 0-1.8-.4-2.5-1s-1.1-1.6-1.1-2.6.3-1.8 1-2.5 1.6-1 2.6-1 1.8.3 2.5 1a3.4 3.4 0 0 1 .6 4.3 3.7 3.7 0 0 1-3 1.8zm23.2-33.5h6v23.3c0 2.1-.4 4-1.3 5.5a9.1 9.1 0 0 1-3.8 3.5c-1.6.8-3.5 1.3-5.7 1.3-2 0-3.7-.4-5.3-1s-2.8-1.8-3.7-3.2c-.9-1.3-1.4-3-1.4-5h6c.1.8.3 1.6.7 2.2s1 1.2 1.6 1.5c.7.4 1.5.5 2.4.5 1 0 1.8-.2 2.4-.6a4 4 0 0 0 1.6-1.8c.3-.8.5-1.8.5-3V45.5zm30.9 9.1a4.4 4.4 0 0 0-2-3.3 7.5 7.5 0 0 0-4.3-1.1c-1.3 0-2.4.2-3.3.5-.9.4-1.6 1-2 1.6a3.5 3.5 0 0 0-.3 4c.3.5.7.9 1.3 1.2l1.8 1 2 .5 3.2.8c1.3.3 2.5.7 3.7 1.2a13 13 0 0 1 3.2 1.8 8.1 8.1 0 0 1 3 6.5c0 2-.5 3.7-1.5 5.1a10 10 0 0 1-4.4 3.5c-1.8.8-4.1 1.2-6.8 1.2-2.6 0-4.9-.4-6.8-1.2-2-.8-3.4-2-4.5-3.5a10 10 0 0 1-1.7-5.6h6a5 5 0 0 0 3.5 4.6c1 .4 2.2.6 3.4.6 1.3 0 2.5-.2 3.5-.6 1-.4 1.8-1 2.4-1.7a4 4 0 0 0 .8-2.4c0-.9-.2-1.6-.7-2.2a11 11 0 0 0-2.1-1.4l-3.2-1-3.8-1c-2.8-.7-5-1.7-6.6-3.2a7.2 7.2 0 0 1-2.4-5.7 8 8 0 0 1 1.7-5 10 10 0 0 1 4.3-3.5c2-.8 4-1.2 6.4-1.2 2.3 0 4.4.4 6.2 1.2 1.8.8 3.2 2 4.3 3.4 1 1.4 1.5 3 1.5 5h-5.8z"/></svg>
|
After Width: | Height: | Size: 1.3 KiB |
BIN
examples/roastmywebsite/public/og.png
Normal file
After Width: | Height: | Size: 262 KiB |
1
examples/roastmywebsite/public/site.webmanifest
Normal file
@ -0,0 +1 @@
|
||||
{"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"}
|
1
examples/roastmywebsite/public/vercel.svg
Normal file
@ -0,0 +1 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 283 64"><path fill="black" d="M141 16c-11 0-19 7-19 18s9 18 20 18c7 0 13-3 16-7l-7-5c-2 3-6 4-9 4-5 0-9-3-10-7h28v-3c0-11-8-18-19-18zm-9 15c1-4 4-7 9-7s8 3 9 7h-18zm117-15c-11 0-19 7-19 18s9 18 20 18c6 0 12-3 16-7l-8-5c-2 3-5 4-8 4-5 0-9-3-11-7h28l1-3c0-11-8-18-19-18zm-10 15c2-4 5-7 10-7s8 3 9 7h-19zm-39 3c0 6 4 10 10 10 4 0 7-2 9-5l8 5c-3 5-9 8-17 8-11 0-19-7-19-18s8-18 19-18c8 0 14 3 17 8l-8 5c-2-3-5-5-9-5-6 0-10 4-10 10zm83-29v46h-9V5h9zM37 0l37 64H0L37 0zm92 5-27 48L74 5h10l18 30 17-30h10zm59 12v10l-3-1c-6 0-10 4-10 10v15h-9V17h9v9c0-5 6-9 13-9z"/></svg>
|
After Width: | Height: | Size: 629 B |
BIN
examples/roastmywebsite/src/app/favicon.ico
Normal file
After Width: | Height: | Size: 15 KiB |
10
examples/roastmywebsite/src/app/globals.css
Normal file
@ -0,0 +1,10 @@
|
||||
@tailwind base;
|
||||
@tailwind components;
|
||||
@tailwind utilities;
|
||||
|
||||
|
||||
|
||||
.fill-tremor-content-emphasis {
|
||||
fill: rgb(113 113 122) !important;
|
||||
}
|
||||
|
5
examples/roastmywebsite/src/app/hooks/useGithubStars.ts
Normal file
@ -0,0 +1,5 @@
|
||||
export async function useGithubStars() {
|
||||
const res = await fetch("https://api.github.com/repos/mendableai/firecrawl");
|
||||
const data = await res.json();
|
||||
return data.stargazers_count;
|
||||
}
|
68
examples/roastmywebsite/src/app/layout.tsx
Normal file
@ -0,0 +1,68 @@
|
||||
import type { Metadata } from "next";
|
||||
import { Gloria_Hallelujah } from "next/font/google";
|
||||
import "./globals.css";
|
||||
import { Toaster } from "sonner";
|
||||
import { Analytics } from "@vercel/analytics/react";
|
||||
import { useEffect, useState } from "react";
|
||||
import Head from "next/head";
|
||||
|
||||
|
||||
const inter = Gloria_Hallelujah({ weight: "400", subsets: ["latin"] });
|
||||
// const inter = Inter({ subsets: ["latin"] });
|
||||
|
||||
const meta = {
|
||||
title: "Roast My Website",
|
||||
description:
|
||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 😈",
|
||||
cardImage: "/og.png",
|
||||
robots: "follow, index",
|
||||
favicon: "/favicon.ico",
|
||||
url: "https://www.roastmywebsite.ai/",
|
||||
};
|
||||
|
||||
export async function generateMetadata(): Promise<Metadata> {
|
||||
return {
|
||||
title: meta.title,
|
||||
description: meta.description,
|
||||
referrer: "origin-when-cross-origin",
|
||||
keywords: ["Roast My Website", "Roast", "Website", "GitHub", "Firecrawl"],
|
||||
authors: [
|
||||
{ name: "Roast My Website", url: "https://www.roastmywebsite.ai/" },
|
||||
],
|
||||
creator: "Roast My Website",
|
||||
publisher: "Roast My Website",
|
||||
robots: meta.robots,
|
||||
icons: { icon: meta.favicon },
|
||||
metadataBase: new URL(meta.url),
|
||||
openGraph: {
|
||||
url: meta.url,
|
||||
title: meta.title,
|
||||
description: meta.description,
|
||||
images: [meta.cardImage],
|
||||
type: "website",
|
||||
siteName: meta.title,
|
||||
},
|
||||
twitter: {
|
||||
card: "summary_large_image",
|
||||
site: "@Vercel",
|
||||
creator: "@Vercel",
|
||||
title: meta.title,
|
||||
description: meta.description,
|
||||
images: [meta.cardImage],
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
export default function RootLayout({
|
||||
children,
|
||||
}: Readonly<{
|
||||
children: React.ReactNode;
|
||||
}>) {
|
||||
return (
|
||||
<html lang="en">
|
||||
<body className={inter.className}>{children}</body>
|
||||
<Analytics />
|
||||
<Toaster />
|
||||
</html>
|
||||
);
|
||||
}
|