Merge branch 'main' into main
This commit is contained in:
commit
efb821d63b
@ -1,12 +1,12 @@
|
|||||||
import { parseApi } from "../../src/lib/parseApi";
|
import { parseApi } from "../../src/lib/parseApi";
|
||||||
import { getRateLimiter, } from "../../src/services/rate-limiter";
|
import { getRateLimiter, } from "../../src/services/rate-limiter";
|
||||||
import { AuthResponse, RateLimiterMode } from "../../src/types";
|
import { AuthResponse, RateLimiterMode } from "../../src/types";
|
||||||
import { supabase_service } from "../../src/services/supabase";
|
import { supabase_service } from "../../src/services/supabase";
|
||||||
import { withAuth } from "../../src/lib/withAuth";
|
import { withAuth } from "../../src/lib/withAuth";
|
||||||
import { RateLimiterRedis } from "rate-limiter-flexible";
|
import { RateLimiterRedis } from "rate-limiter-flexible";
|
||||||
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
|
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
|
||||||
|
|
||||||
export async function authenticateUser(req, res, mode?: RateLimiterMode) : Promise<AuthResponse> {
|
export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
|
||||||
return withAuth(supaAuthenticateUser)(req, res, mode);
|
return withAuth(supaAuthenticateUser)(req, res, mode);
|
||||||
}
|
}
|
||||||
function setTrace(team_id: string, api_key: string) {
|
function setTrace(team_id: string, api_key: string) {
|
||||||
@ -126,9 +126,11 @@ export async function supaAuthenticateUser(
|
|||||||
await rateLimiter.consume(iptoken);
|
await rateLimiter.consume(iptoken);
|
||||||
} catch (rateLimiterRes) {
|
} catch (rateLimiterRes) {
|
||||||
console.error(rateLimiterRes);
|
console.error(rateLimiterRes);
|
||||||
|
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
|
||||||
|
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
|
||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error: "Rate limit exceeded. Too many requests, try again in 1 minute.",
|
error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Please retry after ${secs}s, resets at ${retryDate}`,
|
||||||
status: 429,
|
status: 429,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@ -155,9 +157,9 @@ export async function supaAuthenticateUser(
|
|||||||
normalizedApi = parseApi(token);
|
normalizedApi = parseApi(token);
|
||||||
|
|
||||||
const { data, error } = await supabase_service
|
const { data, error } = await supabase_service
|
||||||
.from("api_keys")
|
.from("api_keys")
|
||||||
.select("*")
|
.select("*")
|
||||||
.eq("key", normalizedApi);
|
.eq("key", normalizedApi);
|
||||||
|
|
||||||
if (error || !data || data.length === 0) {
|
if (error || !data || data.length === 0) {
|
||||||
return {
|
return {
|
||||||
|
@ -28,11 +28,13 @@ export async function searchHelper(
|
|||||||
|
|
||||||
const tbs = searchOptions.tbs ?? null;
|
const tbs = searchOptions.tbs ?? null;
|
||||||
const filter = searchOptions.filter ?? null;
|
const filter = searchOptions.filter ?? null;
|
||||||
|
const num_results = searchOptions.limit ?? 7;
|
||||||
|
const num_results_buffer = Math.floor(num_results * 1.5);
|
||||||
|
|
||||||
let res = await search({
|
let res = await search({
|
||||||
query: query,
|
query: query,
|
||||||
advanced: advanced,
|
advanced: advanced,
|
||||||
num_results: searchOptions.limit ?? 7,
|
num_results: num_results_buffer,
|
||||||
tbs: tbs,
|
tbs: tbs,
|
||||||
filter: filter,
|
filter: filter,
|
||||||
lang: searchOptions.lang ?? "en",
|
lang: searchOptions.lang ?? "en",
|
||||||
@ -47,6 +49,9 @@ export async function searchHelper(
|
|||||||
}
|
}
|
||||||
|
|
||||||
res = res.filter((r) => !isUrlBlocked(r.url));
|
res = res.filter((r) => !isUrlBlocked(r.url));
|
||||||
|
if (res.length > num_results) {
|
||||||
|
res = res.slice(0, num_results);
|
||||||
|
}
|
||||||
|
|
||||||
if (res.length === 0) {
|
if (res.length === 0) {
|
||||||
return { success: true, error: "No search results found", returnCode: 200 };
|
return { success: true, error: "No search results found", returnCode: 200 };
|
||||||
|
@ -1,18 +1,50 @@
|
|||||||
|
"""
|
||||||
|
FirecrawlApp Module
|
||||||
|
|
||||||
|
This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
|
||||||
|
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
|
||||||
|
and check the status of these jobs. The module uses requests for HTTP communication
|
||||||
|
and handles retries for certain HTTP status codes.
|
||||||
|
|
||||||
|
Classes:
|
||||||
|
- FirecrawlApp: Main class for interacting with the Firecrawl API.
|
||||||
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from typing import Any, Dict, Optional
|
|
||||||
import requests
|
|
||||||
import time
|
import time
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
class FirecrawlApp:
|
class FirecrawlApp:
|
||||||
def __init__(self, api_key=None, api_url='https://api.firecrawl.dev'):
|
"""
|
||||||
|
Initialize the FirecrawlApp instance.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
api_key (Optional[str]): API key for authenticating with the Firecrawl API.
|
||||||
|
api_url (Optional[str]): Base URL for the Firecrawl API.
|
||||||
|
"""
|
||||||
|
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
|
||||||
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
|
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
|
||||||
if self.api_key is None:
|
if self.api_key is None:
|
||||||
raise ValueError('No API key provided')
|
raise ValueError('No API key provided')
|
||||||
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL')
|
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
|
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
|
||||||
|
"""
|
||||||
|
Scrape the specified URL using the Firecrawl API.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url (str): The URL to scrape.
|
||||||
|
params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Any: The scraped data if the request is successful.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Exception: If the scrape request fails.
|
||||||
|
"""
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
'Authorization': f'Bearer {self.api_key}'
|
'Authorization': f'Bearer {self.api_key}'
|
||||||
@ -41,11 +73,11 @@ class FirecrawlApp:
|
|||||||
response = requests.post(
|
response = requests.post(
|
||||||
f'{self.api_url}/v0/scrape',
|
f'{self.api_url}/v0/scrape',
|
||||||
headers=headers,
|
headers=headers,
|
||||||
json=scrape_params
|
json=scrape_params,
|
||||||
)
|
)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
response = response.json()
|
response = response.json()
|
||||||
if response['success']:
|
if response['success'] and 'data' in response:
|
||||||
return response['data']
|
return response['data']
|
||||||
else:
|
else:
|
||||||
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
|
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
|
||||||
@ -56,6 +88,19 @@ class FirecrawlApp:
|
|||||||
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
|
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
|
||||||
|
|
||||||
def search(self, query, params=None):
|
def search(self, query, params=None):
|
||||||
|
"""
|
||||||
|
Perform a search using the Firecrawl API.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
query (str): The search query.
|
||||||
|
params (Optional[Dict[str, Any]]): Additional parameters for the search request.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Any: The search results if the request is successful.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Exception: If the search request fails.
|
||||||
|
"""
|
||||||
headers = {
|
headers = {
|
||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
'Authorization': f'Bearer {self.api_key}'
|
'Authorization': f'Bearer {self.api_key}'
|
||||||
@ -70,7 +115,8 @@ class FirecrawlApp:
|
|||||||
)
|
)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
response = response.json()
|
response = response.json()
|
||||||
if response['success'] == True:
|
|
||||||
|
if response['success'] and 'data' in response:
|
||||||
return response['data']
|
return response['data']
|
||||||
else:
|
else:
|
||||||
raise Exception(f'Failed to search. Error: {response["error"]}')
|
raise Exception(f'Failed to search. Error: {response["error"]}')
|
||||||
@ -81,8 +127,24 @@ class FirecrawlApp:
|
|||||||
else:
|
else:
|
||||||
raise Exception(f'Failed to search. Status code: {response.status_code}')
|
raise Exception(f'Failed to search. Status code: {response.status_code}')
|
||||||
|
|
||||||
def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
|
def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None):
|
||||||
headers = self._prepare_headers()
|
"""
|
||||||
|
Initiate a crawl job for the specified URL using the Firecrawl API.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url (str): The URL to crawl.
|
||||||
|
params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
|
||||||
|
wait_until_done (bool): Whether to wait until the crawl job is completed.
|
||||||
|
timeout (int): Timeout between status checks when waiting for job completion.
|
||||||
|
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Any: The crawl job ID or the crawl results if waiting until completion.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Exception: If the crawl job initiation or monitoring fails.
|
||||||
|
"""
|
||||||
|
headers = self._prepare_headers(idempotency_key)
|
||||||
json_data = {'url': url}
|
json_data = {'url': url}
|
||||||
if params:
|
if params:
|
||||||
json_data.update(params)
|
json_data.update(params)
|
||||||
@ -97,6 +159,18 @@ class FirecrawlApp:
|
|||||||
self._handle_error(response, 'start crawl job')
|
self._handle_error(response, 'start crawl job')
|
||||||
|
|
||||||
def check_crawl_status(self, job_id):
|
def check_crawl_status(self, job_id):
|
||||||
|
"""
|
||||||
|
Check the status of a crawl job using the Firecrawl API.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
job_id (str): The ID of the crawl job.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Any: The status of the crawl job.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Exception: If the status check request fails.
|
||||||
|
"""
|
||||||
headers = self._prepare_headers()
|
headers = self._prepare_headers()
|
||||||
response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
|
response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
@ -104,13 +178,45 @@ class FirecrawlApp:
|
|||||||
else:
|
else:
|
||||||
self._handle_error(response, 'check crawl status')
|
self._handle_error(response, 'check crawl status')
|
||||||
|
|
||||||
def _prepare_headers(self):
|
def _prepare_headers(self, idempotency_key=None):
|
||||||
|
"""
|
||||||
|
Prepare the headers for API requests.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
|
||||||
|
"""
|
||||||
|
if idempotency_key:
|
||||||
|
return {
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
'Authorization': f'Bearer {self.api_key}',
|
||||||
|
'x-idempotency-key': idempotency_key
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
'Authorization': f'Bearer {self.api_key}'
|
'Authorization': f'Bearer {self.api_key}',
|
||||||
}
|
}
|
||||||
|
|
||||||
def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
|
def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
|
||||||
|
"""
|
||||||
|
Make a POST request with retries.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url (str): The URL to send the POST request to.
|
||||||
|
data (Dict[str, Any]): The JSON data to include in the POST request.
|
||||||
|
headers (Dict[str, str]): The headers to include in the POST request.
|
||||||
|
retries (int): Number of retries for the request.
|
||||||
|
backoff_factor (float): Backoff factor for retries.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
requests.Response: The response from the POST request.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
requests.RequestException: If the request fails after the specified retries.
|
||||||
|
"""
|
||||||
for attempt in range(retries):
|
for attempt in range(retries):
|
||||||
response = requests.post(url, headers=headers, json=data)
|
response = requests.post(url, headers=headers, json=data)
|
||||||
if response.status_code == 502:
|
if response.status_code == 502:
|
||||||
@ -120,6 +226,21 @@ class FirecrawlApp:
|
|||||||
return response
|
return response
|
||||||
|
|
||||||
def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
|
def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
|
||||||
|
"""
|
||||||
|
Make a GET request with retries.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url (str): The URL to send the GET request to.
|
||||||
|
headers (Dict[str, str]): The headers to include in the GET request.
|
||||||
|
retries (int): Number of retries for the request.
|
||||||
|
backoff_factor (float): Backoff factor for retries.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
requests.Response: The response from the GET request.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
requests.RequestException: If the request fails after the specified retries.
|
||||||
|
"""
|
||||||
for attempt in range(retries):
|
for attempt in range(retries):
|
||||||
response = requests.get(url, headers=headers)
|
response = requests.get(url, headers=headers)
|
||||||
if response.status_code == 502:
|
if response.status_code == 502:
|
||||||
@ -129,7 +250,20 @@ class FirecrawlApp:
|
|||||||
return response
|
return response
|
||||||
|
|
||||||
def _monitor_job_status(self, job_id, headers, timeout):
|
def _monitor_job_status(self, job_id, headers, timeout):
|
||||||
import time
|
"""
|
||||||
|
Monitor the status of a crawl job until completion.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
job_id (str): The ID of the crawl job.
|
||||||
|
headers (Dict[str, str]): The headers to include in the status check requests.
|
||||||
|
timeout (int): Timeout between status checks.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Any: The crawl results if the job is completed successfully.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Exception: If the job fails or an error occurs during status checks.
|
||||||
|
"""
|
||||||
while True:
|
while True:
|
||||||
status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
|
status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
|
||||||
if status_response.status_code == 200:
|
if status_response.status_code == 200:
|
||||||
@ -140,8 +274,7 @@ class FirecrawlApp:
|
|||||||
else:
|
else:
|
||||||
raise Exception('Crawl job completed but no data was returned')
|
raise Exception('Crawl job completed but no data was returned')
|
||||||
elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
|
elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
|
||||||
if timeout < 2:
|
timeout=max(timeout,2)
|
||||||
timeout = 2
|
|
||||||
time.sleep(timeout) # Wait for the specified timeout before checking again
|
time.sleep(timeout) # Wait for the specified timeout before checking again
|
||||||
else:
|
else:
|
||||||
raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
|
raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
|
||||||
@ -149,6 +282,16 @@ class FirecrawlApp:
|
|||||||
self._handle_error(status_response, 'check crawl status')
|
self._handle_error(status_response, 'check crawl status')
|
||||||
|
|
||||||
def _handle_error(self, response, action):
|
def _handle_error(self, response, action):
|
||||||
|
"""
|
||||||
|
Handle errors from API responses.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
response (requests.Response): The response object from the API request.
|
||||||
|
action (str): Description of the action that was being performed.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Exception: An exception with a message containing the status code and error details from the response.
|
||||||
|
"""
|
||||||
if response.status_code in [402, 408, 409, 500]:
|
if response.status_code in [402, 408, 409, 500]:
|
||||||
error_message = response.json().get('error', 'Unknown error occurred')
|
error_message = response.json().get('error', 'Unknown error occurred')
|
||||||
raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
|
raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
|
||||||
|
BIN
apps/python-sdk/dist/firecrawl-py-0.0.10.tar.gz
vendored
Normal file
BIN
apps/python-sdk/dist/firecrawl-py-0.0.10.tar.gz
vendored
Normal file
Binary file not shown.
BIN
apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz
vendored
BIN
apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz
vendored
Binary file not shown.
BIN
apps/python-sdk/dist/firecrawl_py-0.0.10-py3-none-any.whl
vendored
Normal file
BIN
apps/python-sdk/dist/firecrawl_py-0.0.10-py3-none-any.whl
vendored
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,6 +1,6 @@
|
|||||||
Metadata-Version: 2.1
|
Metadata-Version: 2.1
|
||||||
Name: firecrawl-py
|
Name: firecrawl-py
|
||||||
Version: 0.0.9
|
Version: 0.0.10
|
||||||
Summary: Python SDK for Firecrawl API
|
Summary: Python SDK for Firecrawl API
|
||||||
Home-page: https://github.com/mendableai/firecrawl
|
Home-page: https://github.com/mendableai/firecrawl
|
||||||
Author: Mendable.ai
|
Author: Mendable.ai
|
||||||
|
@ -7,7 +7,7 @@ long_description_content = (this_directory / "README.md").read_text()
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="firecrawl-py",
|
name="firecrawl-py",
|
||||||
version="0.0.9",
|
version="0.0.10",
|
||||||
url="https://github.com/mendableai/firecrawl",
|
url="https://github.com/mendableai/firecrawl",
|
||||||
author="Mendable.ai",
|
author="Mendable.ai",
|
||||||
author_email="nick@mendable.ai",
|
author_email="nick@mendable.ai",
|
||||||
|
Loading…
Reference in New Issue
Block a user