"""
FirecrawlApp Module
This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
and check the status of these jobs. The module uses requests for HTTP communication
and handles retries for certain HTTP status codes.
Classes:
- FirecrawlApp: Main class for interacting with the Firecrawl API.
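
Example usage (a minimal sketch, assuming a valid FIRECRAWL_API_KEY is set in the
environment or passed as `api_key`):
    from firecrawl import FirecrawlApp
    app = FirecrawlApp()
    page = app.scrape_url('https://example.com')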
"""
import os
import time
from typing import Any, Dict, Optional

import requests

class FirecrawlApp:
    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
        if self.api_key is None:
            raise ValueError('No API key provided')
        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')

    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
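        """
        Scrape a single URL via the /v0/scrape endpoint and return its data.

        Example (a sketch, assuming the page is reachable; `ArticleSchema` is a
        hypothetical Pydantic model supplied by the caller for LLM extraction):
            data = app.scrape_url('https://example.com')
            extracted = app.scrape_url(
                'https://example.com',
                params={'extractorOptions': {'extractionSchema': ArticleSchema}}
            )
        """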
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}'
        }
        # Prepare the base scrape parameters with the URL
        scrape_params = {'url': url}
        # If there are additional params, process them
        if params:
            # Initialize extractorOptions if present
            extractor_options = params.get('extractorOptions', {})
            # Check and convert the extractionSchema if it's a Pydantic model
            if 'extractionSchema' in extractor_options:
                if hasattr(extractor_options['extractionSchema'], 'schema'):
                    extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
                # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
                extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
                # Update the scrape_params with the processed extractorOptions
                scrape_params['extractorOptions'] = extractor_options
            # Include any other params directly at the top level of scrape_params
            for key, value in params.items():
                if key != 'extractorOptions':
                    scrape_params[key] = value
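        # For illustration, when a Pydantic model is passed as extractionSchema the
        # request body built above looks roughly like (an assumed example URL):
        #   {'url': 'https://example.com',
        #    'extractorOptions': {'extractionSchema': {... JSON schema ...},
        #                         'mode': 'llm-extraction'}}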
        # Make the POST request with the prepared headers and JSON data
        response = requests.post(
            f'{self.api_url}/v0/scrape',
            headers=headers,
            json=scrape_params
        )
        if response.status_code == 200:
            response = response.json()
            if response['success']:
                return response['data']
            else:
                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
        elif response.status_code in [402, 408, 409, 500]:
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
        else:
            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')

    def search(self, query, params=None):
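        """
        Run a search via the /v0/search endpoint and return the result data.

        Example (a sketch, assuming search is enabled for the API key):
            results = app.search('firecrawl web scraping')
        """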
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}'
        }
        json_data = {'query': query}
        if params:
            json_data.update(params)
        response = requests.post(
            f'{self.api_url}/v0/search',
            headers=headers,
            json=json_data
        )
        if response.status_code == 200:
            response = response.json()
            if response['success']:
                return response['data']
            else:
                raise Exception(f'Failed to search. Error: {response["error"]}')
        elif response.status_code in [402, 409, 500]:
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
        else:
            raise Exception(f'Failed to search. Status code: {response.status_code}')

    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
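        """
        Start a crawl job via /v0/crawl. When wait_until_done is True, poll the job
        every `timeout` seconds (never less than 2) and return its data once it
        completes; otherwise return the job id immediately.

        Example (a sketch):
            pages = app.crawl_url('https://example.com')
            job = app.crawl_url('https://example.com', wait_until_done=False)  # -> {'jobId': '...'}
        """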
        headers = self._prepare_headers()
        json_data = {'url': url}
        if params:
            json_data.update(params)
        response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
        if response.status_code == 200:
            job_id = response.json().get('jobId')
            if wait_until_done:
                return self._monitor_job_status(job_id, headers, timeout)
            else:
                return {'jobId': job_id}
        else:
            self._handle_error(response, 'start crawl job')

    def check_crawl_status(self, job_id):
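        """
        Fetch the current status of a crawl job via /v0/crawl/status/<job_id>.

        Example (a sketch, using the job id returned by crawl_url(..., wait_until_done=False)):
            status = app.check_crawl_status(job['jobId'])
        """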
        headers = self._prepare_headers()
        response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
        if response.status_code == 200:
            return response.json()
        else:
            self._handle_error(response, 'check crawl status')

    def _prepare_headers(self):
        return {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}'
        }

    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
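        """
        POST with basic retry handling: a 502 response is retried with exponential
        backoff (0.5s, 1.0s, 2.0s with the defaults); any other response is returned
        immediately, and the last 502 response is returned if all retries are exhausted.
        """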
        for attempt in range(retries):
            response = requests.post(url, headers=headers, json=data)
            if response.status_code == 502:
                time.sleep(backoff_factor * (2 ** attempt))
            else:
                return response
        return response

    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
        for attempt in range(retries):
            response = requests.get(url, headers=headers)
            if response.status_code == 502:
                time.sleep(backoff_factor * (2 ** attempt))
            else:
                return response
        return response

    def _monitor_job_status(self, job_id, headers, timeout):
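        """
        Poll /v0/crawl/status/<job_id> every `timeout` seconds (never less than 2)
        until the job completes, then return its data; raise if the job fails, is
        stopped, or completes without returning data.
        """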
        while True:
            status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
            if status_response.status_code == 200:
                status_data = status_response.json()
                if status_data['status'] == 'completed':
                    if 'data' in status_data:
                        return status_data['data']
                    else:
                        raise Exception('Crawl job completed but no data was returned')
                elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
                    if timeout < 2:
                        timeout = 2
                    time.sleep(timeout)  # Wait for the specified timeout before checking again
                else:
                    raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
            else:
                self._handle_error(status_response, 'check crawl status')

    def _handle_error(self, response, action):
        if response.status_code in [402, 408, 409, 500]:
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
        else:
            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')