0

Nick: v8 python

This commit is contained in:
Nicolas 2024-05-08 17:35:16 -07:00
parent e6dbbf1bab
commit 4c88d5da66
8 changed files with 47 additions and 20 deletions

View File

@@ -1,5 +1,7 @@
import os import os
from typing import Any, Dict, Optional
import requests import requests
import time
class FirecrawlApp: class FirecrawlApp:
def __init__(self, api_key=None): def __init__(self, api_key=None):
@@ -7,26 +9,45 @@ class FirecrawlApp:
if self.api_key is None: if self.api_key is None:
raise ValueError('No API key provided') raise ValueError('No API key provided')
def scrape_url(self, url, params=None):
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
headers = { headers = {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}' 'Authorization': f'Bearer {self.api_key}'
} }
json_data = {'url': url} # Prepare the base scrape parameters with the URL
scrape_params = {'url': url}
# If there are additional params, process them
if params: if params:
json_data.update(params) # Initialize extractorOptions if present
extractor_options = params.get('extractorOptions', {})
# Check and convert the extractionSchema if it's a Pydantic model
if 'extractionSchema' in extractor_options:
if hasattr(extractor_options['extractionSchema'], 'schema'):
extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
# Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
# Update the scrape_params with the processed extractorOptions
scrape_params['extractorOptions'] = extractor_options
# Include any other params directly at the top level of scrape_params
for key, value in params.items():
if key != 'extractorOptions':
scrape_params[key] = value
# Make the POST request with the prepared headers and JSON data
response = requests.post( response = requests.post(
'https://api.firecrawl.dev/v0/scrape', 'https://api.firecrawl.dev/v0/scrape',
headers=headers, headers=headers,
json=json_data json=scrape_params
) )
if response.status_code == 200: if response.status_code == 200:
response = response.json() response = response.json()
if response['success'] == True: if response['success']:
return response['data'] return response['data']
else: else:
raise Exception(f'Failed to scrape URL. Error: {response["error"]}') raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
elif response.status_code in [402, 409, 500]: elif response.status_code in [402, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred') error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
@@ -88,11 +109,23 @@ class FirecrawlApp:
'Authorization': f'Bearer {self.api_key}' 'Authorization': f'Bearer {self.api_key}'
} }
def _post_request(self, url, data, headers): def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
return requests.post(url, headers=headers, json=data) for attempt in range(retries):
response = requests.post(url, headers=headers, json=data)
if response.status_code == 502:
time.sleep(backoff_factor * (2 ** attempt))
else:
return response
return response
def _get_request(self, url, headers): def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
return requests.get(url, headers=headers) for attempt in range(retries):
response = requests.get(url, headers=headers)
if response.status_code == 502:
time.sleep(backoff_factor * (2 ** attempt))
else:
return response
return response
def _monitor_job_status(self, job_id, headers, timeout): def _monitor_job_status(self, job_id, headers, timeout):
import time import time

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -9,12 +9,7 @@ class FirecrawlApp:
if self.api_key is None: if self.api_key is None:
raise ValueError('No API key provided') raise ValueError('No API key provided')
from pydantic import BaseModel
from typing import Optional, Dict, Any
class ScrapeParams(BaseModel):
url: str
extractorOptions: Optional[Dict[str, Any]] = None
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
headers = { headers = {
@@ -41,7 +36,6 @@ class FirecrawlApp:
for key, value in params.items(): for key, value in params.items():
if key != 'extractorOptions': if key != 'extractorOptions':
scrape_params[key] = value scrape_params[key] = value
print(scrape_params)
# Make the POST request with the prepared headers and JSON data # Make the POST request with the prepared headers and JSON data
response = requests.post( response = requests.post(
'https://api.firecrawl.dev/v0/scrape', 'https://api.firecrawl.dev/v0/scrape',

View File

@@ -1,6 +1,6 @@
Metadata-Version: 2.1 Metadata-Version: 2.1
Name: firecrawl-py Name: firecrawl-py
Version: 0.0.6 Version: 0.0.8
Summary: Python SDK for Firecrawl API Summary: Python SDK for Firecrawl API
Home-page: https://github.com/mendableai/firecrawl Home-page: https://github.com/mendableai/firecrawl
Author: Mendable.ai Author: Mendable.ai

View File

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup( setup(
name='firecrawl-py', name='firecrawl-py',
version='0.0.7', version='0.0.8',
url='https://github.com/mendableai/firecrawl', url='https://github.com/mendableai/firecrawl',
author='Mendable.ai', author='Mendable.ai',
author_email='nick@mendable.ai', author_email='nick@mendable.ai',