From 8d041c05b461a8ecd3a90d17f47ce32611b429be Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Thu, 23 May 2024 08:00:56 +1000 Subject: [PATCH 01/11] rearranged logic for FIRECRAWL_API_URL Previously the FIRECRAWL_API_URL environment variable was ignored unless api_url was explicitly passed as None, which was counter-intuitive. --- apps/python-sdk/firecrawl/firecrawl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 98cb8ed..2b7121a 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -4,11 +4,11 @@ import requests import time class FirecrawlApp: - def __init__(self, api_key=None, api_url='https://api.firecrawl.dev'): + def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') if self.api_key is None: raise ValueError('No API key provided') - self.api_url = api_url or os.getenv('FIRECRAWL_API_URL') + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') From 971e1f85c45341645214d0f8ea3fa5de202e78fb Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Thu, 23 May 2024 08:03:58 +1000 Subject: [PATCH 02/11] Added module docstring PyLint C0114 - missing-module-docstring --- apps/python-sdk/firecrawl/firecrawl.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 2b7121a..23934cf 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -1,3 +1,15 @@ +""" +FirecrawlApp Module + +This module provides a class `FirecrawlApp` for interacting with the Firecrawl API. +It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs, +and check the status of these jobs. The module uses requests for HTTP communication +and handles retries for certain HTTP status codes. 
+ +Classes: + - FirecrawlApp: Main class for interacting with the Firecrawl API. +""" + import os from typing import Any, Dict, Optional import requests From 8adf2b71322238823fc5c770723bcaca548311a6 Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Thu, 23 May 2024 08:20:32 +1000 Subject: [PATCH 03/11] Added Docstrings for functions PyLint C0116: Missing function or method docstring (missing-function-docstring) --- apps/python-sdk/firecrawl/firecrawl.py | 128 ++++++++++++++++++++++++- 1 file changed, 126 insertions(+), 2 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 23934cf..9aa9359 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -11,11 +11,20 @@ Classes: """ import os -from typing import Any, Dict, Optional -import requests import time +from typing import Any, Dict, Optional + +import requests + class FirecrawlApp: + """ + Initialize the FirecrawlApp instance. + + Args: + api_key (Optional[str]): API key for authenticating with the Firecrawl API. + api_url (Optional[str]): Base URL for the Firecrawl API. + """ def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') if self.api_key is None: @@ -25,6 +34,20 @@ class FirecrawlApp: def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: + """ + Scrape the specified URL using the Firecrawl API. + + Args: + url (str): The URL to scrape. + params (Optional[Dict[str, Any]]): Additional parameters for the scrape request. + + Returns: + Any: The scraped data if the request is successful. + + Raises: + Exception: If the scrape request fails. + """ + headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}' @@ -68,6 +91,19 @@ class FirecrawlApp: raise Exception(f'Failed to scrape URL. 
Status code: {response.status_code}') def search(self, query, params=None): + """ + Perform a search using the Firecrawl API. + + Args: + query (str): The search query. + params (Optional[Dict[str, Any]]): Additional parameters for the search request. + + Returns: + Any: The search results if the request is successful. + + Raises: + Exception: If the search request fails. + """ headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}' @@ -94,6 +130,21 @@ class FirecrawlApp: raise Exception(f'Failed to search. Status code: {response.status_code}') def crawl_url(self, url, params=None, wait_until_done=True, timeout=2): + """ + Initiate a crawl job for the specified URL using the Firecrawl API. + + Args: + url (str): The URL to crawl. + params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. + wait_until_done (bool): Whether to wait until the crawl job is completed. + timeout (int): Timeout between status checks when waiting for job completion. + + Returns: + Any: The crawl job ID or the crawl results if waiting until completion. + + Raises: + Exception: If the crawl job initiation or monitoring fails. + """ headers = self._prepare_headers() json_data = {'url': url} if params: @@ -109,6 +160,18 @@ class FirecrawlApp: self._handle_error(response, 'start crawl job') def check_crawl_status(self, job_id): + """ + Check the status of a crawl job using the Firecrawl API. + + Args: + job_id (str): The ID of the crawl job. + + Returns: + Any: The status of the crawl job. + + Raises: + Exception: If the status check request fails. + """ headers = self._prepare_headers() response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) if response.status_code == 200: @@ -117,12 +180,34 @@ class FirecrawlApp: self._handle_error(response, 'check crawl status') def _prepare_headers(self): + """ + Prepare the headers for API requests. 
+ + Returns: + Dict[str, str]: The headers including content type and authorization. + """ return { 'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}' } def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5): + """ + Make a POST request with retries. + + Args: + url (str): The URL to send the POST request to. + data (Dict[str, Any]): The JSON data to include in the POST request. + headers (Dict[str, str]): The headers to include in the POST request. + retries (int): Number of retries for the request. + backoff_factor (float): Backoff factor for retries. + + Returns: + requests.Response: The response from the POST request. + + Raises: + requests.RequestException: If the request fails after the specified retries. + """ for attempt in range(retries): response = requests.post(url, headers=headers, json=data) if response.status_code == 502: @@ -132,6 +217,21 @@ class FirecrawlApp: return response def _get_request(self, url, headers, retries=3, backoff_factor=0.5): + """ + Make a GET request with retries. + + Args: + url (str): The URL to send the GET request to. + headers (Dict[str, str]): The headers to include in the GET request. + retries (int): Number of retries for the request. + backoff_factor (float): Backoff factor for retries. + + Returns: + requests.Response: The response from the GET request. + + Raises: + requests.RequestException: If the request fails after the specified retries. + """ for attempt in range(retries): response = requests.get(url, headers=headers) if response.status_code == 502: @@ -141,6 +241,20 @@ class FirecrawlApp: return response def _monitor_job_status(self, job_id, headers, timeout): + """ + Monitor the status of a crawl job until completion. + + Args: + job_id (str): The ID of the crawl job. + headers (Dict[str, str]): The headers to include in the status check requests. + timeout (int): Timeout between status checks. 
+ + Returns: + Any: The crawl results if the job is completed successfully. + + Raises: + Exception: If the job fails or an error occurs during status checks. + """ import time while True: status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) @@ -161,6 +275,16 @@ class FirecrawlApp: self._handle_error(status_response, 'check crawl status') def _handle_error(self, response, action): + """ + Handle errors from API responses. + + Args: + response (requests.Response): The response object from the API request. + action (str): Description of the action that was being performed. + + Raises: + Exception: An exception with a message containing the status code and error details from the response. + """ if response.status_code in [402, 408, 409, 500]: error_message = response.json().get('error', 'Unknown error occurred') raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') From 6216c8532295ba1d8abdd9d944c4708cf66f3036 Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Thu, 23 May 2024 08:21:32 +1000 Subject: [PATCH 04/11] Time module already imported Pylint W0404: Reimport 'time' (imported line 16) (reimported) C0415: Import outside toplevel (time) (import-outside-toplevel) --- apps/python-sdk/firecrawl/firecrawl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 9aa9359..d96db4e 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -255,7 +255,6 @@ class FirecrawlApp: Raises: Exception: If the job fails or an error occurs during status checks. 
""" - import time while True: status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) if status_response.status_code == 200: From 96b19172a167467a9fba9dfc018fc27e595c9056 Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Thu, 23 May 2024 08:30:23 +1000 Subject: [PATCH 05/11] Removed trailing whitespace PyLint C0303: Trailing whitespace (trailing-whitespace) --- apps/python-sdk/firecrawl/firecrawl.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index d96db4e..7612e5d 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -30,9 +30,6 @@ class FirecrawlApp: if self.api_key is None: raise ValueError('No API key provided') self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') - - - def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: """ Scrape the specified URL using the Firecrawl API. @@ -54,7 +51,7 @@ class FirecrawlApp: } # Prepare the base scrape parameters with the URL scrape_params = {'url': url} - + # If there are additional params, process them if params: # Initialize extractorOptions if present @@ -67,7 +64,7 @@ class FirecrawlApp: extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction') # Update the scrape_params with the processed extractorOptions scrape_params['extractorOptions'] = extractor_options - + # Include any other params directly at the top level of scrape_params for key, value in params.items(): if key != 'extractorOptions': @@ -89,7 +86,7 @@ class FirecrawlApp: raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') else: raise Exception(f'Failed to scrape URL. Status code: {response.status_code}') - + def search(self, query, params=None): """ Perform a search using the Firecrawl API. 
@@ -122,7 +119,7 @@ class FirecrawlApp: return response['data'] else: raise Exception(f'Failed to search. Error: {response["error"]}') - + elif response.status_code in [402, 409, 500]: error_message = response.json().get('error', 'Unknown error occurred') raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}') @@ -283,7 +280,7 @@ class FirecrawlApp: Raises: Exception: An exception with a message containing the status code and error details from the response. - """ + """ if response.status_code in [402, 408, 409, 500]: error_message = response.json().get('error', 'Unknown error occurred') raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') From 7d2efe5acb5595c53323565046564927143370de Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Thu, 23 May 2024 08:39:19 +1000 Subject: [PATCH 06/11] Added request timeouts connection timeout to 5 seconds and the response timeout to 10 PyLint W3101 --- apps/python-sdk/firecrawl/firecrawl.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 7612e5d..d986407 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -73,7 +73,8 @@ class FirecrawlApp: response = requests.post( f'{self.api_url}/v0/scrape', headers=headers, - json=scrape_params + json=scrape_params, + timeout=(5,10) ) if response.status_code == 200: response = response.json() @@ -111,7 +112,8 @@ class FirecrawlApp: response = requests.post( f'{self.api_url}/v0/search', headers=headers, - json=json_data + json=json_data, + timeout=(5,10) ) if response.status_code == 200: response = response.json() From 48e91c89e7bbd97af388c8edcc3cdc173572d3f5 Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Thu, 23 May 2024 08:42:07 +1000 Subject: [PATCH 07/11] Removed unnecessary If block PyLint R1731 --- apps/python-sdk/firecrawl/firecrawl.py | 3 +-- 1 
file changed, 1 insertion(+), 2 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index d986407..62db7a2 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -264,8 +264,7 @@ class FirecrawlApp: else: raise Exception('Crawl job completed but no data was returned') elif status_data['status'] in ['active', 'paused', 'pending', 'queued']: - if timeout < 2: - timeout = 2 + timeout=max(timeout,2) time.sleep(timeout) # Wait for the specified timeout before checking again else: raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}') From 5c21aed9c783beb6775c551a8a1f631778ca1af7 Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Thu, 23 May 2024 08:45:56 +1000 Subject: [PATCH 08/11] adding pylintrc to allow longer lines --- apps/python-sdk/.pylintrc | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 apps/python-sdk/.pylintrc diff --git a/apps/python-sdk/.pylintrc b/apps/python-sdk/.pylintrc new file mode 100644 index 0000000..a580885 --- /dev/null +++ b/apps/python-sdk/.pylintrc @@ -0,0 +1,2 @@ +[FORMAT] +max-line-length = 120 \ No newline at end of file From 106c18d11f851595413256210af3d0f7158ba8f9 Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Thu, 23 May 2024 08:57:53 +1000 Subject: [PATCH 09/11] Use truthiness check for 'success' key in API response PyLint C0121 --- apps/python-sdk/firecrawl/firecrawl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 62db7a2..9e3011c 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -117,7 +117,7 @@ class FirecrawlApp: ) if response.status_code == 200: response = response.json() - if response['success'] == True: + if response['success']: return response['data'] else: raise Exception(f'Failed to search. 
Error: {response["error"]}') From 53a7ec0f6eee6a1cb22d7ea174b489e042e5bbb3 Mon Sep 17 00:00:00 2001 From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 24 May 2024 13:46:16 -0300 Subject: [PATCH 10/11] Removed hard-coded timeout --- apps/python-sdk/firecrawl/firecrawl.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 9e3011c..1826605 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -74,7 +74,6 @@ class FirecrawlApp: f'{self.api_url}/v0/scrape', headers=headers, json=scrape_params, - timeout=(5,10) ) if response.status_code == 200: response = response.json() @@ -112,8 +111,7 @@ class FirecrawlApp: response = requests.post( f'{self.api_url}/v0/search', headers=headers, - json=json_data, - timeout=(5,10) + json=json_data ) if response.status_code == 200: response = response.json() From 8c380d70a5f37cf8ac54a090c1159909ba99fa97 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 24 May 2024 09:48:48 -0700 Subject: [PATCH 11/11] Update firecrawl.py: remove leftover merge-conflict markers in search() --- apps/python-sdk/firecrawl/firecrawl.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 0e4fc3e..f28a057 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -115,11 +115,8 @@ class FirecrawlApp: ) if response.status_code == 200: response = response.json() -<<<<<<< main - if response['success']: -======= + if response['success'] and 'data' in response: ->>>>>>> main return response['data'] else: raise Exception(f'Failed to search. Error: {response["error"]}')