Merge branch 'test-sdks' of https://github.com/mendableai/firecrawl into test-sdks

2024-05-27 14:23:39 -03:00 · 2024-05-27 14:23:39 -03:00 · 667d3e4c4f
commit 667d3e4c4f
parent 19decd1062 3c8edf683c
13 changed files with 385 additions and 40 deletions
--- a/apps/api/fly.toml
+++ b/apps/api/fly.toml
@ -27,6 +27,13 @@ kill_timeout = '5s'
  hard_limit = 200
  soft_limit = 100

+[[http_service.checks]]
+  grace_period = "10s"
+  interval = "30s"
+  method = "GET"
+  timeout = "5s"
+  path = "/"
+
 [[services]]
  protocol = 'tcp'
  internal_port = 8080
--- a/apps/api/src/controllers/auth.ts
+++ b/apps/api/src/controllers/auth.ts
@ -1,12 +1,12 @@
 import { parseApi } from "../../src/lib/parseApi";
-import { getRateLimiter,  } from "../../src/services/rate-limiter";
+import { getRateLimiter, } from "../../src/services/rate-limiter";
 import { AuthResponse, RateLimiterMode } from "../../src/types";
 import { supabase_service } from "../../src/services/supabase";
 import { withAuth } from "../../src/lib/withAuth";
 import { RateLimiterRedis } from "rate-limiter-flexible";
 import { setTraceAttributes } from '@hyperdx/node-opentelemetry';

-export async function authenticateUser(req, res, mode?: RateLimiterMode) : Promise<AuthResponse> {
+export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
  return withAuth(supaAuthenticateUser)(req, res, mode);
 }
 function setTrace(team_id: string, api_key: string) {
@ -18,7 +18,7 @@ function setTrace(team_id: string, api_key: string) {
  } catch (error) {
    console.error('Error setting trace attributes:', error);
  }
-  
+
 }
 export async function supaAuthenticateUser(
  req,
@ -97,7 +97,7 @@ export async function supaAuthenticateUser(
      team_id: team_id,
      plan: plan
    }
-    switch (mode) { 
+    switch (mode) {
      case RateLimiterMode.Crawl:
        rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token, subscriptionData.plan);
        break;
@ -126,9 +126,11 @@ export async function supaAuthenticateUser(
    await rateLimiter.consume(iptoken);
  } catch (rateLimiterRes) {
    console.error(rateLimiterRes);
+    const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
+    const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
    return {
      success: false,
-      error: "Rate limit exceeded. Too many requests, try again in 1 minute.",
+      error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Please retry after ${secs}s, resets at ${retryDate}`,
      status: 429,
    };
  }
@ -155,9 +157,9 @@ export async function supaAuthenticateUser(
    normalizedApi = parseApi(token);

    const { data, error } = await supabase_service
-    .from("api_keys")
-    .select("*")
-    .eq("key", normalizedApi);
+      .from("api_keys")
+      .select("*")
+      .eq("key", normalizedApi);

    if (error || !data || data.length === 0) {
      return {
@ -170,7 +172,7 @@ export async function supaAuthenticateUser(
    subscriptionData = data[0];
  }

-  return { success: true, team_id: subscriptionData.team_id };  
+  return { success: true, team_id: subscriptionData.team_id };
 }

 function getPlanByPriceId(price_id: string) {
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@ -28,11 +28,13 @@ export async function searchHelper(

  const tbs = searchOptions.tbs ?? null;
  const filter = searchOptions.filter ?? null;
+  const num_results = searchOptions.limit ?? 7;
+  const num_results_buffer = Math.floor(num_results * 1.5);

  let res = await search({
    query: query,
    advanced: advanced,
-    num_results: searchOptions.limit ?? 7,
+    num_results: num_results_buffer,
    tbs: tbs,
    filter: filter,
    lang: searchOptions.lang ?? "en",
@ -47,6 +49,9 @@ export async function searchHelper(
  }

  res = res.filter((r) => !isUrlBlocked(r.url));
+  if (res.length > num_results) {
+    res = res.slice(0, num_results);
+  }

  if (res.length === 0) {
    return { success: true, error: "No search results found", returnCode: 200 };
--- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts
+++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
@ -1,7 +1,6 @@
 const socialMediaBlocklist = [
  'facebook.com',
  'twitter.com',
-  'x.com',
  'instagram.com',
  'linkedin.com',
  'pinterest.com',
--- a/apps/python-sdk/build/lib/firecrawl/firecrawl.py
+++ b/apps/python-sdk/build/lib/firecrawl/firecrawl.py
@ -1,25 +1,57 @@
+"""
+FirecrawlApp Module
+
+This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs. The module uses requests for HTTP communication
+and handles retries for certain HTTP status codes.
+
+Classes:
+    - FirecrawlApp: Main class for interacting with the Firecrawl API.
+"""
+
 import os
-from typing import Any, Dict, Optional
-import requests
 import time
+from typing import Any, Dict, Optional
+
+import requests
+

 class FirecrawlApp:
-    def __init__(self, api_key=None, api_url='https://api.firecrawl.dev'):
+    """
+    Initialize the FirecrawlApp instance.
+
+    Args:
+        api_key (Optional[str]): API key for authenticating with the Firecrawl API.
+        api_url (Optional[str]): Base URL for the Firecrawl API.
+    """
+    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
        if self.api_key is None:
            raise ValueError('No API key provided')
-        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL')
-    
-    
-
+        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+        """
+        Scrape the specified URL using the Firecrawl API.
+
+        Args:
+            url (str): The URL to scrape.
+            params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
+
+        Returns:
+            Any: The scraped data if the request is successful.
+
+        Raises:
+            Exception: If the scrape request fails.
+        """
+
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}'
        }
        # Prepare the base scrape parameters with the URL
        scrape_params = {'url': url}
-        
+
        # If there are additional params, process them
        if params:
            # Initialize extractorOptions if present
@ -32,7 +64,7 @@ class FirecrawlApp:
                extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
                # Update the scrape_params with the processed extractorOptions
                scrape_params['extractorOptions'] = extractor_options
-            
+
            # Include any other params directly at the top level of scrape_params
            for key, value in params.items():
                if key != 'extractorOptions':
@ -41,11 +73,11 @@ class FirecrawlApp:
        response = requests.post(
            f'{self.api_url}/v0/scrape',
            headers=headers,
-            json=scrape_params
+            json=scrape_params,
        )
        if response.status_code == 200:
            response = response.json()
-            if response['success']:
+            if response['success'] and 'data' in response:
                return response['data']
            else:
                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
@ -54,8 +86,21 @@ class FirecrawlApp:
            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
        else:
            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
-        
+
    def search(self, query, params=None):
+        """
+        Perform a search using the Firecrawl API.
+
+        Args:
+            query (str): The search query.
+            params (Optional[Dict[str, Any]]): Additional parameters for the search request.
+
+        Returns:
+            Any: The search results if the request is successful.
+
+        Raises:
+            Exception: If the search request fails.
+        """
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}'
@ -70,19 +115,36 @@ class FirecrawlApp:
        )
        if response.status_code == 200:
            response = response.json()
-            if response['success'] == True:
+            
+            if response['success'] and 'data' in response:
                return response['data']
            else:
                raise Exception(f'Failed to search. Error: {response["error"]}')
-            
+
        elif response.status_code in [402, 409, 500]:
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
        else:
            raise Exception(f'Failed to search. Status code: {response.status_code}')

-    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
-        headers = self._prepare_headers()
+    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None):
+        """
+        Initiate a crawl job for the specified URL using the Firecrawl API.
+
+        Args:
+            url (str): The URL to crawl.
+            params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
+            wait_until_done (bool): Whether to wait until the crawl job is completed.
+            timeout (int): Timeout between status checks when waiting for job completion.
+            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+
+        Returns:
+            Any: The crawl job ID or the crawl results if waiting until completion.
+
+        Raises:
+            Exception: If the crawl job initiation or monitoring fails.
+        """
+        headers = self._prepare_headers(idempotency_key)
        json_data = {'url': url}
        if params:
            json_data.update(params)
@ -97,6 +159,18 @@ class FirecrawlApp:
            self._handle_error(response, 'start crawl job')

    def check_crawl_status(self, job_id):
+        """
+        Check the status of a crawl job using the Firecrawl API.
+
+        Args:
+            job_id (str): The ID of the crawl job.
+
+        Returns:
+            Any: The status of the crawl job.
+
+        Raises:
+            Exception: If the status check request fails.
+        """
        headers = self._prepare_headers()
        response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
        if response.status_code == 200:
@ -104,13 +178,45 @@ class FirecrawlApp:
        else:
            self._handle_error(response, 'check crawl status')

-    def _prepare_headers(self):
+    def _prepare_headers(self, idempotency_key=None):
+        """
+        Prepare the headers for API requests.
+
+        Args:
+            idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
+
+        Returns:
+            Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
+        """
+        if idempotency_key:
+            return {
+                'Content-Type': 'application/json',
+                'Authorization': f'Bearer {self.api_key}',
+                'x-idempotency-key': idempotency_key
+            }
+
        return {
            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
+            'Authorization': f'Bearer {self.api_key}',
        }

    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+        """
+        Make a POST request with retries.
+
+        Args:
+            url (str): The URL to send the POST request to.
+            data (Dict[str, Any]): The JSON data to include in the POST request.
+            headers (Dict[str, str]): The headers to include in the POST request.
+            retries (int): Number of retries for the request.
+            backoff_factor (float): Backoff factor for retries.
+
+        Returns:
+            requests.Response: The response from the POST request.
+
+        Raises:
+            requests.RequestException: If the request fails after the specified retries.
+        """
        for attempt in range(retries):
            response = requests.post(url, headers=headers, json=data)
            if response.status_code == 502:
@ -120,6 +226,21 @@ class FirecrawlApp:
        return response

    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+        """
+        Make a GET request with retries.
+
+        Args:
+            url (str): The URL to send the GET request to.
+            headers (Dict[str, str]): The headers to include in the GET request.
+            retries (int): Number of retries for the request.
+            backoff_factor (float): Backoff factor for retries.
+
+        Returns:
+            requests.Response: The response from the GET request.
+
+        Raises:
+            requests.RequestException: If the request fails after the specified retries.
+        """
        for attempt in range(retries):
            response = requests.get(url, headers=headers)
            if response.status_code == 502:
@ -129,7 +250,20 @@ class FirecrawlApp:
        return response

    def _monitor_job_status(self, job_id, headers, timeout):
-        import time
+        """
+        Monitor the status of a crawl job until completion.
+
+        Args:
+            job_id (str): The ID of the crawl job.
+            headers (Dict[str, str]): The headers to include in the status check requests.
+            timeout (int): Timeout between status checks.
+
+        Returns:
+            Any: The crawl results if the job is completed successfully.
+
+        Raises:
+            Exception: If the job fails or an error occurs during status checks.
+        """
        while True:
            status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
            if status_response.status_code == 200:
@ -140,8 +274,7 @@ class FirecrawlApp:
                    else:
                        raise Exception('Crawl job completed but no data was returned')
                elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
-                    if timeout < 2:
-                        timeout = 2
+                    timeout=max(timeout,2)
                    time.sleep(timeout)  # Wait for the specified timeout before checking again
                else:
                    raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
@ -149,6 +282,16 @@ class FirecrawlApp:
                self._handle_error(status_response, 'check crawl status')

    def _handle_error(self, response, action):
+        """
+        Handle errors from API responses.
+
+        Args:
+            response (requests.Response): The response object from the API request.
+            action (str): Description of the action that was being performed.
+
+        Raises:
+            Exception: An exception with a message containing the status code and error details from the response.
+        """
        if response.status_code in [402, 408, 409, 500]:
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
--- a/apps/python-sdk/dist/firecrawl-py-0.0.11.tar.gz
+++ b/apps/python-sdk/dist/firecrawl-py-0.0.11.tar.gz
--- a/apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz
+++ b/apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz
--- a/apps/python-sdk/dist/firecrawl_py-0.0.11-py3-none-any.whl
+++ b/apps/python-sdk/dist/firecrawl_py-0.0.11-py3-none-any.whl
--- a/apps/python-sdk/dist/firecrawl_py-0.0.9-py3-none-any.whl
+++ b/apps/python-sdk/dist/firecrawl_py-0.0.9-py3-none-any.whl
--- a/apps/python-sdk/firecrawl/pycache/init.cpython-311.pyc
+++ b/apps/python-sdk/firecrawl/pycache/init.cpython-311.pyc
--- a/apps/python-sdk/firecrawl/pycache/firecrawl.cpython-311.pyc
+++ b/apps/python-sdk/firecrawl/pycache/firecrawl.cpython-311.pyc
--- a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO
+++ b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO
@ -1,7 +1,160 @@
 Metadata-Version: 2.1
 Name: firecrawl-py
-Version: 0.0.9
+Version: 0.0.11
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/mendableai/firecrawl
 Author: Mendable.ai
 Author-email: nick@mendable.ai
+License: GNU General Public License v3 (GPLv3)
+Project-URL: Documentation, https://docs.firecrawl.dev
+Project-URL: Source, https://github.com/mendableai/firecrawl
+Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues
+Keywords: SDK API firecrawl
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Web Environment
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Topic :: Internet
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
+Classifier: Topic :: Software Development
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing
+Classifier: Topic :: Text Processing :: Indexing
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+
+# Firecrawl Python SDK
+
+The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
+
+## Installation
+
+To install the Firecrawl Python SDK, you can use pip:
+
+```bash
+pip install firecrawl-py
+```
+
+## Usage
+
+1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
+2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
+
+
+Here's an example of how to use the SDK:
+
+```python
+from firecrawl import FirecrawlApp
+
+# Initialize the FirecrawlApp with your API key
+app = FirecrawlApp(api_key='your_api_key')
+
+# Scrape a single URL
+url = 'https://mendable.ai'
+scraped_data = app.scrape_url(url)
+
+# Crawl a website
+crawl_url = 'https://mendable.ai'
+params = {
+    'pageOptions': {
+        'onlyMainContent': True
+    }
+}
+crawl_result = app.crawl_url(crawl_url, params=params)
+```
+
+### Scraping a URL
+
+To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
+
+```python
+url = 'https://example.com'
+scraped_data = app.scrape_url(url)
+```
+### Extracting structured data from a URL
+
+With LLM extraction, you can easily extract structured data from any URL. We also support pydantic schemas to make this easier. Here is how to use it:
+
+```python
+class ArticleSchema(BaseModel):
+    title: str
+    points: int 
+    by: str
+    commentsURL: str
+
+class TopArticlesSchema(BaseModel):
+    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
+
+data = app.scrape_url('https://news.ycombinator.com', {
+    'extractorOptions': {
+        'extractionSchema': TopArticlesSchema.model_json_schema(),
+        'mode': 'llm-extraction'
+    },
+    'pageOptions':{
+        'onlyMainContent': True
+    }
+})
+print(data["llm_extraction"])
+```
+
+### Search for a query
+
+The `search` method searches the web, gets the most relevant results, scrapes each page, and returns the markdown.
+
+```python
+query = 'what is mendable?'
+search_result = app.search(query)
+```
+
+### Crawling a Website
+
+To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
+
+The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will check the status of the crawl job every `timeout` seconds (values below 2 are raised to 2) until the job completes; `timeout` is the interval between status checks, not an overall deadline. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.
+
+```python
+crawl_url = 'https://example.com'
+params = {
+    'crawlerOptions': {
+        'excludes': ['blog/*'],
+        'includes': [], # leave empty for all pages
+        'limit': 1000,
+    },
+    'pageOptions': {
+        'onlyMainContent': True
+    }
+}
+crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5)
+```
+
+If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
+
+### Checking Crawl Status
+
+To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
+
+```python
+job_id = crawl_result['jobId']
+status = app.check_crawl_status(job_id)
+```
+
+## Error Handling
+
+The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
+
+## Contributing
+
+Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
+
+## License
+
+The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).
--- a/apps/python-sdk/setup.py
+++ b/apps/python-sdk/setup.py
@ -1,16 +1,52 @@
-from setuptools import setup, find_packages
+from pathlib import Path
+
+from setuptools import find_packages, setup
+
+this_directory = Path(__file__).parent
+long_description_content = (this_directory / "README.md").read_text()

 setup(
-    name='firecrawl-py',
-    version='0.0.9',
-    url='https://github.com/mendableai/firecrawl',
-    author='Mendable.ai',
-    author_email='nick@mendable.ai',
-    description='Python SDK for Firecrawl API',
+    name="firecrawl-py",
+    version="0.0.11",
+    url="https://github.com/mendableai/firecrawl",
+    author="Mendable.ai",
+    author_email="nick@mendable.ai",
+    description="Python SDK for Firecrawl API",
+    long_description=long_description_content,
+    long_description_content_type="text/markdown",
    packages=find_packages(),
    install_requires=[
        'requests',
        'pytest',
        'python-dotenv',
    ],
+    python_requires='>=3.8',
+    classifiers=[
+        "Development Status :: 5 - Production/Stable",
+        "Environment :: Web Environment",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
+        "Natural Language :: English",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Topic :: Internet",
+        "Topic :: Internet :: WWW/HTTP",
+        "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
+        "Topic :: Software Development",
+        "Topic :: Software Development :: Libraries",
+        "Topic :: Software Development :: Libraries :: Python Modules",
+        "Topic :: Text Processing",
+        "Topic :: Text Processing :: Indexing",
+    ],    
+    keywords="SDK API firecrawl",
+    project_urls={
+        "Documentation": "https://docs.firecrawl.dev",
+        "Source": "https://github.com/mendableai/firecrawl",
+        "Tracker": "https://github.com/mendableai/firecrawl/issues",
+    },
+    license="GNU General Public License v3 (GPLv3)",
 )