Merge branch 'main' into feat/idempotency-key

2024-05-24 14:15:37 -03:00 · 2024-05-24 14:15:37 -03:00 · d39860c08b
commit d39860c08b
parent c201ea1986 605ba4c031
8 changed files with 274 additions and 25 deletions
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@ -0,0 +1,35 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: "[BUG]"
+labels: bug
+assignees: ''
+
+---
+
+**Describe the Bug**
+Provide a clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the issue:
+1. Configure the environment or settings with '...'
+2. Run the command '...'
+3. Observe the error or unexpected output at '...'
+4. Log output/error message
+
+**Expected Behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots or copies of the command line output to help explain the issue.
+
+**Environment (please complete the following information):**
+- OS: [e.g. macOS, Linux, Windows]
+- Firecrawl Version: [e.g. 1.2.3]
+- Node.js Version: [e.g. 14.x]
+
+**Logs**
+If applicable, include detailed logs to help understand the problem.
+
+**Additional Context**
+Add any other context about the problem here, such as configuration specifics, network conditions, data volumes, etc.
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@ -0,0 +1,26 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: "[Feat]"
+labels: ''
+assignees: ''
+
+---
+
+**Problem Description**
+Describe the issue you're experiencing that has prompted this feature request. For example, "I find it difficult when..."
+
+**Proposed Feature**
+Provide a clear and concise description of the feature you would like implemented.
+
+**Alternatives Considered**
+Discuss any alternative solutions or features you've considered. Why were these alternatives not suitable?
+
+**Implementation Suggestions**
+If you have ideas on how the feature could be implemented, share them here. This could include technical details, API changes, or interaction mechanisms.
+
+**Use Case**
+Explain how this feature would be used and what benefits it would bring. Include specific examples to illustrate how this would improve functionality or user experience.
+
+**Additional Context**
+Add any other context such as comparisons with similar features in other products, or links to prototypes or mockups.
--- a/apps/api/.env.example
+++ b/apps/api/.env.example
@ -36,3 +36,10 @@ HYPERDX_API_KEY=
 HDX_NODE_BETA_MODE=1

 FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta
+
+# Proxy Settings for Playwright (Alternatively, you can use a proxy service like oxylabs, which rotates IPs for you on every request)
+PROXY_SERVER=
+PROXY_USERNAME=
+PROXY_PASSWORD=
+# set if you'd like to block media requests to save proxy bandwidth
+BLOCK_MEDIA=
--- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts
+++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
@ -1,6 +1,7 @@
 const socialMediaBlocklist = [
  'facebook.com',
  'twitter.com',
+  'x.com',
  'instagram.com',
  'linkedin.com',
  'pinterest.com',
@ -14,12 +15,25 @@ const socialMediaBlocklist = [
  'telegram.org',
 ];

-const allowedUrls = [
-  'linkedin.com/pulse'
+const allowedKeywords = [
+  'pulse',
+  'privacy',
+  'terms',
+  'policy',
+  'user-agreement',
+  'legal',
+  'help',
+  'support',
+  'contact',
+  'about',
+  'careers',
+  'blog',
+  'press',
+  'conditions',
 ];

 export function isUrlBlocked(url: string): boolean {
-  if (allowedUrls.some(allowedUrl => url.includes(allowedUrl))) {
+  if (allowedKeywords.some(keyword => url.includes(keyword))) {
    return false;
  }

--- a/apps/playwright-service/main.py
+++ b/apps/playwright-service/main.py
@ -2,9 +2,16 @@ from fastapi import FastAPI
 from playwright.async_api import async_playwright, Browser
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
+from os import environ
+
+PROXY_SERVER = environ.get('PROXY_SERVER', None)
+PROXY_USERNAME = environ.get('PROXY_USERNAME', None)
+PROXY_PASSWORD = environ.get('PROXY_PASSWORD', None)
+BLOCK_MEDIA = environ.get('BLOCK_MEDIA', 'False').upper() == 'TRUE'

 app = FastAPI()

+
 class UrlModel(BaseModel):
    url: str
    wait: int = None
@ -27,11 +34,28 @@ async def shutdown_event():

@app.post("/html")
 async def root(body: UrlModel):
+    context = None
+    if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
+        context = await browser.new_context(proxy={"server": PROXY_SERVER,
+                                                   "username": PROXY_USERNAME,
+                                                   "password": PROXY_PASSWORD})
+    else:
        context = await browser.new_context()
+
+    if BLOCK_MEDIA:
+        await context.route("**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}",
+                            handler=lambda route, request: route.abort())
+
    page = await context.new_page()
-    await page.goto(body.url, timeout=15000)  # Set max timeout to 15s
-    if body.wait:  # Check if wait parameter is provided in the request body
-        await page.wait_for_timeout(body.wait)  # Convert seconds to milliseconds for playwright
+    await page.goto(
+        body.url,
+        wait_until="load",
+        timeout=body.timeout if body.timeout else 15000,
+    )
+    # Wait != timeout. Wait is the extra time to wait after the page has loaded - useful in cases where "load" / "networkidle" is not enough
+    if body.wait:
+        await page.wait_for_timeout(body.wait)
+        
    page_content = await page.content()
    await context.close()
    json_compatible_item_data = {"content": page_content}
--- a/apps/python-sdk/.pylintrc
+++ b/apps/python-sdk/.pylintrc
@ -0,0 +1,2 @@
+[FORMAT]
+max-line-length = 120
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@ -1,18 +1,50 @@
+"""
+FirecrawlApp Module
+
+This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs. The module uses requests for HTTP communication
+and handles retries for certain HTTP status codes.
+
+Classes:
+    - FirecrawlApp: Main class for interacting with the Firecrawl API.
+"""
+
 import os
-from typing import Any, Dict, Optional
-import requests
 import time
+from typing import Any, Dict, Optional
+
+import requests
+

 class FirecrawlApp:
-    def __init__(self, api_key=None, api_url='https://api.firecrawl.dev'):
+    """
+    Client for interacting with the Firecrawl API.
+
+    Args:
+        api_key (Optional[str]): API key for authenticating with the Firecrawl API.
+        api_url (Optional[str]): Base URL for the Firecrawl API.
+    """
+    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
        if self.api_key is None:
            raise ValueError('No API key provided')
-        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL')
-    
-    
-
+        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+        """
+        Scrape the specified URL using the Firecrawl API.
+
+        Args:
+            url (str): The URL to scrape.
+            params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
+
+        Returns:
+            Any: The scraped data if the request is successful.
+
+        Raises:
+            Exception: If the scrape request fails.
+        """
+
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}'
@ -41,11 +73,11 @@ class FirecrawlApp:
        response = requests.post(
            f'{self.api_url}/v0/scrape',
            headers=headers,
-            json=scrape_params
+            json=scrape_params,
        )
        if response.status_code == 200:
            response = response.json()
-            if response['success']:
+            if response['success'] and 'data' in response:
                return response['data']
            else:
                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
@ -56,6 +88,19 @@ class FirecrawlApp:
            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')

    def search(self, query, params=None):
+        """
+        Perform a search using the Firecrawl API.
+
+        Args:
+            query (str): The search query.
+            params (Optional[Dict[str, Any]]): Additional parameters for the search request.
+
+        Returns:
+            Any: The search results if the request is successful.
+
+        Raises:
+            Exception: If the search request fails.
+        """
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}'
@ -70,7 +115,8 @@ class FirecrawlApp:
        )
        if response.status_code == 200:
            response = response.json()
-            if response['success'] == True:
+            
+            if response['success'] and 'data' in response:
                return response['data']
            else:
                raise Exception(f'Failed to search. Error: {response["error"]}')
@ -82,6 +128,22 @@ class FirecrawlApp:
            raise Exception(f'Failed to search. Status code: {response.status_code}')

    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None):
+        """
+        Initiate a crawl job for the specified URL using the Firecrawl API.
+
+        Args:
+            url (str): The URL to crawl.
+            params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
+            wait_until_done (bool): Whether to wait until the crawl job is completed.
+            timeout (int): Timeout between status checks when waiting for job completion.
+            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+
+        Returns:
+            Any: The crawl job ID or the crawl results if waiting until completion.
+
+        Raises:
+            Exception: If the crawl job initiation or monitoring fails.
+        """
        headers = self._prepare_headers(idempotency_key)
        json_data = {'url': url}
        if params:
@ -97,6 +159,18 @@ class FirecrawlApp:
            self._handle_error(response, 'start crawl job')

    def check_crawl_status(self, job_id):
+        """
+        Check the status of a crawl job using the Firecrawl API.
+
+        Args:
+            job_id (str): The ID of the crawl job.
+
+        Returns:
+            Any: The status of the crawl job.
+
+        Raises:
+            Exception: If the status check request fails.
+        """
        headers = self._prepare_headers()
        response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
        if response.status_code == 200:
@ -105,18 +179,44 @@ class FirecrawlApp:
            self._handle_error(response, 'check crawl status')

    def _prepare_headers(self, idempotency_key=None):
+        """
+        Prepare the headers for API requests.
+
+        Args:
+            idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
+
+        Returns:
+            Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
+        """
        if idempotency_key:
            return {
                'Content-Type': 'application/json',
                'Authorization': f'Bearer {self.api_key}',
                'x-idempotency-key': idempotency_key
            }
+
        return {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}',
        }

    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+        """
+        Make a POST request with retries.
+
+        Args:
+            url (str): The URL to send the POST request to.
+            data (Dict[str, Any]): The JSON data to include in the POST request.
+            headers (Dict[str, str]): The headers to include in the POST request.
+            retries (int): Number of retries for the request.
+            backoff_factor (float): Backoff factor for retries.
+
+        Returns:
+            requests.Response: The response from the POST request.
+
+        Raises:
+            requests.RequestException: If the request fails after the specified retries.
+        """
        for attempt in range(retries):
            response = requests.post(url, headers=headers, json=data)
            if response.status_code == 502:
@ -126,6 +226,21 @@ class FirecrawlApp:
        return response

    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+        """
+        Make a GET request with retries.
+
+        Args:
+            url (str): The URL to send the GET request to.
+            headers (Dict[str, str]): The headers to include in the GET request.
+            retries (int): Number of retries for the request.
+            backoff_factor (float): Backoff factor for retries.
+
+        Returns:
+            requests.Response: The response from the GET request.
+
+        Raises:
+            requests.RequestException: If the request fails after the specified retries.
+        """
        for attempt in range(retries):
            response = requests.get(url, headers=headers)
            if response.status_code == 502:
@ -135,7 +250,20 @@ class FirecrawlApp:
        return response

    def _monitor_job_status(self, job_id, headers, timeout):
-        import time
+        """
+        Monitor the status of a crawl job until completion.
+
+        Args:
+            job_id (str): The ID of the crawl job.
+            headers (Dict[str, str]): The headers to include in the status check requests.
+            timeout (int): Seconds to sleep between status checks; values below 2 are raised to 2.
+
+        Returns:
+            Any: The crawl results if the job is completed successfully.
+
+        Raises:
+            Exception: If the job fails or an error occurs during status checks.
+        """
        while True:
            status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
            if status_response.status_code == 200:
@ -146,8 +274,7 @@ class FirecrawlApp:
                    else:
                        raise Exception('Crawl job completed but no data was returned')
                elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
-                    if timeout < 2:
-                        timeout = 2
+                    timeout=max(timeout,2)
                    time.sleep(timeout)  # Wait for the specified timeout before checking again
                else:
                    raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
@ -155,6 +282,16 @@ class FirecrawlApp:
                self._handle_error(status_response, 'check crawl status')

    def _handle_error(self, response, action):
+        """
+        Handle errors from API responses.
+
+        Args:
+            response (requests.Response): The response object from the API request.
+            action (str): Description of the action that was being performed.
+
+        Raises:
+            Exception: An exception with a message containing the status code and error details from the response.
+        """
        if response.status_code in [402, 408, 409, 500]:
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@ -5,6 +5,10 @@ services:
    build: apps/playwright-service
    environment:
      - PORT=3000
+      - PROXY_SERVER=${PROXY_SERVER}
+      - PROXY_USERNAME=${PROXY_USERNAME}
+      - PROXY_PASSWORD=${PROXY_PASSWORD}
+      - BLOCK_MEDIA=${BLOCK_MEDIA}
    networks:
      - backend