diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000..bb47b47
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,35 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: "[BUG]"
+labels: bug
+assignees: ''
+
+---
+
+**Describe the Bug**
+Provide a clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the issue:
+1. Configure the environment or settings with '...'
+2. Run the command '...'
+3. Observe the error or unexpected output at '...'
+4. Attach the relevant log output or error message
+
+**Expected Behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots or copies of the command line output to help explain the issue.
+
+**Environment (please complete the following information):**
+- OS: [e.g. macOS, Linux, Windows]
+- Firecrawl Version: [e.g. 1.2.3]
+- Node.js Version: [e.g. 14.x]
+
+**Logs**
+If applicable, include detailed logs to help understand the problem.
+
+**Additional Context**
+Add any other context about the problem here, such as configuration specifics, network conditions, data volumes, etc.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000..b01699b
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,26 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: "[Feat]"
+labels: ''
+assignees: ''
+
+---
+
+**Problem Description**
+Describe the issue you're experiencing that has prompted this feature request. For example, "I find it difficult when..."
+
+**Proposed Feature**
+Provide a clear and concise description of the feature you would like implemented.
+
+**Alternatives Considered**
+Discuss any alternative solutions or features you've considered. Why were these alternatives not suitable?
+
+**Implementation Suggestions**
+If you have ideas on how the feature could be implemented, share them here. This could include technical details, API changes, or interaction mechanisms.
+
+**Use Case**
+Explain how this feature would be used and what benefits it would bring. Include specific examples to illustrate how this would improve functionality or user experience.
+
+**Additional Context**
+Add any other context such as comparisons with similar features in other products, or links to prototypes or mockups.
diff --git a/apps/api/.env.example b/apps/api/.env.example
index 659d68f..735444b 100644
--- a/apps/api/.env.example
+++ b/apps/api/.env.example
@@ -35,4 +35,11 @@ STRIPE_PRICE_ID_SCALE=
 HYPERDX_API_KEY=
 HDX_NODE_BETA_MODE=1
 
-FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta
\ No newline at end of file
+FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta
+
+# Proxy Settings for Playwright (alternatively, you can use a proxy service like Oxylabs, which rotates IPs for you on every request)
+PROXY_SERVER=
+PROXY_USERNAME=
+PROXY_PASSWORD=
+# set if you'd like to block media requests to save proxy bandwidth
+BLOCK_MEDIA=
\ No newline at end of file
diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
index a50e42e..c3d37c4 100644
--- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts
+++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
@@ -1,6 +1,7 @@
 const socialMediaBlocklist = [
   'facebook.com',
   'twitter.com',
+  'x.com',
   'instagram.com',
   'linkedin.com',
   'pinterest.com',
@@ -14,12 +15,25 @@ const socialMediaBlocklist = [
   'telegram.org',
 ];
 
-const allowedUrls = [
-  'linkedin.com/pulse'
+const allowedKeywords = [
+  'pulse',
+  'privacy',
+  'terms',
+  'policy',
+  'user-agreement',
+  'legal',
+  'help',
+  'support',
+  'contact',
+  'about',
+  'careers',
+  'blog',
+  'press',
+  'conditions',
 ];
 
 export function isUrlBlocked(url: string): boolean {
-  if (allowedUrls.some(allowedUrl => url.includes(allowedUrl))) {
+  if (allowedKeywords.some(keyword => url.includes(keyword))) {
     return false;
   }
 
diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py
index c28bc63..9cb0c4e 100644
--- a/apps/playwright-service/main.py
+++ b/apps/playwright-service/main.py
@@ -2,9 +2,16 @@ from fastapi import FastAPI
 from playwright.async_api import async_playwright, Browser
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
+from os import environ
+
+PROXY_SERVER = environ.get('PROXY_SERVER', None)
+PROXY_USERNAME = environ.get('PROXY_USERNAME', None)
+PROXY_PASSWORD = environ.get('PROXY_PASSWORD', None)
+BLOCK_MEDIA = environ.get('BLOCK_MEDIA', 'False').upper() == 'TRUE'
 
 app = FastAPI()
 
+
 class UrlModel(BaseModel):
     url: str
     wait: int = None
@@ -27,11 +34,28 @@ async def shutdown_event():
 
 @app.post("/html")
 async def root(body: UrlModel):
-    context = await browser.new_context()
+    context = None
+    if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
+        context = await browser.new_context(proxy={"server": PROXY_SERVER,
+                                                    "username": PROXY_USERNAME,
+                                                    "password": PROXY_PASSWORD})
+    else:
+        context = await browser.new_context()
+
+    if BLOCK_MEDIA:
+        await context.route("**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}",
+                            handler=lambda route, request: route.abort())
+
     page = await context.new_page()
-    await page.goto(body.url, timeout=15000)  # Set max timeout to 15s
-    if body.wait:  # Check if wait parameter is provided in the request body
-        await page.wait_for_timeout(body.wait)  # Convert seconds to milliseconds for playwright
+    await page.goto(
+        body.url,
+        wait_until="load",
+        timeout=body.timeout if body.timeout else 15000,
+    )
+    # Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases where "load" / "networkidle" is not enough
+    if body.wait:
+        await page.wait_for_timeout(body.wait)
+
    page_content = await page.content()
    await context.close()
    json_compatible_item_data = {"content": page_content}
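For a quick local check of the updated endpoint, a request along the lines below should exercise the new proxy, `wait`, and `timeout` handling. This is a minimal sketch: it assumes the playwright-service is running and reachable at `http://localhost:3000` (matching `PORT=3000` from docker-compose.yaml), and the target URL is illustrative.

```python
# Minimal sketch: call the playwright-service /html endpoint directly.
# Assumes the service is reachable at http://localhost:3000 (PORT=3000 in docker-compose.yaml).
import requests

payload = {
    "url": "https://example.com",   # illustrative target
    "timeout": 15000,               # max time for page.goto, in milliseconds
    "wait": 2000,                   # extra pause after "load", in milliseconds
}

resp = requests.post("http://localhost:3000/html", json=payload, timeout=30)
resp.raise_for_status()
print(resp.json()["content"][:200])  # the endpoint returns {"content": "<page HTML>"}
```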
diff --git a/apps/python-sdk/.pylintrc b/apps/python-sdk/.pylintrc
new file mode 100644
index 0000000..a580885
--- /dev/null
+++ b/apps/python-sdk/.pylintrc
@@ -0,0 +1,2 @@
+[FORMAT]
+max-line-length = 120
\ No newline at end of file
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 1f59ec7..6c0bc41 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -1,25 +1,57 @@
+"""
+FirecrawlApp Module
+
+This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs. The module uses requests for HTTP communication
+and handles retries for certain HTTP status codes.
+
+Classes:
+    - FirecrawlApp: Main class for interacting with the Firecrawl API.
+"""
+
 import os
-from typing import Any, Dict, Optional
-import requests
 import time
+from typing import Any, Dict, Optional
+
+import requests
+
 
 class FirecrawlApp:
-    def __init__(self, api_key=None, api_url='https://api.firecrawl.dev'):
+    """
+    Initialize the FirecrawlApp instance.
+
+    Args:
+        api_key (Optional[str]): API key for authenticating with the Firecrawl API.
+        api_url (Optional[str]): Base URL for the Firecrawl API.
+    """
+    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
         if self.api_key is None:
             raise ValueError('No API key provided')
-        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL')
-
-
-
+        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
     def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+        """
+        Scrape the specified URL using the Firecrawl API.
+
+        Args:
+            url (str): The URL to scrape.
+            params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
+
+        Returns:
+            Any: The scraped data if the request is successful.
+
+        Raises:
+            Exception: If the scrape request fails.
+        """
+
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
         }
         # Prepare the base scrape parameters with the URL
         scrape_params = {'url': url}
-
+
         # If there are additional params, process them
         if params:
             # Initialize extractorOptions if present
@@ -32,7 +64,7 @@ class FirecrawlApp:
                 extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
                 # Update the scrape_params with the processed extractorOptions
                 scrape_params['extractorOptions'] = extractor_options
-
+
             # Include any other params directly at the top level of scrape_params
             for key, value in params.items():
                 if key != 'extractorOptions':
@@ -41,11 +73,11 @@ class FirecrawlApp:
         response = requests.post(
             f'{self.api_url}/v0/scrape',
             headers=headers,
-            json=scrape_params
+            json=scrape_params,
         )
         if response.status_code == 200:
             response = response.json()
-            if response['success']:
+            if response['success'] and 'data' in response:
                 return response['data']
             else:
                 raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
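As a usage sketch for the scrape path above: the URL and schema are illustrative, the call assumes `FIRECRAWL_API_KEY` is exported, and it assumes the package exposes `FirecrawlApp` at the top level as in the published firecrawl-py SDK.

```python
# Minimal sketch: scrape a page with LLM extraction options via the SDK.
# Assumes FIRECRAWL_API_KEY is set; URL and schema below are illustrative only.
from firecrawl import FirecrawlApp

app = FirecrawlApp()  # api_url now falls back to https://api.firecrawl.dev

data = app.scrape_url(
    "https://example.com/blog/post",
    params={
        "extractorOptions": {
            # 'mode' defaults to 'llm-extraction' when omitted (see the diff above)
            "extractionSchema": {
                "type": "object",
                "properties": {"title": {"type": "string"}},
            },
        },
    },
)
print(data)  # the 'data' payload returned by /v0/scrape
```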
@@ -54,8 +86,21 @@ class FirecrawlApp:
             raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
         else:
             raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
-
+
     def search(self, query, params=None):
+        """
+        Perform a search using the Firecrawl API.
+
+        Args:
+            query (str): The search query.
+            params (Optional[Dict[str, Any]]): Additional parameters for the search request.
+
+        Returns:
+            Any: The search results if the request is successful.
+
+        Raises:
+            Exception: If the search request fails.
+        """
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
@@ -70,11 +115,12 @@ class FirecrawlApp:
         )
         if response.status_code == 200:
             response = response.json()
-            if response['success'] == True:
+
+            if response['success'] and 'data' in response:
                 return response['data']
             else:
                 raise Exception(f'Failed to search. Error: {response["error"]}')
-
+
         elif response.status_code in [402, 409, 500]:
             error_message = response.json().get('error', 'Unknown error occurred')
             raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
@@ -82,6 +128,22 @@ class FirecrawlApp:
             raise Exception(f'Failed to search. Status code: {response.status_code}')
 
     def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None):
+        """
+        Initiate a crawl job for the specified URL using the Firecrawl API.
+
+        Args:
+            url (str): The URL to crawl.
+            params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
+            wait_until_done (bool): Whether to wait until the crawl job is completed.
+            timeout (int): Timeout between status checks when waiting for job completion.
+            idempotency_key (Optional[str]): A unique UUID key to ensure idempotency of requests.
+
+        Returns:
+            Any: The crawl job ID or the crawl results if waiting until completion.
+
+        Raises:
+            Exception: If the crawl job initiation or monitoring fails.
+        """
         headers = self._prepare_headers(idempotency_key)
         json_data = {'url': url}
         if params:
@@ -97,6 +159,18 @@ class FirecrawlApp:
             self._handle_error(response, 'start crawl job')
 
     def check_crawl_status(self, job_id):
+        """
+        Check the status of a crawl job using the Firecrawl API.
+
+        Args:
+            job_id (str): The ID of the crawl job.
+
+        Returns:
+            Any: The status of the crawl job.
+
+        Raises:
+            Exception: If the status check request fails.
+        """
         headers = self._prepare_headers()
         response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
         if response.status_code == 200:
@@ -105,18 +179,44 @@ class FirecrawlApp:
             return response.json()
         else:
             self._handle_error(response, 'check crawl status')
     def _prepare_headers(self, idempotency_key=None):
+        """
+        Prepare the headers for API requests.
+
+        Args:
+            idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
+
+        Returns:
+            Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
+        """
         if idempotency_key:
             return {
                 'Content-Type': 'application/json',
                 'Authorization': f'Bearer {self.api_key}',
                 'x-idempotency-key': idempotency_key
             }
+
         return {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}',
         }
     def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+        """
+        Make a POST request with retries.
+
+        Args:
+            url (str): The URL to send the POST request to.
+            data (Dict[str, Any]): The JSON data to include in the POST request.
+            headers (Dict[str, str]): The headers to include in the POST request.
+            retries (int): Number of retries for the request.
+            backoff_factor (float): Backoff factor for retries.
+
+        Returns:
+            requests.Response: The response from the POST request.
+
+        Raises:
+            requests.RequestException: If the request fails after the specified retries.
+        """
         for attempt in range(retries):
             response = requests.post(url, headers=headers, json=data)
             if response.status_code == 502:
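A usage sketch for the crawl methods above (illustrative URL and crawler options; assumes `FIRECRAWL_API_KEY` is exported). Per the docstring, `crawl_url` returns the job ID when `wait_until_done=False`:

```python
# Minimal sketch: start a crawl without blocking, then poll its status once.
# Assumes FIRECRAWL_API_KEY is set; URL and crawlerOptions are illustrative.
import uuid
from firecrawl import FirecrawlApp

app = FirecrawlApp()

job_id = app.crawl_url(
    "https://example.com",
    params={"crawlerOptions": {"limit": 5}},  # hypothetical crawl parameters
    wait_until_done=False,                    # return the job ID instead of blocking
    idempotency_key=str(uuid.uuid4()),        # retrying with the same key won't start a duplicate job
)

status = app.check_crawl_status(job_id)
print(status.get("status"))  # e.g. 'active', 'pending', 'completed'
```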
@@ -126,6 +226,21 @@ class FirecrawlApp:
         return response
 
     def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+        """
+        Make a GET request with retries.
+
+        Args:
+            url (str): The URL to send the GET request to.
+            headers (Dict[str, str]): The headers to include in the GET request.
+            retries (int): Number of retries for the request.
+            backoff_factor (float): Backoff factor for retries.
+
+        Returns:
+            requests.Response: The response from the GET request.
+
+        Raises:
+            requests.RequestException: If the request fails after the specified retries.
+        """
         for attempt in range(retries):
             response = requests.get(url, headers=headers)
             if response.status_code == 502:
@@ -135,7 +250,20 @@ class FirecrawlApp:
         return response
 
     def _monitor_job_status(self, job_id, headers, timeout):
-        import time
+        """
+        Monitor the status of a crawl job until completion.
+
+        Args:
+            job_id (str): The ID of the crawl job.
+            headers (Dict[str, str]): The headers to include in the status check requests.
+            timeout (int): Timeout between status checks.
+
+        Returns:
+            Any: The crawl results if the job is completed successfully.
+
+        Raises:
+            Exception: If the job fails or an error occurs during status checks.
+        """
         while True:
             status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
             if status_response.status_code == 200:
@@ -146,8 +274,7 @@ class FirecrawlApp:
                     else:
                         raise Exception('Crawl job completed but no data was returned')
                 elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
-                    if timeout < 2:
-                        timeout = 2
+                    timeout = max(timeout, 2)
                     time.sleep(timeout)  # Wait for the specified timeout before checking again
                 else:
                     raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
@@ -155,6 +282,16 @@ class FirecrawlApp:
             self._handle_error(status_response, 'check crawl status')
 
     def _handle_error(self, response, action):
+        """
+        Handle errors from API responses.
+
+        Args:
+            response (requests.Response): The response object from the API request.
+            action (str): Description of the action that was being performed.
+
+        Raises:
+            Exception: An exception with a message containing the status code and error details from the response.
+        """
         if response.status_code in [402, 408, 409, 500]:
             error_message = response.json().get('error', 'Unknown error occurred')
             raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 049672d..c95ccc9 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -5,6 +5,10 @@ services:
     build: apps/playwright-service
     environment:
       - PORT=3000
+      - PROXY_SERVER=${PROXY_SERVER}
+      - PROXY_USERNAME=${PROXY_USERNAME}
+      - PROXY_PASSWORD=${PROXY_PASSWORD}
+      - BLOCK_MEDIA=${BLOCK_MEDIA}
     networks:
       - backend
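Finally, to round out the SDK surface touched in this change, a `search()` call might look like the following (illustrative query; assumes `FIRECRAWL_API_KEY` is exported):

```python
# Minimal sketch: run a search through the SDK and inspect the returned data.
# Assumes FIRECRAWL_API_KEY is set in the environment; the query is illustrative.
from firecrawl import FirecrawlApp

app = FirecrawlApp()
results = app.search("open source web scraping frameworks")
print(results)  # the 'data' payload returned by /v0/search
```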