From 756f54466d37f00850343cc8ed57979a0d587c50 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 20 May 2024 17:24:21 -0700 Subject: [PATCH 01/10] Nick: allowed keywords for now --- apps/api/src/scraper/WebScraper/utils/blocklist.ts | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index a50e42e..ededfc7 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -1,6 +1,7 @@ const socialMediaBlocklist = [ 'facebook.com', 'twitter.com', + 'x.com', 'instagram.com', 'linkedin.com', 'pinterest.com', @@ -14,12 +15,18 @@ const socialMediaBlocklist = [ 'telegram.org', ]; -const allowedUrls = [ - 'linkedin.com/pulse' +const allowedKeywords = [ + 'pulse', + 'privacy', + 'terms', + 'policy', + 'user-agreement', + 'legal', + 'help' ]; export function isUrlBlocked(url: string): boolean { - if (allowedUrls.some(allowedUrl => url.includes(allowedUrl))) { + if (allowedKeywords.some(keyword => url.includes(keyword))) { return false; } From 7f64fe884a57441ab1103fab8d8ca44a1e92bfd7 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 20 May 2024 17:26:01 -0700 Subject: [PATCH 02/10] Update blocklist.ts --- apps/api/src/scraper/WebScraper/utils/blocklist.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index ededfc7..c3d37c4 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -22,7 +22,14 @@ const allowedKeywords = [ 'policy', 'user-agreement', 'legal', - 'help' + 'help', + 'support', + 'contact', + 'about', + 'careers', + 'blog', + 'press', + 'conditions', ]; export function isUrlBlocked(url: string): boolean { From c47dae13a93d1b54680cb948230173f0de26c68a Mon Sep 17 00:00:00 2001 From: 
youqiang Date: Tue, 21 May 2024 14:53:57 +0800 Subject: [PATCH 03/10] update: wait until body attached in playwright-service --- apps/playwright-service/main.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py index c28bc63..a044597 100644 --- a/apps/playwright-service/main.py +++ b/apps/playwright-service/main.py @@ -5,6 +5,7 @@ from pydantic import BaseModel app = FastAPI() + class UrlModel(BaseModel): url: str wait: int = None @@ -29,9 +30,12 @@ async def shutdown_event(): async def root(body: UrlModel): context = await browser.new_context() page = await context.new_page() - await page.goto(body.url, timeout=15000) # Set max timeout to 15s - if body.wait: # Check if wait parameter is provided in the request body - await page.wait_for_timeout(body.wait) # Convert seconds to milliseconds for playwright + await page.goto( + body.url, + wait_until="load", + timeout=body.wait if body.wait else 15, + ) + await page.wait_for_selector("body", state="attached") page_content = await page.content() await context.close() json_compatible_item_data = {"content": page_content} From f9ae1729b6beeb32bde3d3106baee6cc5bbf3bd6 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 22 May 2024 09:40:38 -0300 Subject: [PATCH 04/10] Update firecrawl.py --- apps/python-sdk/firecrawl/firecrawl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 98cb8ed..b946686 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -45,7 +45,7 @@ class FirecrawlApp: ) if response.status_code == 200: response = response.json() - if response['success']: + if response['success'] and 'data' in response: return response['data'] else: raise Exception(f'Failed to scrape URL. 
Error: {response["error"]}') @@ -70,7 +70,7 @@ class FirecrawlApp: ) if response.status_code == 200: response = response.json() - if response['success'] == True: + if response['success'] and 'data' in response: return response['data'] else: raise Exception(f'Failed to search. Error: {response["error"]}') From 3e63985e53bc795c4633b218b71d5851691fc585 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 22 May 2024 10:40:47 -0700 Subject: [PATCH 05/10] Update main.py --- apps/playwright-service/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py index a044597..544f113 100644 --- a/apps/playwright-service/main.py +++ b/apps/playwright-service/main.py @@ -35,7 +35,6 @@ async def root(body: UrlModel): wait_until="load", timeout=body.wait if body.wait else 15, ) - await page.wait_for_selector("body", state="attached") page_content = await page.content() await context.close() json_compatible_item_data = {"content": page_content} From 3aa5f266272039cda4fb6407d57a218623af6e93 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 22 May 2024 10:45:43 -0700 Subject: [PATCH 06/10] Update main.py --- apps/playwright-service/main.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py index 544f113..a2f5e52 100644 --- a/apps/playwright-service/main.py +++ b/apps/playwright-service/main.py @@ -33,8 +33,12 @@ async def root(body: UrlModel): await page.goto( body.url, wait_until="load", - timeout=body.wait if body.wait else 15, + timeout=body.timeout if body.timeout else 15000, ) + # Wait != timeout. 
Wait is the time to wait after the page is loaded - useful in some cases where "load" / "networkidle" is not enough + if body.wait: + await page.wait_for_timeout(body.wait) + # await page.wait_for_selector("body", state="attached") page_content = await page.content() await context.close() json_compatible_item_data = {"content": page_content} From 4e39701644e724dd01bceebf4488aae1ed7b7900 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 22 May 2024 12:59:56 -0700 Subject: [PATCH 07/10] Update main.py --- apps/playwright-service/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py index a2f5e52..8344adc 100644 --- a/apps/playwright-service/main.py +++ b/apps/playwright-service/main.py @@ -38,7 +38,7 @@ async def root(body: UrlModel): # Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases where "load" / "networkidle" is not enough if body.wait: await page.wait_for_timeout(body.wait) - # await page.wait_for_selector("body", state="attached") + page_content = await page.content() await context.close() json_compatible_item_data = {"content": page_content} From 9562c837eb757f49174d05c8ae3869645b7f5859 Mon Sep 17 00:00:00 2001 From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 24 May 2024 09:34:43 -0300 Subject: [PATCH 08/10] Update issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 35 +++++++++++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.md | 26 +++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..bb47b47 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,35 @@ +--- +name: Bug report +about: Create a report to help us improve +title: "[BUG]" +labels: 
bug +assignees: '' + +--- + +**Describe the Bug** +Provide a clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the issue: +1. Configure the environment or settings with '...' +2. Run the command '...' +3. Observe the error or unexpected output at '...' +4. Log output/error message + +**Expected Behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots or copies of the command line output to help explain the issue. + +**Environment (please complete the following information):** +- OS: [e.g. macOS, Linux, Windows] +- Firecrawl Version: [e.g. 1.2.3] +- Node.js Version: [e.g. 14.x] + +**Logs** +If applicable, include detailed logs to help understand the problem. + +**Additional Context** +Add any other context about the problem here, such as configuration specifics, network conditions, data volumes, etc. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..b01699b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,26 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: "[Feat]" +labels: '' +assignees: '' + +--- + +**Problem Description** +Describe the issue you're experiencing that has prompted this feature request. For example, "I find it difficult when..." + +**Proposed Feature** +Provide a clear and concise description of the feature you would like implemented. + +**Alternatives Considered** +Discuss any alternative solutions or features you've considered. Why were these alternatives not suitable? + +**Implementation Suggestions** +If you have ideas on how the feature could be implemented, share them here. This could include technical details, API changes, or interaction mechanisms. + +**Use Case** +Explain how this feature would be used and what benefits it would bring. 
Include specific examples to illustrate how this would improve functionality or user experience. + +**Additional Context** +Add any other context such as comparisons with similar features in other products, or links to prototypes or mockups. From b001aded46a620021f0265db683c17ab3fc46793 Mon Sep 17 00:00:00 2001 From: Jakob Stadlhuber Date: Fri, 24 May 2024 17:41:34 +0200 Subject: [PATCH 09/10] Add proxy and media blocking configurations Updated environment variables and application settings to include proxy configurations and media blocking option. The proxy settings allow users to use a proxy service, while the media blocking is an optional feature that can help save bandwidth. Changes have been made in the .env.example, docker-compose.yaml, and main.py files. --- apps/api/.env.example | 9 ++++++++- apps/playwright-service/main.py | 20 +++++++++++++++++++- docker-compose.yaml | 4 ++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/apps/api/.env.example b/apps/api/.env.example index 659d68f..0ba20e8 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -35,4 +35,11 @@ STRIPE_PRICE_ID_SCALE= HYPERDX_API_KEY= HDX_NODE_BETA_MODE=1 -FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta \ No newline at end of file +FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta + +# Proxy Settings (Alternative you can can use a proxy service like oxylabs, which rotates IPs for you on every request) +PROXY_SERVER= +PROXY_USERNAME= +PROXY_PASSWORD= +# set if you'd like to block media requests to save proxy bandwidth +BLOCK_MEDIA= \ No newline at end of file diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py index c28bc63..337d283 100644 --- a/apps/playwright-service/main.py +++ b/apps/playwright-service/main.py @@ -2,9 +2,16 @@ from fastapi import FastAPI from playwright.async_api import async_playwright, Browser from fastapi.responses import JSONResponse from pydantic import 
BaseModel +from os import environ + +PROXY_SERVER = environ.get('PROXY_SERVER', None) +PROXY_USERNAME = environ.get('PROXY_USERNAME', None) +PROXY_PASSWORD = environ.get('PROXY_PASSWORD', None) +BLOCK_MEDIA = environ.get('BLOCK_MEDIA', 'False').upper() == 'TRUE' app = FastAPI() + class UrlModel(BaseModel): url: str wait: int = None @@ -27,7 +34,18 @@ async def shutdown_event(): @app.post("/html") async def root(body: UrlModel): - context = await browser.new_context() + context = None + if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD: + context = await browser.new_context(proxy={"server": PROXY_SERVER, + "username": PROXY_USERNAME, + "password": PROXY_PASSWORD}) + else: + context = await browser.new_context() + + if BLOCK_MEDIA: + await context.route("**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}", + handler=lambda route, request: route.abort()) + page = await context.new_page() await page.goto(body.url, timeout=15000) # Set max timeout to 15s if body.wait: # Check if wait parameter is provided in the request body diff --git a/docker-compose.yaml b/docker-compose.yaml index 049672d..c95ccc9 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -5,6 +5,10 @@ services: build: apps/playwright-service environment: - PORT=3000 + - PROXY_SERVER=${PROXY_SERVER} + - PROXY_USERNAME=${PROXY_USERNAME} + - PROXY_PASSWORD=${PROXY_PASSWORD} + - BLOCK_MEDIA=${BLOCK_MEDIA} networks: - backend From 9fc5a0ff98492c9c6bfba517f995e2c36cb2569a Mon Sep 17 00:00:00 2001 From: Jakob Stadlhuber Date: Fri, 24 May 2024 17:45:59 +0200 Subject: [PATCH 10/10] Update comment in .env.example for proxy settings This commit modifies the comment in .env.example to specify that proxy settings are for Playwright. This clarification aims to provide users a more clear context about when and why these proxy settings are used. 
--- apps/api/.env.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/.env.example b/apps/api/.env.example index 0ba20e8..735444b 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -37,7 +37,7 @@ HDX_NODE_BETA_MODE=1 FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta -# Proxy Settings (Alternative you can can use a proxy service like oxylabs, which rotates IPs for you on every request) +# Proxy Settings for Playwright (Alternatively, you can use a proxy service like oxylabs, which rotates IPs for you on every request) PROXY_SERVER= PROXY_USERNAME= PROXY_PASSWORD=