0

Merge branch 'main' into main

This commit is contained in:
Nicolas 2024-05-24 09:47:12 -07:00 committed by GitHub
commit 65fe9c4f80
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 123 additions and 9 deletions

35
.github/ISSUE_TEMPLATE/bug_report.md vendored Normal file
View File

@ -0,0 +1,35 @@
---
name: Bug report
about: Create a report to help us improve
title: "[BUG]"
labels: bug
assignees: ''
---
**Describe the Bug**
Provide a clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the issue:
1. Configure the environment or settings with '...'
2. Run the command '...'
3. Observe the error or unexpected output at '...'
4. Log output/error message
**Expected Behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots or copies of the command line output to help explain the issue.
**Environment (please complete the following information):**
- OS: [e.g. macOS, Linux, Windows]
- Firecrawl Version: [e.g. 1.2.3]
- Node.js Version: [e.g. 14.x]
**Logs**
If applicable, include detailed logs to help understand the problem.
**Additional Context**
Add any other context about the problem here, such as configuration specifics, network conditions, data volumes, etc.

View File

@ -0,0 +1,26 @@
---
name: Feature request
about: Suggest an idea for this project
title: "[Feat]"
labels: ''
assignees: ''
---
**Problem Description**
Describe the issue you're experiencing that has prompted this feature request. For example, "I find it difficult when..."
**Proposed Feature**
Provide a clear and concise description of the feature you would like implemented.
**Alternatives Considered**
Discuss any alternative solutions or features you've considered. Why were these alternatives not suitable?
**Implementation Suggestions**
If you have ideas on how the feature could be implemented, share them here. This could include technical details, API changes, or interaction mechanisms.
**Use Case**
Explain how this feature would be used and what benefits it would bring. Include specific examples to illustrate how this would improve functionality or user experience.
**Additional Context**
Add any other context such as comparisons with similar features in other products, or links to prototypes or mockups.

View File

@ -36,3 +36,10 @@ HYPERDX_API_KEY=
HDX_NODE_BETA_MODE=1
FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta
# Proxy Settings for Playwright (Alternatively, you can use a proxy service like oxylabs, which rotates IPs for you on every request)
PROXY_SERVER=
PROXY_USERNAME=
PROXY_PASSWORD=
# set if you'd like to block media requests to save proxy bandwidth
BLOCK_MEDIA=

View File

@ -1,6 +1,7 @@
const socialMediaBlocklist = [
'facebook.com',
'twitter.com',
'x.com',
'instagram.com',
'linkedin.com',
'pinterest.com',
@ -14,12 +15,25 @@ const socialMediaBlocklist = [
'telegram.org',
];
const allowedUrls = [
'linkedin.com/pulse'
const allowedKeywords = [
'pulse',
'privacy',
'terms',
'policy',
'user-agreement',
'legal',
'help',
'support',
'contact',
'about',
'careers',
'blog',
'press',
'conditions',
];
export function isUrlBlocked(url: string): boolean {
if (allowedUrls.some(allowedUrl => url.includes(allowedUrl))) {
if (allowedKeywords.some(keyword => url.includes(keyword))) {
return false;
}

View File

@ -2,9 +2,16 @@ from fastapi import FastAPI
from playwright.async_api import async_playwright, Browser
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from os import environ
PROXY_SERVER = environ.get('PROXY_SERVER', None)
PROXY_USERNAME = environ.get('PROXY_USERNAME', None)
PROXY_PASSWORD = environ.get('PROXY_PASSWORD', None)
BLOCK_MEDIA = environ.get('BLOCK_MEDIA', 'False').upper() == 'TRUE'
app = FastAPI()
class UrlModel(BaseModel):
url: str
wait: int = None
@ -27,11 +34,28 @@ async def shutdown_event():
@app.post("/html")
async def root(body: UrlModel):
context = await browser.new_context()
context = None
if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
context = await browser.new_context(proxy={"server": PROXY_SERVER,
"username": PROXY_USERNAME,
"password": PROXY_PASSWORD})
else:
context = await browser.new_context()
if BLOCK_MEDIA:
await context.route("**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}",
handler=lambda route, request: route.abort())
page = await context.new_page()
await page.goto(body.url, timeout=15000) # Set max timeout to 15s
if body.wait: # Check if wait parameter is provided in the request body
await page.wait_for_timeout(body.wait) # Convert seconds to milliseconds for playwright
await page.goto(
body.url,
wait_until="load",
timeout=body.timeout if body.timeout else 15000,
)
# Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases where "load" / "networkidle" are not enough
if body.wait:
await page.wait_for_timeout(body.wait)
page_content = await page.content()
await context.close()
json_compatible_item_data = {"content": page_content}

View File

@ -77,7 +77,7 @@ class FirecrawlApp:
)
if response.status_code == 200:
response = response.json()
if response['success']:
if response['success'] and 'data' in response:
return response['data']
else:
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
@ -115,7 +115,11 @@ class FirecrawlApp:
)
if response.status_code == 200:
response = response.json()
<<<<<<< main
if response['success']:
=======
if response['success'] and 'data' in response:
>>>>>>> main
return response['data']
else:
raise Exception(f'Failed to search. Error: {response["error"]}')

View File

@ -5,6 +5,10 @@ services:
build: apps/playwright-service
environment:
- PORT=3000
- PROXY_SERVER=${PROXY_SERVER}
- PROXY_USERNAME=${PROXY_USERNAME}
- PROXY_PASSWORD=${PROXY_PASSWORD}
- BLOCK_MEDIA=${BLOCK_MEDIA}
networks:
- backend