Merge branch 'main' into main
This commit is contained in:
commit
65fe9c4f80
35
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
35
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
@ -0,0 +1,35 @@
|
||||
---
|
||||
name: Bug report
|
||||
about: Create a report to help us improve
|
||||
title: "[BUG]"
|
||||
labels: bug
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
**Describe the Bug**
|
||||
Provide a clear and concise description of what the bug is.
|
||||
|
||||
**To Reproduce**
|
||||
Steps to reproduce the issue:
|
||||
1. Configure the environment or settings with '...'
|
||||
2. Run the command '...'
|
||||
3. Observe the error or unexpected output at '...'
|
||||
4. Log output/error message
|
||||
|
||||
**Expected Behavior**
|
||||
A clear and concise description of what you expected to happen.
|
||||
|
||||
**Screenshots**
|
||||
If applicable, add screenshots or copies of the command line output to help explain the issue.
|
||||
|
||||
**Environment (please complete the following information):**
|
||||
- OS: [e.g. macOS, Linux, Windows]
|
||||
- Firecrawl Version: [e.g. 1.2.3]
|
||||
- Node.js Version: [e.g. 14.x]
|
||||
|
||||
**Logs**
|
||||
If applicable, include detailed logs to help understand the problem.
|
||||
|
||||
**Additional Context**
|
||||
Add any other context about the problem here, such as configuration specifics, network conditions, data volumes, etc.
|
26
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
26
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
@ -0,0 +1,26 @@
|
||||
---
|
||||
name: Feature request
|
||||
about: Suggest an idea for this project
|
||||
title: "[Feat]"
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
**Problem Description**
|
||||
Describe the issue you're experiencing that has prompted this feature request. For example, "I find it difficult when..."
|
||||
|
||||
**Proposed Feature**
|
||||
Provide a clear and concise description of the feature you would like implemented.
|
||||
|
||||
**Alternatives Considered**
|
||||
Discuss any alternative solutions or features you've considered. Why were these alternatives not suitable?
|
||||
|
||||
**Implementation Suggestions**
|
||||
If you have ideas on how the feature could be implemented, share them here. This could include technical details, API changes, or interaction mechanisms.
|
||||
|
||||
**Use Case**
|
||||
Explain how this feature would be used and what benefits it would bring. Include specific examples to illustrate how this would improve functionality or user experience.
|
||||
|
||||
**Additional Context**
|
||||
Add any other context such as comparisons with similar features in other products, or links to prototypes or mockups.
|
@ -36,3 +36,10 @@ HYPERDX_API_KEY=
|
||||
HDX_NODE_BETA_MODE=1
|
||||
|
||||
FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta
|
||||
|
||||
# Proxy settings for Playwright (alternatively, you can use a proxy service like Oxylabs, which rotates IPs for you on every request)
|
||||
PROXY_SERVER=
|
||||
PROXY_USERNAME=
|
||||
PROXY_PASSWORD=
|
||||
# set if you'd like to block media requests to save proxy bandwidth
|
||||
BLOCK_MEDIA=
|
@ -1,6 +1,7 @@
|
||||
const socialMediaBlocklist = [
|
||||
'facebook.com',
|
||||
'twitter.com',
|
||||
'x.com',
|
||||
'instagram.com',
|
||||
'linkedin.com',
|
||||
'pinterest.com',
|
||||
@ -14,12 +15,25 @@ const socialMediaBlocklist = [
|
||||
'telegram.org',
|
||||
];
|
||||
|
||||
const allowedUrls = [
|
||||
'linkedin.com/pulse'
|
||||
const allowedKeywords = [
|
||||
'pulse',
|
||||
'privacy',
|
||||
'terms',
|
||||
'policy',
|
||||
'user-agreement',
|
||||
'legal',
|
||||
'help',
|
||||
'support',
|
||||
'contact',
|
||||
'about',
|
||||
'careers',
|
||||
'blog',
|
||||
'press',
|
||||
'conditions',
|
||||
];
|
||||
|
||||
export function isUrlBlocked(url: string): boolean {
|
||||
if (allowedUrls.some(allowedUrl => url.includes(allowedUrl))) {
|
||||
if (allowedKeywords.some(keyword => url.includes(keyword))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -2,9 +2,16 @@ from fastapi import FastAPI
|
||||
from playwright.async_api import async_playwright, Browser
|
||||
from fastapi.responses import JSONResponse
|
||||
from pydantic import BaseModel
|
||||
from os import environ
|
||||
|
||||
PROXY_SERVER = environ.get('PROXY_SERVER', None)
|
||||
PROXY_USERNAME = environ.get('PROXY_USERNAME', None)
|
||||
PROXY_PASSWORD = environ.get('PROXY_PASSWORD', None)
|
||||
BLOCK_MEDIA = environ.get('BLOCK_MEDIA', 'False').upper() == 'TRUE'
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
|
||||
class UrlModel(BaseModel):
|
||||
url: str
|
||||
wait: int = None
|
||||
@ -27,11 +34,28 @@ async def shutdown_event():
|
||||
|
||||
@app.post("/html")
|
||||
async def root(body: UrlModel):
|
||||
context = None
|
||||
if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
|
||||
context = await browser.new_context(proxy={"server": PROXY_SERVER,
|
||||
"username": PROXY_USERNAME,
|
||||
"password": PROXY_PASSWORD})
|
||||
else:
|
||||
context = await browser.new_context()
|
||||
|
||||
if BLOCK_MEDIA:
|
||||
await context.route("**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}",
|
||||
handler=lambda route, request: route.abort())
|
||||
|
||||
page = await context.new_page()
|
||||
await page.goto(body.url, timeout=15000) # Set max timeout to 15s
|
||||
if body.wait: # Check if wait parameter is provided in the request body
|
||||
await page.wait_for_timeout(body.wait) # Convert seconds to milliseconds for playwright
|
||||
await page.goto(
|
||||
body.url,
|
||||
wait_until="load",
|
||||
timeout=body.timeout if body.timeout else 15000,
|
||||
)
|
||||
# Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases where "load" / "networkidle" are not enough
|
||||
if body.wait:
|
||||
await page.wait_for_timeout(body.wait)
|
||||
|
||||
page_content = await page.content()
|
||||
await context.close()
|
||||
json_compatible_item_data = {"content": page_content}
|
||||
|
@ -77,7 +77,7 @@ class FirecrawlApp:
|
||||
)
|
||||
if response.status_code == 200:
|
||||
response = response.json()
|
||||
if response['success']:
|
||||
if response['success'] and 'data' in response:
|
||||
return response['data']
|
||||
else:
|
||||
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
|
||||
@ -115,7 +115,11 @@ class FirecrawlApp:
|
||||
)
|
||||
if response.status_code == 200:
|
||||
response = response.json()
|
||||
<<<<<<< main
|
||||
if response['success']:
|
||||
=======
|
||||
if response['success'] and 'data' in response:
|
||||
>>>>>>> main
|
||||
return response['data']
|
||||
else:
|
||||
raise Exception(f'Failed to search. Error: {response["error"]}')
|
||||
|
@ -5,6 +5,10 @@ services:
|
||||
build: apps/playwright-service
|
||||
environment:
|
||||
- PORT=3000
|
||||
- PROXY_SERVER=${PROXY_SERVER}
|
||||
- PROXY_USERNAME=${PROXY_USERNAME}
|
||||
- PROXY_PASSWORD=${PROXY_PASSWORD}
|
||||
- BLOCK_MEDIA=${BLOCK_MEDIA}
|
||||
networks:
|
||||
- backend
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user