Merge branch 'main' into main
This commit is contained in:
commit
65fe9c4f80
35
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
35
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
---
|
||||||
|
name: Bug report
|
||||||
|
about: Create a report to help us improve
|
||||||
|
title: "[BUG]"
|
||||||
|
labels: bug
|
||||||
|
assignees: ''
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Describe the Bug**
|
||||||
|
Provide a clear and concise description of what the bug is.
|
||||||
|
|
||||||
|
**To Reproduce**
|
||||||
|
Steps to reproduce the issue:
|
||||||
|
1. Configure the environment or settings with '...'
|
||||||
|
2. Run the command '...'
|
||||||
|
3. Observe the error or unexpected output at '...'
|
||||||
|
4. Log output/error message
|
||||||
|
|
||||||
|
**Expected Behavior**
|
||||||
|
A clear and concise description of what you expected to happen.
|
||||||
|
|
||||||
|
**Screenshots**
|
||||||
|
If applicable, add screenshots or copies of the command line output to help explain the issue.
|
||||||
|
|
||||||
|
**Environment (please complete the following information):**
|
||||||
|
- OS: [e.g. macOS, Linux, Windows]
|
||||||
|
- Firecrawl Version: [e.g. 1.2.3]
|
||||||
|
- Node.js Version: [e.g. 14.x]
|
||||||
|
|
||||||
|
**Logs**
|
||||||
|
If applicable, include detailed logs to help understand the problem.
|
||||||
|
|
||||||
|
**Additional Context**
|
||||||
|
Add any other context about the problem here, such as configuration specifics, network conditions, data volumes, etc.
|
26
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
26
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
---
|
||||||
|
name: Feature request
|
||||||
|
about: Suggest an idea for this project
|
||||||
|
title: "[Feat]"
|
||||||
|
labels: ''
|
||||||
|
assignees: ''
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Problem Description**
|
||||||
|
Describe the issue you're experiencing that has prompted this feature request. For example, "I find it difficult when..."
|
||||||
|
|
||||||
|
**Proposed Feature**
|
||||||
|
Provide a clear and concise description of the feature you would like implemented.
|
||||||
|
|
||||||
|
**Alternatives Considered**
|
||||||
|
Discuss any alternative solutions or features you've considered. Why were these alternatives not suitable?
|
||||||
|
|
||||||
|
**Implementation Suggestions**
|
||||||
|
If you have ideas on how the feature could be implemented, share them here. This could include technical details, API changes, or interaction mechanisms.
|
||||||
|
|
||||||
|
**Use Case**
|
||||||
|
Explain how this feature would be used and what benefits it would bring. Include specific examples to illustrate how this would improve functionality or user experience.
|
||||||
|
|
||||||
|
**Additional Context**
|
||||||
|
Add any other context such as comparisons with similar features in other products, or links to prototypes or mockups.
|
@ -36,3 +36,10 @@ HYPERDX_API_KEY=
|
|||||||
HDX_NODE_BETA_MODE=1
|
HDX_NODE_BETA_MODE=1
|
||||||
|
|
||||||
FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta
|
FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta
|
||||||
|
|
||||||
|
# Proxy Settings for Playwright (Alternatively, you can use a proxy service like Oxylabs, which rotates IPs for you on every request)
|
||||||
|
PROXY_SERVER=
|
||||||
|
PROXY_USERNAME=
|
||||||
|
PROXY_PASSWORD=
|
||||||
|
# set if you'd like to block media requests to save proxy bandwidth
|
||||||
|
BLOCK_MEDIA=
|
@ -1,6 +1,7 @@
|
|||||||
const socialMediaBlocklist = [
|
const socialMediaBlocklist = [
|
||||||
'facebook.com',
|
'facebook.com',
|
||||||
'twitter.com',
|
'twitter.com',
|
||||||
|
'x.com',
|
||||||
'instagram.com',
|
'instagram.com',
|
||||||
'linkedin.com',
|
'linkedin.com',
|
||||||
'pinterest.com',
|
'pinterest.com',
|
||||||
@ -14,12 +15,25 @@ const socialMediaBlocklist = [
|
|||||||
'telegram.org',
|
'telegram.org',
|
||||||
];
|
];
|
||||||
|
|
||||||
const allowedUrls = [
|
const allowedKeywords = [
|
||||||
'linkedin.com/pulse'
|
'pulse',
|
||||||
|
'privacy',
|
||||||
|
'terms',
|
||||||
|
'policy',
|
||||||
|
'user-agreement',
|
||||||
|
'legal',
|
||||||
|
'help',
|
||||||
|
'support',
|
||||||
|
'contact',
|
||||||
|
'about',
|
||||||
|
'careers',
|
||||||
|
'blog',
|
||||||
|
'press',
|
||||||
|
'conditions',
|
||||||
];
|
];
|
||||||
|
|
||||||
export function isUrlBlocked(url: string): boolean {
|
export function isUrlBlocked(url: string): boolean {
|
||||||
if (allowedUrls.some(allowedUrl => url.includes(allowedUrl))) {
|
if (allowedKeywords.some(keyword => url.includes(keyword))) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2,9 +2,16 @@ from fastapi import FastAPI
|
|||||||
from playwright.async_api import async_playwright, Browser
|
from playwright.async_api import async_playwright, Browser
|
||||||
from fastapi.responses import JSONResponse
|
from fastapi.responses import JSONResponse
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
from os import environ
|
||||||
|
|
||||||
|
PROXY_SERVER = environ.get('PROXY_SERVER', None)
|
||||||
|
PROXY_USERNAME = environ.get('PROXY_USERNAME', None)
|
||||||
|
PROXY_PASSWORD = environ.get('PROXY_PASSWORD', None)
|
||||||
|
BLOCK_MEDIA = environ.get('BLOCK_MEDIA', 'False').upper() == 'TRUE'
|
||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
|
|
||||||
|
|
||||||
class UrlModel(BaseModel):
|
class UrlModel(BaseModel):
|
||||||
url: str
|
url: str
|
||||||
wait: int = None
|
wait: int = None
|
||||||
@ -27,11 +34,28 @@ async def shutdown_event():
|
|||||||
|
|
||||||
@app.post("/html")
|
@app.post("/html")
|
||||||
async def root(body: UrlModel):
|
async def root(body: UrlModel):
|
||||||
context = await browser.new_context()
|
context = None
|
||||||
|
if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
|
||||||
|
context = await browser.new_context(proxy={"server": PROXY_SERVER,
|
||||||
|
"username": PROXY_USERNAME,
|
||||||
|
"password": PROXY_PASSWORD})
|
||||||
|
else:
|
||||||
|
context = await browser.new_context()
|
||||||
|
|
||||||
|
if BLOCK_MEDIA:
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}",
|
||||||
|
handler=lambda route, request: route.abort())
|
||||||
|
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
await page.goto(body.url, timeout=15000) # Set max timeout to 15s
|
await page.goto(
|
||||||
if body.wait: # Check if wait parameter is provided in the request body
|
body.url,
|
||||||
await page.wait_for_timeout(body.wait) # Convert seconds to milliseconds for playwright
|
wait_until="load",
|
||||||
|
timeout=body.timeout if body.timeout else 15000,
|
||||||
|
)
|
||||||
|
# Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases where "load" / "networkidle" is not enough
|
||||||
|
if body.wait:
|
||||||
|
await page.wait_for_timeout(body.wait)
|
||||||
|
|
||||||
page_content = await page.content()
|
page_content = await page.content()
|
||||||
await context.close()
|
await context.close()
|
||||||
json_compatible_item_data = {"content": page_content}
|
json_compatible_item_data = {"content": page_content}
|
||||||
|
@ -77,7 +77,7 @@ class FirecrawlApp:
|
|||||||
)
|
)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
response = response.json()
|
response = response.json()
|
||||||
if response['success']:
|
if response['success'] and 'data' in response:
|
||||||
return response['data']
|
return response['data']
|
||||||
else:
|
else:
|
||||||
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
|
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
|
||||||
@ -115,7 +115,11 @@ class FirecrawlApp:
|
|||||||
)
|
)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
response = response.json()
|
response = response.json()
|
||||||
|
<<<<<<< main
|
||||||
if response['success']:
|
if response['success']:
|
||||||
|
=======
|
||||||
|
if response['success'] and 'data' in response:
|
||||||
|
>>>>>>> main
|
||||||
return response['data']
|
return response['data']
|
||||||
else:
|
else:
|
||||||
raise Exception(f'Failed to search. Error: {response["error"]}')
|
raise Exception(f'Failed to search. Error: {response["error"]}')
|
||||||
|
@ -5,6 +5,10 @@ services:
|
|||||||
build: apps/playwright-service
|
build: apps/playwright-service
|
||||||
environment:
|
environment:
|
||||||
- PORT=3000
|
- PORT=3000
|
||||||
|
- PROXY_SERVER=${PROXY_SERVER}
|
||||||
|
- PROXY_USERNAME=${PROXY_USERNAME}
|
||||||
|
- PROXY_PASSWORD=${PROXY_PASSWORD}
|
||||||
|
- BLOCK_MEDIA=${BLOCK_MEDIA}
|
||||||
networks:
|
networks:
|
||||||
- backend
|
- backend
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user