Merge pull request #224 from mattjoyce/playwright-service-bug-222
Playwright service bugs #222 #179 #197
This commit is contained in:
commit
f17cb1a0d4
@ -3,7 +3,7 @@ NUM_WORKERS_PER_QUEUE=8
|
|||||||
PORT=3002
|
PORT=3002
|
||||||
HOST=0.0.0.0
|
HOST=0.0.0.0
|
||||||
REDIS_URL=redis://localhost:6379
|
REDIS_URL=redis://localhost:6379
|
||||||
PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000
|
PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html
|
||||||
|
|
||||||
## To turn on DB authentication, you need to set up supabase.
|
## To turn on DB authentication, you need to set up supabase.
|
||||||
USE_DB_AUTHENTICATION=true
|
USE_DB_AUTHENTICATION=true
|
||||||
|
@ -157,12 +157,18 @@ export async function scrapWithPlaywright(
|
|||||||
if (contentType && contentType.includes("application/pdf")) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
return fetchAndProcessPdf(url);
|
return fetchAndProcessPdf(url);
|
||||||
} else {
|
} else {
|
||||||
const data = await response.json();
|
const textData = await response.text();
|
||||||
const html = data.content;
|
try {
|
||||||
return html ?? "";
|
const data = JSON.parse(textData);
|
||||||
|
const html = data.content;
|
||||||
|
return html ?? "";
|
||||||
|
} catch (jsonError) {
|
||||||
|
console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`);
|
||||||
|
return "";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(`[Playwright][c] Error fetching url: ${url} -> ${error}`);
|
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,9 +1,15 @@
|
|||||||
from fastapi import FastAPI
|
"""
|
||||||
from playwright.async_api import async_playwright, Browser
|
This module provides a FastAPI application that uses Playwright to fetch and return
|
||||||
from fastapi.responses import JSONResponse
|
the HTML content of a specified URL. It supports optional proxy settings and media blocking.
|
||||||
from pydantic import BaseModel
|
"""
|
||||||
|
|
||||||
from os import environ
|
from os import environ
|
||||||
|
|
||||||
|
from fastapi import FastAPI
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
from playwright.async_api import Browser, async_playwright
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
PROXY_SERVER = environ.get("PROXY_SERVER", None)
|
PROXY_SERVER = environ.get("PROXY_SERVER", None)
|
||||||
PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
|
PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
|
||||||
PROXY_PASSWORD = environ.get("PROXY_PASSWORD", None)
|
PROXY_PASSWORD = environ.get("PROXY_PASSWORD", None)
|
||||||
@ -11,31 +17,38 @@ BLOCK_MEDIA = environ.get("BLOCK_MEDIA", "False").upper() == "TRUE"
|
|||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
|
|
||||||
|
|
||||||
class UrlModel(BaseModel):
|
class UrlModel(BaseModel):
|
||||||
|
"""Model representing the URL and associated parameters for the request."""
|
||||||
url: str
|
url: str
|
||||||
wait: int = None
|
wait_after_load: int = 0
|
||||||
wait_until: str = "load"
|
timeout: int = 15000
|
||||||
headers: dict = None
|
headers: dict = None
|
||||||
|
|
||||||
|
|
||||||
browser: Browser = None
|
browser: Browser = None
|
||||||
|
|
||||||
|
|
||||||
@app.on_event("startup")
|
@app.on_event("startup")
|
||||||
async def startup_event():
|
async def startup_event():
|
||||||
|
"""Event handler for application startup to initialize the browser."""
|
||||||
global browser
|
global browser
|
||||||
playwright = await async_playwright().start()
|
playwright = await async_playwright().start()
|
||||||
browser = await playwright.chromium.launch()
|
browser = await playwright.chromium.launch()
|
||||||
|
|
||||||
|
|
||||||
@app.on_event("shutdown")
|
@app.on_event("shutdown")
|
||||||
async def shutdown_event():
|
async def shutdown_event():
|
||||||
|
"""Event handler for application shutdown to close the browser."""
|
||||||
await browser.close()
|
await browser.close()
|
||||||
|
|
||||||
|
|
||||||
@app.post("/html")
|
@app.post("/html")
|
||||||
async def root(body: UrlModel):
|
async def root(body: UrlModel):
|
||||||
|
"""
|
||||||
|
Endpoint to fetch and return HTML content of a given URL.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
body (UrlModel): The URL model containing the target URL, wait time, and timeout.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
JSONResponse: The HTML content of the page.
|
||||||
|
"""
|
||||||
context = None
|
context = None
|
||||||
if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
|
if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
|
||||||
context = await browser.new_context(
|
context = await browser.new_context(
|
||||||
@ -62,13 +75,13 @@ async def root(body: UrlModel):
|
|||||||
|
|
||||||
await page.goto(
|
await page.goto(
|
||||||
body.url,
|
body.url,
|
||||||
timeout=15000,
|
wait_until="load",
|
||||||
wait_until=body.wait_until if body.wait_until else "load",
|
timeout=body.timeout,
|
||||||
) # Set max timeout to 15s
|
)
|
||||||
if body.wait: # Check if wait parameter is provided in the request body
|
# Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases where "load" / "networkidle" is not enough
|
||||||
await page.wait_for_timeout(
|
if body.wait_after_load > 0:
|
||||||
body.wait
|
await page.wait_for_timeout(body.wait_after_load)
|
||||||
) # Convert seconds to milliseconds for playwright
|
|
||||||
page_content = await page.content()
|
page_content = await page.content()
|
||||||
await context.close()
|
await context.close()
|
||||||
json_compatible_item_data = {"content": page_content}
|
json_compatible_item_data = {"content": page_content}
|
||||||
|
@ -7,7 +7,7 @@ long_description_content = (this_directory / "README.md").read_text()
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="firecrawl-py",
|
name="firecrawl-py",
|
||||||
version="0.0.12",
|
version="0.0.13",
|
||||||
url="https://github.com/mendableai/firecrawl",
|
url="https://github.com/mendableai/firecrawl",
|
||||||
author="Mendable.ai",
|
author="Mendable.ai",
|
||||||
author_email="nick@mendable.ai",
|
author_email="nick@mendable.ai",
|
||||||
|
Loading…
Reference in New Issue
Block a user