0

Merge pull request #224 from mattjoyce/playwright-service-bug-222

Playwright service bugs #222  #179  #197
This commit is contained in:
Rafael Miller 2024-06-04 12:05:56 -03:00 committed by GitHub
commit f17cb1a0d4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 43 additions and 24 deletions

View File

@ -3,7 +3,7 @@ NUM_WORKERS_PER_QUEUE=8
PORT=3002 PORT=3002
HOST=0.0.0.0 HOST=0.0.0.0
REDIS_URL=redis://localhost:6379 REDIS_URL=redis://localhost:6379
PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000 PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html
## To turn on DB authentication, you need to set up supabase. ## To turn on DB authentication, you need to set up supabase.
USE_DB_AUTHENTICATION=true USE_DB_AUTHENTICATION=true

View File

@ -157,12 +157,18 @@ export async function scrapWithPlaywright(
if (contentType && contentType.includes("application/pdf")) { if (contentType && contentType.includes("application/pdf")) {
return fetchAndProcessPdf(url); return fetchAndProcessPdf(url);
} else { } else {
const data = await response.json(); const textData = await response.text();
const html = data.content; try {
return html ?? ""; const data = JSON.parse(textData);
const html = data.content;
return html ?? "";
} catch (jsonError) {
console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`);
return "";
}
} }
} catch (error) { } catch (error) {
console.error(`[Playwright][c] Error fetching url: ${url} -> ${error}`); console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
return ""; return "";
} }
} }

View File

@ -1,9 +1,15 @@
from fastapi import FastAPI """
from playwright.async_api import async_playwright, Browser This module provides a FastAPI application that uses Playwright to fetch and return
from fastapi.responses import JSONResponse the HTML content of a specified URL. It supports optional proxy settings and media blocking.
from pydantic import BaseModel """
from os import environ from os import environ
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from playwright.async_api import Browser, async_playwright
from pydantic import BaseModel
PROXY_SERVER = environ.get("PROXY_SERVER", None) PROXY_SERVER = environ.get("PROXY_SERVER", None)
PROXY_USERNAME = environ.get("PROXY_USERNAME", None) PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
PROXY_PASSWORD = environ.get("PROXY_PASSWORD", None) PROXY_PASSWORD = environ.get("PROXY_PASSWORD", None)
@ -11,31 +17,38 @@ BLOCK_MEDIA = environ.get("BLOCK_MEDIA", "False").upper() == "TRUE"
app = FastAPI() app = FastAPI()
class UrlModel(BaseModel): class UrlModel(BaseModel):
"""Model representing the URL and associated parameters for the request."""
url: str url: str
wait: int = None wait_after_load: int = 0
wait_until: str = "load" timeout: int = 15000
headers: dict = None headers: dict = None
browser: Browser = None browser: Browser = None
@app.on_event("startup") @app.on_event("startup")
async def startup_event(): async def startup_event():
"""Event handler for application startup to initialize the browser."""
global browser global browser
playwright = await async_playwright().start() playwright = await async_playwright().start()
browser = await playwright.chromium.launch() browser = await playwright.chromium.launch()
@app.on_event("shutdown") @app.on_event("shutdown")
async def shutdown_event(): async def shutdown_event():
"""Event handler for application shutdown to close the browser."""
await browser.close() await browser.close()
@app.post("/html") @app.post("/html")
async def root(body: UrlModel): async def root(body: UrlModel):
"""
Endpoint to fetch and return HTML content of a given URL.
Args:
body (UrlModel): The URL model containing the target URL, wait time, and timeout.
Returns:
JSONResponse: The HTML content of the page.
"""
context = None context = None
if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD: if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
context = await browser.new_context( context = await browser.new_context(
@ -62,13 +75,13 @@ async def root(body: UrlModel):
await page.goto( await page.goto(
body.url, body.url,
timeout=15000, wait_until="load",
wait_until=body.wait_until if body.wait_until else "load", timeout=body.timeout,
) # Set max timeout to 15s )
if body.wait: # Check if wait parameter is provided in the request body # Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases where "load" / "networkidle" is not enough
await page.wait_for_timeout( if body.wait_after_load > 0:
body.wait await page.wait_for_timeout(body.wait_after_load)
) # Convert seconds to milliseconds for playwright
page_content = await page.content() page_content = await page.content()
await context.close() await context.close()
json_compatible_item_data = {"content": page_content} json_compatible_item_data = {"content": page_content}

View File

@ -7,7 +7,7 @@ long_description_content = (this_directory / "README.md").read_text()
setup( setup(
name="firecrawl-py", name="firecrawl-py",
version="0.0.12", version="0.0.13",
url="https://github.com/mendableai/firecrawl", url="https://github.com/mendableai/firecrawl",
author="Mendable.ai", author="Mendable.ai",
author_email="nick@mendable.ai", author_email="nick@mendable.ai",