diff --git a/apps/api/.env.example b/apps/api/.env.example
index 463a40f..f18f274 100644
--- a/apps/api/.env.example
+++ b/apps/api/.env.example
@@ -3,7 +3,7 @@ NUM_WORKERS_PER_QUEUE=8
 PORT=3002
 HOST=0.0.0.0
 REDIS_URL=redis://localhost:6379
-PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000
+PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html
 
 ## To turn on DB authentication, you need to set up supabase.
 USE_DB_AUTHENTICATION=true
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 911382f..a99b365 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -157,12 +157,18 @@ export async function scrapWithPlaywright(
     if (contentType && contentType.includes("application/pdf")) {
       return fetchAndProcessPdf(url);
     } else {
-      const data = await response.json();
-      const html = data.content;
-      return html ?? "";
+      const textData = await response.text();
+      try {
+        const data = JSON.parse(textData);
+        const html = data.content;
+        return html ?? "";
+      } catch (jsonError) {
+        console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`);
+        return "";
+      }
     }
   } catch (error) {
-    console.error(`[Playwright][c] Error fetching url: ${url} -> ${error}`);
+    console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
     return "";
   }
 }
diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py
index b4ac715..8ef7418 100644
--- a/apps/playwright-service/main.py
+++ b/apps/playwright-service/main.py
@@ -1,9 +1,15 @@
-from fastapi import FastAPI
-from playwright.async_api import async_playwright, Browser
-from fastapi.responses import JSONResponse
-from pydantic import BaseModel
+"""
+This module provides a FastAPI application that uses Playwright to fetch and return
+the HTML content of a specified URL. It supports optional proxy settings and media blocking.
+"""
+
 from os import environ
 
+from fastapi import FastAPI
+from fastapi.responses import JSONResponse
+from playwright.async_api import Browser, async_playwright
+from pydantic import BaseModel
+
 PROXY_SERVER = environ.get("PROXY_SERVER", None)
 PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
 PROXY_PASSWORD = environ.get("PROXY_PASSWORD", None)
@@ -11,31 +17,38 @@ BLOCK_MEDIA = environ.get("BLOCK_MEDIA", "False").upper() == "TRUE"
 
 app = FastAPI()
 
-
 class UrlModel(BaseModel):
+    """Model representing the URL and associated parameters for the request."""
     url: str
-    wait: int = None
-    wait_until: str = "load"
+    wait_after_load: int = 0
+    timeout: int = 15000
     headers: dict = None
 
-
 browser: Browser = None
 
-
 @app.on_event("startup")
 async def startup_event():
+    """Event handler for application startup to initialize the browser."""
     global browser
     playwright = await async_playwright().start()
    browser = await playwright.chromium.launch()
 
-
 @app.on_event("shutdown")
 async def shutdown_event():
+    """Event handler for application shutdown to close the browser."""
     await browser.close()
 
-
 @app.post("/html")
 async def root(body: UrlModel):
+    """
+    Endpoint to fetch and return HTML content of a given URL.
+
+    Args:
+        body (UrlModel): The URL model containing the target URL, wait time, and timeout.
+
+    Returns:
+        JSONResponse: The HTML content of the page.
+    """
     context = None
     if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
         context = await browser.new_context(
@@ -62,13 +75,13 @@ async def root(body: UrlModel):
 
     await page.goto(
         body.url,
-        timeout=15000,
-        wait_until=body.wait_until if body.wait_until else "load",
-    ) # Set max timeout to 15s
-    if body.wait: # Check if wait parameter is provided in the request body
-        await page.wait_for_timeout(
-            body.wait
-        ) # Convert seconds to milliseconds for playwright
+        wait_until="load",
+        timeout=body.timeout,
+    )
+    # Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases where "load" / "networkidle" is not enough
+    if body.wait_after_load > 0:
+        await page.wait_for_timeout(body.wait_after_load)
+
     page_content = await page.content()
     await context.close()
     json_compatible_item_data = {"content": page_content}
diff --git a/apps/python-sdk/setup.py b/apps/python-sdk/setup.py
index b4b9b2f..903eab4 100644
--- a/apps/python-sdk/setup.py
+++ b/apps/python-sdk/setup.py
@@ -7,7 +7,7 @@ long_description_content = (this_directory / "README.md").read_text()
 
 setup(
     name="firecrawl-py",
-    version="0.0.12",
+    version="0.0.13",
     url="https://github.com/mendableai/firecrawl",
     author="Mendable.ai",
     author_email="nick@mendable.ai",
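
Note (not part of the diff): a minimal sketch of how the reworked /html endpoint might be exercised once the playwright-service is running. The host and port, the example page, and the use of the requests package are assumptions for local testing; the request body simply mirrors the new UrlModel fields (wait_after_load and timeout, both in milliseconds), and the response is expected to be JSON of the form {"content": "<html>..."}.

import requests  # assumption: requests is available in the test environment

payload = {
    "url": "https://example.com",  # placeholder page to render
    "wait_after_load": 1000,       # extra wait after the "load" event, in ms
    "timeout": 15000,              # upper bound passed to page.goto(), in ms
}

# Assumed local address; in docker-compose the API reaches the service via
# PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html
resp = requests.post("http://localhost:3000/html", json=payload)
html = resp.json().get("content", "")
print(html[:200])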