From 2a39b5382b3f7985aa06b5205fa783191841506c Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Sat, 1 Jun 2024 13:10:14 +1000 Subject: [PATCH 1/6] Add timeout to class and provide default. --- apps/playwright-service/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py index 9cb0c4e..805c53f 100644 --- a/apps/playwright-service/main.py +++ b/apps/playwright-service/main.py @@ -15,6 +15,7 @@ app = FastAPI() class UrlModel(BaseModel): url: str wait: int = None + timeout: int = 15000 browser: Browser = None @@ -50,7 +51,7 @@ async def root(body: UrlModel): await page.goto( body.url, wait_until="load", - timeout=body.timeout if body.timeout else 15000, + timeout=body.timeout, ) # Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases were "load" / "networkidle" is not enough if body.wait: From c516140bfb5750a5ac7132eceaa11e5e242599ea Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Sat, 1 Jun 2024 13:24:40 +1000 Subject: [PATCH 2/6] Various Linting Pylint C0114: Missing module docstring C0115: Missing class docstring C0116: Missing function or method docstring C0303: Trailing whitespace Import ordering --- apps/playwright-service/main.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py index 805c53f..a2a6d75 100644 --- a/apps/playwright-service/main.py +++ b/apps/playwright-service/main.py @@ -1,9 +1,15 @@ -from fastapi import FastAPI -from playwright.async_api import async_playwright, Browser -from fastapi.responses import JSONResponse -from pydantic import BaseModel +""" +This module provides a FastAPI application that uses Playwright to fetch and return +the HTML content of a specified URL. It supports optional proxy settings and media blocking. 
+""" + from os import environ +from fastapi import FastAPI +from fastapi.responses import JSONResponse +from playwright.async_api import Browser, async_playwright +from pydantic import BaseModel + PROXY_SERVER = environ.get('PROXY_SERVER', None) PROXY_USERNAME = environ.get('PROXY_USERNAME', None) PROXY_PASSWORD = environ.get('PROXY_PASSWORD', None) @@ -11,30 +17,37 @@ BLOCK_MEDIA = environ.get('BLOCK_MEDIA', 'False').upper() == 'TRUE' app = FastAPI() - class UrlModel(BaseModel): + """Model representing the URL and associated parameters for the request.""" url: str wait: int = None timeout: int = 15000 - browser: Browser = None - @app.on_event("startup") async def startup_event(): + """Event handler for application startup to initialize the browser.""" global browser playwright = await async_playwright().start() browser = await playwright.chromium.launch() - @app.on_event("shutdown") async def shutdown_event(): + """Event handler for application shutdown to close the browser.""" await browser.close() - @app.post("/html") async def root(body: UrlModel): + """ + Endpoint to fetch and return HTML content of a given URL. + + Args: + body (UrlModel): The URL model containing the target URL, wait time, and timeout. + + Returns: + JSONResponse: The HTML content of the page. + """ context = None if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD: context = await browser.new_context(proxy={"server": PROXY_SERVER, @@ -56,7 +69,7 @@ async def root(body: UrlModel): # Wait != timeout. 
Wait is the time to wait after the page is loaded - useful in some cases were "load" / "networkidle" is not enough if body.wait: await page.wait_for_timeout(body.wait) - + page_content = await page.content() await context.close() json_compatible_item_data = {"content": page_content} From 1eacad4ef34ad22f3521e4e9d7f47b24102a6797 Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Sat, 1 Jun 2024 13:46:16 +1000 Subject: [PATCH 3/6] Clarifying wait type and name --- apps/playwright-service/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py index a2a6d75..3c1ff19 100644 --- a/apps/playwright-service/main.py +++ b/apps/playwright-service/main.py @@ -20,7 +20,7 @@ app = FastAPI() class UrlModel(BaseModel): """Model representing the URL and associated parameters for the request.""" url: str - wait: int = None + wait_after_load: int = 0 timeout: int = 15000 browser: Browser = None @@ -67,8 +67,8 @@ async def root(body: UrlModel): timeout=body.timeout, ) # Wait != timeout. 
Wait is the time to wait after the page is loaded - useful in some cases were "load" / "networkidle" is not enough - if body.wait: - await page.wait_for_timeout(body.wait) + if body.wait_after_load > 0: + await page.wait_for_timeout(body.wait_after_load) page_content = await page.content() await context.close() From 14896a9fdda7c62e6d51c01f61c88210c7a77d2f Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Sat, 1 Jun 2024 19:03:16 +1000 Subject: [PATCH 4/6] Fix PLAYWRIGHT_MICROSERVICE_URL It needs to end in /html, otherwise the scrape request will 404 --- apps/api/.env.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/.env.example b/apps/api/.env.example index 7934903..a6eaae3 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -3,7 +3,7 @@ NUM_WORKERS_PER_QUEUE=8 PORT=3002 HOST=0.0.0.0 REDIS_URL=redis://localhost:6379 -PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000 +PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html ## To turn on DB authentication, you need to set up supabase. USE_DB_AUTHENTICATION=true From deefe65cbe115c15d4f9eeae16b87bf66c234167 Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Sat, 1 Jun 2024 19:16:56 +1000 Subject: [PATCH 5/6] Change the way the playwright response is parsed Parsing was failing with a Type Error, even though the response actually looked ok. This fixes the type error and stops the scraper from falling back. 
--- apps/api/src/scraper/WebScraper/single_url.ts | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c7f9469..70b4aa6 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -146,12 +146,18 @@ export async function scrapWithPlaywright(url: string, waitFor: number = 0): Pro if (contentType && contentType.includes('application/pdf')) { return fetchAndProcessPdf(url); } else { - const data = await response.json(); - const html = data.content; - return html ?? ""; + const textData = await response.text(); + try { + const data = JSON.parse(textData); + const html = data.content; + return html ?? ""; + } catch (jsonError) { + console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`); + return ""; + } } } catch (error) { - console.error(`[Playwright][c] Error fetching url: ${url} -> ${error}`); + console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); return ""; } } From 4e3a0495d7256d337792852b7754a82171686d68 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 4 Jun 2024 12:03:55 -0300 Subject: [PATCH 6/6] updated version 0.0.12 -> 0.0.13 - [ ] publish --- apps/python-sdk/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/python-sdk/setup.py b/apps/python-sdk/setup.py index b4b9b2f..903eab4 100644 --- a/apps/python-sdk/setup.py +++ b/apps/python-sdk/setup.py @@ -7,7 +7,7 @@ long_description_content = (this_directory / "README.md").read_text() setup( name="firecrawl-py", - version="0.0.12", + version="0.0.13", url="https://github.com/mendableai/firecrawl", author="Mendable.ai", author_email="nick@mendable.ai",