From c71ea7a795f9096bbde997bbd48539a5ec865ea0 Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Sat, 8 Jun 2024 11:08:26 +1000 Subject: [PATCH 01/11] Prepare headers consistently --- apps/python-sdk/firecrawl/firecrawl.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index b9a823f..c5207fb 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -45,10 +45,8 @@ class FirecrawlApp: Exception: If the scrape request fails. """ - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}' - } + headers = self._prepare_headers() + # Prepare the base scrape parameters with the URL scrape_params = {'url': url} @@ -101,10 +99,7 @@ class FirecrawlApp: Raises: Exception: If the search request fails. """ - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}' - } + headers = self._prepare_headers() json_data = {'query': query} if params: json_data.update(params) @@ -297,3 +292,4 @@ class FirecrawlApp: raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') else: raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}') + From 9f306736afc3cead950da4898f0d8b658aac860e Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Sat, 8 Jun 2024 11:18:30 +1000 Subject: [PATCH 02/11] More detailed error handling --- apps/python-sdk/firecrawl/firecrawl.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index c5207fb..f13ba72 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -287,9 +287,20 @@ class FirecrawlApp: Raises: Exception: An exception with a message containing the status code and error details from the response. """ - if response.status_code in [402, 408, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') + error_message = response.json().get('error', 'No additional error details provided.') + + if response.status_code == 402: + message = f"Payment Required: Failed to {action}. {error_message}" + elif response.status_code == 408: + message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}" + elif response.status_code == 409: + message = f"Conflict: Failed to {action} due to a conflict. {error_message}" + elif response.status_code == 500: + message = f"Internal Server Error: Failed to {action}. {error_message}" else: - raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}') + message = f"Unexpected error during {action}: Status code {response.status_code}. 
{error_message}" + + # Raise an HTTPError with the custom message and attach the response + raise requests.exceptions.HTTPError(message, response=response) + From 7477c5e5bd23b88faa30d7ddf5e34cb335a6b6fb Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Sat, 8 Jun 2024 11:28:51 +1000 Subject: [PATCH 03/11] Use error handler consistently --- apps/python-sdk/firecrawl/firecrawl.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index f13ba72..a820ef2 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -30,6 +30,7 @@ class FirecrawlApp: if self.api_key is None: raise ValueError('No API key provided') self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') + def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: """ Scrape the specified URL using the Firecrawl API. @@ -79,11 +80,8 @@ class FirecrawlApp: return response['data'] else: raise Exception(f'Failed to scrape URL. Error: {response["error"]}') - elif response.status_code in [402, 408, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') else: - raise Exception(f'Failed to scrape URL. Status code: {response.status_code}') + self._handle_error(response, 'scrape URL') def search(self, query, params=None): """ @@ -116,11 +114,8 @@ class FirecrawlApp: else: raise Exception(f'Failed to search. Error: {response["error"]}') - elif response.status_code in [402, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}') else: - raise Exception(f'Failed to search. Status code: {response.status_code}') + self._handle_error(response, 'search') def crawl_url(self, url, params=None, wait_until_done=True, poll_interval=2, idempotency_key=None): """ @@ -303,4 +298,3 @@ class FirecrawlApp: # Raise an HTTPError with the custom message and attach the response raise requests.exceptions.HTTPError(message, response=response) - From 6fd9ce1c89ca19ecb737a36778f7dcb0cd2a7c35 Mon Sep 17 00:00:00 2001 From: Matt Joyce Date: Sat, 8 Jun 2024 11:46:52 +1000 Subject: [PATCH 04/11] type hints and linting --- apps/python-sdk/firecrawl/firecrawl.py | 31 +++++++++++++++++--------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index a820ef2..fb12af4 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -83,7 +83,7 @@ class FirecrawlApp: else: self._handle_error(response, 'scrape URL') - def search(self, query, params=None): + def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any: """ Perform a search using the Firecrawl API. @@ -117,7 +117,11 @@ class FirecrawlApp: else: self._handle_error(response, 'search') - def crawl_url(self, url, params=None, wait_until_done=True, poll_interval=2, idempotency_key=None): + def crawl_url(self, url: str, + params: Optional[Dict[str, Any]] = None, + wait_until_done: bool = True, + poll_interval: int = 2, + idempotency_key: Optional[str] = None) -> Any: """ Initiate a crawl job for the specified URL using the Firecrawl API. 
@@ -148,7 +152,7 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'start crawl job')
 
-    def check_crawl_status(self, job_id):
+    def check_crawl_status(self, job_id: str) -> Any:
         """
         Check the status of a crawl job using the Firecrawl API.
 
@@ -168,7 +172,7 @@
         else:
             self._handle_error(response, 'check crawl status')
 
-    def _prepare_headers(self, idempotency_key=None):
+    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
         """
         Prepare the headers for API requests.
 
@@ -190,7 +194,11 @@
             'Authorization': f'Bearer {self.api_key}',
         }
 
-    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+    def _post_request(self, url: str,
+                      data: Dict[str, Any],
+                      headers: Dict[str, str],
+                      retries: int = 3,
+                      backoff_factor: float = 0.5) -> requests.Response:
         """
         Make a POST request with retries.
 
@@ -215,7 +223,10 @@
                 return response
         return response
 
-    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+    def _get_request(self, url: str,
+                     headers: Dict[str, str],
+                     retries: int = 3,
+                     backoff_factor: float = 0.5) -> requests.Response:
         """
         Make a GET request with retries.
 
@@ -239,7 +250,7 @@
                 return response
         return response
 
-    def _monitor_job_status(self, job_id, headers, poll_interval):
+    def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any:
         """
         Monitor the status of a crawl job until completion.
 
@@ -271,7 +282,7 @@
         else:
             self._handle_error(status_response, 'check crawl status')
 
-    def _handle_error(self, response, action):
+    def _handle_error(self, response: requests.Response, action: str) -> None:
         """
         Handle errors from API responses.
 
@@ -283,7 +294,7 @@
             Exception: An exception with a message containing the status code and error details from the response.
         """
         error_message = response.json().get('error', 'No additional error details provided.')
-        
+
         if response.status_code == 402:
             message = f"Payment Required: Failed to {action}. {error_message}"
         elif response.status_code == 408:
             message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}"
         elif response.status_code == 409:
             message = f"Conflict: Failed to {action} due to a conflict. {error_message}"
         elif response.status_code == 500:
             message = f"Internal Server Error: Failed to {action}. {error_message}"
         else:
             message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}"
 
         # Raise an HTTPError with the custom message and attach the response
         raise requests.exceptions.HTTPError(message, response=response)
-
+
\ No newline at end of file

From 827354a116a4ea424af7c1994aae7214d78c8032 Mon Sep 17 00:00:00 2001
From: Matt Joyce
Date: Mon, 10 Jun 2024 21:21:23 +1000
Subject: [PATCH 05/11] Added logging to the Python SDK

The log level is read from the FIRECRAWL_LOGGING_LEVEL environment
variable. The logger is instantiated early and relies on the environment
to set its level.

---
 apps/python-sdk/firecrawl/__init__.py  | 54 ++++++++++++++++++++++++++
 apps/python-sdk/firecrawl/firecrawl.py | 10 ++++-
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index ecb017f..4e53e77 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -1,3 +1,57 @@
+"""
+This is the Firecrawl package.
+
+This package provides a Python SDK for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs.
+ +For more information visit https://github.com/firecrawl/ +""" + +import logging +import os + from .firecrawl import FirecrawlApp __version__ = "0.0.14" + +# Define the logger for the Firecrawl project +logger: logging.Logger = logging.getLogger("firecrawl") + + +def _basic_config() -> None: + """Set up basic configuration for logging with a specific format and date format.""" + try: + logging.basicConfig( + format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + except Exception as e: + logger.error("Failed to configure logging: %s", e) + + +def setup_logging() -> None: + """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable.""" + env = os.environ.get( + "FIRECRAWL_LOGGING_LEVEL", "INFO" + ).upper() # Default to 'INFO' level + _basic_config() + + if env == "DEBUG": + logger.setLevel(logging.DEBUG) + elif env == "INFO": + logger.setLevel(logging.INFO) + elif env == "WARNING": + logger.setLevel(logging.WARNING) + elif env == "ERROR": + logger.setLevel(logging.ERROR) + elif env == "CRITICAL": + logger.setLevel(logging.CRITICAL) + else: + logger.setLevel(logging.INFO) + logger.warning("Unknown logging level: %s, defaulting to INFO", env) + + +# Initialize logging configuration when the module is imported +setup_logging() +logger.debug("Debugging logger setup") diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index b9a823f..f20d4bd 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -9,13 +9,14 @@ and handles retries for certain HTTP status codes. Classes: - FirecrawlApp: Main class for interacting with the Firecrawl API. """ - +import logging import os import time from typing import Any, Dict, Optional import requests +logger : logging.Logger = logging.getLogger("firecrawl") class FirecrawlApp: """ @@ -28,8 +29,15 @@ class FirecrawlApp: def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') if self.api_key is None: + logger.warning("No API key provided") raise ValueError('No API key provided') + else: + logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key) + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') + if self.api_url != 'https://api.firecrawl.dev': + logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url) + def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: """ Scrape the specified URL using the Firecrawl API. 
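
For reference, a minimal usage sketch of the logging added in PATCH 05 (the API key and URL below are placeholders; FIRECRAWL_LOGGING_LEVEL must be set before the package is first imported, since setup_logging() runs at import time):

    import os
    os.environ["FIRECRAWL_LOGGING_LEVEL"] = "DEBUG"  # read once by setup_logging() at import time

    from firecrawl import FirecrawlApp  # importing the package runs setup_logging()

    app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key; emits the "Initialized FirecrawlApp with API key" debug line

Because the level is resolved once at import, changing FIRECRAWL_LOGGING_LEVEL afterwards has no effect unless setup_logging() is called again.
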
From e37d15140428c5c2eec3a6126b2a25c86f08e23c Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 12 Jun 2024 15:06:47 -0300
Subject: [PATCH 06/11] Added parsePDF option to pageOptions

Users can decide whether to let us handle the PDF parsing or to parse
the PDF themselves.

---
 .../src/__tests__/e2e_withAuth/index.test.ts  | 15 ++++++++++++
 apps/api/src/controllers/crawl.ts             |  6 ++++-
 apps/api/src/controllers/scrape.ts            |  8 ++++++-
 apps/api/src/lib/entities.ts                  |  1 +
 .../WebScraper/custom/handleCustomScraping.ts |  2 --
 apps/api/src/scraper/WebScraper/index.ts      |  9 ++++++--
 apps/api/src/scraper/WebScraper/single_url.ts | 23 +++++++++++--------
 .../utils/__tests__/pdfProcessor.test.ts      |  2 +-
 .../scraper/WebScraper/utils/pdfProcessor.ts  | 12 ++++++----
 9 files changed, 57 insertions(+), 21 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 02e4a47..4a1609b 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -136,6 +136,21 @@ describe("E2E Tests for API Routes", () => {
     expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
   }, 60000); // 60 seconds
 
+  it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+    const response = await request(TEST_URL)
+      .post('/v0/scrape')
+      .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+      .set('Content-Type', 'application/json')
+      .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+    await new Promise((r) => setTimeout(r, 6000));
+
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty('data');
+    expect(response.body.data).toHaveProperty('content');
+    expect(response.body.data).toHaveProperty('metadata');
+    expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
+  }, 60000); // 60 seconds
+
   // TODO: add this test back once we nail the waitFor option to be more deterministic
   // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
   //   const startTime = Date.now();
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 58d01e2..fc3fe28 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -56,7 +56,11 @@ export async function crawlController(req: Request, res: Response) {
   const mode = req.body.mode ?? "crawl";
   const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+  const pageOptions = req.body.pageOptions ?? {
+    onlyMainContent: false,
+    includeHtml: false,
+    parsePDF: true
+  };
 
   if (mode === "single_urls" && !url.includes(",")) {
     try {
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index d5ab1de..ed28639 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
     return res.status(status).json({ error });
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? 
{ onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false }; + const pageOptions = req.body.pageOptions ?? { + onlyMainContent: false, + includeHtml: false, + waitFor: 0, + screenshot: false, + parsePDF: true + }; const extractorOptions = req.body.extractorOptions ?? { mode: "markdown" } diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 81bf12c..d676584 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -19,6 +19,7 @@ export type PageOptions = { screenshot?: boolean; headers?: Record; replaceAllPathsWithAbsolutePaths?: boolean; + parsePDF?: boolean }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts index 8108a9e..081150b 100644 --- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts +++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts @@ -1,5 +1,3 @@ -import { fetchAndProcessPdf } from "../utils/pdfProcessor"; - export async function handleCustomScraping( text: string, url: string diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index f432f43..f0f423a 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -280,7 +280,7 @@ export class WebScraperDataProvider { private async fetchPdfDocuments(pdfLinks: string[]): Promise { return Promise.all( pdfLinks.map(async (pdfLink) => { - const pdfContent = await fetchAndProcessPdf(pdfLink); + const pdfContent = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF); return { content: pdfContent, metadata: { sourceURL: pdfLink }, @@ -475,7 +475,12 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false }; + this.pageOptions = options.pageOptions ?? { + onlyMainContent: false, + includeHtml: false, + replaceAllPathsWithAbsolutePaths: false, + parsePDF: true + }; this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breaking everything. 
Don't have time to find source of the issue so adding this check diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c2dcea1..8fa268f 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -49,7 +49,7 @@ export async function scrapWithFireEngine( url: string, waitFor: number = 0, screenshot: boolean = false, - pageOptions: { scrollXPaths?: string[] } = {}, + pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true }, headers?: Record, options?: any ): Promise { @@ -88,7 +88,7 @@ export async function scrapWithFireEngine( const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { - return { html: await fetchAndProcessPdf(url), screenshot: "" }; + return { html: await fetchAndProcessPdf(url, pageOptions?.parsePDF), screenshot: "" }; } else { const data = response.data; const html = data.content; @@ -108,7 +108,8 @@ export async function scrapWithFireEngine( export async function scrapWithScrapingBee( url: string, wait_browser: string = "domcontentloaded", - timeout: number = universalTimeout + timeout: number = universalTimeout, + pageOptions: { parsePDF?: boolean } = { parsePDF: true } ): Promise { try { const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); @@ -129,7 +130,7 @@ export async function scrapWithScrapingBee( const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { - return fetchAndProcessPdf(url); + return fetchAndProcessPdf(url, pageOptions?.parsePDF); } else { const decoder = new TextDecoder(); const text = decoder.decode(response.data); @@ -144,7 +145,8 @@ export async function scrapWithScrapingBee( export async function scrapWithPlaywright( url: string, waitFor: number = 0, - headers?: Record + headers?: Record, + pageOptions: { parsePDF?: boolean } = { parsePDF: true } ): Promise { try { const reqParams = await generateRequestParams(url); @@ -172,7 +174,7 @@ export async function scrapWithPlaywright( const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { - return fetchAndProcessPdf(url); + return fetchAndProcessPdf(url, pageOptions?.parsePDF); } else { const textData = response.data; try { @@ -194,7 +196,10 @@ export async function scrapWithPlaywright( } } -export async function scrapWithFetch(url: string): Promise { +export async function scrapWithFetch( + url: string, + pageOptions: { parsePDF?: boolean } = { parsePDF: true } +): Promise { try { const response = await axios.get(url, { headers: { @@ -213,7 +218,7 @@ export async function scrapWithFetch(url: string): Promise { const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { - return fetchAndProcessPdf(url); + return fetchAndProcessPdf(url, pageOptions?.parsePDF); } else { const text = response.data; return text; @@ -371,7 +376,7 @@ export async function scrapSingleUrl( } break; case "pdf": - customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot } + customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF), screenshot } break; } } diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts index f14c8d4..f4ed3c6 100644 --- 
a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts @@ -3,7 +3,7 @@ import * as pdfProcessor from '../pdfProcessor'; describe('PDF Processing Module - Integration Test', () => { it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => { delete process.env.LLAMAPARSE_API_KEY; - const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf'); + const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true); expect(pdfContent.trim()).toEqual("Dummy PDF file"); }); diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 71984f2..1f0d6e8 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -9,9 +9,9 @@ import os from "os"; dotenv.config(); -export async function fetchAndProcessPdf(url: string): Promise { +export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise { const tempFilePath = await downloadPdf(url); - const content = await processPdfToText(tempFilePath); + const content = await processPdfToText(tempFilePath, parsePDF); fs.unlinkSync(tempFilePath); // Clean up the temporary file return content; } @@ -34,10 +34,10 @@ async function downloadPdf(url: string): Promise { }); } -export async function processPdfToText(filePath: string): Promise { +export async function processPdfToText(filePath: string, parsePDF: boolean): Promise { let content = ""; - if (process.env.LLAMAPARSE_API_KEY) { + if (process.env.LLAMAPARSE_API_KEY && parsePDF) { const apiKey = process.env.LLAMAPARSE_API_KEY; const headers = { Authorization: `Bearer ${apiKey}`, @@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise { console.error("Error processing pdf document w/ LlamaIndex(2)"); content = await processPdf(filePath); } - } else { + } else if (parsePDF) { content = await processPdf(filePath); + } else { + content = fs.readFileSync(filePath, "utf-8"); } return content; } From 676d6e8ab5f7a1fd14ff5b76f8289db7543082c4 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 13 Jun 2024 10:51:05 -0300 Subject: [PATCH 07/11] Added pageOptions.removeTags --- apps/api/openapi.json | 19 +++++++++++ .../src/__tests__/e2e_withAuth/index.test.ts | 34 +++++++++++++++++++ apps/api/src/controllers/crawl.ts | 10 ++++-- apps/api/src/controllers/crawlPreview.ts | 2 +- apps/api/src/controllers/search.ts | 2 ++ apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/index.ts | 7 +++- apps/api/src/scraper/WebScraper/single_url.ts | 13 +++++++ 8 files changed, 84 insertions(+), 4 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index a755e37..b07e43f 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -61,6 +61,13 @@ "description": "Wait x amount of milliseconds for the page to load to fetch content", "default": 0 }, + "removeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" + }, "headers": { "type": "object", "description": "Headers to send with the request. 
Can be used to send cookies, user-agent, etc." @@ -194,6 +201,11 @@ "type": "integer", "description": "Maximum number of pages to crawl", "default": 10000 + }, + "allowBackwardCrawling": { + "type": "boolean", + "description": "Allow backward crawling (crawl from the base URL to the previous URLs)", + "default": false } } }, @@ -219,6 +231,13 @@ "type": "object", "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc." }, + "removeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" + }, "replaceAllPathsWithAbsolutePaths": { "type": "boolean", "description": "Replace all relative paths with absolute paths for images and links", diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 02e4a47..3423b3a 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -136,6 +136,40 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); }, 60000); // 60 seconds + it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { + const responseWithoutRemoveTags = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com/" }); + expect(responseWithoutRemoveTags.statusCode).toBe(200); + expect(responseWithoutRemoveTags.body).toHaveProperty("data"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("content"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); + expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); + expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site"); + expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer + expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav + expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong + + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.content).toContain("Scrape This Site"); + expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer + expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav + expect(response.body.data.content).not.toContain("web scraping"); // strong + }, 30000); // 30 seconds timeout + // TODO: add this test back once we nail the waitFor option to be more deterministic // it.concurrent("should return a successful response with a valid API key and waitFor option", 
async () => { // const startTime = Date.now(); diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 58d01e2..7eab78f 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -55,8 +55,14 @@ export async function crawlController(req: Request, res: Response) { } const mode = req.body.mode ?? "crawl"; - const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false }; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; + const crawlerOptions = req.body.crawlerOptions ?? { + allowBackwardCrawling: false + }; + const pageOptions = req.body.pageOptions ?? { + onlyMainContent: false, + includeHtml: false, + removeTags: [] + }; if (mode === "single_urls" && !url.includes(",")) { try { diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index d3e9afe..2c3dc4e 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) { const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] }; const job = await addWebScraperJob({ url: url, diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 7474aae..abbc357 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -85,6 +85,7 @@ export async function searchHelper( onlyMainContent: pageOptions?.onlyMainContent ?? true, fetchPageContent: pageOptions?.fetchPageContent ?? true, includeHtml: pageOptions?.includeHtml ?? false, + removeTags: pageOptions?.removeTags ?? [], fallback: false, }, }); @@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) { includeHtml: false, onlyMainContent: true, fetchPageContent: true, + removeTags: [], fallback: false, }; const origin = req.body.origin ?? "api"; diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 81bf12c..92170c1 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -19,6 +19,7 @@ export type PageOptions = { screenshot?: boolean; headers?: Record; replaceAllPathsWithAbsolutePaths?: boolean; + removeTags?: string | string[]; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index f432f43..1a6ffd0 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -475,7 +475,12 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false }; + this.pageOptions = options.pageOptions ?? { + onlyMainContent: false, + includeHtml: false, + replaceAllPathsWithAbsolutePaths: false, + removeTags: [] + }; this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! 
@nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c2dcea1..a16f6f0 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -304,6 +304,19 @@ export async function scrapSingleUrl( const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { const soup = cheerio.load(html); soup("script, style, iframe, noscript, meta, head").remove(); + + if (pageOptions.removeTags) { + if (typeof pageOptions.removeTags === 'string') { + pageOptions.removeTags.split(',').forEach((tag) => { + soup(tag.trim()).remove(); + }); + } else if (Array.isArray(pageOptions.removeTags)) { + pageOptions.removeTags.forEach((tag) => { + soup(tag).remove(); + }); + } + } + if (pageOptions.onlyMainContent) { // remove any other tags that are not in the main content excludeNonMainTags.forEach((tag) => { From bb859ae9a74a98fc16c5180f98883c02723c456f Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 13 Jun 2024 17:08:40 -0300 Subject: [PATCH 08/11] Added metadata.pageStatusCode and metadata.pageError properties to the responses --- apps/api/openapi.json | 236 +++++++++++++++++- .../src/__tests__/e2e_withAuth/index.test.ts | 54 +++- apps/api/src/lib/entities.ts | 5 +- apps/api/src/scraper/WebScraper/crawler.ts | 15 +- apps/api/src/scraper/WebScraper/index.ts | 14 +- apps/api/src/scraper/WebScraper/single_url.ts | 126 ++++++---- .../utils/__tests__/docxProcessor.test.ts | 6 +- .../utils/__tests__/pdfProcessor.test.ts | 6 +- .../scraper/WebScraper/utils/docxProcessor.ts | 10 +- .../src/scraper/WebScraper/utils/metadata.ts | 9 + .../scraper/WebScraper/utils/pdfProcessor.ts | 12 +- apps/playwright-service/get_error.py | 63 +++++ apps/playwright-service/main.py | 13 +- 13 files changed, 494 insertions(+), 75 deletions(-) create mode 100644 apps/playwright-service/get_error.py diff --git a/apps/api/openapi.json b/apps/api/openapi.json index a755e37..d283376 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -492,7 +492,7 @@ "html": { "type": "string", "nullable": true, - "description": "Raw HTML content of the page if `includeHtml` is true" + "description": "Raw HTML content of the page if `includeHtml` is true" }, "metadata": { "type": "object", @@ -507,9 +507,126 @@ "type": "string", "nullable": true }, + "keywords": { + "type": "string", + "nullable": true + }, + "robots": { + "type": "string", + "nullable": true + }, + "ogTitle": { + "type": "string", + "nullable": true + }, + "ogDescription": { + "type": "string", + "nullable": true + }, + "ogUrl": { + "type": "string", + "format": "uri", + "nullable": true + }, + "ogImage": { + "type": "string", + "nullable": true + }, + "ogAudio": { + "type": "string", + "nullable": true + }, + "ogDeterminer": { + "type": "string", + "nullable": true + }, + "ogLocale": { + "type": "string", + "nullable": true + }, + "ogLocaleAlternate": { + "type": "array", + "items": { + "type": "string" + }, + "nullable": true + }, + "ogSiteName": { + "type": "string", + "nullable": true + }, + "ogVideo": { + "type": "string", + "nullable": true + }, + "dctermsCreated": { + "type": "string", + "nullable": true + }, + "dcDateCreated": { + "type": "string", + "nullable": true + }, + "dcDate": { + "type": "string", + "nullable": true + }, + "dctermsType": { + 
"type": "string", + "nullable": true + }, + "dcType": { + "type": "string", + "nullable": true + }, + "dctermsAudience": { + "type": "string", + "nullable": true + }, + "dctermsSubject": { + "type": "string", + "nullable": true + }, + "dcSubject": { + "type": "string", + "nullable": true + }, + "dcDescription": { + "type": "string", + "nullable": true + }, + "dctermsKeywords": { + "type": "string", + "nullable": true + }, + "modifiedTime": { + "type": "string", + "nullable": true + }, + "publishedTime": { + "type": "string", + "nullable": true + }, + "articleTag": { + "type": "string", + "nullable": true + }, + "articleSection": { + "type": "string", + "nullable": true + }, "sourceURL": { "type": "string", "format": "uri" + }, + "pageStatusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "pageError": { + "type": "string", + "nullable": true, + "description": "The error message of the page" } } }, @@ -558,9 +675,126 @@ "type": "string", "nullable": true }, + "keywords": { + "type": "string", + "nullable": true + }, + "robots": { + "type": "string", + "nullable": true + }, + "ogTitle": { + "type": "string", + "nullable": true + }, + "ogDescription": { + "type": "string", + "nullable": true + }, + "ogUrl": { + "type": "string", + "format": "uri", + "nullable": true + }, + "ogImage": { + "type": "string", + "nullable": true + }, + "ogAudio": { + "type": "string", + "nullable": true + }, + "ogDeterminer": { + "type": "string", + "nullable": true + }, + "ogLocale": { + "type": "string", + "nullable": true + }, + "ogLocaleAlternate": { + "type": "array", + "items": { + "type": "string" + }, + "nullable": true + }, + "ogSiteName": { + "type": "string", + "nullable": true + }, + "ogVideo": { + "type": "string", + "nullable": true + }, + "dctermsCreated": { + "type": "string", + "nullable": true + }, + "dcDateCreated": { + "type": "string", + "nullable": true + }, + "dcDate": { + "type": "string", + "nullable": true + }, + "dctermsType": { + "type": "string", + "nullable": true + }, + "dcType": { + "type": "string", + "nullable": true + }, + "dctermsAudience": { + "type": "string", + "nullable": true + }, + "dctermsSubject": { + "type": "string", + "nullable": true + }, + "dcSubject": { + "type": "string", + "nullable": true + }, + "dcDescription": { + "type": "string", + "nullable": true + }, + "dctermsKeywords": { + "type": "string", + "nullable": true + }, + "modifiedTime": { + "type": "string", + "nullable": true + }, + "publishedTime": { + "type": "string", + "nullable": true + }, + "articleTag": { + "type": "string", + "nullable": true + }, + "articleSection": { + "type": "string", + "nullable": true + }, "sourceURL": { "type": "string", "format": "uri" + }, + "pageStatusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "pageError": { + "type": "string", + "nullable": true, + "description": "The error message of the page" } } } diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 431c7d1..4fd35a8 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -83,6 +83,8 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data).not.toHaveProperty("html"); expect(response.body.data.content).toContain("_Roast_"); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + 
expect(response.body.data.metadata.pageError).toBeUndefined(); }, 30000); // 30 seconds timeout it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => { @@ -103,6 +105,8 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.content).toContain("_Roast_"); expect(response.body.data.markdown).toContain("_Roast_"); expect(response.body.data.html).toContain(" { @@ -118,6 +122,8 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + expect(response.body.data.metadata.pageError).toBeUndefined(); }, 60000); // 60 seconds it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { @@ -133,6 +139,8 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty('metadata'); expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + expect(response.body.data.metadata.pageError).toBeUndefined(); }, 60000); // 60 seconds // TODO: add this test back once we nail the waitFor option to be more deterministic @@ -155,6 +163,23 @@ describe("E2E Tests for API Routes", () => { // expect(response.body.data.content).toContain("🔥 Firecrawl"); // expect(duration).toBeGreaterThanOrEqual(7000); // }, 12000); // 12 seconds timeout + + it.concurrent('should return a successful response for a scrape with 404 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://mendable.ai/alshdiasuhdasd' }); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.content).toContain('Mendable'); + expect(response.body.data.metadata.pageStatusCode).toBe(404); + expect(response.body.data.metadata.pageError).toBe("Not Found"); + }, 60000); // 60 seconds }); describe("POST /v0/crawl", () => { @@ -270,6 +295,8 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); }, 60000); // 60 seconds it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => { @@ -351,6 +378,8 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + 
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); }, 60000); // 60 seconds it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { @@ -383,6 +412,8 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); const urls = completedResponse.body.data.map( (item: any) => item.metadata?.sourceURL ); @@ -481,6 +512,8 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); // 120 seconds expect(completedResponse.body.data[0]).toHaveProperty("html"); @@ -488,6 +521,8 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0].content).toContain("_Roast_"); expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); expect(completedResponse.body.data[0].html).toContain(" { expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); const childrenLinks = completedResponse.body.data.filter(doc => doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") @@ -665,6 +702,9 @@ describe("E2E Tests for API Routes", () => { }) ]) ); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); }, 120000); // 120 seconds it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { @@ -700,6 +740,9 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); + const urls = completedResponse.body.data.map( (item: any) => item.metadata?.sourceURL ); @@ -759,6 +802,8 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0].content).toContain("_Roast_"); expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); expect(completedResponse.body.data[0].html).toContain(" { expect(completedResponse.body.data[0]).toHaveProperty("html"); expect(completedResponse.body.data[0].content).toContain("Mendable"); expect(completedResponse.body.data[0].markdown).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); 
const onlyChildrenLinks = completedResponse.body.data.filter(doc => { return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") @@ -842,7 +889,8 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); - + expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined(); }, 60000); // 60 seconds describe("POST /v0/scrape with LLM Extraction", () => { @@ -997,6 +1045,10 @@ describe("E2E Tests for API Routes", () => { expect(statusResponse.body).toHaveProperty("data"); expect(statusResponse.body.data[0]).toHaveProperty("content"); expect(statusResponse.body.data[0]).toHaveProperty("markdown"); + expect(statusResponse.body.data[0]).toHaveProperty("metadata"); + expect(statusResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(statusResponse.body.data[0].metadata.pageError).toBeUndefined(); + const results = statusResponse.body.data; // results.forEach((result, i) => { // console.log(result.metadata.sourceURL); diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 81bf12c..21bb6ea 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -119,4 +119,7 @@ export class SearchResult { export interface FireEngineResponse { html: string; screenshot: string; -} \ No newline at end of file + pageStatusCode?: number; + pageError?: string; +} + diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 7720991..8087591 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -224,7 +224,7 @@ export class WebCrawler { return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); } - async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> { + async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> { const normalizedUrl = this.normalizeCrawlUrl(url); if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) { return []; @@ -244,20 +244,27 @@ export class WebCrawler { try { let content: string = ""; + let pageStatusCode: number; + let pageError: string | undefined = undefined; + // If it is the first link, fetch with single url if (this.visited.size === 1) { const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true }); content = page.html ?? ""; + pageStatusCode = page.metadata?.pageStatusCode; + pageError = page.metadata?.pageError || undefined; } else { const response = await axios.get(url); content = response.data ?? ""; + pageStatusCode = response.status; + pageError = response.statusText != "OK" ? 
response.statusText : undefined; } const $ = load(content); - let links: { url: string, html: string }[] = []; + let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = []; // Add the initial URL to the list of links if (this.visited.size === 1) { - links.push({ url, html: content }); + links.push({ url, html: content, pageStatusCode, pageError }); } $("a").each((_, element) => { @@ -279,7 +286,7 @@ export class WebCrawler { !this.matchesExcludes(path) && this.robots.isAllowed(fullUrl, "FireCrawlAgent") ) { - links.push({ url: fullUrl, html: content }); + links.push({ url: fullUrl, html: content, pageStatusCode, pageError }); } } }); diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index f432f43..0446ab7 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -241,7 +241,7 @@ export class WebScraperDataProvider { content: "", html: this.pageOptions?.includeHtml ? "" : undefined, markdown: "", - metadata: { sourceURL: url }, + metadata: { sourceURL: url, pageStatusCode: 200 }, })); } @@ -280,10 +280,10 @@ export class WebScraperDataProvider { private async fetchPdfDocuments(pdfLinks: string[]): Promise { return Promise.all( pdfLinks.map(async (pdfLink) => { - const pdfContent = await fetchAndProcessPdf(pdfLink); + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink); return { - content: pdfContent, - metadata: { sourceURL: pdfLink }, + content: content, + metadata: { sourceURL: pdfLink, pageStatusCode, pageError }, provider: "web-scraper", }; }) @@ -292,10 +292,10 @@ export class WebScraperDataProvider { private async fetchDocxDocuments(docxLinks: string[]): Promise { return Promise.all( docxLinks.map(async (p) => { - const docXDocument = await fetchAndProcessDocx(p); + const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(p); return { - content: docXDocument, - metadata: { sourceURL: p }, + content, + metadata: { sourceURL: p, pageStatusCode, pageError }, provider: "web-scraper", }; }) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c2dcea1..f83771e 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -88,12 +88,13 @@ export async function scrapWithFireEngine( const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { - return { html: await fetchAndProcessPdf(url), screenshot: "" }; + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url); + return { html: content, screenshot: "", pageStatusCode, pageError }; } else { const data = response.data; const html = data.content; const screenshot = data.screenshot; - return { html: html ?? "", screenshot: screenshot ?? "" }; + return { html: html ?? "", screenshot: screenshot ?? 
"", pageStatusCode: data.pageStatusCode, pageError: data.error }; } } catch (error) { if (error.code === 'ECONNABORTED') { @@ -109,35 +110,39 @@ export async function scrapWithScrapingBee( url: string, wait_browser: string = "domcontentloaded", timeout: number = universalTimeout -): Promise { +): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> { try { const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); const clientParams = await generateRequestParams( url, wait_browser, - timeout + timeout, ); - const response = await client.get(clientParams); - - if (response.status !== 200 && response.status !== 404) { - console.error( - `[ScrapingBee] Error fetching url: ${url} with status code ${response.status}` - ); - return ""; - } + const response = await client.get({ + ...clientParams, + params: { + ...clientParams.params, + 'transparent_status_code': 'True' + } + }); const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { - return fetchAndProcessPdf(url); + return await fetchAndProcessPdf(url); } else { - const decoder = new TextDecoder(); - const text = decoder.decode(response.data); - return text; + let text = ""; + try { + const decoder = new TextDecoder(); + text = decoder.decode(response.data); + } catch (decodeError) { + console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`); + } + return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }; } } catch (error) { console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); - return ""; + return { content: "" }; } } @@ -145,7 +150,7 @@ export async function scrapWithPlaywright( url: string, waitFor: number = 0, headers?: Record -): Promise { +): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> { try { const reqParams = await generateRequestParams(url); // If the user has passed a wait parameter in the request, use that @@ -167,21 +172,21 @@ export async function scrapWithPlaywright( console.error( `[Playwright] Error fetching url: ${url} with status: ${response.status}` ); - return ""; + return { content: "" }; } const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { - return fetchAndProcessPdf(url); + return await fetchAndProcessPdf(url); } else { const textData = response.data; try { const data = JSON.parse(textData); const html = data.content; - return html ?? ""; + return { content: html ?? 
"", pageStatusCode: data.pageStatusCode, pageError: data.pageError }; } catch (jsonError) { console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`); - return ""; + return { content: "" }; } } } catch (error) { @@ -190,11 +195,11 @@ export async function scrapWithPlaywright( } else { console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); } - return ""; + return { content: "" }; } } -export async function scrapWithFetch(url: string): Promise { +export async function scrapWithFetch(url: string): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> { try { const response = await axios.get(url, { headers: { @@ -208,15 +213,15 @@ export async function scrapWithFetch(url: string): Promise { console.error( `[Axios] Error fetching url: ${url} with status: ${response.status}` ); - return ""; + return { content: "", pageStatusCode: response.status, pageError: response.statusText }; } const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { - return fetchAndProcessPdf(url); + return await fetchAndProcessPdf(url); } else { const text = response.data; - return text; + return { content: text, pageStatusCode: 200 }; } } catch (error) { if (error.code === 'ECONNABORTED') { @@ -224,7 +229,7 @@ export async function scrapWithFetch(url: string): Promise { } else { console.error(`[Axios] Error fetching url: ${url} -> ${error}`); } - return ""; + return { content: "" }; } } @@ -317,7 +322,7 @@ export async function scrapSingleUrl( url: string, method: (typeof baseScrapers)[number] ) => { - let text = ""; + let scraperResponse: { text: string, screenshot: string, metadata: { pageStatusCode?: number, pageError?: string | null } } = { text: "", screenshot: "", metadata: {} }; let screenshot = ""; switch (method) { case "fire-engine": @@ -329,38 +334,52 @@ export async function scrapSingleUrl( pageOptions.screenshot, pageOptions.headers ); - text = response.html; - screenshot = response.screenshot; + scraperResponse.text = response.html; + scraperResponse.screenshot = response.screenshot; + scraperResponse.metadata.pageStatusCode = response.pageStatusCode; + scraperResponse.metadata.pageError = response.pageError; } break; case "scrapingBee": if (process.env.SCRAPING_BEE_API_KEY) { - text = await scrapWithScrapingBee( + const response = await scrapWithScrapingBee( url, "domcontentloaded", pageOptions.fallback === false ? 
7000 : 15000 ); + scraperResponse.text = response.content; + scraperResponse.metadata.pageStatusCode = response.pageStatusCode; + scraperResponse.metadata.pageError = response.pageError; } break; case "playwright": if (process.env.PLAYWRIGHT_MICROSERVICE_URL) { - text = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers); + const response = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers); + scraperResponse.text = response.content; + scraperResponse.metadata.pageStatusCode = response.pageStatusCode; + scraperResponse.metadata.pageError = response.pageError; } break; case "scrapingBeeLoad": if (process.env.SCRAPING_BEE_API_KEY) { - text = await scrapWithScrapingBee(url, "networkidle2"); + const response = await scrapWithScrapingBee(url, "networkidle2"); + scraperResponse.text = response.content; + scraperResponse.metadata.pageStatusCode = response.pageStatusCode; + scraperResponse.metadata.pageError = response.pageError; } break; case "fetch": - text = await scrapWithFetch(url); + const response = await scrapWithFetch(url); + scraperResponse.text = response.content; + scraperResponse.metadata.pageStatusCode = response.pageStatusCode; + scraperResponse.metadata.pageError = response.pageError; break; } let customScrapedContent : FireEngineResponse | null = null; // Check for custom scraping conditions - const customScraperResult = await handleCustomScraping(text, url); + const customScraperResult = await handleCustomScraping(scraperResponse.text, url); if (customScraperResult){ switch (customScraperResult.scraper) { @@ -371,23 +390,30 @@ export async function scrapSingleUrl( } break; case "pdf": - customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot } + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url); + customScrapedContent = { html: content, screenshot, pageStatusCode, pageError } break; } } if (customScrapedContent) { - text = customScrapedContent.html; + scraperResponse.text = customScrapedContent.html; screenshot = customScrapedContent.screenshot; } //* TODO: add an optional to return markdown or structured/extracted content - let cleanedHtml = removeUnwantedElements(text, pageOptions); + let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); - return [await parseMarkdown(cleanedHtml), text, screenshot]; + return { + text: await parseMarkdown(cleanedHtml), + html: scraperResponse.text, + screenshot: scraperResponse.screenshot, + pageStatusCode: scraperResponse.metadata.pageStatusCode, + pageError: scraperResponse.metadata.pageError || undefined + }; }; try { - let [text, html, screenshot] = ["", "", ""]; + let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined }; let urlKey = urlToScrap; try { urlKey = new URL(urlToScrap).hostname.replace(/^www\./, ""); @@ -410,7 +436,14 @@ export async function scrapSingleUrl( html = existingHtml; break; } - [text, html, screenshot] = await attemptScraping(urlToScrap, scraper); + + const attempt = await attemptScraping(urlToScrap, scraper); + text = attempt.text ?? ''; + html = attempt.html ?? ''; + screenshot = attempt.screenshot ?? 
''; + pageStatusCode = attempt.pageStatusCode; + pageError = attempt.pageError; + if (text && text.trim().length >= 100) break; const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; if (nextScraperIndex < scrapersInOrder.length) { @@ -435,6 +468,8 @@ export async function scrapSingleUrl( ...metadata, screenshot: screenshot, sourceURL: urlToScrap, + pageStatusCode: pageStatusCode, + pageError: pageError }, }; } else { @@ -442,7 +477,12 @@ export async function scrapSingleUrl( content: text, markdown: text, html: pageOptions.includeHtml ? html : undefined, - metadata: { ...metadata, sourceURL: urlToScrap }, + metadata: { + ...metadata, + sourceURL: urlToScrap, + pageStatusCode: pageStatusCode, + pageError: pageError + }, }; } diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts index e018ffa..53237ef 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts @@ -3,11 +3,13 @@ import * as docxProcessor from "../docxProcessor"; describe("DOCX Processing Module - Integration Test", () => { it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => { delete process.env.LLAMAPARSE_API_KEY; - const docxContent = await docxProcessor.fetchAndProcessDocx( + const { content, pageStatusCode, pageError } = await docxProcessor.fetchAndProcessDocx( "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx" ); - expect(docxContent.trim()).toContain( + expect(content.trim()).toContain( "SERIES A PREFERRED STOCK PURCHASE AGREEMENT" ); + expect(pageStatusCode).toBe(200); + expect(pageError).toBeUndefined(); }); }); diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts index f14c8d4..3438287 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts @@ -3,8 +3,10 @@ import * as pdfProcessor from '../pdfProcessor'; describe('PDF Processing Module - Integration Test', () => { it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => { delete process.env.LLAMAPARSE_API_KEY; - const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf'); - expect(pdfContent.trim()).toEqual("Dummy PDF file"); + const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf'); + expect(content.trim()).toEqual("Dummy PDF file"); + expect(pageStatusCode).toEqual(200); + expect(pageError).toBeUndefined(); }); // We're hitting the LLAMAPARSE rate limit 🫠 diff --git a/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts index 38759f8..a01b8a2 100644 --- a/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts @@ -5,14 +5,14 @@ import path from "path"; import os from "os"; import mammoth from "mammoth"; -export async function fetchAndProcessDocx(url: string): Promise { - const tempFilePath = await downloadDocx(url); +export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; 
pageError: string }> { + const { tempFilePath, pageStatusCode, pageError } = await downloadDocx(url); const content = await processDocxToText(tempFilePath); fs.unlinkSync(tempFilePath); // Clean up the temporary file - return content; + return { content, pageStatusCode, pageError }; } -async function downloadDocx(url: string): Promise { +async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> { const response = await axios({ url, method: "GET", @@ -25,7 +25,7 @@ async function downloadDocx(url: string): Promise { response.data.pipe(writer); return new Promise((resolve, reject) => { - writer.on("finish", () => resolve(tempFilePath)); + writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined })); writer.on("error", reject); }); } diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index ddaf1e8..3f2052c 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -29,6 +29,9 @@ interface Metadata { publishedTime?: string; articleTag?: string; articleSection?: string; + sourceURL?: string; + pageStatusCode?: number; + pageError?: string; } export function extractMetadata(soup: CheerioAPI, url: string): Metadata { @@ -61,6 +64,9 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { let publishedTime: string | null = null; let articleTag: string | null = null; let articleSection: string | null = null; + let sourceURL: string | null = null; + let pageStatusCode: number | null = null; + let pageError: string | null = null; try { title = soup("title").text() || null; @@ -132,5 +138,8 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { ...(publishedTime ? { publishedTime } : {}), ...(articleTag ? { articleTag } : {}), ...(articleSection ? { articleSection } : {}), + ...(sourceURL ? { sourceURL } : {}), + ...(pageStatusCode ? { pageStatusCode } : {}), + ...(pageError ? 
{ pageError } : {}), }; } diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 71984f2..715f30d 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -9,14 +9,14 @@ import os from "os"; dotenv.config(); -export async function fetchAndProcessPdf(url: string): Promise { - const tempFilePath = await downloadPdf(url); - const content = await processPdfToText(tempFilePath); +export async function fetchAndProcessPdf(url: string): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> { + const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url); + const content = await processPdfToText(tempFilePath); fs.unlinkSync(tempFilePath); // Clean up the temporary file - return content; + return { content, pageStatusCode, pageError }; } -async function downloadPdf(url: string): Promise { +async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> { const response = await axios({ url, method: "GET", @@ -29,7 +29,7 @@ async function downloadPdf(url: string): Promise { response.data.pipe(writer); return new Promise((resolve, reject) => { - writer.on("finish", () => resolve(tempFilePath)); + writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined })); writer.on("error", reject); }); } diff --git a/apps/playwright-service/get_error.py b/apps/playwright-service/get_error.py new file mode 100644 index 0000000..a33de5e --- /dev/null +++ b/apps/playwright-service/get_error.py @@ -0,0 +1,63 @@ +def get_error(status_code: int) -> str: + error_messages = { + 300: "Multiple Choices", + 301: "Moved Permanently", + 302: "Found", + 303: "See Other", + 304: "Not Modified", + 305: "Use Proxy", + 307: "Temporary Redirect", + 308: "Permanent Redirect", + 309: "Resume Incomplete", + 310: "Too Many Redirects", + 311: "Unavailable For Legal Reasons", + 312: "Previously Used", + 313: "I'm Used", + 314: "Switch Proxy", + 315: "Temporary Redirect", + 316: "Resume Incomplete", + 317: "Too Many Redirects", + 400: "Bad Request", + 401: "Unauthorized", + 403: "Forbidden", + 404: "Not Found", + 405: "Method Not Allowed", + 406: "Not Acceptable", + 407: "Proxy Authentication Required", + 408: "Request Timeout", + 409: "Conflict", + 410: "Gone", + 411: "Length Required", + 412: "Precondition Failed", + 413: "Payload Too Large", + 414: "URI Too Long", + 415: "Unsupported Media Type", + 416: "Range Not Satisfiable", + 417: "Expectation Failed", + 418: "I'm a teapot", + 421: "Misdirected Request", + 422: "Unprocessable Entity", + 423: "Locked", + 424: "Failed Dependency", + 425: "Too Early", + 426: "Upgrade Required", + 428: "Precondition Required", + 429: "Too Many Requests", + 431: "Request Header Fields Too Large", + 451: "Unavailable For Legal Reasons", + 500: "Internal Server Error", + 501: "Not Implemented", + 502: "Bad Gateway", + 503: "Service Unavailable", + 504: "Gateway Timeout", + 505: "HTTP Version Not Supported", + 506: "Variant Also Negotiates", + 507: "Insufficient Storage", + 508: "Loop Detected", + 510: "Not Extended", + 511: "Network Authentication Required", + 599: "Network Connect Timeout Error" + } + if status_code < 300: + return None + return error_messages.get(status_code, "Unknown Error") diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py index 
8ef7418..bd6b14e 100644 --- a/apps/playwright-service/main.py +++ b/apps/playwright-service/main.py @@ -9,6 +9,7 @@ from fastapi import FastAPI from fastapi.responses import JSONResponse from playwright.async_api import Browser, async_playwright from pydantic import BaseModel +from get_error import get_error PROXY_SERVER = environ.get("PROXY_SERVER", None) PROXY_USERNAME = environ.get("PROXY_USERNAME", None) @@ -73,16 +74,22 @@ async def root(body: UrlModel): if body.headers: await page.set_extra_http_headers(body.headers) - await page.goto( + response = await page.goto( body.url, wait_until="load", timeout=body.timeout, ) + page_status_code = response.status + page_error = get_error(page_status_code) # Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases were "load" / "networkidle" is not enough if body.wait_after_load > 0: await page.wait_for_timeout(body.wait_after_load) page_content = await page.content() await context.close() - json_compatible_item_data = {"content": page_content} - return JSONResponse(content=json_compatible_item_data) + json_compatible_item_data = { + "content": page_content, + "pageStatusCode": page_status_code, + "pageError": page_error + } + return JSONResponse(content=json_compatible_item_data) \ No newline at end of file From 5dd18ca79b9f033ea65a6a5ead02bc15cdb9ed4a Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 14 Jun 2024 09:46:55 -0300 Subject: [PATCH 09/11] fixed edge cases --- .../src/__tests__/e2e_withAuth/index.test.ts | 89 +++++++++++++++++-- apps/api/src/controllers/scrape.ts | 2 +- apps/api/src/controllers/search.ts | 2 +- apps/api/src/scraper/WebScraper/single_url.ts | 26 ++++-- 4 files changed, 104 insertions(+), 15 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 4fd35a8..1e1d5e3 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -164,21 +164,100 @@ describe("E2E Tests for API Routes", () => { // expect(duration).toBeGreaterThanOrEqual(7000); // }, 12000); // 12 seconds timeout - it.concurrent('should return a successful response for a scrape with 404 page', async () => { + it.concurrent('should return a successful response for a scrape with 400 page', async () => { const response = await request(TEST_URL) .post('/v0/scrape') .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) .set('Content-Type', 'application/json') - .send({ url: 'https://mendable.ai/alshdiasuhdasd' }); - await new Promise((r) => setTimeout(r, 6000)); + .send({ url: 'https://httpstat.us/400' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(400); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 401 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/401' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + 
expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(401); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized"); + }, 60000); // 60 seconds + + it.concurrent("should return a successful response for a scrape with 403 page", async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/403' }); + + await new Promise((r) => setTimeout(r, 5000)); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(403); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 404 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/404' }); + await new Promise((r) => setTimeout(r, 5000)); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty('data'); expect(response.body.data).toHaveProperty('content'); expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.content).toContain('Mendable'); expect(response.body.data.metadata.pageStatusCode).toBe(404); - expect(response.body.data.metadata.pageError).toBe("Not Found"); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 405 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/405' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(405); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("method not allowed"); + }, 60000); // 60 seconds + + it.concurrent('should return a successful response for a scrape with 500 page', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/500' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.pageStatusCode).toBe(500); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain("internal server error"); }, 60000); // 60 seconds }); diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 
d5ab1de..ae0de6a 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -61,7 +61,7 @@ export async function scrapeHelper( (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 ); if (filteredDocs.length === 0) { - return { success: true, error: "No page found", returnCode: 200 }; + return { success: true, error: "No page found", returnCode: 200, data: docs[0] }; } let creditsToBeBilled = filteredDocs.length; diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 7474aae..5427d49 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -100,7 +100,7 @@ export async function searchHelper( ); if (filteredDocs.length === 0) { - return { success: true, error: "No page found", returnCode: 200 }; + return { success: true, error: "No page found", returnCode: 200, data: docs }; } const billingResult = await billTeam( diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index f83771e..3f7e789 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -83,7 +83,7 @@ export async function scrapWithFireEngine( console.error( `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` ); - return { html: "", screenshot: "" }; + return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError }; } const contentType = response.headers["content-type"]; @@ -94,7 +94,7 @@ export async function scrapWithFireEngine( const data = response.data; const html = data.content; const screenshot = data.screenshot; - return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.error }; + return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError }; } } catch (error) { if (error.code === 'ECONNABORTED') { @@ -142,7 +142,7 @@ export async function scrapWithScrapingBee( } } catch (error) { console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); - return { content: "" }; + return { content: "", pageStatusCode: error.response.status, pageError: error.response.statusText }; } } @@ -172,7 +172,7 @@ export async function scrapWithPlaywright( console.error( `[Playwright] Error fetching url: ${url} with status: ${response.status}` ); - return { content: "" }; + return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError }; } const contentType = response.headers["content-type"]; @@ -412,8 +412,8 @@ export async function scrapSingleUrl( pageError: scraperResponse.metadata.pageError || undefined }; }; + let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined }; try { - let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined }; let urlKey = urlToScrap; try { urlKey = new URL(urlToScrap).hostname.replace(/^www\./, ""); @@ -441,10 +441,16 @@ export async function scrapSingleUrl( text = attempt.text ?? ''; html = attempt.html ?? ''; screenshot = attempt.screenshot ?? 
''; - pageStatusCode = attempt.pageStatusCode; - pageError = attempt.pageError; + if (attempt.pageStatusCode) { + pageStatusCode = attempt.pageStatusCode; + } + if (attempt.pageError) { + pageError = attempt.pageError; + } + if (text && text.trim().length >= 100) break; + if (pageStatusCode && pageStatusCode == 404) break; const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; if (nextScraperIndex < scrapersInOrder.length) { console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`); @@ -493,7 +499,11 @@ content: "", markdown: "", html: "", - metadata: { sourceURL: urlToScrap }, + metadata: { + sourceURL: urlToScrap, + pageStatusCode: pageStatusCode, + pageError: pageError + }, } as Document; } } From 6963a490f1284d89756ce9f6290b5c654ae14b79 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 14 Jun 2024 10:21:44 -0300 Subject: [PATCH 10/11] Updated version --- apps/python-sdk/firecrawl/__init__.py | 2 +- .../test.cpython-311-pytest-8.2.1.pyc | Bin 0 -> 44947 bytes 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 4e53e77..2fe16ba 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp -__version__ = "0.0.14" +__version__ = "0.0.15" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ba1f1324fe139772739cdae776d127cd5002ca8 GIT binary patch literal 44947
[44947 bytes of base85-encoded binary patch data omitted: compiled pytest cache file test.cpython-311-pytest-8.2.1.pyc]
From 12eedd80177dc85470ae9d6eab2f324cbbda4595 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 14 Jun 2024 11:05:19 -0300 Subject: [PATCH 11/11] Fixed tests' message and updated version --- apps/python-sdk/firecrawl/__init__.py | 2 +- .../firecrawl/__tests__/e2e_withAuth/test.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 2fe16ba..fbb2bdb 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp -__version__ = "0.0.15" +__version__ = "0.0.16" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py index 90a6498..452d498 100644 --- a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py +++ b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py @@ -27,14 +27,14 @@ def test_scrape_url_invalid_api_key(): invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") with pytest.raises(Exception) as excinfo: invalid_app.scrape_url('https://firecrawl.dev') - assert "Failed to scrape URL. Status code: 401" in str(excinfo.value) + assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) def test_blocklisted_url(): blocklisted_url = "https://facebook.com/fake-test" app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) with pytest.raises(Exception) as excinfo: app.scrape_url(blocklisted_url) - assert "Failed to scrape URL. Status code: 403" in str(excinfo.value) + assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
in str(excinfo.value) def test_successful_response_with_valid_preview_token(): app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") @@ -86,14 +86,14 @@ def test_crawl_url_invalid_api_key(): invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") with pytest.raises(Exception) as excinfo: invalid_app.crawl_url('https://firecrawl.dev') - assert "Unexpected error occurred while trying to start crawl job. Status code: 401" in str(excinfo.value) + assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) def test_should_return_error_for_blocklisted_url(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) blocklisted_url = "https://twitter.com/fake-test" with pytest.raises(Exception) as excinfo: app.crawl_url(blocklisted_url) - assert "Unexpected error occurred while trying to start crawl job. Status code: 403" in str(excinfo.value) + assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) def test_crawl_url_wait_for_completion_e2e(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) @@ -114,7 +114,7 @@ def test_crawl_url_with_idempotency_key_e2e(): with pytest.raises(Exception) as excinfo: app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey) - assert "Failed to start crawl job. Status code: 409. Error: Idempotency key already used" in str(excinfo.value) + assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value) def test_check_crawl_status_e2e(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) @@ -141,7 +141,7 @@ def test_search_invalid_api_key(): invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") with pytest.raises(Exception) as excinfo: invalid_app.search("test query") - assert "Failed to search. Status code: 401" in str(excinfo.value) + assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) def test_llm_extraction(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
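The get_error helper shown earlier maps a numeric status to a reason phrase: it returns None for anything below 300, a mapped phrase for known codes, and a generic fallback otherwise. A short sketch of that contract, assuming the module is importable the same way main.py imports it:

    from get_error import get_error

    assert get_error(200) is None             # success codes carry no page error
    assert get_error(404) == "Not Found"
    assert get_error(520) == "Unknown Error"  # unmapped codes fall back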
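With the playwright service now reporting pageStatusCode and pageError next to content, a client can surface both fields without re-fetching the page. A hedged sketch: the host, port, and route are assumptions rather than values taken from this series, and httpstat.us stands in as a status-code echo, as in the e2e tests above:

    import requests

    resp = requests.post(
        "http://localhost:3000/",  # assumed local deployment of the service
        json={"url": "https://httpstat.us/404", "wait_after_load": 0, "timeout": 15000},
    )
    body = resp.json()
    print(body["pageStatusCode"], body["pageError"])  # expected: 404 Not Found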