diff --git a/apps/api/openapi.json b/apps/api/openapi.json
index d283376..17b3677 100644
--- a/apps/api/openapi.json
+++ b/apps/api/openapi.json
@@ -61,6 +61,13 @@
           "description": "Wait x amount of milliseconds for the page to load to fetch content",
           "default": 0
         },
+        "removeTags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+        },
         "headers": {
           "type": "object",
           "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
@@ -194,6 +201,11 @@
           "type": "integer",
           "description": "Maximum number of pages to crawl",
           "default": 10000
+        },
+        "allowBackwardCrawling": {
+          "type": "boolean",
+          "description": "Allow backward crawling (crawl from the base URL to the previous URLs)",
+          "default": false
         }
       }
     },
@@ -219,6 +231,13 @@
           "type": "object",
           "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
         },
+        "removeTags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+        },
         "replaceAllPathsWithAbsolutePaths": {
           "type": "boolean",
           "description": "Replace all relative paths with absolute paths for images and links",
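For context, the `removeTags` field added above travels inside `pageOptions` on a scrape request. A minimal sketch, assuming only what the e2e tests in this diff exercise (the `/v0/scrape` route, bearer auth, and the array form of `removeTags`); the `fetch` wrapper itself is illustrative:

```typescript
// Minimal sketch of the removeTags option; the route and payload shape follow
// the e2e tests in this diff, while the fetch wrapper itself is illustrative.
const res = await fetch("https://api.firecrawl.dev/v0/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://www.scrapethissite.com/",
    // Either an array of selectors or a comma-separated string works:
    //   removeTags: "script, .ad, #footer"
    pageOptions: { removeTags: [".nav", "#footer", "strong"] },
  }),
});
const { data } = await res.json();
console.log(data.content); // content with .nav, #footer, and <strong> stripped
```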
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 1e1d5e3..9149c01 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -143,6 +143,55 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data.metadata.pageError).toBeUndefined();
     }, 60000); // 60 seconds
 
+    it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
+    }, 60000); // 60 seconds
+
+    it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
+      const responseWithoutRemoveTags = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/" });
+      expect(responseWithoutRemoveTags.statusCode).toBe(200);
+      expect(responseWithoutRemoveTags.body).toHaveProperty("data");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
+      expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
+      expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
+      expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
+
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data).not.toHaveProperty("html");
+      expect(response.body.data.content).toContain("Scrape This Site");
+      expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
+      expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
+      expect(response.body.data.content).not.toContain("web scraping"); // strong
+    }, 30000); // 30 seconds timeout
+
     // TODO: add this test back once we nail the waitFor option to be more deterministic
     // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
     //   const startTime = Date.now();
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 58d01e2..8fd876d 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -55,8 +55,16 @@ export async function crawlController(req: Request, res: Response) {
     }
 
     const mode = req.body.mode ?? "crawl";
-    const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+
+    const crawlerOptions = req.body.crawlerOptions ?? {
+      allowBackwardCrawling: false
+    };
+    const pageOptions = req.body.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      removeTags: [],
+      parsePDF: true
+    };
 
     if (mode === "single_urls" && !url.includes(",")) {
       try {
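A companion sketch for the crawler flag, assuming the `/v0/crawl` route this controller serves and the `crawlerOptions` passthrough shown above; the status route in the trailing comment is assumed from the SDK's `check_crawl_status` helper:

```typescript
// Sketch: start a crawl that may also follow links "backward", i.e. from the
// starting URL toward parent paths, per the new allowBackwardCrawling flag.
const res = await fetch("https://api.firecrawl.dev/v0/crawl", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://example.com/docs/getting-started",
    crawlerOptions: {
      limit: 100,                  // cap the number of pages, as in the schema
      allowBackwardCrawling: true, // defaults to false when omitted
    },
  }),
});
const { jobId } = await res.json(); // then poll the crawl status endpoint
```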
{ mode: "markdown" } diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 5427d49..b555197 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -85,6 +85,7 @@ export async function searchHelper( onlyMainContent: pageOptions?.onlyMainContent ?? true, fetchPageContent: pageOptions?.fetchPageContent ?? true, includeHtml: pageOptions?.includeHtml ?? false, + removeTags: pageOptions?.removeTags ?? [], fallback: false, }, }); @@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) { includeHtml: false, onlyMainContent: true, fetchPageContent: true, + removeTags: [], fallback: false, }; const origin = req.body.origin ?? "api"; diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 21bb6ea..12d8c36 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -19,6 +19,8 @@ export type PageOptions = { screenshot?: boolean; headers?: Record; replaceAllPathsWithAbsolutePaths?: boolean; + parsePDF?: boolean; + removeTags?: string | string[]; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts index 8108a9e..081150b 100644 --- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts +++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts @@ -1,5 +1,3 @@ -import { fetchAndProcessPdf } from "../utils/pdfProcessor"; - export async function handleCustomScraping( text: string, url: string diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 0446ab7..3fbc6d1 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -280,7 +280,7 @@ export class WebScraperDataProvider { private async fetchPdfDocuments(pdfLinks: string[]): Promise { return Promise.all( pdfLinks.map(async (pdfLink) => { - const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink); + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF); return { content: content, metadata: { sourceURL: pdfLink, pageStatusCode, pageError }, @@ -475,7 +475,13 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false }; + this.pageOptions = options.pageOptions ?? { + onlyMainContent: false, + includeHtml: false, + replaceAllPathsWithAbsolutePaths: false, + parsePDF: true, + removeTags: [] + }; this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breaking everything. 
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 3f7e789..1ba2832 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
   url: string,
   waitFor: number = 0,
   screenshot: boolean = false,
-  pageOptions: { scrollXPaths?: string[] } = {},
+  pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
   headers?: Record<string, string>,
   options?: any
 ): Promise<FireEngineResponse> {
@@ -88,7 +88,7 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url);
+      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
       return { html: content, screenshot: "", pageStatusCode, pageError };
     } else {
       const data = response.data;
@@ -109,7 +109,8 @@
 export async function scrapWithScrapingBee(
   url: string,
   wait_browser: string = "domcontentloaded",
-  timeout: number = universalTimeout
+  timeout: number = universalTimeout,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
@@ -129,7 +130,8 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return await fetchAndProcessPdf(url);
+      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+
     } else {
       let text = "";
       try {
@@ -149,7 +151,8 @@
 export async function scrapWithPlaywright(
   url: string,
   waitFor: number = 0,
-  headers?: Record<string, string>
+  headers?: Record<string, string>,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
   try {
     const reqParams = await generateRequestParams(url);
@@ -177,7 +180,7 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return await fetchAndProcessPdf(url);
+      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const textData = response.data;
       try {
@@ -199,7 +202,10 @@
   }
 }
 
-export async function scrapWithFetch(url: string): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+export async function scrapWithFetch(
+  url: string,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
   try {
     const response = await axios.get(url, {
       headers: {
@@ -218,7 +224,7 @@
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return await fetchAndProcessPdf(url);
+      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const text = response.data;
       return { content: text, pageStatusCode: 200 };
@@ -309,6 +315,19 @@ export async function scrapSingleUrl(
   const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
     const soup = cheerio.load(html);
     soup("script, style, iframe, noscript, meta, head").remove();
+
+    if (pageOptions.removeTags) {
+      if (typeof pageOptions.removeTags === 'string') {
+        pageOptions.removeTags.split(',').forEach((tag) => {
+          soup(tag.trim()).remove();
+        });
+      } else if (Array.isArray(pageOptions.removeTags)) {
+        pageOptions.removeTags.forEach((tag) => {
+          soup(tag).remove();
+        });
+      }
+    }
+
     if (pageOptions.onlyMainContent) {
       // remove any other tags that are not in the main content
       excludeNonMainTags.forEach((tag) => {
@@ -390,7 +409,7 @@
         }
         break;
       case "pdf":
-        const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url);
+        const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF);
         customScrapedContent = { html: content, screenshot, pageStatusCode, pageError }
         break;
     }
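The selector handling added to `removeUnwantedElements` accepts either form of `removeTags`. A standalone sketch of the same logic (same cheerio calls as the hunk above; the `stripSelectors` helper name is illustrative):

```typescript
import * as cheerio from "cheerio";

// Standalone sketch of the removeTags handling: a comma-separated string is
// split into individual selectors, while an array is applied one by one.
// Anything cheerio can select (tags, .classes, #ids) can be removed.
function stripSelectors(html: string, removeTags?: string | string[]): string {
  const $ = cheerio.load(html);
  if (typeof removeTags === "string") {
    removeTags.split(",").forEach((tag) => $(tag.trim()).remove());
  } else if (Array.isArray(removeTags)) {
    removeTags.forEach((tag) => $(tag).remove());
  }
  return $.html();
}

// "script, .ad, #footer" and ["script", ".ad", "#footer"] behave the same:
stripSelectors("<div id='footer'>x</div><p>keep</p>", "script, .ad, #footer");
```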
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
index 3438287..55930f2 100644
--- a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
@@ -3,7 +3,7 @@ import * as pdfProcessor from '../pdfProcessor';
 describe('PDF Processing Module - Integration Test', () => {
   it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
     delete process.env.LLAMAPARSE_API_KEY;
-    const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
+    const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
     expect(content.trim()).toEqual("Dummy PDF file");
     expect(pageStatusCode).toEqual(200);
     expect(pageError).toBeUndefined();
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index 715f30d..1a67d60 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -9,9 +9,9 @@ import os from "os";
 
 dotenv.config();
 
-export async function fetchAndProcessPdf(url: string): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
   const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
-  const content = await processPdfToText(tempFilePath);
+  const content = await processPdfToText(tempFilePath, parsePDF);
   fs.unlinkSync(tempFilePath); // Clean up the temporary file
   return { content, pageStatusCode, pageError };
 }
@@ -34,10 +34,10 @@
   });
 }
 
-export async function processPdfToText(filePath: string): Promise<string> {
+export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
   let content = "";
 
-  if (process.env.LLAMAPARSE_API_KEY) {
+  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
     const apiKey = process.env.LLAMAPARSE_API_KEY;
     const headers = {
       Authorization: `Bearer ${apiKey}`,
@@ -95,8 +95,10 @@
       console.error("Error processing pdf document w/ LlamaIndex(2)");
       content = await processPdf(filePath);
     }
-  } else {
+  } else if (parsePDF) {
     content = await processPdf(filePath);
+  } else {
+    content = fs.readFileSync(filePath, "utf-8");
  }
  return content;
}
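Downstream callers now pass the flag explicitly. An illustrative pair of calls (the URL and the expected raw-PDF marker mirror the new e2e test; the import path is assumed):

```typescript
import { fetchAndProcessPdf } from "./pdfProcessor"; // path is illustrative

// parsePDF: true (the default the controllers now inject) extracts text,
// via LlamaParse when LLAMAPARSE_API_KEY is set, otherwise the local parser.
const parsed = await fetchAndProcessPdf(
  "https://arxiv.org/pdf/astro-ph/9301001.pdf",
  true
);

// parsePDF: false skips extraction and returns the file contents as-is,
// which is why the e2e test can assert on raw markers like
// '/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj'.
const raw = await fetchAndProcessPdf(
  "https://arxiv.org/pdf/astro-ph/9301001.pdf",
  false
);
```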
diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index ecb017f..fbb2bdb 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -1,3 +1,57 @@
+"""
+This is the Firecrawl package.
+
+This package provides a Python SDK for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs.
+
+For more information visit https://github.com/firecrawl/
+"""
+
+import logging
+import os
+
 from .firecrawl import FirecrawlApp
 
-__version__ = "0.0.14"
+__version__ = "0.0.16"
+
+# Define the logger for the Firecrawl project
+logger: logging.Logger = logging.getLogger("firecrawl")
+
+
+def _basic_config() -> None:
+    """Set up basic configuration for logging with a specific format and date format."""
+    try:
+        logging.basicConfig(
+            format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+        )
+    except Exception as e:
+        logger.error("Failed to configure logging: %s", e)
+
+
+def setup_logging() -> None:
+    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
+    env = os.environ.get(
+        "FIRECRAWL_LOGGING_LEVEL", "INFO"
+    ).upper()  # Default to 'INFO' level
+    _basic_config()
+
+    if env == "DEBUG":
+        logger.setLevel(logging.DEBUG)
+    elif env == "INFO":
+        logger.setLevel(logging.INFO)
+    elif env == "WARNING":
+        logger.setLevel(logging.WARNING)
+    elif env == "ERROR":
+        logger.setLevel(logging.ERROR)
+    elif env == "CRITICAL":
+        logger.setLevel(logging.CRITICAL)
+    else:
+        logger.setLevel(logging.INFO)
+        logger.warning("Unknown logging level: %s, defaulting to INFO", env)
+
+
+# Initialize logging configuration when the module is imported
+setup_logging()
+logger.debug("Debugging logger setup")
diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc
new file mode 100644
index 0000000..5ba1f13
Binary files /dev/null and b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc differ
diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
index 90a6498..452d498 100644
--- a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
+++ b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
@@ -27,14 +27,14 @@ def test_scrape_url_invalid_api_key():
     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
     with pytest.raises(Exception) as excinfo:
         invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Failed to scrape URL. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
 def test_blocklisted_url():
     blocklisted_url = "https://facebook.com/fake-test"
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
     with pytest.raises(Exception) as excinfo:
         app.scrape_url(blocklisted_url)
-    assert "Failed to scrape URL. Status code: 403" in str(excinfo.value)
+    assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
 
 def test_successful_response_with_valid_preview_token():
     app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
@@ -86,14 +86,14 @@ def test_crawl_url_invalid_api_key():
     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
     with pytest.raises(Exception) as excinfo:
         invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unexpected error occurred while trying to start crawl job. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
 def test_should_return_error_for_blocklisted_url():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
     blocklisted_url = "https://twitter.com/fake-test"
     with pytest.raises(Exception) as excinfo:
         app.crawl_url(blocklisted_url)
-    assert "Unexpected error occurred while trying to start crawl job. Status code: 403" in str(excinfo.value)
+    assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
 
 def test_crawl_url_wait_for_completion_e2e():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -114,7 +114,7 @@ def test_crawl_url_with_idempotency_key_e2e():
     with pytest.raises(Exception) as excinfo:
         app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert "Failed to start crawl job. Status code: 409. Error: Idempotency key already used" in str(excinfo.value)
+    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
 
 def test_check_crawl_status_e2e():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -141,7 +141,7 @@ def test_search_invalid_api_key():
     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
     with pytest.raises(Exception) as excinfo:
         invalid_app.search("test query")
-    assert "Failed to search. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
 def test_llm_extraction():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
""" - +import logging import os import time from typing import Any, Dict, Optional import requests +logger : logging.Logger = logging.getLogger("firecrawl") class FirecrawlApp: """ @@ -28,8 +29,15 @@ class FirecrawlApp: def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') if self.api_key is None: + logger.warning("No API key provided") raise ValueError('No API key provided') + else: + logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key) + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') + if self.api_url != 'https://api.firecrawl.dev': + logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url) + def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: """ Scrape the specified URL using the Firecrawl API. @@ -45,10 +53,8 @@ class FirecrawlApp: Exception: If the scrape request fails. """ - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}' - } + headers = self._prepare_headers() + # Prepare the base scrape parameters with the URL scrape_params = {'url': url} @@ -81,13 +87,10 @@ class FirecrawlApp: return response['data'] else: raise Exception(f'Failed to scrape URL. Error: {response["error"]}') - elif response.status_code in [402, 408, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') else: - raise Exception(f'Failed to scrape URL. Status code: {response.status_code}') + self._handle_error(response, 'scrape URL') - def search(self, query, params=None): + def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any: """ Perform a search using the Firecrawl API. @@ -101,10 +104,7 @@ class FirecrawlApp: Raises: Exception: If the search request fails. """ - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}' - } + headers = self._prepare_headers() json_data = {'query': query} if params: json_data.update(params) @@ -121,13 +121,14 @@ class FirecrawlApp: else: raise Exception(f'Failed to search. Error: {response["error"]}') - elif response.status_code in [402, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}') else: - raise Exception(f'Failed to search. Status code: {response.status_code}') + self._handle_error(response, 'search') - def crawl_url(self, url, params=None, wait_until_done=True, poll_interval=2, idempotency_key=None): + def crawl_url(self, url: str, + params: Optional[Dict[str, Any]] = None, + wait_until_done: bool = True, + poll_interval: int = 2, + idempotency_key: Optional[str] = None) -> Any: """ Initiate a crawl job for the specified URL using the Firecrawl API. @@ -158,7 +159,7 @@ class FirecrawlApp: else: self._handle_error(response, 'start crawl job') - def check_crawl_status(self, job_id): + def check_crawl_status(self, job_id: str) -> Any: """ Check the status of a crawl job using the Firecrawl API. @@ -178,7 +179,7 @@ class FirecrawlApp: else: self._handle_error(response, 'check crawl status') - def _prepare_headers(self, idempotency_key=None): + def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]: """ Prepare the headers for API requests. 
@@ -200,7 +201,11 @@
             'Authorization': f'Bearer {self.api_key}',
         }
 
-    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+    def _post_request(self, url: str,
+                      data: Dict[str, Any],
+                      headers: Dict[str, str],
+                      retries: int = 3,
+                      backoff_factor: float = 0.5) -> requests.Response:
         """
         Make a POST request with retries.
@@ -225,7 +230,10 @@
                 return response
         return response
 
-    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+    def _get_request(self, url: str,
+                     headers: Dict[str, str],
+                     retries: int = 3,
+                     backoff_factor: float = 0.5) -> requests.Response:
         """
         Make a GET request with retries.
@@ -249,7 +257,7 @@
                 return response
         return response
 
-    def _monitor_job_status(self, job_id, headers, poll_interval):
+    def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any:
         """
         Monitor the status of a crawl job until completion.
@@ -281,7 +289,7 @@
             else:
                 self._handle_error(status_response, 'check crawl status')
 
-    def _handle_error(self, response, action):
+    def _handle_error(self, response: requests.Response, action: str) -> None:
         """
         Handle errors from API responses.
@@ -292,8 +300,19 @@
         Raises:
             Exception: An exception with a message containing the status code and error details from the response.
         """
-        if response.status_code in [402, 408, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
+        error_message = response.json().get('error', 'No additional error details provided.')
+
+        if response.status_code == 402:
+            message = f"Payment Required: Failed to {action}. {error_message}"
+        elif response.status_code == 408:
+            message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}"
+        elif response.status_code == 409:
+            message = f"Conflict: Failed to {action} due to a conflict. {error_message}"
+        elif response.status_code == 500:
+            message = f"Internal Server Error: Failed to {action}. {error_message}"
         else:
-            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
+            message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}"
+
+        # Raise an HTTPError with the custom message and attach the response
+        raise requests.exceptions.HTTPError(message, response=response)
+
\ No newline at end of file