Merge pull request #191 from mattjoyce/main

Various PyPi Metadata
2024-05-26 18:15:19 -07:00 · 2024-05-26 18:15:19 -07:00 · a9e45cdb15
commit a9e45cdb15
parent 1bbfb98d7e 1de53cc4d0
6 changed files with 198 additions and 9 deletions
--- a/apps/python-sdk/dist/firecrawl-py-0.0.10.tar.gz
+++ b/apps/python-sdk/dist/firecrawl-py-0.0.10.tar.gz
--- a/apps/python-sdk/dist/firecrawl-py-0.0.11.tar.gz
+++ b/apps/python-sdk/dist/firecrawl-py-0.0.11.tar.gz
--- a/apps/python-sdk/dist/firecrawl_py-0.0.10-py3-none-any.whl
+++ b/apps/python-sdk/dist/firecrawl_py-0.0.10-py3-none-any.whl
--- a/apps/python-sdk/dist/firecrawl_py-0.0.11-py3-none-any.whl
+++ b/apps/python-sdk/dist/firecrawl_py-0.0.11-py3-none-any.whl
--- a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO
+++ b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO
@@ -1,7 +1,160 @@
 Metadata-Version: 2.1
 Name: firecrawl-py
-Version: 0.0.10
+Version: 0.0.11
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/mendableai/firecrawl
 Author: Mendable.ai
 Author-email: nick@mendable.ai
+License: GNU General Public License v3 (GPLv3)
+Project-URL: Documentation, https://docs.firecrawl.dev
+Project-URL: Source, https://github.com/mendableai/firecrawl
+Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues
+Keywords: SDK API firecrawl
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Web Environment
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Topic :: Internet
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
+Classifier: Topic :: Software Development
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing
+Classifier: Topic :: Text Processing :: Indexing
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+
+# Firecrawl Python SDK
+
+The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
+
+## Installation
+
+To install the Firecrawl Python SDK, you can use pip:
+
+```bash
+pip install firecrawl-py
+```
+
+## Usage
+
+1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
+2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
+
+
+Here's an example of how to use the SDK:
+
+```python
+from firecrawl import FirecrawlApp
+
+# Initialize the FirecrawlApp with your API key
+app = FirecrawlApp(api_key='your_api_key')
+
+# Scrape a single URL
+url = 'https://mendable.ai'
+scraped_data = app.scrape_url(url)
+
+# Crawl a website
+crawl_url = 'https://mendable.ai'
+params = {
+    'pageOptions': {
+        'onlyMainContent': True
+    }
+}
+crawl_result = app.crawl_url(crawl_url, params=params)
+```
+
+### Scraping a URL
+
+To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
+
+```python
+url = 'https://example.com'
+scraped_data = app.scrape_url(url)
+```
+### Extracting structured data from a URL
+
+With LLM extraction, you can easily extract structured data from any URL. We also support pydantic schemas to make this easier. Here is how to use it:
+
+```python
+class ArticleSchema(BaseModel):
+    title: str
+    points: int 
+    by: str
+    commentsURL: str
+
+class TopArticlesSchema(BaseModel):
+    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
+
+data = app.scrape_url('https://news.ycombinator.com', {
+    'extractorOptions': {
+        'extractionSchema': TopArticlesSchema.model_json_schema(),
+        'mode': 'llm-extraction'
+    },
+    'pageOptions':{
+        'onlyMainContent': True
+    }
+})
+print(data["llm_extraction"])
+```
+
+### Search for a query
+
+The `search` method searches the web, gets the most relevant results, scrapes each page, and returns the markdown.
+
+```python
+query = 'what is mendable?'
+search_result = app.search(query)
+```
+
+### Crawling a Website
+
+To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
+
+The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job until it is completed or the specified `timeout` (in seconds) is reached. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.
+
+```python
+crawl_url = 'https://example.com'
+params = {
+    'crawlerOptions': {
+        'excludes': ['blog/*'],
+        'includes': [], # leave empty for all pages
+        'limit': 1000,
+    },
+    'pageOptions': {
+        'onlyMainContent': True
+    }
+}
+crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5)
+```
+
+If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
+
+### Checking Crawl Status
+
+To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
+
+```python
+job_id = crawl_result['jobId']
+status = app.check_crawl_status(job_id)
+```
+
+## Error Handling
+
+The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
+
+## Contributing
+
+Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
+
+## License
+
+The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).
--- a/apps/python-sdk/setup.py
+++ b/apps/python-sdk/setup.py
@@ -1,14 +1,50 @@
-from setuptools import setup, find_packages
+from pathlib import Path
+
+from setuptools import find_packages, setup
+
+this_directory = Path(__file__).parent
+long_description_content = (this_directory / "README.md").read_text()

 setup(
-    name='firecrawl-py',
-    version='0.0.10',
-    url='https://github.com/mendableai/firecrawl',
-    author='Mendable.ai',
-    author_email='nick@mendable.ai',
-    description='Python SDK for Firecrawl API',
+    name="firecrawl-py",
+    version="0.0.11",
+    url="https://github.com/mendableai/firecrawl",
+    author="Mendable.ai",
+    author_email="nick@mendable.ai",
+    description="Python SDK for Firecrawl API",
+    long_description=long_description_content,
+    long_description_content_type="text/markdown",
    packages=find_packages(),
    install_requires=[
-        'requests',
+        "requests",
    ],
+    python_requires='>=3.8',
+    classifiers=[
+        "Development Status :: 5 - Production/Stable",
+        "Environment :: Web Environment",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
+        "Natural Language :: English",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Topic :: Internet",
+        "Topic :: Internet :: WWW/HTTP",
+        "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
+        "Topic :: Software Development",
+        "Topic :: Software Development :: Libraries",
+        "Topic :: Software Development :: Libraries :: Python Modules",
+        "Topic :: Text Processing",
+        "Topic :: Text Processing :: Indexing",
+    ],    
+    keywords="SDK API firecrawl",
+    project_urls={
+        "Documentation": "https://docs.firecrawl.dev",
+        "Source": "https://github.com/mendableai/firecrawl",
+        "Tracker": "https://github.com/mendableai/firecrawl/issues",
+    },
+    license="GNU General Public License v3 (GPLv3)",
 )