diff --git a/README.md b/README.md index c48ef10..256e2bd 100644 --- a/README.md +++ b/README.md @@ -180,6 +180,15 @@ url = 'https://example.com' scraped_data = app.scrape_url(url) ``` +### Search for a query + +Performs a web search, retrieves the top results, extracts data from each page, and returns their markdown. + +```python +query = 'what is mendable?' +search_result = app.search(query) +``` + ## Contributing We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request. diff --git a/apps/api/openapi.json b/apps/api/openapi.json index dd325fa..7861f32 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -373,33 +373,36 @@ "type": "boolean" }, "data": { - "type": "object", - "properties": { - "url": { - "type": "string" - }, - "markdown": { - "type": "string" - }, - "content": { - "type": "string" - }, - "metadata": { - "type": "object", - "properties": { - "title": { - "type": "string" - }, - "description": { - "type": "string" - }, - "language": { - "type": "string", - "nullable": true - }, - "sourceURL": { - "type": "string", - "format": "uri" + "type": "array", + "items": { + "type": "object", + "properties": { + "url": { + "type": "string" + }, + "markdown": { + "type": "string" + }, + "content": { + "type": "string" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + } } } } diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index d3b66aa..88cbf81 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -2,6 +2,9 @@ import { SearchResult } from "../../src/lib/entities"; import { google_search } from "./googlesearch"; import { serper_search } from "./serper"; + + + export async function search({ query, advanced = false, diff --git 
a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/index.js index 1b23bb5..9d8237b 100644 --- a/apps/js-sdk/firecrawl/build/index.js +++ b/apps/js-sdk/firecrawl/build/index.js @@ -61,6 +61,43 @@ export default class FirecrawlApp { return { success: false, error: 'Internal server error.' }; }); } + /** + * Searches for a query using the Firecrawl API by POSTing `{ query, ...params }` to the v0 search endpoint. + * @param {string} query - The query to search for. + * @param {Params | null} params - Additional fields merged into the request body; a `query` key in params overrides the query argument. + * @returns {Promise<SearchResponse>} The response body when the status is 200 and `success` is truthy. A failed request or a falsy `success` is rethrown as an Error; other statuses are passed to this.handleError. + */ + search(query_1) { + return __awaiter(this, arguments, void 0, function* (query, params = null) { + const headers = { + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${this.apiKey}`, + }; + let jsonData = { query }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield axios.post('https://api.firecrawl.dev/v0/search', jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to search. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, 'search'); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: 'Internal server error.' }; + }); + } /** * Initiates a crawl job for a URL using the Firecrawl API. * @param {string} url - The URL to crawl. 
diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index a493dab..9a3e650 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.14", + "version": "0.0.15", "description": "JavaScript SDK for Firecrawl API", "main": "build/index.js", "types": "types/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 12bb49f..aea15f8 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -23,6 +23,14 @@ export interface ScrapeResponse { error?: string; } +/** + * Response interface for searching operations: `success` flags the outcome, `data` carries the payload and `error` the failure message. + */ +export interface SearchResponse { + success: boolean; + data?: any; + error?: string; +} /** * Response interface for crawling operations. */ @@ -94,6 +102,39 @@ export default class FirecrawlApp { return { success: false, error: 'Internal server error.' }; } + /** + * Searches for a query using the Firecrawl API by POSTing `{ query, ...params }` to the v0 search endpoint. + * @param {string} query - The query to search for. + * @param {Params | null} params - Additional fields merged into the request body; a `query` key in params overrides the query argument. + * @returns {Promise<SearchResponse>} The response body when the status is 200 and `success` is truthy. A failed request or a falsy `success` is rethrown as an Error; other statuses are passed to this.handleError. + */ + async search(query: string, params: Params | null = null): Promise<SearchResponse> { + const headers: AxiosRequestHeaders = { + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${this.apiKey}`, + } as AxiosRequestHeaders; + let jsonData: Params = { query }; + if (params) { + jsonData = { ...jsonData, ...params }; + } + try { + const response: AxiosResponse = await axios.post('https://api.firecrawl.dev/v0/search', jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } else { + throw new Error(`Failed to search. 
Error: ${responseData.error}`); + } + } else { + this.handleError(response, 'search'); + } + } catch (error: any) { + throw new Error(error.message); + } + return { success: false, error: 'Internal server error.' }; + } + /** * Initiates a crawl job for a URL using the Firecrawl API. * @param {string} url - The URL to crawl. diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts index be960f7..7f79d64 100644 --- a/apps/js-sdk/firecrawl/types/index.d.ts +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -19,6 +19,14 @@ export interface ScrapeResponse { data?: any; error?: string; } +/** + * Response interface for searching operations: `success` flags the outcome, `data` carries the payload and `error` the failure message. + */ +export interface SearchResponse { + success: boolean; + data?: any; + error?: string; +} /** * Response interface for crawling operations. */ @@ -55,6 +63,13 @@ export default class FirecrawlApp { * @returns {Promise} The response from the scrape operation. */ scrapeUrl(url: string, params?: Params | null): Promise; + /** + * Searches for a query using the Firecrawl API. + * @param {string} query - The query to search for. + * @param {Params | null} params - Additional fields merged into the request body. + * @returns {Promise<SearchResponse>} The response from the search operation. + */ + search(query: string, params?: Params | null): Promise<SearchResponse>; /** * Initiates a crawl job for a URL using the Firecrawl API. * @param {string} url - The URL to crawl. 
diff --git a/apps/js-sdk/package-lock.json b/apps/js-sdk/package-lock.json index a73272f..363f301 100644 --- a/apps/js-sdk/package-lock.json +++ b/apps/js-sdk/package-lock.json @@ -9,14 +9,14 @@ "version": "1.0.0", "license": "ISC", "dependencies": { - "@mendable/firecrawl-js": "^0.0.8", + "@mendable/firecrawl-js": "^0.0.15", "axios": "^1.6.8" } }, "node_modules/@mendable/firecrawl-js": { - "version": "0.0.8", - "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.8.tgz", - "integrity": "sha512-dD7eA5X6UT8CM3z7qCqHgA4YbCsdwmmlaT/L0/ozM6gGvb0PnJMoB+e51+n4lAW8mxXOvHGbq9nrgBT1wEhhhw==", + "version": "0.0.15", + "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.15.tgz", + "integrity": "sha512-e3iCCrLIiEh+jEDerGV9Uhdkn8ymo+sG+k3osCwPg51xW1xUdAnmlcHrcJoR43RvKXdvD/lqoxg8odUEsqyH+w==", "dependencies": { "axios": "^1.6.8", "dotenv": "^16.4.5" diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index 9bb5c4f..563e1e3 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -11,7 +11,7 @@ "author": "", "license": "ISC", "dependencies": { - "@mendable/firecrawl-js": "^0.0.8", + "@mendable/firecrawl-js": "^0.0.15", "axios": "^1.6.8" } } diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py index b4b83de..7a6e620 100644 --- a/apps/playwright-service/main.py +++ b/apps/playwright-service/main.py @@ -1,29 +1,48 @@ -from fastapi import FastAPI, Response -from playwright.async_api import async_playwright -import os +from typing import Optional +from fastapi import FastAPI +from playwright.async_api import async_playwright, Browser, Playwright from fastapi.responses import JSONResponse from pydantic import BaseModel + app = FastAPI() -from pydantic import BaseModel class UrlModel(BaseModel): url: str -@app.post("/html") # Kept as POST to accept body parameters -async def root(body: UrlModel): # Using Pydantic model for request body - async with async_playwright() as p: - browser = await p.chromium.launch() - 
context = await browser.new_context() - page = await context.new_page() +# Process-wide Playwright driver and Chromium browser, created once at startup and reused by every request. +playwright_driver: Optional[Playwright] = None +browser: Optional[Browser] = None - await page.goto(body.url) # Adjusted to use the url from the request body model - page_content = await page.content() # Get the HTML content of the page - await context.close() - await browser.close() +@app.on_event("startup") +async def startup_event(): + """Start Playwright and launch the shared Chromium browser.""" + global browser, playwright_driver + playwright_driver = await async_playwright().start() + browser = await playwright_driver.chromium.launch() - json_compatible_item_data = {"content": page_content} - return JSONResponse(content=json_compatible_item_data) - + +@app.on_event("shutdown") +async def shutdown_event(): + """Close the shared browser and stop the Playwright driver, skipping whichever was never created.""" + if browser is not None: + await browser.close() + if playwright_driver is not None: + await playwright_driver.stop() + + +@app.post("/html") +async def root(body: UrlModel): + """Load body.url in a fresh browser context and return the page HTML as {"content": ...}.""" + context = await browser.new_context() + try: + page = await context.new_page() + await page.goto(body.url) + page_content = await page.content() + finally: + # Always release the context so a failed navigation does not leak it in the long-lived browser. + await context.close() + json_compatible_item_data = {"content": page_content} + return JSONResponse(content=json_compatible_item_data) diff --git a/apps/python-sdk/README.md b/apps/python-sdk/README.md index 0a80202..02ad307 100644 --- a/apps/python-sdk/README.md +++ b/apps/python-sdk/README.md @@ -47,6 +47,15 @@ url = 'https://example.com' scraped_data = app.scrape_url(url) ``` +### Search for a query + +Used to search the web, get the most relevant results, scrape each page and return the markdown. + +```python +query = 'what is mendable?' +search_result = app.search(query) +``` + ### Crawling a Website To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. 
diff --git a/apps/python-sdk/build/lib/firecrawl/firecrawl.py b/apps/python-sdk/build/lib/firecrawl/firecrawl.py index f1f5e6e..ef3eb53 100644 --- a/apps/python-sdk/build/lib/firecrawl/firecrawl.py +++ b/apps/python-sdk/build/lib/firecrawl/firecrawl.py @@ -32,6 +32,44 @@ class FirecrawlApp: raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') else: raise Exception(f'Failed to scrape URL. Status code: {response.status_code}') + + def search(self, query, params=None): + """Search the web through the Firecrawl API. + + Args: + query: The search query, sent as the 'query' field of the request body. + params: Optional dict of extra fields merged into the request body. + + Returns: + The 'data' field of the API response. + + Raises: + Exception: If the API reports failure or responds with a non-200 status. + """ + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {self.api_key}' + } + json_data = {'query': query} + if params: + json_data.update(params) + response = requests.post( + 'https://api.firecrawl.dev/v0/search', + headers=headers, + json=json_data + ) + if response.status_code == 200: + payload = response.json() + if payload.get('success'): + return payload['data'] + else: + raise Exception(f'Failed to search. Error: {payload.get("error", "Unknown error occurred")}') + + elif response.status_code in [402, 409, 500]: + error_message = response.json().get('error', 'Unknown error occurred') + raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}') + else: + raise Exception(f'Failed to search. 
Status code: {response.status_code}') def crawl_url(self, url, params=None, wait_until_done=True, timeout=2): headers = self._prepare_headers() diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.5.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.5.tar.gz deleted file mode 100644 index fab06b7..0000000 Binary files a/apps/python-sdk/dist/firecrawl-py-0.0.5.tar.gz and /dev/null differ diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.6.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.6.tar.gz new file mode 100644 index 0000000..c1b4206 Binary files /dev/null and b/apps/python-sdk/dist/firecrawl-py-0.0.6.tar.gz differ diff --git a/apps/python-sdk/dist/firecrawl_py-0.0.5-py3-none-any.whl b/apps/python-sdk/dist/firecrawl_py-0.0.5-py3-none-any.whl deleted file mode 100644 index b32d0c8..0000000 Binary files a/apps/python-sdk/dist/firecrawl_py-0.0.5-py3-none-any.whl and /dev/null differ diff --git a/apps/python-sdk/dist/firecrawl_py-0.0.6-py3-none-any.whl b/apps/python-sdk/dist/firecrawl_py-0.0.6-py3-none-any.whl new file mode 100644 index 0000000..5aba561 Binary files /dev/null and b/apps/python-sdk/dist/firecrawl_py-0.0.6-py3-none-any.whl differ diff --git a/apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc b/apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc index 4926b80..605b3df 100644 Binary files a/apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc and b/apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc differ diff --git a/apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc b/apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc index 694553e..7c98fa3 100644 Binary files a/apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc and b/apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc differ diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index f1f5e6e..ef3eb53 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ 
b/apps/python-sdk/firecrawl/firecrawl.py @@ -32,6 +32,44 @@ class FirecrawlApp: raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') else: raise Exception(f'Failed to scrape URL. Status code: {response.status_code}') + + def search(self, query, params=None): + """Search the web through the Firecrawl API. + + Args: + query: The search query, sent as the 'query' field of the request body. + params: Optional dict of extra fields merged into the request body. + + Returns: + The 'data' field of the API response. + + Raises: + Exception: If the API reports failure or responds with a non-200 status. + """ + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {self.api_key}' + } + json_data = {'query': query} + if params: + json_data.update(params) + response = requests.post( + 'https://api.firecrawl.dev/v0/search', + headers=headers, + json=json_data + ) + if response.status_code == 200: + payload = response.json() + if payload.get('success'): + return payload['data'] + else: + raise Exception(f'Failed to search. Error: {payload.get("error", "Unknown error occurred")}') + + elif response.status_code in [402, 409, 500]: + error_message = response.json().get('error', 'Unknown error occurred') + raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}') + else: + raise Exception(f'Failed to search. 
Status code: {response.status_code}') def crawl_url(self, url, params=None, wait_until_done=True, timeout=2): headers = self._prepare_headers() diff --git a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO index ad0bd09..61589c2 100644 --- a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO +++ b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO @@ -1,7 +1,7 @@ Metadata-Version: 2.1 Name: firecrawl-py -Version: 0.0.5 +Version: 0.0.6 Summary: Python SDK for Firecrawl API -Home-page: https://github.com/mendableai/firecrawl-py +Home-page: https://github.com/mendableai/firecrawl Author: Mendable.ai Author-email: nick@mendable.ai diff --git a/apps/python-sdk/setup.py b/apps/python-sdk/setup.py index d2fc6b8..a3589e3 100644 --- a/apps/python-sdk/setup.py +++ b/apps/python-sdk/setup.py @@ -2,8 +2,8 @@ from setuptools import setup, find_packages setup( name='firecrawl-py', - version='0.0.5', - url='https://github.com/mendableai/firecrawl-py', + version='0.0.6', + url='https://github.com/mendableai/firecrawl', author='Mendable.ai', author_email='nick@mendable.ai', description='Python SDK for Firecrawl API',