0

Merge branch 'main' into mdp/dotenv_jest

This commit is contained in:
Nicolas 2024-04-25 17:59:20 -07:00 committed by GitHub
commit f368e94cee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 230 additions and 54 deletions

View File

@ -180,6 +180,15 @@ url = 'https://example.com'
scraped_data = app.scrape_url(url)
```
### Search for a query
Performs a web search, retrieves the top results, extracts data from each page, and returns their markdown.
```python
query = 'what is mendable?'
search_result = app.search(query)
```
## Contributing
We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.

View File

@ -373,33 +373,36 @@
"type": "boolean"
},
"data": {
"type": "object",
"properties": {
"url": {
"type": "string"
},
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
"type": "array",
"items": {
"type": "object",
"properties": {
"url": {
"type": "string"
},
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
}
}
}
}

View File

@ -2,6 +2,9 @@ import { SearchResult } from "../../src/lib/entities";
import { google_search } from "./googlesearch";
import { serper_search } from "./serper";
export async function search({
query,
advanced = false,

View File

@ -61,6 +61,43 @@ export default class FirecrawlApp {
return { success: false, error: 'Internal server error.' };
});
}
/**
* Searches for a query using the Firecrawl API.
* @param {string} query - The query to search for.
* @param {Params | null} params - Additional parameters for the search request.
* @returns {Promise<SearchResponse>} The response from the search operation.
*/
search(query_1) {
return __awaiter(this, arguments, void 0, function* (query, params = null) {
const headers = {
'Content-Type': 'application/json',
'Authorization': `Bearer ${this.apiKey}`,
};
let jsonData = { query };
if (params) {
jsonData = Object.assign(Object.assign({}, jsonData), params);
}
try {
const response = yield axios.post('https://api.firecrawl.dev/v0/search', jsonData, { headers });
if (response.status === 200) {
const responseData = response.data;
if (responseData.success) {
return responseData;
}
else {
throw new Error(`Failed to search. Error: ${responseData.error}`);
}
}
else {
this.handleError(response, 'search');
}
}
catch (error) {
throw new Error(error.message);
}
return { success: false, error: 'Internal server error.' };
});
}
/**
* Initiates a crawl job for a URL using the Firecrawl API.
* @param {string} url - The URL to crawl.

View File

@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "0.0.14",
"version": "0.0.15",
"description": "JavaScript SDK for Firecrawl API",
"main": "build/index.js",
"types": "types/index.d.ts",

View File

@ -23,6 +23,14 @@ export interface ScrapeResponse {
error?: string;
}
/**
 * Response interface for searching operations.
 */
export interface SearchResponse {
/** Success flag; the client's `search` method returns the response body only when this is truthy. */
success: boolean;
/** Payload of the search. Untyped here. NOTE(review): presumably the list of scraped results; confirm against the API spec. */
data?: any;
/** Error message; the client's fallback return sets it to 'Internal server error.'. */
error?: string;
}
/**
* Response interface for crawling operations.
*/
@ -94,6 +102,39 @@ export default class FirecrawlApp {
return { success: false, error: 'Internal server error.' };
}
/**
 * Searches for a query using the Firecrawl API.
 *
 * Posts `{ query, ...params }` as JSON to the v0 search endpoint, authenticated
 * with this instance's API key as a Bearer token.
 *
 * @param query - The query to search for.
 * @param params - Additional parameters for the search request; its keys are spread over the base payload.
 * @returns The response body when the API answers with status 200 and a truthy `success` flag.
 * @throws {Error} When the request fails or the API reports `success` as false.
 */
async search(query: string, params: Params | null = null): Promise<SearchResponse> {
  const headers: AxiosRequestHeaders = {
    'Content-Type': 'application/json',
    'Authorization': `Bearer ${this.apiKey}`,
  } as AxiosRequestHeaders;
  // Spread order lets caller-supplied params override the base payload, as before.
  const jsonData: Params = params ? { query, ...params } : { query };
  try {
    const response: AxiosResponse = await axios.post('https://api.firecrawl.dev/v0/search', jsonData, { headers });
    if (response.status === 200) {
      const responseData = response.data;
      if (responseData.success) {
        return responseData;
      }
      throw new Error(`Failed to search. Error: ${responseData.error}`);
    } else {
      this.handleError(response, 'search');
    }
  } catch (error: unknown) {
    // Narrow before reading `.message`: a rejection is not guaranteed to be an Error,
    // and an unnarrowed read would produce an Error with an empty message.
    throw new Error(error instanceof Error ? error.message : String(error));
  }
  // Reached only when handleError returns instead of throwing.
  return { success: false, error: 'Internal server error.' };
}
/**
* Initiates a crawl job for a URL using the Firecrawl API.
* @param {string} url - The URL to crawl.

View File

@ -19,6 +19,14 @@ export interface ScrapeResponse {
data?: any;
error?: string;
}
/**
 * Response interface for searching operations.
 */
export interface SearchResponse {
/** Success flag reported for the search operation. */
success: boolean;
/** Payload of the search. Untyped here. NOTE(review): presumably the list of scraped results; confirm against the API spec. */
data?: any;
/** Error message describing the failure, when one is provided. */
error?: string;
}
/**
* Response interface for crawling operations.
*/
@ -55,6 +63,13 @@ export default class FirecrawlApp {
* @returns {Promise<ScrapeResponse>} The response from the scrape operation.
*/
scrapeUrl(url: string, params?: Params | null): Promise<ScrapeResponse>;
/**
 * Searches for a query using the Firecrawl API.
 * @param {string} query - The query to search for.
 * @param {Params | null} params - Additional parameters for the search request. Optional: may be omitted or passed as null.
 * @returns {Promise<SearchResponse>} The response from the search operation.
 */
search(query: string, params?: Params | null): Promise<SearchResponse>;
/**
* Initiates a crawl job for a URL using the Firecrawl API.
* @param {string} url - The URL to crawl.

View File

@ -9,14 +9,14 @@
"version": "1.0.0",
"license": "ISC",
"dependencies": {
"@mendable/firecrawl-js": "^0.0.8",
"@mendable/firecrawl-js": "^0.0.15",
"axios": "^1.6.8"
}
},
"node_modules/@mendable/firecrawl-js": {
"version": "0.0.8",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.8.tgz",
"integrity": "sha512-dD7eA5X6UT8CM3z7qCqHgA4YbCsdwmmlaT/L0/ozM6gGvb0PnJMoB+e51+n4lAW8mxXOvHGbq9nrgBT1wEhhhw==",
"version": "0.0.15",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.15.tgz",
"integrity": "sha512-e3iCCrLIiEh+jEDerGV9Uhdkn8ymo+sG+k3osCwPg51xW1xUdAnmlcHrcJoR43RvKXdvD/lqoxg8odUEsqyH+w==",
"dependencies": {
"axios": "^1.6.8",
"dotenv": "^16.4.5"

View File

@ -11,7 +11,7 @@
"author": "",
"license": "ISC",
"dependencies": {
"@mendable/firecrawl-js": "^0.0.8",
"@mendable/firecrawl-js": "^0.0.15",
"axios": "^1.6.8"
}
}

View File

@ -1,29 +1,36 @@
from fastapi import FastAPI, Response
from playwright.async_api import async_playwright
import os
from fastapi import FastAPI
from playwright.async_api import async_playwright, Browser
from fastapi.responses import JSONResponse
from pydantic import BaseModel
app = FastAPI()
from pydantic import BaseModel
# Request body model for the POST /html endpoint: carries the URL of the page to load.
class UrlModel(BaseModel):
url: str
@app.post("/html") # Kept as POST to accept body parameters
async def root(body: UrlModel): # Using Pydantic model for request body
async with async_playwright() as p:
browser = await p.chromium.launch()
context = await browser.new_context()
page = await context.new_page()
browser: Browser = None
await page.goto(body.url) # Adjusted to use the url from the request body model
page_content = await page.content() # Get the HTML content of the page
await context.close()
await browser.close()
@app.on_event("startup")
async def startup_event():
    """Start Playwright and launch the Chromium browser stored in the module-level ``browser``."""
    global browser
    driver = await async_playwright().start()
    browser = await driver.chromium.launch()
json_compatible_item_data = {"content": page_content}
return JSONResponse(content=json_compatible_item_data)
@app.on_event("shutdown")
async def shutdown_event():
    """Close the module-level browser when the application shuts down."""
    # ``browser`` is initialised to None and only assigned during startup, so
    # guard against a shutdown that follows a startup which never completed.
    if browser is not None:
        await browser.close()
@app.post("/html")
async def root(body: UrlModel):
    """Load ``body.url`` in a fresh browser context and return the page HTML.

    Args:
        body: Request body carrying the URL to load.

    Returns:
        A JSONResponse of the form ``{"content": <page HTML>}``.
    """
    context = await browser.new_context()
    try:
        page = await context.new_page()
        await page.goto(body.url)
        page_content = await page.content()
    finally:
        # The browser is shared across requests, so a context left open after a
        # failed navigation would accumulate; always close it.
        await context.close()

    json_compatible_item_data = {"content": page_content}
    return JSONResponse(content=json_compatible_item_data)

View File

@ -47,6 +47,15 @@ url = 'https://example.com'
scraped_data = app.scrape_url(url)
```
### Search for a query
Used to search the web, get the most relevant results, scrape each page, and return the markdown.
```python
query = 'what is mendable?'
search_result = app.search(query)
```
### Crawling a Website
To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.

View File

@ -32,6 +32,32 @@ class FirecrawlApp:
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
else:
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
def search(self, query, params=None):
    """Run a search through the Firecrawl API and return the response's ``data`` field.

    Args:
        query: The search query string.
        params: Optional dict of extra fields merged into the JSON request body.

    Returns:
        The ``data`` value of the API's JSON response.

    Raises:
        Exception: If the API reports an unsuccessful search or answers with a
            non-200 status code.
    """
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {self.api_key}'
    }
    json_data = {'query': query}
    if params:
        json_data.update(params)
    response = requests.post(
        'https://api.firecrawl.dev/v0/search',
        headers=headers,
        json=json_data
    )
    if response.status_code == 200:
        # Keep the parsed body in its own name instead of rebinding ``response``.
        payload = response.json()
        if payload.get('success'):
            return payload['data']
        # A failure body without an "error" field must still surface as the
        # documented failure, not as a KeyError.
        error_message = payload.get('error', 'Unknown error occurred')
        raise Exception(f'Failed to search. Error: {error_message}')
    elif response.status_code in [402, 409, 500]:
        error_message = response.json().get('error', 'Unknown error occurred')
        raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
    else:
        raise Exception(f'Failed to search. Status code: {response.status_code}')
def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
headers = self._prepare_headers()

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -32,6 +32,32 @@ class FirecrawlApp:
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
else:
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
def search(self, query, params=None):
    """Run a search through the Firecrawl API and return the response's ``data`` field.

    Args:
        query: The search query string.
        params: Optional dict of extra fields merged into the JSON request body.

    Returns:
        The ``data`` value of the API's JSON response.

    Raises:
        Exception: If the API reports an unsuccessful search or answers with a
            non-200 status code.
    """
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {self.api_key}'
    }
    json_data = {'query': query}
    if params:
        json_data.update(params)
    response = requests.post(
        'https://api.firecrawl.dev/v0/search',
        headers=headers,
        json=json_data
    )
    if response.status_code == 200:
        # Keep the parsed body in its own name instead of rebinding ``response``.
        payload = response.json()
        if payload.get('success'):
            return payload['data']
        # A failure body without an "error" field must still surface as the
        # documented failure, not as a KeyError.
        error_message = payload.get('error', 'Unknown error occurred')
        raise Exception(f'Failed to search. Error: {error_message}')
    elif response.status_code in [402, 409, 500]:
        error_message = response.json().get('error', 'Unknown error occurred')
        raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
    else:
        raise Exception(f'Failed to search. Status code: {response.status_code}')
def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
headers = self._prepare_headers()

View File

@ -1,7 +1,7 @@
Metadata-Version: 2.1
Name: firecrawl-py
Version: 0.0.5
Version: 0.0.6
Summary: Python SDK for Firecrawl API
Home-page: https://github.com/mendableai/firecrawl-py
Home-page: https://github.com/mendableai/firecrawl
Author: Mendable.ai
Author-email: nick@mendable.ai

View File

@ -2,8 +2,8 @@ from setuptools import setup, find_packages
setup(
name='firecrawl-py',
version='0.0.5',
url='https://github.com/mendableai/firecrawl-py',
version='0.0.6',
url='https://github.com/mendableai/firecrawl',
author='Mendable.ai',
author_email='nick@mendable.ai',
description='Python SDK for Firecrawl API',