0

Merge branch 'main' into feat/coupons

This commit is contained in:
Nicolas 2024-04-26 14:23:35 -07:00
commit 0a607b9efa
29 changed files with 4021 additions and 82 deletions

View File

@ -147,7 +147,7 @@ curl -X POST https://api.firecrawl.dev/v0/search \
}
```
Coming soon to the SDKs and Integrations.
Coming soon to the LangChain and LlamaIndex integrations.
## Using Python SDK
@ -180,6 +180,15 @@ url = 'https://example.com'
scraped_data = app.scrape_url(url)
```
### Search for a query
Performs a web search, retrieves the top results, extracts data from each page, and returns their markdown.
```python
query = 'what is mendable?'
search_result = app.search(query)
```
## Contributing
We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.

View File

@ -373,33 +373,36 @@
"type": "boolean"
},
"data": {
"type": "object",
"properties": {
"url": {
"type": "string"
},
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
"type": "array",
"items": {
"type": "object",
"properties": {
"url": {
"type": "string"
},
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
}
}
}
}

View File

@ -183,6 +183,8 @@ const TEST_URL = "http://127.0.0.1:3002";
expect(response.statusCode).toBe(401);
});
it("should return a successful response with a valid API key", async () => {
const response = await request(TEST_URL)
.post("/v0/search")

View File

@ -51,9 +51,19 @@ export async function supaAuthenticateUser(
if (
token === "this_is_just_a_preview_token" &&
(mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview)
(mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search)
) {
return { success: true, team_id: "preview" };
// check the origin of the request and make sure its from firecrawl.dev
// const origin = req.headers.origin;
// if (origin && origin.includes("firecrawl.dev")){
// return { success: true, team_id: "preview" };
// }
// if(process.env.ENV !== "production") {
// return { success: true, team_id: "preview" };
// }
// return { success: false, error: "Unauthorized: Invalid token", status: 401 };
}
const normalizedApi = parseApi(token);

View File

@ -2,6 +2,9 @@ import { SearchResult } from "../../src/lib/entities";
import { google_search } from "./googlesearch";
import { serper_search } from "./serper";
export async function search({
query,
advanced = false,

View File

@ -1,7 +1,7 @@
import { withAuth } from "../../lib/withAuth";
import { supabase_service } from "../supabase";
const FREE_CREDITS = 100;
const FREE_CREDITS = 300;
export async function billTeam(team_id: string, credits: number) {
return withAuth(supaBillTeam)(team_id, credits);

View File

@ -61,6 +61,43 @@ export default class FirecrawlApp {
return { success: false, error: 'Internal server error.' };
});
}
/**
* Searches for a query using the Firecrawl API.
* @param {string} query - The query to search for.
* @param {Params | null} params - Additional parameters for the search request.
* @returns {Promise<SearchResponse>} The response from the search operation.
*/
search(query_1) {
return __awaiter(this, arguments, void 0, function* (query, params = null) {
const headers = {
'Content-Type': 'application/json',
'Authorization': `Bearer ${this.apiKey}`,
};
let jsonData = { query };
if (params) {
jsonData = Object.assign(Object.assign({}, jsonData), params);
}
try {
const response = yield axios.post('https://api.firecrawl.dev/v0/search', jsonData, { headers });
if (response.status === 200) {
const responseData = response.data;
if (responseData.success) {
return responseData;
}
else {
throw new Error(`Failed to search. Error: ${responseData.error}`);
}
}
else {
this.handleError(response, 'search');
}
}
catch (error) {
throw new Error(error.message);
}
return { success: false, error: 'Internal server error.' };
});
}
/**
* Initiates a crawl job for a URL using the Firecrawl API.
* @param {string} url - The URL to crawl.

View File

@ -0,0 +1,5 @@
/** @type {import('ts-jest').JestConfigWithTsJest} */
module.exports = {
preset: 'ts-jest',
testEnvironment: 'node',
};

File diff suppressed because it is too large Load Diff

View File

@ -1,14 +1,14 @@
{
"name": "@mendable/firecrawl-js",
"version": "0.0.13",
"version": "0.0.16",
"description": "JavaScript SDK for Firecrawl API",
"main": "build/index.js",
"types": "types/index.d.ts",
"type": "module",
"scripts": {
"build": "tsc",
"publish":"npm run build && npm publish --access public",
"test": "echo \"Error: no test specified\" && exit 1"
"publish": "npm run build && npm publish --access public",
"test": "jest src/**/*.test.ts"
},
"repository": {
"type": "git",
@ -17,17 +17,18 @@
"author": "Mendable.ai",
"license": "MIT",
"dependencies": {
"axios": "^1.6.8",
"dotenv": "^16.4.5"
"axios": "^1.6.8"
},
"bugs": {
"url": "https://github.com/mendableai/firecrawl/issues"
},
"homepage": "https://github.com/mendableai/firecrawl#readme",
"devDependencies": {
"@jest/globals": "^29.7.0",
"@types/axios": "^0.14.0",
"@types/dotenv": "^8.2.0",
"@types/node": "^20.12.7",
"jest": "^29.7.0",
"ts-jest": "^29.1.2",
"typescript": "^5.4.5"
},
"keywords": [

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,48 @@
import { describe, test, expect, jest } from '@jest/globals';
import axios from 'axios';
import FirecrawlApp from '../index';
import { readFile } from 'fs/promises';
import { join } from 'path';
// Replace the axios module with Jest's automatic mock so the tests perform no
// real HTTP, then view the import through jest.Mocked to reach the mock helpers
// (e.g. mockedAxios.post.mockResolvedValue).
jest.mock('axios');
const mockedAxios = axios as jest.Mocked<typeof axios>;
/**
 * Reads `fixtures/<name>.json`, located next to this test file, and resolves
 * with its raw UTF-8 text. Parsing is left to the caller.
 */
async function loadFixture(name: string): Promise<string> {
  const fixturePath = join(__dirname, 'fixtures', name + '.json');
  const contents = await readFile(fixturePath, 'utf-8');
  return contents;
}
// Unit tests for the SDK entry point; axios is mocked, so nothing hits the network.
describe('the firecrawl JS SDK', () => {
  // Construction is synchronous, so this test needs no async wrapper.
  test('Should require an API key to instantiate FirecrawlApp', () => {
    const instantiateWithoutKey = () => {
      new FirecrawlApp({ apiKey: undefined });
    };
    expect(instantiateWithoutKey).toThrow('No API key provided');
  });

  test('Should return scraped data from a /scrape API call', async () => {
    // Start from a clean mock so the call-count assertion below does not
    // depend on whatever other tests did with axios.post.
    mockedAxios.post.mockReset();

    const mockData = await loadFixture('scrape');
    mockedAxios.post.mockResolvedValue({
      status: 200,
      data: JSON.parse(mockData),
    });

    const apiKey = 'YOUR_API_KEY';
    const app = new FirecrawlApp({ apiKey });

    // Scrape a single URL
    const url = 'https://mendable.ai';
    const scrapedData = await app.scrapeUrl(url);

    expect(mockedAxios.post).toHaveBeenCalledTimes(1);
    expect(mockedAxios.post).toHaveBeenCalledWith(
      // Dots are escaped so only the literal host matches.
      expect.stringMatching(/^https:\/\/api\.firecrawl\.dev/),
      expect.objectContaining({ url }),
      expect.objectContaining({ headers: expect.objectContaining({ 'Authorization': `Bearer ${apiKey}` }) }),
    );
    expect(scrapedData.success).toBe(true);
    expect(scrapedData.data.metadata.title).toEqual('Mendable');
  });
});

View File

@ -1,6 +1,4 @@
import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios';
import dotenv from 'dotenv';
dotenv.config();
/**
* Configuration interface for FirecrawlApp.
@ -25,6 +23,14 @@ export interface ScrapeResponse {
error?: string;
}
/**
 * Response interface for searching operations.
 */
export interface SearchResponse {
/** Whether the API reported the search as successful. */
success: boolean;
/** Payload returned by the search endpoint; its shape is not enforced here (typed as `any`). */
data?: any;
/** Error description, when one is provided. */
error?: string;
}
/**
* Response interface for crawling operations.
*/
@ -57,7 +63,7 @@ export default class FirecrawlApp {
* @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
*/
constructor({ apiKey = null }: FirecrawlAppConfig) {
this.apiKey = apiKey || process.env.FIRECRAWL_API_KEY || '';
this.apiKey = apiKey || '';
if (!this.apiKey) {
throw new Error('No API key provided');
}
@ -96,6 +102,39 @@ export default class FirecrawlApp {
return { success: false, error: 'Internal server error.' };
}
/**
 * Searches for a query using the Firecrawl API.
 *
 * Posts `{ query, ...params }` to the `/v0/search` endpoint, sending the
 * instance's API key as a bearer token.
 *
 * @param {string} query - The query to search for.
 * @param {Params | null} params - Additional parameters for the search request; merged over `{ query }`.
 * @returns {Promise<SearchResponse>} The response body when the API reports `success`.
 * @throws {Error} When the request fails or the API reports `success: false`.
 */
async search(query: string, params: Params | null = null): Promise<SearchResponse> {
  const headers: AxiosRequestHeaders = {
    'Content-Type': 'application/json',
    'Authorization': `Bearer ${this.apiKey}`,
  } as AxiosRequestHeaders;
  const jsonData: Params = params ? { query, ...params } : { query };
  try {
    const response: AxiosResponse = await axios.post('https://api.firecrawl.dev/v0/search', jsonData, { headers });
    if (response.status === 200) {
      const responseData = response.data;
      if (responseData.success) {
        return responseData;
      }
      throw new Error(`Failed to search. Error: ${responseData.error}`);
    }
    // NOTE(review): axios rejects non-2xx statuses by default, so this call
    // presumably only ever sees other 2xx codes. Confirm against the client config.
    this.handleError(response, 'search');
  } catch (error: unknown) {
    // Narrow before reading `.message`, so a non-Error rejection still yields
    // a readable message instead of an empty one.
    throw new Error(error instanceof Error ? error.message : String(error));
  }
  return { success: false, error: 'Internal server error.' };
}
/**
* Initiates a crawl job for a URL using the Firecrawl API.
* @param {string} url - The URL to crawl.
@ -226,4 +265,4 @@ export default class FirecrawlApp {
throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`);
}
}
}
}

View File

@ -19,6 +19,14 @@ export interface ScrapeResponse {
data?: any;
error?: string;
}
/**
 * Response interface for searching operations.
 */
export interface SearchResponse {
/** Whether the API reported the search as successful. */
success: boolean;
/** Payload returned by the search endpoint; its shape is not enforced here (typed as `any`). */
data?: any;
/** Error description, when one is provided. */
error?: string;
}
/**
* Response interface for crawling operations.
*/
@ -55,6 +63,13 @@ export default class FirecrawlApp {
* @returns {Promise<ScrapeResponse>} The response from the scrape operation.
*/
scrapeUrl(url: string, params?: Params | null): Promise<ScrapeResponse>;
/**
 * Searches for a query using the Firecrawl API.
 * @param {string} query - The query to search for.
 * @param {Params | null} params - Additional parameters for the search request; may be omitted or null.
 * @returns {Promise<SearchResponse>} The response from the search operation.
 */
search(query: string, params?: Params | null): Promise<SearchResponse>;
/**
* Initiates a crawl job for a URL using the Firecrawl API.
* @param {string} url - The URL to crawl.

View File

@ -9,14 +9,14 @@
"version": "1.0.0",
"license": "ISC",
"dependencies": {
"@mendable/firecrawl-js": "^0.0.8",
"@mendable/firecrawl-js": "^0.0.15",
"axios": "^1.6.8"
}
},
"node_modules/@mendable/firecrawl-js": {
"version": "0.0.8",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.8.tgz",
"integrity": "sha512-dD7eA5X6UT8CM3z7qCqHgA4YbCsdwmmlaT/L0/ozM6gGvb0PnJMoB+e51+n4lAW8mxXOvHGbq9nrgBT1wEhhhw==",
"version": "0.0.15",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.15.tgz",
"integrity": "sha512-e3iCCrLIiEh+jEDerGV9Uhdkn8ymo+sG+k3osCwPg51xW1xUdAnmlcHrcJoR43RvKXdvD/lqoxg8odUEsqyH+w==",
"dependencies": {
"axios": "^1.6.8",
"dotenv": "^16.4.5"

View File

@ -11,7 +11,7 @@
"author": "",
"license": "ISC",
"dependencies": {
"@mendable/firecrawl-js": "^0.0.8",
"@mendable/firecrawl-js": "^0.0.15",
"axios": "^1.6.8"
}
}

View File

@ -1,29 +1,36 @@
from fastapi import FastAPI, Response
from playwright.async_api import async_playwright
import os
from fastapi import FastAPI
from playwright.async_api import async_playwright, Browser
from fastapi.responses import JSONResponse
from pydantic import BaseModel
app = FastAPI()
from pydantic import BaseModel
class UrlModel(BaseModel):
url: str
@app.post("/html") # Kept as POST to accept body parameters
async def root(body: UrlModel): # Using Pydantic model for request body
async with async_playwright() as p:
browser = await p.chromium.launch()
context = await browser.new_context()
page = await context.new_page()
browser: Browser = None
await page.goto(body.url) # Adjusted to use the url from the request body model
page_content = await page.content() # Get the HTML content of the page
await context.close()
await browser.close()
@app.on_event("startup")
async def startup_event():
    """Start Playwright and launch the Chromium instance kept in the module-level ``browser``."""
    global browser
    driver = await async_playwright().start()
    chromium = driver.chromium
    browser = await chromium.launch()
json_compatible_item_data = {"content": page_content}
return JSONResponse(content=json_compatible_item_data)
@app.on_event("shutdown")
async def shutdown_event():
# Close the module-level browser when the application shuts down.
await browser.close()
@app.post("/html")
async def root(body: UrlModel):
    """Load ``body.url`` in a fresh browser context and return the page HTML.

    Args:
        body: Request body carrying the ``url`` to load.

    Returns:
        JSONResponse: ``{"content": <html>}`` holding the loaded page's HTML.
    """
    context = await browser.new_context()
    try:
        page = await context.new_page()
        await page.goto(body.url)
        page_content = await page.content()
    finally:
        # The browser is shared across requests, so a context left open after
        # a failed navigation would otherwise stay alive with the process.
        await context.close()
    json_compatible_item_data = {"content": page_content}
    return JSONResponse(content=json_compatible_item_data)

View File

@ -47,6 +47,15 @@ url = 'https://example.com'
scraped_data = app.scrape_url(url)
```
### Search for a query
Used to search the web, get the most relevant results, scrape each page, and return the markdown.
```python
query = 'what is mendable?'
search_result = app.search(query)
```
### Crawling a Website
To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.

View File

@ -32,6 +32,32 @@ class FirecrawlApp:
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
else:
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
def search(self, query, params=None):
    """Search the web through the Firecrawl ``/v0/search`` endpoint.

    Args:
        query: Search query text.
        params: Optional dict of extra request fields merged over
            ``{'query': query}``.

    Returns:
        The ``data`` field of the response body when the API reports success.

    Raises:
        Exception: If the API reports failure or responds with a non-200
            status code.
    """
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {self.api_key}'
    }
    json_data = {'query': query}
    if params:
        json_data.update(params)
    response = requests.post(
        'https://api.firecrawl.dev/v0/search',
        headers=headers,
        json=json_data
    )
    if response.status_code == 200:
        # Keep the parsed body under its own name rather than rebinding `response`.
        body = response.json()
        if body.get('success'):
            return body['data']
        # A failure body is not guaranteed to carry 'error'; fall back to a
        # generic message instead of surfacing a KeyError.
        error_message = body.get('error', 'Unknown error occurred')
        raise Exception(f'Failed to search. Error: {error_message}')
    elif response.status_code in [402, 409, 500]:
        error_message = response.json().get('error', 'Unknown error occurred')
        raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
    else:
        raise Exception(f'Failed to search. Status code: {response.status_code}')
def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
headers = self._prepare_headers()

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -32,6 +32,32 @@ class FirecrawlApp:
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
else:
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
def search(self, query, params=None):
    """Search the web through the Firecrawl ``/v0/search`` endpoint.

    Args:
        query: Search query text.
        params: Optional dict of extra request fields merged over
            ``{'query': query}``.

    Returns:
        The ``data`` field of the response body when the API reports success.

    Raises:
        Exception: If the API reports failure or responds with a non-200
            status code.
    """
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {self.api_key}'
    }
    json_data = {'query': query}
    if params:
        json_data.update(params)
    response = requests.post(
        'https://api.firecrawl.dev/v0/search',
        headers=headers,
        json=json_data
    )
    if response.status_code == 200:
        # Keep the parsed body under its own name rather than rebinding `response`.
        body = response.json()
        if body.get('success'):
            return body['data']
        # A failure body is not guaranteed to carry 'error'; fall back to a
        # generic message instead of surfacing a KeyError.
        error_message = body.get('error', 'Unknown error occurred')
        raise Exception(f'Failed to search. Error: {error_message}')
    elif response.status_code in [402, 409, 500]:
        error_message = response.json().get('error', 'Unknown error occurred')
        raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
    else:
        raise Exception(f'Failed to search. Status code: {response.status_code}')
def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
headers = self._prepare_headers()

View File

@ -1,7 +1,7 @@
Metadata-Version: 2.1
Name: firecrawl-py
Version: 0.0.5
Version: 0.0.6
Summary: Python SDK for Firecrawl API
Home-page: https://github.com/mendableai/firecrawl-py
Home-page: https://github.com/mendableai/firecrawl
Author: Mendable.ai
Author-email: nick@mendable.ai

View File

@ -2,8 +2,8 @@ from setuptools import setup, find_packages
setup(
name='firecrawl-py',
version='0.0.5',
url='https://github.com/mendableai/firecrawl-py',
version='0.0.6',
url='https://github.com/mendableai/firecrawl',
author='Mendable.ai',
author_email='nick@mendable.ai',
description='Python SDK for Firecrawl API',

View File

@ -0,0 +1,78 @@
# Build an agent that checks your website for contradictions
Learn how to use Firecrawl and Claude to scrape your website's data and look for contradictions and inconsistencies in a few lines of code. When you are shipping fast, data is bound to get stale; with Firecrawl and LLMs you can make sure your public web data is always consistent! We will be using Opus's huge 200k context window and Firecrawl's parallelization, making this process accurate and fast.
## Setup
Install our python dependencies, including anthropic and firecrawl-py.
```bash
pip install firecrawl-py anthropic
```
## Getting your Claude and Firecrawl API Keys
To use Claude Opus and Firecrawl, you will need to get your API keys. You can get your Anthropic API key from [here](https://www.anthropic.com/) and your Firecrawl API key from [here](https://firecrawl.dev).
## Load website with Firecrawl
To get all the data from our website and put it into an easy-to-read format for the LLM, we will use [Firecrawl](https://firecrawl.dev). It handles bypassing JS-blocked websites, extracting the main content, and outputting it in an LLM-readable format for increased accuracy.
Here is how we will scrape a website url using Firecrawl-py
```python
from firecrawl import FirecrawlApp
app = FirecrawlApp(api_key="YOUR-KEY")
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*','usecases/*']}})
print(crawl_result)
```
With all of the web data we want scraped and in a clean format, we can move onto the next step.
## Combination and Generation
Now that we have the website data, let's pair up every page and run every combination through Opus for analysis.
```python
from itertools import combinations

# Pair up every two crawled pages so each pair can be compared on its own.
page_combinations = []
for first_page, second_page in combinations(crawl_result, 2):
    combined_string = "First Page:\n" + first_page['markdown'] + "\n\nSecond Page:\n" + second_page['markdown']
    page_combinations.append(combined_string)

import anthropic

client = anthropic.Anthropic(
    # defaults to os.environ.get("ANTHROPIC_API_KEY")
    api_key="YOUR-KEY",
)

# One model response per page pair, in the same order as page_combinations.
final_output = []
for page_combination in page_combinations:
    # page_combination is already a single string, so it is appended directly;
    # calling "\n\n".join() on a string would insert a blank line between
    # every character of the two pages.
    prompt = "Here are two pages from a companies website, your job is to find any contradictions or differences in opinion between the two pages, this could be caused by outdated information or other. If you find any contradictions, list them out and provide a brief explanation of why they are contradictory or differing. Make sure the explanation is specific and concise. It is okay if you don't find any contradictions, just say 'No contradictions found' and nothing else. Here are the pages: " + page_combination

    message = client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=1000,
        temperature=0.0,
        system="You are an assistant that helps find contradictions or differences in opinion between pages in a company website and knowledge base. This could be caused by outdated information in the knowledge base.",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    final_output.append(message.content)
```
## That's about it!
You have now built an agent that looks at your website and spots any inconsistencies it might have.
If you have any questions or need help, feel free to reach out to us at [Firecrawl](https://firecrawl.dev).