Merge branch 'main' into feat/coupons
This commit is contained in:
commit 0a607b9efa

11 README.md
@@ -147,7 +147,7 @@ curl -X POST https://api.firecrawl.dev/v0/search \
   }
 ```

-Coming soon to the SDKs and Integrations.
+Coming soon to the Langchain and LLama Index integrations.

 ## Using Python SDK
@@ -180,6 +180,15 @@ url = 'https://example.com'
 scraped_data = app.scrape_url(url)
 ```

+### Search for a query
+
+Performs a web search, retrieves the top results, extracts data from each page, and returns their markdown.
+
+```python
+query = 'what is mendable?'
+search_result = app.search(query)
+```
+
 ## Contributing

 We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.
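The `search` method merges any extra `params` into the JSON body it POSTs to `/v0/search` (see the SDK diffs below). A minimal sketch of both call styles; the `limit` option name is an illustrative assumption, not taken from this commit:

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='YOUR_API_KEY')

# Basic search: returns the scraped top results for the query.
results = app.search('what is mendable?')

# Extra params are merged into the request body sent to /v0/search.
# 'limit' is a hypothetical option name, shown only for illustration.
results = app.search('what is mendable?', params={'limit': 3})
```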
@@ -373,6 +373,8 @@
         "type": "boolean"
       },
       "data": {
         "type": "array",
         "items": {
           "type": "object",
           "properties": {
             "url": {
@@ -406,6 +408,7 @@
             }
           }
         }
       }
     },
     "CrawlResponse": {
       "type": "object",
@@ -183,6 +183,8 @@ const TEST_URL = "http://127.0.0.1:3002";
       expect(response.statusCode).toBe(401);
     });

     it("should return a successful response with a valid API key", async () => {
       const response = await request(TEST_URL)
         .post("/v0/search")
@@ -51,9 +51,19 @@ export async function supaAuthenticateUser(

   if (
     token === "this_is_just_a_preview_token" &&
-    (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview)
+    (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search)
   ) {
     return { success: true, team_id: "preview" };
+    // check the origin of the request and make sure its from firecrawl.dev
+    // const origin = req.headers.origin;
+    // if (origin && origin.includes("firecrawl.dev")){
+    //   return { success: true, team_id: "preview" };
+    // }
+    // if(process.env.ENV !== "production") {
+    //   return { success: true, team_id: "preview" };
+    // }
+
+    // return { success: false, error: "Unauthorized: Invalid token", status: 401 };
   }

   const normalizedApi = parseApi(token);
@@ -2,6 +2,9 @@ import { SearchResult } from "../../src/lib/entities";
 import { google_search } from "./googlesearch";
+import { serper_search } from "./serper";
+
+

 export async function search({
   query,
   advanced = false,
@@ -1,7 +1,7 @@
 import { withAuth } from "../../lib/withAuth";
 import { supabase_service } from "../supabase";

-const FREE_CREDITS = 100;
+const FREE_CREDITS = 300;

 export async function billTeam(team_id: string, credits: number) {
   return withAuth(supaBillTeam)(team_id, credits);
@@ -61,6 +61,43 @@ export default class FirecrawlApp {
             return { success: false, error: 'Internal server error.' };
         });
     }
+    /**
+     * Searches for a query using the Firecrawl API.
+     * @param {string} query - The query to search for.
+     * @param {Params | null} params - Additional parameters for the search request.
+     * @returns {Promise<SearchResponse>} The response from the search operation.
+     */
+    search(query_1) {
+        return __awaiter(this, arguments, void 0, function* (query, params = null) {
+            const headers = {
+                'Content-Type': 'application/json',
+                'Authorization': `Bearer ${this.apiKey}`,
+            };
+            let jsonData = { query };
+            if (params) {
+                jsonData = Object.assign(Object.assign({}, jsonData), params);
+            }
+            try {
+                const response = yield axios.post('https://api.firecrawl.dev/v0/search', jsonData, { headers });
+                if (response.status === 200) {
+                    const responseData = response.data;
+                    if (responseData.success) {
+                        return responseData;
+                    }
+                    else {
+                        throw new Error(`Failed to search. Error: ${responseData.error}`);
+                    }
+                }
+                else {
+                    this.handleError(response, 'search');
+                }
+            }
+            catch (error) {
+                throw new Error(error.message);
+            }
+            return { success: false, error: 'Internal server error.' };
+        });
+    }
     /**
      * Initiates a crawl job for a URL using the Firecrawl API.
      * @param {string} url - The URL to crawl.
5 apps/js-sdk/firecrawl/jest.config.cjs (new file)

@@ -0,0 +1,5 @@
+/** @type {import('ts-jest').JestConfigWithTsJest} */
+module.exports = {
+  preset: 'ts-jest',
+  testEnvironment: 'node',
+};
3631 apps/js-sdk/firecrawl/package-lock.json (generated)

File diff suppressed because it is too large.
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "0.0.13",
+  "version": "0.0.16",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "build/index.js",
   "types": "types/index.d.ts",
@@ -8,7 +8,7 @@
   "scripts": {
     "build": "tsc",
     "publish": "npm run build && npm publish --access public",
-    "test": "echo \"Error: no test specified\" && exit 1"
+    "test": "jest src/**/*.test.ts"
   },
   "repository": {
     "type": "git",
@@ -17,17 +17,18 @@
   "author": "Mendable.ai",
   "license": "MIT",
   "dependencies": {
-    "axios": "^1.6.8",
-    "dotenv": "^16.4.5"
+    "axios": "^1.6.8"
   },
   "bugs": {
     "url": "https://github.com/mendableai/firecrawl/issues"
   },
   "homepage": "https://github.com/mendableai/firecrawl#readme",
   "devDependencies": {
+    "@jest/globals": "^29.7.0",
     "@types/axios": "^0.14.0",
     "@types/dotenv": "^8.2.0",
     "@types/node": "^20.12.7",
+    "jest": "^29.7.0",
+    "ts-jest": "^29.1.2",
     "typescript": "^5.4.5"
   },
   "keywords": [
22 apps/js-sdk/firecrawl/src/__tests__/fixtures/scrape.json (new file)

File diff suppressed because one or more lines are too long.
48 apps/js-sdk/firecrawl/src/__tests__/index.test.ts (new file)

@@ -0,0 +1,48 @@
+import { describe, test, expect, jest } from '@jest/globals';
+import axios from 'axios';
+import FirecrawlApp from '../index';
+
+import { readFile } from 'fs/promises';
+import { join } from 'path';
+
+// Mock jest and set the type
+jest.mock('axios');
+const mockedAxios = axios as jest.Mocked<typeof axios>;
+
+// Get the fixture data from the JSON file in ./fixtures
+async function loadFixture(name: string): Promise<string> {
+  return await readFile(join(__dirname, 'fixtures', `${name}.json`), 'utf-8')
+}
+
+describe('the firecrawl JS SDK', () => {
+
+  test('Should require an API key to instantiate FirecrawlApp', async () => {
+    const fn = () => {
+      new FirecrawlApp({ apiKey: undefined });
+    };
+    expect(fn).toThrow('No API key provided');
+  });
+
+  test('Should return scraped data from a /scrape API call', async () => {
+    const mockData = await loadFixture('scrape');
+    mockedAxios.post.mockResolvedValue({
+      status: 200,
+      data: JSON.parse(mockData),
+    });
+
+    const apiKey = 'YOUR_API_KEY'
+    const app = new FirecrawlApp({ apiKey });
+    // Scrape a single URL
+    const url = 'https://mendable.ai';
+    const scrapedData = await app.scrapeUrl(url);
+
+    expect(mockedAxios.post).toHaveBeenCalledTimes(1);
+    expect(mockedAxios.post).toHaveBeenCalledWith(
+      expect.stringMatching(/^https:\/\/api.firecrawl.dev/),
+      expect.objectContaining({ url }),
+      expect.objectContaining({ headers: expect.objectContaining({'Authorization': `Bearer ${apiKey}`}) }),
+    )
+    expect(scrapedData.success).toBe(true);
+    expect(scrapedData.data.metadata.title).toEqual('Mendable');
+  });
+})
@@ -1,6 +1,4 @@
 import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios';
-import dotenv from 'dotenv';
-dotenv.config();

 /**
  * Configuration interface for FirecrawlApp.
@@ -25,6 +23,14 @@ export interface ScrapeResponse {
   error?: string;
 }

+/**
+ * Response interface for searching operations.
+ */
+export interface SearchResponse {
+  success: boolean;
+  data?: any;
+  error?: string;
+}
 /**
  * Response interface for crawling operations.
  */
@@ -57,7 +63,7 @@ export default class FirecrawlApp {
    * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
    */
   constructor({ apiKey = null }: FirecrawlAppConfig) {
-    this.apiKey = apiKey || process.env.FIRECRAWL_API_KEY || '';
+    this.apiKey = apiKey || '';
     if (!this.apiKey) {
       throw new Error('No API key provided');
     }
@@ -96,6 +102,39 @@ export default class FirecrawlApp {
     return { success: false, error: 'Internal server error.' };
   }

+  /**
+   * Searches for a query using the Firecrawl API.
+   * @param {string} query - The query to search for.
+   * @param {Params | null} params - Additional parameters for the search request.
+   * @returns {Promise<SearchResponse>} The response from the search operation.
+   */
+  async search(query: string, params: Params | null = null): Promise<SearchResponse> {
+    const headers: AxiosRequestHeaders = {
+      'Content-Type': 'application/json',
+      'Authorization': `Bearer ${this.apiKey}`,
+    } as AxiosRequestHeaders;
+    let jsonData: Params = { query };
+    if (params) {
+      jsonData = { ...jsonData, ...params };
+    }
+    try {
+      const response: AxiosResponse = await axios.post('https://api.firecrawl.dev/v0/search', jsonData, { headers });
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return responseData;
+        } else {
+          throw new Error(`Failed to search. Error: ${responseData.error}`);
+        }
+      } else {
+        this.handleError(response, 'search');
+      }
+    } catch (error: any) {
+      throw new Error(error.message);
+    }
+    return { success: false, error: 'Internal server error.' };
+  }
+
  /**
   * Initiates a crawl job for a URL using the Firecrawl API.
   * @param {string} url - The URL to crawl.
15 apps/js-sdk/firecrawl/types/index.d.ts (vendored)

@@ -19,6 +19,14 @@ export interface ScrapeResponse {
   data?: any;
   error?: string;
 }
+/**
+ * Response interface for searching operations.
+ */
+export interface SearchResponse {
+  success: boolean;
+  data?: any;
+  error?: string;
+}
 /**
  * Response interface for crawling operations.
  */
@@ -55,6 +63,13 @@ export default class FirecrawlApp {
    * @returns {Promise<ScrapeResponse>} The response from the scrape operation.
    */
   scrapeUrl(url: string, params?: Params | null): Promise<ScrapeResponse>;
+  /**
+   * Searches for a query using the Firecrawl API.
+   * @param {string} query - The query to search for.
+   * @param {Params | null} params - Additional parameters for the search request.
+   * @returns {Promise<SearchResponse>} The response from the search operation.
+   */
+  search(query: string, params?: Params | null): Promise<SearchResponse>;
   /**
    * Initiates a crawl job for a URL using the Firecrawl API.
    * @param {string} url - The URL to crawl.
8 apps/js-sdk/package-lock.json (generated)

@@ -9,14 +9,14 @@
       "version": "1.0.0",
       "license": "ISC",
       "dependencies": {
-        "@mendable/firecrawl-js": "^0.0.8",
+        "@mendable/firecrawl-js": "^0.0.15",
         "axios": "^1.6.8"
       }
     },
     "node_modules/@mendable/firecrawl-js": {
-      "version": "0.0.8",
-      "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.8.tgz",
-      "integrity": "sha512-dD7eA5X6UT8CM3z7qCqHgA4YbCsdwmmlaT/L0/ozM6gGvb0PnJMoB+e51+n4lAW8mxXOvHGbq9nrgBT1wEhhhw==",
+      "version": "0.0.15",
+      "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.15.tgz",
+      "integrity": "sha512-e3iCCrLIiEh+jEDerGV9Uhdkn8ymo+sG+k3osCwPg51xW1xUdAnmlcHrcJoR43RvKXdvD/lqoxg8odUEsqyH+w==",
       "dependencies": {
         "axios": "^1.6.8",
         "dotenv": "^16.4.5"
@@ -11,7 +11,7 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
-    "@mendable/firecrawl-js": "^0.0.8",
+    "@mendable/firecrawl-js": "^0.0.15",
     "axios": "^1.6.8"
   }
 }
@@ -1,29 +1,36 @@
-from fastapi import FastAPI, Response
-from playwright.async_api import async_playwright
-import os
+from fastapi import FastAPI
+from playwright.async_api import async_playwright, Browser
 from fastapi.responses import JSONResponse
+from pydantic import BaseModel

 app = FastAPI()

-from pydantic import BaseModel
-
 class UrlModel(BaseModel):
     url: str

-@app.post("/html") # Kept as POST to accept body parameters
-async def root(body: UrlModel): # Using Pydantic model for request body
-    async with async_playwright() as p:
-        browser = await p.chromium.launch()
-
-        context = await browser.new_context()
-        page = await context.new_page()
-
-        await page.goto(body.url) # Adjusted to use the url from the request body model
-        page_content = await page.content() # Get the HTML content of the page
-
-        await context.close()
+browser: Browser = None
+
+
+@app.on_event("startup")
+async def startup_event():
+    global browser
+    playwright = await async_playwright().start()
+    browser = await playwright.chromium.launch()
+
+
+@app.on_event("shutdown")
+async def shutdown_event():
+    await browser.close()
+
+
+@app.post("/html")
+async def root(body: UrlModel):
+    context = await browser.new_context()
+    page = await context.new_page()
+    await page.goto(body.url)
+    page_content = await page.content()
+    await context.close()
     json_compatible_item_data = {"content": page_content}
     return JSONResponse(content=json_compatible_item_data)
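The refactored service above launches a single shared Chromium instance at startup and reuses it across requests, instead of starting a fresh browser per request. Calling the endpoint is unchanged; a minimal sketch, assuming the service is running locally (host, port, and launch command are assumptions, not specified in this diff):

```python
import requests

# POST a URL to /html; the service responds with {"content": "<html>..."}.
resp = requests.post(
    'http://localhost:3000/html',  # hypothetical host/port
    json={'url': 'https://example.com'},
)
print(resp.json()['content'][:200])
```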
@@ -47,6 +47,15 @@ url = 'https://example.com'
 scraped_data = app.scrape_url(url)
 ```

+### Search for a query
+
+Used to search the web, get the most relevant results, scrape each page, and return the markdown.
+
+```python
+query = 'what is mendable?'
+search_result = app.search(query)
+```
+
 ### Crawling a Website

 To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
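To make those `crawl_url` options concrete, here is a minimal sketch. The `crawlerOptions.excludes` key is the one used in the tutorial at the end of this commit; `limit` is an assumed name for the page-count option:

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='YOUR_API_KEY')

# Crawl a site while skipping blog pages. 'excludes' appears in the tutorial
# below; 'limit' is a hypothetical option for capping the number of pages.
crawl_result = app.crawl_url(
    'https://example.com',
    params={'crawlerOptions': {'excludes': ['blog/*'], 'limit': 10}},
)
for page in crawl_result:
    print(page['markdown'][:80])
```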
@@ -33,6 +33,32 @@ class FirecrawlApp:
         else:
             raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')

+    def search(self, query, params=None):
+        headers = {
+            'Content-Type': 'application/json',
+            'Authorization': f'Bearer {self.api_key}'
+        }
+        json_data = {'query': query}
+        if params:
+            json_data.update(params)
+        response = requests.post(
+            'https://api.firecrawl.dev/v0/search',
+            headers=headers,
+            json=json_data
+        )
+        if response.status_code == 200:
+            response = response.json()
+            if response['success'] == True:
+                return response['data']
+            else:
+                raise Exception(f'Failed to search. Error: {response["error"]}')
+
+        elif response.status_code in [402, 409, 500]:
+            error_message = response.json().get('error', 'Unknown error occurred')
+            raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
+        else:
+            raise Exception(f'Failed to search. Status code: {response.status_code}')
+
     def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
         headers = self._prepare_headers()
         json_data = {'url': url}
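Because the new Python `search` raises on failures and returns `response['data']` directly, a caller might wrap it like this (a sketch based only on the code above; the keys on each result item are assumptions that mirror the README examples):

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='YOUR_API_KEY')

try:
    results = app.search('what is mendable?')
    for item in results:
        # Each item is a scraped result page; 'markdown' mirrors the README.
        print(item.get('markdown', '')[:80])
except Exception as err:
    # Raised for 402/409/500 and any other non-200 status.
    print(f'Search failed: {err}')
```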
BIN apps/python-sdk/dist/firecrawl-py-0.0.5.tar.gz (vendored): Binary file not shown.
BIN apps/python-sdk/dist/firecrawl-py-0.0.6.tar.gz (vendored, new file): Binary file not shown.
BIN apps/python-sdk/dist/firecrawl_py-0.0.6-py3-none-any.whl (vendored, new file): Binary file not shown.
@@ -33,6 +33,32 @@ class FirecrawlApp:
         else:
             raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')

+    def search(self, query, params=None):
+        headers = {
+            'Content-Type': 'application/json',
+            'Authorization': f'Bearer {self.api_key}'
+        }
+        json_data = {'query': query}
+        if params:
+            json_data.update(params)
+        response = requests.post(
+            'https://api.firecrawl.dev/v0/search',
+            headers=headers,
+            json=json_data
+        )
+        if response.status_code == 200:
+            response = response.json()
+            if response['success'] == True:
+                return response['data']
+            else:
+                raise Exception(f'Failed to search. Error: {response["error"]}')
+
+        elif response.status_code in [402, 409, 500]:
+            error_message = response.json().get('error', 'Unknown error occurred')
+            raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
+        else:
+            raise Exception(f'Failed to search. Status code: {response.status_code}')
+
     def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
         headers = self._prepare_headers()
         json_data = {'url': url}
@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: firecrawl-py
-Version: 0.0.5
+Version: 0.0.6
 Summary: Python SDK for Firecrawl API
-Home-page: https://github.com/mendableai/firecrawl-py
+Home-page: https://github.com/mendableai/firecrawl
 Author: Mendable.ai
 Author-email: nick@mendable.ai
@@ -2,8 +2,8 @@ from setuptools import setup, find_packages

 setup(
     name='firecrawl-py',
-    version='0.0.5',
-    url='https://github.com/mendableai/firecrawl-py',
+    version='0.0.6',
+    url='https://github.com/mendableai/firecrawl',
     author='Mendable.ai',
     author_email='nick@mendable.ai',
     description='Python SDK for Firecrawl API',
78 tutorials/contradiction-testing-using-llms.mdx (new file)

@@ -0,0 +1,78 @@
+# Build an agent that checks your website for contradictions
+
+Learn how to use Firecrawl and Claude to scrape your website's data and look for contradictions and inconsistencies in a few lines of code. When you are shipping fast, data is bound to get stale; with Firecrawl and LLMs you can make sure your public web data is always consistent! We will be using Opus's huge 200k context window and Firecrawl's parallelization, making this process accurate and fast.
+
+## Setup
+
+Install our Python dependencies, including anthropic and firecrawl-py.
+
+```bash
+pip install firecrawl-py anthropic
+```
+
+## Getting your Claude and Firecrawl API Keys
+
+To use Claude Opus and Firecrawl, you will need to get your API keys. You can get your Anthropic API key from [here](https://www.anthropic.com/) and your Firecrawl API key from [here](https://firecrawl.dev).
+
+## Load website with Firecrawl
+
+To get all the data from our website pages and put it into an easy-to-read format for the LLM, we will use [Firecrawl](https://firecrawl.dev). It handles bypassing JS-blocked websites, extracting the main content, and outputting it in an LLM-readable format for increased accuracy.
+
+Here is how we will scrape a website URL using firecrawl-py:
+
+```python
+from firecrawl import FirecrawlApp
+
+app = FirecrawlApp(api_key="YOUR-KEY")
+
+crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*','usecases/*']}})
+
+print(crawl_result)
+```
+
+With all of the web data we want scraped and in a clean format, we can move on to the next step.
+
+## Combination and Generation
+
+Now that we have the website data, let's pair up every page and run every combination through Opus for analysis.
+
+```python
+from itertools import combinations
+
+page_combinations = []
+
+for first_page, second_page in combinations(crawl_result, 2):
+    combined_string = "First Page:\n" + first_page['markdown'] + "\n\nSecond Page:\n" + second_page['markdown']
+    page_combinations.append(combined_string)
+
+import anthropic
+
+client = anthropic.Anthropic(
+    # defaults to os.environ.get("ANTHROPIC_API_KEY")
+    api_key="YOUR-KEY",
+)
+
+final_output = []
+
+for page_combination in page_combinations:
+
+    prompt = "Here are two pages from a company's website; your job is to find any contradictions or differences in opinion between the two pages, which could be caused by outdated information or other reasons. If you find any contradictions, list them out and provide a brief explanation of why they are contradictory or differing. Make sure the explanation is specific and concise. It is okay if you don't find any contradictions; just say 'No contradictions found' and nothing else. Here are the pages: " + page_combination
+
+    message = client.messages.create(
+        model="claude-3-opus-20240229",
+        max_tokens=1000,
+        temperature=0.0,
+        system="You are an assistant that helps find contradictions or differences in opinion between pages in a company website and knowledge base. This could be caused by outdated information in the knowledge base.",
+        messages=[
+            {"role": "user", "content": prompt}
+        ]
+    )
+    final_output.append(message.content)
+```
+
+## That's about it!
+
+You have now built an agent that looks at your website and spots any inconsistencies it might have.
+
+If you have any questions or need help, feel free to reach out to us at [Firecrawl](https://firecrawl.dev).
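A possible follow-up, not part of the tutorial: in the Anthropic SDK, `message.content` comes back as a list of content blocks, so you can read the text of the first block and filter out the page pairs where Claude found nothing:

```python
# final_output holds one Claude reply per page pair; each reply's content is
# a list of blocks, so read the text of the first block.
for verdict in final_output:
    text = verdict[0].text
    if 'No contradictions found' not in text:
        print(text)
        print('-' * 40)
```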