Merge branch 'main' into feat/coupons
This commit is contained in:
commit 0a607b9efa

11 README.md
@@ -147,7 +147,7 @@ curl -X POST https://api.firecrawl.dev/v0/search \
   }
 ```

-Coming soon to the SDKs and Integrations.
+Coming soon to the Langchain and LLama Index integrations.

 ## Using Python SDK
@@ -180,6 +180,15 @@ url = 'https://example.com'
 scraped_data = app.scrape_url(url)
 ```

+### Search for a query
+
+Performs a web search, retrieves the top results, extracts data from each page, and returns their markdown.
+
+```python
+query = 'what is mendable?'
+search_result = app.search(query)
+```
+
 ## Contributing

 We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.
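The `search` method merges any extra `params` into the JSON body it POSTs to `/v0/search` (see the SDK diffs below). A minimal sketch of both call styles; the `limit` option name is an illustrative assumption, not taken from this commit:

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='YOUR_API_KEY')

# Basic search: returns the scraped top results for the query.
results = app.search('what is mendable?')

# Extra params are merged into the request body sent to /v0/search.
# 'limit' is a hypothetical option name, shown only for illustration.
results = app.search('what is mendable?', params={'limit': 3})
```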
@@ -373,6 +373,8 @@
         "type": "boolean"
       },
       "data": {
         "type": "array",
         "items": {
           "type": "object",
           "properties": {
             "url": {
@@ -406,6 +408,7 @@
             }
           }
         }
       }
     },
     "CrawlResponse": {
       "type": "object",
@@ -183,6 +183,8 @@ const TEST_URL = "http://127.0.0.1:3002";
       expect(response.statusCode).toBe(401);
     });

     it("should return a successful response with a valid API key", async () => {
       const response = await request(TEST_URL)
         .post("/v0/search")
@@ -51,9 +51,19 @@ export async function supaAuthenticateUser(

   if (
     token === "this_is_just_a_preview_token" &&
-    (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview)
+    (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search)
   ) {
     return { success: true, team_id: "preview" };
+    // check the origin of the request and make sure its from firecrawl.dev
+    // const origin = req.headers.origin;
+    // if (origin && origin.includes("firecrawl.dev")){
+    //   return { success: true, team_id: "preview" };
+    // }
+    // if(process.env.ENV !== "production") {
+    //   return { success: true, team_id: "preview" };
+    // }
+
+    // return { success: false, error: "Unauthorized: Invalid token", status: 401 };
   }

   const normalizedApi = parseApi(token);
@@ -2,6 +2,9 @@ import { SearchResult } from "../../src/lib/entities";
 import { google_search } from "./googlesearch";
+import { serper_search } from "./serper";
+
+

 export async function search({
   query,
   advanced = false,
@@ -1,7 +1,7 @@
 import { withAuth } from "../../lib/withAuth";
 import { supabase_service } from "../supabase";

-const FREE_CREDITS = 100;
+const FREE_CREDITS = 300;

 export async function billTeam(team_id: string, credits: number) {
   return withAuth(supaBillTeam)(team_id, credits);
@@ -61,6 +61,43 @@ export default class FirecrawlApp {
             return { success: false, error: 'Internal server error.' };
         });
     }
+    /**
+     * Searches for a query using the Firecrawl API.
+     * @param {string} query - The query to search for.
+     * @param {Params | null} params - Additional parameters for the search request.
+     * @returns {Promise<SearchResponse>} The response from the search operation.
+     */
+    search(query_1) {
+        return __awaiter(this, arguments, void 0, function* (query, params = null) {
+            const headers = {
+                'Content-Type': 'application/json',
+                'Authorization': `Bearer ${this.apiKey}`,
+            };
+            let jsonData = { query };
+            if (params) {
+                jsonData = Object.assign(Object.assign({}, jsonData), params);
+            }
+            try {
+                const response = yield axios.post('https://api.firecrawl.dev/v0/search', jsonData, { headers });
+                if (response.status === 200) {
+                    const responseData = response.data;
+                    if (responseData.success) {
+                        return responseData;
+                    }
+                    else {
+                        throw new Error(`Failed to search. Error: ${responseData.error}`);
+                    }
+                }
+                else {
+                    this.handleError(response, 'search');
+                }
+            }
+            catch (error) {
+                throw new Error(error.message);
+            }
+            return { success: false, error: 'Internal server error.' };
+        });
+    }
     /**
      * Initiates a crawl job for a URL using the Firecrawl API.
      * @param {string} url - The URL to crawl.
5 apps/js-sdk/firecrawl/jest.config.cjs (new file)

@@ -0,0 +1,5 @@
+/** @type {import('ts-jest').JestConfigWithTsJest} */
+module.exports = {
+  preset: 'ts-jest',
+  testEnvironment: 'node',
+};
3631 apps/js-sdk/firecrawl/package-lock.json (generated)

File diff suppressed because it is too large.
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "0.0.13",
+  "version": "0.0.16",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "build/index.js",
   "types": "types/index.d.ts",
@@ -8,7 +8,7 @@
   "scripts": {
     "build": "tsc",
     "publish": "npm run build && npm publish --access public",
-    "test": "echo \"Error: no test specified\" && exit 1"
+    "test": "jest src/**/*.test.ts"
   },
   "repository": {
     "type": "git",
@@ -17,17 +17,18 @@
   "author": "Mendable.ai",
   "license": "MIT",
   "dependencies": {
-    "axios": "^1.6.8",
-    "dotenv": "^16.4.5"
+    "axios": "^1.6.8"
   },
   "bugs": {
     "url": "https://github.com/mendableai/firecrawl/issues"
   },
   "homepage": "https://github.com/mendableai/firecrawl#readme",
   "devDependencies": {
+    "@jest/globals": "^29.7.0",
     "@types/axios": "^0.14.0",
     "@types/dotenv": "^8.2.0",
     "@types/node": "^20.12.7",
+    "jest": "^29.7.0",
+    "ts-jest": "^29.1.2",
     "typescript": "^5.4.5"
   },
   "keywords": [
22 apps/js-sdk/firecrawl/src/__tests__/fixtures/scrape.json (new file)

File diff suppressed because one or more lines are too long.
48 apps/js-sdk/firecrawl/src/__tests__/index.test.ts (new file)

@@ -0,0 +1,48 @@
+import { describe, test, expect, jest } from '@jest/globals';
+import axios from 'axios';
+import FirecrawlApp from '../index';
+
+import { readFile } from 'fs/promises';
+import { join } from 'path';
+
+// Mock jest and set the type
+jest.mock('axios');
+const mockedAxios = axios as jest.Mocked<typeof axios>;
+
+// Get the fixture data from the JSON file in ./fixtures
+async function loadFixture(name: string): Promise<string> {
+  return await readFile(join(__dirname, 'fixtures', `${name}.json`), 'utf-8')
+}
+
+describe('the firecrawl JS SDK', () => {
+
+  test('Should require an API key to instantiate FirecrawlApp', async () => {
+    const fn = () => {
+      new FirecrawlApp({ apiKey: undefined });
+    };
+    expect(fn).toThrow('No API key provided');
+  });
+
+  test('Should return scraped data from a /scrape API call', async () => {
+    const mockData = await loadFixture('scrape');
+    mockedAxios.post.mockResolvedValue({
+      status: 200,
+      data: JSON.parse(mockData),
+    });
+
+    const apiKey = 'YOUR_API_KEY'
+    const app = new FirecrawlApp({ apiKey });
+    // Scrape a single URL
+    const url = 'https://mendable.ai';
+    const scrapedData = await app.scrapeUrl(url);
+
+    expect(mockedAxios.post).toHaveBeenCalledTimes(1);
+    expect(mockedAxios.post).toHaveBeenCalledWith(
+      expect.stringMatching(/^https:\/\/api.firecrawl.dev/),
+      expect.objectContaining({ url }),
+      expect.objectContaining({ headers: expect.objectContaining({'Authorization': `Bearer ${apiKey}`}) }),
+    )
+    expect(scrapedData.success).toBe(true);
+    expect(scrapedData.data.metadata.title).toEqual('Mendable');
+  });
+})
@@ -1,6 +1,4 @@
 import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios';
-import dotenv from 'dotenv';
-dotenv.config();

 /**
  * Configuration interface for FirecrawlApp.
@@ -25,6 +23,14 @@ export interface ScrapeResponse {
   error?: string;
 }

+/**
+ * Response interface for searching operations.
+ */
+export interface SearchResponse {
+  success: boolean;
+  data?: any;
+  error?: string;
+}
 /**
  * Response interface for crawling operations.
  */
@@ -57,7 +63,7 @@ export default class FirecrawlApp {
    * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
    */
   constructor({ apiKey = null }: FirecrawlAppConfig) {
-    this.apiKey = apiKey || process.env.FIRECRAWL_API_KEY || '';
+    this.apiKey = apiKey || '';
     if (!this.apiKey) {
       throw new Error('No API key provided');
     }
@@ -96,6 +102,39 @@ export default class FirecrawlApp {
     return { success: false, error: 'Internal server error.' };
   }

+  /**
+   * Searches for a query using the Firecrawl API.
+   * @param {string} query - The query to search for.
+   * @param {Params | null} params - Additional parameters for the search request.
+   * @returns {Promise<SearchResponse>} The response from the search operation.
+   */
+  async search(query: string, params: Params | null = null): Promise<SearchResponse> {
+    const headers: AxiosRequestHeaders = {
+      'Content-Type': 'application/json',
+      'Authorization': `Bearer ${this.apiKey}`,
+    } as AxiosRequestHeaders;
+    let jsonData: Params = { query };
+    if (params) {
+      jsonData = { ...jsonData, ...params };
+    }
+    try {
+      const response: AxiosResponse = await axios.post('https://api.firecrawl.dev/v0/search', jsonData, { headers });
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return responseData;
+        } else {
+          throw new Error(`Failed to search. Error: ${responseData.error}`);
+        }
+      } else {
+        this.handleError(response, 'search');
+      }
+    } catch (error: any) {
+      throw new Error(error.message);
+    }
+    return { success: false, error: 'Internal server error.' };
+  }
+
  /**
   * Initiates a crawl job for a URL using the Firecrawl API.
   * @param {string} url - The URL to crawl.
15 apps/js-sdk/firecrawl/types/index.d.ts (vendored)

@@ -19,6 +19,14 @@ export interface ScrapeResponse {
   data?: any;
   error?: string;
 }
+/**
+ * Response interface for searching operations.
+ */
+export interface SearchResponse {
+  success: boolean;
+  data?: any;
+  error?: string;
+}
 /**
  * Response interface for crawling operations.
  */
@@ -55,6 +63,13 @@ export default class FirecrawlApp {
    * @returns {Promise<ScrapeResponse>} The response from the scrape operation.
    */
   scrapeUrl(url: string, params?: Params | null): Promise<ScrapeResponse>;
+  /**
+   * Searches for a query using the Firecrawl API.
+   * @param {string} query - The query to search for.
+   * @param {Params | null} params - Additional parameters for the search request.
+   * @returns {Promise<SearchResponse>} The response from the search operation.
+   */
+  search(query: string, params?: Params | null): Promise<SearchResponse>;
   /**
    * Initiates a crawl job for a URL using the Firecrawl API.
    * @param {string} url - The URL to crawl.
8 apps/js-sdk/package-lock.json (generated)

@@ -9,14 +9,14 @@
       "version": "1.0.0",
       "license": "ISC",
       "dependencies": {
-        "@mendable/firecrawl-js": "^0.0.8",
+        "@mendable/firecrawl-js": "^0.0.15",
         "axios": "^1.6.8"
       }
     },
     "node_modules/@mendable/firecrawl-js": {
-      "version": "0.0.8",
-      "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.8.tgz",
-      "integrity": "sha512-dD7eA5X6UT8CM3z7qCqHgA4YbCsdwmmlaT/L0/ozM6gGvb0PnJMoB+e51+n4lAW8mxXOvHGbq9nrgBT1wEhhhw==",
+      "version": "0.0.15",
+      "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.15.tgz",
+      "integrity": "sha512-e3iCCrLIiEh+jEDerGV9Uhdkn8ymo+sG+k3osCwPg51xW1xUdAnmlcHrcJoR43RvKXdvD/lqoxg8odUEsqyH+w==",
       "dependencies": {
         "axios": "^1.6.8",
         "dotenv": "^16.4.5"
@@ -11,7 +11,7 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
-    "@mendable/firecrawl-js": "^0.0.8",
+    "@mendable/firecrawl-js": "^0.0.15",
     "axios": "^1.6.8"
   }
 }
@@ -1,29 +1,36 @@
-from fastapi import FastAPI, Response
-from playwright.async_api import async_playwright
-import os
+from fastapi import FastAPI
+from playwright.async_api import async_playwright, Browser
 from fastapi.responses import JSONResponse
+from pydantic import BaseModel

 app = FastAPI()

-from pydantic import BaseModel
-
 class UrlModel(BaseModel):
     url: str

-@app.post("/html") # Kept as POST to accept body parameters
-async def root(body: UrlModel): # Using Pydantic model for request body
-    async with async_playwright() as p:
-        browser = await p.chromium.launch()
-
-        context = await browser.new_context()
-        page = await context.new_page()
-
-        await page.goto(body.url) # Adjusted to use the url from the request body model
-        page_content = await page.content() # Get the HTML content of the page
-
-        await context.close()
+browser: Browser = None
+
+
+@app.on_event("startup")
+async def startup_event():
+    global browser
+    playwright = await async_playwright().start()
+    browser = await playwright.chromium.launch()
+
+
+@app.on_event("shutdown")
+async def shutdown_event():
+    await browser.close()
+
+
+@app.post("/html")
+async def root(body: UrlModel):
+    context = await browser.new_context()
+    page = await context.new_page()
+    await page.goto(body.url)
+    page_content = await page.content()
+    await context.close()
     json_compatible_item_data = {"content": page_content}
     return JSONResponse(content=json_compatible_item_data)
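The refactored service above launches a single shared Chromium instance at startup and reuses it across requests, instead of starting a fresh browser per request. Calling the endpoint is unchanged; a minimal sketch, assuming the service is running locally (host, port, and launch command are assumptions, not specified in this diff):

```python
import requests

# POST a URL to /html; the service responds with {"content": "<html>..."}.
resp = requests.post(
    'http://localhost:3000/html',  # hypothetical host/port
    json={'url': 'https://example.com'},
)
print(resp.json()['content'][:200])
```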
@@ -47,6 +47,15 @@ url = 'https://example.com'
 scraped_data = app.scrape_url(url)
 ```

+### Search for a query
+
+Used to search the web, get the most relevant results, scrape each page, and return the markdown.
+
+```python
+query = 'what is mendable?'
+search_result = app.search(query)
+```
+
 ### Crawling a Website

 To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
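To make those `crawl_url` options concrete, here is a minimal sketch. The `crawlerOptions.excludes` key is the one used in the tutorial at the end of this commit; `limit` is an assumed name for the page-count option:

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='YOUR_API_KEY')

# Crawl a site while skipping blog pages. 'excludes' appears in the tutorial
# below; 'limit' is a hypothetical option for capping the number of pages.
crawl_result = app.crawl_url(
    'https://example.com',
    params={'crawlerOptions': {'excludes': ['blog/*'], 'limit': 10}},
)
for page in crawl_result:
    print(page['markdown'][:80])
```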
@@ -33,6 +33,32 @@ class FirecrawlApp:
         else:
             raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')

+    def search(self, query, params=None):
+        headers = {
+            'Content-Type': 'application/json',
+            'Authorization': f'Bearer {self.api_key}'
+        }
+        json_data = {'query': query}
+        if params:
+            json_data.update(params)
+        response = requests.post(
+            'https://api.firecrawl.dev/v0/search',
+            headers=headers,
+            json=json_data
+        )
+        if response.status_code == 200:
+            response = response.json()
+            if response['success'] == True:
+                return response['data']
+            else:
+                raise Exception(f'Failed to search. Error: {response["error"]}')
+
+        elif response.status_code in [402, 409, 500]:
+            error_message = response.json().get('error', 'Unknown error occurred')
+            raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
+        else:
+            raise Exception(f'Failed to search. Status code: {response.status_code}')
+
     def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
         headers = self._prepare_headers()
         json_data = {'url': url}
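Because the new Python `search` raises on failures and returns `response['data']` directly, a caller might wrap it like this (a sketch based only on the code above; the keys on each result item are assumptions that mirror the README examples):

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='YOUR_API_KEY')

try:
    results = app.search('what is mendable?')
    for item in results:
        # Each item is a scraped result page; 'markdown' mirrors the README.
        print(item.get('markdown', '')[:80])
except Exception as err:
    # Raised for 402/409/500 and any other non-200 status.
    print(f'Search failed: {err}')
```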
BIN apps/python-sdk/dist/firecrawl-py-0.0.5.tar.gz (vendored): Binary file not shown.
BIN apps/python-sdk/dist/firecrawl-py-0.0.6.tar.gz (vendored, new file): Binary file not shown.
BIN apps/python-sdk/dist/firecrawl_py-0.0.6-py3-none-any.whl (vendored, new file): Binary file not shown.
@@ -33,6 +33,32 @@ class FirecrawlApp:
         else:
             raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')

+    def search(self, query, params=None):
+        headers = {
+            'Content-Type': 'application/json',
+            'Authorization': f'Bearer {self.api_key}'
+        }
+        json_data = {'query': query}
+        if params:
+            json_data.update(params)
+        response = requests.post(
+            'https://api.firecrawl.dev/v0/search',
+            headers=headers,
+            json=json_data
+        )
+        if response.status_code == 200:
+            response = response.json()
+            if response['success'] == True:
+                return response['data']
+            else:
+                raise Exception(f'Failed to search. Error: {response["error"]}')
+
+        elif response.status_code in [402, 409, 500]:
+            error_message = response.json().get('error', 'Unknown error occurred')
+            raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
+        else:
+            raise Exception(f'Failed to search. Status code: {response.status_code}')
+
     def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
         headers = self._prepare_headers()
         json_data = {'url': url}
@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: firecrawl-py
-Version: 0.0.5
+Version: 0.0.6
 Summary: Python SDK for Firecrawl API
-Home-page: https://github.com/mendableai/firecrawl-py
+Home-page: https://github.com/mendableai/firecrawl
 Author: Mendable.ai
 Author-email: nick@mendable.ai
@@ -2,8 +2,8 @@ from setuptools import setup, find_packages

 setup(
     name='firecrawl-py',
-    version='0.0.5',
-    url='https://github.com/mendableai/firecrawl-py',
+    version='0.0.6',
+    url='https://github.com/mendableai/firecrawl',
     author='Mendable.ai',
     author_email='nick@mendable.ai',
     description='Python SDK for Firecrawl API',
78 tutorials/contradiction-testing-using-llms.mdx (new file)

@@ -0,0 +1,78 @@
+# Build an agent that checks your website for contradictions
+
+Learn how to use Firecrawl and Claude to scrape your website's data and look for contradictions and inconsistencies in a few lines of code. When you are shipping fast, data is bound to get stale; with Firecrawl and LLMs you can make sure your public web data is always consistent! We will be using Opus's huge 200k context window and Firecrawl's parallelization, making this process accurate and fast.
+
+## Setup
+
+Install our Python dependencies, including anthropic and firecrawl-py.
+
+```bash
+pip install firecrawl-py anthropic
+```
+
+## Getting your Claude and Firecrawl API Keys
+
+To use Claude Opus and Firecrawl, you will need to get your API keys. You can get your Anthropic API key from [here](https://www.anthropic.com/) and your Firecrawl API key from [here](https://firecrawl.dev).
+
+## Load website with Firecrawl
+
+To get all the data from our website pages and put it into an easy-to-read format for the LLM, we will use [Firecrawl](https://firecrawl.dev). It handles bypassing JS-blocked websites, extracting the main content, and outputting it in an LLM-readable format for increased accuracy.
+
+Here is how we will scrape a website URL using firecrawl-py:
+
+```python
+from firecrawl import FirecrawlApp
+
+app = FirecrawlApp(api_key="YOUR-KEY")
+
+crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*','usecases/*']}})
+
+print(crawl_result)
+```
+
+With all of the web data we want scraped and in a clean format, we can move on to the next step.
+
+## Combination and Generation
+
+Now that we have the website data, let's pair up every page and run every combination through Opus for analysis.
+
+```python
+from itertools import combinations
+
+page_combinations = []
+
+for first_page, second_page in combinations(crawl_result, 2):
+    combined_string = "First Page:\n" + first_page['markdown'] + "\n\nSecond Page:\n" + second_page['markdown']
+    page_combinations.append(combined_string)
+
+import anthropic
+
+client = anthropic.Anthropic(
+    # defaults to os.environ.get("ANTHROPIC_API_KEY")
+    api_key="YOUR-KEY",
+)
+
+final_output = []
+
+for page_combination in page_combinations:
+
+    prompt = "Here are two pages from a company's website; your job is to find any contradictions or differences in opinion between the two pages, which could be caused by outdated information or other reasons. If you find any contradictions, list them out and provide a brief explanation of why they are contradictory or differing. Make sure the explanation is specific and concise. It is okay if you don't find any contradictions; just say 'No contradictions found' and nothing else. Here are the pages: " + page_combination
+
+    message = client.messages.create(
+        model="claude-3-opus-20240229",
+        max_tokens=1000,
+        temperature=0.0,
+        system="You are an assistant that helps find contradictions or differences in opinion between pages in a company website and knowledge base. This could be caused by outdated information in the knowledge base.",
+        messages=[
+            {"role": "user", "content": prompt}
+        ]
+    )
+    final_output.append(message.content)
+```
+
+## That's about it!
+
+You have now built an agent that looks at your website and spots any inconsistencies it might have.
+
+If you have any questions or need help, feel free to reach out to us at [Firecrawl](https://firecrawl.dev).
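A possible follow-up, not part of the tutorial: in the Anthropic SDK, `message.content` comes back as a list of content blocks, so you can read the text of the first block and filter out the page pairs where Claude found nothing:

```python
# final_output holds one Claude reply per page pair; each reply's content is
# a list of blocks, so read the text of the first block.
for verdict in final_output:
    text = verdict[0].text
    if 'No contradictions found' not in text:
        print(text)
        print('-' * 40)
```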