Merge pull request #56 from mendableai/nsc/mvp-search
[Feat:mvp] Search Endpoint => serp api + firecrawl => 🔥 🔍
This commit is contained in: commit dda77dce05
@@ -22,4 +22,5 @@ BULL_AUTH_KEY= #
 LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
 PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
 LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
+SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
 SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
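For reference, a minimal sketch (not part of the commit): SERPER_API_KEY is optional, and its presence is what switches the search backend, based on the provider selection in apps/api/src/search/index.ts further down. The "provider" variable here is illustrative only.

import dotenv from "dotenv";

dotenv.config();

// If a Serper key is configured, search goes through the Serper API;
// otherwise the API falls back to scraping Google results directly.
const provider = process.env.SERPER_API_KEY ? "serper" : "google";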
@@ -132,6 +132,33 @@ describe("E2E Tests for API Routes with No Authentication", () => {
     });
   });

+  describe("POST /v0/search", () => {
+    it("should not require authorization", async () => {
+      const response = await request(TEST_URL).post("/v0/search");
+      expect(response.statusCode).not.toBe(401);
+    });
+
+    it("should return no error response with an invalid API key", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/search")
+        .set("Authorization", `Bearer invalid-api-key`)
+        .set("Content-Type", "application/json")
+        .send({ query: "test" });
+      expect(response.statusCode).not.toBe(401);
+    });
+
+    it("should return a successful response without a valid API key", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/search")
+        .set("Content-Type", "application/json")
+        .send({ query: "test" });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("success");
+      expect(response.body.success).toBe(true);
+      expect(response.body).toHaveProperty("data");
+    }, 20000);
+  });
+
   describe("GET /v0/crawl/status/:jobId", () => {
     it("should not require authorization", async () => {
       const response = await request(TEST_URL).get("/v0/crawl/status/123");
@@ -168,6 +168,34 @@ const TEST_URL = "http://127.0.0.1:3002";
     });
   });

+  describe("POST /v0/search", () => {
+    it("should require authorization", async () => {
+      const response = await request(TEST_URL).post("/v0/search");
+      expect(response.statusCode).toBe(401);
+    });
+
+    it("should return an error response with an invalid API key", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/search")
+        .set("Authorization", `Bearer invalid-api-key`)
+        .set("Content-Type", "application/json")
+        .send({ query: "test" });
+      expect(response.statusCode).toBe(401);
+    });
+
+    it("should return a successful response with a valid API key", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/search")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ query: "test" });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("success");
+      expect(response.body.success).toBe(true);
+      expect(response.body).toHaveProperty("data");
+    }, 20000);
+  });
+
   describe("GET /v0/crawl/status/:jobId", () => {
     it("should require authorization", async () => {
       const response = await request(TEST_URL).get("/v0/crawl/status/123");
apps/api/src/controllers/search.ts (new file, 156 lines)
@@ -0,0 +1,156 @@
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job";
import { PageOptions, SearchOptions } from "../lib/entities";
import { search } from "../search";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";

export async function searchHelper(
  req: Request,
  team_id: string,
  crawlerOptions: any,
  pageOptions: PageOptions,
  searchOptions: SearchOptions
): Promise<{
  success: boolean;
  error?: string;
  data?: any;
  returnCode: number;
}> {
  const query = req.body.query;
  const advanced = false;
  if (!query) {
    return { success: false, error: "Query is required", returnCode: 400 };
  }

  const tbs = searchOptions.tbs ?? null;
  const filter = searchOptions.filter ?? null;

  let res = await search({ query: query, advanced: advanced, num_results: searchOptions.limit ?? 7, tbs: tbs, filter: filter });

  let justSearch = pageOptions.fetchPageContent === false;

  if (justSearch) {
    return { success: true, data: res, returnCode: 200 };
  }

  res = res.filter((r) => !isUrlBlocked(r));

  if (res.length === 0) {
    return { success: true, error: "No search results found", returnCode: 200 };
  }

  // filter out social media links

  const a = new WebScraperDataProvider();
  await a.setOptions({
    mode: "single_urls",
    urls: res.map((r) => r),
    crawlerOptions: {
      ...crawlerOptions,
    },
    pageOptions: {
      ...pageOptions,
      onlyMainContent: pageOptions?.onlyMainContent ?? true,
      fetchPageContent: pageOptions?.fetchPageContent ?? true,
      fallback: false,
    },
  });

  const docs = await a.getDocuments(true);
  if (docs.length === 0) {
    return { success: true, error: "No search results found", returnCode: 200 };
  }

  // make sure doc.content is not empty
  const filteredDocs = docs.filter(
    (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
  );

  if (filteredDocs.length === 0) {
    return { success: true, error: "No page found", returnCode: 200 };
  }

  const { success, credit_usage } = await billTeam(
    team_id,
    filteredDocs.length
  );
  if (!success) {
    return {
      success: false,
      error:
        "Failed to bill team. Insufficient credits or subscription not found.",
      returnCode: 402,
    };
  }

  return {
    success: true,
    data: filteredDocs,
    returnCode: 200,
  };
}

export async function searchController(req: Request, res: Response) {
  try {
    // make sure to authenticate user first, Bearer <token>
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Search
    );
    if (!success) {
      return res.status(status).json({ error });
    }
    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = req.body.pageOptions ?? {
      onlyMainContent: true,
      fetchPageContent: true,
      fallback: false,
    };
    const origin = req.body.origin ?? "api";

    const searchOptions = req.body.searchOptions ?? { limit: 7 };

    try {
      const { success: creditsCheckSuccess, message: creditsCheckMessage } =
        await checkTeamCredits(team_id, 1);
      if (!creditsCheckSuccess) {
        return res.status(402).json({ error: "Insufficient credits" });
      }
    } catch (error) {
      console.error(error);
      return res.status(500).json({ error: "Internal server error" });
    }
    const startTime = new Date().getTime();
    const result = await searchHelper(
      req,
      team_id,
      crawlerOptions,
      pageOptions,
      searchOptions
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
    logJob({
      success: result.success,
      message: result.error,
      num_docs: 1,
      docs: [result.data],
      time_taken: timeTakenInSeconds,
      team_id: team_id,
      mode: "search",
      url: req.body.url,
      crawlerOptions: crawlerOptions,
      pageOptions: pageOptions,
      origin: origin,
    });
    return res.status(result.returnCode).json(result);
  } catch (error) {
    console.error(error);
    return res.status(500).json({ error: error.message });
  }
}
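For reference, a minimal client sketch (not part of the commit) of calling the new endpoint, mirroring the supertest requests in the e2e suites above. It assumes a runtime with a global fetch (Node 18+), the local test URL http://127.0.0.1:3002, and the TEST_API_KEY environment variable used by the tests; exampleSearch is an illustrative name.

// Sketch: POST /v0/search with a Bearer API key, as exercised by the e2e tests.
async function exampleSearch() {
  const res = await fetch("http://127.0.0.1:3002/v0/search", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
      "Content-Type": "application/json",
    },
    // query is required (400 otherwise); crawlerOptions, pageOptions,
    // searchOptions and origin are optional and default as in searchController.
    body: JSON.stringify({ query: "test", searchOptions: { limit: 7 } }),
  });
  return res.json(); // { success: true, data: [...], returnCode: 200 } on success
}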
@@ -11,7 +11,17 @@ export interface Progress {

 export type PageOptions = {
   onlyMainContent?: boolean;
+  fallback?: boolean;
+  fetchPageContent?: boolean;
+
 };
+
+export type SearchOptions = {
+  limit?: number;
+  tbs?: string;
+  filter?: string;
+};
+
 export type WebScraperOptions = {
   urls: string[];
   mode: "single_urls" | "sitemap" | "crawl";
@@ -30,6 +40,7 @@ export type WebScraperOptions = {

 export class Document {
   id?: string;
+  url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
   createdAt?: Date;
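Worth noting (a sketch, not part of the commit): the new PageOptions.fetchPageContent flag drives a "search only" path in searchHelper above — when it is false, the endpoint returns the provider's result links without scraping each page. The field values below are illustrative.

// Sketch: "search only" request body — searchHelper short-circuits and returns
// raw search results when pageOptions.fetchPageContent is false.
const searchOnlyBody = {
  query: "web scraping api",
  pageOptions: { fetchPageContent: false },
  searchOptions: { limit: 5 }, // tbs and filter may also be set here
};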
@@ -4,6 +4,7 @@ import { crawlStatusController } from "../../src/controllers/crawl-status";
 import { scrapeController } from "../../src/controllers/scrape";
 import { crawlPreviewController } from "../../src/controllers/crawlPreview";
 import { crawlJobStatusPreviewController } from "../../src/controllers/status";
+import { searchController } from "../../src/controllers/search";

 export const v0Router = express.Router();

@@ -12,3 +13,7 @@ v0Router.post("/v0/crawl", crawlController);
 v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController);
 v0Router.get("/v0/crawl/status/:jobId", crawlStatusController);
 v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController);
+
+// Search routes
+v0Router.post("/v0/search", searchController);
+
@@ -4,9 +4,7 @@ import { extractMetadata } from "./utils/metadata";
 import dotenv from "dotenv";
 import { Document, PageOptions } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
-import { parseTablesToMarkdown } from "./utils/parseTable";
 import { excludeNonMainTags } from "./utils/excludeTags";
-// import puppeteer from "puppeteer";

 dotenv.config();

@@ -25,13 +23,14 @@ export async function scrapWithCustomFirecrawl(

 export async function scrapWithScrapingBee(
   url: string,
-  wait_browser: string = "domcontentloaded"
+  wait_browser: string = "domcontentloaded",
+  timeout: number = 15000
 ): Promise<string> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
     const response = await client.get({
       url: url,
-      params: { timeout: 15000, wait_browser: wait_browser },
+      params: { timeout: timeout, wait_browser: wait_browser },
       headers: { "ScrapingService-Request": "TRUE" },
     });

@@ -108,11 +107,11 @@ export async function scrapSingleUrl(
   let text = "";
   switch (method) {
     case "firecrawl-scraper":
-      text = await scrapWithCustomFirecrawl(url);
+      text = await scrapWithCustomFirecrawl(url,);
       break;
     case "scrapingBee":
       if (process.env.SCRAPING_BEE_API_KEY) {
-        text = await scrapWithScrapingBee(url);
+        text = await scrapWithScrapingBee(url, "domcontentloaded", pageOptions.fallback === false ? 7000 : 15000);
       }
       break;
     case "playwright":
@@ -155,6 +154,17 @@ export async function scrapSingleUrl(
   // }

   let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
+  // Basically means that it is using /search endpoint
+  if (pageOptions.fallback === false) {
+    const soup = cheerio.load(html);
+    const metadata = extractMetadata(soup, urlToScrap);
+    return {
+      url: urlToScrap,
+      content: text,
+      markdown: text,
+      metadata: { ...metadata, sourceURL: urlToScrap },
+    } as Document;
+  }
   if (!text || text.length < 100) {
     console.log("Falling back to playwright");
     [text, html] = await attemptScraping(urlToScrap, "playwright");
@@ -1,4 +1,3 @@
-// import * as cheerio from 'cheerio';
 import { CheerioAPI } from "cheerio";
 interface Metadata {
   title?: string;
@@ -8,6 +7,14 @@ interface Metadata {
   robots?: string;
   ogTitle?: string;
   ogDescription?: string;
+  ogUrl?: string;
+  ogImage?: string;
+  ogAudio?: string;
+  ogDeterminer?: string;
+  ogLocale?: string;
+  ogLocaleAlternate?: string[];
+  ogSiteName?: string;
+  ogVideo?: string;
   dctermsCreated?: string;
   dcDateCreated?: string;
   dcDate?: string;
@@ -17,7 +24,6 @@ interface Metadata {
   dctermsSubject?: string;
   dcSubject?: string;
   dcDescription?: string;
-  ogImage?: string;
   dctermsKeywords?: string;
   modifiedTime?: string;
   publishedTime?: string;
@@ -33,6 +39,14 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
   let robots: string | null = null;
   let ogTitle: string | null = null;
   let ogDescription: string | null = null;
+  let ogUrl: string | null = null;
+  let ogImage: string | null = null;
+  let ogAudio: string | null = null;
+  let ogDeterminer: string | null = null;
+  let ogLocale: string | null = null;
+  let ogLocaleAlternate: string[] | null = null;
+  let ogSiteName: string | null = null;
+  let ogVideo: string | null = null;
   let dctermsCreated: string | null = null;
   let dcDateCreated: string | null = null;
   let dcDate: string | null = null;
@@ -42,7 +56,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
   let dctermsSubject: string | null = null;
   let dcSubject: string | null = null;
   let dcDescription: string | null = null;
-  let ogImage: string | null = null;
   let dctermsKeywords: string | null = null;
   let modifiedTime: string | null = null;
   let publishedTime: string | null = null;
@@ -62,11 +75,18 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
     robots = soup('meta[name="robots"]').attr("content") || null;
     ogTitle = soup('meta[property="og:title"]').attr("content") || null;
     ogDescription = soup('meta[property="og:description"]').attr("content") || null;
+    ogUrl = soup('meta[property="og:url"]').attr("content") || null;
+    ogImage = soup('meta[property="og:image"]').attr("content") || null;
+    ogAudio = soup('meta[property="og:audio"]').attr("content") || null;
+    ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || null;
+    ogLocale = soup('meta[property="og:locale"]').attr("content") || null;
+    ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || null;
+    ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null;
+    ogVideo = soup('meta[property="og:video"]').attr("content") || null;
     articleSection = soup('meta[name="article:section"]').attr("content") || null;
     articleTag = soup('meta[name="article:tag"]').attr("content") || null;
     publishedTime = soup('meta[property="article:published_time"]').attr("content") || null;
     modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || null;
-    ogImage = soup('meta[property="og:image"]').attr("content") || null;
     dctermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || null;
     dcDescription = soup('meta[name="dc.description"]').attr("content") || null;
     dcSubject = soup('meta[name="dc.subject"]').attr("content") || null;
@@ -90,6 +110,14 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
     ...(robots ? { robots } : {}),
     ...(ogTitle ? { ogTitle } : {}),
     ...(ogDescription ? { ogDescription } : {}),
+    ...(ogUrl ? { ogUrl } : {}),
+    ...(ogImage ? { ogImage } : {}),
+    ...(ogAudio ? { ogAudio } : {}),
+    ...(ogDeterminer ? { ogDeterminer } : {}),
+    ...(ogLocale ? { ogLocale } : {}),
+    ...(ogLocaleAlternate ? { ogLocaleAlternate } : {}),
+    ...(ogSiteName ? { ogSiteName } : {}),
+    ...(ogVideo ? { ogVideo } : {}),
     ...(dctermsCreated ? { dctermsCreated } : {}),
     ...(dcDateCreated ? { dcDateCreated } : {}),
     ...(dcDate ? { dcDate } : {}),
@@ -99,7 +127,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
     ...(dctermsSubject ? { dctermsSubject } : {}),
     ...(dcSubject ? { dcSubject } : {}),
     ...(dcDescription ? { dcDescription } : {}),
-    ...(ogImage ? { ogImage } : {}),
     ...(dctermsKeywords ? { dctermsKeywords } : {}),
     ...(modifiedTime ? { modifiedTime } : {}),
     ...(publishedTime ? { publishedTime } : {}),
apps/api/src/search/googlesearch.ts (new file, 131 lines)
@@ -0,0 +1,131 @@
import axios from 'axios';
import * as cheerio from 'cheerio';
import * as querystring from 'querystring';

const _useragent_list = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0'
];

function get_useragent(): string {
  return _useragent_list[Math.floor(Math.random() * _useragent_list.length)];
}

async function _req(term: string, results: number, lang: string, start: number, proxies: any, timeout: number, tbs: string = null, filter: string = null) {
  const params = {
    "q": term,
    "num": results, // Number of results to return
    "hl": lang,
    "start": start,
  };
  if (tbs) {
    params["tbs"] = tbs;
  }
  if (filter) {
    params["filter"] = filter;
  }
  try {
    const resp = await axios.get("https://www.google.com/search", {
      headers: {
        "User-Agent": get_useragent()
      },
      params: params,
      proxy: proxies,
      timeout: timeout,
    });
    return resp;
  } catch (error) {
    if (error.response && error.response.status === 429) {
      throw new Error('Google Search: Too many requests, try again later.');
    }
    throw error;
  }
}

class SearchResult {
  url: string;
  title: string;
  description: string;

  constructor(url: string, title: string, description: string) {
    this.url = url;
    this.title = title;
    this.description = description;
  }

  toString(): string {
    return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`;
  }
}

export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000): Promise<string[]> {
  const escaped_term = querystring.escape(term);

  let proxies = null;
  if (proxy) {
    if (proxy.startsWith("https")) {
      proxies = { "https": proxy };
    } else {
      proxies = { "http": proxy };
    }
  }

  // TODO: knowledge graph, answer box, etc.

  let start = 0;
  let results: string[] = [];
  let attempts = 0;
  const maxAttempts = 20; // Define a maximum number of attempts to prevent infinite loop
  while (start < num_results && attempts < maxAttempts) {
    try {
      const resp = await _req(escaped_term, num_results - start, lang, start, proxies, timeout, tbs, filter);
      const $ = cheerio.load(resp.data);
      const result_block = $("div.g");
      if (result_block.length === 0) {
        start += 1;
        attempts += 1;
      } else {
        attempts = 0; // Reset attempts if we have results
      }
      result_block.each((index, element) => {
        const linkElement = $(element).find("a");
        const link = linkElement && linkElement.attr("href") ? linkElement.attr("href") : null;
        const title = $(element).find("h3");
        const ogImage = $(element).find("img").eq(1).attr("src");
        const description_box = $(element).find("div[style='-webkit-line-clamp:2']");
        const answerBox = $(element).find(".mod").text();
        if (description_box) {
          const description = description_box.text();
          if (link && title && description) {
            start += 1;
            if (advanced) {
              // results.push(new SearchResult(link, title.text(), description));
            } else {
              results.push(link);
            }
          }
        }
      });
      await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000));
    } catch (error) {
      if (error.message === 'Too many requests') {
        console.warn('Too many requests, breaking the loop');
        break;
      }
      throw error;
    }

    if (start === 0) {
      return results;
    }
  }
  if (attempts >= maxAttempts) {
    console.warn('Max attempts reached, breaking the loop');
  }
  return results
}
apps/api/src/search/index.ts (new file, 45 lines)
@@ -0,0 +1,45 @@
import { google_search } from "./googlesearch";
import { serper_search } from "./serper";

export async function search({
  query,
  advanced = false,
  num_results = 7,
  tbs = null,
  filter = null,
  lang = "en",
  proxy = null,
  sleep_interval = 0,
  timeout = 5000,
}: {
  query: string;
  advanced?: boolean;
  num_results?: number;
  tbs?: string;
  filter?: string;
  lang?: string;
  proxy?: string;
  sleep_interval?: number;
  timeout?: number;
}) {
  try {
    if (process.env.SERPER_API_KEY) {
      return await serper_search(query, num_results);
    }
    return await google_search(
      query,
      advanced,
      num_results,
      tbs,
      filter,
      lang,
      proxy,
      sleep_interval,
      timeout
    );
  } catch (error) {
    console.error("Error in search function: ", error);
    return []
  }
  // if process.env.SERPER_API_KEY is set, use serper
}
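A quick usage sketch (not part of the commit): searchHelper in the controller above calls this wrapper roughly as below, with Serper used when SERPER_API_KEY is set and the Google scraper otherwise. The query string is illustrative.

// Sketch: search() resolves to an array of result URLs from either provider.
const links = await search({
  query: "firecrawl search endpoint",
  num_results: 7, // mapped from searchOptions.limit in searchHelper
});
// The controller then filters links through isUrlBlocked() and scrapes them
// with WebScraperDataProvider.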
apps/api/src/search/serper.ts (new file, 27 lines)
@@ -0,0 +1,27 @@
import axios from "axios";
import dotenv from "dotenv";

dotenv.config();

export async function serper_search(q, num_results): Promise<string[]> {
  let data = JSON.stringify({
    q: q,
    "num": num_results
  });

  let config = {
    method: "POST",
    url: "https://google.serper.dev/search",
    headers: {
      "X-API-KEY": process.env.SERPER_API_KEY,
      "Content-Type": "application/json",
    },
    data: data,
  };
  const response = await axios(config);
  if (response && response.data && Array.isArray(response.data.organic)) {
    return response.data.organic.map((a) => a.link);
  } else {
    return [];
  }
}
@@ -44,6 +44,8 @@ export enum RateLimiterMode {
   CrawlStatus = "crawl-status",
   Scrape = "scrape",
   Preview = "preview",
+  Search = "search",
+
 }

 export interface AuthResponse {