From 41263bb4b6deb17042d64ea34cab72159e1340dc Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 16:45:06 -0700 Subject: [PATCH] Nick: serper support --- apps/api/.env.example | 3 +- apps/api/src/controllers/search.ts | 12 ++- apps/api/src/lib/entities.ts | 3 + apps/api/src/search/googlesearch.ts | 152 +++++++++++++--------------- apps/api/src/search/index.ts | 45 ++++++++ apps/api/src/search/serper.ts | 27 +++++ 6 files changed, 157 insertions(+), 85 deletions(-) create mode 100644 apps/api/src/search/index.ts create mode 100644 apps/api/src/search/serper.ts diff --git a/apps/api/.env.example b/apps/api/.env.example index 34e24b1..3bd06cd 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -21,4 +21,5 @@ OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) BULL_AUTH_KEY= # LOGTAIL_KEY= # Use if you're configuring basic logging with logtail PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback -LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs \ No newline at end of file +LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs +SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api \ No newline at end of file diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index bc6659b..6a1c7b4 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -5,7 +5,7 @@ import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../types"; import { logJob } from "../services/logging/log_job"; import { PageOptions, SearchOptions } from "../lib/entities"; -import { search } from "../search/googlesearch"; +import { search } from "../search"; export async function searchHelper( req: Request, @@ -25,7 +25,10 @@ export async function searchHelper( return { success: false, error: "Query is required", returnCode: 400 }; } - const res = await search(query, advanced, searchOptions.limit ?? 7); + const tbs = searchOptions.tbs ?? null; + const filter = searchOptions.filter ?? null; + + const res = await search({query: query, advanced: advanced, num_results: searchOptions.limit ?? 7, tbs: tbs, filter: filter}); let justSearch = pageOptions.fetchPageContent === false; @@ -33,15 +36,14 @@ export async function searchHelper( return { success: true, data: res, returnCode: 200 }; } - if (res.results.length === 0) { + if (res.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; } - console.log(res.results); const a = new WebScraperDataProvider(); await a.setOptions({ mode: "single_urls", - urls: res.results.map((r) => (!advanced ? r : r.url)), + urls: res.map((r) => r), crawlerOptions: { ...crawlerOptions, }, diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index b4b5193..062212b 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -13,10 +13,13 @@ export type PageOptions = { onlyMainContent?: boolean; fallback?: boolean; fetchPageContent?: boolean; + }; export type SearchOptions = { limit?: number; + tbs?: string; + filter?: string; }; export type WebScraperOptions = { diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index c835d08..53227e6 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -1,7 +1,6 @@ import axios from 'axios'; import * as cheerio from 'cheerio'; import * as querystring from 'querystring'; -import { ScrapingBeeClient } from 'scrapingbee'; const _useragent_list = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', @@ -17,20 +16,35 @@ function get_useragent(): string { return _useragent_list[Math.floor(Math.random() * _useragent_list.length)]; } -async function _req(term: string, results: number, lang: string, start: number, proxies: any, timeout: number) { - const resp = await axios.get("https://www.google.com/search", { - headers: { - "User-Agent": get_useragent() - }, - params: { - "q": term, - "num": results, // Number of results to return - "hl": lang, - }, - proxy: proxies, - timeout: timeout, - }); - return resp; +async function _req(term: string, results: number, lang: string, start: number, proxies: any, timeout: number, tbs: string = null, filter: string = null) { + const params = { + "q": term, + "num": results, // Number of results to return + "hl": lang, + "start": start, + }; + if (tbs) { + params["tbs"] = tbs; + } + if (filter) { + params["filter"] = filter; + } + try { + const resp = await axios.get("https://www.google.com/search", { + headers: { + "User-Agent": get_useragent() + }, + params: params, + proxy: proxies, + timeout: timeout, + }); + return resp; + } catch (error) { + if (error.response && error.response.status === 429) { + throw new Error('Google Search: Too many requests, try again later.'); + } + throw error; + } } class SearchResult { @@ -49,7 +63,7 @@ class SearchResult { } } -export async function search(term: string, advanced = false, num_results = 7, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000) { +export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise { const escaped_term = querystring.escape(term); let proxies = null; @@ -64,74 +78,54 @@ export async function search(term: string, advanced = false, num_results = 7, la // TODO: knowledge graph, answer box, etc. let start = 0; - let results = []; - while (start < num_results) { - const resp = await _req(escaped_term, num_results - start, lang, start, proxies, timeout); - const $ = cheerio.load(resp.data); - const result_block = $("div.g"); - if (result_block.length === 0) { - start += 1; - } - result_block.each((index, element) => { - const linkElement = $(element).find("a"); - const link = linkElement && linkElement.attr("href") ? linkElement.attr("href") : null; - const title = $(element).find("h3"); - const ogImage = $(element).find("img").eq(1).attr("src"); - const description_box = $(element).find("div[style='-webkit-line-clamp:2']"); - const answerBox = $(element).find(".mod").text(); - if (description_box) { - const description = description_box.text(); - if (link && title && description) { - start += 1; - if (advanced) { - results.push(new SearchResult(link, title.text(), description)); - } else { - results.push(link); + let results : string[] = []; + let attempts = 0; + const maxAttempts = 20; // Define a maximum number of attempts to prevent infinite loop + while (start < num_results && attempts < maxAttempts) { + try { + const resp = await _req(escaped_term, num_results - start, lang, start, proxies, timeout, tbs, filter); + const $ = cheerio.load(resp.data); + const result_block = $("div.g"); + if (result_block.length === 0) { + start += 1; + attempts += 1; + } else { + attempts = 0; // Reset attempts if we have results + } + result_block.each((index, element) => { + const linkElement = $(element).find("a"); + const link = linkElement && linkElement.attr("href") ? linkElement.attr("href") : null; + const title = $(element).find("h3"); + const ogImage = $(element).find("img").eq(1).attr("src"); + const description_box = $(element).find("div[style='-webkit-line-clamp:2']"); + const answerBox = $(element).find(".mod").text(); + if (description_box) { + const description = description_box.text(); + if (link && title && description) { + start += 1; + if (advanced) { + // results.push(new SearchResult(link, title.text(), description)); + } else { + results.push(link); + } } } + }); + await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000)); + } catch (error) { + if (error.message === 'Too many requests') { + console.warn('Too many requests, breaking the loop'); + break; } - }); - await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000)); + throw error; + } if (start === 0) { - return {results: []}; + return results; } } - return {results: results}; + if (attempts >= maxAttempts) { + console.warn('Max attempts reached, breaking the loop'); + } + return results } - - -// const response = await _req_scraping_bee(escaped_term, num_results, lang); - // const $ = cheerio.load(response); - - // const knowledgeGraphElement = $("div.kno-rdesc"); - // console.log(knowledgeGraphElement); - // console.log(knowledgeGraphElement.html()); - - // let knowledgeGraph = null; - // if (knowledgeGraphElement.length > 0) { - // console.log("Knowledge Graph found"); - // const title = knowledgeGraphElement.find("h2").text(); - // const type = knowledgeGraphElement.find("div[data-attrid='subtitle']").text(); - // const website = knowledgeGraphElement.find("a[data-ved]").attr("href"); - // const imageUrl = knowledgeGraphElement.find("g-img img").attr("src"); - // const description = knowledgeGraphElement.find("div[data-attrid='description'] span").text(); - // const descriptionSource = knowledgeGraphElement.find("div[data-attrid='description'] a").text(); - // const descriptionLink = knowledgeGraphElement.find("div[data-attrid='description'] a").attr("href"); - // const attributes = {}; - // knowledgeGraphElement.find("div[data-attrid='kc:/common:sideways']").each((index, element) => { - // const attributeKey = $(element).find("span[data-attrid]").text(); - // const attributeValue = $(element).find("span[data-log-string]").text(); - // attributes[attributeKey] = attributeValue; - // }); - // knowledgeGraph = { - // "title": title, - // "type": type, - // "website": website, - // "imageUrl": imageUrl, - // "description": description, - // "descriptionSource": descriptionSource, - // "descriptionLink": descriptionLink, - // "attributes": attributes - // }; - // } \ No newline at end of file diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts new file mode 100644 index 0000000..0f3a596 --- /dev/null +++ b/apps/api/src/search/index.ts @@ -0,0 +1,45 @@ +import { google_search } from "./googlesearch"; +import { serper_search } from "./serper"; + +export async function search({ + query, + advanced = false, + num_results = 7, + tbs = null, + filter = null, + lang = "en", + proxy = null, + sleep_interval = 0, + timeout = 5000, +}: { + query: string; + advanced?: boolean; + num_results?: number; + tbs?: string; + filter?: string; + lang?: string; + proxy?: string; + sleep_interval?: number; + timeout?: number; +}) { + try { + if (process.env.SERPER_API_KEY) { + return await serper_search(query, num_results); + } + return await google_search( + query, + advanced, + num_results, + tbs, + filter, + lang, + proxy, + sleep_interval, + timeout + ); + } catch (error) { + console.error("Error in search function: ", error); + return [] + } + // if process.env.SERPER_API_KEY is set, use serper +} diff --git a/apps/api/src/search/serper.ts b/apps/api/src/search/serper.ts new file mode 100644 index 0000000..f92f2fc --- /dev/null +++ b/apps/api/src/search/serper.ts @@ -0,0 +1,27 @@ +import axios from "axios"; +import dotenv from "dotenv"; + +dotenv.config(); + +export async function serper_search(q, num_results) : Promise { + let data = JSON.stringify({ + q: q, + "num": num_results + }); + + let config = { + method: "POST", + url: "https://google.serper.dev/search", + headers: { + "X-API-KEY": process.env.SERPER_API_KEY, + "Content-Type": "application/json", + }, + data: data, + }; + const response = await axios(config); + if (response && response.data && Array.isArray(response.data.organic)) { + return response.data.organic.map((a) => a.link); + } else { + return []; + } +}