0

Nick: serper support

This commit is contained in:
Nicolas 2024-04-23 16:45:06 -07:00
parent 8cb5d7955a
commit 41263bb4b6
6 changed files with 157 additions and 85 deletions

View File

@ -22,3 +22,4 @@ BULL_AUTH_KEY= #
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api

View File

@ -5,7 +5,7 @@ import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../types"; import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job"; import { logJob } from "../services/logging/log_job";
import { PageOptions, SearchOptions } from "../lib/entities"; import { PageOptions, SearchOptions } from "../lib/entities";
import { search } from "../search/googlesearch"; import { search } from "../search";
export async function searchHelper( export async function searchHelper(
req: Request, req: Request,
@ -25,7 +25,10 @@ export async function searchHelper(
return { success: false, error: "Query is required", returnCode: 400 }; return { success: false, error: "Query is required", returnCode: 400 };
} }
const res = await search(query, advanced, searchOptions.limit ?? 7); const tbs = searchOptions.tbs ?? null;
const filter = searchOptions.filter ?? null;
const res = await search({query: query, advanced: advanced, num_results: searchOptions.limit ?? 7, tbs: tbs, filter: filter});
let justSearch = pageOptions.fetchPageContent === false; let justSearch = pageOptions.fetchPageContent === false;
@ -33,15 +36,14 @@ export async function searchHelper(
return { success: true, data: res, returnCode: 200 }; return { success: true, data: res, returnCode: 200 };
} }
if (res.results.length === 0) { if (res.length === 0) {
return { success: true, error: "No search results found", returnCode: 200 }; return { success: true, error: "No search results found", returnCode: 200 };
} }
console.log(res.results);
const a = new WebScraperDataProvider(); const a = new WebScraperDataProvider();
await a.setOptions({ await a.setOptions({
mode: "single_urls", mode: "single_urls",
urls: res.results.map((r) => (!advanced ? r : r.url)), urls: res.map((r) => r),
crawlerOptions: { crawlerOptions: {
...crawlerOptions, ...crawlerOptions,
}, },

View File

@ -13,10 +13,13 @@ export type PageOptions = {
onlyMainContent?: boolean; onlyMainContent?: boolean;
fallback?: boolean; fallback?: boolean;
fetchPageContent?: boolean; fetchPageContent?: boolean;
}; };
export type SearchOptions = { export type SearchOptions = {
limit?: number; limit?: number;
tbs?: string;
filter?: string;
}; };
export type WebScraperOptions = { export type WebScraperOptions = {

View File

@ -1,7 +1,6 @@
import axios from 'axios'; import axios from 'axios';
import * as cheerio from 'cheerio'; import * as cheerio from 'cheerio';
import * as querystring from 'querystring'; import * as querystring from 'querystring';
import { ScrapingBeeClient } from 'scrapingbee';
const _useragent_list = [ const _useragent_list = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
@ -17,20 +16,35 @@ function get_useragent(): string {
return _useragent_list[Math.floor(Math.random() * _useragent_list.length)]; return _useragent_list[Math.floor(Math.random() * _useragent_list.length)];
} }
async function _req(term: string, results: number, lang: string, start: number, proxies: any, timeout: number) { async function _req(term: string, results: number, lang: string, start: number, proxies: any, timeout: number, tbs: string = null, filter: string = null) {
const params = {
"q": term,
"num": results, // Number of results to return
"hl": lang,
"start": start,
};
if (tbs) {
params["tbs"] = tbs;
}
if (filter) {
params["filter"] = filter;
}
try {
const resp = await axios.get("https://www.google.com/search", { const resp = await axios.get("https://www.google.com/search", {
headers: { headers: {
"User-Agent": get_useragent() "User-Agent": get_useragent()
}, },
params: { params: params,
"q": term,
"num": results, // Number of results to return
"hl": lang,
},
proxy: proxies, proxy: proxies,
timeout: timeout, timeout: timeout,
}); });
return resp; return resp;
} catch (error) {
if (error.response && error.response.status === 429) {
throw new Error('Google Search: Too many requests, try again later.');
}
throw error;
}
} }
class SearchResult { class SearchResult {
@ -49,7 +63,7 @@ class SearchResult {
} }
} }
export async function search(term: string, advanced = false, num_results = 7, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000) { export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<string[]> {
const escaped_term = querystring.escape(term); const escaped_term = querystring.escape(term);
let proxies = null; let proxies = null;
@ -64,13 +78,19 @@ export async function search(term: string, advanced = false, num_results = 7, la
// TODO: knowledge graph, answer box, etc. // TODO: knowledge graph, answer box, etc.
let start = 0; let start = 0;
let results = []; let results : string[] = [];
while (start < num_results) { let attempts = 0;
const resp = await _req(escaped_term, num_results - start, lang, start, proxies, timeout); const maxAttempts = 20; // Define a maximum number of attempts to prevent infinite loop
while (start < num_results && attempts < maxAttempts) {
try {
const resp = await _req(escaped_term, num_results - start, lang, start, proxies, timeout, tbs, filter);
const $ = cheerio.load(resp.data); const $ = cheerio.load(resp.data);
const result_block = $("div.g"); const result_block = $("div.g");
if (result_block.length === 0) { if (result_block.length === 0) {
start += 1; start += 1;
attempts += 1;
} else {
attempts = 0; // Reset attempts if we have results
} }
result_block.each((index, element) => { result_block.each((index, element) => {
const linkElement = $(element).find("a"); const linkElement = $(element).find("a");
@ -84,7 +104,7 @@ export async function search(term: string, advanced = false, num_results = 7, la
if (link && title && description) { if (link && title && description) {
start += 1; start += 1;
if (advanced) { if (advanced) {
results.push(new SearchResult(link, title.text(), description)); // results.push(new SearchResult(link, title.text(), description));
} else { } else {
results.push(link); results.push(link);
} }
@ -92,46 +112,20 @@ export async function search(term: string, advanced = false, num_results = 7, la
} }
}); });
await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000)); await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000));
} catch (error) {
if (error.message === 'Too many requests') {
console.warn('Too many requests, breaking the loop');
break;
}
throw error;
}
if (start === 0) { if (start === 0) {
return {results: []}; return results;
} }
} }
return {results: results}; if (attempts >= maxAttempts) {
console.warn('Max attempts reached, breaking the loop');
}
return results
} }
// const response = await _req_scraping_bee(escaped_term, num_results, lang);
// const $ = cheerio.load(response);
// const knowledgeGraphElement = $("div.kno-rdesc");
// console.log(knowledgeGraphElement);
// console.log(knowledgeGraphElement.html());
// let knowledgeGraph = null;
// if (knowledgeGraphElement.length > 0) {
// console.log("Knowledge Graph found");
// const title = knowledgeGraphElement.find("h2").text();
// const type = knowledgeGraphElement.find("div[data-attrid='subtitle']").text();
// const website = knowledgeGraphElement.find("a[data-ved]").attr("href");
// const imageUrl = knowledgeGraphElement.find("g-img img").attr("src");
// const description = knowledgeGraphElement.find("div[data-attrid='description'] span").text();
// const descriptionSource = knowledgeGraphElement.find("div[data-attrid='description'] a").text();
// const descriptionLink = knowledgeGraphElement.find("div[data-attrid='description'] a").attr("href");
// const attributes = {};
// knowledgeGraphElement.find("div[data-attrid='kc:/common:sideways']").each((index, element) => {
// const attributeKey = $(element).find("span[data-attrid]").text();
// const attributeValue = $(element).find("span[data-log-string]").text();
// attributes[attributeKey] = attributeValue;
// });
// knowledgeGraph = {
// "title": title,
// "type": type,
// "website": website,
// "imageUrl": imageUrl,
// "description": description,
// "descriptionSource": descriptionSource,
// "descriptionLink": descriptionLink,
// "attributes": attributes
// };
// }

View File

@ -0,0 +1,45 @@
import { google_search } from "./googlesearch";
import { serper_search } from "./serper";
/**
 * Searches the web for `query` and resolves with the links that were found.
 *
 * Provider selection: if the `SERPER_API_KEY` environment variable is set,
 * the request is handled by `serper_search`, which receives only `query` and
 * `num_results`. Otherwise all options are forwarded, positionally, to
 * `google_search`.
 *
 * Errors never reach the caller: anything thrown by the chosen provider is
 * logged with `console.error` and an empty array is returned instead.
 *
 * @param options.query - Search terms.
 * @param options.advanced - Forwarded to `google_search` only.
 * @param options.num_results - Requested number of results (default 7).
 * @param options.tbs - Forwarded to `google_search` only.
 * @param options.filter - Forwarded to `google_search` only.
 * @param options.lang - Forwarded to `google_search` only (default "en").
 * @param options.proxy - Forwarded to `google_search` only.
 * @param options.sleep_interval - Forwarded to `google_search` only.
 * @param options.timeout - Forwarded to `google_search` only (default 5000).
 * @returns The provider's result, or `[]` when the provider threw.
 */
export async function search({
  query,
  advanced = false,
  num_results = 7,
  tbs = null,
  filter = null,
  lang = "en",
  proxy = null,
  sleep_interval = 0,
  timeout = 5000,
}: {
  query: string;
  advanced?: boolean;
  num_results?: number;
  tbs?: string;
  filter?: string;
  lang?: string;
  proxy?: string;
  sleep_interval?: number;
  timeout?: number;
}) {
  try {
    // A configured Serper key takes precedence over the Google fallback.
    const links = process.env.SERPER_API_KEY
      ? await serper_search(query, num_results)
      : await google_search(
          query,
          advanced,
          num_results,
          tbs,
          filter,
          lang,
          proxy,
          sleep_interval,
          timeout
        );
    return links;
  } catch (error) {
    // Best-effort: report the failure and hand back an empty result set.
    console.error("Error in search function: ", error);
    return [];
  }
}

View File

@ -0,0 +1,27 @@
import axios from "axios";
import dotenv from "dotenv";
dotenv.config();
/**
 * Queries the Serper search endpoint and returns the links of its organic
 * results.
 *
 * A POST request is sent to `https://google.serper.dev/search` with a JSON
 * body containing the query (`q`) and the requested result count (`num`).
 * The `X-API-KEY` header is read from the `SERPER_API_KEY` environment
 * variable.
 *
 * Errors raised by the axios request are not caught here; they reject the
 * returned promise.
 *
 * @param q - Search query, sent as the `q` field of the request body.
 * @param num_results - Sent as the `num` field of the request body.
 * @returns The `link` value of every entry in the response's `organic`
 *   array, or an empty array when the response carries no such array.
 */
export async function serper_search(q, num_results) : Promise<string[]> {
  // Serialize the body up front so the request is sent as a raw JSON string.
  const payload = JSON.stringify({ q: q, num: num_results });

  const response = await axios({
    method: "POST",
    url: "https://google.serper.dev/search",
    headers: {
      "X-API-KEY": process.env.SERPER_API_KEY,
      "Content-Type": "application/json",
    },
    data: payload,
  });

  // Anything other than an array under `data.organic` counts as "no results".
  const organic = response?.data?.organic;
  if (!Array.isArray(organic)) {
    return [];
  }
  return organic.map((entry) => entry.link);
}