Nick: serper support
This commit is contained in:
parent
8cb5d7955a
commit
41263bb4b6
@ -21,4 +21,5 @@ OPENAI_API_KEY= # add for LLM-dependent features (image alt generation, etc.)
|
|||||||
BULL_AUTH_KEY= #
|
BULL_AUTH_KEY= #
|
||||||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
||||||
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
||||||
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
||||||
|
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
|
@ -5,7 +5,7 @@ import { authenticateUser } from "./auth";
|
|||||||
import { RateLimiterMode } from "../types";
|
import { RateLimiterMode } from "../types";
|
||||||
import { logJob } from "../services/logging/log_job";
|
import { logJob } from "../services/logging/log_job";
|
||||||
import { PageOptions, SearchOptions } from "../lib/entities";
|
import { PageOptions, SearchOptions } from "../lib/entities";
|
||||||
import { search } from "../search/googlesearch";
|
import { search } from "../search";
|
||||||
|
|
||||||
export async function searchHelper(
|
export async function searchHelper(
|
||||||
req: Request,
|
req: Request,
|
||||||
@ -25,7 +25,10 @@ export async function searchHelper(
|
|||||||
return { success: false, error: "Query is required", returnCode: 400 };
|
return { success: false, error: "Query is required", returnCode: 400 };
|
||||||
}
|
}
|
||||||
|
|
||||||
const res = await search(query, advanced, searchOptions.limit ?? 7);
|
const tbs = searchOptions.tbs ?? null;
|
||||||
|
const filter = searchOptions.filter ?? null;
|
||||||
|
|
||||||
|
const res = await search({query: query, advanced: advanced, num_results: searchOptions.limit ?? 7, tbs: tbs, filter: filter});
|
||||||
|
|
||||||
let justSearch = pageOptions.fetchPageContent === false;
|
let justSearch = pageOptions.fetchPageContent === false;
|
||||||
|
|
||||||
@ -33,15 +36,14 @@ export async function searchHelper(
|
|||||||
return { success: true, data: res, returnCode: 200 };
|
return { success: true, data: res, returnCode: 200 };
|
||||||
}
|
}
|
||||||
|
|
||||||
if (res.results.length === 0) {
|
if (res.length === 0) {
|
||||||
return { success: true, error: "No search results found", returnCode: 200 };
|
return { success: true, error: "No search results found", returnCode: 200 };
|
||||||
}
|
}
|
||||||
console.log(res.results);
|
|
||||||
|
|
||||||
const a = new WebScraperDataProvider();
|
const a = new WebScraperDataProvider();
|
||||||
await a.setOptions({
|
await a.setOptions({
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
urls: res.results.map((r) => (!advanced ? r : r.url)),
|
urls: res.map((r) => r),
|
||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
...crawlerOptions,
|
...crawlerOptions,
|
||||||
},
|
},
|
||||||
|
@ -13,10 +13,13 @@ export type PageOptions = {
|
|||||||
onlyMainContent?: boolean;
|
onlyMainContent?: boolean;
|
||||||
fallback?: boolean;
|
fallback?: boolean;
|
||||||
fetchPageContent?: boolean;
|
fetchPageContent?: boolean;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
 * Optional tuning knobs for a search request.
 */
export type SearchOptions = {
  /** Number of results to request; `searchHelper` falls back to 7 when unset. */
  limit?: number;
  /** Forwarded to the search backend as the `tbs` query parameter. */
  tbs?: string;
  /** Forwarded to the search backend as the `filter` query parameter. */
  filter?: string;
};
|
||||||
|
|
||||||
export type WebScraperOptions = {
|
export type WebScraperOptions = {
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
import axios from 'axios';
|
import axios from 'axios';
|
||||||
import * as cheerio from 'cheerio';
|
import * as cheerio from 'cheerio';
|
||||||
import * as querystring from 'querystring';
|
import * as querystring from 'querystring';
|
||||||
import { ScrapingBeeClient } from 'scrapingbee';
|
|
||||||
|
|
||||||
const _useragent_list = [
|
const _useragent_list = [
|
||||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
|
||||||
@ -17,20 +16,35 @@ function get_useragent(): string {
|
|||||||
return _useragent_list[Math.floor(Math.random() * _useragent_list.length)];
|
return _useragent_list[Math.floor(Math.random() * _useragent_list.length)];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Issues a single GET request against the Google web-search endpoint.
 *
 * @param term - Search term, sent as the `q` query parameter.
 * @param results - Sent as `num` (number of results to return).
 * @param lang - Sent as `hl`.
 * @param start - Sent as `start` (result offset).
 * @param proxies - Passed through unchanged as the axios `proxy` option.
 * @param timeout - Passed through unchanged as the axios `timeout` option.
 * @param tbs - Optional `tbs` query parameter; omitted when null/empty.
 * @param filter - Optional `filter` query parameter; omitted when null/empty.
 * @returns The axios response of the search request.
 * @throws Error with a "too many requests" message when the server answers
 *   with HTTP 429; any other failure is rethrown unchanged.
 */
async function _req(
  term: string,
  results: number,
  lang: string,
  start: number,
  proxies: any,
  timeout: number,
  tbs: string | null = null,
  filter: string | null = null
) {
  // Typed as an open record so the optional keys below can be added without
  // indexing into a closed object-literal type.
  const params: Record<string, string | number> = {
    q: term,
    num: results, // Number of results to return
    hl: lang,
    start: start,
  };
  if (tbs) {
    params.tbs = tbs;
  }
  if (filter) {
    params.filter = filter;
  }

  try {
    // `return await` (not a bare return) so rejections are handled by the catch below.
    return await axios.get("https://www.google.com/search", {
      headers: {
        "User-Agent": get_useragent(),
      },
      params: params,
      proxy: proxies,
      timeout: timeout,
    });
  } catch (error) {
    // Narrow defensively: anything can be thrown, not only axios errors.
    const status =
      typeof error === "object" && error !== null
        ? (error as { response?: { status?: number } }).response?.status
        : undefined;
    if (status === 429) {
      // NOTE(review): google_search compares error.message against
      // 'Too many requests', which does not match this message, so its
      // break-the-loop branch is never taken. Confirm which side is intended.
      throw new Error('Google Search: Too many requests, try again later.');
    }
    throw error;
  }
}
|
||||||
|
|
||||||
class SearchResult {
|
class SearchResult {
|
||||||
@ -49,7 +63,7 @@ class SearchResult {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function search(term: string, advanced = false, num_results = 7, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000) {
|
export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<string[]> {
|
||||||
const escaped_term = querystring.escape(term);
|
const escaped_term = querystring.escape(term);
|
||||||
|
|
||||||
let proxies = null;
|
let proxies = null;
|
||||||
@ -64,74 +78,54 @@ export async function search(term: string, advanced = false, num_results = 7, la
|
|||||||
// TODO: knowledge graph, answer box, etc.
|
// TODO: knowledge graph, answer box, etc.
|
||||||
|
|
||||||
let start = 0;
|
let start = 0;
|
||||||
let results = [];
|
let results : string[] = [];
|
||||||
while (start < num_results) {
|
let attempts = 0;
|
||||||
const resp = await _req(escaped_term, num_results - start, lang, start, proxies, timeout);
|
const maxAttempts = 20; // Define a maximum number of attempts to prevent infinite loop
|
||||||
const $ = cheerio.load(resp.data);
|
while (start < num_results && attempts < maxAttempts) {
|
||||||
const result_block = $("div.g");
|
try {
|
||||||
if (result_block.length === 0) {
|
const resp = await _req(escaped_term, num_results - start, lang, start, proxies, timeout, tbs, filter);
|
||||||
start += 1;
|
const $ = cheerio.load(resp.data);
|
||||||
}
|
const result_block = $("div.g");
|
||||||
result_block.each((index, element) => {
|
if (result_block.length === 0) {
|
||||||
const linkElement = $(element).find("a");
|
start += 1;
|
||||||
const link = linkElement && linkElement.attr("href") ? linkElement.attr("href") : null;
|
attempts += 1;
|
||||||
const title = $(element).find("h3");
|
} else {
|
||||||
const ogImage = $(element).find("img").eq(1).attr("src");
|
attempts = 0; // Reset attempts if we have results
|
||||||
const description_box = $(element).find("div[style='-webkit-line-clamp:2']");
|
}
|
||||||
const answerBox = $(element).find(".mod").text();
|
result_block.each((index, element) => {
|
||||||
if (description_box) {
|
const linkElement = $(element).find("a");
|
||||||
const description = description_box.text();
|
const link = linkElement && linkElement.attr("href") ? linkElement.attr("href") : null;
|
||||||
if (link && title && description) {
|
const title = $(element).find("h3");
|
||||||
start += 1;
|
const ogImage = $(element).find("img").eq(1).attr("src");
|
||||||
if (advanced) {
|
const description_box = $(element).find("div[style='-webkit-line-clamp:2']");
|
||||||
results.push(new SearchResult(link, title.text(), description));
|
const answerBox = $(element).find(".mod").text();
|
||||||
} else {
|
if (description_box) {
|
||||||
results.push(link);
|
const description = description_box.text();
|
||||||
|
if (link && title && description) {
|
||||||
|
start += 1;
|
||||||
|
if (advanced) {
|
||||||
|
// results.push(new SearchResult(link, title.text(), description));
|
||||||
|
} else {
|
||||||
|
results.push(link);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
});
|
||||||
|
await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000));
|
||||||
|
} catch (error) {
|
||||||
|
if (error.message === 'Too many requests') {
|
||||||
|
console.warn('Too many requests, breaking the loop');
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
});
|
throw error;
|
||||||
await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000));
|
}
|
||||||
|
|
||||||
if (start === 0) {
|
if (start === 0) {
|
||||||
return {results: []};
|
return results;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return {results: results};
|
if (attempts >= maxAttempts) {
|
||||||
|
console.warn('Max attempts reached, breaking the loop');
|
||||||
|
}
|
||||||
|
return results
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// const response = await _req_scraping_bee(escaped_term, num_results, lang);
|
|
||||||
// const $ = cheerio.load(response);
|
|
||||||
|
|
||||||
// const knowledgeGraphElement = $("div.kno-rdesc");
|
|
||||||
// console.log(knowledgeGraphElement);
|
|
||||||
// console.log(knowledgeGraphElement.html());
|
|
||||||
|
|
||||||
// let knowledgeGraph = null;
|
|
||||||
// if (knowledgeGraphElement.length > 0) {
|
|
||||||
// console.log("Knowledge Graph found");
|
|
||||||
// const title = knowledgeGraphElement.find("h2").text();
|
|
||||||
// const type = knowledgeGraphElement.find("div[data-attrid='subtitle']").text();
|
|
||||||
// const website = knowledgeGraphElement.find("a[data-ved]").attr("href");
|
|
||||||
// const imageUrl = knowledgeGraphElement.find("g-img img").attr("src");
|
|
||||||
// const description = knowledgeGraphElement.find("div[data-attrid='description'] span").text();
|
|
||||||
// const descriptionSource = knowledgeGraphElement.find("div[data-attrid='description'] a").text();
|
|
||||||
// const descriptionLink = knowledgeGraphElement.find("div[data-attrid='description'] a").attr("href");
|
|
||||||
// const attributes = {};
|
|
||||||
// knowledgeGraphElement.find("div[data-attrid='kc:/common:sideways']").each((index, element) => {
|
|
||||||
// const attributeKey = $(element).find("span[data-attrid]").text();
|
|
||||||
// const attributeValue = $(element).find("span[data-log-string]").text();
|
|
||||||
// attributes[attributeKey] = attributeValue;
|
|
||||||
// });
|
|
||||||
// knowledgeGraph = {
|
|
||||||
// "title": title,
|
|
||||||
// "type": type,
|
|
||||||
// "website": website,
|
|
||||||
// "imageUrl": imageUrl,
|
|
||||||
// "description": description,
|
|
||||||
// "descriptionSource": descriptionSource,
|
|
||||||
// "descriptionLink": descriptionLink,
|
|
||||||
// "attributes": attributes
|
|
||||||
// };
|
|
||||||
// }
|
|
45
apps/api/src/search/index.ts
Normal file
45
apps/api/src/search/index.ts
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
import { google_search } from "./googlesearch";
|
||||||
|
import { serper_search } from "./serper";
|
||||||
|
|
||||||
|
/**
 * Runs a web search and resolves to the list of result URLs.
 *
 * When the `SERPER_API_KEY` environment variable is set the query is sent to
 * Serper; otherwise it falls back to `google_search`.
 *
 * @param query - The search query.
 * @param advanced - Forwarded to `google_search` only.
 * @param num_results - Number of results to request (default 7).
 * @param tbs - Forwarded to `google_search` only; may be null.
 * @param filter - Forwarded to `google_search` only; may be null.
 * @param lang - Forwarded to `google_search` only (default "en").
 * @param proxy - Forwarded to `google_search` only; may be null.
 * @param sleep_interval - Forwarded to `google_search` only.
 * @param timeout - Forwarded to `google_search` only (default 5000).
 * @returns The result URLs, or an empty array when the backend throws
 *   (the error is logged, not propagated).
 */
export async function search({
  query,
  advanced = false,
  num_results = 7,
  tbs = null,
  filter = null,
  lang = "en",
  proxy = null,
  sleep_interval = 0,
  timeout = 5000,
}: {
  query: string;
  advanced?: boolean;
  num_results?: number;
  // Nullable on purpose: the defaults are null and searchHelper passes
  // `searchOptions.tbs ?? null` / `searchOptions.filter ?? null`.
  tbs?: string | null;
  filter?: string | null;
  lang?: string;
  proxy?: string | null;
  sleep_interval?: number;
  timeout?: number;
}): Promise<string[]> {
  try {
    // If process.env.SERPER_API_KEY is set, use Serper.
    if (process.env.SERPER_API_KEY) {
      // Only the query and result count are forwarded here; tbs, filter,
      // lang, proxy, sleep_interval and timeout have no effect on this path.
      return await serper_search(query, num_results);
    }
    return await google_search(
      query,
      advanced,
      num_results,
      tbs,
      filter,
      lang,
      proxy,
      sleep_interval,
      timeout
    );
  } catch (error) {
    // Best effort: a failing backend yields "no results" instead of an exception.
    console.error("Error in search function: ", error);
    return [];
  }
}
|
27
apps/api/src/search/serper.ts
Normal file
27
apps/api/src/search/serper.ts
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
import axios from "axios";
|
||||||
|
import dotenv from "dotenv";
|
||||||
|
|
||||||
|
dotenv.config();
|
||||||
|
|
||||||
|
/**
 * Queries the Serper search API and returns the links of its organic results.
 *
 * The API key is read from the `SERPER_API_KEY` environment variable and sent
 * in the `X-API-KEY` header.
 *
 * @param q - Search query, sent as `q` in the JSON body.
 * @param num_results - Sent as `num` in the JSON body.
 * @returns The `link` of every entry in the response's `organic` array that
 *   carries a non-empty string link; an empty array when the response has no
 *   `organic` array.
 * @throws Whatever axios throws for a failed request.
 */
export async function serper_search(q: string, num_results: number): Promise<string[]> {
  const data = JSON.stringify({
    q: q,
    num: num_results,
  });

  const config = {
    method: "POST",
    url: "https://google.serper.dev/search",
    headers: {
      "X-API-KEY": process.env.SERPER_API_KEY,
      "Content-Type": "application/json",
    },
    data: data,
  };

  const response = await axios(config);
  const organic: unknown = response?.data?.organic;
  if (!Array.isArray(organic)) {
    return [];
  }

  // Entries without a usable link are dropped so the declared string[] really
  // holds; callers hand these values on as URLs to scrape.
  return organic
    .map((entry) => entry?.link)
    .filter((link): link is string => typeof link === "string" && link.length > 0);
}
|
Loading…
Reference in New Issue
Block a user