From 307ea6f5ec48760715f75939b269a1d5a1078eaa Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 24 Apr 2024 10:11:01 -0700 Subject: [PATCH] Nick: improvements to search --- apps/api/src/controllers/search.ts | 4 ++-- apps/api/src/lib/entities.ts | 17 +++++++++++++++++ apps/api/src/search/googlesearch.ts | 25 ++++--------------------- apps/api/src/search/index.ts | 3 ++- apps/api/src/search/serper.ts | 14 +++++++++----- 5 files changed, 34 insertions(+), 29 deletions(-) diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index f18f1c5..28169c0 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -37,7 +37,7 @@ export async function searchHelper( return { success: true, data: res, returnCode: 200 }; } - res = res.filter((r) => !isUrlBlocked(r)); + res = res.filter((r) => !isUrlBlocked(r.url)); if (res.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; @@ -48,7 +48,7 @@ export async function searchHelper( const a = new WebScraperDataProvider(); await a.setOptions({ mode: "single_urls", - urls: res.map((r) => r), + urls: res.map((r) => r.url), crawlerOptions: { ...crawlerOptions, }, diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 1144c63..bda7448 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -71,3 +71,20 @@ export class Document { this.provider = data.provider || undefined; } } + + +export class SearchResult { + url: string; + title: string; + description: string; + + constructor(url: string, title: string, description: string) { + this.url = url; + this.title = title; + this.description = description; + } + + toString(): string { + return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`; + } +} \ No newline at end of file diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index 53227e6..0f7c72f 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -1,6 +1,7 @@ import axios from 'axios'; import * as cheerio from 'cheerio'; import * as querystring from 'querystring'; +import { SearchResult } from '../../src/lib/entities'; const _useragent_list = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', @@ -47,23 +48,9 @@ async function _req(term: string, results: number, lang: string, start: number, } } -class SearchResult { - url: string; - title: string; - description: string; - constructor(url: string, title: string, description: string) { - this.url = url; - this.title = title; - this.description = description; - } - toString(): string { - return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`; - } -} - -export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise { +export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise { const escaped_term = querystring.escape(term); let proxies = null; @@ -78,7 +65,7 @@ export async function google_search(term: string, advanced = false, num_results // TODO: knowledge graph, answer box, etc. let start = 0; - let results : string[] = []; + let results : SearchResult[] = []; let attempts = 0; const maxAttempts = 20; // Define a maximum number of attempts to prevent infinite loop while (start < num_results && attempts < maxAttempts) { @@ -103,11 +90,7 @@ export async function google_search(term: string, advanced = false, num_results const description = description_box.text(); if (link && title && description) { start += 1; - if (advanced) { - // results.push(new SearchResult(link, title.text(), description)); - } else { - results.push(link); - } + results.push(new SearchResult(link, title.text(), description)); } } }); diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index ae62451..5a6a3d8 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -1,3 +1,4 @@ +import { SearchResult } from "../../src/lib/entities"; import { google_search } from "./googlesearch"; import { serper_search } from "./serper"; @@ -21,7 +22,7 @@ export async function search({ proxy?: string; sleep_interval?: number; timeout?: number; -}) { +}) : Promise { try { if (process.env.SERPER_API_KEY && !tbs) { return await serper_search(query, num_results); diff --git a/apps/api/src/search/serper.ts b/apps/api/src/search/serper.ts index 2b4ba02..f8806b7 100644 --- a/apps/api/src/search/serper.ts +++ b/apps/api/src/search/serper.ts @@ -1,13 +1,13 @@ import axios from "axios"; import dotenv from "dotenv"; +import { SearchResult } from "../../src/lib/entities"; dotenv.config(); -export async function serper_search(q, num_results) : Promise { +export async function serper_search(q, num_results): Promise { let data = JSON.stringify({ q: q, - "num": num_results, - + num: num_results, }); let config = { @@ -21,8 +21,12 @@ export async function serper_search(q, num_results) : Promise { }; const response = await axios(config); if (response && response.data && Array.isArray(response.data.organic)) { - return response.data.organic.map((a) => a.link); - } else { + return response.data.organic.map((a) => ({ + url: a.link, + title: a.title, + description: a.snippet, + })); + }else{ return []; } }