0

Nick: improvements to search

This commit is contained in:
Nicolas 2024-04-24 10:11:01 -07:00
parent f189589da4
commit 307ea6f5ec
5 changed files with 34 additions and 29 deletions

View File

@ -37,7 +37,7 @@ export async function searchHelper(
return { success: true, data: res, returnCode: 200 };
}
res = res.filter((r) => !isUrlBlocked(r));
res = res.filter((r) => !isUrlBlocked(r.url));
if (res.length === 0) {
return { success: true, error: "No search results found", returnCode: 200 };
@ -48,7 +48,7 @@ export async function searchHelper(
const a = new WebScraperDataProvider();
await a.setOptions({
mode: "single_urls",
urls: res.map((r) => r),
urls: res.map((r) => r.url),
crawlerOptions: {
...crawlerOptions,
},

View File

@ -71,3 +71,20 @@ export class Document {
this.provider = data.provider || undefined;
}
}
/**
 * A single search hit: the target URL plus the title and description
 * text that accompany it.
 */
export class SearchResult {
  /**
   * @param url - Link the result points to.
   * @param title - Title text of the result.
   * @param description - Description text of the result.
   */
  constructor(
    public url: string,
    public title: string,
    public description: string,
  ) {}

  /**
   * Renders the result as a single-line, human-readable string.
   *
   * @returns `SearchResult(url=..., title=..., description=...)`.
   */
  toString(): string {
    const { url, title, description } = this;
    return `SearchResult(url=${url}, title=${title}, description=${description})`;
  }
}

View File

@ -1,6 +1,7 @@
import axios from 'axios';
import * as cheerio from 'cheerio';
import * as querystring from 'querystring';
import { SearchResult } from '../../src/lib/entities';
const _useragent_list = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
@ -47,23 +48,9 @@ async function _req(term: string, results: number, lang: string, start: number,
}
}
/**
 * Module-local record for one search hit: a URL together with its
 * title and description text.
 */
class SearchResult {
  /**
   * @param url - Link the result points to.
   * @param title - Title text of the result.
   * @param description - Description text of the result.
   */
  constructor(
    public url: string,
    public title: string,
    public description: string,
  ) {}

  /**
   * Formats the three fields into one line for display.
   *
   * @returns `SearchResult(url=..., title=..., description=...)`.
   */
  toString(): string {
    const { url, title, description } = this;
    return `SearchResult(url=${url}, title=${title}, description=${description})`;
  }
}
export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<string[]> {
export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
const escaped_term = querystring.escape(term);
let proxies = null;
@ -78,7 +65,7 @@ export async function google_search(term: string, advanced = false, num_results
// TODO: knowledge graph, answer box, etc.
let start = 0;
let results : string[] = [];
let results : SearchResult[] = [];
let attempts = 0;
const maxAttempts = 20; // Define a maximum number of attempts to prevent infinite loop
while (start < num_results && attempts < maxAttempts) {
@ -103,11 +90,7 @@ export async function google_search(term: string, advanced = false, num_results
const description = description_box.text();
if (link && title && description) {
start += 1;
if (advanced) {
// results.push(new SearchResult(link, title.text(), description));
} else {
results.push(link);
}
results.push(new SearchResult(link, title.text(), description));
}
}
});

View File

@ -1,3 +1,4 @@
import { SearchResult } from "../../src/lib/entities";
import { google_search } from "./googlesearch";
import { serper_search } from "./serper";
@ -21,7 +22,7 @@ export async function search({
proxy?: string;
sleep_interval?: number;
timeout?: number;
}) {
}) : Promise<SearchResult[]> {
try {
if (process.env.SERPER_API_KEY && !tbs) {
return await serper_search(query, num_results);

View File

@ -1,13 +1,13 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";
dotenv.config();
export async function serper_search(q, num_results) : Promise<string[]> {
export async function serper_search(q, num_results): Promise<SearchResult[]> {
let data = JSON.stringify({
q: q,
"num": num_results,
num: num_results,
});
let config = {
@ -21,8 +21,12 @@ export async function serper_search(q, num_results) : Promise<string[]> {
};
const response = await axios(config);
if (response && response.data && Array.isArray(response.data.organic)) {
return response.data.organic.map((a) => a.link);
} else {
return response.data.organic.map((a) => ({
url: a.link,
title: a.title,
description: a.snippet,
}));
}else{
return [];
}
}