2024-04-23 15:28:32 -07:00
|
|
|
import axios from 'axios';
|
|
|
|
import * as cheerio from 'cheerio';
|
|
|
|
import * as querystring from 'querystring';
|
|
|
|
import { ScrapingBeeClient } from 'scrapingbee';
|
|
|
|
|
|
|
|
/**
 * Pool of desktop browser User-Agent strings.
 * One entry is picked at random for every outgoing search request.
 */
const _useragent_list: readonly string[] = [
  // Firefox 66 on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
  // Chrome 111 on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
  // Chrome 111 on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
  // Chrome 109 on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  // Chrome 111 on Linux
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
  // Edge 111 on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
  // Firefox 111 on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
];
|
|
|
|
|
|
|
|
/**
 * Picks one User-Agent string uniformly at random from `_useragent_list`.
 *
 * @returns The selected User-Agent header value.
 */
function get_useragent(): string {
  // Math.random() is in [0, 1), so the floored product is always a valid index.
  const pick = Math.floor(Math.random() * _useragent_list.length);
  return _useragent_list[pick];
}
|
|
|
|
|
|
|
|
/** Shape of the `proxy` request option that axios accepts. */
type _AxiosProxy = {
  protocol: string;
  host: string;
  port: number;
  auth?: { username: string; password: string };
};

/**
 * Converts the proxy value handed to `_req` into axios' `proxy` option.
 *
 * Callers in this file pass a requests-style mapping (`{ http: url }` or
 * `{ https: url }`); axios instead expects `{ protocol, host, port, auth }`.
 * A value that already carries a string `host` is passed through untouched.
 *
 * @param proxies - `null`/`undefined` (no proxy configured), `false`, a
 *   `{ http | https: url }` mapping, or an axios-style proxy object.
 * @returns The axios proxy option, `false` if `false` was given, or
 *   `undefined` when no usable proxy URL is present.
 * @throws TypeError if the proxy URL cannot be parsed by `URL`.
 */
function _to_axios_proxy(proxies: any): _AxiosProxy | false | undefined {
  if (proxies === false) {
    return false;
  }
  if (!proxies) {
    return undefined;
  }
  if (typeof proxies.host === "string") {
    return proxies;
  }

  const raw = proxies.https ?? proxies.http;
  if (typeof raw !== "string" || raw === "") {
    return undefined;
  }

  // A bare "host:port" has no scheme; treat it as a plain HTTP proxy.
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(raw);
  const parsed = new URL(hasScheme ? raw : `http://${raw}`);
  const protocol = parsed.protocol.replace(/:$/, "");

  let port: number;
  if (parsed.port) {
    port = Number(parsed.port);
  } else {
    // URL drops the port when it is omitted or is the scheme's default.
    port = protocol === "https" ? 443 : 80;
  }

  const config: _AxiosProxy = { protocol, host: parsed.hostname, port };
  if (parsed.username) {
    config.auth = {
      username: decodeURIComponent(parsed.username),
      password: decodeURIComponent(parsed.password),
    };
  }
  return config;
}

/**
 * Issues a single GET request to the Google search endpoint.
 *
 * @param term - Search query; axios URL-encodes it when serialising `params`.
 * @param results - Number of results to ask for (`num`).
 * @param lang - Interface language code (`hl`).
 * @param start - Zero-based offset of the first result to return (`start`).
 * @param proxies - Proxy description; see `_to_axios_proxy` for accepted shapes.
 * @param timeout - Request timeout in milliseconds.
 * @returns The axios response; the caller reads the HTML from `resp.data`.
 */
async function _req(term: string, results: number, lang: string, start: number, proxies: any, timeout: number) {
  const resp = await axios.get("https://www.google.com/search", {
    headers: {
      "User-Agent": get_useragent()
    },
    params: {
      "q": term,
      "num": results, // Number of results to return
      "hl": lang,
      // Without the offset every call returns the first page again.
      "start": start,
    },
    proxy: _to_axios_proxy(proxies),
    timeout: timeout,
  });
  return resp;
}
|
|
|
|
|
|
|
|
/**
 * One organic search hit: target URL, headline and snippet text.
 */
class SearchResult {
  /**
   * @param url - Link target of the result.
   * @param title - Headline text of the result.
   * @param description - Snippet shown beneath the headline.
   */
  constructor(
    public url: string,
    public title: string,
    public description: string,
  ) {}

  /** Renders the result as `SearchResult(url=…, title=…, description=…)`. */
  toString(): string {
    const fields = [
      `url=${this.url}`,
      `title=${this.title}`,
      `description=${this.description}`,
    ];
    return "SearchResult(" + fields.join(", ") + ")";
  }
}
|
|
|
|
|
|
|
|
/**
 * Runs a Google web search and scrapes the result blocks from the returned HTML.
 *
 * Pages are requested through `_req` until `num_results` hits are collected
 * or a page contributes nothing new. A block is accepted only when it has a
 * link, an `<h3>` title element and non-empty snippet text; each URL is
 * reported at most once.
 *
 * NOTE: a knowledge-graph extraction prototype (ScrapingBee fetch plus
 * parsing of `div.kno-rdesc`) used to sit here as commented-out code. It was
 * never wired in; recover it from version control if it is needed.
 *
 * @param term - Raw search query. It is passed unescaped because axios
 *   URL-encodes query parameters itself; escaping here would double-encode.
 * @param advanced - When true, results are `SearchResult` objects; otherwise
 *   plain URL strings.
 * @param num_results - Maximum number of results to return.
 * @param lang - Interface language code forwarded to `_req`.
 * @param proxy - Optional proxy URL; a URL starting with "https" is filed
 *   under the `https` key, anything else under `http`.
 * @param sleep_interval - Seconds to wait between consecutive page requests.
 * @param timeout - Per-request timeout in milliseconds.
 * @returns `{ results }`, an array of at most `num_results` entries (possibly empty).
 */
export async function search(term: string, advanced = false, num_results = 7, lang = "en", proxy: string | null = null, sleep_interval = 0, timeout = 5000) {
  // requests-style proxy mapping, as expected by _req.
  let proxies: { http?: string; https?: string } | null = null;
  if (proxy) {
    proxies = proxy.startsWith("https") ? { "https": proxy } : { "http": proxy };
  }

  // Elements are SearchResult (advanced) or URL strings. Kept loosely typed so
  // existing callers that index into the array untyped keep compiling.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const results: any[] = [];
  const seen_links = new Set<string>();
  let start = 0;

  while (start < num_results) {
    const resp = await _req(term, num_results - start, lang, start, proxies, timeout);
    const $ = cheerio.load(resp.data);

    let added = 0;
    $("div.g").each((_index, element) => {
      if (results.length >= num_results) {
        return false; // enough collected; returning false stops the iteration
      }

      const block = $(element);
      const link = block.find("a").attr("href");
      const title = block.find("h3");
      const description = block.find("div[style='-webkit-line-clamp:2']").text();

      // A cheerio selection is always truthy, so test for an actual <h3> match.
      if (!link || title.length === 0 || !description || seen_links.has(link)) {
        return;
      }

      seen_links.add(link);
      results.push(advanced ? new SearchResult(link, title.text(), description) : link);
      added += 1;
    });

    // A page that adds nothing new means further requests cannot help;
    // stopping here also prevents requesting the same page forever.
    if (added === 0) {
      break;
    }
    start += added;

    // Pause only when another request is actually going to follow.
    if (start < num_results && sleep_interval > 0) {
      await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000));
    }
  }

  return {results: results};
}
|