Nick: mvp
This commit is contained in:
parent
c70bc08d73
commit
0146157876
136
apps/api/src/controllers/search.ts
Normal file
136
apps/api/src/controllers/search.ts
Normal file
@ -0,0 +1,136 @@
|
||||
import { Request, Response } from "express";
|
||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
||||
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../types";
|
||||
import { logJob } from "../services/logging/log_job";
|
||||
import { PageOptions } from "../lib/entities";
|
||||
import { search } from "../search/googlesearch";
|
||||
|
||||
export async function searchHelper(
|
||||
req: Request,
|
||||
team_id: string,
|
||||
crawlerOptions: any,
|
||||
pageOptions: PageOptions
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
error?: string;
|
||||
data?: any;
|
||||
returnCode: number;
|
||||
}> {
|
||||
const query = req.body.query;
|
||||
if (!query) {
|
||||
return { success: false, error: "Query is required", returnCode: 400 };
|
||||
}
|
||||
|
||||
const res = await search(query, true, 7);
|
||||
|
||||
let justSearch = pageOptions.fetchPageContent === false;
|
||||
|
||||
if(justSearch){
|
||||
return { success: true, data: res, returnCode: 200 };
|
||||
}
|
||||
|
||||
if (res.results.length === 0) {
|
||||
return { success: true, error: "No search results found", returnCode: 200 };
|
||||
}
|
||||
|
||||
const a = new WebScraperDataProvider();
|
||||
await a.setOptions({
|
||||
mode: "single_urls",
|
||||
urls: res.results.map((r) => r.url),
|
||||
crawlerOptions: {
|
||||
...crawlerOptions,
|
||||
},
|
||||
pageOptions: {...pageOptions, onlyMainContent: pageOptions?.onlyMainContent ?? true, fetchPageContent: pageOptions?.fetchPageContent ?? true, fallback:false},
|
||||
});
|
||||
|
||||
const docs = await a.getDocuments(true);
|
||||
if (docs.length === 0)
|
||||
{
|
||||
return { success: true, error: "No search results found", returnCode: 200 };
|
||||
}
|
||||
|
||||
|
||||
// make sure doc.content is not empty
|
||||
const filteredDocs = docs.filter(
|
||||
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
|
||||
);
|
||||
|
||||
if (filteredDocs.length === 0) {
|
||||
return { success: true, error: "No page found", returnCode: 200 };
|
||||
}
|
||||
|
||||
const { success, credit_usage } = await billTeam(
|
||||
team_id,
|
||||
filteredDocs.length
|
||||
);
|
||||
if (!success) {
|
||||
return {
|
||||
success: false,
|
||||
error:
|
||||
"Failed to bill team. Insufficient credits or subscription not found.",
|
||||
returnCode: 402,
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: filteredDocs,
|
||||
returnCode: 200,
|
||||
};
|
||||
}
|
||||
|
||||
export async function searchController(req: Request, res: Response) {
|
||||
try {
|
||||
// make sure to authenticate user first, Bearer <token>
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Search
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: true, fetchPageContent: true, fallback: false};
|
||||
const origin = req.body.origin ?? "api";
|
||||
|
||||
try {
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||
await checkTeamCredits(team_id, 1);
|
||||
if (!creditsCheckSuccess) {
|
||||
return res.status(402).json({ error: "Insufficient credits" });
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
return res.status(500).json({ error: "Internal server error" });
|
||||
}
|
||||
const startTime = new Date().getTime();
|
||||
const result = await searchHelper(
|
||||
req,
|
||||
team_id,
|
||||
crawlerOptions,
|
||||
pageOptions
|
||||
);
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
logJob({
|
||||
success: result.success,
|
||||
message: result.error,
|
||||
num_docs: 1,
|
||||
docs: [result.data],
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: team_id,
|
||||
mode: "search",
|
||||
url: req.body.url,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
origin: origin,
|
||||
});
|
||||
return res.status(result.returnCode).json(result);
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
@ -11,6 +11,8 @@ export interface Progress {
|
||||
|
||||
/**
 * Per-page scraping options. All fields are optional; callers apply their
 * own defaults (e.g. searchHelper defaults onlyMainContent/fetchPageContent
 * to true and fallback to false).
 */
export type PageOptions = {
  // Strip non-main-content tags from the scraped page.
  onlyMainContent?: boolean;
  // When false, scrapSingleUrl returns the first scrape attempt's result
  // instead of falling back to the playwright scraper.
  fallback?: boolean;
  // When false, search endpoints return raw search results without
  // fetching any page content ("just search" mode).
  fetchPageContent?: boolean;
};
|
||||
export type WebScraperOptions = {
|
||||
urls: string[];
|
||||
|
@ -4,6 +4,7 @@ import { crawlStatusController } from "../../src/controllers/crawl-status";
|
||||
import { scrapeController } from "../../src/controllers/scrape";
|
||||
import { crawlPreviewController } from "../../src/controllers/crawlPreview";
|
||||
import { crawlJobStatusPreviewController } from "../../src/controllers/status";
|
||||
import { searchController } from "../../src/controllers/search";
|
||||
|
||||
export const v0Router = express.Router();
|
||||
|
||||
@ -12,3 +13,7 @@ v0Router.post("/v0/crawl", crawlController);
|
||||
v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController);
|
||||
v0Router.get("/v0/crawl/status/:jobId", crawlStatusController);
|
||||
v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController);
|
||||
|
||||
// Search routes
|
||||
v0Router.post("/v0/search", searchController);
|
||||
|
||||
|
@ -4,9 +4,7 @@ import { extractMetadata } from "./utils/metadata";
|
||||
import dotenv from "dotenv";
|
||||
import { Document, PageOptions } from "../../lib/entities";
|
||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||
import { parseTablesToMarkdown } from "./utils/parseTable";
|
||||
import { excludeNonMainTags } from "./utils/excludeTags";
|
||||
// import puppeteer from "puppeteer";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
@ -155,6 +153,15 @@ export async function scrapSingleUrl(
|
||||
// }
|
||||
|
||||
let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
|
||||
if(pageOptions.fallback === false){
|
||||
const soup = cheerio.load(html);
|
||||
const metadata = extractMetadata(soup, urlToScrap);
|
||||
return {
|
||||
content: text,
|
||||
markdown: text,
|
||||
metadata: { ...metadata, sourceURL: urlToScrap },
|
||||
} as Document;
|
||||
}
|
||||
if (!text || text.length < 100) {
|
||||
console.log("Falling back to playwright");
|
||||
[text, html] = await attemptScraping(urlToScrap, "playwright");
|
||||
|
@ -1,4 +1,3 @@
|
||||
// import * as cheerio from 'cheerio';
|
||||
import { CheerioAPI } from "cheerio";
|
||||
interface Metadata {
|
||||
title?: string;
|
||||
@ -8,6 +7,14 @@ interface Metadata {
|
||||
robots?: string;
|
||||
ogTitle?: string;
|
||||
ogDescription?: string;
|
||||
ogUrl?: string;
|
||||
ogImage?: string;
|
||||
ogAudio?: string;
|
||||
ogDeterminer?: string;
|
||||
ogLocale?: string;
|
||||
ogLocaleAlternate?: string[];
|
||||
ogSiteName?: string;
|
||||
ogVideo?: string;
|
||||
dctermsCreated?: string;
|
||||
dcDateCreated?: string;
|
||||
dcDate?: string;
|
||||
@ -17,7 +24,6 @@ interface Metadata {
|
||||
dctermsSubject?: string;
|
||||
dcSubject?: string;
|
||||
dcDescription?: string;
|
||||
ogImage?: string;
|
||||
dctermsKeywords?: string;
|
||||
modifiedTime?: string;
|
||||
publishedTime?: string;
|
||||
@ -33,6 +39,14 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
||||
let robots: string | null = null;
|
||||
let ogTitle: string | null = null;
|
||||
let ogDescription: string | null = null;
|
||||
let ogUrl: string | null = null;
|
||||
let ogImage: string | null = null;
|
||||
let ogAudio: string | null = null;
|
||||
let ogDeterminer: string | null = null;
|
||||
let ogLocale: string | null = null;
|
||||
let ogLocaleAlternate: string[] | null = null;
|
||||
let ogSiteName: string | null = null;
|
||||
let ogVideo: string | null = null;
|
||||
let dctermsCreated: string | null = null;
|
||||
let dcDateCreated: string | null = null;
|
||||
let dcDate: string | null = null;
|
||||
@ -42,7 +56,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
||||
let dctermsSubject: string | null = null;
|
||||
let dcSubject: string | null = null;
|
||||
let dcDescription: string | null = null;
|
||||
let ogImage: string | null = null;
|
||||
let dctermsKeywords: string | null = null;
|
||||
let modifiedTime: string | null = null;
|
||||
let publishedTime: string | null = null;
|
||||
@ -62,11 +75,18 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
||||
robots = soup('meta[name="robots"]').attr("content") || null;
|
||||
ogTitle = soup('meta[property="og:title"]').attr("content") || null;
|
||||
ogDescription = soup('meta[property="og:description"]').attr("content") || null;
|
||||
ogUrl = soup('meta[property="og:url"]').attr("content") || null;
|
||||
ogImage = soup('meta[property="og:image"]').attr("content") || null;
|
||||
ogAudio = soup('meta[property="og:audio"]').attr("content") || null;
|
||||
ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || null;
|
||||
ogLocale = soup('meta[property="og:locale"]').attr("content") || null;
|
||||
ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || null;
|
||||
ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null;
|
||||
ogVideo = soup('meta[property="og:video"]').attr("content") || null;
|
||||
articleSection = soup('meta[name="article:section"]').attr("content") || null;
|
||||
articleTag = soup('meta[name="article:tag"]').attr("content") || null;
|
||||
publishedTime = soup('meta[property="article:published_time"]').attr("content") || null;
|
||||
modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || null;
|
||||
ogImage = soup('meta[property="og:image"]').attr("content") || null;
|
||||
dctermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || null;
|
||||
dcDescription = soup('meta[name="dc.description"]').attr("content") || null;
|
||||
dcSubject = soup('meta[name="dc.subject"]').attr("content") || null;
|
||||
@ -90,6 +110,14 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
||||
...(robots ? { robots } : {}),
|
||||
...(ogTitle ? { ogTitle } : {}),
|
||||
...(ogDescription ? { ogDescription } : {}),
|
||||
...(ogUrl ? { ogUrl } : {}),
|
||||
...(ogImage ? { ogImage } : {}),
|
||||
...(ogAudio ? { ogAudio } : {}),
|
||||
...(ogDeterminer ? { ogDeterminer } : {}),
|
||||
...(ogLocale ? { ogLocale } : {}),
|
||||
...(ogLocaleAlternate ? { ogLocaleAlternate } : {}),
|
||||
...(ogSiteName ? { ogSiteName } : {}),
|
||||
...(ogVideo ? { ogVideo } : {}),
|
||||
...(dctermsCreated ? { dctermsCreated } : {}),
|
||||
...(dcDateCreated ? { dcDateCreated } : {}),
|
||||
...(dcDate ? { dcDate } : {}),
|
||||
@ -99,7 +127,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
||||
...(dctermsSubject ? { dctermsSubject } : {}),
|
||||
...(dcSubject ? { dcSubject } : {}),
|
||||
...(dcDescription ? { dcDescription } : {}),
|
||||
...(ogImage ? { ogImage } : {}),
|
||||
...(dctermsKeywords ? { dctermsKeywords } : {}),
|
||||
...(modifiedTime ? { modifiedTime } : {}),
|
||||
...(publishedTime ? { publishedTime } : {}),
|
||||
|
134
apps/api/src/search/googlesearch.ts
Normal file
134
apps/api/src/search/googlesearch.ts
Normal file
@ -0,0 +1,134 @@
|
||||
import axios from 'axios';
|
||||
import * as cheerio from 'cheerio';
|
||||
import * as querystring from 'querystring';
|
||||
import { ScrapingBeeClient } from 'scrapingbee';
|
||||
|
||||
const _useragent_list = [
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0'
|
||||
];
|
||||
|
||||
function get_useragent(): string {
|
||||
return _useragent_list[Math.floor(Math.random() * _useragent_list.length)];
|
||||
}
|
||||
|
||||
async function _req(term: string, results: number, lang: string, start: number, proxies: any, timeout: number) {
|
||||
const resp = await axios.get("https://www.google.com/search", {
|
||||
headers: {
|
||||
"User-Agent": get_useragent()
|
||||
},
|
||||
params: {
|
||||
"q": term,
|
||||
"num": results + 2, // Prevents multiple requests
|
||||
"hl": lang,
|
||||
},
|
||||
proxy: proxies,
|
||||
timeout: timeout,
|
||||
});
|
||||
return resp;
|
||||
}
|
||||
|
||||
class SearchResult {
|
||||
url: string;
|
||||
title: string;
|
||||
description: string;
|
||||
|
||||
constructor(url: string, title: string, description: string) {
|
||||
this.url = url;
|
||||
this.title = title;
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
toString(): string {
|
||||
return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Scrapes Google Search result pages for `term` and collects organic hits.
 *
 * @param term           raw query string (escaped here before use)
 * @param advanced       when true, return SearchResult objects; otherwise bare URL strings
 * @param num_results    stop once this many hits have been collected
 * @param lang           UI language (hl parameter)
 * @param proxy          optional proxy URL; scheme decides http vs https proxy config
 * @param sleep_interval seconds to wait between page requests
 * @param timeout        per-request timeout in milliseconds
 * @returns `{ results }` — array of SearchResult or string URLs (empty on no hits)
 *
 * NOTE(review): `start` is passed to _req but _req does not include it in the
 * request params, so each loop iteration refetches the same first page — confirm.
 * NOTE(review): results can slightly exceed num_results because the per-page
 * .each() loop does not stop once the quota is reached — confirm intended.
 */
export async function search(term: string, advanced = false, num_results = 7, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000) {
  const escaped_term = querystring.escape(term);

  // Build an axios proxy config keyed by the proxy URL's scheme.
  let proxies = null;
  if (proxy) {
    if (proxy.startsWith("https")) {
      proxies = {"https": proxy};
    } else {
      proxies = {"http": proxy};
    }
  }

  // Disabled experiment: extract Google's knowledge-graph panel via ScrapingBee.
  // const response = await _req_scraping_bee(escaped_term, num_results, lang);
  // const $ = cheerio.load(response);

  // const knowledgeGraphElement = $("div.kno-rdesc");
  // console.log(knowledgeGraphElement);
  // console.log(knowledgeGraphElement.html());

  // let knowledgeGraph = null;
  // if (knowledgeGraphElement.length > 0) {
  //   console.log("Knowledge Graph found");
  //   const title = knowledgeGraphElement.find("h2").text();
  //   const type = knowledgeGraphElement.find("div[data-attrid='subtitle']").text();
  //   const website = knowledgeGraphElement.find("a[data-ved]").attr("href");
  //   const imageUrl = knowledgeGraphElement.find("g-img img").attr("src");
  //   const description = knowledgeGraphElement.find("div[data-attrid='description'] span").text();
  //   const descriptionSource = knowledgeGraphElement.find("div[data-attrid='description'] a").text();
  //   const descriptionLink = knowledgeGraphElement.find("div[data-attrid='description'] a").attr("href");
  //   const attributes = {};
  //   knowledgeGraphElement.find("div[data-attrid='kc:/common:sideways']").each((index, element) => {
  //     const attributeKey = $(element).find("span[data-attrid]").text();
  //     const attributeValue = $(element).find("span[data-log-string]").text();
  //     attributes[attributeKey] = attributeValue;
  //   });
  //   knowledgeGraph = {
  //     "title": title,
  //     "type": type,
  //     "website": website,
  //     "imageUrl": imageUrl,
  //     "description": description,
  //     "descriptionSource": descriptionSource,
  //     "descriptionLink": descriptionLink,
  //     "attributes": attributes
  //   };
  // }

  // `start` doubles as the pagination offset AND the count of accepted hits:
  // it is incremented once per valid result inside the .each() callback below.
  let start = 0;
  let results = [];
  while (start < num_results) {
    const resp = await _req(escaped_term, num_results - start, lang, start, proxies, timeout);
    const $ = cheerio.load(resp.data);
    const result_block = $("div.g");
    // No result blocks on this page: bump the offset so the loop cannot spin
    // forever on an empty page, then still run the (empty) .each below.
    if (result_block.length === 0) {
      start += 1;
    }
    result_block.each((index, element) => {
      const linkElement = $(element).find("a");
      const link = linkElement && linkElement.attr("href") ? linkElement.attr("href") : null;
      const title = $(element).find("h3");
      // NOTE(review): ogImage and answerBox are computed but never used — dead?
      const ogImage = $(element).find("img").eq(1).attr("src");
      const description_box = $(element).find("div[style='-webkit-line-clamp:2']");
      const answerBox = $(element).find(".mod").text();
      if (description_box) {
        const description = description_box.text();
        // Only accept entries that have all three of link, title, and snippet.
        if (link && title && description) {
          start += 1;
          if (advanced) {
            results.push(new SearchResult(link, title.text(), description));
          } else {
            results.push(link);
          }
        }
      }
    });
    // Politeness delay between page fetches (no-op when sleep_interval is 0).
    await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000));

    // Nothing valid found on the very first page: give up with empty results.
    if (start === 0) {
      return {results: []};
    }
  }
  return {results: results};
}
|
@ -44,6 +44,8 @@ export enum RateLimiterMode {
|
||||
CrawlStatus = "crawl-status",
|
||||
Scrape = "scrape",
|
||||
Preview = "preview",
|
||||
Search = "search",
|
||||
|
||||
}
|
||||
|
||||
export interface AuthResponse {
|
||||
|
Loading…
Reference in New Issue
Block a user