From 0146157876b0f59690bde22df8b38a8730ce2742 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 15:28:32 -0700 Subject: [PATCH 01/11] Nick: mvp --- apps/api/src/controllers/search.ts | 136 ++++++++++++++++++ apps/api/src/lib/entities.ts | 2 + apps/api/src/routes/v0.ts | 5 + apps/api/src/scraper/WebScraper/single_url.ts | 11 +- .../src/scraper/WebScraper/utils/metadata.ts | 37 ++++- apps/api/src/search/googlesearch.ts | 134 +++++++++++++++++ apps/api/src/types.ts | 2 + 7 files changed, 320 insertions(+), 7 deletions(-) create mode 100644 apps/api/src/controllers/search.ts create mode 100644 apps/api/src/search/googlesearch.ts diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts new file mode 100644 index 0000000..7cd5209 --- /dev/null +++ b/apps/api/src/controllers/search.ts @@ -0,0 +1,136 @@ +import { Request, Response } from "express"; +import { WebScraperDataProvider } from "../scraper/WebScraper"; +import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; +import { authenticateUser } from "./auth"; +import { RateLimiterMode } from "../types"; +import { logJob } from "../services/logging/log_job"; +import { PageOptions } from "../lib/entities"; +import { search } from "../search/googlesearch"; + +export async function searchHelper( + req: Request, + team_id: string, + crawlerOptions: any, + pageOptions: PageOptions +): Promise<{ + success: boolean; + error?: string; + data?: any; + returnCode: number; +}> { + const query = req.body.query; + if (!query) { + return { success: false, error: "Query is required", returnCode: 400 }; + } + + const res = await search(query, true, 7); + + let justSearch = pageOptions.fetchPageContent === false; + + if(justSearch){ + return { success: true, data: res, returnCode: 200 }; + } + + if (res.results.length === 0) { + return { success: true, error: "No search results found", returnCode: 200 }; + } + + const a = new WebScraperDataProvider(); + await a.setOptions({ + mode: "single_urls", + urls: res.results.map((r) => r.url), + crawlerOptions: { + ...crawlerOptions, + }, + pageOptions: {...pageOptions, onlyMainContent: pageOptions?.onlyMainContent ?? true, fetchPageContent: pageOptions?.fetchPageContent ?? true, fallback:false}, + }); + + const docs = await a.getDocuments(true); + if (docs.length === 0) + { + return { success: true, error: "No search results found", returnCode: 200 }; + } + + + // make sure doc.content is not empty + const filteredDocs = docs.filter( + (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 + ); + + if (filteredDocs.length === 0) { + return { success: true, error: "No page found", returnCode: 200 }; + } + + const { success, credit_usage } = await billTeam( + team_id, + filteredDocs.length + ); + if (!success) { + return { + success: false, + error: + "Failed to bill team. Insufficient credits or subscription not found.", + returnCode: 402, + }; + } + + return { + success: true, + data: filteredDocs, + returnCode: 200, + }; +} + +export async function searchController(req: Request, res: Response) { + try { + // make sure to authenticate user first, Bearer + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.Search + ); + if (!success) { + return res.status(status).json({ error }); + } + const crawlerOptions = req.body.crawlerOptions ?? {}; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: true, fetchPageContent: true, fallback: false}; + const origin = req.body.origin ?? 
"api"; + + try { + const { success: creditsCheckSuccess, message: creditsCheckMessage } = + await checkTeamCredits(team_id, 1); + if (!creditsCheckSuccess) { + return res.status(402).json({ error: "Insufficient credits" }); + } + } catch (error) { + console.error(error); + return res.status(500).json({ error: "Internal server error" }); + } + const startTime = new Date().getTime(); + const result = await searchHelper( + req, + team_id, + crawlerOptions, + pageOptions + ); + const endTime = new Date().getTime(); + const timeTakenInSeconds = (endTime - startTime) / 1000; + logJob({ + success: result.success, + message: result.error, + num_docs: 1, + docs: [result.data], + time_taken: timeTakenInSeconds, + team_id: team_id, + mode: "search", + url: req.body.url, + crawlerOptions: crawlerOptions, + pageOptions: pageOptions, + origin: origin, + }); + return res.status(result.returnCode).json(result); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index e261dd4..07f07e4 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -11,6 +11,8 @@ export interface Progress { export type PageOptions = { onlyMainContent?: boolean; + fallback?: boolean; + fetchPageContent?: boolean; }; export type WebScraperOptions = { urls: string[]; diff --git a/apps/api/src/routes/v0.ts b/apps/api/src/routes/v0.ts index 023282a..f84b974 100644 --- a/apps/api/src/routes/v0.ts +++ b/apps/api/src/routes/v0.ts @@ -4,6 +4,7 @@ import { crawlStatusController } from "../../src/controllers/crawl-status"; import { scrapeController } from "../../src/controllers/scrape"; import { crawlPreviewController } from "../../src/controllers/crawlPreview"; import { crawlJobStatusPreviewController } from "../../src/controllers/status"; +import { searchController } from "../../src/controllers/search"; export const v0Router = express.Router(); @@ -12,3 +13,7 @@ v0Router.post("/v0/crawl", crawlController); v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController); v0Router.get("/v0/crawl/status/:jobId", crawlStatusController); v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController); + +// Search routes +v0Router.post("/v0/search", searchController); + diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 0f3cc38..fcbb688 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -4,9 +4,7 @@ import { extractMetadata } from "./utils/metadata"; import dotenv from "dotenv"; import { Document, PageOptions } from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; -import { parseTablesToMarkdown } from "./utils/parseTable"; import { excludeNonMainTags } from "./utils/excludeTags"; -// import puppeteer from "puppeteer"; dotenv.config(); @@ -155,6 +153,15 @@ export async function scrapSingleUrl( // } let [text, html] = await attemptScraping(urlToScrap, "scrapingBee"); + if(pageOptions.fallback === false){ + const soup = cheerio.load(html); + const metadata = extractMetadata(soup, urlToScrap); + return { + content: text, + markdown: text, + metadata: { ...metadata, sourceURL: urlToScrap }, + } as Document; + } if (!text || text.length < 100) { console.log("Falling back to playwright"); [text, html] = await attemptScraping(urlToScrap, "playwright"); diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts 
b/apps/api/src/scraper/WebScraper/utils/metadata.ts index ef883c3..ddaf1e8 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -1,4 +1,3 @@ -// import * as cheerio from 'cheerio'; import { CheerioAPI } from "cheerio"; interface Metadata { title?: string; @@ -8,6 +7,14 @@ interface Metadata { robots?: string; ogTitle?: string; ogDescription?: string; + ogUrl?: string; + ogImage?: string; + ogAudio?: string; + ogDeterminer?: string; + ogLocale?: string; + ogLocaleAlternate?: string[]; + ogSiteName?: string; + ogVideo?: string; dctermsCreated?: string; dcDateCreated?: string; dcDate?: string; @@ -17,7 +24,6 @@ interface Metadata { dctermsSubject?: string; dcSubject?: string; dcDescription?: string; - ogImage?: string; dctermsKeywords?: string; modifiedTime?: string; publishedTime?: string; @@ -33,6 +39,14 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { let robots: string | null = null; let ogTitle: string | null = null; let ogDescription: string | null = null; + let ogUrl: string | null = null; + let ogImage: string | null = null; + let ogAudio: string | null = null; + let ogDeterminer: string | null = null; + let ogLocale: string | null = null; + let ogLocaleAlternate: string[] | null = null; + let ogSiteName: string | null = null; + let ogVideo: string | null = null; let dctermsCreated: string | null = null; let dcDateCreated: string | null = null; let dcDate: string | null = null; @@ -42,7 +56,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { let dctermsSubject: string | null = null; let dcSubject: string | null = null; let dcDescription: string | null = null; - let ogImage: string | null = null; let dctermsKeywords: string | null = null; let modifiedTime: string | null = null; let publishedTime: string | null = null; @@ -62,11 +75,18 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { robots = soup('meta[name="robots"]').attr("content") || null; ogTitle = soup('meta[property="og:title"]').attr("content") || null; ogDescription = soup('meta[property="og:description"]').attr("content") || null; + ogUrl = soup('meta[property="og:url"]').attr("content") || null; + ogImage = soup('meta[property="og:image"]').attr("content") || null; + ogAudio = soup('meta[property="og:audio"]').attr("content") || null; + ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || null; + ogLocale = soup('meta[property="og:locale"]').attr("content") || null; + ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || null; + ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null; + ogVideo = soup('meta[property="og:video"]').attr("content") || null; articleSection = soup('meta[name="article:section"]').attr("content") || null; articleTag = soup('meta[name="article:tag"]').attr("content") || null; publishedTime = soup('meta[property="article:published_time"]').attr("content") || null; modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || null; - ogImage = soup('meta[property="og:image"]').attr("content") || null; dctermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || null; dcDescription = soup('meta[name="dc.description"]').attr("content") || null; dcSubject = soup('meta[name="dc.subject"]').attr("content") || null; @@ -90,6 +110,14 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { ...(robots ? 
{ robots } : {}), ...(ogTitle ? { ogTitle } : {}), ...(ogDescription ? { ogDescription } : {}), + ...(ogUrl ? { ogUrl } : {}), + ...(ogImage ? { ogImage } : {}), + ...(ogAudio ? { ogAudio } : {}), + ...(ogDeterminer ? { ogDeterminer } : {}), + ...(ogLocale ? { ogLocale } : {}), + ...(ogLocaleAlternate ? { ogLocaleAlternate } : {}), + ...(ogSiteName ? { ogSiteName } : {}), + ...(ogVideo ? { ogVideo } : {}), ...(dctermsCreated ? { dctermsCreated } : {}), ...(dcDateCreated ? { dcDateCreated } : {}), ...(dcDate ? { dcDate } : {}), @@ -99,7 +127,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { ...(dctermsSubject ? { dctermsSubject } : {}), ...(dcSubject ? { dcSubject } : {}), ...(dcDescription ? { dcDescription } : {}), - ...(ogImage ? { ogImage } : {}), ...(dctermsKeywords ? { dctermsKeywords } : {}), ...(modifiedTime ? { modifiedTime } : {}), ...(publishedTime ? { publishedTime } : {}), diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts new file mode 100644 index 0000000..fd3b645 --- /dev/null +++ b/apps/api/src/search/googlesearch.ts @@ -0,0 +1,134 @@ +import axios from 'axios'; +import * as cheerio from 'cheerio'; +import * as querystring from 'querystring'; +import { ScrapingBeeClient } from 'scrapingbee'; + +const _useragent_list = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0' +]; + +function get_useragent(): string { + return _useragent_list[Math.floor(Math.random() * _useragent_list.length)]; +} + +async function _req(term: string, results: number, lang: string, start: number, proxies: any, timeout: number) { + const resp = await axios.get("https://www.google.com/search", { + headers: { + "User-Agent": get_useragent() + }, + params: { + "q": term, + "num": results + 2, // Prevents multiple requests + "hl": lang, + }, + proxy: proxies, + timeout: timeout, + }); + return resp; +} + +class SearchResult { + url: string; + title: string; + description: string; + + constructor(url: string, title: string, description: string) { + this.url = url; + this.title = title; + this.description = description; + } + + toString(): string { + return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`; + } +} + +export async function search(term: string, advanced = false, num_results = 7, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000) { + const escaped_term = querystring.escape(term); + + let proxies = null; + if (proxy) { + if (proxy.startsWith("https")) { + proxies = {"https": proxy}; + } else { + proxies = {"http": proxy}; + } + } + + // const response = await _req_scraping_bee(escaped_term, num_results, lang); + // const $ = cheerio.load(response); + + // const knowledgeGraphElement = $("div.kno-rdesc"); + // console.log(knowledgeGraphElement); + // 
console.log(knowledgeGraphElement.html()); + + // let knowledgeGraph = null; + // if (knowledgeGraphElement.length > 0) { + // console.log("Knowledge Graph found"); + // const title = knowledgeGraphElement.find("h2").text(); + // const type = knowledgeGraphElement.find("div[data-attrid='subtitle']").text(); + // const website = knowledgeGraphElement.find("a[data-ved]").attr("href"); + // const imageUrl = knowledgeGraphElement.find("g-img img").attr("src"); + // const description = knowledgeGraphElement.find("div[data-attrid='description'] span").text(); + // const descriptionSource = knowledgeGraphElement.find("div[data-attrid='description'] a").text(); + // const descriptionLink = knowledgeGraphElement.find("div[data-attrid='description'] a").attr("href"); + // const attributes = {}; + // knowledgeGraphElement.find("div[data-attrid='kc:/common:sideways']").each((index, element) => { + // const attributeKey = $(element).find("span[data-attrid]").text(); + // const attributeValue = $(element).find("span[data-log-string]").text(); + // attributes[attributeKey] = attributeValue; + // }); + // knowledgeGraph = { + // "title": title, + // "type": type, + // "website": website, + // "imageUrl": imageUrl, + // "description": description, + // "descriptionSource": descriptionSource, + // "descriptionLink": descriptionLink, + // "attributes": attributes + // }; + // } + + let start = 0; + let results = []; + while (start < num_results) { + const resp = await _req(escaped_term, num_results - start, lang, start, proxies, timeout); + const $ = cheerio.load(resp.data); + const result_block = $("div.g"); + if (result_block.length === 0) { + start += 1; + } + result_block.each((index, element) => { + const linkElement = $(element).find("a"); + const link = linkElement && linkElement.attr("href") ? 
linkElement.attr("href") : null; + const title = $(element).find("h3"); + const ogImage = $(element).find("img").eq(1).attr("src"); + const description_box = $(element).find("div[style='-webkit-line-clamp:2']"); + const answerBox = $(element).find(".mod").text(); + if (description_box) { + const description = description_box.text(); + if (link && title && description) { + start += 1; + if (advanced) { + results.push(new SearchResult(link, title.text(), description)); + } else { + results.push(link); + } + } + } + }); + await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000)); + + if (start === 0) { + return {results: []}; + } + } + return {results: results}; +} diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 5d778a2..c65140c 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -44,6 +44,8 @@ export enum RateLimiterMode { CrawlStatus = "crawl-status", Scrape = "scrape", Preview = "preview", + Search = "search", + } export interface AuthResponse { From 5e3e2ec966e4c28120f52c037a9df8e93c58ff9b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 15:44:11 -0700 Subject: [PATCH 02/11] Nick: --- apps/api/src/controllers/search.ts | 59 ++++++++++++++++++------------ apps/api/src/lib/entities.ts | 5 +++ 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 7cd5209..bc6659b 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -4,14 +4,15 @@ import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../types"; import { logJob } from "../services/logging/log_job"; -import { PageOptions } from "../lib/entities"; +import { PageOptions, SearchOptions } from "../lib/entities"; import { search } from "../search/googlesearch"; export async function searchHelper( req: Request, team_id: string, crawlerOptions: any, - pageOptions: PageOptions + pageOptions: PageOptions, + searchOptions: SearchOptions ): Promise<{ success: boolean; error?: string; @@ -19,39 +20,44 @@ export async function searchHelper( returnCode: number; }> { const query = req.body.query; + const advanced = false; if (!query) { return { success: false, error: "Query is required", returnCode: 400 }; } - const res = await search(query, true, 7); + const res = await search(query, advanced, searchOptions.limit ?? 7); let justSearch = pageOptions.fetchPageContent === false; - if(justSearch){ + if (justSearch) { return { success: true, data: res, returnCode: 200 }; } if (res.results.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; } + console.log(res.results); const a = new WebScraperDataProvider(); await a.setOptions({ mode: "single_urls", - urls: res.results.map((r) => r.url), + urls: res.results.map((r) => (!advanced ? r : r.url)), crawlerOptions: { ...crawlerOptions, }, - pageOptions: {...pageOptions, onlyMainContent: pageOptions?.onlyMainContent ?? true, fetchPageContent: pageOptions?.fetchPageContent ?? true, fallback:false}, + pageOptions: { + ...pageOptions, + onlyMainContent: pageOptions?.onlyMainContent ?? true, + fetchPageContent: pageOptions?.fetchPageContent ?? 
true, + fallback: false, + }, }); const docs = await a.getDocuments(true); - if (docs.length === 0) - { + if (docs.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; } - // make sure doc.content is not empty const filteredDocs = docs.filter( (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 @@ -61,18 +67,18 @@ export async function searchHelper( return { success: true, error: "No page found", returnCode: 200 }; } - const { success, credit_usage } = await billTeam( - team_id, - filteredDocs.length - ); - if (!success) { - return { - success: false, - error: - "Failed to bill team. Insufficient credits or subscription not found.", - returnCode: 402, - }; - } + const { success, credit_usage } = await billTeam( + team_id, + filteredDocs.length + ); + if (!success) { + return { + success: false, + error: + "Failed to bill team. Insufficient credits or subscription not found.", + returnCode: 402, + }; + } return { success: true, @@ -93,9 +99,15 @@ export async function searchController(req: Request, res: Response) { return res.status(status).json({ error }); } const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: true, fetchPageContent: true, fallback: false}; + const pageOptions = req.body.pageOptions ?? { + onlyMainContent: true, + fetchPageContent: true, + fallback: false, + }; const origin = req.body.origin ?? "api"; + const searchOptions = req.body.searchOptions ?? { limit: 7 }; + try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1); @@ -111,7 +123,8 @@ export async function searchController(req: Request, res: Response) { req, team_id, crawlerOptions, - pageOptions + pageOptions, + searchOptions ); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 07f07e4..b4b5193 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -14,6 +14,11 @@ export type PageOptions = { fallback?: boolean; fetchPageContent?: boolean; }; + +export type SearchOptions = { + limit?: number; +}; + export type WebScraperOptions = { urls: string[]; mode: "single_urls" | "sitemap" | "crawl"; From 495adc9a3f3b056b84abe101bb5633bb783d410d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 15:48:37 -0700 Subject: [PATCH 03/11] Update googlesearch.ts --- apps/api/src/search/googlesearch.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index fd3b645..c63c907 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -24,7 +24,7 @@ async function _req(term: string, results: number, lang: string, start: number, }, params: { "q": term, - "num": results + 2, // Prevents multiple requests + "num": results, // Number of results to return "hl": lang, }, proxy: proxies, From 8cb5d7955a36aec3f87ea91791cbfac51f4b6070 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 15:49:05 -0700 Subject: [PATCH 04/11] Update googlesearch.ts --- apps/api/src/search/googlesearch.ts | 71 +++++++++++++++-------------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index c63c907..c835d08 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -61,40 
+61,7 @@ export async function search(term: string, advanced = false, num_results = 7, la } } - // const response = await _req_scraping_bee(escaped_term, num_results, lang); - // const $ = cheerio.load(response); - - // const knowledgeGraphElement = $("div.kno-rdesc"); - // console.log(knowledgeGraphElement); - // console.log(knowledgeGraphElement.html()); - - // let knowledgeGraph = null; - // if (knowledgeGraphElement.length > 0) { - // console.log("Knowledge Graph found"); - // const title = knowledgeGraphElement.find("h2").text(); - // const type = knowledgeGraphElement.find("div[data-attrid='subtitle']").text(); - // const website = knowledgeGraphElement.find("a[data-ved]").attr("href"); - // const imageUrl = knowledgeGraphElement.find("g-img img").attr("src"); - // const description = knowledgeGraphElement.find("div[data-attrid='description'] span").text(); - // const descriptionSource = knowledgeGraphElement.find("div[data-attrid='description'] a").text(); - // const descriptionLink = knowledgeGraphElement.find("div[data-attrid='description'] a").attr("href"); - // const attributes = {}; - // knowledgeGraphElement.find("div[data-attrid='kc:/common:sideways']").each((index, element) => { - // const attributeKey = $(element).find("span[data-attrid]").text(); - // const attributeValue = $(element).find("span[data-log-string]").text(); - // attributes[attributeKey] = attributeValue; - // }); - // knowledgeGraph = { - // "title": title, - // "type": type, - // "website": website, - // "imageUrl": imageUrl, - // "description": description, - // "descriptionSource": descriptionSource, - // "descriptionLink": descriptionLink, - // "attributes": attributes - // }; - // } + // TODO: knowledge graph, answer box, etc. let start = 0; let results = []; @@ -132,3 +99,39 @@ export async function search(term: string, advanced = false, num_results = 7, la } return {results: results}; } + + +// const response = await _req_scraping_bee(escaped_term, num_results, lang); + // const $ = cheerio.load(response); + + // const knowledgeGraphElement = $("div.kno-rdesc"); + // console.log(knowledgeGraphElement); + // console.log(knowledgeGraphElement.html()); + + // let knowledgeGraph = null; + // if (knowledgeGraphElement.length > 0) { + // console.log("Knowledge Graph found"); + // const title = knowledgeGraphElement.find("h2").text(); + // const type = knowledgeGraphElement.find("div[data-attrid='subtitle']").text(); + // const website = knowledgeGraphElement.find("a[data-ved]").attr("href"); + // const imageUrl = knowledgeGraphElement.find("g-img img").attr("src"); + // const description = knowledgeGraphElement.find("div[data-attrid='description'] span").text(); + // const descriptionSource = knowledgeGraphElement.find("div[data-attrid='description'] a").text(); + // const descriptionLink = knowledgeGraphElement.find("div[data-attrid='description'] a").attr("href"); + // const attributes = {}; + // knowledgeGraphElement.find("div[data-attrid='kc:/common:sideways']").each((index, element) => { + // const attributeKey = $(element).find("span[data-attrid]").text(); + // const attributeValue = $(element).find("span[data-log-string]").text(); + // attributes[attributeKey] = attributeValue; + // }); + // knowledgeGraph = { + // "title": title, + // "type": type, + // "website": website, + // "imageUrl": imageUrl, + // "description": description, + // "descriptionSource": descriptionSource, + // "descriptionLink": descriptionLink, + // "attributes": attributes + // }; + // } \ No newline at end of file From 
41263bb4b6deb17042d64ea34cab72159e1340dc Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 16:45:06 -0700 Subject: [PATCH 05/11] Nick: serper support --- apps/api/.env.example | 3 +- apps/api/src/controllers/search.ts | 12 ++- apps/api/src/lib/entities.ts | 3 + apps/api/src/search/googlesearch.ts | 152 +++++++++++++--------------- apps/api/src/search/index.ts | 45 ++++++++ apps/api/src/search/serper.ts | 27 +++++ 6 files changed, 157 insertions(+), 85 deletions(-) create mode 100644 apps/api/src/search/index.ts create mode 100644 apps/api/src/search/serper.ts diff --git a/apps/api/.env.example b/apps/api/.env.example index 34e24b1..3bd06cd 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -21,4 +21,5 @@ OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) BULL_AUTH_KEY= # LOGTAIL_KEY= # Use if you're configuring basic logging with logtail PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback -LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs \ No newline at end of file +LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs +SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api \ No newline at end of file diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index bc6659b..6a1c7b4 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -5,7 +5,7 @@ import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../types"; import { logJob } from "../services/logging/log_job"; import { PageOptions, SearchOptions } from "../lib/entities"; -import { search } from "../search/googlesearch"; +import { search } from "../search"; export async function searchHelper( req: Request, @@ -25,7 +25,10 @@ export async function searchHelper( return { success: false, error: "Query is required", returnCode: 400 }; } - const res = await search(query, advanced, searchOptions.limit ?? 7); + const tbs = searchOptions.tbs ?? null; + const filter = searchOptions.filter ?? null; + + const res = await search({query: query, advanced: advanced, num_results: searchOptions.limit ?? 7, tbs: tbs, filter: filter}); let justSearch = pageOptions.fetchPageContent === false; @@ -33,15 +36,14 @@ export async function searchHelper( return { success: true, data: res, returnCode: 200 }; } - if (res.results.length === 0) { + if (res.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; } - console.log(res.results); const a = new WebScraperDataProvider(); await a.setOptions({ mode: "single_urls", - urls: res.results.map((r) => (!advanced ? 
r : r.url)), + urls: res.map((r) => r), crawlerOptions: { ...crawlerOptions, }, diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index b4b5193..062212b 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -13,10 +13,13 @@ export type PageOptions = { onlyMainContent?: boolean; fallback?: boolean; fetchPageContent?: boolean; + }; export type SearchOptions = { limit?: number; + tbs?: string; + filter?: string; }; export type WebScraperOptions = { diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index c835d08..53227e6 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -1,7 +1,6 @@ import axios from 'axios'; import * as cheerio from 'cheerio'; import * as querystring from 'querystring'; -import { ScrapingBeeClient } from 'scrapingbee'; const _useragent_list = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', @@ -17,20 +16,35 @@ function get_useragent(): string { return _useragent_list[Math.floor(Math.random() * _useragent_list.length)]; } -async function _req(term: string, results: number, lang: string, start: number, proxies: any, timeout: number) { - const resp = await axios.get("https://www.google.com/search", { - headers: { - "User-Agent": get_useragent() - }, - params: { - "q": term, - "num": results, // Number of results to return - "hl": lang, - }, - proxy: proxies, - timeout: timeout, - }); - return resp; +async function _req(term: string, results: number, lang: string, start: number, proxies: any, timeout: number, tbs: string = null, filter: string = null) { + const params = { + "q": term, + "num": results, // Number of results to return + "hl": lang, + "start": start, + }; + if (tbs) { + params["tbs"] = tbs; + } + if (filter) { + params["filter"] = filter; + } + try { + const resp = await axios.get("https://www.google.com/search", { + headers: { + "User-Agent": get_useragent() + }, + params: params, + proxy: proxies, + timeout: timeout, + }); + return resp; + } catch (error) { + if (error.response && error.response.status === 429) { + throw new Error('Google Search: Too many requests, try again later.'); + } + throw error; + } } class SearchResult { @@ -49,7 +63,7 @@ class SearchResult { } } -export async function search(term: string, advanced = false, num_results = 7, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000) { +export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise { const escaped_term = querystring.escape(term); let proxies = null; @@ -64,74 +78,54 @@ export async function search(term: string, advanced = false, num_results = 7, la // TODO: knowledge graph, answer box, etc. let start = 0; - let results = []; - while (start < num_results) { - const resp = await _req(escaped_term, num_results - start, lang, start, proxies, timeout); - const $ = cheerio.load(resp.data); - const result_block = $("div.g"); - if (result_block.length === 0) { - start += 1; - } - result_block.each((index, element) => { - const linkElement = $(element).find("a"); - const link = linkElement && linkElement.attr("href") ? 
linkElement.attr("href") : null; - const title = $(element).find("h3"); - const ogImage = $(element).find("img").eq(1).attr("src"); - const description_box = $(element).find("div[style='-webkit-line-clamp:2']"); - const answerBox = $(element).find(".mod").text(); - if (description_box) { - const description = description_box.text(); - if (link && title && description) { - start += 1; - if (advanced) { - results.push(new SearchResult(link, title.text(), description)); - } else { - results.push(link); + let results : string[] = []; + let attempts = 0; + const maxAttempts = 20; // Define a maximum number of attempts to prevent infinite loop + while (start < num_results && attempts < maxAttempts) { + try { + const resp = await _req(escaped_term, num_results - start, lang, start, proxies, timeout, tbs, filter); + const $ = cheerio.load(resp.data); + const result_block = $("div.g"); + if (result_block.length === 0) { + start += 1; + attempts += 1; + } else { + attempts = 0; // Reset attempts if we have results + } + result_block.each((index, element) => { + const linkElement = $(element).find("a"); + const link = linkElement && linkElement.attr("href") ? linkElement.attr("href") : null; + const title = $(element).find("h3"); + const ogImage = $(element).find("img").eq(1).attr("src"); + const description_box = $(element).find("div[style='-webkit-line-clamp:2']"); + const answerBox = $(element).find(".mod").text(); + if (description_box) { + const description = description_box.text(); + if (link && title && description) { + start += 1; + if (advanced) { + // results.push(new SearchResult(link, title.text(), description)); + } else { + results.push(link); + } } } + }); + await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000)); + } catch (error) { + if (error.message === 'Too many requests') { + console.warn('Too many requests, breaking the loop'); + break; } - }); - await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000)); + throw error; + } if (start === 0) { - return {results: []}; + return results; } } - return {results: results}; + if (attempts >= maxAttempts) { + console.warn('Max attempts reached, breaking the loop'); + } + return results } - - -// const response = await _req_scraping_bee(escaped_term, num_results, lang); - // const $ = cheerio.load(response); - - // const knowledgeGraphElement = $("div.kno-rdesc"); - // console.log(knowledgeGraphElement); - // console.log(knowledgeGraphElement.html()); - - // let knowledgeGraph = null; - // if (knowledgeGraphElement.length > 0) { - // console.log("Knowledge Graph found"); - // const title = knowledgeGraphElement.find("h2").text(); - // const type = knowledgeGraphElement.find("div[data-attrid='subtitle']").text(); - // const website = knowledgeGraphElement.find("a[data-ved]").attr("href"); - // const imageUrl = knowledgeGraphElement.find("g-img img").attr("src"); - // const description = knowledgeGraphElement.find("div[data-attrid='description'] span").text(); - // const descriptionSource = knowledgeGraphElement.find("div[data-attrid='description'] a").text(); - // const descriptionLink = knowledgeGraphElement.find("div[data-attrid='description'] a").attr("href"); - // const attributes = {}; - // knowledgeGraphElement.find("div[data-attrid='kc:/common:sideways']").each((index, element) => { - // const attributeKey = $(element).find("span[data-attrid]").text(); - // const attributeValue = $(element).find("span[data-log-string]").text(); - // attributes[attributeKey] = attributeValue; - // }); - // 
knowledgeGraph = { - // "title": title, - // "type": type, - // "website": website, - // "imageUrl": imageUrl, - // "description": description, - // "descriptionSource": descriptionSource, - // "descriptionLink": descriptionLink, - // "attributes": attributes - // }; - // } \ No newline at end of file diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts new file mode 100644 index 0000000..0f3a596 --- /dev/null +++ b/apps/api/src/search/index.ts @@ -0,0 +1,45 @@ +import { google_search } from "./googlesearch"; +import { serper_search } from "./serper"; + +export async function search({ + query, + advanced = false, + num_results = 7, + tbs = null, + filter = null, + lang = "en", + proxy = null, + sleep_interval = 0, + timeout = 5000, +}: { + query: string; + advanced?: boolean; + num_results?: number; + tbs?: string; + filter?: string; + lang?: string; + proxy?: string; + sleep_interval?: number; + timeout?: number; +}) { + try { + if (process.env.SERPER_API_KEY) { + return await serper_search(query, num_results); + } + return await google_search( + query, + advanced, + num_results, + tbs, + filter, + lang, + proxy, + sleep_interval, + timeout + ); + } catch (error) { + console.error("Error in search function: ", error); + return [] + } + // if process.env.SERPER_API_KEY is set, use serper +} diff --git a/apps/api/src/search/serper.ts b/apps/api/src/search/serper.ts new file mode 100644 index 0000000..f92f2fc --- /dev/null +++ b/apps/api/src/search/serper.ts @@ -0,0 +1,27 @@ +import axios from "axios"; +import dotenv from "dotenv"; + +dotenv.config(); + +export async function serper_search(q, num_results) : Promise { + let data = JSON.stringify({ + q: q, + "num": num_results + }); + + let config = { + method: "POST", + url: "https://google.serper.dev/search", + headers: { + "X-API-KEY": process.env.SERPER_API_KEY, + "Content-Type": "application/json", + }, + data: data, + }; + const response = await axios(config); + if (response && response.data && Array.isArray(response.data.organic)) { + return response.data.organic.map((a) => a.link); + } else { + return []; + } +} From e6779aff6824282c2cfdeaaa016a0f3512202216 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 16:56:09 -0700 Subject: [PATCH 06/11] Nick: tests --- .../src/__tests__/e2e_noAuth/index.test.ts | 27 ++++++++++++++++++ .../src/__tests__/e2e_withAuth/index.test.ts | 28 +++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index e0aca36..dfe6aeb 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -102,6 +102,33 @@ describe("E2E Tests for API Routes with No Authentication", () => { }); }); + describe("POST /v0/search", () => { + it("should require not authorization", async () => { + const response = await request(TEST_URL).post("/v0/search"); + expect(response.statusCode).not.toBe(401); + }); + + it("should return no error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).not.toBe(401); + }); + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Content-Type", "application/json") + .send({ query: "test" }); + 
expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success"); + expect(response.body.success).toBe(true); + expect(response.body).toHaveProperty("data"); + }); + }); + describe("GET /v0/crawl/status/:jobId", () => { it("should not require authorization", async () => { const response = await request(TEST_URL).get("/v0/crawl/status/123"); diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index ba01a7c..f0887eb 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -133,6 +133,34 @@ const TEST_URL = "http://127.0.0.1:3002"; }); }); + describe("POST /v0/search", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).post("/v0/search"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(401); + }); + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success"); + expect(response.body.success).toBe(true); + expect(response.body).toHaveProperty("data"); + }, 20000); + }); + describe("GET /v0/crawl/status/:jobId", () => { it("should require authorization", async () => { const response = await request(TEST_URL).get("/v0/crawl/status/123"); From 4328a68ec19049caba40ffdb3d442ba915483454 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 16:57:53 -0700 Subject: [PATCH 07/11] Nick: --- apps/api/src/__tests__/e2e_noAuth/index.test.ts | 4 ++-- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index dfe6aeb..37eeb0e 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -117,7 +117,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(response.statusCode).not.toBe(401); }); - it("should return a successful response with a valid API key", async () => { + it("should return a successful response without a valid API key", async () => { const response = await request(TEST_URL) .post("/v0/search") .set("Content-Type", "application/json") @@ -126,7 +126,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(response.body).toHaveProperty("success"); expect(response.body.success).toBe(true); expect(response.body).toHaveProperty("data"); - }); + }, 20000); }); describe("GET /v0/crawl/status/:jobId", () => { diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index f0887eb..59dfde2 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -158,7 +158,7 @@ const TEST_URL = "http://127.0.0.1:3002"; expect(response.body).toHaveProperty("success"); expect(response.body.success).toBe(true); 
expect(response.body).toHaveProperty("data"); - }, 20000); + }, 20000); }); describe("GET /v0/crawl/status/:jobId", () => { From f0695c712307b06bde55e251f799373882b6a7ad Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 17:04:10 -0700 Subject: [PATCH 08/11] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index fcbb688..e110b0e 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -23,13 +23,14 @@ export async function scrapWithCustomFirecrawl( export async function scrapWithScrapingBee( url: string, - wait_browser: string = "domcontentloaded" + wait_browser: string = "domcontentloaded", + timeout: number = 15000 ): Promise { try { const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); const response = await client.get({ url: url, - params: { timeout: 15000, wait_browser: wait_browser }, + params: { timeout: timeout, wait_browser: wait_browser }, headers: { "ScrapingService-Request": "TRUE" }, }); @@ -106,11 +107,11 @@ export async function scrapSingleUrl( let text = ""; switch (method) { case "firecrawl-scraper": - text = await scrapWithCustomFirecrawl(url); + text = await scrapWithCustomFirecrawl(url,); break; case "scrapingBee": if (process.env.SCRAPING_BEE_API_KEY) { - text = await scrapWithScrapingBee(url); + text = await scrapWithScrapingBee(url,"domcontentloaded", pageOptions.fallback === false? 7000 : 15000); } break; case "playwright": From 53cc4c396fea229ac87004e822f2228a090feb5c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 17:05:58 -0700 Subject: [PATCH 09/11] Update search.ts --- apps/api/src/controllers/search.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 6a1c7b4..4c03644 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -6,6 +6,7 @@ import { RateLimiterMode } from "../types"; import { logJob } from "../services/logging/log_job"; import { PageOptions, SearchOptions } from "../lib/entities"; import { search } from "../search"; +import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; export async function searchHelper( req: Request, @@ -28,7 +29,7 @@ export async function searchHelper( const tbs = searchOptions.tbs ?? null; const filter = searchOptions.filter ?? null; - const res = await search({query: query, advanced: advanced, num_results: searchOptions.limit ?? 7, tbs: tbs, filter: filter}); + let res = await search({query: query, advanced: advanced, num_results: searchOptions.limit ?? 
7, tbs: tbs, filter: filter});
 
   let justSearch = pageOptions.fetchPageContent === false;
 
@@ -40,6 +41,9 @@ export async function searchHelper(
     return { success: true, error: "No search results found", returnCode: 200 };
   }
 
+  // filter out social media links
+  res = res.filter((r) => !isUrlBlocked(r));
+
   const a = new WebScraperDataProvider();
   await a.setOptions({
     mode: "single_urls",

From 3abfd6b4c19d9ce14c6a5b8dea47dda16f6383d0 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Tue, 23 Apr 2024 17:06:48 -0700
Subject: [PATCH 10/11] Update search.ts

---
 apps/api/src/controllers/search.ts | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts
index 4c03644..f18f1c5 100644
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@@ -37,12 +37,13 @@ export async function searchHelper(
     return { success: true, data: res, returnCode: 200 };
   }
 
+  res = res.filter((r) => !isUrlBlocked(r));
+
   if (res.length === 0) {
     return { success: true, error: "No search results found", returnCode: 200 };
   }
 
   // filter out social media links
-  res = res.filter((r) => !isUrlBlocked(r));
 
   const a = new WebScraperDataProvider();
   await a.setOptions({
     mode: "single_urls",

From fdb2789eaa302b2f90bed7f1dad6dcc95613cb1f Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Tue, 23 Apr 2024 17:14:34 -0700
Subject: [PATCH 11/11] Nick: added url as return param

---
 apps/api/src/lib/entities.ts                  | 1 +
 apps/api/src/scraper/WebScraper/single_url.ts | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 062212b..fdc1c61 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -40,6 +40,7 @@ export type WebScraperOptions = {
 
 export class Document {
   id?: string;
+  url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
   createdAt?: Date;
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index e110b0e..6ab3003 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -154,10 +154,12 @@ export async function scrapSingleUrl(
   //   }
 
   let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
+  // fallback === false means this scrape was initiated by the /search endpoint
  if(pageOptions.fallback === false){
    const soup = cheerio.load(html);
    const metadata = extractMetadata(soup, urlToScrap);
    return {
+      url: urlToScrap,
      content: text,
      markdown: text,
      metadata: { ...metadata, sourceURL: urlToScrap },
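
Usage note: to exercise the endpoint this series adds end to end, a minimal client call might look like the sketch below. It is illustrative only: the host and port are taken from the e2e tests (http://127.0.0.1:3002), FIRECRAWL_API_KEY is a hypothetical placeholder for whatever bearer token authenticateUser accepts, and Node 18+ is assumed for the global fetch.

// Hypothetical smoke test for the POST /v0/search route added in PATCH 01/11.
// Assumptions: the API is running locally on port 3002 (as in the e2e tests)
// and process.env.FIRECRAWL_API_KEY holds a key accepted by authenticateUser.
async function smokeTestSearch(): Promise<void> {
  const response = await fetch("http://127.0.0.1:3002/v0/search", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      query: "firecrawl",
      // searchOptions.limit caps the number of results; searchHelper defaults it to 7.
      searchOptions: { limit: 5 },
      // fetchPageContent: false takes the justSearch early return and skips
      // scraping the result pages entirely.
      pageOptions: { onlyMainContent: true, fetchPageContent: true },
    }),
  });
  // The controller responds with the searchHelper result object:
  // { success, data, returnCode }, plus error when something failed.
  const body = await response.json();
  console.log(body.success, Array.isArray(body.data) ? body.data.length : body.data);
}

smokeTestSearch();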
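The provider selection added in PATCH 05 (apps/api/src/search/index.ts) can also be exercised directly from code living under apps/api/src. A minimal sketch; the tbs value is an assumption (qdr:w is a standard Google time-based-search parameter for "past week" and is simply passed through to the request params):

import { search } from "./search";

// With SERPER_API_KEY set, search() delegates to serper_search and returns the
// organic result links from google.serper.dev; otherwise it falls back to
// scraping google.com/search with a rotated User-Agent. Errors are logged and
// surface as an empty array rather than being rethrown.
async function listResultLinks(): Promise<void> {
  const links = await search({
    query: "web scraping api",
    num_results: 5,
    tbs: "qdr:w", // assumption: restrict results to the past week
  });
  console.log(links); // string[] of result URLs
}

listResultLinks();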