Merge pull request #56 from mendableai/nsc/mvp-search
[Feat:mvp] Search Endpoint => serp api + firecrawl => 🔥 🔍
This commit is contained in: commit dda77dce05
@@ -22,4 +22,5 @@ BULL_AUTH_KEY= #
 LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
 PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
 LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
+SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
 SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
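For reference, a minimal sketch (not part of the commit): SERPER_API_KEY is optional, and its presence is what switches the search backend, based on the provider selection in apps/api/src/search/index.ts further down. The "provider" variable here is illustrative only.

import dotenv from "dotenv";

dotenv.config();

// If a Serper key is configured, search goes through the Serper API;
// otherwise the API falls back to scraping Google results directly.
const provider = process.env.SERPER_API_KEY ? "serper" : "google";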
@@ -132,6 +132,33 @@ describe("E2E Tests for API Routes with No Authentication", () => {
     });
   });

+  describe("POST /v0/search", () => {
+    it("should not require authorization", async () => {
+      const response = await request(TEST_URL).post("/v0/search");
+      expect(response.statusCode).not.toBe(401);
+    });
+
+    it("should return no error response with an invalid API key", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/search")
+        .set("Authorization", `Bearer invalid-api-key`)
+        .set("Content-Type", "application/json")
+        .send({ query: "test" });
+      expect(response.statusCode).not.toBe(401);
+    });
+
+    it("should return a successful response without a valid API key", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/search")
+        .set("Content-Type", "application/json")
+        .send({ query: "test" });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("success");
+      expect(response.body.success).toBe(true);
+      expect(response.body).toHaveProperty("data");
+    }, 20000);
+  });
+
   describe("GET /v0/crawl/status/:jobId", () => {
     it("should not require authorization", async () => {
       const response = await request(TEST_URL).get("/v0/crawl/status/123");
@@ -168,6 +168,34 @@ const TEST_URL = "http://127.0.0.1:3002";
     });
   });

+  describe("POST /v0/search", () => {
+    it("should require authorization", async () => {
+      const response = await request(TEST_URL).post("/v0/search");
+      expect(response.statusCode).toBe(401);
+    });
+
+    it("should return an error response with an invalid API key", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/search")
+        .set("Authorization", `Bearer invalid-api-key`)
+        .set("Content-Type", "application/json")
+        .send({ query: "test" });
+      expect(response.statusCode).toBe(401);
+    });
+
+    it("should return a successful response with a valid API key", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/search")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ query: "test" });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("success");
+      expect(response.body.success).toBe(true);
+      expect(response.body).toHaveProperty("data");
+    }, 20000);
+  });
+
   describe("GET /v0/crawl/status/:jobId", () => {
     it("should require authorization", async () => {
       const response = await request(TEST_URL).get("/v0/crawl/status/123");
apps/api/src/controllers/search.ts (new file, 156 lines)
@@ -0,0 +1,156 @@
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job";
import { PageOptions, SearchOptions } from "../lib/entities";
import { search } from "../search";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";

export async function searchHelper(
  req: Request,
  team_id: string,
  crawlerOptions: any,
  pageOptions: PageOptions,
  searchOptions: SearchOptions
): Promise<{
  success: boolean;
  error?: string;
  data?: any;
  returnCode: number;
}> {
  const query = req.body.query;
  const advanced = false;
  if (!query) {
    return { success: false, error: "Query is required", returnCode: 400 };
  }

  const tbs = searchOptions.tbs ?? null;
  const filter = searchOptions.filter ?? null;

  let res = await search({ query: query, advanced: advanced, num_results: searchOptions.limit ?? 7, tbs: tbs, filter: filter });

  let justSearch = pageOptions.fetchPageContent === false;

  if (justSearch) {
    return { success: true, data: res, returnCode: 200 };
  }

  res = res.filter((r) => !isUrlBlocked(r));

  if (res.length === 0) {
    return { success: true, error: "No search results found", returnCode: 200 };
  }

  // filter out social media links

  const a = new WebScraperDataProvider();
  await a.setOptions({
    mode: "single_urls",
    urls: res.map((r) => r),
    crawlerOptions: {
      ...crawlerOptions,
    },
    pageOptions: {
      ...pageOptions,
      onlyMainContent: pageOptions?.onlyMainContent ?? true,
      fetchPageContent: pageOptions?.fetchPageContent ?? true,
      fallback: false,
    },
  });

  const docs = await a.getDocuments(true);
  if (docs.length === 0) {
    return { success: true, error: "No search results found", returnCode: 200 };
  }

  // make sure doc.content is not empty
  const filteredDocs = docs.filter(
    (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
  );

  if (filteredDocs.length === 0) {
    return { success: true, error: "No page found", returnCode: 200 };
  }

  const { success, credit_usage } = await billTeam(
    team_id,
    filteredDocs.length
  );
  if (!success) {
    return {
      success: false,
      error:
        "Failed to bill team. Insufficient credits or subscription not found.",
      returnCode: 402,
    };
  }

  return {
    success: true,
    data: filteredDocs,
    returnCode: 200,
  };
}

export async function searchController(req: Request, res: Response) {
  try {
    // make sure to authenticate user first, Bearer <token>
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Search
    );
    if (!success) {
      return res.status(status).json({ error });
    }
    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = req.body.pageOptions ?? {
      onlyMainContent: true,
      fetchPageContent: true,
      fallback: false,
    };
    const origin = req.body.origin ?? "api";

    const searchOptions = req.body.searchOptions ?? { limit: 7 };

    try {
      const { success: creditsCheckSuccess, message: creditsCheckMessage } =
        await checkTeamCredits(team_id, 1);
      if (!creditsCheckSuccess) {
        return res.status(402).json({ error: "Insufficient credits" });
      }
    } catch (error) {
      console.error(error);
      return res.status(500).json({ error: "Internal server error" });
    }
    const startTime = new Date().getTime();
    const result = await searchHelper(
      req,
      team_id,
      crawlerOptions,
      pageOptions,
      searchOptions
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
    logJob({
      success: result.success,
      message: result.error,
      num_docs: 1,
      docs: [result.data],
      time_taken: timeTakenInSeconds,
      team_id: team_id,
      mode: "search",
      url: req.body.url,
      crawlerOptions: crawlerOptions,
      pageOptions: pageOptions,
      origin: origin,
    });
    return res.status(result.returnCode).json(result);
  } catch (error) {
    console.error(error);
    return res.status(500).json({ error: error.message });
  }
}
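For reference, a minimal client sketch (not part of the commit) of calling the new endpoint, mirroring the supertest requests in the e2e suites above. It assumes a runtime with a global fetch (Node 18+), the local test URL http://127.0.0.1:3002, and the TEST_API_KEY environment variable used by the tests; exampleSearch is an illustrative name.

// Sketch: POST /v0/search with a Bearer API key, as exercised by the e2e tests.
async function exampleSearch() {
  const res = await fetch("http://127.0.0.1:3002/v0/search", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
      "Content-Type": "application/json",
    },
    // query is required (400 otherwise); crawlerOptions, pageOptions,
    // searchOptions and origin are optional and default as in searchController.
    body: JSON.stringify({ query: "test", searchOptions: { limit: 7 } }),
  });
  return res.json(); // { success: true, data: [...], returnCode: 200 } on success
}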
@@ -11,7 +11,17 @@ export interface Progress {

 export type PageOptions = {
   onlyMainContent?: boolean;
+  fallback?: boolean;
+  fetchPageContent?: boolean;
+
 };
+
+export type SearchOptions = {
+  limit?: number;
+  tbs?: string;
+  filter?: string;
+};
+
 export type WebScraperOptions = {
   urls: string[];
   mode: "single_urls" | "sitemap" | "crawl";
@@ -30,6 +40,7 @@ export type WebScraperOptions = {

 export class Document {
   id?: string;
+  url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
   createdAt?: Date;
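Worth noting (a sketch, not part of the commit): the new PageOptions.fetchPageContent flag drives a "search only" path in searchHelper above — when it is false, the endpoint returns the provider's result links without scraping each page. The field values below are illustrative.

// Sketch: "search only" request body — searchHelper short-circuits and returns
// raw search results when pageOptions.fetchPageContent is false.
const searchOnlyBody = {
  query: "web scraping api",
  pageOptions: { fetchPageContent: false },
  searchOptions: { limit: 5 }, // tbs and filter may also be set here
};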
@@ -4,6 +4,7 @@ import { crawlStatusController } from "../../src/controllers/crawl-status";
 import { scrapeController } from "../../src/controllers/scrape";
 import { crawlPreviewController } from "../../src/controllers/crawlPreview";
 import { crawlJobStatusPreviewController } from "../../src/controllers/status";
+import { searchController } from "../../src/controllers/search";

 export const v0Router = express.Router();

@@ -12,3 +13,7 @@ v0Router.post("/v0/crawl", crawlController);
 v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController);
 v0Router.get("/v0/crawl/status/:jobId", crawlStatusController);
 v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController);
+
+// Search routes
+v0Router.post("/v0/search", searchController);
+
@@ -4,9 +4,7 @@ import { extractMetadata } from "./utils/metadata";
 import dotenv from "dotenv";
 import { Document, PageOptions } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
-import { parseTablesToMarkdown } from "./utils/parseTable";
 import { excludeNonMainTags } from "./utils/excludeTags";
-// import puppeteer from "puppeteer";

 dotenv.config();

@@ -25,13 +23,14 @@ export async function scrapWithCustomFirecrawl(

 export async function scrapWithScrapingBee(
   url: string,
-  wait_browser: string = "domcontentloaded"
+  wait_browser: string = "domcontentloaded",
+  timeout: number = 15000
 ): Promise<string> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
     const response = await client.get({
       url: url,
-      params: { timeout: 15000, wait_browser: wait_browser },
+      params: { timeout: timeout, wait_browser: wait_browser },
       headers: { "ScrapingService-Request": "TRUE" },
     });

@@ -108,11 +107,11 @@ export async function scrapSingleUrl(
   let text = "";
   switch (method) {
     case "firecrawl-scraper":
-      text = await scrapWithCustomFirecrawl(url);
+      text = await scrapWithCustomFirecrawl(url,);
       break;
     case "scrapingBee":
       if (process.env.SCRAPING_BEE_API_KEY) {
-        text = await scrapWithScrapingBee(url);
+        text = await scrapWithScrapingBee(url, "domcontentloaded", pageOptions.fallback === false ? 7000 : 15000);
       }
       break;
     case "playwright":
@@ -155,6 +154,17 @@ export async function scrapSingleUrl(
   // }

   let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
+  // Basically means that it is using /search endpoint
+  if (pageOptions.fallback === false) {
+    const soup = cheerio.load(html);
+    const metadata = extractMetadata(soup, urlToScrap);
+    return {
+      url: urlToScrap,
+      content: text,
+      markdown: text,
+      metadata: { ...metadata, sourceURL: urlToScrap },
+    } as Document;
+  }
   if (!text || text.length < 100) {
     console.log("Falling back to playwright");
     [text, html] = await attemptScraping(urlToScrap, "playwright");
@@ -1,4 +1,3 @@
-// import * as cheerio from 'cheerio';
 import { CheerioAPI } from "cheerio";
 interface Metadata {
   title?: string;
@@ -8,6 +7,14 @@ interface Metadata {
   robots?: string;
   ogTitle?: string;
   ogDescription?: string;
+  ogUrl?: string;
+  ogImage?: string;
+  ogAudio?: string;
+  ogDeterminer?: string;
+  ogLocale?: string;
+  ogLocaleAlternate?: string[];
+  ogSiteName?: string;
+  ogVideo?: string;
   dctermsCreated?: string;
   dcDateCreated?: string;
   dcDate?: string;
@@ -17,7 +24,6 @@ interface Metadata {
   dctermsSubject?: string;
   dcSubject?: string;
   dcDescription?: string;
-  ogImage?: string;
   dctermsKeywords?: string;
   modifiedTime?: string;
   publishedTime?: string;
@@ -33,6 +39,14 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
   let robots: string | null = null;
   let ogTitle: string | null = null;
   let ogDescription: string | null = null;
+  let ogUrl: string | null = null;
+  let ogImage: string | null = null;
+  let ogAudio: string | null = null;
+  let ogDeterminer: string | null = null;
+  let ogLocale: string | null = null;
+  let ogLocaleAlternate: string[] | null = null;
+  let ogSiteName: string | null = null;
+  let ogVideo: string | null = null;
   let dctermsCreated: string | null = null;
   let dcDateCreated: string | null = null;
   let dcDate: string | null = null;
@@ -42,7 +56,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
   let dctermsSubject: string | null = null;
   let dcSubject: string | null = null;
   let dcDescription: string | null = null;
-  let ogImage: string | null = null;
   let dctermsKeywords: string | null = null;
   let modifiedTime: string | null = null;
   let publishedTime: string | null = null;
@@ -62,11 +75,18 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
     robots = soup('meta[name="robots"]').attr("content") || null;
     ogTitle = soup('meta[property="og:title"]').attr("content") || null;
     ogDescription = soup('meta[property="og:description"]').attr("content") || null;
+    ogUrl = soup('meta[property="og:url"]').attr("content") || null;
+    ogImage = soup('meta[property="og:image"]').attr("content") || null;
+    ogAudio = soup('meta[property="og:audio"]').attr("content") || null;
+    ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || null;
+    ogLocale = soup('meta[property="og:locale"]').attr("content") || null;
+    ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || null;
+    ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null;
+    ogVideo = soup('meta[property="og:video"]').attr("content") || null;
     articleSection = soup('meta[name="article:section"]').attr("content") || null;
     articleTag = soup('meta[name="article:tag"]').attr("content") || null;
     publishedTime = soup('meta[property="article:published_time"]').attr("content") || null;
     modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || null;
-    ogImage = soup('meta[property="og:image"]').attr("content") || null;
     dctermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || null;
     dcDescription = soup('meta[name="dc.description"]').attr("content") || null;
     dcSubject = soup('meta[name="dc.subject"]').attr("content") || null;
@@ -90,6 +110,14 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
     ...(robots ? { robots } : {}),
     ...(ogTitle ? { ogTitle } : {}),
     ...(ogDescription ? { ogDescription } : {}),
+    ...(ogUrl ? { ogUrl } : {}),
+    ...(ogImage ? { ogImage } : {}),
+    ...(ogAudio ? { ogAudio } : {}),
+    ...(ogDeterminer ? { ogDeterminer } : {}),
+    ...(ogLocale ? { ogLocale } : {}),
+    ...(ogLocaleAlternate ? { ogLocaleAlternate } : {}),
+    ...(ogSiteName ? { ogSiteName } : {}),
+    ...(ogVideo ? { ogVideo } : {}),
     ...(dctermsCreated ? { dctermsCreated } : {}),
     ...(dcDateCreated ? { dcDateCreated } : {}),
     ...(dcDate ? { dcDate } : {}),
@@ -99,7 +127,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
     ...(dctermsSubject ? { dctermsSubject } : {}),
     ...(dcSubject ? { dcSubject } : {}),
     ...(dcDescription ? { dcDescription } : {}),
-    ...(ogImage ? { ogImage } : {}),
     ...(dctermsKeywords ? { dctermsKeywords } : {}),
     ...(modifiedTime ? { modifiedTime } : {}),
     ...(publishedTime ? { publishedTime } : {}),
apps/api/src/search/googlesearch.ts (new file, 131 lines)
@@ -0,0 +1,131 @@
import axios from 'axios';
import * as cheerio from 'cheerio';
import * as querystring from 'querystring';

const _useragent_list = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0'
];

function get_useragent(): string {
  return _useragent_list[Math.floor(Math.random() * _useragent_list.length)];
}

async function _req(term: string, results: number, lang: string, start: number, proxies: any, timeout: number, tbs: string = null, filter: string = null) {
  const params = {
    "q": term,
    "num": results, // Number of results to return
    "hl": lang,
    "start": start,
  };
  if (tbs) {
    params["tbs"] = tbs;
  }
  if (filter) {
    params["filter"] = filter;
  }
  try {
    const resp = await axios.get("https://www.google.com/search", {
      headers: {
        "User-Agent": get_useragent()
      },
      params: params,
      proxy: proxies,
      timeout: timeout,
    });
    return resp;
  } catch (error) {
    if (error.response && error.response.status === 429) {
      throw new Error('Google Search: Too many requests, try again later.');
    }
    throw error;
  }
}

class SearchResult {
  url: string;
  title: string;
  description: string;

  constructor(url: string, title: string, description: string) {
    this.url = url;
    this.title = title;
    this.description = description;
  }

  toString(): string {
    return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`;
  }
}

export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", proxy = null, sleep_interval = 0, timeout = 5000): Promise<string[]> {
  const escaped_term = querystring.escape(term);

  let proxies = null;
  if (proxy) {
    if (proxy.startsWith("https")) {
      proxies = { "https": proxy };
    } else {
      proxies = { "http": proxy };
    }
  }

  // TODO: knowledge graph, answer box, etc.

  let start = 0;
  let results: string[] = [];
  let attempts = 0;
  const maxAttempts = 20; // Define a maximum number of attempts to prevent infinite loop
  while (start < num_results && attempts < maxAttempts) {
    try {
      const resp = await _req(escaped_term, num_results - start, lang, start, proxies, timeout, tbs, filter);
      const $ = cheerio.load(resp.data);
      const result_block = $("div.g");
      if (result_block.length === 0) {
        start += 1;
        attempts += 1;
      } else {
        attempts = 0; // Reset attempts if we have results
      }
      result_block.each((index, element) => {
        const linkElement = $(element).find("a");
        const link = linkElement && linkElement.attr("href") ? linkElement.attr("href") : null;
        const title = $(element).find("h3");
        const ogImage = $(element).find("img").eq(1).attr("src");
        const description_box = $(element).find("div[style='-webkit-line-clamp:2']");
        const answerBox = $(element).find(".mod").text();
        if (description_box) {
          const description = description_box.text();
          if (link && title && description) {
            start += 1;
            if (advanced) {
              // results.push(new SearchResult(link, title.text(), description));
            } else {
              results.push(link);
            }
          }
        }
      });
      await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000));
    } catch (error) {
      if (error.message === 'Too many requests') {
        console.warn('Too many requests, breaking the loop');
        break;
      }
      throw error;
    }

    if (start === 0) {
      return results;
    }
  }
  if (attempts >= maxAttempts) {
    console.warn('Max attempts reached, breaking the loop');
  }
  return results
}
apps/api/src/search/index.ts (new file, 45 lines)
@@ -0,0 +1,45 @@
import { google_search } from "./googlesearch";
import { serper_search } from "./serper";

export async function search({
  query,
  advanced = false,
  num_results = 7,
  tbs = null,
  filter = null,
  lang = "en",
  proxy = null,
  sleep_interval = 0,
  timeout = 5000,
}: {
  query: string;
  advanced?: boolean;
  num_results?: number;
  tbs?: string;
  filter?: string;
  lang?: string;
  proxy?: string;
  sleep_interval?: number;
  timeout?: number;
}) {
  try {
    if (process.env.SERPER_API_KEY) {
      return await serper_search(query, num_results);
    }
    return await google_search(
      query,
      advanced,
      num_results,
      tbs,
      filter,
      lang,
      proxy,
      sleep_interval,
      timeout
    );
  } catch (error) {
    console.error("Error in search function: ", error);
    return []
  }
  // if process.env.SERPER_API_KEY is set, use serper
}
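A quick usage sketch (not part of the commit): searchHelper in the controller above calls this wrapper roughly as below, with Serper used when SERPER_API_KEY is set and the Google scraper otherwise. The query string is illustrative.

// Sketch: search() resolves to an array of result URLs from either provider.
const links = await search({
  query: "firecrawl search endpoint",
  num_results: 7, // mapped from searchOptions.limit in searchHelper
});
// The controller then filters links through isUrlBlocked() and scrapes them
// with WebScraperDataProvider.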
apps/api/src/search/serper.ts (new file, 27 lines)
@@ -0,0 +1,27 @@
import axios from "axios";
import dotenv from "dotenv";

dotenv.config();

export async function serper_search(q, num_results): Promise<string[]> {
  let data = JSON.stringify({
    q: q,
    "num": num_results
  });

  let config = {
    method: "POST",
    url: "https://google.serper.dev/search",
    headers: {
      "X-API-KEY": process.env.SERPER_API_KEY,
      "Content-Type": "application/json",
    },
    data: data,
  };
  const response = await axios(config);
  if (response && response.data && Array.isArray(response.data.organic)) {
    return response.data.organic.map((a) => a.link);
  } else {
    return [];
  }
}
@@ -44,6 +44,8 @@ export enum RateLimiterMode {
   CrawlStatus = "crawl-status",
   Scrape = "scrape",
   Preview = "preview",
+  Search = "search",
+
 }

 export interface AuthResponse {