0

Merge pull request #83 from mendableai/website-specific-params

Specific website params support
This commit is contained in:
Nicolas 2024-04-28 12:44:55 -07:00 committed by GitHub
commit 23e3f88070
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 58 additions and 9 deletions

View File

@ -5,9 +5,28 @@ import dotenv from "dotenv";
import { Document, PageOptions } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params";
dotenv.config();
/**
 * Builds the options object handed to the ScrapingBee client's `get` call.
 *
 * The defaults carry the target URL, the caller's `timeout` / `wait_browser`
 * values and a marker header. When `urlSpecificParams` has an entry for the
 * URL's hostname, that entry is layered on top of the defaults.
 *
 * `params` is merged key by key so that site-specific values (for example a
 * different `wait_browser`) win while options the site entry does not mention,
 * notably the caller's `timeout`, are kept. Every other top-level key of the
 * site entry (`headers`, `cookies`, ...) replaces the default of the same name
 * wholesale.
 *
 * @param url - Absolute URL of the page to scrape.
 * @param wait_browser - Browser wait condition used when the site entry does
 *   not specify its own.
 * @param timeout - Request timeout used when the site entry does not specify
 *   its own.
 * @returns The request options for the given URL.
 * @throws TypeError if `url` cannot be parsed by the `URL` constructor.
 */
export async function generateRequestParams(
  url: string,
  wait_browser: string = "domcontentloaded",
  timeout: number = 15000
): Promise<any> {
  const defaultParams = {
    url: url,
    params: { timeout: timeout, wait_browser: wait_browser },
    headers: { "ScrapingService-Request": "TRUE" },
  };

  // Site-specific entries are keyed by bare hostname (no scheme, no path).
  const urlKey = new URL(url).hostname;
  if (!Object.prototype.hasOwnProperty.call(urlSpecificParams, urlKey)) {
    return defaultParams;
  }

  const siteParams = urlSpecificParams[urlKey];
  return {
    ...defaultParams,
    ...siteParams,
    // A plain spread would swap out the whole `params` object and silently
    // discard the caller's timeout; merge it one level deep instead.
    params: { ...defaultParams.params, ...siteParams.params },
  };
}
export async function scrapWithCustomFirecrawl(
url: string,
options?: any
@ -28,11 +47,13 @@ export async function scrapWithScrapingBee(
): Promise<string> {
try {
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
const response = await client.get({
url: url,
params: { timeout: timeout, wait_browser: wait_browser },
headers: { "ScrapingService-Request": "TRUE" },
});
const clientParams = await generateRequestParams(
url,
wait_browser,
timeout
);
const response = await client.get(clientParams);
if (response.status !== 200 && response.status !== 404) {
console.error(
@ -107,11 +128,15 @@ export async function scrapSingleUrl(
let text = "";
switch (method) {
case "firecrawl-scraper":
text = await scrapWithCustomFirecrawl(url,);
text = await scrapWithCustomFirecrawl(url);
break;
case "scrapingBee":
if (process.env.SCRAPING_BEE_API_KEY) {
text = await scrapWithScrapingBee(url,"domcontentloaded", pageOptions.fallback === false? 7000 : 15000);
text = await scrapWithScrapingBee(
url,
"domcontentloaded",
pageOptions.fallback === false ? 7000 : 15000
);
}
break;
case "playwright":
@ -141,7 +166,7 @@ export async function scrapSingleUrl(
break;
}
let cleanedHtml = removeUnwantedElements(text, pageOptions);
return [await parseMarkdown(cleanedHtml), text];
};
@ -155,7 +180,7 @@ export async function scrapSingleUrl(
let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
// Basically means that it is using /search endpoint
if(pageOptions.fallback === false){
if (pageOptions.fallback === false) {
const soup = cheerio.load(html);
const metadata = extractMetadata(soup, urlToScrap);
return {

View File

@ -0,0 +1,24 @@
// Scraper options applied to platform.openai.com requests.
const openAiPlatformScrapeOptions = {
  wait_browser: "networkidle2",
  block_resources: false,
};

// Browser-like request headers sent for platform.openai.com.
const openAiPlatformHeaders = {
  "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
  "sec-fetch-site": "same-origin",
  "sec-fetch-mode": "cors",
  "sec-fetch-dest": "empty",
  referer: "https://www.google.com/",
  "accept-language": "en-US,en;q=0.9",
  "accept-encoding": "gzip, deflate, br",
  accept:
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
};

// NOTE(review): hard-coded cookie value; looks like a captured session token
// that presumably expires — confirm it is still needed / still valid.
const openAiPlatformCookies = {
  __cf_bm:
    "mC1On8P2GWT3A5UeSYH6z_MP94xcTAdZ5jfNi9IT2U0-1714327136-1.0.1.1-ILAP5pSX_Oo9PPo2iHEYCYX.p9a0yRBNLr58GHyrzYNDJ537xYpG50MXxUYVdfrD.h3FV5O7oMlRKGA0scbxaQ",
};

/**
 * Per-website request overrides, keyed by hostname.
 *
 * Each entry may supply `params`, `headers` and `cookies` to use for requests
 * to that host in place of the generic defaults.
 */
export const urlSpecificParams = {
  "platform.openai.com": {
    params: openAiPlatformScrapeOptions,
    headers: openAiPlatformHeaders,
    cookies: openAiPlatformCookies,
  },
};