0

Merge pull request #83 from mendableai/website-specific-params

Specific website params support
This commit is contained in:
Nicolas 2024-04-28 12:44:55 -07:00 committed by GitHub
commit 23e3f88070
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 58 additions and 9 deletions

View File

@ -5,9 +5,28 @@ import dotenv from "dotenv";
import { Document, PageOptions } from "../../lib/entities"; import { Document, PageOptions } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown"; import { parseMarkdown } from "../../lib/html-to-markdown";
import { excludeNonMainTags } from "./utils/excludeTags"; import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params";
dotenv.config(); dotenv.config();
/**
 * Builds the request object for scraping `url`.
 *
 * The result always starts from the default request: the URL itself, the
 * `timeout` / `wait_browser` params and the "ScrapingService-Request" header.
 * If `urlSpecificParams` has an entry keyed by the URL's hostname, that entry
 * is spread over the defaults.
 *
 * @param url - URL to scrape. If it cannot be parsed, the defaults are
 *   returned instead of throwing.
 * @param wait_browser - Browser wait condition placed in `params`.
 * @param timeout - Timeout placed in `params` (presumably milliseconds; confirm
 *   against the scraping client).
 * @returns The default request object, optionally overlaid with the
 *   hostname-specific entry.
 */
export async function generateRequestParams(
  url: string,
  wait_browser: string = "domcontentloaded",
  timeout: number = 15000
): Promise<any> {
  const defaultParams = {
    url: url,
    params: { timeout: timeout, wait_browser: wait_browser },
    headers: { "ScrapingService-Request": "TRUE" },
  };

  // `new URL` throws a TypeError on malformed input; a failed hostname lookup
  // should not prevent the request from being built with the defaults.
  let urlKey: string;
  try {
    urlKey = new URL(url).hostname;
  } catch (error) {
    console.error(`Error generating URL key for ${url}: ${error}`);
    return defaultParams;
  }

  // Call hasOwnProperty through Object.prototype so the check does not depend
  // on the lookup table's own prototype chain.
  if (!Object.prototype.hasOwnProperty.call(urlSpecificParams, urlKey)) {
    return defaultParams;
  }

  // Shallow merge: a site entry that defines `params` or `headers` replaces the
  // default object for that key wholesale (including `timeout`).
  // NOTE(review): confirm that dropping the default timeout/header for
  // overridden hosts is intended.
  return { ...defaultParams, ...urlSpecificParams[urlKey] };
}
export async function scrapWithCustomFirecrawl( export async function scrapWithCustomFirecrawl(
url: string, url: string,
options?: any options?: any
@ -28,11 +47,13 @@ export async function scrapWithScrapingBee(
): Promise<string> { ): Promise<string> {
try { try {
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
const response = await client.get({ const clientParams = await generateRequestParams(
url: url, url,
params: { timeout: timeout, wait_browser: wait_browser }, wait_browser,
headers: { "ScrapingService-Request": "TRUE" }, timeout
}); );
const response = await client.get(clientParams);
if (response.status !== 200 && response.status !== 404) { if (response.status !== 200 && response.status !== 404) {
console.error( console.error(
@ -107,11 +128,15 @@ export async function scrapSingleUrl(
let text = ""; let text = "";
switch (method) { switch (method) {
case "firecrawl-scraper": case "firecrawl-scraper":
text = await scrapWithCustomFirecrawl(url,); text = await scrapWithCustomFirecrawl(url);
break; break;
case "scrapingBee": case "scrapingBee":
if (process.env.SCRAPING_BEE_API_KEY) { if (process.env.SCRAPING_BEE_API_KEY) {
text = await scrapWithScrapingBee(url,"domcontentloaded", pageOptions.fallback === false? 7000 : 15000); text = await scrapWithScrapingBee(
url,
"domcontentloaded",
pageOptions.fallback === false ? 7000 : 15000
);
} }
break; break;
case "playwright": case "playwright":
@ -141,7 +166,7 @@ export async function scrapSingleUrl(
break; break;
} }
let cleanedHtml = removeUnwantedElements(text, pageOptions); let cleanedHtml = removeUnwantedElements(text, pageOptions);
return [await parseMarkdown(cleanedHtml), text]; return [await parseMarkdown(cleanedHtml), text];
}; };
@ -155,7 +180,7 @@ export async function scrapSingleUrl(
let [text, html] = await attemptScraping(urlToScrap, "scrapingBee"); let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
// Basically means that it is using /search endpoint // Basically means that it is using /search endpoint
if(pageOptions.fallback === false){ if (pageOptions.fallback === false) {
const soup = cheerio.load(html); const soup = cheerio.load(html);
const metadata = extractMetadata(soup, urlToScrap); const metadata = extractMetadata(soup, urlToScrap);
return { return {

View File

@ -0,0 +1,24 @@
/**
 * Request overrides for a single hostname. Each field, when present, is
 * spread over the default request object built for that host.
 */
interface WebsiteParams {
  /** Scraping parameters for the request (e.g. `wait_browser`). */
  params?: Record<string, unknown>;
  /** HTTP headers to send for this host. */
  headers?: Record<string, string>;
  /** Cookies to send for this host, keyed by cookie name. */
  cookies?: Record<string, string>;
}

/**
 * Per-website request overrides, keyed by hostname (the `hostname` part of the
 * URL being scraped).
 *
 * Typed as a string-keyed record so it can be indexed with an arbitrary
 * hostname without an implicit-any error under strict compiler settings.
 */
export const urlSpecificParams: Record<string, WebsiteParams> = {
  "platform.openai.com": {
    params: {
      wait_browser: "networkidle2",
      block_resources: false,
    },
    // Browser-like request headers.
    headers: {
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      "sec-fetch-site": "same-origin",
      "sec-fetch-mode": "cors",
      "sec-fetch-dest": "empty",
      referer: "https://www.google.com/",
      "accept-language": "en-US,en;q=0.9",
      "accept-encoding": "gzip, deflate, br",
      accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    },
    // NOTE(review): hard-coded cookie value; it looks like a session-scoped
    // token that may expire. Confirm it is still valid or needs refreshing.
    cookies: {
      __cf_bm:
        "mC1On8P2GWT3A5UeSYH6z_MP94xcTAdZ5jfNi9IT2U0-1714327136-1.0.1.1-ILAP5pSX_Oo9PPo2iHEYCYX.p9a0yRBNLr58GHyrzYNDJ537xYpG50MXxUYVdfrD.h3FV5O7oMlRKGA0scbxaQ",
    },
  },
};