Nick:
This commit is contained in:
parent
fb08f28edf
commit
8e44696c4d
@@ -5,9 +5,28 @@ import dotenv from "dotenv";
 import { Document, PageOptions } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { excludeNonMainTags } from "./utils/excludeTags";
+import { urlSpecificParams } from "./utils/custom/website_params";
 
 dotenv.config();
 
+export async function generateRequestParams(
+  url: string,
+  wait_browser: string = "domcontentloaded",
+  timeout: number = 15000
+): Promise<any> {
+  const defaultParams = {
+    url: url,
+    params: { timeout: timeout, wait_browser: wait_browser },
+    headers: { "ScrapingService-Request": "TRUE" },
+  };
+
+  const urlKey = new URL(url).hostname;
+  if (urlSpecificParams.hasOwnProperty(urlKey)) {
+    return { ...defaultParams, ...urlSpecificParams[urlKey] };
+  } else {
+    return defaultParams;
+  }
+}
 export async function scrapWithCustomFirecrawl(
   url: string,
   options?: any
@@ -28,11 +47,13 @@ export async function scrapWithScrapingBee(
 ): Promise<string> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
-    const response = await client.get({
-      url: url,
-      params: { timeout: timeout, wait_browser: wait_browser },
-      headers: { "ScrapingService-Request": "TRUE" },
-    });
+    const clientParams = await generateRequestParams(
+      url,
+      wait_browser,
+      timeout
+    );
+
+    const response = await client.get(clientParams);
 
     if (response.status !== 200 && response.status !== 404) {
       console.error(
@@ -107,11 +128,15 @@ export async function scrapSingleUrl(
     let text = "";
     switch (method) {
       case "firecrawl-scraper":
-        text = await scrapWithCustomFirecrawl(url,);
+        text = await scrapWithCustomFirecrawl(url);
         break;
       case "scrapingBee":
         if (process.env.SCRAPING_BEE_API_KEY) {
-          text = await scrapWithScrapingBee(url,"domcontentloaded", pageOptions.fallback === false? 7000 : 15000);
+          text = await scrapWithScrapingBee(
+            url,
+            "domcontentloaded",
+            pageOptions.fallback === false ? 7000 : 15000
+          );
         }
         break;
       case "playwright":
@@ -141,7 +166,7 @@ export async function scrapSingleUrl(
         break;
     }
     let cleanedHtml = removeUnwantedElements(text, pageOptions);
 
     return [await parseMarkdown(cleanedHtml), text];
   };
 
@@ -155,7 +180,7 @@ export async function scrapSingleUrl(
 
   let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
   // Basically means that it is using /search endpoint
-  if(pageOptions.fallback === false){
+  if (pageOptions.fallback === false) {
     const soup = cheerio.load(html);
     const metadata = extractMetadata(soup, urlToScrap);
     return {
@@ -0,0 +1,24 @@
+export const urlSpecificParams = {
+  "platform.openai.com": {
+    params: {
+      wait_browser: "networkidle2",
+      block_resources: false,
+    },
+    headers: {
+      "User-Agent":
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+      "sec-fetch-site": "same-origin",
+      "sec-fetch-mode": "cors",
+      "sec-fetch-dest": "empty",
+      referer: "https://www.google.com/",
+      "accept-language": "en-US,en;q=0.9",
+      "accept-encoding": "gzip, deflate, br",
+      accept:
+        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    },
+    cookies: {
+      __cf_bm:
+        "mC1On8P2GWT3A5UeSYH6z_MP94xcTAdZ5jfNi9IT2U0-1714327136-1.0.1.1-ILAP5pSX_Oo9PPo2iHEYCYX.p9a0yRBNLr58GHyrzYNDJ537xYpG50MXxUYVdfrD.h3FV5O7oMlRKGA0scbxaQ",
+    },
+  },
+};
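
For reference, a minimal usage sketch of the new generateRequestParams helper (illustrative only, not part of the commit; the example URLs are made up). Note that the spread in generateRequestParams is a shallow merge, so a site entry's params and headers objects replace the defaults wholesale rather than merging key-by-key:

// Illustrative sketch, not part of the commit.
// Host with an entry in urlSpecificParams: the entry's params/headers
// replace the defaults entirely (shallow spread), so the default timeout
// inside params is dropped for that host, not merged.
const custom = await generateRequestParams("https://platform.openai.com/docs");
// custom.params -> { wait_browser: "networkidle2", block_resources: false }

// Host without an entry: the defaults are returned unchanged.
const plain = await generateRequestParams("https://example.com");
// plain.params  -> { timeout: 15000, wait_browser: "domcontentloaded" }
// plain.headers -> { "ScrapingService-Request": "TRUE" }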