Merge branch 'main' into llm-extraction
commit b69feab916
@@ -194,4 +194,4 @@ search_result = app.search(query)
 We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.


-*It is the sole responsibility of the end users to scrape, search and crawl websites. Users are advised to adhere to the applicable privacy policies and terms of use of the websites prior to initiating any scraping activities. By default, Firecrawl respects the directives specified in the websites' robots.txt files when crawling. By utilizing Firecrawl, you expressly agree to comply with these conditions.*
+*It is the sole responsibility of the end users to respect websites' policies when scraping, searching and crawling with Firecrawl. Users are advised to adhere to the applicable privacy policies and terms of use of the websites prior to initiating any scraping activities. By default, Firecrawl respects the directives specified in the websites' robots.txt files when crawling. By utilizing Firecrawl, you expressly agree to comply with these conditions.*
@@ -5,9 +5,33 @@ import dotenv from "dotenv";
 import { Document, PageOptions } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { excludeNonMainTags } from "./utils/excludeTags";
+import { urlSpecificParams } from "./utils/custom/website_params";

 dotenv.config();

+export async function generateRequestParams(
+  url: string,
+  wait_browser: string = "domcontentloaded",
+  timeout: number = 15000
+): Promise<any> {
+  const defaultParams = {
+    url: url,
+    params: { timeout: timeout, wait_browser: wait_browser },
+    headers: { "ScrapingService-Request": "TRUE" },
+  };
+
+  try {
+    const urlKey = new URL(url).hostname;
+    if (urlSpecificParams.hasOwnProperty(urlKey)) {
+      return { ...defaultParams, ...urlSpecificParams[urlKey] };
+    } else {
+      return defaultParams;
+    }
+  } catch (error) {
+    console.error(`Error generating URL key: ${error}`);
+    return defaultParams;
+  }
+}
 export async function scrapWithCustomFirecrawl(
   url: string,
   options?: any
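For reference, the merge in generateRequestParams is a shallow object spread: when a host has an entry in urlSpecificParams, that entry's top-level keys replace the defaults wholesale rather than being deep-merged, so defaults such as `timeout` are dropped for overridden hosts. A standalone sketch of that behavior, using a hypothetical "example.com" entry (the real table lives in ./utils/custom/website_params):

```typescript
// Standalone sketch of the shallow-merge behavior in generateRequestParams.
// "example.com" is a hypothetical entry used only for illustration.
const siteParams: Record<string, any> = {
  "example.com": {
    params: { wait_browser: "networkidle2", block_resources: false },
  },
};

function buildRequestParams(url: string, wait_browser = "domcontentloaded", timeout = 15000) {
  const defaultParams = {
    url,
    params: { timeout, wait_browser },
    headers: { "ScrapingService-Request": "TRUE" },
  };
  const urlKey = new URL(url).hostname;
  // Shallow spread: the override's `params` object replaces the default one,
  // so the default timeout/wait_browser are not carried over for overridden hosts.
  return siteParams.hasOwnProperty(urlKey)
    ? { ...defaultParams, ...siteParams[urlKey] }
    : defaultParams;
}

console.log(buildRequestParams("https://example.com/docs").params);
// { wait_browser: "networkidle2", block_resources: false }  (no timeout key)
console.log(buildRequestParams("https://other.example.net/").params);
// { timeout: 15000, wait_browser: "domcontentloaded" }
```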
@@ -28,11 +52,13 @@ export async function scrapWithScrapingBee(
 ): Promise<string> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
-    const response = await client.get({
-      url: url,
-      params: { timeout: timeout, wait_browser: wait_browser },
-      headers: { "ScrapingService-Request": "TRUE" },
-    });
+    const clientParams = await generateRequestParams(
+      url,
+      wait_browser,
+      timeout
+    );

+    const response = await client.get(clientParams);
+
     if (response.status !== 200 && response.status !== 404) {
       console.error(
@@ -107,11 +133,15 @@ export async function scrapSingleUrl(
     let text = "";
     switch (method) {
       case "firecrawl-scraper":
-        text = await scrapWithCustomFirecrawl(url,);
+        text = await scrapWithCustomFirecrawl(url);
         break;
       case "scrapingBee":
         if (process.env.SCRAPING_BEE_API_KEY) {
-          text = await scrapWithScrapingBee(url,"domcontentloaded", pageOptions.fallback === false? 7000 : 15000);
+          text = await scrapWithScrapingBee(
+            url,
+            "domcontentloaded",
+            pageOptions.fallback === false ? 7000 : 15000
+          );
         }
         break;
       case "playwright":
@@ -143,7 +173,7 @@ export async function scrapSingleUrl(

     //* TODO: add an optional to return markdown or structured/extracted content
     let cleanedHtml = removeUnwantedElements(text, pageOptions);

     return [await parseMarkdown(cleanedHtml), text];
   };

@@ -157,7 +187,7 @@ export async function scrapSingleUrl(

   let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
   // Basically means that it is using /search endpoint
-  if(pageOptions.fallback === false){
+  if (pageOptions.fallback === false) {
     const soup = cheerio.load(html);
     const metadata = extractMetadata(soup, urlToScrap);
     return {
@@ -0,0 +1,42 @@
+export const urlSpecificParams = {
+  "platform.openai.com": {
+    params: {
+      wait_browser: "networkidle2",
+      block_resources: false,
+    },
+    headers: {
+      "User-Agent":
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+      "sec-fetch-site": "same-origin",
+      "sec-fetch-mode": "cors",
+      "sec-fetch-dest": "empty",
+      referer: "https://www.google.com/",
+      "accept-language": "en-US,en;q=0.9",
+      "accept-encoding": "gzip, deflate, br",
+      accept:
+        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    },
+    cookies: {
+      __cf_bm:
+        "mC1On8P2GWT3A5UeSYH6z_MP94xcTAdZ5jfNi9IT2U0-1714327136-1.0.1.1-ILAP5pSX_Oo9PPo2iHEYCYX.p9a0yRBNLr58GHyrzYNDJ537xYpG50MXxUYVdfrD.h3FV5O7oMlRKGA0scbxaQ",
+    },
+  },
+  "support.greenpay.me":{
+    params: {
+      wait_browser: "networkidle2",
+      block_resources: false,
+    },
+    headers: {
+      "User-Agent":
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+      "sec-fetch-site": "same-origin",
+      "sec-fetch-mode": "cors",
+      "sec-fetch-dest": "empty",
+      referer: "https://www.google.com/",
+      "accept-language": "en-US,en;q=0.9",
+      "accept-encoding": "gzip, deflate, br",
+      accept:
+        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    },
+  }
+};
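The entries in this new file all follow the same shape: optional params, headers, and cookies blocks keyed by hostname. If the table keeps growing, one possible way to describe that shape is sketched below with a hypothetical host; the interface name and the example entry are illustrative and not part of this commit.

```typescript
// Hypothetical typing sketch for website_params entries; the field names
// mirror the two entries in this commit, everything else is illustrative.
interface SiteOverride {
  params?: { wait_browser?: string; block_resources?: boolean };
  headers?: Record<string, string>;
  cookies?: Record<string, string>;
}

// Adding another host would follow the same pattern as the entries above.
const moreOverrides: Record<string, SiteOverride> = {
  "docs.example.org": { // hypothetical host, for illustration only
    params: { wait_browser: "networkidle2", block_resources: false },
    headers: { "accept-language": "en-US,en;q=0.9" },
  },
};
```

One caveat worth noting: the hard-coded `__cf_bm` value in the platform.openai.com entry appears to be a short-lived Cloudflare bot-management cookie, so it will likely need to be refreshed or obtained dynamically rather than kept in the source.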
@@ -1,5 +1,6 @@
 import os
 import requests
+import time

 class FirecrawlApp:
     def __init__(self, api_key=None):
@@ -88,11 +89,23 @@ class FirecrawlApp:
             'Authorization': f'Bearer {self.api_key}'
         }

-    def _post_request(self, url, data, headers):
-        return requests.post(url, headers=headers, json=data)
+    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+        for attempt in range(retries):
+            response = requests.post(url, headers=headers, json=data)
+            if response.status_code == 502:
+                time.sleep(backoff_factor * (2 ** attempt))
+            else:
+                return response
+        return response

-    def _get_request(self, url, headers):
-        return requests.get(url, headers=headers)
+    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+        for attempt in range(retries):
+            response = requests.get(url, headers=headers)
+            if response.status_code == 502:
+                time.sleep(backoff_factor * (2 ** attempt))
+            else:
+                return response
+        return response

     def _monitor_job_status(self, job_id, headers, timeout):
         import time
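With the defaults above (retries=3, backoff_factor=0.5), a 502 from the API triggers up to three attempts, sleeping 0.5 s, 1 s and 2 s after each failed one, and the last 502 response is returned to the caller unchanged. For comparison only, a sketch of the same pattern in TypeScript; the helper name is hypothetical and nothing like it exists in this commit.

```typescript
// Hypothetical TypeScript counterpart of the retry logic added to the Python
// SDK above (for comparison only). Assumes a runtime with global fetch, e.g. Node 18+.
async function postWithRetry(
  url: string,
  body: unknown,
  headers: Record<string, string>,
  retries = 3,
  backoffFactor = 0.5
): Promise<Response> {
  let response!: Response;
  for (let attempt = 0; attempt < retries; attempt++) {
    response = await fetch(url, {
      method: "POST",
      headers: { "Content-Type": "application/json", ...headers },
      body: JSON.stringify(body),
    });
    if (response.status !== 502) {
      return response; // success or a non-retryable error: hand it back as-is
    }
    // Exponential backoff: 0.5 s, 1 s, 2 s with the default backoffFactor.
    await new Promise((resolve) => setTimeout(resolve, backoffFactor * 2 ** attempt * 1000));
  }
  return response; // still 502 after all retries, mirroring the Python behaviour
}
```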