From d3ab2ea9260017322c4527545abadfb041f8420c Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Thu, 25 Apr 2024 10:51:01 -0300
Subject: [PATCH 1/5] [Feat] Implemented retry attempts to handle 502 errors

---
 apps/python-sdk/firecrawl/firecrawl.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index f1f5e6e..4fc78cf 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -1,5 +1,6 @@
 import os
 import requests
+import time
 
 class FirecrawlApp:
     def __init__(self, api_key=None):
@@ -62,11 +63,23 @@ class FirecrawlApp:
             'Authorization': f'Bearer {self.api_key}'
         }
 
-    def _post_request(self, url, data, headers):
-        return requests.post(url, headers=headers, json=data)
+    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+        for attempt in range(retries):
+            response = requests.post(url, headers=headers, json=data)
+            if response.status_code == 502:
+                time.sleep(backoff_factor * (2 ** attempt))
+            else:
+                return response
+        return response
 
-    def _get_request(self, url, headers):
-        return requests.get(url, headers=headers)
+    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+        for attempt in range(retries):
+            response = requests.get(url, headers=headers)
+            if response.status_code == 502:
+                time.sleep(backoff_factor * (2 ** attempt))
+            else:
+                return response
+        return response
 
     def _monitor_job_status(self, job_id, headers, timeout):
         import time
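
The retry loop above re-issues a request only on HTTP 502 (bad gateway); any other status, success or failure, is returned to the caller immediately. Isolated from the SDK, the behavior looks like this (a standalone sketch mirroring the patched `_post_request`, not part of the library itself):

```python
import time

import requests


def post_with_retry(url, data, headers, retries=3, backoff_factor=0.5):
    # Only 502s are retried; any other status is returned on the spot.
    for attempt in range(retries):
        response = requests.post(url, headers=headers, json=data)
        if response.status_code == 502:
            # Exponential backoff: 0.5s, 1s, 2s with the default factor.
            time.sleep(backoff_factor * (2 ** attempt))
        else:
            return response
    # Every attempt came back 502; return the last response rather than
    # raising, so callers can inspect the status themselves.
    return response
```

Note that the loop also sleeps once after the final 502 before giving up, so with the defaults the worst case adds roughly 3.5 seconds of waiting before the last response is returned.
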
From 8e44696c4d47edf282dff27f401f9b6ba5610897 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Sun, 28 Apr 2024 11:34:25 -0700
Subject: [PATCH 2/5] Nick:

---
 apps/api/src/scraper/WebScraper/single_url.ts | 43 +++++++++++++++----
 .../WebScraper/utils/custom/website_params.ts | 24 +++++++++++
 2 files changed, 58 insertions(+), 9 deletions(-)
 create mode 100644 apps/api/src/scraper/WebScraper/utils/custom/website_params.ts

diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 6ab3003..262a90c 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -5,9 +5,28 @@ import dotenv from "dotenv";
 import { Document, PageOptions } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { excludeNonMainTags } from "./utils/excludeTags";
+import { urlSpecificParams } from "./utils/custom/website_params";
 
 dotenv.config();
 
+export async function generateRequestParams(
+  url: string,
+  wait_browser: string = "domcontentloaded",
+  timeout: number = 15000
+): Promise<any> {
+  const defaultParams = {
+    url: url,
+    params: { timeout: timeout, wait_browser: wait_browser },
+    headers: { "ScrapingService-Request": "TRUE" },
+  };
+
+  const urlKey = new URL(url).hostname;
+  if (urlSpecificParams.hasOwnProperty(urlKey)) {
+    return { ...defaultParams, ...urlSpecificParams[urlKey] };
+  } else {
+    return defaultParams;
+  }
+}
 export async function scrapWithCustomFirecrawl(
   url: string,
   options?: any
@@ -28,11 +47,13 @@ export async function scrapWithScrapingBee(
 ): Promise<string> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
-    const response = await client.get({
-      url: url,
-      params: { timeout: timeout, wait_browser: wait_browser },
-      headers: { "ScrapingService-Request": "TRUE" },
-    });
+    const clientParams = await generateRequestParams(
+      url,
+      wait_browser,
+      timeout
+    );
+
+    const response = await client.get(clientParams);
 
     if (response.status !== 200 && response.status !== 404) {
       console.error(
@@ -107,11 +128,15 @@ export async function scrapSingleUrl(
     let text = "";
     switch (method) {
       case "firecrawl-scraper":
-        text = await scrapWithCustomFirecrawl(url,);
+        text = await scrapWithCustomFirecrawl(url);
         break;
       case "scrapingBee":
         if (process.env.SCRAPING_BEE_API_KEY) {
-          text = await scrapWithScrapingBee(url,"domcontentloaded", pageOptions.fallback === false? 7000 : 15000);
+          text = await scrapWithScrapingBee(
+            url,
+            "domcontentloaded",
+            pageOptions.fallback === false ? 7000 : 15000
+          );
         }
         break;
       case "playwright":
@@ -141,7 +166,7 @@ export async function scrapSingleUrl(
         break;
     }
     let cleanedHtml = removeUnwantedElements(text, pageOptions);
-
+
     return [await parseMarkdown(cleanedHtml), text];
   };
 
@@ -155,7 +180,7 @@ export async function scrapSingleUrl(
 
   let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
   // Basically means that it is using /search endpoint
-  if(pageOptions.fallback === false){
+  if (pageOptions.fallback === false) {
     const soup = cheerio.load(html);
     const metadata = extractMetadata(soup, urlToScrap);
     return {
diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
new file mode 100644
index 0000000..164b074
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
@@ -0,0 +1,24 @@
+export const urlSpecificParams = {
+  "platform.openai.com": {
+    params: {
+      wait_browser: "networkidle2",
+      block_resources: false,
+    },
+    headers: {
+      "User-Agent":
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+      "sec-fetch-site": "same-origin",
+      "sec-fetch-mode": "cors",
+      "sec-fetch-dest": "empty",
+      referer: "https://www.google.com/",
+      "accept-language": "en-US,en;q=0.9",
+      "accept-encoding": "gzip, deflate, br",
+      accept:
+        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    },
+    cookies: {
+      __cf_bm:
+        "mC1On8P2GWT3A5UeSYH6z_MP94xcTAdZ5jfNi9IT2U0-1714327136-1.0.1.1-ILAP5pSX_Oo9PPo2iHEYCYX.p9a0yRBNLr58GHyrzYNDJ537xYpG50MXxUYVdfrD.h3FV5O7oMlRKGA0scbxaQ",
+    },
+  },
+};
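
One detail of `generateRequestParams` worth keeping in mind: JavaScript object spread merges shallowly, so a per-site entry in `urlSpecificParams` that defines its own `params` replaces the default `params` object wholesale (dropping the default `timeout`, for instance) rather than merging into it. The same rule, illustrated with Python dicts (values are hypothetical, mirroring the shapes above):

```python
default_params = {
    "url": "https://platform.openai.com/docs",  # hypothetical URL
    "params": {"timeout": 15000, "wait_browser": "domcontentloaded"},
    "headers": {"ScrapingService-Request": "TRUE"},
}
site_override = {
    "params": {"wait_browser": "networkidle2", "block_resources": False},
}

# Dict unpacking is the Python analogue of
# { ...defaultParams, ...urlSpecificParams[urlKey] }: a shallow merge.
merged = {**default_params, **site_override}

print(merged["params"])
# {'wait_browser': 'networkidle2', 'block_resources': False}
# The default "timeout" key is gone: the override replaced "params" wholesale.
```
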
"en-US,en;q=0.9", + "accept-encoding": "gzip, deflate, br", + accept: + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", + }, + } }; From 68838c9e0da8c74f9921e4d89e459275f9d235ce Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 28 Apr 2024 12:44:00 -0700 Subject: [PATCH 4/5] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 262a90c..ff73e95 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -20,10 +20,15 @@ export async function generateRequestParams( headers: { "ScrapingService-Request": "TRUE" }, }; - const urlKey = new URL(url).hostname; - if (urlSpecificParams.hasOwnProperty(urlKey)) { - return { ...defaultParams, ...urlSpecificParams[urlKey] }; - } else { + try { + const urlKey = new URL(url).hostname; + if (urlSpecificParams.hasOwnProperty(urlKey)) { + return { ...defaultParams, ...urlSpecificParams[urlKey] }; + } else { + return defaultParams; + } + } catch (error) { + console.error(`Error generating URL key: ${error}`); return defaultParams; } } From a72d2cc68ec07d553552a16e7847ba1c3433c9b5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 28 Apr 2024 13:06:46 -0700 Subject: [PATCH 5/5] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7749a64..5d0a485 100644 --- a/README.md +++ b/README.md @@ -194,4 +194,4 @@ search_result = app.search(query) We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request. -*It is the sole responsibility of the end users to scrape, search and crawl websites. Users are advised to adhere to the applicable privacy policies and terms of use of the websites prior to initiating any scraping activities. By default, Firecrawl respects the directives specified in the websites' robots.txt files when crawling. By utilizing Firecrawl, you expressly agree to comply with these conditions.* +*It is the sole responsibility of the end users to respect websites' policies when scraping, searching and crawling with Firecrawl. Users are advised to adhere to the applicable privacy policies and terms of use of the websites prior to initiating any scraping activities. By default, Firecrawl respects the directives specified in the websites' robots.txt files when crawling. By utilizing Firecrawl, you expressly agree to comply with these conditions.*