From be5661a76834c059efce941dff572a1324fec313 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 9 May 2024 17:45:16 -0700 Subject: [PATCH] Nick: a lot better --- apps/api/src/scraper/WebScraper/single_url.ts | 74 +++++++++---------- .../WebScraper/utils/custom/website_params.ts | 40 ++++++++++ 2 files changed, 76 insertions(+), 38 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index a67ce31..75a9d5c 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -21,7 +21,7 @@ export async function generateRequestParams( }; try { - const urlKey = new URL(url).hostname; + const urlKey = new URL(url).hostname.replace(/^www\./, ""); if (urlSpecificParams.hasOwnProperty(urlKey)) { return { ...defaultParams, ...urlSpecificParams[urlKey] }; } else { @@ -57,7 +57,7 @@ export async function scrapWithScrapingBee( wait_browser, timeout ); - + const response = await client.get(clientParams); if (response.status !== 200 && response.status !== 404) { @@ -77,12 +77,15 @@ export async function scrapWithScrapingBee( export async function scrapWithPlaywright(url: string): Promise { try { + const reqParams = await generateRequestParams(url); + const wait_playwright = reqParams["params"]["wait"]; + const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, { method: "POST", headers: { "Content-Type": "application/json", }, - body: JSON.stringify({ url: url }), + body: JSON.stringify({ url: url, wait: wait_playwright }), }); if (!response.ok) { @@ -103,7 +106,7 @@ export async function scrapWithPlaywright(url: string): Promise { export async function scrapSingleUrl( urlToScrap: string, - pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }, + pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false } ): Promise { urlToScrap = urlToScrap.trim(); @@ -169,56 +172,51 @@ export async function scrapSingleUrl( break; } - //* TODO: add an optional to return markdown or structured/extracted content + //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(text, pageOptions); - + return [await parseMarkdown(cleanedHtml), text]; }; - try { - // TODO: comment this out once we're ready to merge firecrawl-scraper into the mono-repo - // let [text, html] = await attemptScraping(urlToScrap, 'firecrawl-scraper'); - // if (!text || text.length < 100) { - // console.log("Falling back to scraping bee load"); - // [text, html] = await attemptScraping(urlToScrap, 'scrapingBeeLoad'); - // } - - let [text, html] = await attemptScraping(urlToScrap, "scrapingBee"); - // Basically means that it is using /search endpoint - if (pageOptions.fallback === false) { - const soup = cheerio.load(html); - const metadata = extractMetadata(soup, urlToScrap); - return { - url: urlToScrap, - content: text, - markdown: text, - html: pageOptions.includeHtml ? html : undefined, - metadata: { ...metadata, sourceURL: urlToScrap }, - } as Document; + let [text, html] = ["", ""]; + let urlKey = urlToScrap; + try { + urlKey = new URL(urlToScrap).hostname.replace(/^www\./, ""); + } catch (error) { + console.error(`Invalid URL key, trying: ${urlToScrap}`); } - if (!text || text.length < 100) { - console.log("Falling back to playwright"); - [text, html] = await attemptScraping(urlToScrap, "playwright"); + const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? ""; + const scrapersInOrder = defaultScraper + ? [ + defaultScraper, + "scrapingBee", + "playwright", + "scrapingBeeLoad", + "fetch", + ] + : ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"]; + + for (const scraper of scrapersInOrder) { + [text, html] = await attemptScraping(urlToScrap, scraper); + if (text && text.length >= 100) break; + console.log(`Falling back to ${scraper}`); } if (!text || text.length < 100) { - console.log("Falling back to scraping bee load"); - [text, html] = await attemptScraping(urlToScrap, "scrapingBeeLoad"); - } - if (!text || text.length < 100) { - console.log("Falling back to fetch"); - [text, html] = await attemptScraping(urlToScrap, "fetch"); + throw new Error(`All scraping methods failed for URL: ${urlToScrap}`); } const soup = cheerio.load(html); const metadata = extractMetadata(soup, urlToScrap); - - return { + const document: Document = { + url: urlToScrap, content: text, markdown: text, html: pageOptions.includeHtml ? html : undefined, metadata: { ...metadata, sourceURL: urlToScrap }, - } as Document; + }; + + return document; } catch (error) { console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`); return { diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts index dd9f20e..069f433 100644 --- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts +++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts @@ -38,5 +38,45 @@ export const urlSpecificParams = { accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", }, + }, + "docs.pdw.co":{ + defaultScraper: "playwright", + params: { + wait_browser: "networkidle2", + block_resources: false, + wait: 5000, + }, + headers: { + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", + "sec-fetch-site": "same-origin", + "sec-fetch-mode": "cors", + "sec-fetch-dest": "empty", + referer: "https://www.google.com/", + "accept-language": "en-US,en;q=0.9", + "accept-encoding": "gzip, deflate, br", + accept: + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", + }, + }, + "ycombinator.com":{ + defaultScraper: "playwright", + params: { + wait_browser: "networkidle2", + block_resources: false, + wait: 5000, + }, + headers: { + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", + "sec-fetch-site": "same-origin", + "sec-fetch-mode": "cors", + "sec-fetch-dest": "empty", + referer: "https://www.google.com/", + "accept-language": "en-US,en;q=0.9", + "accept-encoding": "gzip, deflate, br", + accept: + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", + }, } };