
Update single_url.ts

Author: Nicolas
Date:   2024-05-21 18:50:42 -07:00
parent a5e718b084
commit a8ff295977


@@ -74,7 +74,7 @@ export async function scrapWithFireEngine(
       return html ?? "";
     }
   } catch (error) {
-    console.error(`Error scraping with Fire Engine: ${error}`);
+    console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
     return "";
   }
 }
@@ -110,7 +110,7 @@ export async function scrapWithScrapingBee(
       return text;
     }
   } catch (error) {
-    console.error(`[ScrapingBee] Error fetching url: ${url} -> ${error}`);
+    console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
     return "";
   }
 }
@@ -144,14 +144,58 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
       return html ?? "";
     }
   } catch (error) {
-    console.error(`Error scraping with Playwright: ${error}`);
+    console.error(`[Playwright][c] Error fetching url: ${url} -> ${error}`);
     return "";
   }
 }
 
+export async function scrapWithFetch(url: string): Promise<string> {
+  try {
+    const response = await fetch(url);
+    if (!response.ok) {
+      console.error(
+        `[Fetch] Error fetching url: ${url} with status: ${response.status}`
+      );
+      return "";
+    }
+
+    const contentType = response.headers['content-type'];
+    if (contentType && contentType.includes('application/pdf')) {
+      return fetchAndProcessPdf(url);
+    } else {
+      const text = await response.text();
+      return text;
+    }
+  } catch (error) {
+    console.error(`[Fetch][c] Error fetching url: ${url} -> ${error}`);
+    return "";
+  }
+}
+
+/**
+ * Get the order of scrapers to be used for scraping a URL.
+ * If the user doesn't have envs set for a specific scraper, it will be removed from the order.
+ * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
+ * @returns The order of scrapers to be used for scraping a URL
+ */
 function getScrapingFallbackOrder(defaultScraper?: string) {
-  const fireEngineScraper = process.env.FIRE_ENGINE_BETA_URL ? ["fire-engine"] : [];
-  const uniqueScrapers = new Set(defaultScraper ? [defaultScraper, ...fireEngineScraper, ...baseScrapers] : [...fireEngineScraper, ...baseScrapers]);
+  const availableScrapers = baseScrapers.filter(scraper => {
+    switch (scraper) {
+      case "scrapingBee":
+      case "scrapingBeeLoad":
+        return !!process.env.SCRAPING_BEE_API_KEY;
+      case "fire-engine":
+        return !!process.env.FIRE_ENGINE_BETA_URL;
+      case "playwright":
+        return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
+      default:
+        return true;
+    }
+  });
+
+  const defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"];
+  const filteredDefaultOrder = defaultOrder.filter((scraper: typeof baseScrapers[number]) => availableScrapers.includes(scraper));
+  const uniqueScrapers = new Set(defaultScraper ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers] : [...filteredDefaultOrder, ...availableScrapers]);
   const scrapersInOrder = Array.from(uniqueScrapers);
   return scrapersInOrder as typeof baseScrapers[number][];
 }
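
For context, here is the new fallback logic as a minimal standalone sketch. The definition of baseScrapers is an assumption (it lives elsewhere in single_url.ts and is not part of this diff); the function body mirrors the hunk above.

// Sketch only: `baseScrapers` is assumed; its real definition is outside this diff.
const baseScrapers = [
  "fire-engine",
  "scrapingBee",
  "playwright",
  "scrapingBeeLoad",
  "fetch",
] as const;

function getScrapingFallbackOrder(defaultScraper?: string) {
  // Keep only the scrapers whose required env vars are set.
  const availableScrapers = baseScrapers.filter((scraper) => {
    switch (scraper) {
      case "scrapingBee":
      case "scrapingBeeLoad":
        return !!process.env.SCRAPING_BEE_API_KEY;
      case "fire-engine":
        return !!process.env.FIRE_ENGINE_BETA_URL;
      case "playwright":
        return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
      default:
        return true; // plain fetch needs no configuration
    }
  });

  // Restrict the preferred order to what is available; the Set de-duplicates
  // when defaultScraper already appears in the list.
  const defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"];
  const filteredDefaultOrder = defaultOrder.filter(
    (scraper: typeof baseScrapers[number]) => availableScrapers.includes(scraper)
  );
  const uniqueScrapers = new Set(
    defaultScraper
      ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers]
      : [...filteredDefaultOrder, ...availableScrapers]
  );
  return Array.from(uniqueScrapers) as typeof baseScrapers[number][];
}

// With only SCRAPING_BEE_API_KEY set:
//   getScrapingFallbackOrder()        -> ["scrapingBee", "scrapingBeeLoad", "fetch"]
//   getScrapingFallbackOrder("fetch") -> ["fetch", "scrapingBee", "scrapingBeeLoad"]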
@@ -182,7 +226,9 @@ export async function scrapSingleUrl(
     let text = "";
     switch (method) {
       case "fire-engine":
-        text = await scrapWithFireEngine(url);
+        if (process.env.FIRE_ENGINE_BETA_URL) {
+          text = await scrapWithFireEngine(url);
+        }
         break;
       case "scrapingBee":
         if (process.env.SCRAPING_BEE_API_KEY) {
@@ -204,25 +250,7 @@ export async function scrapSingleUrl(
         }
         break;
       case "fetch":
-        try {
-          const response = await fetch(url);
-          if (!response.ok) {
-            console.error(
-              `Error fetching URL: ${url} with status: ${response.status}`
-            );
-            return "";
-          }
-
-          const contentType = response.headers['content-type'];
-          if (contentType && contentType.includes('application/pdf')) {
-            return fetchAndProcessPdf(url);
-          } else {
-            text = await response.text();
-          }
-        } catch (error) {
-          console.error(`Error scraping URL: ${error}`);
-          return "";
-        }
+        text = await scrapWithFetch(url);
         break;
     }
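
One caveat on the extracted scrapWithFetch: with the WHATWG fetch API, response.headers is a Headers object, so the bracket lookup response.headers['content-type'] evaluates to undefined and the PDF branch never fires. Below is a hedged sketch of the same function using Headers.get(); fetchAndProcessPdf is declared as an assumption, since its definition is outside this diff.

// Sketch of scrapWithFetch with the content-type lookup done via Headers.get();
// everything else mirrors the diff above.
// fetchAndProcessPdf is assumed from elsewhere in single_url.ts.
declare function fetchAndProcessPdf(url: string): Promise<string>;

export async function scrapWithFetch(url: string): Promise<string> {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      console.error(
        `[Fetch] Error fetching url: ${url} with status: ${response.status}`
      );
      return "";
    }

    // Headers.get() is the supported lookup; bracket indexing returns undefined.
    const contentType = response.headers.get("content-type");
    if (contentType && contentType.includes("application/pdf")) {
      return fetchAndProcessPdf(url);
    }
    return await response.text();
  } catch (error) {
    console.error(`[Fetch][c] Error fetching url: ${url} -> ${error}`);
    return "";
  }
}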