From 75597f72a197b692b600d1c1f006bc2f3dc37dae Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Thu, 25 Apr 2024 08:39:45 -0300
Subject: [PATCH] [Feat] Added allowed urls

FireCrawl should be able to scrape LinkedIn Articles (/pulse/*)
---
Note: this patch was edited by hand after export. Allow-list entries are now
matched against the parsed hostname and path instead of by substring, so the
post-image blob id on the "index" line below no longer matches the result.

 apps/api/src/scraper/WebScraper/utils/blocklist.ts | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
index 0eef332..a50e42e 100644
--- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts
+++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
@@ -14,6 +14,49 @@ const socialMediaBlocklist = [
   'telegram.org',
 ];
 
+/**
+ * Locations on otherwise blocked sites that may still be scraped, written
+ * as `<domain><path prefix>` (for example LinkedIn articles under /pulse).
+ */
+const allowedUrls = [
+  'linkedin.com/pulse'
+];
+
+/**
+ * Checks `url` against `allowedUrls` by parsed hostname and path prefix.
+ *
+ * A plain substring test would also accept a blocked site that merely
+ * mentions an allowed entry in its query string or path, e.g.
+ * `facebook.com/share?u=linkedin.com/pulse`.
+ */
+function isUrlAllowed(url: string): boolean {
+  let parsed: URL;
+  try {
+    // Scheme-less input is parsed as https so that `linkedin.com/pulse/x` works.
+    parsed = new URL(/^[a-z][a-z0-9+.-]*:\/\//i.test(url) ? url : `https://${url}`);
+  } catch {
+    return false; // Unparseable input is never allow-listed.
+  }
+
+  return allowedUrls.some(allowedUrl => {
+    const slash = allowedUrl.indexOf('/');
+    const domain = slash === -1 ? allowedUrl : allowedUrl.slice(0, slash);
+    const prefix = slash === -1 ? '' : allowedUrl.slice(slash);
+    const hostMatches =
+      parsed.hostname === domain || parsed.hostname.endsWith(`.${domain}`);
+    const pathMatches =
+      prefix === '' ||
+      parsed.pathname === prefix ||
+      parsed.pathname.startsWith(`${prefix}/`);
+    return hostMatches && pathMatches;
+  });
+}
+
 export function isUrlBlocked(url: string): boolean {
+  // Allow-listed locations win over the domain blocklist below.
+  if (isUrlAllowed(url)) {
+    return false;
+  }
+
   return socialMediaBlocklist.some(domain => url.includes(domain));
 }