From 756f54466d37f00850343cc8ed57979a0d587c50 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 20 May 2024 17:24:21 -0700 Subject: [PATCH 1/2] Nick: allowed keywords for now --- apps/api/src/scraper/WebScraper/utils/blocklist.ts | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index a50e42e..ededfc7 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -1,6 +1,7 @@ const socialMediaBlocklist = [ 'facebook.com', 'twitter.com', + 'x.com', 'instagram.com', 'linkedin.com', 'pinterest.com', @@ -14,12 +15,18 @@ const socialMediaBlocklist = [ 'telegram.org', ]; -const allowedUrls = [ - 'linkedin.com/pulse' +const allowedKeywords = [ + 'pulse', + 'privacy', + 'terms', + 'policy', + 'user-agreement', + 'legal', + 'help' ]; export function isUrlBlocked(url: string): boolean { - if (allowedUrls.some(allowedUrl => url.includes(allowedUrl))) { + if (allowedKeywords.some(keyword => url.includes(keyword))) { return false; } From 7f64fe884a57441ab1103fab8d8ca44a1e92bfd7 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 20 May 2024 17:26:01 -0700 Subject: [PATCH 2/2] Update blocklist.ts --- apps/api/src/scraper/WebScraper/utils/blocklist.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index ededfc7..c3d37c4 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -22,7 +22,14 @@ const allowedKeywords = [ 'policy', 'user-agreement', 'legal', - 'help' + 'help', + 'support', + 'contact', + 'about', + 'careers', + 'blog', + 'press', + 'conditions', ]; export function isUrlBlocked(url: string): boolean {