From e98434606d77593ea7d0bdd1219e23f20651a35a Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 24 May 2024 15:04:15 -0700
Subject: [PATCH] Update blocklist.ts

---
 apps/api/src/scraper/WebScraper/utils/blocklist.ts | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
index c6a1232..2a047e4 100644
--- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts
+++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
@@ -1,5 +1,6 @@
 const socialMediaBlocklist = [
   'facebook.com',
+  'x.com',
   'twitter.com',
   'instagram.com',
   'linkedin.com',
@@ -32,9 +33,36 @@ const allowedKeywords = [
 ];
 
+/**
+ * Reports whether a URL should be rejected because it targets a blocklisted
+ * social-media domain.
+ *
+ * A URL containing any allowed keyword is never blocked. Otherwise the URL's
+ * hostname must equal a blocklisted domain or be a subdomain of one, so
+ * `netflix.com` is not caught by the `x.com` entry.
+ *
+ * @param url - Absolute URL to check.
+ * @returns `true` if the hostname is blocklisted; `false` otherwise, including
+ * when `url` cannot be parsed as an absolute URL.
+ */
 export function isUrlBlocked(url: string): boolean {
+  // Allowed keywords take precedence over the blocklist.
   if (allowedKeywords.some(keyword => url.includes(keyword))) {
     return false;
   }
 
-  return socialMediaBlocklist.some(domain => url.includes(domain));
+  let hostname: string;
+  try {
+    // Parse once up front; a trailing root dot ("x.com.") names the same host.
+    hostname = new URL(url).hostname.replace(/\.$/, '');
+  } catch {
+    // Not an absolute URL, so there is no hostname to match against.
+    return false;
+  }
+
+  // Plain string comparison avoids building a RegExp per domain and the
+  // escaping mistakes that come with it.
+  return socialMediaBlocklist.some(
+    domain => hostname === domain || hostname.endsWith(`.${domain}`)
+  );
 }
+