Revised patch for isUrlBlocked: adds a Jest suite for the social-media
blocklist and makes isUrlBlocked return false, instead of throwing, when the
input cannot be parsed as a URL.
Blob ids on the index lines are carried over from the original patch and do
not reflect the revised content.

diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts
new file mode 100644
index 0000000..7cadf4e
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts
@@ -0,0 +1,52 @@
+import { isUrlBlocked } from '../blocklist';
+
+describe('isUrlBlocked', () => {
+  // it.each generates one case per URL, so a failure names the offending URL.
+  it.each([
+    'https://www.facebook.com',
+    'https://twitter.com/someuser',
+    'https://instagram.com/someuser',
+    'https://www.linkedin.com/in/someuser',
+    'https://pinterest.com/someuser',
+    'https://snapchat.com/someuser',
+    'https://tiktok.com/@someuser',
+    'https://reddit.com/r/somesubreddit',
+    'https://flickr.com/photos/someuser',
+    'https://whatsapp.com/someuser',
+    'https://wechat.com/someuser',
+    'https://telegram.org/someuser',
+  ])('should return true for blocked social media URL %s', url => {
+    expect(isUrlBlocked(url)).toBe(true);
+  });
+
+  it.each([
+    'https://www.facebook.com/privacy',
+    'https://twitter.com/terms',
+    'https://instagram.com/legal',
+    'https://www.linkedin.com/help',
+    'https://pinterest.com/about',
+    'https://snapchat.com/support',
+    'https://tiktok.com/contact',
+    'https://reddit.com/user-agreement',
+    'https://tumblr.com/policy',
+    'https://flickr.com/blog',
+    'https://whatsapp.com/press',
+    'https://wechat.com/careers',
+    'https://telegram.org/conditions',
+  ])('should return false for URL containing an allowed keyword %s', url => {
+    expect(isUrlBlocked(url)).toBe(false);
+  });
+
+  it.each([
+    'https://www.example.com',
+    'https://www.somewebsite.org',
+    'https://subdomain.example.com',
+    // Scheme-less inputs are not parseable as URLs and must not throw
+    'firecrawl.dev',
+    'amazon.com',
+    // A host that merely ends with a blocked name (no dot boundary) must not match
+    'https://notfacebook.com',
+  ])('should return false for non-blocked URL %s', url => {
+    expect(isUrlBlocked(url)).toBe(false);
+  });
+});
diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
index 2a047e4..45d1970 100644
--- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts
+++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
@@ -38,12 +38,18 @@ export function isUrlBlocked(url: string): boolean {
     return false;
   }
 
-  // Check if the URL matches any domain in the blocklist
-  return socialMediaBlocklist.some(domain => {
-    // Create a regular expression to match the exact domain
-    const domainPattern = new RegExp(`(^|\\.)${domain.replace('.', '\\.')}$`);
-    // Test the hostname of the URL against the pattern
-    return domainPattern.test(new URL(url).hostname);
-  });
+  let hostname: string;
+  try {
+    // Parse once, outside the blocklist scan; `new URL` throws on scheme-less or malformed input
+    hostname = new URL(url).hostname;
+  } catch (e) {
+    // An unparseable URL has no hostname to match, so it is reported as not blocked
+    return false;
+  }
+
+  // Blocked when the hostname is a listed domain or a subdomain of one (dot boundary required)
+  return socialMediaBlocklist.some(
+    domain => hostname === domain || hostname.endsWith(`.${domain}`),
+  );
 }
 