0

Merge pull request #185 from mendableai/nsc/improved-blocklist

Improvements to the blocklist regex
This commit is contained in:
Nicolas 2024-06-03 16:43:34 -07:00 committed by GitHub
commit b26c5f1588
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 78 additions and 1 deletions

View File

@ -0,0 +1,63 @@
import { isUrlBlocked } from '../blocklist';
describe('isUrlBlocked', () => {
it('should return true for blocked social media URLs', () => {
const blockedUrls = [
'https://www.facebook.com',
'https://twitter.com/someuser',
'https://instagram.com/someuser',
'https://www.linkedin.com/in/someuser',
'https://pinterest.com/someuser',
'https://snapchat.com/someuser',
'https://tiktok.com/@someuser',
'https://reddit.com/r/somesubreddit',
'https://flickr.com/photos/someuser',
'https://whatsapp.com/someuser',
'https://wechat.com/someuser',
'https://telegram.org/someuser',
];
blockedUrls.forEach(url => {
if (!isUrlBlocked(url)) {
console.log(`URL not blocked: ${url}`);
}
expect(isUrlBlocked(url)).toBe(true);
});
});
it('should return false for URLs containing allowed keywords', () => {
const allowedUrls = [
'https://www.facebook.com/privacy',
'https://twitter.com/terms',
'https://instagram.com/legal',
'https://www.linkedin.com/help',
'https://pinterest.com/about',
'https://snapchat.com/support',
'https://tiktok.com/contact',
'https://reddit.com/user-agreement',
'https://tumblr.com/policy',
'https://flickr.com/blog',
'https://whatsapp.com/press',
'https://wechat.com/careers',
'https://telegram.org/conditions',
];
allowedUrls.forEach(url => {
expect(isUrlBlocked(url)).toBe(false);
});
});
it('should return false for non-blocked URLs', () => {
const nonBlockedUrls = [
'https://www.example.com',
'https://www.somewebsite.org',
'https://subdomain.example.com',
'firecrawl.dev',
'amazon.com',
];
nonBlockedUrls.forEach(url => {
expect(isUrlBlocked(url)).toBe(false);
});
});
});

View File

@ -1,5 +1,6 @@
const socialMediaBlocklist = [ const socialMediaBlocklist = [
'facebook.com', 'facebook.com',
'x.com',
'twitter.com', 'twitter.com',
'instagram.com', 'instagram.com',
'linkedin.com', 'linkedin.com',
@ -32,9 +33,22 @@ const allowedKeywords = [
]; ];
export function isUrlBlocked(url: string): boolean { export function isUrlBlocked(url: string): boolean {
// Check if the URL contains any allowed keywords
if (allowedKeywords.some(keyword => url.includes(keyword))) { if (allowedKeywords.some(keyword => url.includes(keyword))) {
return false; return false;
} }
return socialMediaBlocklist.some(domain => url.includes(domain)); try {
// Check if the URL matches any domain in the blocklist
return socialMediaBlocklist.some(domain => {
// Create a regular expression to match the exact domain
const domainPattern = new RegExp(`(^|\\.)${domain.replace('.', '\\.')}$`);
// Test the hostname of the URL against the pattern
return domainPattern.test(new URL(url).hostname);
});
} catch (e) {
// If an error occurs (e.g., invalid URL), return false
return false;
} }
}