Merge pull request #185 from mendableai/nsc/improved-blocklist
Improvements to the blocklist regex
This commit is contained in:
commit
b26c5f1588
@ -0,0 +1,63 @@
|
|||||||
|
import { isUrlBlocked } from '../blocklist';
|
||||||
|
|
||||||
|
// Unit tests for isUrlBlocked. Each case gathers the URLs that violate the
// expectation and asserts that the resulting list is empty, so a failing run
// reports every offending URL in the assertion diff (no debug logging needed).
describe('isUrlBlocked', () => {
  it('should return true for blocked social media URLs', () => {
    const blockedUrls = [
      'https://www.facebook.com',
      'https://twitter.com/someuser',
      'https://instagram.com/someuser',
      'https://www.linkedin.com/in/someuser',
      'https://pinterest.com/someuser',
      'https://snapchat.com/someuser',
      'https://tiktok.com/@someuser',
      'https://reddit.com/r/somesubreddit',
      'https://flickr.com/photos/someuser',
      'https://whatsapp.com/someuser',
      'https://wechat.com/someuser',
      'https://telegram.org/someuser',
    ];

    // URLs that should have been blocked but were not.
    const notBlocked = blockedUrls.filter(url => isUrlBlocked(url) !== true);
    expect(notBlocked).toEqual([]);
  });

  it('should return false for URLs containing allowed keywords', () => {
    const allowedUrls = [
      'https://www.facebook.com/privacy',
      'https://twitter.com/terms',
      'https://instagram.com/legal',
      'https://www.linkedin.com/help',
      'https://pinterest.com/about',
      'https://snapchat.com/support',
      'https://tiktok.com/contact',
      'https://reddit.com/user-agreement',
      'https://tumblr.com/policy',
      'https://flickr.com/blog',
      'https://whatsapp.com/press',
      'https://wechat.com/careers',
      'https://telegram.org/conditions',
    ];

    // URLs that carry an allowed keyword but were blocked anyway.
    const wronglyBlocked = allowedUrls.filter(url => isUrlBlocked(url) !== false);
    expect(wronglyBlocked).toEqual([]);
  });

  it('should return false for non-blocked URLs', () => {
    const nonBlockedUrls = [
      'https://www.example.com',
      'https://www.somewebsite.org',
      'https://subdomain.example.com',
      'firecrawl.dev',
      'amazon.com',
    ];

    // URLs outside the blocklist that were blocked anyway.
    const wronglyBlocked = nonBlockedUrls.filter(url => isUrlBlocked(url) !== false);
    expect(wronglyBlocked).toEqual([]);
  });
});
|
@ -1,5 +1,6 @@
|
|||||||
const socialMediaBlocklist = [
|
const socialMediaBlocklist = [
|
||||||
'facebook.com',
|
'facebook.com',
|
||||||
|
'x.com',
|
||||||
'twitter.com',
|
'twitter.com',
|
||||||
'instagram.com',
|
'instagram.com',
|
||||||
'linkedin.com',
|
'linkedin.com',
|
||||||
@ -32,9 +33,22 @@ const allowedKeywords = [
|
|||||||
];
|
];
|
||||||
|
|
||||||
/**
 * Decides whether a URL points at a blocklisted social media site.
 *
 * The check runs in two steps:
 * 1. If the raw URL string contains any entry of `allowedKeywords`, the URL
 *    is never blocked.
 * 2. Otherwise the URL's hostname is compared against `socialMediaBlocklist`:
 *    it is blocked when the hostname equals a listed domain or is a
 *    subdomain of one (e.g. `www.facebook.com` for `facebook.com`).
 *
 * @param url - The URL to check. It must be parseable by the `URL`
 *   constructor (i.e. include a scheme); anything else is treated as not
 *   blocked.
 * @returns `true` if the hostname matches a blocklisted domain and no allowed
 *   keyword is present, `false` otherwise (including for unparseable input).
 */
export function isUrlBlocked(url: string): boolean {
  // Allowed keywords take precedence over the blocklist.
  if (allowedKeywords.some(keyword => url.includes(keyword))) {
    return false;
  }

  // Parse once up front instead of once per blocklist entry.
  let hostname: string;
  try {
    hostname = new URL(url).hostname;
  } catch {
    // Unparseable input (e.g. a bare domain without a scheme) is not blocked.
    return false;
  }

  // A fully-qualified hostname may end with a root dot ("facebook.com.");
  // drop it so that form cannot be used to slip past the blocklist.
  const host = hostname.endsWith('.') ? hostname.slice(0, -1) : hostname;

  // Exact match or dot-bounded suffix match. Plain string comparison avoids
  // building a regex from the domain, so every '.' is matched literally and
  // look-alike hosts such as "notfacebook.com" do not match "facebook.com".
  return socialMediaBlocklist.some(
    domain => host === domain || host.endsWith(`.${domain}`),
  );
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user