Merge pull request #55 from mendableai/feat/blocklist-social-media
[Feat] Added blocklist for social media urls
This commit is contained in:
commit
6a1c7d48ae
@ -55,6 +55,16 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
||||
expect(response.statusCode).not.toBe(401);
|
||||
});
|
||||
|
||||
// Posts a facebook.com URL to /v0/scrape without an Authorization header and
// expects a 403 whose error text contains the social-media policy message.
it("should return an error for a blocklisted URL without requiring authorization", async () => {
  const payload = { url: "https://facebook.com/fake-test" };
  const policyMessage =
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.";

  const res = await request(TEST_URL)
    .post("/v0/scrape")
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(403);
  expect(res.body.error).toContain(policyMessage);
});
|
||||
|
||||
it("should return a successful response", async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
@ -70,6 +80,16 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
||||
expect(response.statusCode).not.toBe(401);
|
||||
});
|
||||
|
||||
// Posts a twitter.com URL to /v0/crawl without an Authorization header and
// expects a 403 whose error text contains the social-media policy message.
it("should return an error for a blocklisted URL", async () => {
  const payload = { url: "https://twitter.com/fake-test" };
  const policyMessage =
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.";

  const res = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(403);
  expect(res.body.error).toContain(policyMessage);
});
|
||||
|
||||
it("should return a successful response", async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
@ -89,6 +109,16 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
||||
expect(response.statusCode).not.toBe(401);
|
||||
});
|
||||
|
||||
// Posts an instagram.com URL to /v0/crawlWebsitePreview without an Authorization
// header and expects a 403 whose error text contains the social-media policy message.
it("should return an error for a blocklisted URL", async () => {
  const payload = { url: "https://instagram.com/fake-test" };
  const policyMessage =
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.";

  const res = await request(TEST_URL)
    .post("/v0/crawlWebsitePreview")
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(403);
  expect(res.body.error).toContain(policyMessage);
});
|
||||
|
||||
it("should return a successful response", async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v0/crawlWebsitePreview")
|
||||
|
@ -47,6 +47,18 @@ const TEST_URL = "http://127.0.0.1:3002";
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
// Posts a facebook.com URL to /v0/scrape with the TEST_API_KEY bearer token and
// expects a 403 whose error text contains the social-media policy message.
it("should return an error for a blocklisted URL", async () => {
  const payload = { url: "https://facebook.com/fake-test" };
  const policyMessage =
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.";

  const res = await request(TEST_URL)
    .post("/v0/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(403);
  expect(res.body.error).toContain(policyMessage);
});
|
||||
|
||||
it("should return a successful response with a valid preview token", async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
@ -86,6 +98,17 @@ const TEST_URL = "http://127.0.0.1:3002";
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
// Posts a twitter.com URL to /v0/crawl with the TEST_API_KEY bearer token and
// expects a 403 whose error text contains the social-media policy message.
it("should return an error for a blocklisted URL", async () => {
  const payload = { url: "https://twitter.com/fake-test" };
  const policyMessage =
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.";

  const res = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(403);
  expect(res.body.error).toContain(policyMessage);
});
|
||||
|
||||
it("should return a successful response with a valid API key", async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
@ -99,6 +122,7 @@ const TEST_URL = "http://127.0.0.1:3002";
|
||||
);
|
||||
});
|
||||
|
||||
|
||||
// Additional tests for insufficient credits?
|
||||
});
|
||||
|
||||
@ -119,6 +143,17 @@ const TEST_URL = "http://127.0.0.1:3002";
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
// Posts an instagram.com URL to /v0/crawlWebsitePreview with the TEST_API_KEY bearer
// token and expects a 403 whose error text contains the social-media policy message.
it("should return an error for a blocklisted URL", async () => {
  const payload = { url: "https://instagram.com/fake-test" };
  const policyMessage =
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.";

  const res = await request(TEST_URL)
    .post("/v0/crawlWebsitePreview")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(403);
  expect(res.body.error).toContain(policyMessage);
});
|
||||
|
||||
it("should return a successful response with a valid API key", async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v0/crawlWebsitePreview")
|
||||
|
@ -5,6 +5,7 @@ import { checkTeamCredits } from "../../src/services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||
|
||||
export async function crawlController(req: Request, res: Response) {
|
||||
try {
|
||||
@ -27,6 +28,11 @@ export async function crawlController(req: Request, res: Response) {
|
||||
if (!url) {
|
||||
return res.status(400).json({ error: "Url is required" });
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
|
||||
}
|
||||
|
||||
const mode = req.body.mode ?? "crawl";
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
||||
|
@ -2,6 +2,7 @@ import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||
|
||||
export async function crawlPreviewController(req: Request, res: Response) {
|
||||
try {
|
||||
@ -18,6 +19,11 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
||||
if (!url) {
|
||||
return res.status(400).json({ error: "Url is required" });
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
|
||||
}
|
||||
|
||||
const mode = req.body.mode ?? "crawl";
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
||||
|
@ -5,6 +5,7 @@ import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../types";
|
||||
import { logJob } from "../services/logging/log_job";
|
||||
import { Document } from "../lib/entities";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
|
||||
export async function scrapeHelper(
|
||||
req: Request,
|
||||
@ -22,6 +23,10 @@ export async function scrapeHelper(
|
||||
return { success: false, error: "Url is required", returnCode: 400 };
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
|
||||
}
|
||||
|
||||
const a = new WebScraperDataProvider();
|
||||
await a.setOptions({
|
||||
mode: "single_urls",
|
||||
|
19
apps/api/src/scraper/WebScraper/utils/blocklist.ts
Normal file
19
apps/api/src/scraper/WebScraper/utils/blocklist.ts
Normal file
@ -0,0 +1,19 @@
|
||||
/**
 * Domains of social media sites that must not be scraped or crawled.
 * Entries are lowercase bare domains; `isUrlBlocked` also matches their subdomains.
 */
const socialMediaBlocklist = [
  'facebook.com',
  'twitter.com',
  'instagram.com',
  'linkedin.com',
  'pinterest.com',
  'snapchat.com',
  'tiktok.com',
  'reddit.com',
  'tumblr.com',
  'flickr.com',
  'whatsapp.com',
  'wechat.com',
  'telegram.org',
];

/**
 * Extracts the lowercase hostname from a URL-like string.
 *
 * @param url - Absolute URL, or a scheme-less value such as `facebook.com/page`.
 * @returns The hostname without a trailing dot, or `undefined` when the input
 *   cannot be parsed into a URL with a non-empty host.
 */
function extractHostname(url: string): string | undefined {
  const trimmed = url.trim();
  // Scheme-less input has no authority section for the parser to find, so give it one.
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed);
  try {
    const { hostname } = new URL(hasScheme ? trimmed : `http://${trimmed}`);
    // A fully-qualified host ("facebook.com.") names the same site as "facebook.com".
    const normalized = hostname.toLowerCase().replace(/\.$/, '');
    return normalized === '' ? undefined : normalized;
  } catch {
    return undefined;
  }
}

/**
 * Reports whether a URL points at a blocklisted social media site.
 *
 * The decision is made on the URL's hostname only: a URL is blocked when its
 * host equals a blocklisted domain or is a subdomain of one. A blocklisted
 * domain that merely appears in the path, query string, or as part of another
 * host name (e.g. `notfacebook.com`, `facebook.com.example.org`) does not
 * trigger a block.
 *
 * @param url - The URL submitted for scraping or crawling.
 * @returns `true` when the URL targets a blocklisted domain, otherwise `false`.
 */
export function isUrlBlocked(url: string): boolean {
  const hostname = extractHostname(url);

  if (hostname === undefined) {
    // Unparseable input: keep the original substring check so malformed
    // values are never treated more permissively than before.
    return socialMediaBlocklist.some(domain => url.includes(domain));
  }

  return socialMediaBlocklist.some(
    domain => hostname === domain || hostname.endsWith(`.${domain}`),
  );
}
|
Loading…
Reference in New Issue
Block a user