Merge pull request #55 from mendableai/feat/blocklist-social-media
[Feat] Added blocklist for social media urls
This commit is contained in:
commit
6a1c7d48ae
@ -55,6 +55,16 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
expect(response.statusCode).not.toBe(401);
|
expect(response.statusCode).not.toBe(401);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("should return an error for a blocklisted URL without requiring authorization", async () => {
|
||||||
|
const blocklistedUrl = "https://facebook.com/fake-test";
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.post("/v0/scrape")
|
||||||
|
.set("Content-Type", "application/json")
|
||||||
|
.send({ url: blocklistedUrl });
|
||||||
|
expect(response.statusCode).toBe(403);
|
||||||
|
expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||||
|
});
|
||||||
|
|
||||||
it("should return a successful response", async () => {
|
it("should return a successful response", async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v0/scrape")
|
.post("/v0/scrape")
|
||||||
@ -70,6 +80,16 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
expect(response.statusCode).not.toBe(401);
|
expect(response.statusCode).not.toBe(401);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("should return an error for a blocklisted URL", async () => {
|
||||||
|
const blocklistedUrl = "https://twitter.com/fake-test";
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.post("/v0/crawl")
|
||||||
|
.set("Content-Type", "application/json")
|
||||||
|
.send({ url: blocklistedUrl });
|
||||||
|
expect(response.statusCode).toBe(403);
|
||||||
|
expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||||
|
});
|
||||||
|
|
||||||
it("should return a successful response", async () => {
|
it("should return a successful response", async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v0/crawl")
|
.post("/v0/crawl")
|
||||||
@ -89,6 +109,16 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
expect(response.statusCode).not.toBe(401);
|
expect(response.statusCode).not.toBe(401);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("should return an error for a blocklisted URL", async () => {
|
||||||
|
const blocklistedUrl = "https://instagram.com/fake-test";
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.post("/v0/crawlWebsitePreview")
|
||||||
|
.set("Content-Type", "application/json")
|
||||||
|
.send({ url: blocklistedUrl });
|
||||||
|
expect(response.statusCode).toBe(403);
|
||||||
|
expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||||
|
});
|
||||||
|
|
||||||
it("should return a successful response", async () => {
|
it("should return a successful response", async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v0/crawlWebsitePreview")
|
.post("/v0/crawlWebsitePreview")
|
||||||
|
@ -47,6 +47,18 @@ const TEST_URL = "http://127.0.0.1:3002";
|
|||||||
.send({ url: "https://firecrawl.dev" });
|
.send({ url: "https://firecrawl.dev" });
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("should return an error for a blocklisted URL", async () => {
|
||||||
|
const blocklistedUrl = "https://facebook.com/fake-test";
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.post("/v0/scrape")
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set("Content-Type", "application/json")
|
||||||
|
.send({ url: blocklistedUrl });
|
||||||
|
expect(response.statusCode).toBe(403);
|
||||||
|
expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||||
|
});
|
||||||
|
|
||||||
it("should return a successful response with a valid preview token", async () => {
|
it("should return a successful response with a valid preview token", async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v0/scrape")
|
.post("/v0/scrape")
|
||||||
@ -86,6 +98,17 @@ const TEST_URL = "http://127.0.0.1:3002";
|
|||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("should return an error for a blocklisted URL", async () => {
|
||||||
|
const blocklistedUrl = "https://twitter.com/fake-test";
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.post("/v0/crawl")
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set("Content-Type", "application/json")
|
||||||
|
.send({ url: blocklistedUrl });
|
||||||
|
expect(response.statusCode).toBe(403);
|
||||||
|
expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||||
|
});
|
||||||
|
|
||||||
it("should return a successful response with a valid API key", async () => {
|
it("should return a successful response with a valid API key", async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v0/crawl")
|
.post("/v0/crawl")
|
||||||
@ -99,6 +122,7 @@ const TEST_URL = "http://127.0.0.1:3002";
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
// Additional tests for insufficient credits?
|
// Additional tests for insufficient credits?
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -119,6 +143,17 @@ const TEST_URL = "http://127.0.0.1:3002";
|
|||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("should return an error for a blocklisted URL", async () => {
|
||||||
|
const blocklistedUrl = "https://instagram.com/fake-test";
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.post("/v0/crawlWebsitePreview")
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set("Content-Type", "application/json")
|
||||||
|
.send({ url: blocklistedUrl });
|
||||||
|
expect(response.statusCode).toBe(403);
|
||||||
|
expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||||
|
});
|
||||||
|
|
||||||
it("should return a successful response with a valid API key", async () => {
|
it("should return a successful response with a valid API key", async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v0/crawlWebsitePreview")
|
.post("/v0/crawlWebsitePreview")
|
||||||
|
@ -5,6 +5,7 @@ import { checkTeamCredits } from "../../src/services/billing/credit_billing";
|
|||||||
import { authenticateUser } from "./auth";
|
import { authenticateUser } from "./auth";
|
||||||
import { RateLimiterMode } from "../../src/types";
|
import { RateLimiterMode } from "../../src/types";
|
||||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||||
|
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||||
|
|
||||||
export async function crawlController(req: Request, res: Response) {
|
export async function crawlController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
@ -27,6 +28,11 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
if (!url) {
|
if (!url) {
|
||||||
return res.status(400).json({ error: "Url is required" });
|
return res.status(400).json({ error: "Url is required" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (isUrlBlocked(url)) {
|
||||||
|
return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
|
||||||
|
}
|
||||||
|
|
||||||
const mode = req.body.mode ?? "crawl";
|
const mode = req.body.mode ?? "crawl";
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
||||||
|
@ -2,6 +2,7 @@ import { Request, Response } from "express";
|
|||||||
import { authenticateUser } from "./auth";
|
import { authenticateUser } from "./auth";
|
||||||
import { RateLimiterMode } from "../../src/types";
|
import { RateLimiterMode } from "../../src/types";
|
||||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||||
|
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||||
|
|
||||||
export async function crawlPreviewController(req: Request, res: Response) {
|
export async function crawlPreviewController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
@ -18,6 +19,11 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
if (!url) {
|
if (!url) {
|
||||||
return res.status(400).json({ error: "Url is required" });
|
return res.status(400).json({ error: "Url is required" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (isUrlBlocked(url)) {
|
||||||
|
return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
|
||||||
|
}
|
||||||
|
|
||||||
const mode = req.body.mode ?? "crawl";
|
const mode = req.body.mode ?? "crawl";
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
||||||
|
@ -5,6 +5,7 @@ import { authenticateUser } from "./auth";
|
|||||||
import { RateLimiterMode } from "../types";
|
import { RateLimiterMode } from "../types";
|
||||||
import { logJob } from "../services/logging/log_job";
|
import { logJob } from "../services/logging/log_job";
|
||||||
import { Document } from "../lib/entities";
|
import { Document } from "../lib/entities";
|
||||||
|
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||||
|
|
||||||
export async function scrapeHelper(
|
export async function scrapeHelper(
|
||||||
req: Request,
|
req: Request,
|
||||||
@ -22,6 +23,10 @@ export async function scrapeHelper(
|
|||||||
return { success: false, error: "Url is required", returnCode: 400 };
|
return { success: false, error: "Url is required", returnCode: 400 };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (isUrlBlocked(url)) {
|
||||||
|
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
|
||||||
|
}
|
||||||
|
|
||||||
const a = new WebScraperDataProvider();
|
const a = new WebScraperDataProvider();
|
||||||
await a.setOptions({
|
await a.setOptions({
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
|
19
apps/api/src/scraper/WebScraper/utils/blocklist.ts
Normal file
19
apps/api/src/scraper/WebScraper/utils/blocklist.ts
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
/**
 * Registrable domains of the social-media sites that must not be scraped.
 * Entries are lowercase and carry no scheme, `www.` prefix, or trailing dot;
 * subdomains of each entry are covered by `isUrlBlocked`.
 */
const socialMediaBlocklist = [
  'facebook.com',
  'twitter.com',
  'instagram.com',
  'linkedin.com',
  'pinterest.com',
  'snapchat.com',
  'tiktok.com',
  'reddit.com',
  'tumblr.com',
  'flickr.com',
  'whatsapp.com',
  'wechat.com',
  'telegram.org',
];

/**
 * Reports whether `url` points at a blocklisted social-media site.
 *
 * The decision is made on the URL's hostname only: a URL is blocked when its
 * host equals a blocklisted domain or is a subdomain of one
 * (`www.facebook.com`, `m.facebook.com`). Matching on the host rather than on
 * the raw string avoids blocking unrelated sites that merely mention a
 * blocklisted domain in their path, query string, or userinfo, and it makes the
 * check case-insensitive so `https://FACEBOOK.com` cannot bypass it.
 *
 * @param url - The URL supplied by the caller; the scheme may be omitted.
 * @returns `true` when the URL targets a blocklisted domain, otherwise `false`.
 */
export function isUrlBlocked(url: string): boolean {
  const candidate = url.trim();

  // Callers may send scheme-less input such as "facebook.com/page". Without a
  // scheme the URL parser either throws or mistakes "host:port" for a scheme,
  // so prepend one before parsing.
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(candidate);
  const withScheme = hasScheme ? candidate : `http://${candidate}`;

  let hostname: string;
  try {
    // The parser lowercases the host; also drop the trailing dot of a
    // fully-qualified name ("facebook.com.") so it still matches.
    hostname = new URL(withScheme).hostname.toLowerCase().replace(/\.$/, '');
  } catch {
    // Unparseable input: stay conservative and keep the substring check so a
    // malformed string cannot be used to slip past the blocklist.
    const lowered = candidate.toLowerCase();
    return socialMediaBlocklist.some(domain => lowered.includes(domain));
  }

  return socialMediaBlocklist.some(
    domain => hostname === domain || hostname.endsWith(`.${domain}`)
  );
}
|
Loading…
Reference in New Issue
Block a user