From 849c0b6ebfcc0c7d0e202330c7df0d6260c4b1a0 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 23 Apr 2024 18:50:35 -0300 Subject: [PATCH 1/2] [Feat] Added blocklist for social media urls --- .../src/__tests__/e2e_noAuth/index.test.ts | 30 ++++++++++++++++ .../src/__tests__/e2e_withAuth/index.test.ts | 35 +++++++++++++++++++ apps/api/src/controllers/crawl.ts | 6 ++++ apps/api/src/controllers/crawlPreview.ts | 6 ++++ apps/api/src/controllers/scrape.ts | 5 +++ .../src/scraper/WebScraper/utils/blocklist.ts | 19 ++++++++++ 6 files changed, 101 insertions(+) create mode 100644 apps/api/src/scraper/WebScraper/utils/blocklist.ts diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index e0aca36..f76a8dc 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -55,6 +55,16 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(response.statusCode).not.toBe(401); }); + it("should return an error for a blocklisted URL without requiring authorization", async () => { + const blocklistedUrl = "https://facebook.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + }); + it("should return a successful response", async () => { const response = await request(TEST_URL) .post("/v0/scrape") @@ -70,6 +80,16 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(response.statusCode).not.toBe(401); }); + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://twitter.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + }); + it("should return a successful response", async () => { const response = await request(TEST_URL) .post("/v0/crawl") @@ -89,6 +109,16 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(response.statusCode).not.toBe(401); }); + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://instagram.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + }); + it("should return a successful response", async () => { const response = await request(TEST_URL) .post("/v0/crawlWebsitePreview") diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index ba01a7c..578a033 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -47,6 +47,18 @@ const TEST_URL = "http://127.0.0.1:3002"; .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); }); + + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://facebook.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + }); + it("should return a successful response with a valid preview token", async () => { const response = await request(TEST_URL) .post("/v0/scrape") @@ -86,6 +98,17 @@ const TEST_URL = "http://127.0.0.1:3002"; expect(response.statusCode).toBe(401); }); + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://twitter.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + }); + it("should return a successful response with a valid API key", async () => { const response = await request(TEST_URL) .post("/v0/crawl") @@ -99,6 +122,7 @@ const TEST_URL = "http://127.0.0.1:3002"; ); }); + // Additional tests for insufficient credits? }); @@ -119,6 +143,17 @@ const TEST_URL = "http://127.0.0.1:3002"; expect(response.statusCode).toBe(401); }); + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://instagram.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + }); + it("should return a successful response with a valid API key", async () => { const response = await request(TEST_URL) .post("/v0/crawlWebsitePreview") diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index bd3feca..9301c4d 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -5,6 +5,7 @@ import { checkTeamCredits } from "../../src/services/billing/credit_billing"; import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../../src/types"; import { addWebScraperJob } from "../../src/services/queue-jobs"; +import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; export async function crawlController(req: Request, res: Response) { try { @@ -27,6 +28,11 @@ export async function crawlController(req: Request, res: Response) { if (!url) { return res.status(400).json({ error: "Url is required" }); } + + if (isUrlBlocked(url)) { + return res.status(403).json({ error: "URL is blocked due to policy restrictions" }); + } + const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index 3f28ef6..4c40197 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -2,6 +2,7 @@ import { Request, Response } from "express"; import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../../src/types"; import { addWebScraperJob } from "../../src/services/queue-jobs"; +import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; export async function crawlPreviewController(req: Request, res: Response) { try { @@ -18,6 +19,11 @@ export async function crawlPreviewController(req: Request, res: Response) { if (!url) { return res.status(400).json({ error: "Url is required" }); } + + if (isUrlBlocked(url)) { + return res.status(403).json({ error: "URL is blocked due to policy restrictions" }); + } + const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index be70800..d24c882 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -5,6 +5,7 @@ import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../types"; import { logJob } from "../services/logging/log_job"; import { Document } from "../lib/entities"; +import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function export async function scrapeHelper( req: Request, @@ -22,6 +23,10 @@ export async function scrapeHelper( return { success: false, error: "Url is required", returnCode: 400 }; } + if (isUrlBlocked(url)) { + return { success: false, error: "URL is blocked due to policy restrictions", returnCode: 403 }; + } + const a = new WebScraperDataProvider(); await a.setOptions({ mode: "single_urls", diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts new file mode 100644 index 0000000..0eef332 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -0,0 +1,19 @@ +const socialMediaBlocklist = [ + 'facebook.com', + 'twitter.com', + 'instagram.com', + 'linkedin.com', + 'pinterest.com', + 'snapchat.com', + 'tiktok.com', + 'reddit.com', + 'tumblr.com', + 'flickr.com', + 'whatsapp.com', + 'wechat.com', + 'telegram.org', +]; + +export function isUrlBlocked(url: string): boolean { + return socialMediaBlocklist.some(domain => url.includes(domain)); +} From f3c190c21ced7b87989abbbb4e7180653c820aad Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 16:47:24 -0700 Subject: [PATCH 2/2] Nick: --- apps/api/src/__tests__/e2e_noAuth/index.test.ts | 6 +++--- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 6 +++--- apps/api/src/controllers/crawl.ts | 2 +- apps/api/src/controllers/crawlPreview.ts | 2 +- apps/api/src/controllers/scrape.ts | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index f76a8dc..b2b2938 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -62,7 +62,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); }); it("should return a successful response", async () => { @@ -87,7 +87,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); }); it("should return a successful response", async () => { @@ -116,7 +116,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); }); it("should return a successful response", async () => { diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 578a033..a165ae2 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -56,7 +56,7 @@ const TEST_URL = "http://127.0.0.1:3002"; .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); }); it("should return a successful response with a valid preview token", async () => { @@ -106,7 +106,7 @@ const TEST_URL = "http://127.0.0.1:3002"; .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); }); it("should return a successful response with a valid API key", async () => { @@ -151,7 +151,7 @@ const TEST_URL = "http://127.0.0.1:3002"; .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("URL is blocked due to policy restrictions"); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); }); it("should return a successful response with a valid API key", async () => { diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 9301c4d..3d64f7f 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -30,7 +30,7 @@ export async function crawlController(req: Request, res: Response) { } if (isUrlBlocked(url)) { - return res.status(403).json({ error: "URL is blocked due to policy restrictions" }); + return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." }); } const mode = req.body.mode ?? "crawl"; diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index 4c40197..569be33 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -21,7 +21,7 @@ export async function crawlPreviewController(req: Request, res: Response) { } if (isUrlBlocked(url)) { - return res.status(403).json({ error: "URL is blocked due to policy restrictions" }); + return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." }); } const mode = req.body.mode ?? "crawl"; diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index d24c882..cfe35b5 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -24,7 +24,7 @@ export async function scrapeHelper( } if (isUrlBlocked(url)) { - return { success: false, error: "URL is blocked due to policy restrictions", returnCode: 403 }; + return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 }; } const a = new WebScraperDataProvider();