diff --git a/apps/api/package.json b/apps/api/package.json
index c786b17..e114a0f 100644
--- a/apps/api/package.json
+++ b/apps/api/package.json
@@ -12,8 +12,7 @@
     "build": "tsc",
     "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
     "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
-    "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
-    "workers": "nodemon --exec ts-node src/services/queue-worker.ts",
+    "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'",
     "worker:production": "node dist/src/services/queue-worker.js",
     "mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
     "mongo-docker-console": "docker exec -it mongodb mongosh",
diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
new file mode 100644
index 0000000..9f04093
--- /dev/null
+++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
@@ -0,0 +1,1390 @@
+import request from "supertest";
+import dotenv from "dotenv";
+import { v4 as uuidv4 } from "uuid";
+
+dotenv.config();
+
+// const TEST_URL = 'http://localhost:3002'
+const TEST_URL = "http://127.0.0.1:3002";
+
+describe("E2E Tests for API Routes", () => {
+  beforeAll(() => {
+    process.env.USE_DB_AUTHENTICATION = "true";
+  });
+
+  afterAll(() => {
+    delete process.env.USE_DB_AUTHENTICATION;
+  });
+  describe("GET /", () => {
+    it.concurrent("should return Hello, world! message", async () => {
+      const response = await request(TEST_URL).get("/");
+
+      expect(response.statusCode).toBe(200);
+      expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io");
+    });
+  });
+
+  describe("GET /test", () => {
+    it.concurrent("should return Hello, world! message", async () => {
+      const response = await request(TEST_URL).get("/test");
+      expect(response.statusCode).toBe(200);
+      expect(response.text).toContain("Hello, world!");
+    });
+  });
+
+  describe("POST /v0/scrape", () => {
+    it.concurrent("should require authorization", async () => {
+      const response = await request(TEST_URL).post("/v0/scrape");
+      expect(response.statusCode).toBe(401);
+    });
+
+    it.concurrent("should return an error response with an invalid API key", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer invalid-api-key`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://firecrawl.dev" });
+      expect(response.statusCode).toBe(401);
+    });
+
+    it.concurrent("should return an error for a blocklisted URL", async () => {
+      const blocklistedUrl = "https://facebook.com/fake-test";
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: blocklistedUrl });
+      expect(response.statusCode).toBe(403);
+      expect(response.body.error).toContain(
+        "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
+      );
+    });
+
+    // tested on rate limit test
+    // it.concurrent("should return a successful response with a valid preview token", async () => {
+    //   const response = await request(TEST_URL)
+    //     .post("/v0/scrape")
+    //     .set("Authorization", `Bearer this_is_just_a_preview_token`)
+    //     .set("Content-Type", "application/json")
+    //     .send({ url: "https://roastmywebsite.ai" });
+    //   expect(response.statusCode).toBe(200);
+    // }, 30000); // 30 seconds timeout
+
+    it.concurrent("should return a successful response with a valid API key", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://roastmywebsite.ai" });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data).not.toHaveProperty("html");
+      expect(response.body.data.content).toContain("_Roast_");
+      expect(response.body.data.metadata.pageStatusCode).toBe(200);
+      expect(response.body.data.metadata.pageError).toBeUndefined();
+    }, 30000); // 30 seconds timeout
+
+    it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://roastmywebsite.ai",
+          pageOptions: { includeHtml: true },
+        });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("html");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data.content).toContain("_Roast_");
+      expect(response.body.data.markdown).toContain("_Roast_");
+      expect(response.body.data.html).toContain("<h1");
+      expect(response.body.data.metadata.pageStatusCode).toBe(200);
+      expect(response.body.data.metadata.pageError).toBeUndefined();
+    }, 30000); // 30 seconds timeout
+
+    it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+      expect(response.body.data.metadata.pageStatusCode).toBe(200);
+      expect(response.body.data.metadata.pageError).toBeUndefined();
+    }, 60000); // 60 seconds
+
+    it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+      expect(response.body.data.metadata.pageStatusCode).toBe(200);
+      expect(response.body.data.metadata.pageError).toBeUndefined();
+    }, 60000); // 60 seconds
+
+    it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
+    }, 60000); // 60 seconds
+
+    it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
+      const responseWithoutRemoveTags = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/" });
+      expect(responseWithoutRemoveTags.statusCode).toBe(200);
+      expect(responseWithoutRemoveTags.body).toHaveProperty("data");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
+      expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
+      expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
+      expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
+
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data).not.toHaveProperty("html");
+      expect(response.body.data.content).toContain("Scrape This Site");
+      expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
+      expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
+      expect(response.body.data.content).not.toContain("web scraping"); // strong
+    }, 30000); // 30 seconds timeout
+
+    // TODO: add this test back once we nail the waitFor option to be more deterministic
+    // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
+    //   const startTime = Date.now();
+    //   const response = await request(TEST_URL)
+    //     .post("/v0/scrape")
+    //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    //     .set("Content-Type", "application/json")
+    //     .send({ url: "https://firecrawl.dev", pageOptions: { waitFor: 7000 } });
+    //   const endTime = Date.now();
+    //   const duration = endTime - startTime;
+
+    //   expect(response.statusCode).toBe(200);
+    //   expect(response.body).toHaveProperty("data");
+    //   expect(response.body.data).toHaveProperty("content");
+    //   expect(response.body.data).toHaveProperty("markdown");
+    //   expect(response.body.data).toHaveProperty("metadata");
+    //   expect(response.body.data).not.toHaveProperty("html");
+    //   expect(response.body.data.content).toContain("🔥 Firecrawl");
+    //   expect(duration).toBeGreaterThanOrEqual(7000);
+    // }, 12000); // 12 seconds timeout
+
+    it.concurrent('should return a successful response for a scrape with 400 page', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/400' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(400);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request");
+    }, 60000); // 60 seconds
+
+    it.concurrent('should return a successful response for a scrape with 401 page', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/401' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(401);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized");
+    }, 60000); // 60 seconds
+
+    it.concurrent("should return a successful response for a scrape with 403 page", async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/403' });
+
+      await new Promise((r) => setTimeout(r, 5000));
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(403);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden");
+    }, 60000); // 60 seconds
+
+    it.concurrent('should return a successful response for a scrape with 404 page', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/404' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(404);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found");
+    }, 60000); // 60 seconds
+
+    it.concurrent('should return a successful response for a scrape with 405 page', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/405' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(405);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("method not allowed");
+    }, 60000); // 60 seconds
+
+    it.concurrent('should return a successful response for a scrape with 500 page', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/500' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(500);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("internal server error");
+    }, 60000); // 60 seconds
+  });
+
+  describe("POST /v0/crawl", () => {
+    it.concurrent("should require authorization", async () => {
+      const response = await request(TEST_URL).post("/v0/crawl");
+      expect(response.statusCode).toBe(401);
+    });
+
+    it.concurrent("should return an error response with an invalid API key", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer invalid-api-key`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://firecrawl.dev" });
+      expect(response.statusCode).toBe(401);
+    });
+
+    it.concurrent("should return an error for a blocklisted URL", async () => {
+      const blocklistedUrl = "https://twitter.com/fake-test";
+      const response = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: blocklistedUrl });
+      expect(response.statusCode).toBe(403);
+      expect(response.body.error).toContain(
+        "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
+      );
+    });
+
+    it.concurrent("should return a successful response with a valid API key for crawl", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://firecrawl.dev" });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("jobId");
+      expect(response.body.jobId).toMatch(
+        /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
+      );
+    });
+    it.concurrent('should prevent duplicate requests using the same idempotency key', async () => {
+      const uniqueIdempotencyKey = uuidv4();
+
+      // First request with the idempotency key
+      const firstResponse = await request(TEST_URL)
+        .post('/v0/crawl')
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .set("x-idempotency-key", uniqueIdempotencyKey)
+        .send({ url: 'https://mendable.ai' });
+
+      expect(firstResponse.statusCode).toBe(200);
+
+      // Second request with the same idempotency key
+      const secondResponse = await request(TEST_URL)
+        .post('/v0/crawl')
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .set("x-idempotency-key", uniqueIdempotencyKey)
+        .send({ url: 'https://mendable.ai' });
+
+      expect(secondResponse.statusCode).toBe(409);
+      expect(secondResponse.body.error).toBe('Idempotency key already used');
+    });
+
+    it.concurrent("should return a successful response with a valid API key and valid includes option", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai",
+          limit: 10,
+          crawlerOptions: {
+            includes: ["blog/*"],
+          },
+        });
+
+      let response;
+      let isFinished = false;
+
+      while (!isFinished) {
+        response = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+        expect(response.statusCode).toBe(200);
+        expect(response.body).toHaveProperty("status");
+        isFinished = response.body.status === "completed";
+
+        if (!isFinished) {
+          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+        }
+      }
+
+      const completedResponse = response;
+
+      const urls = completedResponse.body.data.map(
+        (item: any) => item.metadata?.sourceURL
+      );
+      expect(urls.length).toBeGreaterThan(5);
+      urls.forEach((url: string) => {
+        expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy();
+      });
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].content).toContain("Mendable");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
+    }, 60000); // 60 seconds
+
+    it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => {
+      const crawlResponse = await request(TEST_URL)
.post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + limit: 10, + crawlerOptions: { + excludes: ["blog/*"], + }, + }); + + let isFinished = false; + let response; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const completedResponse = response; + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy(); + }); + }, 90000); // 90 seconds + + it.concurrent("should return a successful response with a valid API key and limit to 3", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + crawlerOptions: { limit: 3 }, + }); + + let isFinished = false; + let response; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const completedResponse = response; + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data.length).toBe(3); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); + }, 60000); // 60 seconds + + it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + crawlerOptions: { maxDepth: 1 }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(["active", "waiting"]).toContain(response.body.status); + // wait for 60 seconds + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = 
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+        expect(statusCheckResponse.statusCode).toBe(200);
+        isCompleted = statusCheckResponse.body.status === "completed";
+        if (!isCompleted) {
+          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+        }
+      }
+      const completedResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
+      const urls = completedResponse.body.data.map(
+        (item: any) => item.metadata?.sourceURL
+      );
+      expect(urls.length).toBeGreaterThan(1);
+
+      // Check if all URLs have a maximum depth of 1
+      urls.forEach((url: string) => {
+        const pathSplits = new URL(url).pathname.split('/');
+        const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
+        expect(depth).toBeLessThanOrEqual(2);
+      });
+    }, 180000);
+
+    it.concurrent("should return a successful response with relative max depth option for a valid crawl job", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://www.scrapethissite.com/pages/",
+          crawlerOptions: { maxDepth: 1 },
+        });
+      expect(crawlResponse.statusCode).toBe(200);
+
+      const response = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("status");
+      expect(["active", "waiting"]).toContain(response.body.status);
+      // poll the status endpoint until the crawl completes
+      let isCompleted = false;
+      while (!isCompleted) {
+        const statusCheckResponse = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+        expect(statusCheckResponse.statusCode).toBe(200);
+        isCompleted = statusCheckResponse.body.status === "completed";
+        if (!isCompleted) {
+          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+        }
+      }
+      const completedResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      const urls = completedResponse.body.data.map(
+        (item: any) => item.metadata?.sourceURL
+      );
+      expect(urls.length).toBeGreaterThan(1);
+
+      // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1
+      urls.forEach((url: string) => {
+        const pathSplits = new URL(url).pathname.split('/');
+        const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
+        expect(depth).toBeLessThanOrEqual(3);
+      });
+    }, 180000);
+
+    it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepth equal to zero", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://www.mendable.ai",
+          crawlerOptions: { maxDepth: 0 },
+        });
+      expect(crawlResponse.statusCode).toBe(200);
+
+      const response = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("status");
+      expect(["active", "waiting"]).toContain(response.body.status);
+      // poll the status endpoint until the crawl completes
+      let isCompleted = false;
+      while (!isCompleted) {
+        const statusCheckResponse = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+        expect(statusCheckResponse.statusCode).toBe(200);
+        isCompleted = statusCheckResponse.body.status === "completed";
+        if (!isCompleted) {
+          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+        }
+      }
+      const completedResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      const urls = completedResponse.body.data.map(
+        (item: any) => item.metadata?.sourceURL
+      );
+      expect(urls.length).toBeGreaterThanOrEqual(1);
+
+      // With maxDepth set to 0, only the base URL should be crawled, so every path has depth at most 1
+      urls.forEach((url: string) => {
+        const pathSplits = new URL(url).pathname.split('/');
+        const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
+        expect(depth).toBeLessThanOrEqual(1);
+      });
+    }, 180000);
+
+    // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => {
+    //   const crawlResponse = await request(TEST_URL)
+    //     .post("/v0/crawl")
+    //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    //     .set("Content-Type", "application/json")
+    //     .send({
+    //       url: "https://mendable.ai",
+    //       crawlerOptions: { limit: 10 },
+    //     });
+
+    //   const response = await request(TEST_URL)
+    //     .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+    //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+    //   expect(response.statusCode).toBe(200);
+    //   expect(response.body).toHaveProperty("status");
+    //   expect(response.body.status).toBe("active");
+
+    //   let isCompleted = false;
+    //   while (!isCompleted) {
+    //     const statusCheckResponse = await request(TEST_URL)
+    //       .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+    //       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+    //     expect(statusCheckResponse.statusCode).toBe(200);
+    //     isCompleted = statusCheckResponse.body.status === "completed";
+    //     if (!isCompleted) {
+    //       await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+    //     }
+    //   }
+
+    //   const completedResponse = await request(TEST_URL)
+    //     .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+    //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+    //   expect(completedResponse.statusCode).toBe(200);
+    //   expect(completedResponse.body).toHaveProperty("status");
+    //   expect(completedResponse.body.status).toBe("completed");
+    //   expect(completedResponse.body).toHaveProperty("data");
+    //   expect(completedResponse.body.data.length).toBe(10);
+    //   expect(completedResponse.body.data[0]).toHaveProperty("content");
+    //   expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+    //   expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+    //   expect(completedResponse.body.data[0].content).toContain("Mendable");
+    //   expect(completedResponse.body.data[0].content).not.toContain("main menu");
+    // }, 60000); // 60 seconds
+
+    it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://roastmywebsite.ai",
+          pageOptions: { includeHtml: true },
+        });
+      expect(crawlResponse.statusCode).toBe(200);
+
+      const response = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("status");
+      expect(["active", "waiting"]).toContain(response.body.status);
+
+      let isCompleted = false;
+      while (!isCompleted) {
+        const statusCheckResponse = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+        expect(statusCheckResponse.statusCode).toBe(200);
+        isCompleted = statusCheckResponse.body.status === "completed";
+        if (!isCompleted) {
+          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+        }
+      }
+
+      const completedResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
+
+      expect(completedResponse.body.data[0]).toHaveProperty("html");
+      expect(completedResponse.body.data[0].content).toContain("_Roast_");
+      expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
+      expect(completedResponse.body.data[0].html).toContain("<h1");
+    }, 120000); // 120 seconds
+  });
+
+  describe("POST /v0/crawlWebsitePreview", () => {
+    it.concurrent("should require authorization", async () => {
+      const response = await request(TEST_URL).post("/v0/crawlWebsitePreview");
+      expect(response.statusCode).toBe(401);
+    });
+
+    it.concurrent("should return an error response with an invalid API key", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/crawlWebsitePreview")
+        .set("Authorization", `Bearer invalid-api-key`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://firecrawl.dev" });
+      expect(response.statusCode).toBe(401);
+    });
+
+    // it.concurrent("should return an error for a blocklisted URL", async () => {
+    //   const blocklistedUrl = "https://instagram.com/fake-test";
+    //   const response = await request(TEST_URL)
+    //     .post("/v0/crawlWebsitePreview")
+    //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    //     .set("Content-Type", "application/json")
+    //     .send({ url: blocklistedUrl });
+    //   // is returning 429 instead of 403
+    //   expect(response.statusCode).toBe(403);
+    //   expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
+    // });
+
+    it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://firecrawl.dev", timeout: 1000 });
+
+      expect(response.statusCode).toBe(408);
+    }, 3000);
+
+    // it.concurrent("should return a successful response with a valid API key for crawlWebsitePreview", async () => {
+    //   const response = await request(TEST_URL)
+    //     .post("/v0/crawlWebsitePreview")
+    //     .set("Authorization", `Bearer this_is_just_a_preview_token`)
+    //     .set("Content-Type", "application/json")
+    //     .send({ url: "https://firecrawl.dev" });
+    //   expect(response.statusCode).toBe(200);
+    //   expect(response.body).toHaveProperty("jobId");
+    //   expect(response.body.jobId).toMatch(
+    //     /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
+    //   );
+    // });
+  });
+
+  describe("POST /v0/search", () => {
+    it.concurrent("should require authorization", async () => {
+      const response = await request(TEST_URL).post("/v0/search");
+      expect(response.statusCode).toBe(401);
+    });
+
+    it.concurrent("should return an error response with an invalid API key", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/search")
+        .set("Authorization", `Bearer invalid-api-key`)
+        .set("Content-Type", "application/json")
+        .send({ query: "test" });
+      expect(response.statusCode).toBe(401);
+    });
+
+    it.concurrent("should return a successful response with a valid API key for search", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/search")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ query: "test" });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("success");
+      expect(response.body.success).toBe(true);
+      expect(response.body).toHaveProperty("data");
+    }, 30000); // 30 seconds timeout
+  });
+
+  describe("GET /v0/crawl/status/:jobId", () => {
+    it.concurrent("should require authorization", async () => {
+      const response = await request(TEST_URL).get("/v0/crawl/status/123");
+      expect(response.statusCode).toBe(401);
+    });
+
+    it.concurrent("should return an error response with an invalid API key", async () => {
+      const response = await request(TEST_URL)
+        .get("/v0/crawl/status/123")
+        .set("Authorization", `Bearer invalid-api-key`);
+      expect(response.statusCode).toBe(401);
+    });
+
+    it.concurrent("should return Job not found for invalid job ID", async () => {
+      const response = await request(TEST_URL)
+        .get("/v0/crawl/status/invalidJobId")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(response.statusCode).toBe(404);
+    });
+
+    it.concurrent("should return a successful crawl status response for a valid crawl job", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://mendable.ai/blog" });
+      expect(crawlResponse.statusCode).toBe(200);
+
+      let isCompleted = false;
+      let completedResponse;
+
+      while (!isCompleted) {
+        const response = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+        expect(response.statusCode).toBe(200);
+        expect(response.body).toHaveProperty("status");
+
+        if (response.body.status === "completed") {
+          isCompleted = true;
+          completedResponse = response;
+        } else {
+          await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
+        }
+      }
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].content).toContain("Mendable");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
+
+      const childrenLinks = completedResponse.body.data.filter(doc =>
+        doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
+      );
+
+      expect(childrenLinks.length).toBe(completedResponse.body.data.length);
+    }, 180000); // 180 seconds
+
+    it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post('/v0/crawl')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }});
+      expect(crawlResponse.statusCode).toBe(200);
+
+      let isCompleted = false;
+      let completedResponse;
+
+      while (!isCompleted) {
+        const response = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
+        expect(response.statusCode).toBe(200);
+        expect(response.body).toHaveProperty('status');
+
+        if (response.body.status === 'completed') {
+          isCompleted = true;
+          completedResponse = response;
+        } else {
+          await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
+        }
+      }
+      expect(completedResponse.body.status).toBe('completed');
+      expect(completedResponse.body).toHaveProperty('data');
+      expect(completedResponse.body.data.length).toEqual(1);
+      expect(completedResponse.body.data).toEqual(
+        expect.arrayContaining([
+          expect.objectContaining({
+            content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.')
+          })
+        ])
+      );
+
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
+    }, 180000); // 180 seconds
+
+    it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (2)", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://roastmywebsite.ai",
+          pageOptions: { includeHtml: true },
+        });
+      expect(crawlResponse.statusCode).toBe(200);
+
+      const response = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("status");
+      expect(["active", "waiting"]).toContain(response.body.status);
+
+      let isFinished = false;
+      let completedResponse;
+
+      while (!isFinished) {
+        const response = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+        expect(response.statusCode).toBe(200);
+        expect(response.body).toHaveProperty("status");
+
+        if (response.body.status === "completed") {
+          isFinished = true;
+          completedResponse = response;
+        } else {
+          await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
+        }
+      }
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0]).toHaveProperty("html");
+      expect(completedResponse.body.data[0].content).toContain("_Roast_");
+      expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
+      expect(completedResponse.body.data[0].html).toContain("<h1");
+    }, 60000);
+
+    it.concurrent("should return a successful response for a valid crawl job with allowBackwardCrawling set to true option", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai/blog",
+          pageOptions: { includeHtml: true },
+          crawlerOptions: { allowBackwardCrawling: true },
+        });
+      expect(crawlResponse.statusCode).toBe(200);
+
+      let isFinished = false;
+      let completedResponse;
+
+      while (!isFinished) {
+        const response = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+        expect(response.statusCode).toBe(200);
+        expect(response.body).toHaveProperty("status");
+
+        if (response.body.status === "completed") {
+          isFinished = true;
+          completedResponse = response;
+        } else {
+          await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
+        }
+      }
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0]).toHaveProperty("html");
+      expect(completedResponse.body.data[0].content).toContain("Mendable");
+      expect(completedResponse.body.data[0].markdown).toContain("Mendable");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
+
+      const onlyChildrenLinks = completedResponse.body.data.filter(doc => {
+        return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
+      });
+
+      expect(completedResponse.body.data.length).toBeGreaterThan(onlyChildrenLinks.length);
+    }, 60000);
+
+    it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://jestjs.io" }); + + expect(crawlResponse.statusCode).toBe(200); + + await new Promise((r) => setTimeout(r, 20000)); + + const responseCancel = await request(TEST_URL) + .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(responseCancel.statusCode).toBe(200); + expect(responseCancel.body).toHaveProperty("status"); + expect(responseCancel.body.status).toBe("cancelled"); + + await new Promise((r) => setTimeout(r, 10000)); + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("failed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data).toBeNull(); + expect(completedResponse.body).toHaveProperty("partial_data"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined(); + }, 60000); // 60 seconds + + describe("POST /v0/scrape with LLM Extraction", () => { + it.concurrent("should extract data using LLM extraction mode", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + pageOptions: { + onlyMainContent: true, + }, + extractorOptions: { + mode: "llm-extraction", + extractionPrompt: + "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", + extractionSchema: { + type: "object", + properties: { + company_mission: { + type: "string", + }, + supports_sso: { + type: "boolean", + }, + is_open_source: { + type: "boolean", + }, + }, + required: ["company_mission", "supports_sso", "is_open_source"], + }, + }, + }); + + // Ensure that the job was successfully created before proceeding with LLM extraction + expect(response.statusCode).toBe(200); + + // Assuming the LLM extraction object is available in the response body under `data.llm_extraction` + let llmExtraction = response.body.data.llm_extraction; + + // Check if the llm_extraction object has the required properties with correct types and values + expect(llmExtraction).toHaveProperty("company_mission"); + expect(typeof llmExtraction.company_mission).toBe("string"); + expect(llmExtraction).toHaveProperty("supports_sso"); + expect(llmExtraction.supports_sso).toBe(true); + expect(typeof llmExtraction.supports_sso).toBe("boolean"); + expect(llmExtraction).toHaveProperty("is_open_source"); + expect(llmExtraction.is_open_source).toBe(false); + expect(typeof llmExtraction.is_open_source).toBe("boolean"); + }, 60000); // 60 secs + }); + + // describe("POST /v0/scrape for Top 100 Companies", () => { + // it.concurrent("should extract data for the top 100 companies", async () => { + // const response = await request(TEST_URL) + // .post("/v0/scrape") + // 
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ + // url: "https://companiesmarketcap.com/", + // pageOptions: { + // onlyMainContent: true + // }, + // extractorOptions: { + // mode: "llm-extraction", + // extractionPrompt: "Extract the name, market cap, price, and today's change for the top 20 companies listed on the page.", + // extractionSchema: { + // type: "object", + // properties: { + // companies: { + // type: "array", + // items: { + // type: "object", + // properties: { + // rank: { type: "number" }, + // name: { type: "string" }, + // marketCap: { type: "string" }, + // price: { type: "string" }, + // todayChange: { type: "string" } + // }, + // required: ["rank", "name", "marketCap", "price", "todayChange"] + // } + // } + // }, + // required: ["companies"] + // } + // } + // }); + + // // Print the response body to the console for debugging purposes + // console.log("Response companies:", response.body.data.llm_extraction.companies); + + // // Check if the response has the correct structure and data types + // expect(response.status).toBe(200); + // expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true); + // expect(response.body.data.llm_extraction.companies.length).toBe(40); + + // // Sample check for the first company + // const firstCompany = response.body.data.llm_extraction.companies[0]; + // expect(firstCompany).toHaveProperty("name"); + // expect(typeof firstCompany.name).toBe("string"); + // expect(firstCompany).toHaveProperty("marketCap"); + // expect(typeof firstCompany.marketCap).toBe("string"); + // expect(firstCompany).toHaveProperty("price"); + // expect(typeof firstCompany.price).toBe("string"); + // expect(firstCompany).toHaveProperty("todayChange"); + // expect(typeof firstCompany.todayChange).toBe("string"); + // }, 120000); // 120 secs + // }); + + describe("POST /v0/crawl with fast mode", () => { + it.concurrent("should complete the crawl under 20 seconds", async () => { + const startTime = Date.now(); + + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://flutterbricks.com", + crawlerOptions: { + mode: "fast" + } + }); + + expect(crawlResponse.statusCode).toBe(200); + + const jobId = crawlResponse.body.jobId; + let statusResponse; + let isFinished = false; + + while (!isFinished) { + statusResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(statusResponse.statusCode).toBe(200); + isFinished = statusResponse.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + // const endTime = Date.now(); + // const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds + + // console.log(`Time elapsed: ${timeElapsed} seconds`); + + expect(statusResponse.body.status).toBe("completed"); + expect(statusResponse.body).toHaveProperty("data"); + expect(statusResponse.body.data[0]).toHaveProperty("content"); + expect(statusResponse.body.data[0]).toHaveProperty("markdown"); + expect(statusResponse.body.data[0]).toHaveProperty("metadata"); + expect(statusResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(statusResponse.body.data[0].metadata.pageError).toBeUndefined(); + + const results = 
+        // results.forEach((result, i) => {
+        //   console.log(result.metadata.sourceURL);
+        // });
+        expect(results.length).toBeGreaterThanOrEqual(10);
+        expect(results.length).toBeLessThanOrEqual(15);
+      }, 20000);
+
+      // it.concurrent("should complete the crawl in more than 10 seconds", async () => {
+      //   const startTime = Date.now();
+
+      //   const crawlResponse = await request(TEST_URL)
+      //     .post("/v0/crawl")
+      //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      //     .set("Content-Type", "application/json")
+      //     .send({
+      //       url: "https://flutterbricks.com",
+      //     });
+
+      //   expect(crawlResponse.statusCode).toBe(200);
+
+      //   const jobId = crawlResponse.body.jobId;
+      //   let statusResponse;
+      //   let isFinished = false;
+
+      //   while (!isFinished) {
+      //     statusResponse = await request(TEST_URL)
+      //       .get(`/v0/crawl/status/${jobId}`)
+      //       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+      //     expect(statusResponse.statusCode).toBe(200);
+      //     isFinished = statusResponse.body.status === "completed";
+
+      //     if (!isFinished) {
+      //       await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+      //     }
+      //   }
+
+      //   const endTime = Date.now();
+      //   const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
+
+      //   console.log(`Time elapsed: ${timeElapsed} seconds`);
+
+      //   expect(statusResponse.body.status).toBe("completed");
+      //   expect(statusResponse.body).toHaveProperty("data");
+      //   expect(statusResponse.body.data[0]).toHaveProperty("content");
+      //   expect(statusResponse.body.data[0]).toHaveProperty("markdown");
+      //   const results = statusResponse.body.data;
+      //   // results.forEach((result, i) => {
+      //   //   console.log(result.metadata.sourceURL);
+      //   // });
+      //   expect(results.length).toBeGreaterThanOrEqual(10);
+      //   expect(results.length).toBeLessThanOrEqual(15);
+
+      // }, 50000); // 50 seconds timeout to account for network delays
+    });
+
+    describe("GET /is-production", () => {
+      it.concurrent("should return the production status", async () => {
+        const response = await request(TEST_URL).get("/is-production");
+        expect(response.statusCode).toBe(200);
+        expect(response.body).toHaveProperty("isProduction");
+      });
+    });
+
+    describe("Rate Limiter", () => {
+      it.concurrent("should return 429 when rate limit is exceeded for preview token", async () => {
+        for (let i = 0; i < 5; i++) {
+          const response = await request(TEST_URL)
+            .post("/v0/scrape")
+            .set("Authorization", `Bearer this_is_just_a_preview_token`)
+            .set("Content-Type", "application/json")
+            .send({ url: "https://www.scrapethissite.com" });
+
+          expect(response.statusCode).toBe(200);
+        }
+        const response = await request(TEST_URL)
+          .post("/v0/scrape")
+          .set("Authorization", `Bearer this_is_just_a_preview_token`)
+          .set("Content-Type", "application/json")
+          .send({ url: "https://www.scrapethissite.com" });
+
+        expect(response.statusCode).toBe(429);
+      }, 90000);
+    });
+
+    // it.concurrent("should return 429 when rate limit is exceeded for API key", async () => {
+    //   for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_SCRAPE); i++) {
+    //     const response = await request(TEST_URL)
+    //       .post("/v0/scrape")
+    //       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    //       .set("Content-Type", "application/json")
+    //       .send({ url: "https://www.scrapethissite.com" });
+
+    //     expect(response.statusCode).toBe(200);
+    //   }
+
+    //   const response = await request(TEST_URL)
+    //     .post("/v0/scrape")
+    //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: "https://www.scrapethissite.com" }); + + // expect(response.statusCode).toBe(429); + // }, 60000); + + // it.concurrent("should return 429 when rate limit is exceeded for API key", async () => { + // for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_CRAWL); i++) { + // const response = await request(TEST_URL) + // .post("/v0/crawl") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: "https://www.scrapethissite.com" }); + + // expect(response.statusCode).toBe(200); + // } + + // const response = await request(TEST_URL) + // .post("/v0/crawl") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: "https://www.scrapethissite.com" }); + + // expect(response.statusCode).toBe(429); + // }, 60000); +}); diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 9f04093..b1b0cc0 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -1,10 +1,7 @@ import request from "supertest"; import dotenv from "dotenv"; -import { v4 as uuidv4 } from "uuid"; dotenv.config(); - -// const TEST_URL = 'http://localhost:3002' const TEST_URL = "http://127.0.0.1:3002"; describe("E2E Tests for API Routes", () => { @@ -15,20 +12,12 @@ describe("E2E Tests for API Routes", () => { afterAll(() => { delete process.env.USE_DB_AUTHENTICATION; }); - describe("GET /", () => { - it.concurrent("should return Hello, world! message", async () => { - const response = await request(TEST_URL).get("/"); + describe("GET /is-production", () => { + it.concurrent("should return the production status", async () => { + const response = await request(TEST_URL).get("/is-production"); expect(response.statusCode).toBe(200); - expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io"); - }); - }); - - describe("GET /test", () => { - it.concurrent("should return Hello, world! message", async () => { - const response = await request(TEST_URL).get("/test"); - expect(response.statusCode).toBe(200); - expect(response.text).toContain("Hello, world!"); + expect(response.body).toHaveProperty("isProduction"); }); }); @@ -47,29 +36,6 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error for a blocklisted URL", async () => { - const blocklistedUrl = "https://facebook.com/fake-test"; - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: blocklistedUrl }); - expect(response.statusCode).toBe(403); - expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." 
- ); - }); - - // tested on rate limit test - // it.concurrent("should return a successful response with a valid preview token", async () => { - // const response = await request(TEST_URL) - // .post("/v0/scrape") - // .set("Authorization", `Bearer this_is_just_a_preview_token`) - // .set("Content-Type", "application/json") - // .send({ url: "https://roastmywebsite.ai" }); - // expect(response.statusCode).toBe(200); - // }, 30000); // 30 seconds timeout - it.concurrent("should return a successful response with a valid API key", async () => { const response = await request(TEST_URL) .post("/v0/scrape") @@ -143,21 +109,6 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.metadata.pageError).toBeUndefined(); }, 60000); // 60 seconds - it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } }); - await new Promise((r) => setTimeout(r, 6000)); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj'); - }, 60000); // 60 seconds - it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { const responseWithoutRemoveTags = await request(TEST_URL) .post("/v0/scrape") @@ -192,27 +143,6 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.content).not.toContain("web scraping"); // strong }, 30000); // 30 seconds timeout - // TODO: add this test back once we nail the waitFor option to be more deterministic - // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => { - // const startTime = Date.now(); - // const response = await request(TEST_URL) - // .post("/v0/scrape") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ url: "https://firecrawl.dev", pageOptions: { waitFor: 7000 } }); - // const endTime = Date.now(); - // const duration = endTime - startTime; - - // expect(response.statusCode).toBe(200); - // expect(response.body).toHaveProperty("data"); - // expect(response.body.data).toHaveProperty("content"); - // expect(response.body.data).toHaveProperty("markdown"); - // expect(response.body.data).toHaveProperty("metadata"); - // expect(response.body.data).not.toHaveProperty("html"); - // expect(response.body.data.content).toContain("🔥 Firecrawl"); - // expect(duration).toBeGreaterThanOrEqual(7000); - // }, 12000); // 12 seconds timeout - it.concurrent('should return a successful response for a scrape with 400 page', async () => { const response = await request(TEST_URL) .post('/v0/scrape') @@ -325,19 +255,6 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error for a blocklisted URL", async () => { - const blocklistedUrl = "https://twitter.com/fake-test"; - const response = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ 
url: blocklistedUrl }); - expect(response.statusCode).toBe(403); - expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." - ); - }); - it.concurrent("should return a successful response with a valid API key for crawl", async () => { const response = await request(TEST_URL) .post("/v0/crawl") @@ -350,31 +267,7 @@ describe("E2E Tests for API Routes", () => { /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ ); }); - it.concurrent('should prevent duplicate requests using the same idempotency key', async () => { - const uniqueIdempotencyKey = uuidv4(); - - // First request with the idempotency key - const firstResponse = await request(TEST_URL) - .post('/v0/crawl') - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .set("x-idempotency-key", uniqueIdempotencyKey) - .send({ url: 'https://mendable.ai' }); - - expect(firstResponse.statusCode).toBe(200); - - // Second request with the same idempotency key - const secondResponse = await request(TEST_URL) - .post('/v0/crawl') - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .set("x-idempotency-key", uniqueIdempotencyKey) - .send({ url: 'https://mendable.ai' }); - - expect(secondResponse.statusCode).toBe(409); - expect(secondResponse.body.error).toBe('Idempotency key already used'); - }); - + it.concurrent("should return a successful response with a valid API key and valid includes option", async () => { const crawlResponse = await request(TEST_URL) .post("/v0/crawl") @@ -564,223 +457,6 @@ describe("E2E Tests for API Routes", () => { expect(depth).toBeLessThanOrEqual(2); }); }, 180000); - - it.concurrent("should return a successful response with relative max depth option for a valid crawl job", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.scrapethissite.com/pages/", - crawlerOptions: { maxDepth: 1 }, - }); - expect(crawlResponse.statusCode).toBe(200); - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - // wait for 60 seconds - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - 
expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(1); - - // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 - urls.forEach((url: string) => { - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); - expect(depth).toBeLessThanOrEqual(3); - }); - }, 180000); - - it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => { - - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.mendable.ai", - crawlerOptions: { maxDepth: 0 }, - }); - expect(crawlResponse.statusCode).toBe(200); - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - // wait for 60 seconds - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - const testurls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - //console.log(testurls) - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThanOrEqual(1); - - // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 - urls.forEach((url: string) => { - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 
1 : 0); - expect(depth).toBeLessThanOrEqual(1); - }); - }, 180000); - - - - - - // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => { - // const crawlResponse = await request(TEST_URL) - // .post("/v0/crawl") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ - // url: "https://mendable.ai", - // crawlerOptions: { limit: 10 }, - // }); - - // const response = await request(TEST_URL) - // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - // expect(response.statusCode).toBe(200); - // expect(response.body).toHaveProperty("status"); - // expect(response.body.status).toBe("active"); - - // let isCompleted = false; - // while (!isCompleted) { - // const statusCheckResponse = await request(TEST_URL) - // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - // expect(statusCheckResponse.statusCode).toBe(200); - // isCompleted = statusCheckResponse.body.status === "completed"; - // if (!isCompleted) { - // await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - // } - // } - - // const completedResponse = await request(TEST_URL) - // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - // expect(completedResponse.statusCode).toBe(200); - // expect(completedResponse.body).toHaveProperty("status"); - // expect(completedResponse.body.status).toBe("completed"); - // expect(completedResponse.body).toHaveProperty("data"); - // expect(completedResponse.body.data.length).toBe(10); - // expect(completedResponse.body.data[0]).toHaveProperty("content"); - // expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - // expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - // expect(completedResponse.body.data[0].content).toContain("Mendable"); - // expect(completedResponse.body.data[0].content).not.toContain("main menu"); - // }, 60000); // 60 seconds - - it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://roastmywebsite.ai", - pageOptions: { includeHtml: true }, - }); - expect(crawlResponse.statusCode).toBe(200); - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer 
${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - - // 120 seconds - expect(completedResponse.body.data[0]).toHaveProperty("html"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("_Roast_"); - expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); - expect(completedResponse.body.data[0].html).toContain(" { @@ -798,18 +474,6 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(401); }); - // it.concurrent("should return an error for a blocklisted URL", async () => { - // const blocklistedUrl = "https://instagram.com/fake-test"; - // const response = await request(TEST_URL) - // .post("/v0/crawlWebsitePreview") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ url: blocklistedUrl }); - // // is returning 429 instead of 403 - // expect(response.statusCode).toBe(403); - // expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); - // }); - it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => { const response = await request(TEST_URL) .post("/v0/scrape") @@ -819,19 +483,6 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(408); }, 3000); - - // it.concurrent("should return a successful response with a valid API key for crawlWebsitePreview", async () => { - // const response = await request(TEST_URL) - // .post("/v0/crawlWebsitePreview") - // .set("Authorization", `Bearer this_is_just_a_preview_token`) - // .set("Content-Type", "application/json") - // .send({ url: "https://firecrawl.dev" }); - // expect(response.statusCode).toBe(200); - // expect(response.body).toHaveProperty("jobId"); - // expect(response.body.jobId).toMatch( - // /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ - // ); - // }); }); describe("POST /v0/search", () => { @@ -965,145 +616,42 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); }, 180000); // 120 seconds - - - it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (2)", async () => { + it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { const crawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ - url: "https://roastmywebsite.ai", - pageOptions: { includeHtml: true }, - }); + .send({ url: "https://jestjs.io" }); + expect(crawlResponse.statusCode).toBe(200); - const response = await request(TEST_URL) + await new Promise((r) => setTimeout(r, 20000)); + + const responseCancel = await 
request(TEST_URL) + .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(responseCancel.statusCode).toBe(200); + expect(responseCancel.body).toHaveProperty("status"); + expect(responseCancel.body.status).toBe("cancelled"); + + await new Promise((r) => setTimeout(r, 10000)); + const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - - let isFinished = false; - let completedResponse; - - while (!isFinished) { - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - - if (response.body.status === "completed") { - isFinished = true; - completedResponse = response; - } else { - await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again - } - } expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body.status).toBe("failed"); expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0]).toHaveProperty("html"); - expect(completedResponse.body.data[0].content).toContain("_Roast_"); - expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); - expect(completedResponse.body.data[0].html).toContain(" { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai/blog", - pageOptions: { includeHtml: true }, - crawlerOptions: { allowBackwardCrawling: true }, - }); - expect(crawlResponse.statusCode).toBe(200); - - let isFinished = false; - let completedResponse; - - while (!isFinished) { - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - - if (response.body.status === "completed") { - isFinished = true; - completedResponse = response; - } else { - await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again - } - } - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0]).toHaveProperty("html"); - expect(completedResponse.body.data[0].content).toContain("Mendable"); - expect(completedResponse.body.data[0].markdown).toContain("Mendable"); - 
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - - const onlyChildrenLinks = completedResponse.body.data.filter(doc => { - return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") - }); - - expect(completedResponse.body.data.length).toBeGreaterThan(onlyChildrenLinks.length); - }, 60000); - - it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://jestjs.io" }); - - expect(crawlResponse.statusCode).toBe(200); - - await new Promise((r) => setTimeout(r, 20000)); - - const responseCancel = await request(TEST_URL) - .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(responseCancel.statusCode).toBe(200); - expect(responseCancel.body).toHaveProperty("status"); - expect(responseCancel.body.status).toBe("cancelled"); - - await new Promise((r) => setTimeout(r, 10000)); - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("failed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data).toBeNull(); - expect(completedResponse.body).toHaveProperty("partial_data"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined(); - }, 60000); // 60 seconds + expect(completedResponse.body.data).toBeNull(); + expect(completedResponse.body).toHaveProperty("partial_data"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined(); + }, 60000); // 60 seconds + }); describe("POST /v0/scrape with LLM Extraction", () => { it.concurrent("should extract data using LLM extraction mode", async () => { @@ -1156,64 +704,6 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 secs }); - // describe("POST /v0/scrape for Top 100 Companies", () => { - // it.concurrent("should extract data for the top 100 companies", async () => { - // const response = await request(TEST_URL) - // .post("/v0/scrape") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ - // url: "https://companiesmarketcap.com/", - // pageOptions: { - // onlyMainContent: true - // }, - // extractorOptions: { - // mode: "llm-extraction", - // extractionPrompt: "Extract the name, market cap, price, and today's change for the top 20 companies listed on the 
page.", - // extractionSchema: { - // type: "object", - // properties: { - // companies: { - // type: "array", - // items: { - // type: "object", - // properties: { - // rank: { type: "number" }, - // name: { type: "string" }, - // marketCap: { type: "string" }, - // price: { type: "string" }, - // todayChange: { type: "string" } - // }, - // required: ["rank", "name", "marketCap", "price", "todayChange"] - // } - // } - // }, - // required: ["companies"] - // } - // } - // }); - - // // Print the response body to the console for debugging purposes - // console.log("Response companies:", response.body.data.llm_extraction.companies); - - // // Check if the response has the correct structure and data types - // expect(response.status).toBe(200); - // expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true); - // expect(response.body.data.llm_extraction.companies.length).toBe(40); - - // // Sample check for the first company - // const firstCompany = response.body.data.llm_extraction.companies[0]; - // expect(firstCompany).toHaveProperty("name"); - // expect(typeof firstCompany.name).toBe("string"); - // expect(firstCompany).toHaveProperty("marketCap"); - // expect(typeof firstCompany.marketCap).toBe("string"); - // expect(firstCompany).toHaveProperty("price"); - // expect(typeof firstCompany.price).toBe("string"); - // expect(firstCompany).toHaveProperty("todayChange"); - // expect(typeof firstCompany.todayChange).toBe("string"); - // }, 120000); // 120 secs - // }); - describe("POST /v0/crawl with fast mode", () => { it.concurrent("should complete the crawl under 20 seconds", async () => { const startTime = Date.now(); @@ -1269,122 +759,5 @@ describe("E2E Tests for API Routes", () => { expect(results.length).toBeLessThanOrEqual(15); }, 20000); - - // it.concurrent("should complete the crawl in more than 10 seconds", async () => { - // const startTime = Date.now(); - - // const crawlResponse = await request(TEST_URL) - // .post("/v0/crawl") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ - // url: "https://flutterbricks.com", - // }); - - // expect(crawlResponse.statusCode).toBe(200); - - // const jobId = crawlResponse.body.jobId; - // let statusResponse; - // let isFinished = false; - - // while (!isFinished) { - // statusResponse = await request(TEST_URL) - // .get(`/v0/crawl/status/${jobId}`) - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - // expect(statusResponse.statusCode).toBe(200); - // isFinished = statusResponse.body.status === "completed"; - - // if (!isFinished) { - // await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - // } - // } - - // const endTime = Date.now(); - // const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds - - // console.log(`Time elapsed: ${timeElapsed} seconds`); - - // expect(statusResponse.body.status).toBe("completed"); - // expect(statusResponse.body).toHaveProperty("data"); - // expect(statusResponse.body.data[0]).toHaveProperty("content"); - // expect(statusResponse.body.data[0]).toHaveProperty("markdown"); - // const results = statusResponse.body.data; - // // results.forEach((result, i) => { - // // console.log(result.metadata.sourceURL); - // // }); - // expect(results.length).toBeGreaterThanOrEqual(10); - // expect(results.length).toBeLessThanOrEqual(15); - - // }, 50000);// 15 seconds timeout to account for network delays }); - - describe("GET /is-production", () 
=> { - it.concurrent("should return the production status", async () => { - const response = await request(TEST_URL).get("/is-production"); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("isProduction"); - }); - }); - - describe("Rate Limiter", () => { - it.concurrent("should return 429 when rate limit is exceeded for preview token", async () => { - for (let i = 0; i < 5; i++) { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer this_is_just_a_preview_token`) - .set("Content-Type", "application/json") - .send({ url: "https://www.scrapethissite.com" }); - - expect(response.statusCode).toBe(200); - } - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer this_is_just_a_preview_token`) - .set("Content-Type", "application/json") - .send({ url: "https://www.scrapethissite.com" }); - - expect(response.statusCode).toBe(429); - }, 90000); - }); - - // it.concurrent("should return 429 when rate limit is exceeded for API key", async () => { - // for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_SCRAPE); i++) { - // const response = await request(TEST_URL) - // .post("/v0/scrape") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ url: "https://www.scrapethissite.com" }); - - // expect(response.statusCode).toBe(200); - // } - - // const response = await request(TEST_URL) - // .post("/v0/scrape") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ url: "https://www.scrapethissite.com" }); - - // expect(response.statusCode).toBe(429); - // }, 60000); - - // it.concurrent("should return 429 when rate limit is exceeded for API key", async () => { - // for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_CRAWL); i++) { - // const response = await request(TEST_URL) - // .post("/v0/crawl") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ url: "https://www.scrapethissite.com" }); - - // expect(response.statusCode).toBe(200); - // } - - // const response = await request(TEST_URL) - // .post("/v0/crawl") - // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - // .set("Content-Type", "application/json") - // .send({ url: "https://www.scrapethissite.com" }); - - // expect(response.statusCode).toBe(429); - // }, 60000); }); diff --git a/apps/api/src/controllers/__tests__/crawl.test.ts b/apps/api/src/controllers/__tests__/crawl.test.ts new file mode 100644 index 0000000..621c743 --- /dev/null +++ b/apps/api/src/controllers/__tests__/crawl.test.ts @@ -0,0 +1,47 @@ +import { crawlController } from '../crawl' +import { Request, Response } from 'express'; +import { authenticateUser } from '../auth'; // Ensure this import is correct +import { createIdempotencyKey } from '../../services/idempotency/create'; +import { validateIdempotencyKey } from '../../services/idempotency/validate'; +import { v4 as uuidv4 } from 'uuid'; + +jest.mock('../auth', () => ({ + authenticateUser: jest.fn().mockResolvedValue({ + success: true, + team_id: 'team123', + error: null, + status: 200 + }), + reduce: jest.fn() +})); +jest.mock('../../services/idempotency/validate'); + +describe('crawlController', () => { + it('should prevent duplicate requests using the same idempotency key', async () => { + const req = { + headers: { + 'x-idempotency-key': await uuidv4(), + 
'Authorization': `Bearer ${process.env.TEST_API_KEY}` + }, + body: { + url: 'https://mendable.ai' + } + } as unknown as Request; + const res = { + status: jest.fn().mockReturnThis(), + json: jest.fn() + } as unknown as Response; + + // Mock the idempotency key validation to return false for the second call + (validateIdempotencyKey as jest.Mock).mockResolvedValueOnce(true).mockResolvedValueOnce(false); + + // First request should succeed + await crawlController(req, res); + expect(res.status).not.toHaveBeenCalledWith(409); + + // Second request with the same key should fail + await crawlController(req, res); + expect(res.status).toHaveBeenCalledWith(409); + expect(res.json).toHaveBeenCalledWith({ error: 'Idempotency key already used' }); + }); +}); \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts index c7c54aa..6d38370 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts @@ -7,7 +7,7 @@ import { getAdjustedMaxDepth } from '../utils/maxDepthUtils'; jest.mock('axios'); jest.mock('robots-parser'); -describe('WebCrawler maxDepth and filterLinks', () => { +describe('WebCrawler', () => { let crawler: WebCrawler; const mockAxios = axios as jest.Mocked; const mockRobotsParser = robotsParser as jest.MockedFunction; @@ -156,8 +156,37 @@ describe('WebCrawler maxDepth and filterLinks', () => { ]); }); - - - // Add more tests to cover other scenarios, such as checking includes and excludes + it('should handle allowBackwardCrawling option correctly', async () => { + const initialUrl = 'https://mendable.ai/blog'; + + // Setup the crawler with the specific test case options + const crawler = new WebCrawler({ + initialUrl: initialUrl, + includes: [], + excludes: [], + limit: 100, + maxCrawledDepth: 3, // Example depth + allowBackwardCrawling: true + }); + + // Mock the sitemap fetching function to simulate backward crawling + crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([ + initialUrl, + 'https://mendable.ai', // backward link + initialUrl + '/page1', + initialUrl + '/page1/page2' + ]); + + const results = await crawler.start(); + expect(results).toEqual([ + { url: initialUrl, html: '' }, + { url: 'https://mendable.ai', html: '' }, // Expect the backward link to be included + { url: initialUrl + '/page1', html: '' }, + { url: initialUrl + '/page1/page2', html: '' } + ]); + + // Check that the backward link is included if allowBackwardCrawling is true + expect(results.some(r => r.url === 'https://mendable.ai')).toBe(true); + }); }); diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts new file mode 100644 index 0000000..7966648 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -0,0 +1,24 @@ +jest.mock('../single_url', () => { + const originalModule = jest.requireActual('../single_url'); + originalModule.fetchHtmlContent = jest.fn().mockResolvedValue('Test
</title></head><body><h1>Roast</h1></body></html>
'); + + return originalModule; +}); + +import { scrapSingleUrl } from '../single_url'; +import { PageOptions } from '../../../lib/entities'; + +describe('scrapSingleUrl', () => { + it('should handle includeHtml option correctly', async () => { + const url = 'https://roastmywebsite.ai'; + const pageOptionsWithHtml: PageOptions = { includeHtml: true }; + const pageOptionsWithoutHtml: PageOptions = { includeHtml: false }; + + const resultWithHtml = await scrapSingleUrl(url, pageOptionsWithHtml); + const resultWithoutHtml = await scrapSingleUrl(url, pageOptionsWithoutHtml); + + expect(resultWithHtml.html).toBeDefined(); + expect(resultWithoutHtml.html).toBeUndefined(); + }, 10000); +}); + diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts new file mode 100644 index 0000000..4252525 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts @@ -0,0 +1,89 @@ +import { isUrlBlocked } from '../blocklist'; + +describe('Blocklist Functionality', () => { + describe('isUrlBlocked', () => { + test.each([ + 'https://facebook.com/fake-test', + 'https://x.com/user-profile', + 'https://twitter.com/home', + 'https://instagram.com/explore', + 'https://linkedin.com/in/johndoe', + 'https://pinterest.com/pin/create', + 'https://snapchat.com/add/johndoe', + 'https://tiktok.com/@johndoe', + 'https://reddit.com/r/funny', + 'https://tumblr.com/dashboard', + 'https://flickr.com/photos/johndoe', + 'https://whatsapp.com/download', + 'https://wechat.com/features', + 'https://telegram.org/apps' + ])('should return true for blocklisted URL %s', (url) => { + expect(isUrlBlocked(url)).toBe(true); + }); + + test.each([ + 'https://facebook.com/policy', + 'https://twitter.com/tos', + 'https://instagram.com/about/legal/terms', + 'https://linkedin.com/legal/privacy-policy', + 'https://pinterest.com/about/privacy', + 'https://snapchat.com/legal/terms', + 'https://tiktok.com/legal/privacy-policy', + 'https://reddit.com/policies', + 'https://tumblr.com/policy/en/privacy', + 'https://flickr.com/help/terms', + 'https://whatsapp.com/legal', + 'https://wechat.com/en/privacy-policy', + 'https://telegram.org/tos' + ])('should return false for allowed URLs with keywords %s', (url) => { + expect(isUrlBlocked(url)).toBe(false); + }); + + test('should return false for non-blocklisted domain', () => { + const url = 'https://example.com'; + expect(isUrlBlocked(url)).toBe(false); + }); + + test('should handle invalid URLs gracefully', () => { + const url = 'htp://invalid-url'; + expect(isUrlBlocked(url)).toBe(false); + }); + }); + + test.each([ + 'https://subdomain.facebook.com', + 'https://facebook.com.someotherdomain.com', + 'https://www.facebook.com/profile', + 'https://api.twitter.com/info', + 'https://instagram.com/accounts/login' + ])('should return true for URLs with blocklisted domains in subdomains or paths %s', (url) => { + expect(isUrlBlocked(url)).toBe(true); + }); + + test.each([ + 'https://example.com/facebook.com', + 'https://example.com/redirect?url=https://twitter.com', + 'https://facebook.com.policy.example.com' + ])('should return false for URLs where blocklisted domain is part of another domain or path %s', (url) => { + expect(isUrlBlocked(url)).toBe(false); + }); + + test.each([ + 'https://FACEBOOK.com', + 'https://INSTAGRAM.com/@something' + ])('should handle case variations %s', (url) => { + expect(isUrlBlocked(url)).toBe(true); + }); + + test.each([ + 
'https://facebook.com?redirect=https://example.com', + 'https://twitter.com?query=something' + ])('should handle query parameters %s', (url) => { + expect(isUrlBlocked(url)).toBe(true); + }); + + test('should handle internationalized domain names', () => { + const url = 'https://xn--d1acpjx3f.xn--p1ai'; + expect(isUrlBlocked(url)).toBe(false); + }); +}); \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts index 55930f2..1830265 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts @@ -9,41 +9,11 @@ describe('PDF Processing Module - Integration Test', () => { expect(pageError).toBeUndefined(); }); -// We're hitting the LLAMAPARSE rate limit 🫠 -// it('should download and read a simple PDF file by URL', async () => { -// const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf'); -// expect(pdfContent).toEqual("Dummy PDF file"); -// }); + it('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => { + const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/astro-ph/9301001.pdf', false); + expect(pageStatusCode).toBe(200); + expect(pageError).toBeUndefined(); + expect(content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj'); + }, 60000); // 60 seconds -// it('should download and read a complex PDF file by URL', async () => { -// const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf'); - -// const expectedContent = 'A Comprehensive Overview of Large Language Models\n' + -// ' a a,∗ b,∗ c,d,∗ e,f e,f g,i\n' + -// ' Humza Naveed , Asad Ullah Khan , Shi Qiu , Muhammad Saqib , Saeed Anwar , Muhammad Usman , Naveed Akhtar ,\n' + -// ' Nick Barnes h, Ajmal Mian i\n' + -// ' aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' + -// ' bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' + -// ' cUniversity of Technology Sydney (UTS), Sydney, Australia\n' + -// ' dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' + -// ' eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' + -// ' fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' + -// ' gThe University of Melbourne (UoM), Melbourne, Australia\n' + -// ' hAustralian National University (ANU), Canberra, Australia\n' + -// ' iThe University of Western Australia (UWA), Perth, Australia\n' + -// ' Abstract\n' + -// ' Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' + -// ' beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' + -// ' topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' + -// ' robotics, datasets, benchmarking, efficiency, and more. With the rapid development of techniques and regular breakthroughs in\n' + -// ' LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. 
Considering\n' + -// ' the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' + -// ' yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' + -// ' on a broad range of LLM-related concepts. Our self-contained comprehensive overview of LLMs discusses relevant background\n' + -// ' concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' + -// ' provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' + -// ' extensive informative summaries of the existing works to advance the LLM research.\n' -// expect(pdfContent).toContain(expectedContent); -// }, 60000); - -}); \ No newline at end of file +}); diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index 45d1970..7116963 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -23,6 +23,7 @@ const allowedKeywords = [ 'user-agreement', 'legal', 'help', + 'policies', 'support', 'contact', 'about', @@ -30,25 +31,31 @@ const allowedKeywords = [ 'blog', 'press', 'conditions', + 'tos' ]; export function isUrlBlocked(url: string): boolean { - // Check if the URL contains any allowed keywords - if (allowedKeywords.some(keyword => url.includes(keyword))) { + const lowerCaseUrl = url.toLowerCase(); + + // Check if the URL contains any allowed keywords as whole words + if (allowedKeywords.some(keyword => new RegExp(`\\b${keyword}\\b`, 'i').test(lowerCaseUrl))) { return false; } try { + const urlObj = new URL(url); + const hostname = urlObj.hostname.toLowerCase(); + // Check if the URL matches any domain in the blocklist - return socialMediaBlocklist.some(domain => { - // Create a regular expression to match the exact domain - const domainPattern = new RegExp(`(^|\\.)${domain.replace('.', '\\.')}$`); - // Test the hostname of the URL against the pattern - return domainPattern.test(new URL(url).hostname); + const isBlocked = socialMediaBlocklist.some(domain => { + const domainPattern = new RegExp(`(^|\\.)${domain.replace('.', '\\.')}(\\.|$)`, 'i'); + return domainPattern.test(hostname); }); + + return isBlocked; } catch (e) { // If an error occurs (e.g., invalid URL), return false + console.error(`Error processing URL: ${url}`, e); return false; } } -
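
As a closing sanity check on the reworked isUrlBlocked above, here is a minimal usage sketch (TypeScript, mirroring the new blocklist tests; the import path is shortened for illustration and the URLs are examples only):

    import { isUrlBlocked } from "./blocklist"; // apps/api/src/scraper/WebScraper/utils/blocklist.ts

    // Whole-word keyword matching: the new "tos" and "policies" entries only
    // whitelist genuine policy/ToS pages, not every URL containing those letters.
    console.log(isUrlBlocked("https://telegram.org/tos"));         // false (allowed keyword)
    console.log(isUrlBlocked("https://reddit.com/policies"));      // false (allowed keyword)

    // The domain pattern anchors on label boundaries and is case-insensitive,
    // so subdomains of a blocklisted domain are still caught...
    console.log(isUrlBlocked("https://subdomain.FACEBOOK.com"));   // true

    // ...while hostnames and paths that merely contain the name are not.
    console.log(isUrlBlocked("https://example.com/facebook.com")); // false (hostname is example.com)

The design intent: the keyword whitelist runs first, so legal and policy pages on social domains stay scrapeable, and only then does the label-anchored hostname pattern decide whether the domain itself is blocklisted.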