diff --git a/apps/api/package.json b/apps/api/package.json index 078c6b6..047feaf 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -46,11 +46,12 @@ "@bull-board/api": "^5.14.2", "@bull-board/express": "^5.8.0", "@devil7softwares/pos": "^1.0.2", - "@dqbd/tiktoken": "^1.0.7", + "@dqbd/tiktoken": "^1.0.13", "@logtail/node": "^0.4.12", "@nangohq/node": "^0.36.33", "@sentry/node": "^7.48.0", "@supabase/supabase-js": "^2.7.1", + "ajv": "^8.12.0", "async": "^3.2.5", "async-mutex": "^0.4.0", "axios": "^1.3.4", @@ -68,6 +69,7 @@ "gpt3-tokenizer": "^1.1.5", "ioredis": "^5.3.2", "joplin-turndown-plugin-gfm": "^1.0.12", + "json-schema-to-zod": "^2.1.0", "keyword-extractor": "^0.0.25", "langchain": "^0.1.25", "languagedetect": "^2.0.0", @@ -93,7 +95,9 @@ "unstructured-client": "^0.9.4", "uuid": "^9.0.1", "wordpos": "^2.1.0", - "xml2js": "^0.6.2" + "xml2js": "^0.6.2", + "zod": "^3.23.4", + "zod-to-json-schema": "^3.23.0" }, "nodemonConfig": { "ignore": [ diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 2b61222..bd5e37b 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -21,7 +21,7 @@ dependencies: specifier: ^1.0.2 version: 1.0.2 '@dqbd/tiktoken': - specifier: ^1.0.7 + specifier: ^1.0.13 version: 1.0.13 '@logtail/node': specifier: ^0.4.12 @@ -35,6 +35,9 @@ dependencies: '@supabase/supabase-js': specifier: ^2.7.1 version: 2.39.7 + ajv: + specifier: ^8.12.0 + version: 8.12.0 async: specifier: ^3.2.5 version: 3.2.5 @@ -86,6 +89,9 @@ dependencies: joplin-turndown-plugin-gfm: specifier: ^1.0.12 version: 1.0.12 + json-schema-to-zod: + specifier: ^2.1.0 + version: 2.1.0 keyword-extractor: specifier: ^0.0.25 version: 0.0.25 @@ -164,6 +170,12 @@ dependencies: xml2js: specifier: ^0.6.2 version: 0.6.2 + zod: + specifier: ^3.23.4 + version: 3.23.4 + zod-to-json-schema: + specifier: ^3.23.0 + version: 3.23.0(zod@3.23.4) devDependencies: '@flydotio/dockerfile': @@ -1200,7 +1212,7 @@ packages: redis: 4.6.13 typesense: 1.7.2(@babel/runtime@7.24.0) uuid: 9.0.1 - zod: 3.22.4 + zod: 3.23.4 transitivePeerDependencies: - encoding dev: false @@ -1218,8 +1230,8 @@ packages: p-queue: 6.6.2 p-retry: 4.6.2 uuid: 9.0.1 - zod: 3.22.4 - zod-to-json-schema: 3.22.4(zod@3.22.4) + zod: 3.23.4 + zod-to-json-schema: 3.23.0(zod@3.23.4) dev: false /@langchain/openai@0.0.18: @@ -1229,8 +1241,8 @@ packages: '@langchain/core': 0.1.43 js-tiktoken: 1.0.10 openai: 4.28.4 - zod: 3.22.4 - zod-to-json-schema: 3.22.4(zod@3.22.4) + zod: 3.23.4 + zod-to-json-schema: 3.23.0(zod@3.23.4) transitivePeerDependencies: - encoding dev: false @@ -1811,6 +1823,15 @@ packages: humanize-ms: 1.2.1 dev: false + /ajv@8.12.0: + resolution: {integrity: sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==} + dependencies: + fast-deep-equal: 3.1.3 + json-schema-traverse: 1.0.0 + require-from-string: 2.0.2 + uri-js: 4.4.1 + dev: false + /ansi-escapes@4.3.2: resolution: {integrity: sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==} engines: {node: '>=8'} @@ -2917,6 +2938,10 @@ packages: - supports-color dev: false + /fast-deep-equal@3.1.3: + resolution: {integrity: sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==} + dev: false + /fast-fifo@1.3.2: resolution: {integrity: sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==} dev: false @@ -3985,6 +4010,15 @@ packages: /json-parse-even-better-errors@2.3.1: resolution: {integrity: 
sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==} + /json-schema-to-zod@2.1.0: + resolution: {integrity: sha512-7ishNgYY+AbIKeeHcp5xCOdJbdVwSfDx/4V2ktc16LUusCJJbz2fEKdWUmAxhKIiYzhZ9Fp4E8OsAoM/h9cOLA==} + hasBin: true + dev: false + + /json-schema-traverse@1.0.0: + resolution: {integrity: sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==} + dev: false + /json5@2.2.3: resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==} engines: {node: '>=6'} @@ -4209,8 +4243,8 @@ packages: redis: 4.6.13 uuid: 9.0.1 yaml: 2.4.1 - zod: 3.22.4 - zod-to-json-schema: 3.22.4(zod@3.22.4) + zod: 3.23.4 + zod-to-json-schema: 3.23.0(zod@3.23.4) transitivePeerDependencies: - '@aws-crypto/sha256-js' - '@aws-sdk/client-bedrock-agent-runtime' @@ -5069,7 +5103,7 @@ packages: sbd: 1.0.19 typescript: 5.4.5 uuid: 9.0.1 - zod: 3.22.4 + zod: 3.23.4 transitivePeerDependencies: - debug dev: false @@ -5250,6 +5284,11 @@ packages: resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==} engines: {node: '>=0.10.0'} + /require-from-string@2.0.2: + resolution: {integrity: sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==} + engines: {node: '>=0.10.0'} + dev: false + /resolve-cwd@3.0.0: resolution: {integrity: sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==} engines: {node: '>=8'} @@ -5956,6 +5995,12 @@ packages: picocolors: 1.0.0 dev: true + /uri-js@4.4.1: + resolution: {integrity: sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==} + dependencies: + punycode: 2.3.1 + dev: false + /urlpattern-polyfill@10.0.0: resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==} dev: false @@ -6185,14 +6230,18 @@ packages: engines: {node: '>=10'} dev: true - /zod-to-json-schema@3.22.4(zod@3.22.4): - resolution: {integrity: sha512-2Ed5dJ+n/O3cU383xSY28cuVi0BCQhF8nYqWU5paEpl7fVdqdAmiLdqLyfblbNdfOFwFfi/mqU4O1pwc60iBhQ==} + /zod-to-json-schema@3.23.0(zod@3.23.4): + resolution: {integrity: sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==} peerDependencies: - zod: ^3.22.4 + zod: ^3.23.3 dependencies: - zod: 3.22.4 + zod: 3.23.4 dev: false /zod@3.22.4: resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==} dev: false + + /zod@3.23.4: + resolution: {integrity: sha512-/AtWOKbBgjzEYYQRNfoGKHObgfAZag6qUJX1VbHo2PRBgS+wfWagEY2mizjfyAPcGesrJOcx/wcl0L9WnVrHFw==} + dev: false diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index 271e848..356fe76 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -199,7 +199,8 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl"); + + }, 60000); // 60 seconds }); diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 
2b4c7e9..c6c59bc 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -8,255 +8,370 @@ dotenv.config(); const TEST_URL = "http://127.0.0.1:3002"; - describe("E2E Tests for API Routes", () => { - beforeAll(() => { - process.env.USE_DB_AUTHENTICATION = "true"; - }); +describe("E2E Tests for API Routes", () => { + beforeAll(() => { + process.env.USE_DB_AUTHENTICATION = "true"; + }); - afterAll(() => { - delete process.env.USE_DB_AUTHENTICATION; - }); - describe("GET /", () => { - it("should return Hello, world! message", async () => { - const response = await request(TEST_URL).get("/"); + afterAll(() => { + delete process.env.USE_DB_AUTHENTICATION; + }); + describe("GET /", () => { + it("should return Hello, world! message", async () => { + const response = await request(TEST_URL).get("/"); - expect(response.statusCode).toBe(200); - expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io"); - }); - }); - - describe("GET /test", () => { - it("should return Hello, world! message", async () => { - const response = await request(TEST_URL).get("/test"); - expect(response.statusCode).toBe(200); - expect(response.text).toContain("Hello, world!"); - }); - }); - - describe("POST /v0/scrape", () => { - it("should require authorization", async () => { - const response = await request(app).post("/v0/scrape"); - expect(response.statusCode).toBe(401); - }); - - it("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); - - it("should return an error for a blocklisted URL", async () => { - const blocklistedUrl = "https://facebook.com/fake-test"; - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: blocklistedUrl }); - expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. 
We're actively working on building support for it."); - }); - - it("should return a successful response with a valid preview token", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer this_is_just_a_preview_token`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(200); - }, 10000); // 10 seconds timeout - - it("should return a successful response with a valid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("content"); - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data.content).toContain("🔥 FireCrawl"); - }, 30000); // 30 seconds timeout - }); - - describe("POST /v0/crawl", () => { - it("should require authorization", async () => { - const response = await request(TEST_URL).post("/v0/crawl"); - expect(response.statusCode).toBe(401); - }); - - it("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); - - it("should return an error for a blocklisted URL", async () => { - const blocklistedUrl = "https://twitter.com/fake-test"; - const response = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: blocklistedUrl }); - expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); - }); - - it("should return a successful response with a valid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("jobId"); - expect(response.body.jobId).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ - ); - }); - - - // Additional tests for insufficient credits? 
- }); - - describe("POST /v0/crawlWebsitePreview", () => { - it("should require authorization", async () => { - const response = await request(TEST_URL).post( - "/v0/crawlWebsitePreview" - ); - expect(response.statusCode).toBe(401); - }); - - it("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/crawlWebsitePreview") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); - - it("should return an error for a blocklisted URL", async () => { - const blocklistedUrl = "https://instagram.com/fake-test"; - const response = await request(TEST_URL) - .post("/v0/crawlWebsitePreview") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: blocklistedUrl }); - expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); - }); - - it("should return a successful response with a valid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/crawlWebsitePreview") - .set("Authorization", `Bearer this_is_just_a_preview_token`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("jobId"); - expect(response.body.jobId).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ - ); - }); - }); - - describe("POST /v0/search", () => { - it("should require authorization", async () => { - const response = await request(TEST_URL).post("/v0/search"); - expect(response.statusCode).toBe(401); - }); - - it("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/search") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ query: "test" }); - expect(response.statusCode).toBe(401); - }); - - - - it("should return a successful response with a valid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/search") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ query: "test" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success"); - expect(response.body.success).toBe(true); - expect(response.body).toHaveProperty("data"); - }, 30000); // 30 seconds timeout - }); - - describe("GET /v0/crawl/status/:jobId", () => { - it("should require authorization", async () => { - const response = await request(TEST_URL).get("/v0/crawl/status/123"); - expect(response.statusCode).toBe(401); - }); - - it("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .get("/v0/crawl/status/123") - .set("Authorization", `Bearer invalid-api-key`); - expect(response.statusCode).toBe(401); - }); - - it("should return Job not found for invalid job ID", async () => { - const response = await request(TEST_URL) - .get("/v0/crawl/status/invalidJobId") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(404); - }); - - it("should return a successful response for a valid crawl job", async () => { - const 
crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(crawlResponse.statusCode).toBe(200); - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); - - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 30000)); - - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain( - "🔥 FireCrawl" - ); - }, 60000); // 60 seconds - }); - - describe("GET /is-production", () => { - it("should return the production status", async () => { - const response = await request(TEST_URL).get("/is-production"); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("isProduction"); - }); + expect(response.statusCode).toBe(200); + expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io"); }); }); + + describe("GET /test", () => { + it("should return Hello, world! message", async () => { + const response = await request(TEST_URL).get("/test"); + expect(response.statusCode).toBe(200); + expect(response.text).toContain("Hello, world!"); + }); + }); + + describe("POST /v0/scrape", () => { + it("should require authorization", async () => { + const response = await request(app).post("/v0/scrape"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://facebook.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. 
We're actively working on building support for it."); + }); + + it("should return a successful response with a valid preview token", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer this_is_just_a_preview_token`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + }, 10000); // 10 seconds timeout + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain("🔥 FireCrawl"); + }, 30000); // 30 seconds timeout + }); + + describe("POST /v0/crawl", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).post("/v0/crawl"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://twitter.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + }); + + + // Additional tests for insufficient credits? 
+ }); + + describe("POST /v0/crawlWebsitePreview", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).post( + "/v0/crawlWebsitePreview" + ); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://instagram.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer this_is_just_a_preview_token`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + }); + }); + + describe("POST /v0/search", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).post("/v0/search"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(401); + }); + + + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success"); + expect(response.body.success).toBe(true); + expect(response.body).toHaveProperty("data"); + }, 30000); // 30 seconds timeout + }); + + describe("GET /v0/crawl/status/:jobId", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).get("/v0/crawl/status/123"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .get("/v0/crawl/status/123") + .set("Authorization", `Bearer invalid-api-key`); + expect(response.statusCode).toBe(401); + }); + + it("should return Job not found for invalid job ID", async () => { + const response = await request(TEST_URL) + .get("/v0/crawl/status/invalidJobId") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(404); + }); + + it("should return a successful response for a valid crawl job", async () => { + const 
crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 30000)); + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain( + "🔥 FireCrawl" + ); + }, 60000); // 60 seconds + }); + + describe("POST /v0/scrape with LLM Extraction", () => { + it("should extract data using LLM extraction mode", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + pageOptions: { + onlyMainContent: true + }, + extractorOptions: { + mode: "llm-extraction", + extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", + extractionSchema: { + type: "object", + properties: { + company_mission: { + type: "string" + }, + supports_sso: { + type: "boolean" + }, + is_open_source: { + type: "boolean" + } + }, + required: ["company_mission", "supports_sso", "is_open_source"] + } + } + }); + + + // Ensure that the job was successfully created before proceeding with LLM extraction + expect(response.statusCode).toBe(200); + + + + // Assuming the LLM extraction object is available in the response body under `data.llm_extraction` + let llmExtraction = response.body.data.llm_extraction; + + // Check if the llm_extraction object has the required properties with correct types and values + expect(llmExtraction).toHaveProperty("company_mission"); + expect(typeof llmExtraction.company_mission).toBe("string"); + expect(llmExtraction).toHaveProperty("supports_sso"); + expect(llmExtraction.supports_sso).toBe(true); + expect(typeof llmExtraction.supports_sso).toBe("boolean"); + expect(llmExtraction).toHaveProperty("is_open_source"); + expect(llmExtraction.is_open_source).toBe(false); + expect(typeof llmExtraction.is_open_source).toBe("boolean"); + }, 60000); // 60 secs + }); + + // describe("POST /v0/scrape for Top 100 Companies", () => { + // it("should extract data for the top 100 companies", async () => { + // const response = await request(TEST_URL) + // .post("/v0/scrape") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ + // url: "https://companiesmarketcap.com/", + // pageOptions: { + // onlyMainContent: true + // }, + // extractorOptions: { + // 
mode: "llm-extraction", + // extractionPrompt: "Extract the name, market cap, price, and today's change for the top 20 companies listed on the page.", + // extractionSchema: { + // type: "object", + // properties: { + // companies: { + // type: "array", + // items: { + // type: "object", + // properties: { + // rank: { type: "number" }, + // name: { type: "string" }, + // marketCap: { type: "string" }, + // price: { type: "string" }, + // todayChange: { type: "string" } + // }, + // required: ["rank", "name", "marketCap", "price", "todayChange"] + // } + // } + // }, + // required: ["companies"] + // } + // } + // }); + + + // // Print the response body to the console for debugging purposes + // console.log("Response companies:", response.body.data.llm_extraction.companies); + + // // Check if the response has the correct structure and data types + // expect(response.status).toBe(200); + // expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true); + // expect(response.body.data.llm_extraction.companies.length).toBe(40); + + // // Sample check for the first company + // const firstCompany = response.body.data.llm_extraction.companies[0]; + // expect(firstCompany).toHaveProperty("name"); + // expect(typeof firstCompany.name).toBe("string"); + // expect(firstCompany).toHaveProperty("marketCap"); + // expect(typeof firstCompany.marketCap).toBe("string"); + // expect(firstCompany).toHaveProperty("price"); + // expect(typeof firstCompany.price).toBe("string"); + // expect(firstCompany).toHaveProperty("todayChange"); + // expect(typeof firstCompany.todayChange).toBe("string"); + // }, 120000); // 120 secs + // }); + + + + + describe("GET /is-production", () => { + it("should return the production status", async () => { + const response = await request(TEST_URL).get("/is-production"); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("isProduction"); + }); + }); +}); diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index eebdcb4..849500a 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -1,3 +1,4 @@ +import { ExtractorOptions } from './../lib/entities'; import { Request, Response } from "express"; import { WebScraperDataProvider } from "../scraper/WebScraper"; import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; @@ -6,12 +7,14 @@ import { RateLimiterMode } from "../types"; import { logJob } from "../services/logging/log_job"; import { Document } from "../lib/entities"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function +import { numTokensFromString } from '../lib/LLM-extraction/helpers'; export async function scrapeHelper( req: Request, team_id: string, crawlerOptions: any, - pageOptions: any + pageOptions: any, + extractorOptions: ExtractorOptions ): Promise<{ success: boolean; error?: string; @@ -27,6 +30,7 @@ export async function scrapeHelper( return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. 
We're actively working on building support for it.", returnCode: 403 };
   }
+
   const a = new WebScraperDataProvider();
   await a.setOptions({
     mode: "single_urls",
@@ -35,6 +39,7 @@ export async function scrapeHelper(
       ...crawlerOptions,
     },
     pageOptions: pageOptions,
+    extractorOptions: extractorOptions
   });

   const docs = await a.getDocuments(false);
@@ -46,9 +51,17 @@ export async function scrapeHelper(
     return { success: true, error: "No page found", returnCode: 200 };
   }
+
+  let creditsToBeBilled = filteredDocs.length;
+  const creditsPerLLMExtract = 5;
+
+  if (extractorOptions.mode === "llm-extraction"){
+    creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length)
+  }
+
   const billingResult = await billTeam(
     team_id,
-    filteredDocs.length
+    creditsToBeBilled
   );
   if (!billingResult.success) {
     return {
@@ -79,6 +92,9 @@ export async function scrapeController(req: Request, res: Response) {
     }
     const crawlerOptions = req.body.crawlerOptions ?? {};
     const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+    const extractorOptions = req.body.extractorOptions ?? {
+      mode: "markdown"
+    }
     const origin = req.body.origin ?? "api";

     try {
@@ -96,10 +112,13 @@ export async function scrapeController(req: Request, res: Response) {
       req,
       team_id,
       crawlerOptions,
-      pageOptions
+      pageOptions,
+      extractorOptions
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
+    const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
+
     logJob({
       success: result.success,
       message: result.error,
@@ -111,7 +130,9 @@
       url: req.body.url,
       crawlerOptions: crawlerOptions,
       pageOptions: pageOptions,
-      origin: origin,
+      origin: origin,
+      extractor_options: extractorOptions,
+      num_tokens: numTokens
     });
     return res.status(result.returnCode).json(result);
   } catch (error) {
diff --git a/apps/api/src/lib/LLM-extraction/helpers.ts b/apps/api/src/lib/LLM-extraction/helpers.ts
new file mode 100644
index 0000000..f47a6b3
--- /dev/null
+++ b/apps/api/src/lib/LLM-extraction/helpers.ts
@@ -0,0 +1,16 @@
+import { encoding_for_model } from "@dqbd/tiktoken";
+import { TiktokenModel } from "@dqbd/tiktoken";
+
+// This function calculates the number of tokens in a text string using the specified tiktoken model
+export function numTokensFromString(message: string, model: string): number {
+  const encoder = encoding_for_model(model as TiktokenModel);
+
+  // Encode the message into tokens
+  const tokens = encoder.encode(message);
+
+  // Free the encoder resources after use
+  encoder.free();
+
+  // Return the number of tokens
+  return tokens.length;
+}
diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts
new file mode 100644
index 0000000..86e2f90
--- /dev/null
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -0,0 +1,51 @@
+import Turndown from "turndown";
+import OpenAI from "openai";
+import Ajv from "ajv";
+const ajv = new Ajv(); // Initialize AJV for JSON schema validation
+
+import { generateOpenAICompletions } from "./models";
+import { Document, ExtractorOptions } from "../entities";
+
+// Generate completion using OpenAI
+export async function generateCompletions(
+  documents: Document[],
+  extractionOptions: ExtractorOptions
+): Promise<Document[]> {
+  // const schema = zodToJsonSchema(options.schema)
+
+  const schema = extractionOptions.extractionSchema;
+  const prompt = extractionOptions.extractionPrompt;
+
+  const switchVariable = "openAI"; // Placeholder, want to think more about how we abstract the model provider
+
+  const completions = await Promise.all(
+    documents.map(async (document: Document) => {
+      switch (switchVariable) {
+        case "openAI":
+          const llm = new OpenAI();
+          const completionResult = await generateOpenAICompletions({
+            client: llm,
+            document: document,
+            schema: schema,
+            prompt: prompt,
+          });
+          // Validate the JSON output against the schema using AJV
+          const validate = ajv.compile(schema);
+          if (!validate(completionResult.llm_extraction)) {
+            //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
+            throw new Error(
+              `JSON parsing error(s): ${validate.errors
+                ?.map((err) => err.message)
+                .join(", ")}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
+            );
+          }
+
+          return completionResult;
+        default:
+          throw new Error("Invalid client");
+      }
+    })
+  );
+
+  return completions;
+}
diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
new file mode 100644
index 0000000..ec8a710
--- /dev/null
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -0,0 +1,76 @@
+import OpenAI from "openai";
+import { Document } from "../../lib/entities";
+
+export type ScraperCompletionResult = {
+  data: any | null;
+  url: string;
+};
+
+const defaultPrompt =
+  "You are a professional web scraper. Extract the contents of the webpage";
+
+function prepareOpenAIDoc(
+  document: Document
+): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
+  // Check if the markdown content exists in the document
+  if (!document.markdown) {
+    throw new Error(
+      "Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
+    );
+  }
+
+  return [{ type: "text", text: document.markdown }];
+}
+
+export async function generateOpenAICompletions({
+  client,
+  model = "gpt-4-turbo",
+  document,
+  schema, //TODO - add zod dynamic type checking
+  prompt = defaultPrompt,
+  temperature,
+}: {
+  client: OpenAI;
+  model?: string;
+  document: Document;
+  schema: any; // This should be replaced with a proper Zod schema type when available
+  prompt?: string;
+  temperature?: number;
+}): Promise<Document> {
+  const openai = client as OpenAI;
+  const content = prepareOpenAIDoc(document);
+
+  const completion = await openai.chat.completions.create({
+    model,
+    messages: [
+      {
+        role: "system",
+        content: prompt,
+      },
+      { role: "user", content },
+    ],
+    tools: [
+      {
+        type: "function",
+        function: {
+          name: "extract_content",
+          description: "Extracts the content from the given webpage(s)",
+          parameters: schema,
+        },
+      },
+    ],
+    tool_choice: "auto",
+    temperature,
+  });
+
+  const c = completion.choices[0].message.tool_calls[0].function.arguments;
+
+  // Extract the LLM extraction content from the completion response
+  const llmExtraction = JSON.parse(c);
+
+  // Return the document with the LLM extraction content added
+  return {
+    ...document,
+    llm_extraction: llmExtraction,
+  };
+}
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 7b46305..4008785 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -16,6 +16,12 @@
 };

+export type ExtractorOptions = {
+  mode: "markdown" | "llm-extraction";
+  extractionPrompt?: string;
+  extractionSchema?: Record<string, any>;
+}
+
 export type SearchOptions = {
   limit?: number;
   tbs?: string;
@@ -38,6 +44,7 @@ export type WebScraperOptions = {
     replaceAllPathsWithAbsolutePaths?: boolean;
   };
   pageOptions?: PageOptions;
+  extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
 };

@@ -50,6 +57,8 @@ export class Document {
   url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
+  html?: string;
+  llm_extraction?: Record<string, any>;
   createdAt?: Date;
   updatedAt?: Date;
   type?: string;
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 386dfb2..fef5f69 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -1,4 +1,4 @@
-import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
+import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities";
 import { Progress } from "../../lib/entities";
 import { scrapSingleUrl } from "./single_url";
 import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
@@ -7,6 +7,9 @@ import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/imageDescription";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
+import OpenAI from 'openai'
+import { generateCompletions } from "../../lib/LLM-extraction";
+

 export class WebScraperDataProvider {
   private urls: string[] = [""];
@@ -19,6 +22,7 @@ export class WebScraperDataProvider {
   private concurrentRequests: number = 20;
   private generateImgAltText: boolean = false;
   private pageOptions?: PageOptions;
+  private extractorOptions?: ExtractorOptions;
   private replaceAllPathsWithAbsolutePaths?: boolean = false;
   private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";
@@ -36,8 +40,7 @@ export class WebScraperDataProvider {
   ): Promise<Document[]> {
     const totalUrls = urls.length;
     let processedUrls = 0;
-    console.log("Converting urls to documents");
-    console.log("Total urls", urls);
+
     const results: (Document | null)[] = new Array(urls.length).fill(null);
     for (let i = 0; i < urls.length; i += this.concurrentRequests) {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
@@ -192,6 +195,13 @@
       documents = await this.getSitemapData(baseUrl, documents);
       documents = documents.concat(pdfDocuments);

+      if(this.extractorOptions.mode === "llm-extraction") {
+        documents = await generateCompletions(
+          documents,
+          this.extractorOptions
+        )
+      }
+
       await this.setCachedDocuments(documents);
       documents = this.removeChildLinks(documents);
       documents = documents.splice(0, this.limit);
@@ -377,6 +387,7 @@
     this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false;
     this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
+    this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index ff73e95..fab54bd 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -106,7 +106,6 @@ export async function scrapSingleUrl(
   toMarkdown: boolean = true,
   pageOptions: PageOptions = { onlyMainContent: true }
 ): Promise<Document> {
-  console.log(`Scraping URL: ${urlToScrap}`);
   urlToScrap = urlToScrap.trim();

   const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
@@ -170,6 +169,8 @@
       }
       break;
     }
+
+    //* TODO: add an option to return markdown or structured/extracted content
     let cleanedHtml = removeUnwantedElements(text, pageOptions);

     return [await parseMarkdown(cleanedHtml), text];
diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts
index 639b3a8..92a1dc1 100644
--- a/apps/api/src/services/logging/log_job.ts
+++ b/apps/api/src/services/logging/log_job.ts
@@ -1,3 +1,4 @@
+import { ExtractorOptions } from './../../lib/entities';
 import { supabase_service } from "../supabase";
 import { FirecrawlJob } from "../../types";
 import "dotenv/config";
@@ -8,6 +9,8 @@ export async function logJob(job: FirecrawlJob) {
   if (process.env.ENV !== "production") {
     return;
   }
+
+
   const { data, error } = await supabase_service
     .from("firecrawl_jobs")
     .insert([
@@ -23,6 +26,8 @@
         crawler_options: job.crawlerOptions,
         page_options: job.pageOptions,
         origin: job.origin,
+        extractor_options: job.extractor_options,
+        num_tokens: job.num_tokens
       },
     ]);
   if (error) {
diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts
index c65140c..c1858f1 100644
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@@ -1,3 +1,5 @@
+import { ExtractorOptions } from "./lib/entities";
+
 export interface CrawlResult {
   source: string;
   content: string;
@@ -37,6 +39,8 @@ export interface FirecrawlJob {
   crawlerOptions?: any;
   pageOptions?: any;
   origin: string;
+  extractor_options?: ExtractorOptions,
+  num_tokens?: number
 }

 export enum RateLimiterMode {