From 4f7737c922d05e7a34c83468796ec5959c9528c5 Mon Sep 17 00:00:00 2001
From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com>
Date: Mon, 29 Apr 2024 12:12:55 -0700
Subject: [PATCH] Caleb: added ajv json schema validation.

---
 apps/api/package.json                        |  1 +
 apps/api/pnpm-lock.yaml                      | 31 +++++++++++++++++++
 .../src/__tests__/e2e_withAuth/index.test.ts |  5 +++
 apps/api/src/controllers/scrape.ts           |  4 ++-
 apps/api/src/lib/LLM-extraction/index.ts     | 13 +++++++-
 apps/api/src/lib/LLM-extraction/models.ts    |  2 +-
 apps/api/src/lib/entities.ts                 |  2 +-
 7 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/apps/api/package.json b/apps/api/package.json
index 0f826da..00ce1bb 100644
--- a/apps/api/package.json
+++ b/apps/api/package.json
@@ -51,6 +51,7 @@
     "@nangohq/node": "^0.36.33",
     "@sentry/node": "^7.48.0",
     "@supabase/supabase-js": "^2.7.1",
+    "ajv": "^8.12.0",
     "async": "^3.2.5",
     "async-mutex": "^0.4.0",
     "axios": "^1.3.4",
diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml
index d72dad0..8062354 100644
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
@@ -35,6 +35,9 @@ dependencies:
   '@supabase/supabase-js':
     specifier: ^2.7.1
     version: 2.39.7
+  ajv:
+    specifier: ^8.12.0
+    version: 8.12.0
   async:
     specifier: ^3.2.5
     version: 3.2.5
@@ -1820,6 +1823,15 @@
       humanize-ms: 1.2.1
     dev: false
 
+  /ajv@8.12.0:
+    resolution: {integrity: sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==}
+    dependencies:
+      fast-deep-equal: 3.1.3
+      json-schema-traverse: 1.0.0
+      require-from-string: 2.0.2
+      uri-js: 4.4.1
+    dev: false
+
   /ansi-escapes@4.3.2:
     resolution: {integrity: sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==}
     engines: {node: '>=8'}
@@ -2926,6 +2938,10 @@
       - supports-color
     dev: false
 
+  /fast-deep-equal@3.1.3:
+    resolution: {integrity: sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==}
+    dev: false
+
   /fast-fifo@1.3.2:
     resolution: {integrity: sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==}
     dev: false
@@ -3999,6 +4015,10 @@
     hasBin: true
     dev: false
 
+  /json-schema-traverse@1.0.0:
+    resolution: {integrity: sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==}
+    dev: false
+
   /json5@2.2.3:
     resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==}
     engines: {node: '>=6'}
@@ -5264,6 +5284,11 @@
     resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==}
     engines: {node: '>=0.10.0'}
 
+  /require-from-string@2.0.2:
+    resolution: {integrity: sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==}
+    engines: {node: '>=0.10.0'}
+    dev: false
+
   /resolve-cwd@3.0.0:
     resolution: {integrity: sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==}
     engines: {node: '>=8'}
@@ -5970,6 +5995,12 @@
       picocolors: 1.0.0
     dev: true
 
+  /uri-js@4.4.1:
+    resolution: {integrity: sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==}
+    dependencies:
+      punycode: 2.3.1
+    dev: false
+
   /urlpattern-polyfill@10.0.0:
     resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==}
     dev: false
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 9e3fed1..4a47638 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -285,6 +285,11 @@ describe("E2E Tests for API Routes", () => {
       });
 
+      // Ensure that the job was successfully created before proceeding with LLM extraction
+      expect(response.statusCode).toBe(200);
+
+
+      // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
       let llmExtraction = response.body.data.llm_extraction;
 
 
 
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index 13c4dd2..43b8ca4 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -7,13 +7,14 @@ import { RateLimiterMode } from "../types";
 import { logJob } from "../services/logging/log_job";
 import { Document } from "../lib/entities";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
+import Ajv from 'ajv';
 
 export async function scrapeHelper(
   req: Request,
   team_id: string,
   crawlerOptions: any,
   pageOptions: any,
-  extractorOptions: any
+  extractorOptions: ExtractorOptions
 ): Promise<{
   success: boolean;
   error?: string;
@@ -29,6 +30,7 @@ export async function scrapeHelper(
     return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
   }
 
+
   const a = new WebScraperDataProvider();
   await a.setOptions({
     mode: "single_urls",
diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts
index d221498..237fdbe 100644
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -3,6 +3,8 @@ import OpenAI from 'openai'
 // import { LlamaModel } from 'node-llama-cpp'
 import { z } from 'zod'
 import { zodToJsonSchema } from 'zod-to-json-schema'
+import Ajv from 'ajv';
+const ajv = new Ajv(); // Initialize AJV for JSON schema validation
 
 import {
   ScraperCompletionResult,
@@ -22,20 +24,29 @@ export async function generateCompletions(
   const switchVariable = "openAI" // Placeholder, want to think more about how we abstract the model provider
 
+
   const completions = await Promise.all(documents.map(async (document: Document) => {
     switch (switchVariable) {
       case "openAI":
         const llm = new OpenAI();
-        return await generateOpenAICompletions({
+        const completionResult = await generateOpenAICompletions({
           client: llm,
           document: document,
           schema: schema,
           prompt: prompt
         });
+        // Validate the JSON output against the schema using AJV
+        const validate = ajv.compile(schema);
+        if (!validate(completionResult.llm_extraction)) {
+          throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an error on our side. Try adjusting your prompt, and if it still doesn't work, reach out to support.
+AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
+        }
+        return completionResult;
       default:
         throw new Error('Invalid client');
     }
   }));
 
+
   return completions;
 }
 
diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
index 71daa27..9114511 100644
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -31,7 +31,7 @@ function prepareOpenAIDoc(
   return [{ type: 'text', text: document.markdown }]
 }
 
-export async function generateOpenAICompletions({
+export async function generateOpenAICompletions({
   client,
   model = 'gpt-3.5-turbo',
   document,
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index c492c4d..4ceab63 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -57,7 +57,7 @@ export class Document {
   url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
-  llm_extraction?: string;
+  llm_extraction?: Record<string, any>;
   createdAt?: Date;
   updatedAt?: Date;
   type?: string;
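
The ajv dependency added in package.json above is the JSON Schema validator that backs the new check in generateCompletions: compile() turns a JSON Schema into a reusable validation function, and failures are recorded on validate.errors. A minimal sketch of that API, with a made-up schema and payload purely for illustration:

import Ajv from "ajv";

const ajv = new Ajv();

// Hypothetical schema describing the shape we want back from the LLM.
const schema = {
  type: "object",
  properties: {
    company_mission: { type: "string" },
    is_open_source: { type: "boolean" },
  },
  required: ["company_mission"],
  additionalProperties: false,
};

// compile() returns a typed guard function; violations land on validate.errors.
const validate = ajv.compile(schema);

const candidate = { company_mission: "Make websites LLM-ready", is_open_source: true };
if (!validate(candidate)) {
  console.error(validate.errors?.map((err) => err.message).join(", "));
}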
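
The change in LLM-extraction/index.ts compiles the caller-supplied extraction schema and rejects any completion that does not conform. A condensed sketch of that round trip, assuming the caller defines the schema with zod; the zod step and the field names are illustrative, since generateCompletions itself receives the schema as plain JSON Schema:

import Ajv from "ajv";
import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";

const ajv = new Ajv();

// Hypothetical extraction schema a caller might define.
const companyInfo = z.object({
  company_mission: z.string(),
  supports_sso: z.boolean(),
});

// The JSON Schema form that is passed to the model and later compiled by Ajv.
const jsonSchema = zodToJsonSchema(companyInfo);
const validate = ajv.compile(jsonSchema);

// Stand-in for completionResult.llm_extraction returned by generateOpenAICompletions.
const llmExtraction: unknown = { company_mission: "Crawl any site", supports_sso: "yes" };

if (!validate(llmExtraction)) {
  // Mirrors the error raised in generateCompletions: surface Ajv's messages to the caller.
  throw new Error(
    "LLM extraction did not match the provided schema. AJV error: " +
      validate.errors?.map((err) => err.message).join(", ")
  );
}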
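
With llm_extraction widened from string to a Record in entities.ts, consumers receive the parsed extraction object directly rather than a JSON string. A small sketch of reading it on the consumer side; the ScrapedDoc stand-in type and the field name are illustrative, not part of the patch:

// Minimal stand-in for the fields of the Document entity touched by this patch.
interface ScrapedDoc {
  content: string;
  markdown?: string;
  llm_extraction?: Record<string, any>;
}

function readMission(doc: ScrapedDoc): string | undefined {
  // The extraction arrives as an already-validated object, so no JSON.parse step is needed.
  return doc.llm_extraction?.company_mission;
}

const doc: ScrapedDoc = {
  content: "Example page content",
  llm_extraction: { company_mission: "Crawl any site" },
};

console.log(readMission(doc)); // -> "Crawl any site"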