From 4f526cff9212c6cc58917884a268c1d687957965 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Tue, 30 Apr 2024 12:19:43 -0700
Subject: [PATCH] Nick: cleanup

---
 apps/api/src/controllers/scrape.ts         |  1 -
 apps/api/src/lib/LLM-extraction/helpers.ts | 18 ++--
 apps/api/src/lib/LLM-extraction/index.ts   | 82 +++++++++---------
 apps/api/src/lib/LLM-extraction/models.ts  | 97 +++++++----------------
 apps/api/src/scraper/WebScraper/index.ts   |  2 -
 5 files changed, 76 insertions(+), 124 deletions(-)

diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index c42f451..852d9b0 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -7,7 +7,6 @@ import { RateLimiterMode } from "../types";
 import { logJob } from "../services/logging/log_job";
 import { Document } from "../lib/entities";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
-import Ajv from 'ajv';
 import { numTokensFromString } from '../lib/LLM-extraction/helpers';
 
 export async function scrapeHelper(
diff --git a/apps/api/src/lib/LLM-extraction/helpers.ts b/apps/api/src/lib/LLM-extraction/helpers.ts
index 2535964..f47a6b3 100644
--- a/apps/api/src/lib/LLM-extraction/helpers.ts
+++ b/apps/api/src/lib/LLM-extraction/helpers.ts
@@ -1,18 +1,16 @@
-
-
 import { encoding_for_model } from "@dqbd/tiktoken";
 import { TiktokenModel } from "@dqbd/tiktoken";
 
 // This function calculates the number of tokens in a text string using GPT-3.5-turbo model
 export function numTokensFromString(message: string, model: string): number {
-    const encoder = encoding_for_model(model as TiktokenModel);
+  const encoder = encoding_for_model(model as TiktokenModel);
 
-    // Encode the message into tokens
-    const tokens = encoder.encode(message);
+  // Encode the message into tokens
+  const tokens = encoder.encode(message);
 
-    // Free the encoder resources after use
-    encoder.free();
+  // Free the encoder resources after use
+  encoder.free();
 
-    // Return the number of tokens
-    return tokens.length;
-}
\ No newline at end of file
+  // Return the number of tokens
+  return tokens.length;
+}
diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts
index 9fae79d..0f156d2 100644
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -1,53 +1,51 @@
-import Turndown from 'turndown'
-import OpenAI from 'openai'
-// import { LlamaModel } from 'node-llama-cpp'
-import { z } from 'zod'
-import { zodToJsonSchema } from 'zod-to-json-schema'
-import Ajv from 'ajv';
+import Turndown from "turndown";
+import OpenAI from "openai";
+import Ajv from "ajv";
 const ajv = new Ajv(); // Initialize AJV for JSON schema validation
 
-import {
-  ScraperCompletionResult,
-  generateOpenAICompletions,
-} from './models'
-import { Document, ExtractorOptions } from '../entities'
+import { generateOpenAICompletions } from "./models";
+import { Document, ExtractorOptions } from "../entities";
 
-  // Generate completion using OpenAI
+// Generate completion using OpenAI
 export async function generateCompletions(
-    documents: Document[],
-    extractionOptions: ExtractorOptions
+  documents: Document[],
+  extractionOptions: ExtractorOptions
 ): Promise<Document[]> {
-    // const schema = zodToJsonSchema(options.schema)
+  // const schema = zodToJsonSchema(options.schema)
 
-    const schema = extractionOptions.extractionSchema;
-    const prompt = extractionOptions.extractionPrompt;
+  const schema = extractionOptions.extractionSchema;
+  const prompt = extractionOptions.extractionPrompt;
 
-    const switchVariable = "openAI" // Placholder, want to think more about how we abstract the model provider
+  const switchVariable = "openAI"; // Placeholder, want to think more about how we abstract the model provider
 
+  const completions = await Promise.all(
+    documents.map(async (document: Document) => {
+      switch (switchVariable) {
+        case "openAI":
+          const llm = new OpenAI();
+          const completionResult = await generateOpenAICompletions({
+            client: llm,
+            document: document,
+            schema: schema,
+            prompt: prompt,
+          });
+          // Validate the JSON output against the schema using AJV
+          const validate = ajv.compile(schema);
+          if (!validate(completionResult.llm_extraction)) {
+            //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
+            throw new Error(
+              `LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. JSON parsing error(s): ${validate.errors
+                ?.map((err) => err.message)
+                .join(", ")}`
+            );
+          }
 
-  const completions = await Promise.all(documents.map(async (document: Document) => {
-    switch (switchVariable) {
-      case "openAI":
-        const llm = new OpenAI();
-        const completionResult = await generateOpenAICompletions({
-          client: llm,
-          document: document,
-          schema: schema,
-          prompt: prompt
-        });
-        // Validate the JSON output against the schema using AJV
-        const validate = ajv.compile(schema);
-        if (!validate(completionResult.llm_extraction)) {
-          //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
-          throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
-        }
+          return completionResult;
+        default:
+          throw new Error("Invalid client");
+      }
+    })
+  );
 
-        return completionResult;
-      default:
-        throw new Error('Invalid client');
-    }
-  }));
-
-
-    return completions;
+  return completions;
 }
diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
index df1b6d1..d60979e 100644
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -1,115 +1,74 @@
-import OpenAI from 'openai'
-import { z } from 'zod'
-import { Document, ExtractorOptions } from "../../lib/entities";
-import { numTokensFromString } from './helpers';
+import OpenAI from "openai";
+import { Document } from "../../lib/entities";
 
-// import {
-//   LlamaModel,
-//   LlamaJsonSchemaGrammar,
-//   LlamaContext,
-//   LlamaChatSession,
-//   GbnfJsonSchema,
-// } from 'node-llama-cpp'
-// import { JsonSchema7Type } from 'zod-to-json-schema'
-
-export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
-  data: any | null
-  url: string
-}
+export type ScraperCompletionResult = {
+  data: any | null;
+  url: string;
+};
 
 const defaultPrompt =
-  'You are a professional web scraper. Extract the contents of the webpage'
+  "You are a professional web scraper. Extract the contents of the webpage";
 
 function prepareOpenAIDoc(
   document: Document
 ): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
-  // Check if the markdown content exists in the document
   if (!document.markdown) {
     throw new Error("Markdown content is missing in the document.");
   }
 
-  return [{ type: 'text', text: document.markdown}]
+  return [{ type: "text", text: document.markdown }];
 }
 
 export async function generateOpenAICompletions({
   client,
-  model = 'gpt-4-turbo',
+  model = "gpt-4-turbo",
   document,
   schema, //TODO - add zod dynamic type checking
   prompt = defaultPrompt,
-  temperature
+  temperature,
 }: {
-  client: OpenAI,
-  model?: string,
-  document: Document,
-  schema: any, // This should be replaced with a proper Zod schema type when available
-  prompt?: string,
-  temperature?: number
+  client: OpenAI;
+  model?: string;
+  document: Document;
+  schema: any; // This should be replaced with a proper Zod schema type when available
+  prompt?: string;
+  temperature?: number;
 }): Promise<Document> {
-  const openai = client as OpenAI
-  const content = prepareOpenAIDoc(document)
-
+  const openai = client as OpenAI;
+  const content = prepareOpenAIDoc(document);
 
   const completion = await openai.chat.completions.create({
     model,
     messages: [
       {
-        role: 'system',
+        role: "system",
         content: prompt,
       },
-      { role: 'user', content },
+      { role: "user", content },
     ],
     tools: [
       {
-        type: 'function',
+        type: "function",
         function: {
-          name: 'extract_content',
-          description: 'Extracts the content from the given webpage(s)',
+          name: "extract_content",
+          description: "Extracts the content from the given webpage(s)",
          parameters: schema,
         },
       },
     ],
-    tool_choice: 'auto',
+    tool_choice: "auto",
     temperature,
-  })
+  });
+
+  const c = completion.choices[0].message.tool_calls[0].function.arguments;
 
-  const c = completion.choices[0].message.tool_calls[0].function.arguments
-
   // Extract the LLM extraction content from the completion response
   const llmExtraction = JSON.parse(c);
 
-// console.log("llm extraction: ", llmExtraction);
-
   // Return the document with the LLM extraction content added
   return {
     ...document,
-    llm_extraction: llmExtraction
+    llm_extraction: llmExtraction,
   };
-
 }
-
-// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
-//   model: LlamaModel,
-//   page: ScraperLoadResult,
-//   schema: JsonSchema7Type,
-//   prompt: string = defaultPrompt,
-//   temperature?: number
-// ): Promise<ScraperCompletionResult<T>> {
-//   const grammar = new LlamaJsonSchemaGrammar(schema as GbnfJsonSchema) as any // any, because it has weird type inference going on
-//   const context = new LlamaContext({ model })
-//   const session = new LlamaChatSession({ context })
-//   const pagePrompt = `${prompt}\n${page.content}`
-
-//   const result = await session.prompt(pagePrompt, {
-//     grammar,
-//     temperature,
-//   })
-
-//   const parsed = grammar.parse(result)
-//   return {
-//     data: parsed,
-//     url: page.url,
-//   }
-// }
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 0bd1a82..a56f8ff 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -195,8 +195,6 @@ export class WebScraperDataProvider {
     documents = documents.concat(pdfDocuments);
 
     if(this.extractorOptions.mode === "llm-extraction") {
-
-      const llm = new OpenAI()
       documents = await generateCompletions(
         documents,
         this.extractorOptions
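
A minimal usage sketch of the pipeline this patch leaves behind (not part of the commit; the import paths are shortened, the schema, markdown, and logged output are illustrative only, and OPENAI_API_KEY is assumed to be set, since generateCompletions constructs its own OpenAI client for each document):

import { generateCompletions } from "./lib/LLM-extraction";
import { Document, ExtractorOptions } from "./lib/entities";

// Illustrative options: any AJV-compatible JSON Schema works here.
const extractorOptions: ExtractorOptions = {
  mode: "llm-extraction",
  extractionPrompt: "Extract the post title and author from the page.",
  extractionSchema: {
    type: "object",
    properties: {
      title: { type: "string" },
      author: { type: "string" },
    },
    required: ["title"],
  },
};

// Documents normally come from WebScraperDataProvider; a stub with
// markdown content is enough, since prepareOpenAIDoc only reads markdown.
const docs = [{ markdown: "# Hello World\nBy Jane Doe" } as Document];

// Each returned document carries llm_extraction, already validated
// against extractionSchema by AJV; a schema mismatch throws instead of returning.
generateCompletions(docs, extractorOptions).then((results) => {
  console.log(results[0].llm_extraction); // e.g. { title: "Hello World", author: "Jane Doe" }
});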