Nick: cleanup

2024-04-30 12:19:43 -07:00 · 2024-04-30 12:19:43 -07:00 · 4f526cff92
commit 4f526cff92
parent d9d206aff6
5 changed files with 76 additions and 124 deletions
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@ -7,7 +7,6 @@ import { RateLimiterMode } from "../types";
 import { logJob } from "../services/logging/log_job";
 import { Document } from "../lib/entities";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
 import Ajv from 'ajv';
 import { numTokensFromString } from '../lib/LLM-extraction/helpers';
 export async function scrapeHelper(
--- a/apps/api/src/lib/LLM-extraction/helpers.ts
+++ b/apps/api/src/lib/LLM-extraction/helpers.ts
@ -1,18 +1,16 @@
 import { encoding_for_model } from "@dqbd/tiktoken";
 import { TiktokenModel } from "@dqbd/tiktoken";
 // This function calculates the number of tokens in a text string using the tokenizer for the given model
 export function numTokensFromString(message: string, model: string): number {
-    const encoder = encoding_for_model(model as TiktokenModel);
+  const encoder = encoding_for_model(model as TiktokenModel);
-    // Encode the message into tokens
+  // Encode the message into tokens
-    const tokens = encoder.encode(message);
+  const tokens = encoder.encode(message);
-    // Free the encoder resources after use
+  // Free the encoder resources after use
-    encoder.free();
+  encoder.free();
-    // Return the number of tokens
+  // Return the number of tokens
-    return tokens.length;
+  return tokens.length;
 }
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@ -1,53 +1,51 @@
-import Turndown from 'turndown'
+import Turndown from "turndown";
-import OpenAI from 'openai'
+import OpenAI from "openai";
-// import { LlamaModel } from 'node-llama-cpp'
+import Ajv from "ajv";
 import { z } from 'zod'
 import { zodToJsonSchema } from 'zod-to-json-schema'
 import Ajv from 'ajv';
 const ajv = new Ajv(); // Initialize AJV for JSON schema validation
-import {
+import { generateOpenAICompletions } from "./models";
-    ScraperCompletionResult,
+import { Document, ExtractorOptions } from "../entities";
    generateOpenAICompletions,
 } from './models'
 import { Document, ExtractorOptions } from '../entities'
-  // Generate completion using OpenAI
+// Generate completion using OpenAI
 export async function generateCompletions(
-    documents: Document[],
+  documents: Document[],
-    extractionOptions: ExtractorOptions
+  extractionOptions: ExtractorOptions
 ): Promise<Document[]> {
-    // const schema = zodToJsonSchema(options.schema)
+  // const schema = zodToJsonSchema(options.schema)
-    const schema = extractionOptions.extractionSchema;
+  const schema = extractionOptions.extractionSchema;
-    const prompt = extractionOptions.extractionPrompt;
+  const prompt = extractionOptions.extractionPrompt;
-    const switchVariable = "openAI" // Placholder, want to think more about how we abstract the model provider
+  const switchVariable = "openAI"; // Placeholder, want to think more about how we abstract the model provider
  const completions = await Promise.all(
    documents.map(async (document: Document) => {
      switch (switchVariable) {
        case "openAI":
          const llm = new OpenAI();
          const completionResult = await generateOpenAICompletions({
            client: llm,
            document: document,
            schema: schema,
            prompt: prompt,
          });
          // Validate the JSON output against the schema using AJV
          const validate = ajv.compile(schema);
          if (!validate(completionResult.llm_extraction)) {
            //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
            throw new Error(
              `LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. JSON parsing error(s): ${validate.errors
                ?.map((err) => err.message)
                .join(", ")}`
            );
          }
-    const completions = await Promise.all(documents.map(async (document: Document) => {
+          return completionResult;
-        switch (switchVariable) {
+        default:
-            case "openAI":
+          throw new Error("Invalid client");
-                const llm = new OpenAI();
+      }
-                const completionResult = await generateOpenAICompletions({
+    })
-                    client: llm,
+  );
                    document: document,
                    schema: schema,
                    prompt: prompt
                });
                // Validate the JSON output against the schema using AJV
                const validate = ajv.compile(schema);
                if (!validate(completionResult.llm_extraction)) {
                    //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
                    throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
                }
-                return completionResult;
+  return completions;
            default:
                throw new Error('Invalid client');
        }
    }));
    return completions;
 }
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@ -1,115 +1,74 @@
-import OpenAI from 'openai'
+import OpenAI from "openai";
-import { z } from 'zod'
+import { Document } from "../../lib/entities";
 import { Document, ExtractorOptions } from "../../lib/entities";
 import { numTokensFromString } from './helpers';
-// import {
+export type ScraperCompletionResult = {
-//   LlamaModel,
+  data: any | null;
-//   LlamaJsonSchemaGrammar,
+  url: string;
-//   LlamaContext,
+};
 //   LlamaChatSession,
 //   GbnfJsonSchema,
 // } from 'node-llama-cpp'
 // import { JsonSchema7Type } from 'zod-to-json-schema'
 export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
  data: any | null
  url: string
 }
 const defaultPrompt =
-  'You are a professional web scraper. Extract the contents of the webpage'
+  "You are a professional web scraper. Extract the contents of the webpage";
 function prepareOpenAIDoc(
  document: Document
 ): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
  // Check if the markdown content exists in the document
  if (!document.markdown) {
    throw new Error("Markdown content is missing in the document.");
  }
-  return [{ type: 'text', text: document.markdown}]
+  return [{ type: "text", text: document.markdown }];
 }
 export async function generateOpenAICompletions({
  client,
-  model = 'gpt-4-turbo',
+  model = "gpt-4-turbo",
  document,
  schema, //TODO - add zod dynamic type checking
  prompt = defaultPrompt,
-  temperature
+  temperature,
 }: {
-  client: OpenAI,
+  client: OpenAI;
-  model?: string,
+  model?: string;
-  document: Document,
+  document: Document;
-  schema: any, // This should be replaced with a proper Zod schema type when available
+  schema: any; // This should be replaced with a proper Zod schema type when available
-  prompt?: string,
+  prompt?: string;
-  temperature?: number
+  temperature?: number;
 }): Promise<Document> {
-  const openai = client as OpenAI
+  const openai = client as OpenAI;
-  const content = prepareOpenAIDoc(document)
+  const content = prepareOpenAIDoc(document);
  const completion = await openai.chat.completions.create({
    model,
    messages: [
      {
-        role: 'system',
+        role: "system",
        content: prompt,
      },
-      { role: 'user', content },
+      { role: "user", content },
    ],
    tools: [
      {
-        type: 'function',
+        type: "function",
        function: {
-          name: 'extract_content',
+          name: "extract_content",
-          description: 'Extracts the content from the given webpage(s)',
+          description: "Extracts the content from the given webpage(s)",
          parameters: schema,
        },
      },
    ],
-    tool_choice: 'auto',
+    tool_choice: "auto",
    temperature,
-  })
+  });
-  const c = completion.choices[0].message.tool_calls[0].function.arguments
+  const c = completion.choices[0].message.tool_calls[0].function.arguments;
  // Extract the LLM extraction content from the completion response
  const llmExtraction = JSON.parse(c);
 //   console.log("llm extraction: ", llmExtraction);
  // Return the document with the LLM extraction content added
  return {
    ...document,
-    llm_extraction: llmExtraction
+    llm_extraction: llmExtraction,
  };
 }
 // export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
 //   model: LlamaModel,
 //   page: ScraperLoadResult,
 //   schema: JsonSchema7Type,
 //   prompt: string = defaultPrompt,
 //   temperature?: number
 // ): Promise<ScraperCompletionResult<T>> {
 //   const grammar = new LlamaJsonSchemaGrammar(schema as GbnfJsonSchema) as any // any, because it has weird type inference going on
 //   const context = new LlamaContext({ model })
 //   const session = new LlamaChatSession({ context })
 //   const pagePrompt = `${prompt}\n${page.content}`
 //   const result = await session.prompt(pagePrompt, {
 //     grammar,
 //     temperature,
 //   })
 //   const parsed = grammar.parse(result)
 //   return {
 //     data: parsed,
 //     url: page.url,
 //   }
 // }
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@ -195,8 +195,6 @@ export class WebScraperDataProvider {
        documents = documents.concat(pdfDocuments);
        if(this.extractorOptions.mode === "llm-extraction") {
          const llm = new OpenAI()
          documents = await generateCompletions(
            documents,
            this.extractorOptions