Nick: cleanup

2024-04-30 12:19:43 -07:00 · 2024-04-30 12:19:43 -07:00 · 4f526cff92
commit 4f526cff92
parent d9d206aff6
5 changed files with 76 additions and 124 deletions
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -7,7 +7,6 @@ import { RateLimiterMode } from "../types";
 import { logJob } from "../services/logging/log_job";
 import { Document } from "../lib/entities";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
-import Ajv from 'ajv';
 import { numTokensFromString } from '../lib/LLM-extraction/helpers';

 export async function scrapeHelper(
--- a/apps/api/src/lib/LLM-extraction/helpers.ts
+++ b/apps/api/src/lib/LLM-extraction/helpers.ts
@@ -1,18 +1,16 @@
-
-
 import { encoding_for_model } from "@dqbd/tiktoken";
 import { TiktokenModel } from "@dqbd/tiktoken";

 // This function calculates the number of tokens in a text string using GPT-3.5-turbo model
 export function numTokensFromString(message: string, model: string): number {
-    const encoder = encoding_for_model(model as TiktokenModel);
+  const encoder = encoding_for_model(model as TiktokenModel);

-    // Encode the message into tokens
-    const tokens = encoder.encode(message);
+  // Encode the message into tokens
+  const tokens = encoder.encode(message);

-    // Free the encoder resources after use
-    encoder.free();
+  // Free the encoder resources after use
+  encoder.free();

-    // Return the number of tokens
-    return tokens.length;
-}
+  // Return the number of tokens
+  return tokens.length;
+}
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -1,53 +1,51 @@
-import Turndown from 'turndown'
-import OpenAI from 'openai'
-// import { LlamaModel } from 'node-llama-cpp'
-import { z } from 'zod'
-import { zodToJsonSchema } from 'zod-to-json-schema'
-import Ajv from 'ajv';
+import Turndown from "turndown";
+import OpenAI from "openai";
+import Ajv from "ajv";
 const ajv = new Ajv(); // Initialize AJV for JSON schema validation

-import {
-    ScraperCompletionResult,
-    generateOpenAICompletions,
-} from './models'
-import { Document, ExtractorOptions } from '../entities'
+import { generateOpenAICompletions } from "./models";
+import { Document, ExtractorOptions } from "../entities";

-  // Generate completion using OpenAI
+// Generate completion using OpenAI
 export async function generateCompletions(
-    documents: Document[],
-    extractionOptions: ExtractorOptions
+  documents: Document[],
+  extractionOptions: ExtractorOptions
 ): Promise<Document[]> {
-    // const schema = zodToJsonSchema(options.schema)
+  // const schema = zodToJsonSchema(options.schema)

-    const schema = extractionOptions.extractionSchema;
-    const prompt = extractionOptions.extractionPrompt;
+  const schema = extractionOptions.extractionSchema;
+  const prompt = extractionOptions.extractionPrompt;

-    const switchVariable = "openAI" // Placholder, want to think more about how we abstract the model provider
+  const switchVariable = "openAI"; // Placeholder, want to think more about how we abstract the model provider

+  const completions = await Promise.all(
+    documents.map(async (document: Document) => {
+      switch (switchVariable) {
+        case "openAI":
+          const llm = new OpenAI();
+          const completionResult = await generateOpenAICompletions({
+            client: llm,
+            document: document,
+            schema: schema,
+            prompt: prompt,
+          });
+          // Validate the JSON output against the schema using AJV
+          const validate = ajv.compile(schema);
+          if (!validate(completionResult.llm_extraction)) {
+            //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
+            throw new Error(
+              `LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. JSON parsing error(s): ${validate.errors
+                ?.map((err) => err.message)
+                .join(", ")}`
+            );
+          }

-    const completions = await Promise.all(documents.map(async (document: Document) => {
-        switch (switchVariable) {
-            case "openAI":
-                const llm = new OpenAI();
-                const completionResult = await generateOpenAICompletions({
-                    client: llm,
-                    document: document,
-                    schema: schema,
-                    prompt: prompt
-                });
-                // Validate the JSON output against the schema using AJV
-                const validate = ajv.compile(schema);
-                if (!validate(completionResult.llm_extraction)) {
-                    //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
-                    throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
-                }
+          return completionResult;
+        default:
+          throw new Error("Invalid client");
+      }
+    })
+  );

-                return completionResult;
-            default:
-                throw new Error('Invalid client');
-        }
-    }));
-    
-
-    return completions;
+  return completions;
 }
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -1,115 +1,74 @@
-import OpenAI from 'openai'
-import { z } from 'zod'
-import { Document, ExtractorOptions } from "../../lib/entities";
-import { numTokensFromString } from './helpers';
+import OpenAI from "openai";
+import { Document } from "../../lib/entities";

-// import {
-//   LlamaModel,
-//   LlamaJsonSchemaGrammar,
-//   LlamaContext,
-//   LlamaChatSession,
-//   GbnfJsonSchema,
-// } from 'node-llama-cpp'
-// import { JsonSchema7Type } from 'zod-to-json-schema'
-
-export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
-  data: any | null
-  url: string
-}
+export type ScraperCompletionResult = {
+  data: any | null;
+  url: string;
+};

 const defaultPrompt =
-  'You are a professional web scraper. Extract the contents of the webpage'
+  "You are a professional web scraper. Extract the contents of the webpage";

 function prepareOpenAIDoc(
  document: Document
 ): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
-
  // Check if the markdown content exists in the document
  if (!document.markdown) {
    throw new Error("Markdown content is missing in the document.");
  }

-  return [{ type: 'text', text: document.markdown}]
+  return [{ type: "text", text: document.markdown }];
 }

 export async function generateOpenAICompletions({
  client,
-  model = 'gpt-4-turbo',
+  model = "gpt-4-turbo",
  document,
  schema, //TODO - add zod dynamic type checking
  prompt = defaultPrompt,
-  temperature
+  temperature,
 }: {
-  client: OpenAI,
-  model?: string,
-  document: Document,
-  schema: any, // This should be replaced with a proper Zod schema type when available
-  prompt?: string,
-  temperature?: number
+  client: OpenAI;
+  model?: string;
+  document: Document;
+  schema: any; // This should be replaced with a proper Zod schema type when available
+  prompt?: string;
+  temperature?: number;
 }): Promise<Document> {
-  const openai = client as OpenAI
-  const content = prepareOpenAIDoc(document)
-
+  const openai = client as OpenAI;
+  const content = prepareOpenAIDoc(document);

  const completion = await openai.chat.completions.create({
    model,
    messages: [
      {
-        role: 'system',
+        role: "system",
        content: prompt,
      },
-      { role: 'user', content },
+      { role: "user", content },
    ],
    tools: [
      {
-        type: 'function',
+        type: "function",
        function: {
-          name: 'extract_content',
-          description: 'Extracts the content from the given webpage(s)',
+          name: "extract_content",
+          description: "Extracts the content from the given webpage(s)",
          parameters: schema,
        },
      },
    ],
-    tool_choice: 'auto',
+    tool_choice: "auto",
    temperature,
-  })
+  });
+
+  const c = completion.choices[0].message.tool_calls[0].function.arguments;

-  const c = completion.choices[0].message.tool_calls[0].function.arguments
-  
  // Extract the LLM extraction content from the completion response
  const llmExtraction = JSON.parse(c);

-//   console.log("llm extraction: ", llmExtraction);
-
-
  // Return the document with the LLM extraction content added
  return {
    ...document,
-    llm_extraction: llmExtraction
+    llm_extraction: llmExtraction,
  };
-   
 }
-
-// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
-//   model: LlamaModel,
-//   page: ScraperLoadResult,
-//   schema: JsonSchema7Type,
-//   prompt: string = defaultPrompt,
-//   temperature?: number
-// ): Promise<ScraperCompletionResult<T>> {
-//   const grammar = new LlamaJsonSchemaGrammar(schema as GbnfJsonSchema) as any // any, because it has weird type inference going on
-//   const context = new LlamaContext({ model })
-//   const session = new LlamaChatSession({ context })
-//   const pagePrompt = `${prompt}\n${page.content}`
-
-//   const result = await session.prompt(pagePrompt, {
-//     grammar,
-//     temperature,
-//   })
-
-//   const parsed = grammar.parse(result)
-//   return {
-//     data: parsed,
-//     url: page.url,
-//   }
-// }
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -195,8 +195,6 @@ export class WebScraperDataProvider {
        documents = documents.concat(pdfDocuments);

        if(this.extractorOptions.mode === "llm-extraction") {
-
-          const llm = new OpenAI()
          documents = await generateCompletions(
            documents,
            this.extractorOptions