From 4f526cff9212c6cc58917884a268c1d687957965 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Tue, 30 Apr 2024 12:19:43 -0700
Subject: [PATCH] Nick: cleanup

---
 apps/api/src/controllers/scrape.ts         |  1 -
 apps/api/src/lib/LLM-extraction/helpers.ts | 18 ++--
 apps/api/src/lib/LLM-extraction/index.ts   | 82 +++++++++---------
 apps/api/src/lib/LLM-extraction/models.ts  | 97 +++++++----------------
 apps/api/src/scraper/WebScraper/index.ts   |  2 -
 5 files changed, 76 insertions(+), 124 deletions(-)

diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index c42f451..852d9b0 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -7,7 +7,6 @@ import { RateLimiterMode } from "../types";
 import { logJob } from "../services/logging/log_job";
 import { Document } from "../lib/entities";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
-import Ajv from 'ajv';
 import { numTokensFromString } from '../lib/LLM-extraction/helpers';
 
 export async function scrapeHelper(
diff --git a/apps/api/src/lib/LLM-extraction/helpers.ts b/apps/api/src/lib/LLM-extraction/helpers.ts
index 2535964..f47a6b3 100644
--- a/apps/api/src/lib/LLM-extraction/helpers.ts
+++ b/apps/api/src/lib/LLM-extraction/helpers.ts
@@ -1,18 +1,16 @@
-
-
 import { encoding_for_model } from "@dqbd/tiktoken";
 import { TiktokenModel } from "@dqbd/tiktoken";
 
 // This function calculates the number of tokens in a text string using GPT-3.5-turbo model
 export function numTokensFromString(message: string, model: string): number {
-    const encoder = encoding_for_model(model as TiktokenModel);
+  const encoder = encoding_for_model(model as TiktokenModel);
 
-    // Encode the message into tokens
-    const tokens = encoder.encode(message);
+  // Encode the message into tokens
+  const tokens = encoder.encode(message);
 
-    // Free the encoder resources after use
-    encoder.free();
+  // Free the encoder resources after use
+  encoder.free();
 
-    // Return the number of tokens
-    return tokens.length;
-}
\ No newline at end of file
+  // Return the number of tokens
+  return tokens.length;
+}
diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts
index 9fae79d..0f156d2 100644
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -1,53 +1,51 @@
-import Turndown from 'turndown'
-import OpenAI from 'openai'
-// import { LlamaModel } from 'node-llama-cpp'
-import { z } from 'zod'
-import { zodToJsonSchema } from 'zod-to-json-schema'
-import Ajv from 'ajv';
+import Turndown from "turndown";
+import OpenAI from "openai";
+import Ajv from "ajv";
 const ajv = new Ajv(); // Initialize AJV for JSON schema validation
 
-import {
-  ScraperCompletionResult,
-  generateOpenAICompletions,
-} from './models'
-import { Document, ExtractorOptions } from '../entities'
+import { generateOpenAICompletions } from "./models";
+import { Document, ExtractorOptions } from "../entities";
 
-  // Generate completion using OpenAI
+// Generate completion using OpenAI
 export async function generateCompletions(
-    documents: Document[],
-    extractionOptions: ExtractorOptions
+  documents: Document[],
+  extractionOptions: ExtractorOptions
 ): Promise<Document[]> {
-    // const schema = zodToJsonSchema(options.schema)
+  // const schema = zodToJsonSchema(options.schema)
 
-    const schema = extractionOptions.extractionSchema;
-    const prompt = extractionOptions.extractionPrompt;
+  const schema = extractionOptions.extractionSchema;
+  const prompt = extractionOptions.extractionPrompt;
 
-    const switchVariable = "openAI" // Placholder, want to think more about how we abstract the model provider
+  const switchVariable = "openAI"; // Placeholder, want to think more about how we abstract the model provider
 
+  const completions = await Promise.all(
+    documents.map(async (document: Document) => {
+      switch (switchVariable) {
+        case "openAI":
+          const llm = new OpenAI();
+          const completionResult = await generateOpenAICompletions({
+            client: llm,
+            document: document,
+            schema: schema,
+            prompt: prompt,
+          });
+          // Validate the JSON output against the schema using AJV
+          const validate = ajv.compile(schema);
+          if (!validate(completionResult.llm_extraction)) {
+            //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
+            throw new Error(
+              `LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. JSON parsing error(s): ${validate.errors
+                ?.map((err) => err.message)
+                .join(", ")}`
+            );
+          }
 
-  const completions = await Promise.all(documents.map(async (document: Document) => {
-    switch (switchVariable) {
-      case "openAI":
-        const llm = new OpenAI();
-        const completionResult = await generateOpenAICompletions({
-          client: llm,
-          document: document,
-          schema: schema,
-          prompt: prompt
-        });
-        // Validate the JSON output against the schema using AJV
-        const validate = ajv.compile(schema);
-        if (!validate(completionResult.llm_extraction)) {
-          //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
-          throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
-        }
+          return completionResult;
+        default:
+          throw new Error("Invalid client");
+      }
+    })
+  );
 
-        return completionResult;
-      default:
-        throw new Error('Invalid client');
-    }
-  }));
-
-
-    return completions;
+  return completions;
 }
diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
index df1b6d1..d60979e 100644
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -1,115 +1,74 @@
-import OpenAI from 'openai'
-import { z } from 'zod'
-import { Document, ExtractorOptions } from "../../lib/entities";
-import { numTokensFromString } from './helpers';
+import OpenAI from "openai";
+import { Document } from "../../lib/entities";
 
-// import {
-//   LlamaModel,
-//   LlamaJsonSchemaGrammar,
-//   LlamaContext,
-//   LlamaChatSession,
-//   GbnfJsonSchema,
-// } from 'node-llama-cpp'
-// import { JsonSchema7Type } from 'zod-to-json-schema'
-
-export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
-  data: any | null
-  url: string
-}
+export type ScraperCompletionResult = {
+  data: any | null;
+  url: string;
+};
 
 const defaultPrompt =
-  'You are a professional web scraper. Extract the contents of the webpage'
+  "You are a professional web scraper. Extract the contents of the webpage";
 
 function prepareOpenAIDoc(
   document: Document
 ): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
-  // Check if the markdown content exists in the document
   if (!document.markdown) {
     throw new Error("Markdown content is missing in the document.");
   }
 
-  return [{ type: 'text', text: document.markdown}]
+  return [{ type: "text", text: document.markdown }];
 }
 
 export async function generateOpenAICompletions({
   client,
-  model = 'gpt-4-turbo',
+  model = "gpt-4-turbo",
   document,
   schema, //TODO - add zod dynamic type checking
   prompt = defaultPrompt,
-  temperature
+  temperature,
 }: {
-  client: OpenAI,
-  model?: string,
-  document: Document,
-  schema: any, // This should be replaced with a proper Zod schema type when available
-  prompt?: string,
-  temperature?: number
+  client: OpenAI;
+  model?: string;
+  document: Document;
+  schema: any; // This should be replaced with a proper Zod schema type when available
+  prompt?: string;
+  temperature?: number;
 }): Promise<Document> {
-  const openai = client as OpenAI
-  const content = prepareOpenAIDoc(document)
-
+  const openai = client as OpenAI;
+  const content = prepareOpenAIDoc(document);
 
   const completion = await openai.chat.completions.create({
     model,
     messages: [
       {
-        role: 'system',
+        role: "system",
         content: prompt,
       },
-      { role: 'user', content },
+      { role: "user", content },
     ],
     tools: [
       {
-        type: 'function',
+        type: "function",
         function: {
-          name: 'extract_content',
-          description: 'Extracts the content from the given webpage(s)',
+          name: "extract_content",
+          description: "Extracts the content from the given webpage(s)",
          parameters: schema,
         },
       },
     ],
-    tool_choice: 'auto',
+    tool_choice: "auto",
     temperature,
-  })
+  });
+
+  const c = completion.choices[0].message.tool_calls[0].function.arguments;
 
-  const c = completion.choices[0].message.tool_calls[0].function.arguments
-
   // Extract the LLM extraction content from the completion response
   const llmExtraction = JSON.parse(c);
 
-// console.log("llm extraction: ", llmExtraction);
-
   // Return the document with the LLM extraction content added
   return {
     ...document,
-    llm_extraction: llmExtraction
+    llm_extraction: llmExtraction,
   };
-
 }
-
-// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
-//   model: LlamaModel,
-//   page: ScraperLoadResult,
-//   schema: JsonSchema7Type,
-//   prompt: string = defaultPrompt,
-//   temperature?: number
-// ): Promise<ScraperCompletionResult<T>> {
-//   const grammar = new LlamaJsonSchemaGrammar(schema as GbnfJsonSchema) as any // any, because it has weird type inference going on
-//   const context = new LlamaContext({ model })
-//   const session = new LlamaChatSession({ context })
-//   const pagePrompt = `${prompt}\n${page.content}`
-
-//   const result = await session.prompt(pagePrompt, {
-//     grammar,
-//     temperature,
-//   })
-
-//   const parsed = grammar.parse(result)
-//   return {
-//     data: parsed,
-//     url: page.url,
-//   }
-// }
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 0bd1a82..a56f8ff 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -195,8 +195,6 @@ export class WebScraperDataProvider {
     documents = documents.concat(pdfDocuments);
 
     if(this.extractorOptions.mode === "llm-extraction") {
-
-      const llm = new OpenAI()
       documents = await generateCompletions(
         documents,
         this.extractorOptions
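
A minimal usage sketch of the pipeline this patch leaves behind (not part of the commit; the import paths are shortened, the schema, markdown, and logged output are illustrative only, and OPENAI_API_KEY is assumed to be set, since generateCompletions constructs its own OpenAI client for each document):

import { generateCompletions } from "./lib/LLM-extraction";
import { Document, ExtractorOptions } from "./lib/entities";

// Illustrative options: any AJV-compatible JSON Schema works here.
const extractorOptions: ExtractorOptions = {
  mode: "llm-extraction",
  extractionPrompt: "Extract the post title and author from the page.",
  extractionSchema: {
    type: "object",
    properties: {
      title: { type: "string" },
      author: { type: "string" },
    },
    required: ["title"],
  },
};

// Documents normally come from WebScraperDataProvider; a stub with
// markdown content is enough, since prepareOpenAIDoc only reads markdown.
const docs = [{ markdown: "# Hello World\nBy Jane Doe" } as Document];

// Each returned document carries llm_extraction, already validated
// against extractionSchema by AJV; a schema mismatch throws instead of returning.
generateCompletions(docs, extractorOptions).then((results) => {
  console.log(results[0].llm_extraction); // e.g. { title: "Hello World", author: "Jane Doe" }
});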