0

Nick: cleanup

This commit is contained in:
Nicolas 2024-04-30 12:19:43 -07:00
parent d9d206aff6
commit 4f526cff92
5 changed files with 76 additions and 124 deletions

View File

@ -7,7 +7,6 @@ import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job";
import { Document } from "../lib/entities";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import Ajv from 'ajv';
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
export async function scrapeHelper(

View File

@ -1,18 +1,16 @@
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
// This function calculates the number of tokens in a text string using GPT-3.5-turbo model
// Counts the tokens in `message` using the tokenizer for the given model
// (e.g. "gpt-3.5-turbo"), via @dqbd/tiktoken's model-specific encoder.
export function numTokensFromString(message: string, model: string): number {
  // NOTE(review): the `as TiktokenModel` cast assumes `model` is a valid
  // tiktoken model id; encoding_for_model throws on an unknown id — confirm
  // callers only pass supported models.
  const encoder = encoding_for_model(model as TiktokenModel);
  try {
    // Encode the message and report the token count.
    return encoder.encode(message).length;
  } finally {
    // Always free the WASM-backed encoder, even if encode() throws;
    // the original leaked the encoder on an encode failure.
    encoder.free();
  }
}

View File

@ -1,53 +1,51 @@
import Turndown from 'turndown'
import OpenAI from 'openai'
// import { LlamaModel } from 'node-llama-cpp'
import { z } from 'zod'
import { zodToJsonSchema } from 'zod-to-json-schema'
import Ajv from 'ajv';
import Turndown from "turndown";
import OpenAI from "openai";
import Ajv from "ajv";
const ajv = new Ajv(); // Initialize AJV for JSON schema validation
import {
ScraperCompletionResult,
generateOpenAICompletions,
} from './models'
import { Document, ExtractorOptions } from '../entities'
import { generateOpenAICompletions } from "./models";
import { Document, ExtractorOptions } from "../entities";
// Generate completion using OpenAI
// Generate completion using OpenAI
export async function generateCompletions(
documents: Document[],
extractionOptions: ExtractorOptions
documents: Document[],
extractionOptions: ExtractorOptions
): Promise<Document[]> {
// const schema = zodToJsonSchema(options.schema)
// const schema = zodToJsonSchema(options.schema)
const schema = extractionOptions.extractionSchema;
const prompt = extractionOptions.extractionPrompt;
const schema = extractionOptions.extractionSchema;
const prompt = extractionOptions.extractionPrompt;
const switchVariable = "openAI" // Placholder, want to think more about how we abstract the model provider
const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider
const completions = await Promise.all(
documents.map(async (document: Document) => {
switch (switchVariable) {
case "openAI":
const llm = new OpenAI();
const completionResult = await generateOpenAICompletions({
client: llm,
document: document,
schema: schema,
prompt: prompt,
});
// Validate the JSON output against the schema using AJV
const validate = ajv.compile(schema);
if (!validate(completionResult.llm_extraction)) {
//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
throw new Error(
`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. JSON parsing error(s): ${validate.errors
?.map((err) => err.message)
.join(", ")}`
);
}
const completions = await Promise.all(documents.map(async (document: Document) => {
switch (switchVariable) {
case "openAI":
const llm = new OpenAI();
const completionResult = await generateOpenAICompletions({
client: llm,
document: document,
schema: schema,
prompt: prompt
});
// Validate the JSON output against the schema using AJV
const validate = ajv.compile(schema);
if (!validate(completionResult.llm_extraction)) {
//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
}
return completionResult;
default:
throw new Error("Invalid client");
}
})
);
return completionResult;
default:
throw new Error('Invalid client');
}
}));
return completions;
return completions;
}

View File

@ -1,115 +1,74 @@
import OpenAI from 'openai'
import { z } from 'zod'
import { Document, ExtractorOptions } from "../../lib/entities";
import { numTokensFromString } from './helpers';
import OpenAI from "openai";
import { Document } from "../../lib/entities";
// import {
// LlamaModel,
// LlamaJsonSchemaGrammar,
// LlamaContext,
// LlamaChatSession,
// GbnfJsonSchema,
// } from 'node-llama-cpp'
// import { JsonSchema7Type } from 'zod-to-json-schema'
// Result of a single scraper LLM-completion call: the extracted payload
// (shaped by a runtime JSON schema, hence `any`) plus the source URL.
export type ScraperCompletionResult = {
  data: any | null;
  url: string;
};
// System prompt used when the caller does not supply one.
const defaultPrompt =
  "You are a professional web scraper. Extract the contents of the webpage";
// Wraps a document's markdown as an OpenAI chat content part.
// Throws if the document has no markdown content to send.
function prepareOpenAIDoc(
  document: Document
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
  // Check if the markdown content exists in the document; an empty string is
  // also rejected here (falsy), matching the original behavior.
  if (!document.markdown) {
    throw new Error("Markdown content is missing in the document.");
  }
  return [{ type: "text", text: document.markdown }];
}
export async function generateOpenAICompletions({
client,
model = 'gpt-4-turbo',
model = "gpt-4-turbo",
document,
schema, //TODO - add zod dynamic type checking
prompt = defaultPrompt,
temperature
temperature,
}: {
client: OpenAI,
model?: string,
document: Document,
schema: any, // This should be replaced with a proper Zod schema type when available
prompt?: string,
temperature?: number
client: OpenAI;
model?: string;
document: Document;
schema: any; // This should be replaced with a proper Zod schema type when available
prompt?: string;
temperature?: number;
}): Promise<Document> {
const openai = client as OpenAI
const content = prepareOpenAIDoc(document)
const openai = client as OpenAI;
const content = prepareOpenAIDoc(document);
const completion = await openai.chat.completions.create({
model,
messages: [
{
role: 'system',
role: "system",
content: prompt,
},
{ role: 'user', content },
{ role: "user", content },
],
tools: [
{
type: 'function',
type: "function",
function: {
name: 'extract_content',
description: 'Extracts the content from the given webpage(s)',
name: "extract_content",
description: "Extracts the content from the given webpage(s)",
parameters: schema,
},
},
],
tool_choice: 'auto',
tool_choice: "auto",
temperature,
})
});
const c = completion.choices[0].message.tool_calls[0].function.arguments
const c = completion.choices[0].message.tool_calls[0].function.arguments;
// Extract the LLM extraction content from the completion response
const llmExtraction = JSON.parse(c);
// console.log("llm extraction: ", llmExtraction);
// Return the document with the LLM extraction content added
return {
...document,
llm_extraction: llmExtraction
llm_extraction: llmExtraction,
};
}
// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
// model: LlamaModel,
// page: ScraperLoadResult,
// schema: JsonSchema7Type,
// prompt: string = defaultPrompt,
// temperature?: number
// ): Promise<ScraperCompletionResult<T>> {
// const grammar = new LlamaJsonSchemaGrammar(schema as GbnfJsonSchema) as any // any, because it has weird type inference going on
// const context = new LlamaContext({ model })
// const session = new LlamaChatSession({ context })
// const pagePrompt = `${prompt}\n${page.content}`
// const result = await session.prompt(pagePrompt, {
// grammar,
// temperature,
// })
// const parsed = grammar.parse(result)
// return {
// data: parsed,
// url: page.url,
// }
// }

View File

@ -195,8 +195,6 @@ export class WebScraperDataProvider {
documents = documents.concat(pdfDocuments);
if(this.extractorOptions.mode === "llm-extraction") {
const llm = new OpenAI()
documents = await generateCompletions(
documents,
this.extractorOptions