Nick: cleanup
This commit is contained in:
parent
d9d206aff6
commit
4f526cff92
@ -7,7 +7,6 @@ import { RateLimiterMode } from "../types";
|
||||
import { logJob } from "../services/logging/log_job";
|
||||
import { Document } from "../lib/entities";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
import Ajv from 'ajv';
|
||||
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
|
||||
|
||||
export async function scrapeHelper(
|
||||
|
@ -1,18 +1,16 @@
|
||||
|
||||
|
||||
import { encoding_for_model } from "@dqbd/tiktoken";
|
||||
import { TiktokenModel } from "@dqbd/tiktoken";
|
||||
|
||||
// This function calculates the number of tokens in a text string using the tokenizer for the given model
|
||||
export function numTokensFromString(message: string, model: string): number {
|
||||
const encoder = encoding_for_model(model as TiktokenModel);
|
||||
const encoder = encoding_for_model(model as TiktokenModel);
|
||||
|
||||
// Encode the message into tokens
|
||||
const tokens = encoder.encode(message);
|
||||
// Encode the message into tokens
|
||||
const tokens = encoder.encode(message);
|
||||
|
||||
// Free the encoder resources after use
|
||||
encoder.free();
|
||||
// Free the encoder resources after use
|
||||
encoder.free();
|
||||
|
||||
// Return the number of tokens
|
||||
return tokens.length;
|
||||
}
|
||||
// Return the number of tokens
|
||||
return tokens.length;
|
||||
}
|
||||
|
@ -1,53 +1,51 @@
|
||||
import Turndown from 'turndown'
|
||||
import OpenAI from 'openai'
|
||||
// import { LlamaModel } from 'node-llama-cpp'
|
||||
import { z } from 'zod'
|
||||
import { zodToJsonSchema } from 'zod-to-json-schema'
|
||||
import Ajv from 'ajv';
|
||||
import Turndown from "turndown";
|
||||
import OpenAI from "openai";
|
||||
import Ajv from "ajv";
|
||||
const ajv = new Ajv(); // Initialize AJV for JSON schema validation
|
||||
|
||||
import {
|
||||
ScraperCompletionResult,
|
||||
generateOpenAICompletions,
|
||||
} from './models'
|
||||
import { Document, ExtractorOptions } from '../entities'
|
||||
import { generateOpenAICompletions } from "./models";
|
||||
import { Document, ExtractorOptions } from "../entities";
|
||||
|
||||
// Generate completion using OpenAI
|
||||
// Generate completion using OpenAI
|
||||
export async function generateCompletions(
|
||||
documents: Document[],
|
||||
extractionOptions: ExtractorOptions
|
||||
documents: Document[],
|
||||
extractionOptions: ExtractorOptions
|
||||
): Promise<Document[]> {
|
||||
// const schema = zodToJsonSchema(options.schema)
|
||||
// const schema = zodToJsonSchema(options.schema)
|
||||
|
||||
const schema = extractionOptions.extractionSchema;
|
||||
const prompt = extractionOptions.extractionPrompt;
|
||||
const schema = extractionOptions.extractionSchema;
|
||||
const prompt = extractionOptions.extractionPrompt;
|
||||
|
||||
const switchVariable = "openAI" // Placeholder, want to think more about how we abstract the model provider
|
||||
const switchVariable = "openAI"; // Placeholder, want to think more about how we abstract the model provider
|
||||
|
||||
const completions = await Promise.all(
|
||||
documents.map(async (document: Document) => {
|
||||
switch (switchVariable) {
|
||||
case "openAI":
|
||||
const llm = new OpenAI();
|
||||
const completionResult = await generateOpenAICompletions({
|
||||
client: llm,
|
||||
document: document,
|
||||
schema: schema,
|
||||
prompt: prompt,
|
||||
});
|
||||
// Validate the JSON output against the schema using AJV
|
||||
const validate = ajv.compile(schema);
|
||||
if (!validate(completionResult.llm_extraction)) {
|
||||
//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
|
||||
throw new Error(
|
||||
`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. JSON parsing error(s): ${validate.errors
|
||||
?.map((err) => err.message)
|
||||
.join(", ")}`
|
||||
);
|
||||
}
|
||||
|
||||
const completions = await Promise.all(documents.map(async (document: Document) => {
|
||||
switch (switchVariable) {
|
||||
case "openAI":
|
||||
const llm = new OpenAI();
|
||||
const completionResult = await generateOpenAICompletions({
|
||||
client: llm,
|
||||
document: document,
|
||||
schema: schema,
|
||||
prompt: prompt
|
||||
});
|
||||
// Validate the JSON output against the schema using AJV
|
||||
const validate = ajv.compile(schema);
|
||||
if (!validate(completionResult.llm_extraction)) {
|
||||
//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
|
||||
throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
|
||||
}
|
||||
return completionResult;
|
||||
default:
|
||||
throw new Error("Invalid client");
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
return completionResult;
|
||||
default:
|
||||
throw new Error('Invalid client');
|
||||
}
|
||||
}));
|
||||
|
||||
|
||||
return completions;
|
||||
return completions;
|
||||
}
|
||||
|
@ -1,115 +1,74 @@
|
||||
import OpenAI from 'openai'
|
||||
import { z } from 'zod'
|
||||
import { Document, ExtractorOptions } from "../../lib/entities";
|
||||
import { numTokensFromString } from './helpers';
|
||||
import OpenAI from "openai";
|
||||
import { Document } from "../../lib/entities";
|
||||
|
||||
// import {
|
||||
// LlamaModel,
|
||||
// LlamaJsonSchemaGrammar,
|
||||
// LlamaContext,
|
||||
// LlamaChatSession,
|
||||
// GbnfJsonSchema,
|
||||
// } from 'node-llama-cpp'
|
||||
// import { JsonSchema7Type } from 'zod-to-json-schema'
|
||||
|
||||
export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
|
||||
data: any | null
|
||||
url: string
|
||||
}
|
||||
export type ScraperCompletionResult = {
|
||||
data: any | null;
|
||||
url: string;
|
||||
};
|
||||
|
||||
const defaultPrompt =
|
||||
'You are a professional web scraper. Extract the contents of the webpage'
|
||||
"You are a professional web scraper. Extract the contents of the webpage";
|
||||
|
||||
function prepareOpenAIDoc(
|
||||
document: Document
|
||||
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
|
||||
|
||||
// Check if the markdown content exists in the document
|
||||
if (!document.markdown) {
|
||||
throw new Error("Markdown content is missing in the document.");
|
||||
}
|
||||
|
||||
return [{ type: 'text', text: document.markdown}]
|
||||
return [{ type: "text", text: document.markdown }];
|
||||
}
|
||||
|
||||
export async function generateOpenAICompletions({
|
||||
client,
|
||||
model = 'gpt-4-turbo',
|
||||
model = "gpt-4-turbo",
|
||||
document,
|
||||
schema, //TODO - add zod dynamic type checking
|
||||
prompt = defaultPrompt,
|
||||
temperature
|
||||
temperature,
|
||||
}: {
|
||||
client: OpenAI,
|
||||
model?: string,
|
||||
document: Document,
|
||||
schema: any, // This should be replaced with a proper Zod schema type when available
|
||||
prompt?: string,
|
||||
temperature?: number
|
||||
client: OpenAI;
|
||||
model?: string;
|
||||
document: Document;
|
||||
schema: any; // This should be replaced with a proper Zod schema type when available
|
||||
prompt?: string;
|
||||
temperature?: number;
|
||||
}): Promise<Document> {
|
||||
const openai = client as OpenAI
|
||||
const content = prepareOpenAIDoc(document)
|
||||
|
||||
const openai = client as OpenAI;
|
||||
const content = prepareOpenAIDoc(document);
|
||||
|
||||
const completion = await openai.chat.completions.create({
|
||||
model,
|
||||
messages: [
|
||||
{
|
||||
role: 'system',
|
||||
role: "system",
|
||||
content: prompt,
|
||||
},
|
||||
{ role: 'user', content },
|
||||
{ role: "user", content },
|
||||
],
|
||||
tools: [
|
||||
{
|
||||
type: 'function',
|
||||
type: "function",
|
||||
function: {
|
||||
name: 'extract_content',
|
||||
description: 'Extracts the content from the given webpage(s)',
|
||||
name: "extract_content",
|
||||
description: "Extracts the content from the given webpage(s)",
|
||||
parameters: schema,
|
||||
},
|
||||
},
|
||||
],
|
||||
tool_choice: 'auto',
|
||||
tool_choice: "auto",
|
||||
temperature,
|
||||
})
|
||||
});
|
||||
|
||||
const c = completion.choices[0].message.tool_calls[0].function.arguments;
|
||||
|
||||
const c = completion.choices[0].message.tool_calls[0].function.arguments
|
||||
|
||||
// Extract the LLM extraction content from the completion response
|
||||
const llmExtraction = JSON.parse(c);
|
||||
|
||||
// console.log("llm extraction: ", llmExtraction);
|
||||
|
||||
|
||||
// Return the document with the LLM extraction content added
|
||||
return {
|
||||
...document,
|
||||
llm_extraction: llmExtraction
|
||||
llm_extraction: llmExtraction,
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
|
||||
// model: LlamaModel,
|
||||
// page: ScraperLoadResult,
|
||||
// schema: JsonSchema7Type,
|
||||
// prompt: string = defaultPrompt,
|
||||
// temperature?: number
|
||||
// ): Promise<ScraperCompletionResult<T>> {
|
||||
// const grammar = new LlamaJsonSchemaGrammar(schema as GbnfJsonSchema) as any // any, because it has weird type inference going on
|
||||
// const context = new LlamaContext({ model })
|
||||
// const session = new LlamaChatSession({ context })
|
||||
// const pagePrompt = `${prompt}\n${page.content}`
|
||||
|
||||
// const result = await session.prompt(pagePrompt, {
|
||||
// grammar,
|
||||
// temperature,
|
||||
// })
|
||||
|
||||
// const parsed = grammar.parse(result)
|
||||
// return {
|
||||
// data: parsed,
|
||||
// url: page.url,
|
||||
// }
|
||||
// }
|
||||
|
@ -195,8 +195,6 @@ export class WebScraperDataProvider {
|
||||
documents = documents.concat(pdfDocuments);
|
||||
|
||||
if(this.extractorOptions.mode === "llm-extraction") {
|
||||
|
||||
const llm = new OpenAI()
|
||||
documents = await generateCompletions(
|
||||
documents,
|
||||
this.extractorOptions
|
||||
|
Loading…
x
Reference in New Issue
Block a user