Nick: cleanup
This commit is contained in:
parent
d9d206aff6
commit
4f526cff92
@ -7,7 +7,6 @@ import { RateLimiterMode } from "../types";
|
|||||||
import { logJob } from "../services/logging/log_job";
|
import { logJob } from "../services/logging/log_job";
|
||||||
import { Document } from "../lib/entities";
|
import { Document } from "../lib/entities";
|
||||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||||
import Ajv from 'ajv';
|
|
||||||
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
|
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
|
||||||
|
|
||||||
export async function scrapeHelper(
|
export async function scrapeHelper(
|
||||||
|
@ -1,18 +1,16 @@
|
|||||||
|
|
||||||
|
|
||||||
import { encoding_for_model } from "@dqbd/tiktoken";
|
import { encoding_for_model } from "@dqbd/tiktoken";
|
||||||
import { TiktokenModel } from "@dqbd/tiktoken";
|
import { TiktokenModel } from "@dqbd/tiktoken";
|
||||||
|
|
||||||
/**
 * Counts how many tokens `message` encodes to with the encoder selected for `model`.
 *
 * @param message - Text to tokenize.
 * @param model - Model name used to pick the encoder. It is cast to
 *   `TiktokenModel` without validation, so the caller must pass a supported name.
 * @returns The number of tokens in the encoded message.
 */
export function numTokensFromString(message: string, model: string): number {
  const encoder = encoding_for_model(model as TiktokenModel);

  try {
    // Encode the message and report how many tokens it produced.
    return encoder.encode(message).length;
  } finally {
    // Free the encoder resources on every path, including when encode() throws.
    encoder.free();
  }
}
|
@ -1,53 +1,51 @@
|
|||||||
import Turndown from 'turndown'
|
import Turndown from "turndown";
|
||||||
import OpenAI from 'openai'
|
import OpenAI from "openai";
|
||||||
// import { LlamaModel } from 'node-llama-cpp'
|
import Ajv from "ajv";
|
||||||
import { z } from 'zod'
|
|
||||||
import { zodToJsonSchema } from 'zod-to-json-schema'
|
|
||||||
import Ajv from 'ajv';
|
|
||||||
const ajv = new Ajv(); // Initialize AJV for JSON schema validation
|
const ajv = new Ajv(); // Initialize AJV for JSON schema validation
|
||||||
|
|
||||||
import {
|
import { generateOpenAICompletions } from "./models";
|
||||||
ScraperCompletionResult,
|
import { Document, ExtractorOptions } from "../entities";
|
||||||
generateOpenAICompletions,
|
|
||||||
} from './models'
|
|
||||||
import { Document, ExtractorOptions } from '../entities'
|
|
||||||
|
|
||||||
/**
 * Runs LLM extraction over every document and validates each extraction
 * against the caller-provided JSON schema.
 *
 * All documents are processed concurrently via `Promise.all`, so the first
 * failure rejects the whole call.
 *
 * @param documents - Documents to run the extraction on.
 * @param extractionOptions - Supplies `extractionSchema` (JSON schema handed
 *   to the model and used for validation) and `extractionPrompt`.
 * @returns The documents returned by the completion helper, in input order.
 * @throws Error when an extraction does not satisfy the schema, or when the
 *   configured provider is not recognised.
 */
export async function generateCompletions(
  documents: Document[],
  extractionOptions: ExtractorOptions
): Promise<Document[]> {
  const schema = extractionOptions.extractionSchema;
  const prompt = extractionOptions.extractionPrompt;

  // Nothing to extract: avoid constructing a client or compiling the schema.
  if (documents.length === 0) {
    return [];
  }

  const switchVariable = "openAI"; // Placeholder, want to think more about how we abstract the model provider

  switch (switchVariable) {
    case "openAI": {
      // The client and the compiled validator do not depend on the document,
      // so build them once instead of once per document. Compiling up front
      // also surfaces an invalid schema before any completion is requested.
      const llm = new OpenAI();
      const validate = ajv.compile(schema);

      return await Promise.all(
        documents.map(async (document: Document) => {
          const completionResult = await generateOpenAICompletions({
            client: llm,
            document: document,
            schema: schema,
            prompt: prompt,
          });

          // Validate the JSON output against the schema using AJV.
          // `validate.errors` is read synchronously right after the call, so
          // sharing one validator across concurrent callbacks is safe.
          if (!validate(completionResult.llm_extraction)) {
            //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
            throw new Error(
              `LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. JSON parsing error(s): ${validate.errors
                ?.map((err) => err.message)
                .join(", ")}`
            );
          }

          return completionResult;
        })
      );
    }
    default:
      throw new Error("Invalid client");
  }
}
|
||||||
|
@ -1,115 +1,74 @@
|
|||||||
import OpenAI from 'openai'
|
import OpenAI from "openai";
|
||||||
import { z } from 'zod'
|
import { Document } from "../../lib/entities";
|
||||||
import { Document, ExtractorOptions } from "../../lib/entities";
|
|
||||||
import { numTokensFromString } from './helpers';
|
|
||||||
|
|
||||||
// import {
|
/**
 * Result shape for a single scraper completion.
 *
 * NOTE(review): nothing in this module constructs this type any more;
 * confirm whether external callers still depend on it.
 */
export type ScraperCompletionResult = {
  /** Extracted payload, or `null`; its shape is not constrained here. */
  data: any | null;
  /** URL associated with the result (presumably the scraped page; verify against callers). */
  url: string;
};
|
|
||||||
|
|
||||||
/** System prompt used when the caller does not supply an extraction prompt. */
const defaultPrompt = "You are a professional web scraper. Extract the contents of the webpage";
|
||||||
|
|
||||||
/**
 * Wraps a document's markdown in the content-part array used for the user
 * message of a chat completion request.
 *
 * @param document - Document whose `markdown` field is sent to the model.
 * @returns A single-element array holding one text part with the markdown.
 * @throws Error when `document.markdown` is missing or empty.
 */
function prepareOpenAIDoc(
  document: Document
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
  const { markdown } = document;

  // Only a non-empty markdown body can be forwarded to the model.
  if (markdown) {
    return [{ type: "text", text: markdown }];
  }

  throw new Error("Markdown content is missing in the document.");
}
|
||||||
|
|
||||||
/**
 * Asks an OpenAI chat model to extract structured data from a document by
 * exposing the schema as the parameters of an `extract_content` tool.
 *
 * @param client - OpenAI client used to issue the request.
 * @param model - Chat model name; defaults to "gpt-4-turbo".
 * @param document - Document whose markdown is sent as the user message.
 * @param schema - JSON schema passed as the tool's `parameters`.
 * @param prompt - System prompt; defaults to `defaultPrompt`.
 * @param temperature - Optional sampling temperature, forwarded unchanged.
 * @returns A copy of `document` with the parsed tool arguments stored in
 *   `llm_extraction`.
 * @throws Error when the document has no markdown, or when the response
 *   contains no tool call to read arguments from.
 * @throws SyntaxError when the tool arguments are not valid JSON.
 */
export async function generateOpenAICompletions({
  client,
  model = "gpt-4-turbo",
  document,
  schema, //TODO - add zod dynamic type checking
  prompt = defaultPrompt,
  temperature,
}: {
  client: OpenAI;
  model?: string;
  document: Document;
  schema: any; // This should be replaced with a proper Zod schema type when available
  prompt?: string;
  temperature?: number;
}): Promise<Document> {
  const content = prepareOpenAIDoc(document);

  const completion = await client.chat.completions.create({
    model,
    messages: [
      {
        role: "system",
        content: prompt,
      },
      { role: "user", content },
    ],
    tools: [
      {
        type: "function",
        function: {
          name: "extract_content",
          description: "Extracts the content from the given webpage(s)",
          parameters: schema,
        },
      },
    ],
    tool_choice: "auto",
    temperature,
  });

  // With tool_choice "auto" the model is free to answer without calling the
  // tool, so every step of this lookup may be absent. Fail with a clear
  // message instead of a TypeError on an undefined property.
  const toolArguments =
    completion.choices[0]?.message?.tool_calls?.[0]?.function?.arguments;
  if (toolArguments === undefined || toolArguments === null) {
    throw new Error(
      "LLM extraction failed: the model response did not contain an extract_content tool call."
    );
  }

  // Extract the LLM extraction content from the completion response
  const llmExtraction = JSON.parse(toolArguments);

  // Return the document with the LLM extraction content added
  return {
    ...document,
    llm_extraction: llmExtraction,
  };
}
|
||||||
|
|
||||||
// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
|
|
||||||
// model: LlamaModel,
|
|
||||||
// page: ScraperLoadResult,
|
|
||||||
// schema: JsonSchema7Type,
|
|
||||||
// prompt: string = defaultPrompt,
|
|
||||||
// temperature?: number
|
|
||||||
// ): Promise<ScraperCompletionResult<T>> {
|
|
||||||
// const grammar = new LlamaJsonSchemaGrammar(schema as GbnfJsonSchema) as any // any, because it has weird type inference going on
|
|
||||||
// const context = new LlamaContext({ model })
|
|
||||||
// const session = new LlamaChatSession({ context })
|
|
||||||
// const pagePrompt = `${prompt}\n${page.content}`
|
|
||||||
|
|
||||||
// const result = await session.prompt(pagePrompt, {
|
|
||||||
// grammar,
|
|
||||||
// temperature,
|
|
||||||
// })
|
|
||||||
|
|
||||||
// const parsed = grammar.parse(result)
|
|
||||||
// return {
|
|
||||||
// data: parsed,
|
|
||||||
// url: page.url,
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
@ -195,8 +195,6 @@ export class WebScraperDataProvider {
|
|||||||
documents = documents.concat(pdfDocuments);
|
documents = documents.concat(pdfDocuments);
|
||||||
|
|
||||||
if(this.extractorOptions.mode === "llm-extraction") {
|
if(this.extractorOptions.mode === "llm-extraction") {
|
||||||
|
|
||||||
const llm = new OpenAI()
|
|
||||||
documents = await generateCompletions(
|
documents = await generateCompletions(
|
||||||
documents,
|
documents,
|
||||||
this.extractorOptions
|
this.extractorOptions
|
||||||
|
Loading…
Reference in New Issue
Block a user