0

Nick: cleanup

This commit is contained in:
Nicolas 2024-04-30 12:19:43 -07:00
parent d9d206aff6
commit 4f526cff92
5 changed files with 76 additions and 124 deletions

View File

@ -7,7 +7,6 @@ import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job"; import { logJob } from "../services/logging/log_job";
import { Document } from "../lib/entities"; import { Document } from "../lib/entities";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import Ajv from 'ajv';
import { numTokensFromString } from '../lib/LLM-extraction/helpers'; import { numTokensFromString } from '../lib/LLM-extraction/helpers';
export async function scrapeHelper( export async function scrapeHelper(

View File

@ -1,5 +1,3 @@
import { encoding_for_model } from "@dqbd/tiktoken"; import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken"; import { TiktokenModel } from "@dqbd/tiktoken";

View File

@ -1,16 +1,10 @@
import Turndown from 'turndown' import Turndown from "turndown";
import OpenAI from 'openai' import OpenAI from "openai";
// import { LlamaModel } from 'node-llama-cpp' import Ajv from "ajv";
import { z } from 'zod'
import { zodToJsonSchema } from 'zod-to-json-schema'
import Ajv from 'ajv';
const ajv = new Ajv(); // Initialize AJV for JSON schema validation const ajv = new Ajv(); // Initialize AJV for JSON schema validation
import { import { generateOpenAICompletions } from "./models";
ScraperCompletionResult, import { Document, ExtractorOptions } from "../entities";
generateOpenAICompletions,
} from './models'
import { Document, ExtractorOptions } from '../entities'
// Generate completion using OpenAI // Generate completion using OpenAI
export async function generateCompletions( export async function generateCompletions(
@ -22,10 +16,10 @@ export async function generateCompletions(
const schema = extractionOptions.extractionSchema; const schema = extractionOptions.extractionSchema;
const prompt = extractionOptions.extractionPrompt; const prompt = extractionOptions.extractionPrompt;
const switchVariable = "openAI" // Placeholder, want to think more about how we abstract the model provider const switchVariable = "openAI"; // Placeholder, want to think more about how we abstract the model provider
const completions = await Promise.all(
const completions = await Promise.all(documents.map(async (document: Document) => { documents.map(async (document: Document) => {
switch (switchVariable) { switch (switchVariable) {
case "openAI": case "openAI":
const llm = new OpenAI(); const llm = new OpenAI();
@ -33,21 +27,25 @@ export async function generateCompletions(
client: llm, client: llm,
document: document, document: document,
schema: schema, schema: schema,
prompt: prompt prompt: prompt,
}); });
// Validate the JSON output against the schema using AJV // Validate the JSON output against the schema using AJV
const validate = ajv.compile(schema); const validate = ajv.compile(schema);
if (!validate(completionResult.llm_extraction)) { if (!validate(completionResult.llm_extraction)) {
//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc. //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`); throw new Error(
`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. JSON parsing error(s): ${validate.errors
?.map((err) => err.message)
.join(", ")}`
);
} }
return completionResult; return completionResult;
default: default:
throw new Error('Invalid client'); throw new Error("Invalid client");
} }
})); })
);
return completions; return completions;
} }

View File

@ -1,115 +1,74 @@
import OpenAI from 'openai' import OpenAI from "openai";
import { z } from 'zod' import { Document } from "../../lib/entities";
import { Document, ExtractorOptions } from "../../lib/entities";
import { numTokensFromString } from './helpers';
// import { export type ScraperCompletionResult = {
// LlamaModel, data: any | null;
// LlamaJsonSchemaGrammar, url: string;
// LlamaContext, };
// LlamaChatSession,
// GbnfJsonSchema,
// } from 'node-llama-cpp'
// import { JsonSchema7Type } from 'zod-to-json-schema'
export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
data: any | null
url: string
}
const defaultPrompt = const defaultPrompt =
'You are a professional web scraper. Extract the contents of the webpage' "You are a professional web scraper. Extract the contents of the webpage";
function prepareOpenAIDoc( function prepareOpenAIDoc(
document: Document document: Document
): OpenAI.Chat.Completions.ChatCompletionContentPart[] { ): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
// Check if the markdown content exists in the document // Check if the markdown content exists in the document
if (!document.markdown) { if (!document.markdown) {
throw new Error("Markdown content is missing in the document."); throw new Error("Markdown content is missing in the document.");
} }
return [{ type: 'text', text: document.markdown}] return [{ type: "text", text: document.markdown }];
} }
export async function generateOpenAICompletions({ export async function generateOpenAICompletions({
client, client,
model = 'gpt-4-turbo', model = "gpt-4-turbo",
document, document,
schema, //TODO - add zod dynamic type checking schema, //TODO - add zod dynamic type checking
prompt = defaultPrompt, prompt = defaultPrompt,
temperature temperature,
}: { }: {
client: OpenAI, client: OpenAI;
model?: string, model?: string;
document: Document, document: Document;
schema: any, // This should be replaced with a proper Zod schema type when available schema: any; // This should be replaced with a proper Zod schema type when available
prompt?: string, prompt?: string;
temperature?: number temperature?: number;
}): Promise<Document> { }): Promise<Document> {
const openai = client as OpenAI const openai = client as OpenAI;
const content = prepareOpenAIDoc(document) const content = prepareOpenAIDoc(document);
const completion = await openai.chat.completions.create({ const completion = await openai.chat.completions.create({
model, model,
messages: [ messages: [
{ {
role: 'system', role: "system",
content: prompt, content: prompt,
}, },
{ role: 'user', content }, { role: "user", content },
], ],
tools: [ tools: [
{ {
type: 'function', type: "function",
function: { function: {
name: 'extract_content', name: "extract_content",
description: 'Extracts the content from the given webpage(s)', description: "Extracts the content from the given webpage(s)",
parameters: schema, parameters: schema,
}, },
}, },
], ],
tool_choice: 'auto', tool_choice: "auto",
temperature, temperature,
}) });
const c = completion.choices[0].message.tool_calls[0].function.arguments const c = completion.choices[0].message.tool_calls[0].function.arguments;
// Extract the LLM extraction content from the completion response // Extract the LLM extraction content from the completion response
const llmExtraction = JSON.parse(c); const llmExtraction = JSON.parse(c);
// console.log("llm extraction: ", llmExtraction);
// Return the document with the LLM extraction content added // Return the document with the LLM extraction content added
return { return {
...document, ...document,
llm_extraction: llmExtraction llm_extraction: llmExtraction,
}; };
} }
// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
// model: LlamaModel,
// page: ScraperLoadResult,
// schema: JsonSchema7Type,
// prompt: string = defaultPrompt,
// temperature?: number
// ): Promise<ScraperCompletionResult<T>> {
// const grammar = new LlamaJsonSchemaGrammar(schema as GbnfJsonSchema) as any // any, because it has weird type inference going on
// const context = new LlamaContext({ model })
// const session = new LlamaChatSession({ context })
// const pagePrompt = `${prompt}\n${page.content}`
// const result = await session.prompt(pagePrompt, {
// grammar,
// temperature,
// })
// const parsed = grammar.parse(result)
// return {
// data: parsed,
// url: page.url,
// }
// }

View File

@ -195,8 +195,6 @@ export class WebScraperDataProvider {
documents = documents.concat(pdfDocuments); documents = documents.concat(pdfDocuments);
if(this.extractorOptions.mode === "llm-extraction") { if(this.extractorOptions.mode === "llm-extraction") {
const llm = new OpenAI()
documents = await generateCompletions( documents = await generateCompletions(
documents, documents,
this.extractorOptions this.extractorOptions