0

Nick: max num tokens for llm extract (for now) + slice the max

This commit is contained in:
Nicolas 2024-05-20 17:07:38 -07:00
parent d5d0d48848
commit 77a79b5a79
4 changed files with 25 additions and 13 deletions

View File

@ -106,6 +106,9 @@ export async function scrapeController(req: Request, res: Response) {
const extractorOptions = req.body.extractorOptions ?? { const extractorOptions = req.body.extractorOptions ?? {
mode: "markdown" mode: "markdown"
} }
if (extractorOptions.mode === "llm-extraction") {
pageOptions.onlyMainContent = true;
}
const origin = req.body.origin ?? "api"; const origin = req.body.origin ?? "api";
const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds

View File

@ -1,25 +1,38 @@
import OpenAI from "openai"; import OpenAI from "openai";
import { Document } from "../../lib/entities"; import { Document } from "../../lib/entities";
import { numTokensFromString } from "./helpers";
export type ScraperCompletionResult = { export type ScraperCompletionResult = {
data: any | null; data: any | null;
url: string; url: string;
}; };
const maxTokens = 32000;
const modifier = 4;
const defaultPrompt = const defaultPrompt =
"You are a professional web scraper. Extract the contents of the webpage"; "You are a professional web scraper. Extract the contents of the webpage";
function prepareOpenAIDoc( function prepareOpenAIDoc(
document: Document document: Document
): OpenAI.Chat.Completions.ChatCompletionContentPart[] { ): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
// Check if the markdown content exists in the document let markdown = document.markdown;
if (!document.markdown) {
// Check if the markdown content exists in the document
if (!markdown) {
throw new Error( throw new Error(
"Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai" "Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
); );
} }
return [{ type: "text", text: document.markdown }]; // count number of tokens
const numTokens = numTokensFromString(document.markdown, "gpt-4");
if (numTokens > maxTokens) {
// trim the document to the maximum number of tokens, tokens != characters
markdown = markdown.slice(0, (maxTokens * modifier));
}
return [[{ type: "text", text: markdown }], numTokens];
} }
export async function generateOpenAICompletions({ export async function generateOpenAICompletions({
@ -38,7 +51,7 @@ export async function generateOpenAICompletions({
temperature?: number; temperature?: number;
}): Promise<Document> { }): Promise<Document> {
const openai = client as OpenAI; const openai = client as OpenAI;
const content = prepareOpenAIDoc(document); const [content, numTokens] = prepareOpenAIDoc(document);
const completion = await openai.chat.completions.create({ const completion = await openai.chat.completions.create({
model, model,
@ -72,6 +85,7 @@ export async function generateOpenAICompletions({
return { return {
...document, ...document,
llm_extraction: llmExtraction, llm_extraction: llmExtraction,
warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined,
}; };
} }

View File

@ -72,6 +72,7 @@ export class Document {
}; };
childrenLinks?: string[]; childrenLinks?: string[];
provider?: string; provider?: string;
warning?: string;
constructor(data: Partial<Document>) { constructor(data: Partial<Document>) {
if (!data.content) { if (!data.content) {

View File

@ -34,8 +34,6 @@ export const excludeNonMainTags = [
"#nav", "#nav",
".breadcrumbs", ".breadcrumbs",
"#breadcrumbs", "#breadcrumbs",
".form",
"form",
"#search-form", "#search-form",
".search", ".search",
"#search", "#search",
@ -51,10 +49,6 @@ export const excludeNonMainTags = [
"#tag", "#tag",
".category", ".category",
"#category", "#category",
".comment", ".cookie",
"#comment", "#cookie"
".reply",
"#reply",
".author",
"#author",
]; ];