From 77a79b5a79ec51f39b69f25a465d2b1dc6ed1af5 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 20 May 2024 17:07:38 -0700
Subject: [PATCH] Nick: max num tokens for llm extract (for now) + slice the max

---
Summary of what this patch changes:

  * controllers/scrape.ts: when extractorOptions.mode is "llm-extraction",
    pageOptions.onlyMainContent is forced to true.
  * lib/LLM-extraction/models.ts: prepareOpenAIDoc now returns a tuple of
    [content parts, numTokens]. numTokens comes from numTokensFromString(...,
    "gpt-4"); when it exceeds maxTokens (32000) the markdown is cut to
    maxTokens * modifier (4) characters. generateOpenAICompletions sets
    `warning` on the returned document in that case.
  * lib/entities.ts: Document gains an optional `warning?: string` field.
  * scraper/WebScraper/utils/excludeTags.ts: the form, comment, reply and
    author selectors are no longer excluded; ".cookie" and "#cookie" are.

NOTE(review): this copy of the patch was recovered from a whitespace-collapsed
dump. Line breaks and blank context lines were reconstructed from the hunk
line counts; indentation was reconstructed assuming 2-space formatting and
should be confirmed against the commit (or apply with --ignore-whitespace).
The type arguments in `Promise<Document>` and `Partial<Document>` were
missing from the dump and have been restored -- confirm against the commit.

NOTE(review): follow-ups worth addressing in a later commit: the warning text
misspells "Attempted" as "Attemped"; the "// Check if the markdown content
exists" comment is added at column 0; the token count reported in the warning
is the pre-trim count; the trim is character-based (maxTokens * modifier), so
the trimmed text is only approximately within the token limit.

 apps/api/src/controllers/scrape.ts            |  3 +++
 apps/api/src/lib/LLM-extraction/models.ts     | 24 +++++++++++++++----
 apps/api/src/lib/entities.ts                  |  1 +
 .../scraper/WebScraper/utils/excludeTags.ts   | 10 ++------
 4 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index 449a50f..0b3f146 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -106,6 +106,9 @@ export async function scrapeController(req: Request, res: Response) {
     const extractorOptions = req.body.extractorOptions ?? {
       mode: "markdown"
     }
+    if (extractorOptions.mode === "llm-extraction") {
+      pageOptions.onlyMainContent = true;
+    }
     const origin = req.body.origin ?? "api";
     const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds
 
diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
index 4a25b43..1434e35 100644
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -1,25 +1,38 @@
 import OpenAI from "openai";
 import { Document } from "../../lib/entities";
+import { numTokensFromString } from "./helpers";
 
 export type ScraperCompletionResult = {
   data: any | null;
   url: string;
 };
 
+const maxTokens = 32000;
+const modifier = 4;
 const defaultPrompt =
   "You are a professional web scraper. Extract the contents of the webpage";
 
 function prepareOpenAIDoc(
   document: Document
-): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
-  // Check if the markdown content exists in the document
-  if (!document.markdown) {
+): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
+  let markdown = document.markdown;
+
+// Check if the markdown content exists in the document
+  if (!markdown) {
     throw new Error(
       "Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
     );
   }
 
-  return [{ type: "text", text: document.markdown }];
+  // count number of tokens
+  const numTokens = numTokensFromString(document.markdown, "gpt-4");
+
+  if (numTokens > maxTokens) {
+    // trim the document to the maximum number of tokens, tokens != characters
+    markdown = markdown.slice(0, (maxTokens * modifier));
+  }
+
+  return [[{ type: "text", text: markdown }], numTokens];
 }
 
 export async function generateOpenAICompletions({
@@ -38,7 +51,7 @@ export async function generateOpenAICompletions({
   temperature?: number;
 }): Promise<Document> {
   const openai = client as OpenAI;
-  const content = prepareOpenAIDoc(document);
+  const [content, numTokens] = prepareOpenAIDoc(document);
 
   const completion = await openai.chat.completions.create({
     model,
@@ -72,6 +85,7 @@ export async function generateOpenAICompletions({
   return {
     ...document,
     llm_extraction: llmExtraction,
+    warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined,
   };
 }
 
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 15550be..ab0a0ef 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -72,6 +72,7 @@ export class Document {
   };
   childrenLinks?: string[];
   provider?: string;
+  warning?: string;
 
   constructor(data: Partial<Document>) {
     if (!data.content) {
diff --git a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts
index 142bcef..bb9c519 100644
--- a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts
+++ b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts
@@ -34,8 +34,6 @@ export const excludeNonMainTags = [
   "#nav",
   ".breadcrumbs",
   "#breadcrumbs",
-  ".form",
-  "form",
   "#search-form",
   ".search",
   "#search",
@@ -51,10 +49,6 @@ export const excludeNonMainTags = [
   "#tag",
   ".category",
   "#category",
-  ".comment",
-  "#comment",
-  ".reply",
-  "#reply",
-  ".author",
-  "#author",
+  ".cookie",
+  "#cookie"
 ];