Nick: max num tokens for llm extract (for now) + slice the max
This commit is contained in:
parent
d5d0d48848
commit
77a79b5a79
@ -106,6 +106,9 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
const extractorOptions = req.body.extractorOptions ?? {
|
const extractorOptions = req.body.extractorOptions ?? {
|
||||||
mode: "markdown"
|
mode: "markdown"
|
||||||
}
|
}
|
||||||
|
if (extractorOptions.mode === "llm-extraction") {
|
||||||
|
pageOptions.onlyMainContent = true;
|
||||||
|
}
|
||||||
const origin = req.body.origin ?? "api";
|
const origin = req.body.origin ?? "api";
|
||||||
const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds
|
const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds
|
||||||
|
|
||||||
|
@ -1,25 +1,38 @@
|
|||||||
import OpenAI from "openai";
|
import OpenAI from "openai";
|
||||||
import { Document } from "../../lib/entities";
|
import { Document } from "../../lib/entities";
|
||||||
|
import { numTokensFromString } from "./helpers";
|
||||||
|
|
||||||
export type ScraperCompletionResult = {
|
export type ScraperCompletionResult = {
|
||||||
data: any | null;
|
data: any | null;
|
||||||
url: string;
|
url: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const maxTokens = 32000;
|
||||||
|
const modifier = 4;
|
||||||
const defaultPrompt =
|
const defaultPrompt =
|
||||||
"You are a professional web scraper. Extract the contents of the webpage";
|
"You are a professional web scraper. Extract the contents of the webpage";
|
||||||
|
|
||||||
/**
 * Builds the chat-completion content parts for a scraped document, trimming
 * the markdown when its token count exceeds the module-level `maxTokens`.
 *
 * @param document - Scraped document whose `markdown` field is sent to the model.
 * @returns A tuple of `[contentParts, numTokens]`, where `numTokens` is the
 *   token count of the markdown measured BEFORE any trimming, so a caller can
 *   compare it against `maxTokens` to detect that trimming took place.
 * @throws Error when the document carries no markdown content.
 */
function prepareOpenAIDoc(
  document: Document
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
  let markdown = document.markdown;

  // Check if the markdown content exists in the document
  if (!markdown) {
    throw new Error(
      "Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
    );
  }

  // Count tokens on the narrowed local so the (optional) property is not
  // re-read after the guard above.
  // NOTE(review): "gpt-4" is presumably the model whose tokenizer is used for
  // counting — confirm against numTokensFromString.
  const numTokens = numTokensFromString(markdown, "gpt-4");

  if (numTokens > maxTokens) {
    // Tokens != characters. `modifier` is a fixed characters-per-token
    // heuristic; for token-dense text (fewer characters per token than
    // `modifier`) a cut at maxTokens * modifier would still be over the limit.
    // Bound the cut by the ratio actually measured on this page as well.
    // This remains an estimate: token density can vary across the page.
    const heuristicLimit = maxTokens * modifier;
    const proportionalLimit = Math.floor(
      (markdown.length * maxTokens) / numTokens
    );
    let cut = Math.min(heuristicLimit, proportionalLimit);

    // Do not cut between the two halves of a UTF-16 surrogate pair; a lone
    // high surrogate at the end would be an invalid character.
    if (cut > 0) {
      const lastUnit = markdown.charCodeAt(cut - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        cut -= 1;
      }
    }

    markdown = markdown.slice(0, cut);
  }

  return [[{ type: "text", text: markdown }], numTokens];
}
|
||||||
|
|
||||||
export async function generateOpenAICompletions({
|
export async function generateOpenAICompletions({
|
||||||
@ -38,7 +51,7 @@ export async function generateOpenAICompletions({
|
|||||||
temperature?: number;
|
temperature?: number;
|
||||||
}): Promise<Document> {
|
}): Promise<Document> {
|
||||||
const openai = client as OpenAI;
|
const openai = client as OpenAI;
|
||||||
const content = prepareOpenAIDoc(document);
|
const [content, numTokens] = prepareOpenAIDoc(document);
|
||||||
|
|
||||||
const completion = await openai.chat.completions.create({
|
const completion = await openai.chat.completions.create({
|
||||||
model,
|
model,
|
||||||
@ -72,6 +85,7 @@ export async function generateOpenAICompletions({
|
|||||||
return {
|
return {
|
||||||
...document,
|
...document,
|
||||||
llm_extraction: llmExtraction,
|
llm_extraction: llmExtraction,
|
||||||
|
warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -72,6 +72,7 @@ export class Document {
|
|||||||
};
|
};
|
||||||
childrenLinks?: string[];
|
childrenLinks?: string[];
|
||||||
provider?: string;
|
provider?: string;
|
||||||
|
warning?: string;
|
||||||
|
|
||||||
constructor(data: Partial<Document>) {
|
constructor(data: Partial<Document>) {
|
||||||
if (!data.content) {
|
if (!data.content) {
|
||||||
|
@ -34,8 +34,6 @@ export const excludeNonMainTags = [
|
|||||||
"#nav",
|
"#nav",
|
||||||
".breadcrumbs",
|
".breadcrumbs",
|
||||||
"#breadcrumbs",
|
"#breadcrumbs",
|
||||||
".form",
|
|
||||||
"form",
|
|
||||||
"#search-form",
|
"#search-form",
|
||||||
".search",
|
".search",
|
||||||
"#search",
|
"#search",
|
||||||
@ -51,10 +49,6 @@ export const excludeNonMainTags = [
|
|||||||
"#tag",
|
"#tag",
|
||||||
".category",
|
".category",
|
||||||
"#category",
|
"#category",
|
||||||
".comment",
|
".cookie",
|
||||||
"#comment",
|
"#cookie"
|
||||||
".reply",
|
|
||||||
"#reply",
|
|
||||||
".author",
|
|
||||||
"#author",
|
|
||||||
];
|
];
|
||||||
|
Loading…
Reference in New Issue
Block a user