From 6ee1f2d3bc954189f83040f89e070d8b69ff9fb7 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Sun, 28 Apr 2024 13:59:35 -0700 Subject: [PATCH] Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper --- apps/api/package.json | 5 +- apps/api/pnpm-lock.yaml | 42 +++++--- apps/api/src/lib/LLM-extraction/models.ts | 99 +++++++++++++++++++ apps/api/src/lib/LLM-extraction/types.ts | 10 ++ apps/api/src/scraper/WebScraper/single_url.ts | 2 + 5 files changed, 145 insertions(+), 13 deletions(-) create mode 100644 apps/api/src/lib/LLM-extraction/models.ts create mode 100644 apps/api/src/lib/LLM-extraction/types.ts diff --git a/apps/api/package.json b/apps/api/package.json index 078c6b6..0f826da 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -68,6 +68,7 @@ "gpt3-tokenizer": "^1.1.5", "ioredis": "^5.3.2", "joplin-turndown-plugin-gfm": "^1.0.12", + "json-schema-to-zod": "^2.1.0", "keyword-extractor": "^0.0.25", "langchain": "^0.1.25", "languagedetect": "^2.0.0", @@ -93,7 +94,9 @@ "unstructured-client": "^0.9.4", "uuid": "^9.0.1", "wordpos": "^2.1.0", - "xml2js": "^0.6.2" + "xml2js": "^0.6.2", + "zod": "^3.23.4", + "zod-to-json-schema": "^3.23.0" }, "nodemonConfig": { "ignore": [ diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 2b61222..d72dad0 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -86,6 +86,9 @@ dependencies: joplin-turndown-plugin-gfm: specifier: ^1.0.12 version: 1.0.12 + json-schema-to-zod: + specifier: ^2.1.0 + version: 2.1.0 keyword-extractor: specifier: ^0.0.25 version: 0.0.25 @@ -164,6 +167,12 @@ dependencies: xml2js: specifier: ^0.6.2 version: 0.6.2 + zod: + specifier: ^3.23.4 + version: 3.23.4 + zod-to-json-schema: + specifier: ^3.23.0 + version: 3.23.0(zod@3.23.4) devDependencies: '@flydotio/dockerfile': @@ -1200,7 +1209,7 @@ packages: redis: 4.6.13 typesense: 1.7.2(@babel/runtime@7.24.0) uuid: 9.0.1 - zod: 3.22.4 + zod: 3.23.4 transitivePeerDependencies: - encoding dev: false @@ -1218,8 +1227,8 @@ packages: p-queue: 6.6.2 p-retry: 4.6.2 uuid: 9.0.1 - zod: 3.22.4 - zod-to-json-schema: 3.22.4(zod@3.22.4) + zod: 3.23.4 + zod-to-json-schema: 3.23.0(zod@3.23.4) dev: false /@langchain/openai@0.0.18: @@ -1229,8 +1238,8 @@ packages: '@langchain/core': 0.1.43 js-tiktoken: 1.0.10 openai: 4.28.4 - zod: 3.22.4 - zod-to-json-schema: 3.22.4(zod@3.22.4) + zod: 3.23.4 + zod-to-json-schema: 3.23.0(zod@3.23.4) transitivePeerDependencies: - encoding dev: false @@ -3985,6 +3994,11 @@ packages: /json-parse-even-better-errors@2.3.1: resolution: {integrity: sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==} + /json-schema-to-zod@2.1.0: + resolution: {integrity: sha512-7ishNgYY+AbIKeeHcp5xCOdJbdVwSfDx/4V2ktc16LUusCJJbz2fEKdWUmAxhKIiYzhZ9Fp4E8OsAoM/h9cOLA==} + hasBin: true + dev: false + /json5@2.2.3: resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==} engines: {node: '>=6'} @@ -4209,8 +4223,8 @@ packages: redis: 4.6.13 uuid: 9.0.1 yaml: 2.4.1 - zod: 3.22.4 - zod-to-json-schema: 3.22.4(zod@3.22.4) + zod: 3.23.4 + zod-to-json-schema: 3.23.0(zod@3.23.4) transitivePeerDependencies: - '@aws-crypto/sha256-js' - '@aws-sdk/client-bedrock-agent-runtime' @@ -5069,7 +5083,7 @@ packages: sbd: 1.0.19 typescript: 5.4.5 uuid: 9.0.1 - zod: 3.22.4 + zod: 3.23.4 transitivePeerDependencies: - debug dev: false @@ -6185,14 +6199,18 @@ packages: engines: {node: '>=10'} dev: true - /zod-to-json-schema@3.22.4(zod@3.22.4): - resolution: {integrity: sha512-2Ed5dJ+n/O3cU383xSY28cuVi0BCQhF8nYqWU5paEpl7fVdqdAmiLdqLyfblbNdfOFwFfi/mqU4O1pwc60iBhQ==} + /zod-to-json-schema@3.23.0(zod@3.23.4): + resolution: {integrity: sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==} peerDependencies: - zod: ^3.22.4 + zod: ^3.23.3 dependencies: - zod: 3.22.4 + zod: 3.23.4 dev: false /zod@3.22.4: resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==} dev: false + + /zod@3.23.4: + resolution: {integrity: sha512-/AtWOKbBgjzEYYQRNfoGKHObgfAZag6qUJX1VbHo2PRBgS+wfWagEY2mizjfyAPcGesrJOcx/wcl0L9WnVrHFw==} + dev: false diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts new file mode 100644 index 0000000..6e57024 --- /dev/null +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -0,0 +1,99 @@ +import OpenAI from 'openai' +import { z } from 'zod' +import { ScraperLoadResult } from './types' +// import { +// LlamaModel, +// LlamaJsonSchemaGrammar, +// LlamaContext, +// LlamaChatSession, +// GbnfJsonSchema, +// } from 'node-llama-cpp' +import { JsonSchema7Type } from 'zod-to-json-schema' + +export type ScraperCompletionResult> = { + data: z.infer | null + url: string +} + +const defaultPrompt = + 'You are a satistified web scraper. Extract the contents of the webpage' + +function prepareOpenAIPage( + page: ScraperLoadResult +): OpenAI.Chat.Completions.ChatCompletionContentPart[] { + if (page.mode === 'image') { + return [ + { + type: 'image_url', + image_url: { url: `data:image/jpeg;base64,${page.content}` }, + }, + ] + } + + return [{ type: 'text', text: page.content }] +} + +export async function generateOpenAICompletions>( + client: OpenAI, + model: string = 'gpt-3.5-turbo', + page: ScraperLoadResult, + schema: JsonSchema7Type, + prompt: string = defaultPrompt, + temperature?: number +): Promise> { + const openai = client as OpenAI + const content = prepareOpenAIPage(page) + + const completion = await openai.chat.completions.create({ + model, + messages: [ + { + role: 'system', + content: prompt, + }, + { role: 'user', content }, + ], + tools: [ + { + type: 'function', + function: { + name: 'extract_content', + description: 'Extracts the content from the given webpage(s)', + parameters: schema, + }, + }, + ], + tool_choice: 'auto', + temperature, + }) + + const c = completion.choices[0].message.tool_calls[0].function.arguments + return { + data: JSON.parse(c), + url: page.url, + } +} + +// export async function generateLlamaCompletions>( +// model: LlamaModel, +// page: ScraperLoadResult, +// schema: JsonSchema7Type, +// prompt: string = defaultPrompt, +// temperature?: number +// ): Promise> { +// const grammar = new LlamaJsonSchemaGrammar(schema as GbnfJsonSchema) as any // any, because it has weird type inference going on +// const context = new LlamaContext({ model }) +// const session = new LlamaChatSession({ context }) +// const pagePrompt = `${prompt}\n${page.content}` + +// const result = await session.prompt(pagePrompt, { +// grammar, +// temperature, +// }) + +// const parsed = grammar.parse(result) +// return { +// data: parsed, +// url: page.url, +// } +// } diff --git a/apps/api/src/lib/LLM-extraction/types.ts b/apps/api/src/lib/LLM-extraction/types.ts new file mode 100644 index 0000000..6f3a543 --- /dev/null +++ b/apps/api/src/lib/LLM-extraction/types.ts @@ -0,0 +1,10 @@ +export type ScraperLoadOptions = { + mode?: 'html' | 'text' | 'markdown' | 'image' + closeOnFinish?: boolean +} + +export type ScraperLoadResult = { + url: string + content: string + mode: ScraperLoadOptions['mode'] +} \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 6ab3003..80d7fa2 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -140,6 +140,8 @@ export async function scrapSingleUrl( } break; } + + //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(text, pageOptions); return [await parseMarkdown(cleanedHtml), text];