Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper
This commit is contained in:
parent
e6d7a4761d
commit
6ee1f2d3bc
@ -68,6 +68,7 @@
|
|||||||
"gpt3-tokenizer": "^1.1.5",
|
"gpt3-tokenizer": "^1.1.5",
|
||||||
"ioredis": "^5.3.2",
|
"ioredis": "^5.3.2",
|
||||||
"joplin-turndown-plugin-gfm": "^1.0.12",
|
"joplin-turndown-plugin-gfm": "^1.0.12",
|
||||||
|
"json-schema-to-zod": "^2.1.0",
|
||||||
"keyword-extractor": "^0.0.25",
|
"keyword-extractor": "^0.0.25",
|
||||||
"langchain": "^0.1.25",
|
"langchain": "^0.1.25",
|
||||||
"languagedetect": "^2.0.0",
|
"languagedetect": "^2.0.0",
|
||||||
@ -93,7 +94,9 @@
|
|||||||
"unstructured-client": "^0.9.4",
|
"unstructured-client": "^0.9.4",
|
||||||
"uuid": "^9.0.1",
|
"uuid": "^9.0.1",
|
||||||
"wordpos": "^2.1.0",
|
"wordpos": "^2.1.0",
|
||||||
"xml2js": "^0.6.2"
|
"xml2js": "^0.6.2",
|
||||||
|
"zod": "^3.23.4",
|
||||||
|
"zod-to-json-schema": "^3.23.0"
|
||||||
},
|
},
|
||||||
"nodemonConfig": {
|
"nodemonConfig": {
|
||||||
"ignore": [
|
"ignore": [
|
||||||
|
@ -86,6 +86,9 @@ dependencies:
|
|||||||
joplin-turndown-plugin-gfm:
|
joplin-turndown-plugin-gfm:
|
||||||
specifier: ^1.0.12
|
specifier: ^1.0.12
|
||||||
version: 1.0.12
|
version: 1.0.12
|
||||||
|
json-schema-to-zod:
|
||||||
|
specifier: ^2.1.0
|
||||||
|
version: 2.1.0
|
||||||
keyword-extractor:
|
keyword-extractor:
|
||||||
specifier: ^0.0.25
|
specifier: ^0.0.25
|
||||||
version: 0.0.25
|
version: 0.0.25
|
||||||
@ -164,6 +167,12 @@ dependencies:
|
|||||||
xml2js:
|
xml2js:
|
||||||
specifier: ^0.6.2
|
specifier: ^0.6.2
|
||||||
version: 0.6.2
|
version: 0.6.2
|
||||||
|
zod:
|
||||||
|
specifier: ^3.23.4
|
||||||
|
version: 3.23.4
|
||||||
|
zod-to-json-schema:
|
||||||
|
specifier: ^3.23.0
|
||||||
|
version: 3.23.0(zod@3.23.4)
|
||||||
|
|
||||||
devDependencies:
|
devDependencies:
|
||||||
'@flydotio/dockerfile':
|
'@flydotio/dockerfile':
|
||||||
@ -1200,7 +1209,7 @@ packages:
|
|||||||
redis: 4.6.13
|
redis: 4.6.13
|
||||||
typesense: 1.7.2(@babel/runtime@7.24.0)
|
typesense: 1.7.2(@babel/runtime@7.24.0)
|
||||||
uuid: 9.0.1
|
uuid: 9.0.1
|
||||||
zod: 3.22.4
|
zod: 3.23.4
|
||||||
transitivePeerDependencies:
|
transitivePeerDependencies:
|
||||||
- encoding
|
- encoding
|
||||||
dev: false
|
dev: false
|
||||||
@ -1218,8 +1227,8 @@ packages:
|
|||||||
p-queue: 6.6.2
|
p-queue: 6.6.2
|
||||||
p-retry: 4.6.2
|
p-retry: 4.6.2
|
||||||
uuid: 9.0.1
|
uuid: 9.0.1
|
||||||
zod: 3.22.4
|
zod: 3.23.4
|
||||||
zod-to-json-schema: 3.22.4(zod@3.22.4)
|
zod-to-json-schema: 3.23.0(zod@3.23.4)
|
||||||
dev: false
|
dev: false
|
||||||
|
|
||||||
/@langchain/openai@0.0.18:
|
/@langchain/openai@0.0.18:
|
||||||
@ -1229,8 +1238,8 @@ packages:
|
|||||||
'@langchain/core': 0.1.43
|
'@langchain/core': 0.1.43
|
||||||
js-tiktoken: 1.0.10
|
js-tiktoken: 1.0.10
|
||||||
openai: 4.28.4
|
openai: 4.28.4
|
||||||
zod: 3.22.4
|
zod: 3.23.4
|
||||||
zod-to-json-schema: 3.22.4(zod@3.22.4)
|
zod-to-json-schema: 3.23.0(zod@3.23.4)
|
||||||
transitivePeerDependencies:
|
transitivePeerDependencies:
|
||||||
- encoding
|
- encoding
|
||||||
dev: false
|
dev: false
|
||||||
@ -3985,6 +3994,11 @@ packages:
|
|||||||
/json-parse-even-better-errors@2.3.1:
|
/json-parse-even-better-errors@2.3.1:
|
||||||
resolution: {integrity: sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==}
|
resolution: {integrity: sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==}
|
||||||
|
|
||||||
|
/json-schema-to-zod@2.1.0:
|
||||||
|
resolution: {integrity: sha512-7ishNgYY+AbIKeeHcp5xCOdJbdVwSfDx/4V2ktc16LUusCJJbz2fEKdWUmAxhKIiYzhZ9Fp4E8OsAoM/h9cOLA==}
|
||||||
|
hasBin: true
|
||||||
|
dev: false
|
||||||
|
|
||||||
/json5@2.2.3:
|
/json5@2.2.3:
|
||||||
resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==}
|
resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==}
|
||||||
engines: {node: '>=6'}
|
engines: {node: '>=6'}
|
||||||
@ -4209,8 +4223,8 @@ packages:
|
|||||||
redis: 4.6.13
|
redis: 4.6.13
|
||||||
uuid: 9.0.1
|
uuid: 9.0.1
|
||||||
yaml: 2.4.1
|
yaml: 2.4.1
|
||||||
zod: 3.22.4
|
zod: 3.23.4
|
||||||
zod-to-json-schema: 3.22.4(zod@3.22.4)
|
zod-to-json-schema: 3.23.0(zod@3.23.4)
|
||||||
transitivePeerDependencies:
|
transitivePeerDependencies:
|
||||||
- '@aws-crypto/sha256-js'
|
- '@aws-crypto/sha256-js'
|
||||||
- '@aws-sdk/client-bedrock-agent-runtime'
|
- '@aws-sdk/client-bedrock-agent-runtime'
|
||||||
@ -5069,7 +5083,7 @@ packages:
|
|||||||
sbd: 1.0.19
|
sbd: 1.0.19
|
||||||
typescript: 5.4.5
|
typescript: 5.4.5
|
||||||
uuid: 9.0.1
|
uuid: 9.0.1
|
||||||
zod: 3.22.4
|
zod: 3.23.4
|
||||||
transitivePeerDependencies:
|
transitivePeerDependencies:
|
||||||
- debug
|
- debug
|
||||||
dev: false
|
dev: false
|
||||||
@ -6185,14 +6199,18 @@ packages:
|
|||||||
engines: {node: '>=10'}
|
engines: {node: '>=10'}
|
||||||
dev: true
|
dev: true
|
||||||
|
|
||||||
/zod-to-json-schema@3.22.4(zod@3.22.4):
|
/zod-to-json-schema@3.23.0(zod@3.23.4):
|
||||||
resolution: {integrity: sha512-2Ed5dJ+n/O3cU383xSY28cuVi0BCQhF8nYqWU5paEpl7fVdqdAmiLdqLyfblbNdfOFwFfi/mqU4O1pwc60iBhQ==}
|
resolution: {integrity: sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==}
|
||||||
peerDependencies:
|
peerDependencies:
|
||||||
zod: ^3.22.4
|
zod: ^3.23.3
|
||||||
dependencies:
|
dependencies:
|
||||||
zod: 3.22.4
|
zod: 3.23.4
|
||||||
dev: false
|
dev: false
|
||||||
|
|
||||||
/zod@3.22.4:
|
/zod@3.22.4:
|
||||||
resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==}
|
resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==}
|
||||||
dev: false
|
dev: false
|
||||||
|
|
||||||
|
/zod@3.23.4:
|
||||||
|
resolution: {integrity: sha512-/AtWOKbBgjzEYYQRNfoGKHObgfAZag6qUJX1VbHo2PRBgS+wfWagEY2mizjfyAPcGesrJOcx/wcl0L9WnVrHFw==}
|
||||||
|
dev: false
|
||||||
|
99
apps/api/src/lib/LLM-extraction/models.ts
Normal file
99
apps/api/src/lib/LLM-extraction/models.ts
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
import OpenAI from 'openai'
|
||||||
|
import { z } from 'zod'
|
||||||
|
import { ScraperLoadResult } from './types'
|
||||||
|
// import {
|
||||||
|
// LlamaModel,
|
||||||
|
// LlamaJsonSchemaGrammar,
|
||||||
|
// LlamaContext,
|
||||||
|
// LlamaChatSession,
|
||||||
|
// GbnfJsonSchema,
|
||||||
|
// } from 'node-llama-cpp'
|
||||||
|
import { JsonSchema7Type } from 'zod-to-json-schema'
|
||||||
|
|
||||||
|
/**
 * Result of running an LLM extraction over a single page.
 *
 * - `data`: the extracted value, typed as the inferred output of the Zod
 *   schema `T`, or `null` when no value is available.
 * - `url`: the URL of the page the data was extracted from.
 */
export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
|
||||||
|
data: z.infer<T> | null
|
||||||
|
url: string
|
||||||
|
}
|
||||||
|
|
||||||
|
// System prompt used by generateOpenAICompletions when the caller does not
// pass its own `prompt`.
// NOTE(review): "satistified" looks like a typo (possibly "sophisticated" or
// "satisfied") — confirm the intended wording before changing it, since this
// text is sent to the model verbatim.
const defaultPrompt =
|
||||||
|
'You are a satistified web scraper. Extract the contents of the webpage'
|
||||||
|
|
||||||
|
/**
 * Turns a loaded page into the content parts of an OpenAI chat message.
 *
 * Pages loaded in `image` mode become a single `image_url` part whose URL is
 * a `data:image/jpeg;base64,` URL built from `page.content` (the content is
 * assumed to already be base64-encoded JPEG data — confirm against the
 * loader). Every other mode becomes a single `text` part carrying
 * `page.content` unchanged.
 *
 * @param page - The loaded page to convert.
 * @returns A one-element array of chat completion content parts.
 */
function prepareOpenAIPage(
  page: ScraperLoadResult
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
  const isImagePage = page.mode === 'image'

  // Anything that is not an image is forwarded to the model as plain text.
  if (!isImagePage) {
    return [{ type: 'text', text: page.content }]
  }

  const dataUrl = `data:image/jpeg;base64,${page.content}`
  return [{ type: 'image_url', image_url: { url: dataUrl } }]
}
|
||||||
|
|
||||||
|
/**
 * Asks an OpenAI chat model to extract structured data from a loaded page.
 *
 * The page is sent as the user message, and `schema` is exposed to the model
 * as the parameters of a single `extract_content` function tool. The
 * arguments of the model's first tool call are parsed as JSON and returned as
 * the extracted data.
 *
 * @param client - OpenAI client used to issue the chat completion request.
 * @param model - Chat model name; defaults to `gpt-3.5-turbo`.
 * @param page - Loaded page whose content is sent to the model.
 * @param schema - JSON schema describing the data to extract.
 * @param prompt - System prompt; defaults to `defaultPrompt`.
 * @param temperature - Optional sampling temperature, passed through as-is.
 * @returns The parsed tool-call arguments together with the page URL. `data`
 *   is `null` when the model replied without calling the tool.
 * @throws SyntaxError if the tool-call arguments are not valid JSON.
 */
export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
  client: OpenAI,
  model: string = 'gpt-3.5-turbo',
  page: ScraperLoadResult,
  schema: JsonSchema7Type,
  prompt: string = defaultPrompt,
  temperature?: number
): Promise<ScraperCompletionResult<T>> {
  const content = prepareOpenAIPage(page)

  const completion = await client.chat.completions.create({
    model,
    messages: [
      {
        role: 'system',
        content: prompt,
      },
      { role: 'user', content },
    ],
    tools: [
      {
        type: 'function',
        function: {
          name: 'extract_content',
          description: 'Extracts the content from the given webpage(s)',
          parameters: schema,
        },
      },
    ],
    tool_choice: 'auto',
    temperature,
  })

  // With tool_choice 'auto' the model is free to answer in plain text, in
  // which case `tool_calls` is absent. Report that as "no data" instead of
  // crashing on an undefined access.
  const toolArguments =
    completion.choices[0]?.message?.tool_calls?.[0]?.function?.arguments
  if (toolArguments == null) {
    return {
      data: null,
      url: page.url,
    }
  }

  return {
    data: JSON.parse(toolArguments),
    url: page.url,
  }
}
|
||||||
|
|
||||||
|
// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
|
||||||
|
// model: LlamaModel,
|
||||||
|
// page: ScraperLoadResult,
|
||||||
|
// schema: JsonSchema7Type,
|
||||||
|
// prompt: string = defaultPrompt,
|
||||||
|
// temperature?: number
|
||||||
|
// ): Promise<ScraperCompletionResult<T>> {
|
||||||
|
// const grammar = new LlamaJsonSchemaGrammar(schema as GbnfJsonSchema) as any // any, because it has weird type inference going on
|
||||||
|
// const context = new LlamaContext({ model })
|
||||||
|
// const session = new LlamaChatSession({ context })
|
||||||
|
// const pagePrompt = `${prompt}\n${page.content}`
|
||||||
|
|
||||||
|
// const result = await session.prompt(pagePrompt, {
|
||||||
|
// grammar,
|
||||||
|
// temperature,
|
||||||
|
// })
|
||||||
|
|
||||||
|
// const parsed = grammar.parse(result)
|
||||||
|
// return {
|
||||||
|
// data: parsed,
|
||||||
|
// url: page.url,
|
||||||
|
// }
|
||||||
|
// }
|
10
apps/api/src/lib/LLM-extraction/types.ts
Normal file
10
apps/api/src/lib/LLM-extraction/types.ts
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
/**
 * Options for loading a page.
 *
 * - `mode`: which representation of the page is requested: `'html'`,
 *   `'text'`, `'markdown'` or `'image'`. Optional.
 * - `closeOnFinish`: optional flag.
 *   NOTE(review): presumably controls whether the page/browser is closed once
 *   loading completes; it is not used in the code visible here — confirm
 *   against the loader.
 */
export type ScraperLoadOptions = {
|
||||||
|
mode?: 'html' | 'text' | 'markdown' | 'image'
|
||||||
|
closeOnFinish?: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * A page that has been loaded and is ready to be handed to an LLM.
 *
 * - `url`: the URL of the page.
 * - `content`: the page content as a string (presumably in the
 *   representation named by `mode` — confirm against the loader).
 * - `mode`: same type as `ScraperLoadOptions['mode']`; because that property
 *   is optional, this may be `undefined`.
 */
export type ScraperLoadResult = {
|
||||||
|
url: string
|
||||||
|
content: string
|
||||||
|
mode: ScraperLoadOptions['mode']
|
||||||
|
}
|
@ -140,6 +140,8 @@ export async function scrapSingleUrl(
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//* TODO: add an option to return markdown or structured/extracted content
|
||||||
let cleanedHtml = removeUnwantedElements(text, pageOptions);
|
let cleanedHtml = removeUnwantedElements(text, pageOptions);
|
||||||
|
|
||||||
return [await parseMarkdown(cleanedHtml), text];
|
return [await parseMarkdown(cleanedHtml), text];
|
||||||
|
Loading…
Reference in New Issue
Block a user