From 06497729e2f97ba1449bf218eabbe96b3ad8f877 Mon Sep 17 00:00:00 2001
From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com>
Date: Sun, 28 Apr 2024 15:52:09 -0700
Subject: [PATCH] Caleb: got it to a testable state I believe

---
 .../src/__tests__/e2e_withAuth/index.test.ts | 44 ++++++++++++++-
 apps/api/src/controllers/scrape.ts           | 11 +++-
 apps/api/src/lib/LLM-extraction/index.ts     | 48 ++++++++++++++++
 apps/api/src/lib/LLM-extraction/models.ts    | 56 +++++++++++--------
 apps/api/src/lib/LLM-extraction/types.ts     |  5 --
 apps/api/src/lib/entities.ts                 |  8 +++
 apps/api/src/scraper/WebScraper/index.ts     | 22 +++++++-
 7 files changed, 163 insertions(+), 31 deletions(-)
 create mode 100644 apps/api/src/lib/LLM-extraction/index.ts

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 2b4c7e9..fcc7062 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -8,7 +8,7 @@ dotenv.config();
 
 const TEST_URL = "http://127.0.0.1:3002";
 
-describe("E2E Tests for API Routes", () => {
+describe.only("E2E Tests for API Routes", () => {
   beforeAll(() => {
     process.env.USE_DB_AUTHENTICATION = "true";
   });
@@ -252,6 +252,48 @@ const TEST_URL = "http://127.0.0.1:3002";
     }, 60000); // 60 seconds
   });
 
+  describe("POST /v0/scrape with LLM Extraction", () => {
+    it("should extract data using LLM extraction mode", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai",
+          pageOptions: {
+            onlyMainContent: true
+          },
+          extractorOptions: {
+            mode: "llm-extraction",
+            extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+            extractionSchema: {
+              type: "object",
+              properties: {
+                company_mission: {
+                  type: "string"
+                },
+                supports_sso: {
+                  type: "boolean"
+                },
+                is_open_source: {
+                  type: "boolean"
+                }
+              },
+              required: ["company_mission", "supports_sso", "is_open_source"]
+            }
+          }
+        });
+
+      console.log("Response:", response.body);
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("company_mission");
+      expect(response.body.data).toHaveProperty("supports_sso");
+      expect(response.body.data).toHaveProperty("is_open_source");
+    });
+  });
+
   describe("GET /is-production", () => {
     it("should return the production status", async () => {
       const response = await request(TEST_URL).get("/is-production");
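
The new e2e test above only asserts the shape of the response. For orientation, the expect() calls imply a body roughly like the following sketch (hand-written illustration in TypeScript, not output captured from the API; all field values are invented):

    // Shape implied by the assertions in the new test
    type LlmExtractionResponse = {
      data: {
        company_mission: string;
        supports_sso: boolean;
        is_open_source: boolean;
      };
    };

    // Example value, for illustration only
    const example: LlmExtractionResponse = {
      data: {
        company_mission: "Build tools for LLM-powered search",
        supports_sso: true,
        is_open_source: false,
      },
    };
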
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index eebdcb4..13c4dd2 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -1,3 +1,4 @@
+import { ExtractorOptions } from './../lib/entities';
 import { Request, Response } from "express";
 import { WebScraperDataProvider } from "../scraper/WebScraper";
 import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
@@ -11,7 +12,8 @@ export async function scrapeHelper(
   req: Request,
   team_id: string,
   crawlerOptions: any,
-  pageOptions: any
+  pageOptions: any,
+  extractorOptions: ExtractorOptions
 ): Promise<{
   success: boolean;
   error?: string;
@@ -35,6 +37,7 @@ export async function scrapeHelper(
       ...crawlerOptions,
     },
     pageOptions: pageOptions,
+    extractorOptions: extractorOptions
   });
 
   const docs = await a.getDocuments(false);
@@ -79,6 +82,9 @@ export async function scrapeController(req: Request, res: Response) {
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
   const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+  const extractorOptions = req.body.extractorOptions ?? {
+    mode: "markdown"
+  }
   const origin = req.body.origin ?? "api";
 
   try {
@@ -96,7 +102,8 @@ export async function scrapeController(req: Request, res: Response) {
       req,
       team_id,
      crawlerOptions,
-      pageOptions
+      pageOptions,
+      extractorOptions
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts
new file mode 100644
index 0000000..b52c931
--- /dev/null
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -0,0 +1,48 @@
+import OpenAI from 'openai'
+// import { LlamaModel } from 'node-llama-cpp'
+import { generateOpenAICompletions } from './models'
+import { Document, ExtractorOptions } from '../entities'
+
+// Generate an LLM extraction completion for each document, using the supplied client
+export function generateCompletions(
+  client: OpenAI,
+  documents: Document[],
+  extractionOptions: ExtractorOptions
+): Promise<Document>[] {
+  // TODO: validate the user-supplied schema (e.g. with zod / zodToJsonSchema) before sending it to the model
+  const schema = extractionOptions.extractionSchema;
+  const prompt = extractionOptions.extractionPrompt;
+
+  const loader = documents.map(async (document) => {
+    switch (client.constructor) {
+      case OpenAI:
+        return generateOpenAICompletions({
+          client,
+          document,
+          schema,
+          prompt,
+        })
+      // TODO add other models
+      // case LlamaModel:
+      //   return generateLlamaCompletions(
+      //     client,
+      //     document,
+      //     schema,
+      //     prompt,
+      //   )
+      default:
+        throw new Error('Invalid client')
+    }
+  })
+
+  return loader
+}
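
A rough usage sketch for the new generateCompletions helper (assuming the signature in the hunk above, an OPENAI_API_KEY in the environment, and an already-scraped documents array; import paths are illustrative, not taken from the patch):

    import OpenAI from "openai";
    import { generateCompletions } from "./lib/LLM-extraction";
    import { Document, ExtractorOptions } from "./lib/entities";

    async function extractFromDocuments(documents: Document[]): Promise<Document[]> {
      const client = new OpenAI(); // picks up OPENAI_API_KEY from the environment
      const extractorOptions: ExtractorOptions = {
        mode: "llm-extraction",
        extractionPrompt: "Summarise what the company does",
        extractionSchema: {
          type: "object",
          properties: { summary: { type: "string" } },
          required: ["summary"],
        },
      };
      // generateCompletions returns one promise per document; resolve them together
      return Promise.all(generateCompletions(client, documents, extractorOptions));
    }
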
diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
index 6e57024..7f50f72 100644
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -1,6 +1,8 @@
 import OpenAI from 'openai'
 import { z } from 'zod'
 import { ScraperLoadResult } from './types'
+import { Document, ExtractorOptions } from "../../lib/entities";
+
 // import {
 //   LlamaModel,
 //   LlamaJsonSchemaGrammar,
@@ -8,41 +10,45 @@ import { ScraperLoadResult } from './types'
 //   LlamaChatSession,
 //   GbnfJsonSchema,
 // } from 'node-llama-cpp'
-import { JsonSchema7Type } from 'zod-to-json-schema'
+// import { JsonSchema7Type } from 'zod-to-json-schema'
 
 export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
-  data: z.infer<T> | null
+  data: any | null
   url: string
 }
 
 const defaultPrompt = 'You are a satisfied web scraper. Extract the contents of the webpage'
 
-function prepareOpenAIPage(
-  page: ScraperLoadResult
+function prepareOpenAIDoc(
+  document: Document
 ): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
-  if (page.mode === 'image') {
-    return [
-      {
-        type: 'image_url',
-        image_url: { url: `data:image/jpeg;base64,${page.content}` },
-      },
-    ]
+
+  // Check if the markdown content exists in the document
+  if (!document.markdown) {
+    throw new Error("Markdown content is missing in the document.");
   }
 
-  return [{ type: 'text', text: page.content }]
+  return [{ type: 'text', text: document.markdown }]
 }
 
-export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
+export async function generateOpenAICompletions({
+  client,
+  model = 'gpt-3.5-turbo',
+  document,
+  schema, // TODO - add zod dynamic type checking
+  prompt = defaultPrompt,
+  temperature
+}: {
   client: OpenAI,
-  model: string = 'gpt-3.5-turbo',
-  page: ScraperLoadResult,
-  schema: JsonSchema7Type,
-  prompt: string = defaultPrompt,
+  model?: string,
+  document: Document,
+  schema: any, // This should be replaced with a proper Zod schema type when available
+  prompt?: string,
   temperature?: number
-): Promise<ScraperCompletionResult<T>> {
+}): Promise<Document> {
   const openai = client as OpenAI
-  const content = prepareOpenAIPage(page)
+  const content = prepareOpenAIDoc(document)
 
   const completion = await openai.chat.completions.create({
     model,
@@ -68,10 +74,16 @@ export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
   })
 
   const c = completion.choices[0].message.tool_calls[0].function.arguments
+
+  // Extract the LLM extraction content from the completion response
+  const llmExtraction = c;
+
+  // Return the document with the LLM extraction content added
   return {
-    data: JSON.parse(c),
-    url: page.url,
-  }
+    ...document,
+    llm_extraction: llmExtraction
+  };
+
 }
 
 // export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
diff --git a/apps/api/src/lib/LLM-extraction/types.ts b/apps/api/src/lib/LLM-extraction/types.ts
index 6f3a543..2112189 100644
--- a/apps/api/src/lib/LLM-extraction/types.ts
+++ b/apps/api/src/lib/LLM-extraction/types.ts
@@ -3,8 +3,3 @@ export type ScraperLoadOptions = {
   closeOnFinish?: boolean
 }
 
-export type ScraperLoadResult = {
-  url: string
-  content: string
-  mode: ScraperLoadOptions['mode']
-}
\ No newline at end of file
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 7b46305..c492c4d 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -16,6 +16,12 @@ export type PageOptions = {
 };
 
+export type ExtractorOptions = {
+  mode: "markdown" | "llm-extraction";
+  extractionPrompt?: string;
+  extractionSchema?: Record<string, any>;
+}
+
 export type SearchOptions = {
   limit?: number;
   tbs?: string;
@@ -38,6 +44,7 @@ export type WebScraperOptions = {
     replaceAllPathsWithAbsolutePaths?: boolean;
   };
   pageOptions?: PageOptions;
+  extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
 };
 
@@ -50,6 +57,7 @@ export class Document {
   url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
+  llm_extraction?: string;
   createdAt?: Date;
   updatedAt?: Date;
   type?: string;
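
For context, the new ExtractorOptions type plugs into WebScraperOptions alongside pageOptions. A hand-written example follows; fields outside the hunks above (urls, mode, the exact crawlerOptions shape) are assumptions about the surrounding codebase rather than part of this patch:

    import { WebScraperOptions } from "./lib/entities";

    const options: WebScraperOptions = {
      urls: ["https://mendable.ai"],
      mode: "single_urls",
      crawlerOptions: {},
      pageOptions: { onlyMainContent: true },
      extractorOptions: {
        mode: "llm-extraction",
        extractionPrompt: "Find the company's mission",
        extractionSchema: {
          type: "object",
          properties: { company_mission: { type: "string" } },
          required: ["company_mission"],
        },
      },
    };
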
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 1904ef9..fd22ef8 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -1,4 +1,4 @@
-import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
+import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities";
 import { Progress } from "../../lib/entities";
 import { scrapSingleUrl } from "./single_url";
 import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
@@ -7,6 +7,8 @@ import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/imageDescription";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
+import OpenAI from 'openai'
+
 
 export class WebScraperDataProvider {
   private urls: string[] = [""];
@@ -19,6 +21,7 @@ export class WebScraperDataProvider {
   private concurrentRequests: number = 20;
   private generateImgAltText: boolean = false;
   private pageOptions?: PageOptions;
+  private extractorOptions?: ExtractorOptions;
   private replaceAllPathsWithAbsolutePaths?: boolean = false;
   private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
     "gpt-4-turbo";
@@ -191,6 +194,22 @@ export class WebScraperDataProvider {
       documents = await this.getSitemapData(baseUrl, documents);
       documents = documents.concat(pdfDocuments);
 
+      if (this.extractorOptions?.mode === "llm-extraction") {
+        // TODO: run LLM extraction over the scraped documents, e.g.
+        // const llm = new OpenAI()
+        // generateCompletions(llm, documents, this.extractorOptions)
+      }
+
       await this.setCachedDocuments(documents);
       documents = this.removeChildLinks(documents);
       documents = documents.splice(0, this.limit);
@@ -376,6 +395,7 @@ export class WebScraperDataProvider {
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
     this.pageOptions = options.pageOptions ?? { onlyMainContent: false };
+    this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }
     this.replaceAllPathsWithAbsolutePaths =
       options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
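
One way the commented-out block above could eventually be wired up, reusing the generateCompletions helper added earlier in this patch. This is a sketch under those assumptions, not part of the commit; it also presumes generateCompletions is imported from ../../lib/LLM-extraction and that OPENAI_API_KEY is configured:

    if (this.extractorOptions?.mode === "llm-extraction") {
      const llm = new OpenAI(); // assumes OPENAI_API_KEY is set in the environment
      // Replace each scraped document with a copy carrying an llm_extraction field
      documents = await Promise.all(
        generateCompletions(llm, documents, this.extractorOptions)
      );
    }
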