Caleb: got it to a testable state I believe
This commit is contained in:
parent
6ee1f2d3bc
commit
06497729e2
@ -8,7 +8,7 @@ dotenv.config();
|
||||
const TEST_URL = "http://127.0.0.1:3002";
|
||||
|
||||
|
||||
describe("E2E Tests for API Routes", () => {
|
||||
describe.only("E2E Tests for API Routes", () => {
|
||||
beforeAll(() => {
|
||||
process.env.USE_DB_AUTHENTICATION = "true";
|
||||
});
|
||||
@ -252,6 +252,48 @@ const TEST_URL = "http://127.0.0.1:3002";
|
||||
}, 60000); // 60 seconds
|
||||
});
|
||||
|
||||
describe("POST /v0/scrape with LLM Extraction", () => {
  /**
   * Scrapes a live page and asks the API to run LLM extraction on it.
   *
   * The `extractorOptions` keys must match the `ExtractorOptions` type
   * (`mode`, `extractionPrompt`, `extractionSchema`): the controller forwards
   * the object untouched and the scraper only runs extraction when
   * `mode === "llm-extraction"`.
   */
  it("should extract data using LLM extraction mode", async () => {
    const response = await request(TEST_URL)
      .post("/v0/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        url: "https://mendable.ai",
        pageOptions: {
          onlyMainContent: true
        },
        extractorOptions: {
          mode: "llm-extraction",
          extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
          extractionSchema: {
            type: "object",
            properties: {
              company_mission: {
                type: "string"
              },
              supports_sso: {
                type: "boolean"
              },
              is_open_source: {
                type: "boolean"
              }
            },
            required: ["company_mission", "supports_sso", "is_open_source"]
          }
        }
      });

    console.log("Response:", response.body);

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    // NOTE(review): generateOpenAICompletions attaches the result to the
    // document as `llm_extraction`; confirm whether the extracted fields are
    // really expected at the top level of `data`.
    expect(response.body.data).toHaveProperty("company_mission");
    expect(response.body.data).toHaveProperty("supports_sso");
    expect(response.body.data).toHaveProperty("is_open_source");
  }, 60000); // 60 seconds - same budget as the other network-bound scrape tests
});
|
||||
|
||||
describe("GET /is-production", () => {
|
||||
it("should return the production status", async () => {
|
||||
const response = await request(TEST_URL).get("/is-production");
|
||||
|
@ -1,3 +1,4 @@
|
||||
import { ExtractorOptions } from './../lib/entities';
|
||||
import { Request, Response } from "express";
|
||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
||||
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
||||
@ -11,7 +12,8 @@ export async function scrapeHelper(
|
||||
req: Request,
|
||||
team_id: string,
|
||||
crawlerOptions: any,
|
||||
pageOptions: any
|
||||
pageOptions: any,
|
||||
extractorOptions: any
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
error?: string;
|
||||
@ -35,6 +37,7 @@ export async function scrapeHelper(
|
||||
...crawlerOptions,
|
||||
},
|
||||
pageOptions: pageOptions,
|
||||
extractorOptions: extractorOptions
|
||||
});
|
||||
|
||||
const docs = await a.getDocuments(false);
|
||||
@ -79,6 +82,9 @@ export async function scrapeController(req: Request, res: Response) {
|
||||
}
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
||||
const extractorOptions = req.body.extractorOptions ?? {
|
||||
mode: "markdown"
|
||||
}
|
||||
const origin = req.body.origin ?? "api";
|
||||
|
||||
try {
|
||||
@ -96,7 +102,8 @@ export async function scrapeController(req: Request, res: Response) {
|
||||
req,
|
||||
team_id,
|
||||
crawlerOptions,
|
||||
pageOptions
|
||||
pageOptions,
|
||||
extractorOptions
|
||||
);
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
|
48
apps/api/src/lib/LLM-extraction/index.ts
Normal file
48
apps/api/src/lib/LLM-extraction/index.ts
Normal file
@ -0,0 +1,48 @@
|
||||
import Turndown from 'turndown'
import OpenAI from 'openai'
// import { LlamaModel } from 'node-llama-cpp'
import { z } from 'zod'
import { zodToJsonSchema } from 'zod-to-json-schema'
import {
  ScraperCompletionResult,
  generateOpenAICompletions,
} from './models.js'
import { Document, ExtractorOptions } from '../entities.js'
|
||||
|
||||
/**
 * Starts one OpenAI extraction request per scraped document.
 *
 * The previous draft could never succeed: it read `this.client` inside a free
 * function, referenced undeclared `T` / `options`, and switched on
 * `case true`, so every call fell through to the "Invalid client" throw.
 * It also called `generateOpenAICompletions` positionally although that
 * helper takes a single options object.
 *
 * @param documents - Scraped documents to run extraction on; each is
 *   forwarded unchanged to `generateOpenAICompletions`.
 * @param extractionOptions - Supplies `extractionSchema` and
 *   `extractionPrompt`. When the prompt is undefined the helper's own default
 *   prompt applies.
 * @returns One pending promise per input document, in input order. Callers
 *   decide how to await them (e.g. `Promise.all`).
 */
export function generateCompletions(
  documents: Document[],
  extractionOptions: ExtractorOptions
): Promise<Document>[] {
  const schema = extractionOptions.extractionSchema
  const prompt = extractionOptions.extractionPrompt

  // One client is shared by every request in this batch.
  // NOTE(review): relies on the OpenAI SDK picking up its API key from the
  // environment - confirm deployment config.
  const client = new OpenAI()

  //TODO add other models (e.g. a Llama client) once they are supported
  return documents.map((document) =>
    generateOpenAICompletions({ client, document, schema, prompt })
  )
}
|
@ -1,6 +1,8 @@
|
||||
import OpenAI from 'openai'
|
||||
import { z } from 'zod'
|
||||
import { ScraperLoadResult } from './types'
|
||||
import { Document, ExtractorOptions } from "../../lib/entities";
|
||||
|
||||
// import {
|
||||
// LlamaModel,
|
||||
// LlamaJsonSchemaGrammar,
|
||||
@ -8,41 +10,45 @@ import { ScraperLoadResult } from './types'
|
||||
// LlamaChatSession,
|
||||
// GbnfJsonSchema,
|
||||
// } from 'node-llama-cpp'
|
||||
import { JsonSchema7Type } from 'zod-to-json-schema'
|
||||
// import { JsonSchema7Type } from 'zod-to-json-schema'
|
||||
|
||||
export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
|
||||
data: z.infer<T> | null
|
||||
data: any | null
|
||||
url: string
|
||||
}
|
||||
|
||||
const defaultPrompt =
|
||||
'You are a satistified web scraper. Extract the contents of the webpage'
|
||||
|
||||
function prepareOpenAIPage(
|
||||
page: ScraperLoadResult
|
||||
function prepareOpenAIDoc(
|
||||
document: Document
|
||||
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
|
||||
if (page.mode === 'image') {
|
||||
return [
|
||||
{
|
||||
type: 'image_url',
|
||||
image_url: { url: `data:image/jpeg;base64,${page.content}` },
|
||||
},
|
||||
]
|
||||
|
||||
// Check if the markdown content exists in the document
|
||||
if (!document.markdown) {
|
||||
throw new Error("Markdown content is missing in the document.");
|
||||
}
|
||||
|
||||
return [{ type: 'text', text: page.content }]
|
||||
return [{ type: 'text', text: document.markdown }]
|
||||
}
|
||||
|
||||
export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
|
||||
export async function generateOpenAICompletions<T>({
|
||||
client,
|
||||
model = 'gpt-3.5-turbo',
|
||||
document,
|
||||
schema, //TODO - add zod dynamic type checking
|
||||
prompt = defaultPrompt,
|
||||
temperature
|
||||
}: {
|
||||
client: OpenAI,
|
||||
model: string = 'gpt-3.5-turbo',
|
||||
page: ScraperLoadResult,
|
||||
schema: JsonSchema7Type,
|
||||
prompt: string = defaultPrompt,
|
||||
model?: string,
|
||||
document: Document,
|
||||
schema: any, // This should be replaced with a proper Zod schema type when available
|
||||
prompt?: string,
|
||||
temperature?: number
|
||||
): Promise<ScraperCompletionResult<T>> {
|
||||
}): Promise<Document> {
|
||||
const openai = client as OpenAI
|
||||
const content = prepareOpenAIPage(page)
|
||||
const content = prepareOpenAIDoc(document)
|
||||
|
||||
const completion = await openai.chat.completions.create({
|
||||
model,
|
||||
@ -68,10 +74,16 @@ export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
|
||||
})
|
||||
|
||||
const c = completion.choices[0].message.tool_calls[0].function.arguments
|
||||
|
||||
// Extract the LLM extraction content from the completion response
|
||||
const llmExtraction = c;
|
||||
|
||||
// Return the document with the LLM extraction content added
|
||||
return {
|
||||
data: JSON.parse(c),
|
||||
url: page.url,
|
||||
}
|
||||
...document,
|
||||
llm_extraction: llmExtraction
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
|
||||
|
@ -3,8 +3,3 @@ export type ScraperLoadOptions = {
|
||||
closeOnFinish?: boolean
|
||||
}
|
||||
|
||||
export type ScraperLoadResult = {
|
||||
url: string
|
||||
content: string
|
||||
mode: ScraperLoadOptions['mode']
|
||||
}
|
@ -16,6 +16,12 @@ export type PageOptions = {
|
||||
|
||||
};
|
||||
|
||||
/**
 * Per-request settings that select how scraped content is post-processed.
 * Callers that omit it fall back to `{ mode: "markdown" }`.
 */
export type ExtractorOptions = {
|
||||
// "llm-extraction" is the value the scraper checks for before running the
// LLM extraction step; "markdown" is the default.
mode: "markdown" | "llm-extraction";
|
||||
// Optional instruction text read by generateCompletions for the LLM call.
extractionPrompt?: string;
|
||||
// Optional schema describing the desired output shape.
// NOTE(review): presumably a JSON-Schema object - confirm against callers.
extractionSchema?: Record<string, any>;
|
||||
}
|
||||
|
||||
export type SearchOptions = {
|
||||
limit?: number;
|
||||
tbs?: string;
|
||||
@ -38,6 +44,7 @@ export type WebScraperOptions = {
|
||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||
};
|
||||
pageOptions?: PageOptions;
|
||||
extractorOptions?: ExtractorOptions;
|
||||
concurrentRequests?: number;
|
||||
};
|
||||
|
||||
@ -50,6 +57,7 @@ export class Document {
|
||||
url?: string; // Used only in /search for now
|
||||
content: string;
|
||||
markdown?: string;
|
||||
llm_extraction?: string;
|
||||
createdAt?: Date;
|
||||
updatedAt?: Date;
|
||||
type?: string;
|
||||
|
@ -1,4 +1,4 @@
|
||||
import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
|
||||
import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities";
|
||||
import { Progress } from "../../lib/entities";
|
||||
import { scrapSingleUrl } from "./single_url";
|
||||
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
||||
@ -7,6 +7,8 @@ import { getValue, setValue } from "../../services/redis";
|
||||
import { getImageDescription } from "./utils/imageDescription";
|
||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||
import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
|
||||
import OpenAI from 'openai'
|
||||
|
||||
|
||||
export class WebScraperDataProvider {
|
||||
private urls: string[] = [""];
|
||||
@ -19,6 +21,7 @@ export class WebScraperDataProvider {
|
||||
private concurrentRequests: number = 20;
|
||||
private generateImgAltText: boolean = false;
|
||||
private pageOptions?: PageOptions;
|
||||
private extractorOptions?: ExtractorOptions;
|
||||
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
||||
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";
|
||||
|
||||
@ -191,6 +194,22 @@ export class WebScraperDataProvider {
|
||||
documents = await this.getSitemapData(baseUrl, documents);
|
||||
documents = documents.concat(pdfDocuments);
|
||||
|
||||
|
||||
|
||||
|
||||
if(this.extractorOptions.mode === "llm-extraction") {
|
||||
|
||||
// const llm = new OpenAI()
|
||||
// generateCompletions(
|
||||
// client=llm,
|
||||
// page =,
|
||||
// schema=
|
||||
|
||||
// )
|
||||
|
||||
|
||||
}
|
||||
|
||||
await this.setCachedDocuments(documents);
|
||||
documents = this.removeChildLinks(documents);
|
||||
documents = documents.splice(0, this.limit);
|
||||
@ -376,6 +395,7 @@ export class WebScraperDataProvider {
|
||||
this.generateImgAltText =
|
||||
options.crawlerOptions?.generateImgAltText ?? false;
|
||||
this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
|
||||
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
||||
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||
|
||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find the source of the issue, so adding this check
|
||||
|
Loading…
Reference in New Issue
Block a user