
Caleb: got it to a testable state I believe

Caleb Peffer 2024-04-28 15:52:09 -07:00
parent 6ee1f2d3bc
commit 06497729e2
7 changed files with 163 additions and 31 deletions

View File

@@ -8,7 +8,7 @@ dotenv.config();
 const TEST_URL = "http://127.0.0.1:3002";
-describe("E2E Tests for API Routes", () => {
+describe.only("E2E Tests for API Routes", () => {
   beforeAll(() => {
     process.env.USE_DB_AUTHENTICATION = "true";
   });
@@ -252,6 +252,48 @@ const TEST_URL = "http://127.0.0.1:3002";
     }, 60000); // 60 seconds
   });
+  describe("POST /v0/scrape with LLM Extraction", () => {
+    it("should extract data using LLM extraction mode", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai",
+          pageOptions: {
+            onlyMainContent: true
+          },
+          extractorOptions: {
+            extractorMode: "llm-extract",
+            extractor_prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+            extractorSchema: {
+              type: "object",
+              properties: {
+                company_mission: {
+                  type: "string"
+                },
+                supports_sso: {
+                  type: "boolean"
+                },
+                is_open_source: {
+                  type: "boolean"
+                }
+              },
+              required: ["company_mission", "supports_sso", "is_open_source"]
+            }
+          }
+        });
+      console.log("Response:", response.body);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("company_mission");
+      expect(response.body.data).toHaveProperty("supports_sso");
+      expect(response.body.data).toHaveProperty("is_open_source");
+    });
+  });
 describe("GET /is-production", () => {
   it("should return the production status", async () => {
     const response = await request(TEST_URL).get("/is-production");
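Note: for context, an equivalent standalone client call for the new route might look like the sketch below. It assumes exactly the request shape the test exercises; the URL and API key are placeholders. The test sends extractorMode/extractor_prompt/extractorSchema, while the ExtractorOptions type added later in this commit uses mode/extractionPrompt/extractionSchema, so one side will need reconciling.

```ts
// Hypothetical standalone client for the new llm-extract mode; the request
// body mirrors the test above. The URL and TEST_API_KEY are placeholders.
async function scrapeWithLLMExtraction() {
  const res = await fetch("http://127.0.0.1:3002/v0/scrape", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      url: "https://mendable.ai",
      pageOptions: { onlyMainContent: true },
      extractorOptions: {
        extractorMode: "llm-extract",
        extractor_prompt: "Find the company's mission and whether it supports SSO and is open source",
        extractorSchema: {
          type: "object",
          properties: {
            company_mission: { type: "string" },
            supports_sso: { type: "boolean" },
            is_open_source: { type: "boolean" },
          },
          required: ["company_mission", "supports_sso", "is_open_source"],
        },
      },
    }),
  });
  // The test expects the extracted fields under response.body.data
  const { data } = await res.json();
  return data;
}
```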

View File

@@ -1,3 +1,4 @@
+import { ExtractorOptions } from './../lib/entities';
 import { Request, Response } from "express";
 import { WebScraperDataProvider } from "../scraper/WebScraper";
 import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
@@ -11,7 +12,8 @@ export async function scrapeHelper(
   req: Request,
   team_id: string,
   crawlerOptions: any,
-  pageOptions: any
+  pageOptions: any,
+  extractorOptions: any
 ): Promise<{
   success: boolean;
   error?: string;
@@ -35,6 +37,7 @@ export async function scrapeHelper(
       ...crawlerOptions,
     },
     pageOptions: pageOptions,
+    extractorOptions: extractorOptions
   });
   const docs = await a.getDocuments(false);
@@ -79,6 +82,9 @@ export async function scrapeController(req: Request, res: Response) {
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
   const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+  const extractorOptions = req.body.extractorOptions ?? {
+    mode: "markdown"
+  }
   const origin = req.body.origin ?? "api";
   try {
@@ -96,7 +102,8 @@ export async function scrapeController(req: Request, res: Response) {
       req,
       team_id,
       crawlerOptions,
-      pageOptions
+      pageOptions,
+      extractorOptions
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
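Note: the new argument is threaded from the controller through scrapeHelper into the data provider. A sketch of the call site in scrapeHelper follows; the enclosing call is presumably the provider's setOptions, and the mode/urls fields are assumptions since the hunk does not show them.

```ts
// Presumed call site in scrapeHelper (method name and the mode/urls fields
// are not visible in the hunk above; only the last three lines are shown
// there). extractorOptions falls back to markdown mode when omitted.
await a.setOptions({
  mode: "single_urls",
  urls: [req.body.url],
  crawlerOptions: { ...crawlerOptions },
  pageOptions: pageOptions,
  extractorOptions: extractorOptions, // e.g. { mode: "llm-extraction", extractionPrompt, extractionSchema }
});
const docs = await a.getDocuments(false);
```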

View File

@@ -0,0 +1,48 @@
+import OpenAI from 'openai'
+// import { LlamaModel } from 'node-llama-cpp'
+import { generateOpenAICompletions } from './models.js'
+import { Document, ExtractorOptions } from '../entities.js'
+
+// Generate a completion for each document using the configured LLM client.
+// Returns one pending promise per document.
+export function generateCompletions(
+  documents: Document[],
+  extractionOptions: ExtractorOptions
+): Promise<Document>[] {
+  const schema = extractionOptions.extractionSchema;
+  const prompt = extractionOptions.extractionPrompt;
+
+  const client = new OpenAI();
+
+  const completions = documents.map(async (document) => {
+    switch (client.constructor) {
+      case OpenAI:
+        return generateOpenAICompletions({
+          client,
+          document,
+          schema,
+          prompt,
+        });
+      // TODO: add other models
+      // case LlamaModel:
+      //   return generateLlamaCompletions(
+      //     client,
+      //     document,
+      //     schema,
+      //     prompt,
+      //     temperature
+      //   )
+      default:
+        throw new Error('Invalid client');
+    }
+  });
+
+  return completions;
+}
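Note: a usage sketch for the new helper, assuming documents produced earlier in the scrape pipeline. The filename in the import and the prompt/schema values are illustrative assumptions.

```ts
import { Document } from "../entities.js";
import { generateCompletions } from "./llmExtract.js"; // assumed filename for the new file above

// Usage sketch: fan out extraction over already-scraped documents and wait
// for all of them to resolve.
async function extractAll(documents: Document[]): Promise<Document[]> {
  const pending = generateCompletions(documents, {
    mode: "llm-extraction",
    extractionPrompt: "Extract the company's mission statement",
    extractionSchema: {
      type: "object",
      properties: { company_mission: { type: "string" } },
      required: ["company_mission"],
    },
  });
  return Promise.all(pending);
}
```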

View File

@@ -1,6 +1,8 @@
 import OpenAI from 'openai'
 import { z } from 'zod'
 import { ScraperLoadResult } from './types'
+import { Document, ExtractorOptions } from "../../lib/entities";
 // import {
 //   LlamaModel,
 //   LlamaJsonSchemaGrammar,
@@ -8,41 +10,45 @@ import { ScraperLoadResult } from './types'
 //   LlamaChatSession,
 //   GbnfJsonSchema,
 // } from 'node-llama-cpp'
-import { JsonSchema7Type } from 'zod-to-json-schema'
+// import { JsonSchema7Type } from 'zod-to-json-schema'
 export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
-  data: z.infer<T> | null
+  data: any | null
   url: string
 }
 const defaultPrompt =
   'You are a sophisticated web scraper. Extract the contents of the webpage'
-function prepareOpenAIPage(
-  page: ScraperLoadResult
+function prepareOpenAIDoc(
+  document: Document
 ): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
-  if (page.mode === 'image') {
-    return [
-      {
-        type: 'image_url',
-        image_url: { url: `data:image/jpeg;base64,${page.content}` },
-      },
-    ]
-  }
-  return [{ type: 'text', text: page.content }]
+  // Check that the markdown content exists in the document
+  if (!document.markdown) {
+    throw new Error("Markdown content is missing in the document.");
+  }
+  return [{ type: 'text', text: document.markdown }]
 }
-export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
-  client: OpenAI,
-  model: string = 'gpt-3.5-turbo',
-  page: ScraperLoadResult,
-  schema: JsonSchema7Type,
-  prompt: string = defaultPrompt,
-  temperature?: number
-): Promise<ScraperCompletionResult<T>> {
+export async function generateOpenAICompletions<T>({
+  client,
+  model = 'gpt-3.5-turbo',
+  document,
+  schema, // TODO - add zod dynamic type checking
+  prompt = defaultPrompt,
+  temperature
+}: {
+  client: OpenAI,
+  model?: string,
+  document: Document,
+  schema: any, // This should be replaced with a proper Zod schema type when available
+  prompt?: string,
+  temperature?: number
+}): Promise<Document> {
   const openai = client as OpenAI
-  const content = prepareOpenAIPage(page)
+  const content = prepareOpenAIDoc(document)
   const completion = await openai.chat.completions.create({
     model,
@@ -68,10 +74,16 @@ export async function generateOpenAICompletions
   })
   const c = completion.choices[0].message.tool_calls[0].function.arguments
+  // Extract the LLM extraction content from the completion response
+  const llmExtraction = c;
+  // Return the document with the LLM extraction content added
   return {
-    data: JSON.parse(c),
-    url: page.url,
-  }
+    ...document,
+    llm_extraction: llmExtraction
+  };
 }
 // export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
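Note: the body of the completions.create call is elided by this hunk. Since the result is read back through tool_calls, it presumably follows OpenAI's function-calling pattern; a sketch of what the elided request likely looks like is below. The tool name, description, and message layout are assumptions, not shown in the diff.

```ts
// Sketch of the elided request body. Everything here except `model`,
// `temperature`, `content` (from prepareOpenAIDoc), and `schema` is assumed.
const completion = await openai.chat.completions.create({
  model,
  temperature,
  messages: [
    { role: 'system', content: prompt },
    { role: 'user', content }, // markdown content parts from prepareOpenAIDoc
  ],
  tools: [
    {
      type: 'function',
      function: {
        name: 'extract_content', // hypothetical tool name
        description: 'Extracts structured content from the webpage',
        parameters: schema, // caller-supplied JSON Schema
      },
    },
  ],
  tool_choice: 'auto',
});
```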

View File

@@ -3,8 +3,3 @@ export type ScraperLoadOptions = {
   closeOnFinish?: boolean
 }
-export type ScraperLoadResult = {
-  url: string
-  content: string
-  mode: ScraperLoadOptions['mode']
-}

View File

@@ -16,6 +16,12 @@ export type PageOptions = {
 };
+export type ExtractorOptions = {
+  mode: "markdown" | "llm-extraction";
+  extractionPrompt?: string;
+  extractionSchema?: Record<string, any>;
+}
 export type SearchOptions = {
   limit?: number;
   tbs?: string;
@@ -38,6 +44,7 @@ export type WebScraperOptions = {
     replaceAllPathsWithAbsolutePaths?: boolean;
   };
   pageOptions?: PageOptions;
+  extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
 };
@@ -50,6 +57,7 @@ export class Document {
   url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
+  llm_extraction?: string;
   createdAt?: Date;
   updatedAt?: Date;
   type?: string;
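Note: a quick sketch of the new options type in use, with the field names this file defines. The schema is a plain JSON Schema object (Record<string, any>), not a Zod schema, and the values here are illustrative only.

```ts
// Constructing the options exactly as typed in the hunk above.
const extractorOptions: ExtractorOptions = {
  mode: "llm-extraction",
  extractionPrompt: "Find the company's mission",
  extractionSchema: {
    type: "object",
    properties: { company_mission: { type: "string" } },
    required: ["company_mission"],
  },
};
```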

View File

@@ -1,4 +1,4 @@
-import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
+import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities";
 import { Progress } from "../../lib/entities";
 import { scrapSingleUrl } from "./single_url";
 import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
@@ -7,6 +7,8 @@ import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/imageDescription";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
+import OpenAI from 'openai'
 export class WebScraperDataProvider {
   private urls: string[] = [""];
@@ -19,6 +21,7 @@ export class WebScraperDataProvider {
   private concurrentRequests: number = 20;
   private generateImgAltText: boolean = false;
   private pageOptions?: PageOptions;
+  private extractorOptions?: ExtractorOptions;
   private replaceAllPathsWithAbsolutePaths?: boolean = false;
   private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";
@@ -191,6 +194,22 @@ export class WebScraperDataProvider {
     documents = await this.getSitemapData(baseUrl, documents);
     documents = documents.concat(pdfDocuments);
+    if (this.extractorOptions?.mode === "llm-extraction") {
+      // const llm = new OpenAI()
+      // generateCompletions(
+      //   client = llm,
+      //   page = ,
+      //   schema =
+      // )
+    }
     await this.setCachedDocuments(documents);
     documents = this.removeChildLinks(documents);
     documents = documents.splice(0, this.limit);
@@ -376,6 +395,7 @@ export class WebScraperDataProvider {
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
     this.pageOptions = options.pageOptions ?? { onlyMainContent: false };
+    this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find the source of the issue, so adding this check
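Note: a sketch of how the commented-out block earlier in this file could be completed once it is wired to the new helper. The import path for the new LLM-extraction file is an assumption (the filename is not shown in this commit view), as are the await/fan-out details.

```ts
// Hypothetical completion of the TODO above (this sits inside the same
// document-processing method of WebScraperDataProvider). The module path
// "../../lib/LLM-extraction/llmExtract" is assumed.
import { generateCompletions } from "../../lib/LLM-extraction/llmExtract";

// ...inside the method, before caching the documents:
if (this.extractorOptions?.mode === "llm-extraction") {
  // Populate each document's llm_extraction field via the LLM.
  documents = await Promise.all(
    generateCompletions(documents, this.extractorOptions)
  );
}
```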