
Caleb: first test passing

Caleb Peffer 2024-04-28 17:38:20 -07:00
parent 06497729e2
commit 2ad7a58eb7
3 changed files with 338 additions and 331 deletions

View File

@@ -8,7 +8,7 @@ dotenv.config();
 const TEST_URL = "http://127.0.0.1:3002";

-describe.only("E2E Tests for API Routes", () => {
+describe("E2E Tests for API Routes", () => {
   beforeAll(() => {
     process.env.USE_DB_AUTHENTICATION = "true";
   });
@@ -252,7 +252,7 @@ const TEST_URL = "http://127.0.0.1:3002";
     }, 60000); // 60 seconds
   });

-  describe("POST /v0/scrape with LLM Extraction", () => {
+  describe.only("POST /v0/scrape with LLM Extraction", () => {
     it("should extract data using LLM extraction mode", async () => {
       const response = await request(TEST_URL)
         .post("/v0/scrape")
@@ -264,9 +264,9 @@ const TEST_URL = "http://127.0.0.1:3002";
            onlyMainContent: true
          },
          extractorOptions: {
-           extractorMode: "llm-extract",
-           extractor_prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
-           extractorSchema: {
+           mode: "llm-extraction",
+           extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+           extractionSchema: {
              type: "object",
              properties: {
                company_mission: {
@@ -284,14 +284,33 @@ const TEST_URL = "http://127.0.0.1:3002";
         }
       });

-      console.log("Response:", response.body);
-      expect(response.statusCode).toBe(200);
-      expect(response.body).toHaveProperty("data");
-      expect(response.body.data).toHaveProperty("company_mission");
-      expect(response.body.data).toHaveProperty("supports_sso");
-      expect(response.body.data).toHaveProperty("is_open_source");
-    });
+      // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
+      let llmExtraction = response.body.data.llm_extraction;
+
+      // Check if llm_extraction is a string and parse it if necessary
+      if (typeof llmExtraction === 'string') {
+        llmExtraction = JSON.parse(llmExtraction);
+      }
+      console.log('llm extraction', llmExtraction);
+      // Print the keys of the response.body for debugging purposes
+
+      // Check if the llm_extraction object has the required properties with correct types and values
+      expect(llmExtraction).toHaveProperty("company_mission");
+      expect(typeof llmExtraction.company_mission).toBe("string");
+      expect(llmExtraction).toHaveProperty("supports_sso");
+      expect(llmExtraction.supports_sso).toBe(true);
+      expect(typeof llmExtraction.supports_sso).toBe("boolean");
+      expect(llmExtraction).toHaveProperty("is_open_source");
+      expect(llmExtraction.is_open_source).toBe(false);
+      expect(typeof llmExtraction.is_open_source).toBe("boolean");
+    }, 60000); // 60 secs
   });

   describe("GET /is-production", () => {
@@ -301,4 +320,4 @@ const TEST_URL = "http://127.0.0.1:3002";
       expect(response.body).toHaveProperty("isProduction");
     });
   });
 });
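Read together, the new assertions pin down the response shape the endpoint is expected to produce. The following is a sketch of that shape inferred from the test itself, not a documented contract; note that `llm_extraction` may arrive either as a parsed object or as a JSON string, which is why the test guards with `JSON.parse`:

```ts
// Response shape implied by the test's assertions (inferred, not a
// documented contract). Field names come straight from the test.
interface LlmExtractionResult {
  company_mission: string;   // free-form text produced by the model
  supports_sso: boolean;     // the test expects true for the page it scrapes
  is_open_source: boolean;   // the test expects false for the page it scrapes
}

interface ScrapeResponseBody {
  data: {
    // May be returned as a JSON string, hence the typeof check
    // and JSON.parse in the test.
    llm_extraction: LlmExtractionResult | string;
  };
}
```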

View File

@@ -3,46 +3,39 @@ import OpenAI from 'openai'
 // import { LlamaModel } from 'node-llama-cpp'
 import { z } from 'zod'
 import { zodToJsonSchema } from 'zod-to-json-schema'
 import {
   ScraperCompletionResult,
   generateOpenAICompletions,
-} from './models.js'
-import { ExtractorOptions } from '../entities.js'
+} from './models'
+import { Document, ExtractorOptions } from '../entities'

 // Generate completion using OpenAI
-export function generateCompletions(
+export async function generateCompletions(
   documents: Document[],
   extractionOptions: ExtractorOptions
-): Promise<ScraperCompletionResult<T>>[] {
+): Promise<Document[]> {
   // const schema = zodToJsonSchema(options.schema)
   const schema = extractionOptions.extractionSchema;
   const prompt = extractionOptions.extractionPrompt;

-  const loader = documents.map(async (document, i) => {
-    switch (this.client.constructor) {
-      case true:
-        return generateOpenAICompletions<T>(
-          this.client as OpenAI,
-          schema,
-          options?.prompt,
-          options?.temperature
-        )
-      //TODO add other models
-      // case LlamaModel:
-      //   return generateLlamaCompletions<T>(
-      //     this.client,
-      //     await page,
-      //     schema,
-      //     options?.prompt,
-      //     options?.temperature
-      //   )
+  const switchVariable = "openAI" // Placeholder, want to think more about how we abstract the model provider
+
+  const completions = await Promise.all(documents.map(async (document: Document) => {
+    switch (switchVariable) {
+      case "openAI":
+        const llm = new OpenAI();
+        return await generateOpenAICompletions({
+          client: llm,
+          document: document,
+          schema: schema,
+          prompt: prompt
+        });
       default:
-        throw new Error('Invalid client')
+        throw new Error('Invalid client');
     }
-  })
-  return loader
+  }));
+
+  return completions;
 }
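`generateOpenAICompletions` lives in `./models` and is not part of this diff; only its new call signature is visible above. The following is a minimal sketch of what that call site implies, assuming the implementation uses OpenAI function calling to force output matching the caller's JSON schema and stores the result on the document. The model name, the `extract_content` function name, and the `document.content` / `document.llm_extraction` fields are assumptions, not confirmed by the diff:

```ts
import OpenAI from 'openai';
import { Document } from '../entities';

// Sketch of the function the new call site expects from './models'.
// The option names mirror the call site; everything else is assumed.
export async function generateOpenAICompletions({
  client,
  document,
  schema,
  prompt,
}: {
  client: OpenAI;
  document: Document;
  schema: any;    // JSON Schema passed through from extractorOptions.extractionSchema
  prompt: string; // extractorOptions.extractionPrompt
}): Promise<Document> {
  const completion = await client.chat.completions.create({
    model: 'gpt-4-turbo', // assumed; the diff does not show which model is used
    messages: [
      { role: 'system', content: prompt },
      // Assumes the scraped page text lives on document.content.
      { role: 'user', content: document.content ?? '' },
    ],
    tools: [{
      type: 'function',
      function: {
        name: 'extract_content',
        description: 'Extract structured data from the page content',
        parameters: schema, // constrains the tool-call arguments to the schema
      },
    }],
    tool_choice: { type: 'function', function: { name: 'extract_content' } },
  });

  // The structured output arrives as the tool call's JSON arguments.
  const args = completion.choices[0].message.tool_calls?.[0]?.function.arguments;
  // Assumes Document declares an llm_extraction field (the e2e test reads
  // it from response.body.data.llm_extraction).
  document.llm_extraction = args ? JSON.parse(args) : undefined;
  return document;
}
```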

View File

@@ -8,6 +8,7 @@ import { getImageDescription } from "./utils/imageDescription";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
 import OpenAI from 'openai'
+import { generateCompletions } from "../../lib/LLM-extraction";

 export class WebScraperDataProvider {
@@ -194,20 +195,14 @@ export class WebScraperDataProvider {
     documents = await this.getSitemapData(baseUrl, documents);
     documents = documents.concat(pdfDocuments);

-    console.log("extraction mode ", this.extractorOptions.mode)
     if(this.extractorOptions.mode === "llm-extraction") {
-      // const llm = new OpenAI()
-      // generateCompletions(
-      //   client=llm,
-      //   page =,
-      //   schema=
-      // )
+      const llm = new OpenAI()
+      documents = await generateCompletions(
+        documents,
+        this.extractorOptions
+      )
     }
     await this.setCachedDocuments(documents);
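With that wiring in place, LLM extraction runs inline in the provider's document pipeline, between PDF concatenation and caching. A hedged sketch of driving it end to end; the `setOptions`/`getDocuments` entry points, the `"single_urls"` mode, and the exact option fields are inferred from how the provider is used elsewhere, not shown in this diff:

```ts
import { WebScraperDataProvider } from './scraper/WebScraper';

// Hypothetical end-to-end usage; method names and option fields are
// assumptions, not part of this commit.
async function scrapeWithLlmExtraction(url: string) {
  const provider = new WebScraperDataProvider();
  await provider.setOptions({
    mode: 'single_urls',
    urls: [url],
    extractorOptions: {
      mode: 'llm-extraction',
      extractionPrompt:
        "Find the company's mission and whether it supports SSO",
      extractionSchema: {
        type: 'object',
        properties: {
          company_mission: { type: 'string' },
          supports_sso: { type: 'boolean' },
        },
        required: ['company_mission', 'supports_sso'],
      },
    },
  });

  // getDocuments runs the scrape and, with the change above, the
  // llm-extraction pass before the documents are cached and returned.
  const [document] = await provider.getDocuments(false);
  return document.llm_extraction;
}
```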