
Caleb: first test passing

Caleb Peffer 2024-04-28 17:38:20 -07:00
parent 06497729e2
commit 2ad7a58eb7
3 changed files with 338 additions and 331 deletions

View File

@@ -8,7 +8,7 @@ dotenv.config();
 const TEST_URL = "http://127.0.0.1:3002";

-describe.only("E2E Tests for API Routes", () => {
+describe("E2E Tests for API Routes", () => {
   beforeAll(() => {
     process.env.USE_DB_AUTHENTICATION = "true";
   });
@@ -252,7 +252,7 @@ const TEST_URL = "http://127.0.0.1:3002";
     }, 60000); // 60 seconds
   });

-  describe("POST /v0/scrape with LLM Extraction", () => {
+  describe.only("POST /v0/scrape with LLM Extraction", () => {
     it("should extract data using LLM extraction mode", async () => {
       const response = await request(TEST_URL)
         .post("/v0/scrape")
@@ -264,9 +264,9 @@ const TEST_URL = "http://127.0.0.1:3002";
            onlyMainContent: true
          },
          extractorOptions: {
-           extractorMode: "llm-extract",
-           extractor_prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
-           extractorSchema: {
+           mode: "llm-extraction",
+           extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+           extractionSchema: {
              type: "object",
              properties: {
                company_mission: {
@@ -284,14 +284,33 @@ const TEST_URL = "http://127.0.0.1:3002";
         }
       });

-      console.log("Response:", response.body);
-      expect(response.statusCode).toBe(200);
-      expect(response.body).toHaveProperty("data");
-      expect(response.body.data).toHaveProperty("company_mission");
-      expect(response.body.data).toHaveProperty("supports_sso");
-      expect(response.body.data).toHaveProperty("is_open_source");
-    });
+      // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
+      let llmExtraction = response.body.data.llm_extraction;
+
+      // Check if llm_extraction is a string and parse it if necessary
+      if (typeof llmExtraction === 'string') {
+        llmExtraction = JSON.parse(llmExtraction);
+      }
+      console.log('llm extraction', llmExtraction);
+      // Print the keys of the response.body for debugging purposes
+
+      // Check if the llm_extraction object has the required properties with correct types and values
+      expect(llmExtraction).toHaveProperty("company_mission");
+      expect(typeof llmExtraction.company_mission).toBe("string");
+      expect(llmExtraction).toHaveProperty("supports_sso");
+      expect(llmExtraction.supports_sso).toBe(true);
+      expect(typeof llmExtraction.supports_sso).toBe("boolean");
+      expect(llmExtraction).toHaveProperty("is_open_source");
+      expect(llmExtraction.is_open_source).toBe(false);
+      expect(typeof llmExtraction.is_open_source).toBe("boolean");
+    }, 60000); // 60 secs
   });

   describe("GET /is-production", () => {
@@ -301,4 +320,4 @@ const TEST_URL = "http://127.0.0.1:3002";
       expect(response.body).toHaveProperty("isProduction");
     });
   });
 });
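Read together, the new assertions pin down the response shape the endpoint is expected to produce. The following is a sketch of that shape inferred from the test itself, not a documented contract; note that `llm_extraction` may arrive either as a parsed object or as a JSON string, which is why the test guards with `JSON.parse`:

```ts
// Response shape implied by the test's assertions (inferred, not a
// documented contract). Field names come straight from the test.
interface LlmExtractionResult {
  company_mission: string;   // free-form text produced by the model
  supports_sso: boolean;     // the test expects true for the page it scrapes
  is_open_source: boolean;   // the test expects false for the page it scrapes
}

interface ScrapeResponseBody {
  data: {
    // May be returned as a JSON string, hence the typeof check
    // and JSON.parse in the test.
    llm_extraction: LlmExtractionResult | string;
  };
}
```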

View File

@@ -3,46 +3,39 @@ import OpenAI from 'openai'
 // import { LlamaModel } from 'node-llama-cpp'
 import { z } from 'zod'
 import { zodToJsonSchema } from 'zod-to-json-schema'
 import {
   ScraperCompletionResult,
   generateOpenAICompletions,
-} from './models.js'
-import { ExtractorOptions } from '../entities.js'
+} from './models'
+import { Document, ExtractorOptions } from '../entities'

 // Generate completion using OpenAI
-export function generateCompletions(
+export async function generateCompletions(
   documents: Document[],
   extractionOptions: ExtractorOptions
-): Promise<ScraperCompletionResult<T>>[] {
+): Promise<Document[]> {
   // const schema = zodToJsonSchema(options.schema)
   const schema = extractionOptions.extractionSchema;
   const prompt = extractionOptions.extractionPrompt;

-  const loader = documents.map(async (document, i) => {
-    switch (this.client.constructor) {
-      case true:
-        return generateOpenAICompletions<T>(
-          this.client as OpenAI,
-          schema,
-          options?.prompt,
-          options?.temperature
-        )
-      //TODO add other models
-      // case LlamaModel:
-      //   return generateLlamaCompletions<T>(
-      //     this.client,
-      //     await page,
-      //     schema,
-      //     options?.prompt,
-      //     options?.temperature
-      //   )
+  const switchVariable = "openAI" // Placeholder, want to think more about how we abstract the model provider
+
+  const completions = await Promise.all(documents.map(async (document: Document) => {
+    switch (switchVariable) {
+      case "openAI":
+        const llm = new OpenAI();
+        return await generateOpenAICompletions({
+          client: llm,
+          document: document,
+          schema: schema,
+          prompt: prompt
+        });
       default:
-        throw new Error('Invalid client')
+        throw new Error('Invalid client');
     }
-  })
-  return loader
+  }));
+
+  return completions;
 }
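`generateOpenAICompletions` lives in `./models` and is not part of this diff; only its new call signature is visible above. The following is a minimal sketch of what that call site implies, assuming the implementation uses OpenAI function calling to force output matching the caller's JSON schema and stores the result on the document. The model name, the `extract_content` function name, and the `document.content` / `document.llm_extraction` fields are assumptions, not confirmed by the diff:

```ts
import OpenAI from 'openai';
import { Document } from '../entities';

// Sketch of the function the new call site expects from './models'.
// The option names mirror the call site; everything else is assumed.
export async function generateOpenAICompletions({
  client,
  document,
  schema,
  prompt,
}: {
  client: OpenAI;
  document: Document;
  schema: any;    // JSON Schema passed through from extractorOptions.extractionSchema
  prompt: string; // extractorOptions.extractionPrompt
}): Promise<Document> {
  const completion = await client.chat.completions.create({
    model: 'gpt-4-turbo', // assumed; the diff does not show which model is used
    messages: [
      { role: 'system', content: prompt },
      // Assumes the scraped page text lives on document.content.
      { role: 'user', content: document.content ?? '' },
    ],
    tools: [{
      type: 'function',
      function: {
        name: 'extract_content',
        description: 'Extract structured data from the page content',
        parameters: schema, // constrains the tool-call arguments to the schema
      },
    }],
    tool_choice: { type: 'function', function: { name: 'extract_content' } },
  });

  // The structured output arrives as the tool call's JSON arguments.
  const args = completion.choices[0].message.tool_calls?.[0]?.function.arguments;
  // Assumes Document declares an llm_extraction field (the e2e test reads
  // it from response.body.data.llm_extraction).
  document.llm_extraction = args ? JSON.parse(args) : undefined;
  return document;
}
```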

View File

@@ -8,6 +8,7 @@ import { getImageDescription } from "./utils/imageDescription";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
 import OpenAI from 'openai'
+import { generateCompletions } from "../../lib/LLM-extraction";

 export class WebScraperDataProvider {
@@ -194,20 +195,14 @@ export class WebScraperDataProvider {
     documents = await this.getSitemapData(baseUrl, documents);
     documents = documents.concat(pdfDocuments);

-    console.log("extraction mode ", this.extractorOptions.mode)
     if(this.extractorOptions.mode === "llm-extraction") {
-      // const llm = new OpenAI()
-      // generateCompletions(
-      //   client=llm,
-      //   page =,
-      //   schema=
-      // )
+      const llm = new OpenAI()
+      documents = await generateCompletions(
+        documents,
+        this.extractorOptions
+      )
     }
     await this.setCachedDocuments(documents);
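With that wiring in place, LLM extraction runs inline in the provider's document pipeline, between PDF concatenation and caching. A hedged sketch of driving it end to end; the `setOptions`/`getDocuments` entry points, the `"single_urls"` mode, and the exact option fields are inferred from how the provider is used elsewhere, not shown in this diff:

```ts
import { WebScraperDataProvider } from './scraper/WebScraper';

// Hypothetical end-to-end usage; method names and option fields are
// assumptions, not part of this commit.
async function scrapeWithLlmExtraction(url: string) {
  const provider = new WebScraperDataProvider();
  await provider.setOptions({
    mode: 'single_urls',
    urls: [url],
    extractorOptions: {
      mode: 'llm-extraction',
      extractionPrompt:
        "Find the company's mission and whether it supports SSO",
      extractionSchema: {
        type: 'object',
        properties: {
          company_mission: { type: 'string' },
          supports_sso: { type: 'boolean' },
        },
        required: ['company_mission', 'supports_sso'],
      },
    },
  });

  // getDocuments runs the scrape and, with the change above, the
  // llm-extraction pass before the documents are cached and returned.
  const [document] = await provider.getDocuments(false);
  return document.llm_extraction;
}
```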