
Caleb: got it to a testable state I believe

Caleb Peffer 2024-04-28 15:52:09 -07:00
parent 6ee1f2d3bc
commit 06497729e2
7 changed files with 163 additions and 31 deletions

View File

@@ -8,7 +8,7 @@ dotenv.config();
 const TEST_URL = "http://127.0.0.1:3002";
-describe("E2E Tests for API Routes", () => {
+describe.only("E2E Tests for API Routes", () => {
   beforeAll(() => {
     process.env.USE_DB_AUTHENTICATION = "true";
   });
@@ -252,6 +252,48 @@ const TEST_URL = "http://127.0.0.1:3002";
     }, 60000); // 60 seconds
   });
+  describe("POST /v0/scrape with LLM Extraction", () => {
+    it("should extract data using LLM extraction mode", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai",
+          pageOptions: {
+            onlyMainContent: true
+          },
+          extractorOptions: {
+            extractorMode: "llm-extract",
+            extractor_prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+            extractorSchema: {
+              type: "object",
+              properties: {
+                company_mission: {
+                  type: "string"
+                },
+                supports_sso: {
+                  type: "boolean"
+                },
+                is_open_source: {
+                  type: "boolean"
+                }
+              },
+              required: ["company_mission", "supports_sso", "is_open_source"]
+            }
+          }
+        });
+      console.log("Response:", response.body);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("company_mission");
+      expect(response.body.data).toHaveProperty("supports_sso");
+      expect(response.body.data).toHaveProperty("is_open_source");
+    });
+  });
 describe("GET /is-production", () => {
   it("should return the production status", async () => {
     const response = await request(TEST_URL).get("/is-production");
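Note: for context, an equivalent standalone client call for the new route might look like the sketch below. It assumes exactly the request shape the test exercises; the URL and API key are placeholders. The test sends extractorMode/extractor_prompt/extractorSchema, while the ExtractorOptions type added later in this commit uses mode/extractionPrompt/extractionSchema, so one side will need reconciling.

```ts
// Hypothetical standalone client for the new llm-extract mode; the request
// body mirrors the test above. The URL and TEST_API_KEY are placeholders.
async function scrapeWithLLMExtraction() {
  const res = await fetch("http://127.0.0.1:3002/v0/scrape", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      url: "https://mendable.ai",
      pageOptions: { onlyMainContent: true },
      extractorOptions: {
        extractorMode: "llm-extract",
        extractor_prompt: "Find the company's mission and whether it supports SSO and is open source",
        extractorSchema: {
          type: "object",
          properties: {
            company_mission: { type: "string" },
            supports_sso: { type: "boolean" },
            is_open_source: { type: "boolean" },
          },
          required: ["company_mission", "supports_sso", "is_open_source"],
        },
      },
    }),
  });
  // The test expects the extracted fields under response.body.data
  const { data } = await res.json();
  return data;
}
```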

View File

@@ -1,3 +1,4 @@
+import { ExtractorOptions } from './../lib/entities';
 import { Request, Response } from "express";
 import { WebScraperDataProvider } from "../scraper/WebScraper";
 import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
@@ -11,7 +12,8 @@ export async function scrapeHelper(
   req: Request,
   team_id: string,
   crawlerOptions: any,
-  pageOptions: any
+  pageOptions: any,
+  extractorOptions: any
 ): Promise<{
   success: boolean;
   error?: string;
@@ -35,6 +37,7 @@ export async function scrapeHelper(
       ...crawlerOptions,
     },
     pageOptions: pageOptions,
+    extractorOptions: extractorOptions
   });
   const docs = await a.getDocuments(false);
@@ -79,6 +82,9 @@ export async function scrapeController(req: Request, res: Response) {
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
   const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+  const extractorOptions = req.body.extractorOptions ?? {
+    mode: "markdown"
+  }
   const origin = req.body.origin ?? "api";
   try {
@@ -96,7 +102,8 @@ export async function scrapeController(req: Request, res: Response) {
       req,
       team_id,
       crawlerOptions,
-      pageOptions
+      pageOptions,
+      extractorOptions
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
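Note: the new argument is threaded from the controller through scrapeHelper into the data provider. A sketch of the call site in scrapeHelper follows; the enclosing call is presumably the provider's setOptions, and the mode/urls fields are assumptions since the hunk does not show them.

```ts
// Presumed call site in scrapeHelper (method name and the mode/urls fields
// are not visible in the hunk above; only the last three lines are shown
// there). extractorOptions falls back to markdown mode when omitted.
await a.setOptions({
  mode: "single_urls",
  urls: [req.body.url],
  crawlerOptions: { ...crawlerOptions },
  pageOptions: pageOptions,
  extractorOptions: extractorOptions, // e.g. { mode: "llm-extraction", extractionPrompt, extractionSchema }
});
const docs = await a.getDocuments(false);
```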

View File

@@ -0,0 +1,48 @@
+import OpenAI from 'openai'
+// import { LlamaModel } from 'node-llama-cpp'
+import { generateOpenAICompletions } from './models.js'
+import { Document, ExtractorOptions } from '../entities.js'
+
+// Generate a completion for each document using the configured LLM client.
+// Returns one pending promise per document.
+export function generateCompletions(
+  documents: Document[],
+  extractionOptions: ExtractorOptions
+): Promise<Document>[] {
+  const schema = extractionOptions.extractionSchema;
+  const prompt = extractionOptions.extractionPrompt;
+
+  const client = new OpenAI();
+
+  const completions = documents.map(async (document) => {
+    switch (client.constructor) {
+      case OpenAI:
+        return generateOpenAICompletions({
+          client,
+          document,
+          schema,
+          prompt,
+        });
+      // TODO: add other models
+      // case LlamaModel:
+      //   return generateLlamaCompletions(
+      //     client,
+      //     document,
+      //     schema,
+      //     prompt,
+      //     temperature
+      //   )
+      default:
+        throw new Error('Invalid client');
+    }
+  });
+
+  return completions;
+}
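Note: a usage sketch for the new helper, assuming documents produced earlier in the scrape pipeline. The filename in the import and the prompt/schema values are illustrative assumptions.

```ts
import { Document } from "../entities.js";
import { generateCompletions } from "./llmExtract.js"; // assumed filename for the new file above

// Usage sketch: fan out extraction over already-scraped documents and wait
// for all of them to resolve.
async function extractAll(documents: Document[]): Promise<Document[]> {
  const pending = generateCompletions(documents, {
    mode: "llm-extraction",
    extractionPrompt: "Extract the company's mission statement",
    extractionSchema: {
      type: "object",
      properties: { company_mission: { type: "string" } },
      required: ["company_mission"],
    },
  });
  return Promise.all(pending);
}
```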

View File

@@ -1,6 +1,8 @@
 import OpenAI from 'openai'
 import { z } from 'zod'
 import { ScraperLoadResult } from './types'
+import { Document, ExtractorOptions } from "../../lib/entities";
 // import {
 //   LlamaModel,
 //   LlamaJsonSchemaGrammar,
@@ -8,41 +10,45 @@ import { ScraperLoadResult } from './types'
 //   LlamaChatSession,
 //   GbnfJsonSchema,
 // } from 'node-llama-cpp'
-import { JsonSchema7Type } from 'zod-to-json-schema'
+// import { JsonSchema7Type } from 'zod-to-json-schema'
 export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
-  data: z.infer<T> | null
+  data: any | null
   url: string
 }
 const defaultPrompt =
   'You are a sophisticated web scraper. Extract the contents of the webpage'
-function prepareOpenAIPage(
-  page: ScraperLoadResult
+function prepareOpenAIDoc(
+  document: Document
 ): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
-  if (page.mode === 'image') {
-    return [
-      {
-        type: 'image_url',
-        image_url: { url: `data:image/jpeg;base64,${page.content}` },
-      },
-    ]
-  }
-  return [{ type: 'text', text: page.content }]
+  // Check that the markdown content exists in the document
+  if (!document.markdown) {
+    throw new Error("Markdown content is missing in the document.");
+  }
+  return [{ type: 'text', text: document.markdown }]
 }
-export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
-  client: OpenAI,
-  model: string = 'gpt-3.5-turbo',
-  page: ScraperLoadResult,
-  schema: JsonSchema7Type,
-  prompt: string = defaultPrompt,
-  temperature?: number
-): Promise<ScraperCompletionResult<T>> {
+export async function generateOpenAICompletions<T>({
+  client,
+  model = 'gpt-3.5-turbo',
+  document,
+  schema, // TODO - add zod dynamic type checking
+  prompt = defaultPrompt,
+  temperature
+}: {
+  client: OpenAI,
+  model?: string,
+  document: Document,
+  schema: any, // This should be replaced with a proper Zod schema type when available
+  prompt?: string,
+  temperature?: number
+}): Promise<Document> {
   const openai = client as OpenAI
-  const content = prepareOpenAIPage(page)
+  const content = prepareOpenAIDoc(document)
   const completion = await openai.chat.completions.create({
     model,
@@ -68,10 +74,16 @@ export async function generateOpenAICompletions
   })
   const c = completion.choices[0].message.tool_calls[0].function.arguments
+  // Extract the LLM extraction content from the completion response
+  const llmExtraction = c;
+  // Return the document with the LLM extraction content added
   return {
-    data: JSON.parse(c),
-    url: page.url,
-  }
+    ...document,
+    llm_extraction: llmExtraction
+  };
 }
 // export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
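Note: the body of the completions.create call is elided by this hunk. Since the result is read back through tool_calls, it presumably follows OpenAI's function-calling pattern; a sketch of what the elided request likely looks like is below. The tool name, description, and message layout are assumptions, not shown in the diff.

```ts
// Sketch of the elided request body. Everything here except `model`,
// `temperature`, `content` (from prepareOpenAIDoc), and `schema` is assumed.
const completion = await openai.chat.completions.create({
  model,
  temperature,
  messages: [
    { role: 'system', content: prompt },
    { role: 'user', content }, // markdown content parts from prepareOpenAIDoc
  ],
  tools: [
    {
      type: 'function',
      function: {
        name: 'extract_content', // hypothetical tool name
        description: 'Extracts structured content from the webpage',
        parameters: schema, // caller-supplied JSON Schema
      },
    },
  ],
  tool_choice: 'auto',
});
```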

View File

@@ -3,8 +3,3 @@ export type ScraperLoadOptions = {
   closeOnFinish?: boolean
 }
-export type ScraperLoadResult = {
-  url: string
-  content: string
-  mode: ScraperLoadOptions['mode']
-}

View File

@@ -16,6 +16,12 @@ export type PageOptions = {
 };
+export type ExtractorOptions = {
+  mode: "markdown" | "llm-extraction";
+  extractionPrompt?: string;
+  extractionSchema?: Record<string, any>;
+}
 export type SearchOptions = {
   limit?: number;
   tbs?: string;
@@ -38,6 +44,7 @@ export type WebScraperOptions = {
     replaceAllPathsWithAbsolutePaths?: boolean;
   };
   pageOptions?: PageOptions;
+  extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
 };
@@ -50,6 +57,7 @@ export class Document {
   url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
+  llm_extraction?: string;
   createdAt?: Date;
   updatedAt?: Date;
   type?: string;
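Note: a quick sketch of the new options type in use, with the field names this file defines. The schema is a plain JSON Schema object (Record<string, any>), not a Zod schema, and the values here are illustrative only.

```ts
// Constructing the options exactly as typed in the hunk above.
const extractorOptions: ExtractorOptions = {
  mode: "llm-extraction",
  extractionPrompt: "Find the company's mission",
  extractionSchema: {
    type: "object",
    properties: { company_mission: { type: "string" } },
    required: ["company_mission"],
  },
};
```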

View File

@@ -1,4 +1,4 @@
-import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
+import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities";
 import { Progress } from "../../lib/entities";
 import { scrapSingleUrl } from "./single_url";
 import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
@@ -7,6 +7,8 @@ import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/imageDescription";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
+import OpenAI from 'openai'
 export class WebScraperDataProvider {
   private urls: string[] = [""];
@@ -19,6 +21,7 @@ export class WebScraperDataProvider {
   private concurrentRequests: number = 20;
   private generateImgAltText: boolean = false;
   private pageOptions?: PageOptions;
+  private extractorOptions?: ExtractorOptions;
   private replaceAllPathsWithAbsolutePaths?: boolean = false;
   private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";
@@ -191,6 +194,22 @@ export class WebScraperDataProvider {
     documents = await this.getSitemapData(baseUrl, documents);
     documents = documents.concat(pdfDocuments);
+    if (this.extractorOptions?.mode === "llm-extraction") {
+      // const llm = new OpenAI()
+      // generateCompletions(
+      //   client = llm,
+      //   page = ,
+      //   schema =
+      // )
+    }
     await this.setCachedDocuments(documents);
     documents = this.removeChildLinks(documents);
     documents = documents.splice(0, this.limit);
@@ -376,6 +395,7 @@ export class WebScraperDataProvider {
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
     this.pageOptions = options.pageOptions ?? { onlyMainContent: false };
+    this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find the source of the issue, so adding this check
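Note: a sketch of how the commented-out block earlier in this file could be completed once it is wired to the new helper. The import path for the new LLM-extraction file is an assumption (the filename is not shown in this commit view), as are the await/fan-out details.

```ts
// Hypothetical completion of the TODO above (this sits inside the same
// document-processing method of WebScraperDataProvider). The module path
// "../../lib/LLM-extraction/llmExtract" is assumed.
import { generateCompletions } from "../../lib/LLM-extraction/llmExtract";

// ...inside the method, before caching the documents:
if (this.extractorOptions?.mode === "llm-extraction") {
  // Populate each document's llm_extraction field via the LLM.
  documents = await Promise.all(
    generateCompletions(documents, this.extractorOptions)
  );
}
```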