Caleb: trying to get logging working

2024-04-30 09:20:15 -07:00 · 2024-04-30 09:20:15 -07:00 · 3ca9e5153f
commit 3ca9e5153f
parent 79cd7d2ebc
12 changed files with 118 additions and 22 deletions
--- a/apps/api/package.json
+++ b/apps/api/package.json
@ -46,7 +46,7 @@
    "@bull-board/api": "^5.14.2",
    "@bull-board/express": "^5.8.0",
    "@devil7softwares/pos": "^1.0.2",
-    "@dqbd/tiktoken": "^1.0.7",
+    "@dqbd/tiktoken": "^1.0.13",
    "@logtail/node": "^0.4.12",
    "@nangohq/node": "^0.36.33",
    "@sentry/node": "^7.48.0",
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
@ -21,7 +21,7 @@ dependencies:
    specifier: ^1.0.2
    version: 1.0.2
  '@dqbd/tiktoken':
-    specifier: ^1.0.7
+    specifier: ^1.0.13
    version: 1.0.13
  '@logtail/node':
    specifier: ^0.4.12
--- a/apps/api/src/tests/e2e_withAuth/index.test.ts
+++ b/apps/api/src/tests/e2e_withAuth/index.test.ts
@ -252,7 +252,7 @@ describe("E2E Tests for API Routes", () => {
    }, 60000); // 60 seconds
  });

-  describe.only("POST /v0/scrape with LLM Extraction", () => {
+  describe("POST /v0/scrape with LLM Extraction", () => {
    it("should extract data using LLM extraction mode", async () => {
      const response = await request(TEST_URL)
        .post("/v0/scrape")
@ -293,16 +293,6 @@ describe("E2E Tests for API Routes", () => {
      // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
      let llmExtraction = response.body.data.llm_extraction;

-      
-      // // Check if llm_extraction is a string and parse it if necessary
-      // if (typeof llmExtraction === 'string') {
-      //   llmExtraction = JSON.parse(llmExtraction);
-      // }
-
-      // Print the keys of the response.body for debugging purposes
-
-
-  
      // Check if the llm_extraction object has the required properties with correct types and values
      expect(llmExtraction).toHaveProperty("company_mission");
      expect(typeof llmExtraction.company_mission).toBe("string");
@ -315,6 +305,68 @@ describe("E2E Tests for API Routes", () => {
    }, 60000); // 60 secs
  });

+  // E2E: llm-extraction against a large ranking table. Not focused with `.only`,
+  // so the remaining suites in this file keep running.
+  describe("POST /v0/scrape for Top 100 Companies", () => {
+    // Single source of truth for the prompt and the length assertion below.
+    const expectedCompanyCount = 20;
+
+    it(`should extract data for the top ${expectedCompanyCount} companies`, async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://companiesmarketcap.com/",
+          pageOptions: {
+            onlyMainContent: true
+          },
+          extractorOptions: {
+            mode: "llm-extraction",
+            extractionPrompt: `Extract the name, market cap, price, and today's change for the top ${expectedCompanyCount} companies listed on the page.`,
+            extractionSchema: {
+              type: "object",
+              properties: {
+                companies: {
+                  type: "array",
+                  items: {
+                    type: "object",
+                    properties: {
+                      rank: { type: "number" },
+                      name: { type: "string" },
+                      marketCap: { type: "string" },
+                      price: { type: "string" },
+                      todayChange: { type: "string" }
+                    },
+                    required: ["rank", "name", "marketCap", "price", "todayChange"]
+                  }
+                }
+              },
+              required: ["companies"]
+            }
+          }
+        });
+
+      // Assert the status first so a failed request reports a clear assertion
+      // error instead of a TypeError from dereferencing a missing body.
+      expect(response.status).toBe(200);
+      const companies = response.body.data.llm_extraction.companies;
+      expect(Array.isArray(companies)).toBe(true);
+      expect(companies.length).toBe(expectedCompanyCount);
+
+      // Spot-check the first company against the fields the schema requires.
+      const firstCompany = companies[0];
+      expect(typeof firstCompany.rank).toBe("number");
+      expect(typeof firstCompany.name).toBe("string");
+      expect(typeof firstCompany.marketCap).toBe("string");
+      expect(typeof firstCompany.price).toBe("string");
+      expect(typeof firstCompany.todayChange).toBe("string");
+    }, 120000); // 120 secs
+  });
+
+  
+
+
  describe("GET /is-production", () => {
    it("should return the production status", async () => {
      const response = await request(TEST_URL).get("/is-production");
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@ -8,6 +8,7 @@ import { logJob } from "../services/logging/log_job";
 import { Document } from "../lib/entities";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
 import Ajv from 'ajv';
+import { numTokensFromString } from '../lib/LLM-extraction/helpers';

 export async function scrapeHelper(
  req: Request,
@ -51,9 +52,18 @@ export async function scrapeHelper(
    return { success: true, error: "No page found", returnCode: 200 };
  }

+
+  let creditsToBeBilled =  filteredDocs.length;
+  const creditsPerLLMExtract = 4;
+
+  if (extractorOptions.mode === "llm-extraction"){
+    creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length)
+  }
+  // console.log("credits to be billed, ", creditsToBeBilled);
+
  const billingResult = await billTeam(
    team_id,
-    filteredDocs.length
+    creditsToBeBilled
  );
  if (!billingResult.success) {
    return {
@ -109,6 +119,8 @@ export async function scrapeController(req: Request, res: Response) {
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
+    const numTokens = numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
+
    logJob({
      success: result.success,
      message: result.error,
@ -120,7 +132,9 @@ export async function scrapeController(req: Request, res: Response) {
      url: req.body.url,
      crawlerOptions: crawlerOptions,
      pageOptions: pageOptions,
-      origin: origin,
+      origin: origin, 
+      extractor_options: extractorOptions,
+      num_tokens: numTokens
    });
    return res.status(result.returnCode).json(result);
  } catch (error) {
--- a/apps/api/src/lib/LLM-extraction/helpers.ts
+++ b/apps/api/src/lib/LLM-extraction/helpers.ts
@ -0,0 +1,18 @@
+
+
+import { encoding_for_model } from "@dqbd/tiktoken";
+import { TiktokenModel } from "@dqbd/tiktoken";
+
+/**
+ * Counts how many tokens `message` encodes to under the tiktoken encoding
+ * that `encoding_for_model` selects for `model` (e.g. "gpt-3.5-turbo").
+ */
+export function numTokensFromString(message: string, model: string): number {
+    const encoder = encoding_for_model(model as TiktokenModel);
+    try {
+        return encoder.encode(message).length;
+    } finally {
+        // Release the encoder even when encode() throws, so it is never leaked.
+        encoder.free();
+    }
+}
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@ -38,6 +38,7 @@ export async function generateCompletions(
                // Validate the JSON output against the schema using AJV
                const validate = ajv.compile(schema);
                if (!validate(completionResult.llm_extraction)) {
+                    //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
                    throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
                }

--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@ -1,6 +1,7 @@
 import OpenAI from 'openai'
 import { z } from 'zod'
 import { Document, ExtractorOptions } from "../../lib/entities";
+import { numTokensFromString } from './helpers';

 // import {
 //   LlamaModel,
@ -17,7 +18,7 @@ export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
 }

 const defaultPrompt =
-  'You are a satistified web scraper. Extract the contents of the webpage'
+  'You are a professional web scraper. Extract the contents of the webpage'

 function prepareOpenAIDoc(
  document: Document
@ -28,12 +29,12 @@ function prepareOpenAIDoc(
    throw new Error("Markdown content is missing in the document.");
  }

-  return [{ type: 'text', text: document.markdown }]
+  return [{ type: 'text', text: document.html}]
 }

 export async function generateOpenAICompletions({
  client,
-  model = 'gpt-3.5-turbo',
+  model = 'gpt-4-turbo',
  document,
  schema, //TODO - add zod dynamic type checking
  prompt = defaultPrompt,
@ -49,6 +50,7 @@ export async function generateOpenAICompletions({
  const openai = client as OpenAI
  const content = prepareOpenAIDoc(document)

+
  const completion = await openai.chat.completions.create({
    model,
    messages: [
@ -77,6 +79,8 @@ export async function generateOpenAICompletions({
  // Extract the LLM extraction content from the completion response
  const llmExtraction = JSON.parse(c);

+//   console.log("llm extraction: ", llmExtraction);
+

  // Return the document with the LLM extraction content added
  return {
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@ -57,6 +57,7 @@ export class Document {
  url?: string; // Used only in /search for now
  content: string;
  markdown?: string;
+  html?: string;
  llm_extraction?: Record<string, any>;
  createdAt?: Date;
  updatedAt?: Date;
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@ -40,8 +40,7 @@ export class WebScraperDataProvider {
  ): Promise<Document[]> {
    const totalUrls = urls.length;
    let processedUrls = 0;
-    console.log("Converting urls to documents");
-    console.log("Total urls", urls);
+  
    const results: (Document | null)[] = new Array(urls.length).fill(null);
    for (let i = 0; i < urls.length; i += this.concurrentRequests) {
      const batchUrls = urls.slice(i, i + this.concurrentRequests);
@ -195,7 +194,6 @@ export class WebScraperDataProvider {
        documents = await this.getSitemapData(baseUrl, documents);
        documents = documents.concat(pdfDocuments);

-        console.log("extraction mode ", this.extractorOptions.mode)
        if(this.extractorOptions.mode === "llm-extraction") {

          const llm = new OpenAI()
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@ -106,7 +106,6 @@ export async function scrapSingleUrl(
  toMarkdown: boolean = true,
  pageOptions: PageOptions = { onlyMainContent: true }
 ): Promise<Document> {
-  console.log(`Scraping URL: ${urlToScrap}`);
  urlToScrap = urlToScrap.trim();

  const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
@ -217,6 +216,7 @@ export async function scrapSingleUrl(
    return {
      content: text,
      markdown: text,
+      html: html,
      metadata: { ...metadata, sourceURL: urlToScrap },
    } as Document;
  } catch (error) {
--- a/apps/api/src/services/logging/log_job.ts
+++ b/apps/api/src/services/logging/log_job.ts
@ -8,6 +8,8 @@ export async function logJob(job: FirecrawlJob) {
    if (process.env.ENV !== "production") {
      return;
    }
+
+    // console.log("logg")
    const { data, error } = await supabase_service
      .from("firecrawl_jobs")
      .insert([
@ -23,6 +25,8 @@ export async function logJob(job: FirecrawlJob) {
          crawler_options: job.crawlerOptions,
          page_options: job.pageOptions,
          origin: job.origin,
+          extractor_options: job.extractor_options,
+          num_tokens: job.num_tokens
        },
      ]);
    if (error) {
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@ -1,3 +1,5 @@
+import { ExtractorOptions } from "./lib/entities";
+
 export interface CrawlResult {
  source: string;
  content: string;
@ -37,6 +39,8 @@ export interface FirecrawlJob {
  crawlerOptions?: any;
  pageOptions?: any;
  origin: string;
+  extractor_options?: ExtractorOptions,
+  num_tokens?: number
 }

 export enum RateLimiterMode {