From 3ca9e5153f99da20b13478811ec0587eb003d434 Mon Sep 17 00:00:00 2001
From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com>
Date: Tue, 30 Apr 2024 09:20:15 -0700
Subject: [PATCH] Caleb: trying to get logging working

---
 apps/api/package.json                         |  2 +-
 apps/api/pnpm-lock.yaml                       |  2 +-
 .../src/__tests__/e2e_withAuth/index.test.ts  | 74 ++++++++++++++++---
 apps/api/src/controllers/scrape.ts            | 18 ++++-
 apps/api/src/lib/LLM-extraction/helpers.ts    | 18 +++++
 apps/api/src/lib/LLM-extraction/index.ts      |  1 +
 apps/api/src/lib/LLM-extraction/models.ts     | 10 ++-
 apps/api/src/lib/entities.ts                  |  1 +
 apps/api/src/scraper/WebScraper/index.ts      |  4 +-
 apps/api/src/scraper/WebScraper/single_url.ts |  2 +-
 apps/api/src/services/logging/log_job.ts      |  4 +
 apps/api/src/types.ts                         |  4 +
 12 files changed, 118 insertions(+), 22 deletions(-)
 create mode 100644 apps/api/src/lib/LLM-extraction/helpers.ts

diff --git a/apps/api/package.json b/apps/api/package.json
index 00ce1bb..047feaf 100644
--- a/apps/api/package.json
+++ b/apps/api/package.json
@@ -46,7 +46,7 @@
     "@bull-board/api": "^5.14.2",
     "@bull-board/express": "^5.8.0",
     "@devil7softwares/pos": "^1.0.2",
-    "@dqbd/tiktoken": "^1.0.7",
+    "@dqbd/tiktoken": "^1.0.13",
     "@logtail/node": "^0.4.12",
     "@nangohq/node": "^0.36.33",
     "@sentry/node": "^7.48.0",
diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml
index 8062354..bd5e37b 100644
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
@@ -21,7 +21,7 @@ dependencies:
     specifier: ^1.0.2
     version: 1.0.2
   '@dqbd/tiktoken':
-    specifier: ^1.0.7
+    specifier: ^1.0.13
     version: 1.0.13
   '@logtail/node':
     specifier: ^0.4.12
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 4a47638..fb9d8af 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -252,7 +252,7 @@ describe("E2E Tests for API Routes", () => {
     }, 60000); // 60 seconds
   });
 
-  describe.only("POST /v0/scrape with LLM Extraction", () => {
+  describe("POST /v0/scrape with LLM Extraction", () => {
     it("should extract data using LLM extraction mode", async () => {
       const response = await request(TEST_URL)
         .post("/v0/scrape")
@@ -293,16 +293,6 @@ describe("E2E Tests for API Routes", () => {
 
       // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
       let llmExtraction = response.body.data.llm_extraction;
-
-      // // Check if llm_extraction is a string and parse it if necessary
-      // if (typeof llmExtraction === 'string') {
-      //   llmExtraction = JSON.parse(llmExtraction);
-      // }
-
-      // Print the keys of the response.body for debugging purposes
-
-
-
       // Check if the llm_extraction object has the required properties with correct types and values
       expect(llmExtraction).toHaveProperty("company_mission");
       expect(typeof llmExtraction.company_mission).toBe("string");
@@ -315,6 +305,68 @@ describe("E2E Tests for API Routes", () => {
     }, 60000); // 60 secs
   });
 
+  describe.only("POST /v0/scrape for Top 100 Companies", () => {
+    it("should extract data for the top 20 companies", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://companiesmarketcap.com/",
+          pageOptions: {
+            onlyMainContent: true
+          },
+          extractorOptions: {
+            mode: "llm-extraction",
+            extractionPrompt: "Extract the name, market cap, price, and today's change for the top 20 companies listed on the page.",
+            extractionSchema: {
+              type: "object",
+              properties: {
+                companies: {
+                  type: "array",
+                  items: {
+                    type: "object",
+                    properties: {
+                      rank: { type: "number" },
+                      name: { type: "string" },
+                      marketCap: { type: "string" },
+                      price: { type: "string" },
+                      todayChange: { type: "string" }
+                    },
+                    required: ["rank", "name", "marketCap", "price", "todayChange"]
+                  }
+                }
+              },
+              required: ["companies"]
+            }
+          }
+        });
+
+      // Print the extracted companies to the console for debugging purposes
+      console.log("Response companies:", response.body.data.llm_extraction.companies);
+
+      // Check if the response has the correct structure and data types
+      expect(response.status).toBe(200);
+      expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true);
+      expect(response.body.data.llm_extraction.companies.length).toBe(20);
+
+      // Sample check for the first company
+      const firstCompany = response.body.data.llm_extraction.companies[0];
+      expect(firstCompany).toHaveProperty("name");
+      expect(typeof firstCompany.name).toBe("string");
+      expect(firstCompany).toHaveProperty("marketCap");
+      expect(typeof firstCompany.marketCap).toBe("string");
+      expect(firstCompany).toHaveProperty("price");
+      expect(typeof firstCompany.price).toBe("string");
+      expect(firstCompany).toHaveProperty("todayChange");
+      expect(typeof firstCompany.todayChange).toBe("string");
+    }, 120000); // 120 secs
+  });
+
+
+
+
   describe("GET /is-production", () => {
     it("should return the production status", async () => {
       const response = await request(TEST_URL).get("/is-production");
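For reference, the request this new suite sends can be reproduced outside the Jest harness with a plain HTTP call. This is a minimal sketch, assuming a running API and placeholder environment variables `FIRECRAWL_URL` and `FIRECRAWL_API_KEY` (names invented for this example, and Node 18+ for the global `fetch`); the endpoint, body shape, and `data.llm_extraction` response path come from the test above:

```ts
// Hypothetical standalone client for the /v0/scrape LLM-extraction call.
async function scrapeTopCompanies(): Promise<unknown> {
  const res = await fetch(`${process.env.FIRECRAWL_URL}/v0/scrape`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      url: "https://companiesmarketcap.com/",
      pageOptions: { onlyMainContent: true },
      extractorOptions: {
        mode: "llm-extraction",
        extractionPrompt:
          "Extract the name, market cap, price, and today's change for the top 20 companies listed on the page.",
        // Trimmed version of the schema used in the test
        extractionSchema: {
          type: "object",
          properties: { companies: { type: "array", items: { type: "object" } } },
          required: ["companies"],
        },
      },
    }),
  });
  if (!res.ok) throw new Error(`Scrape failed: ${res.status}`);
  const body = await res.json();
  return body.data.llm_extraction.companies;
}
```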
the page.", + extractionSchema: { + type: "object", + properties: { + companies: { + type: "array", + items: { + type: "object", + properties: { + rank: { type: "number" }, + name: { type: "string" }, + marketCap: { type: "string" }, + price: { type: "string" }, + todayChange: { type: "string" } + }, + required: ["rank", "name", "marketCap", "price", "todayChange"] + } + } + }, + required: ["companies"] + } + } + }); + + + // Print the response body to the console for debugging purposes + console.log("Response companies:", response.body.data.llm_extraction.companies); + + // Check if the response has the correct structure and data types + expect(response.status).toBe(200); + expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true); + expect(response.body.data.llm_extraction.companies.length).toBe(40); + + // Sample check for the first company + const firstCompany = response.body.data.llm_extraction.companies[0]; + expect(firstCompany).toHaveProperty("name"); + expect(typeof firstCompany.name).toBe("string"); + expect(firstCompany).toHaveProperty("marketCap"); + expect(typeof firstCompany.marketCap).toBe("string"); + expect(firstCompany).toHaveProperty("price"); + expect(typeof firstCompany.price).toBe("string"); + expect(firstCompany).toHaveProperty("todayChange"); + expect(typeof firstCompany.todayChange).toBe("string"); + }, 120000); // 120 secs + }); + + + + describe("GET /is-production", () => { it("should return the production status", async () => { const response = await request(TEST_URL).get("/is-production"); diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 43b8ca4..d2340e8 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -8,6 +8,7 @@ import { logJob } from "../services/logging/log_job"; import { Document } from "../lib/entities"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import Ajv from 'ajv'; +import { numTokensFromString } from '../lib/LLM-extraction/helpers'; export async function scrapeHelper( req: Request, @@ -51,9 +52,18 @@ export async function scrapeHelper( return { success: true, error: "No page found", returnCode: 200 }; } + + let creditsToBeBilled = filteredDocs.length; + const creditsPerLLMExtract = 4; + + if (extractorOptions.mode === "llm-extraction"){ + creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length) + } + // console.log("credits to be billed, ", creditsToBeBilled); + const billingResult = await billTeam( team_id, - filteredDocs.length + creditsToBeBilled ); if (!billingResult.success) { return { @@ -109,6 +119,8 @@ export async function scrapeController(req: Request, res: Response) { ); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; + const numTokens = numTokensFromString(result.data.markdown, "gpt-3.5-turbo") + logJob({ success: result.success, message: result.error, @@ -120,7 +132,9 @@ export async function scrapeController(req: Request, res: Response) { url: req.body.url, crawlerOptions: crawlerOptions, pageOptions: pageOptions, - origin: origin, + origin: origin, + extractor_options: extractorOptions, + num_tokens: numTokens }); return res.status(result.returnCode).json(result); } catch (error) { diff --git a/apps/api/src/lib/LLM-extraction/helpers.ts b/apps/api/src/lib/LLM-extraction/helpers.ts new file mode 100644 index 0000000..2535964 --- /dev/null +++ b/apps/api/src/lib/LLM-extraction/helpers.ts @@ -0,0 
diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts
index 237fdbe..9fae79d 100644
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -38,6 +38,7 @@ export async function generateCompletions(
       // Validate the JSON output against the schema using AJV
       const validate = ajv.compile(schema);
       if (!validate(completionResult.llm_extraction)) {
+        // TODO: add custom error-handling middleware that bubbles this up with the proper error code, etc.
         throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an error on our side. Try adjusting your prompt, and if it doesn't work, reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
       }
diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
index 9114511..177fe64 100644
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -1,6 +1,7 @@
 import OpenAI from 'openai'
 import { z } from 'zod'
 import { Document, ExtractorOptions } from "../../lib/entities";
+import { numTokensFromString } from './helpers';
 
 // import {
 //   LlamaModel,
@@ -17,7 +18,7 @@ export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
 }
 
 const defaultPrompt =
-  'You are a satistified web scraper. Extract the contents of the webpage'
+  'You are a professional web scraper. Extract the contents of the webpage'
 
 function prepareOpenAIDoc(
   document: Document
@@ -28,12 +29,12 @@ function prepareOpenAIDoc(
     throw new Error("Markdown content is missing in the document.");
   }
 
-  return [{ type: 'text', text: document.markdown }]
+  return [{ type: 'text', text: document.html }]
 }
 
 export async function generateOpenAICompletions({
   client,
-  model = 'gpt-3.5-turbo',
+  model = 'gpt-4-turbo',
   document,
   schema, //TODO - add zod dynamic type checking
   prompt = defaultPrompt,
@@ -49,6 +50,7 @@ export async function generateOpenAICompletions({
   const openai = client as OpenAI
   const content = prepareOpenAIDoc(document)
 
+
   const completion = await openai.chat.completions.create({
     model,
     messages: [
@@ -77,6 +79,8 @@ export async function generateOpenAICompletions({
   // Extract the LLM extraction content from the completion response
   const llmExtraction = JSON.parse(c);
 
+  // console.log("llm extraction: ", llmExtraction);
+
   // Return the document with the LLM extraction content added
   return {
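The AJV check above is what turns a schema mismatch into a thrown error. Here is a self-contained sketch of that validation step, using a plain JSON-schema object like the one the new e2e test sends (the function name is illustrative, not from the patch):

```ts
import Ajv from "ajv";

const ajv = new Ajv();

// Mirrors the validation in generateCompletions: compile the caller's
// extractionSchema and reject extractions that do not conform.
function assertMatchesSchema(schema: object, llmExtraction: unknown): void {
  const validate = ajv.compile(schema);
  if (!validate(llmExtraction)) {
    const details = validate.errors?.map((err) => err.message).join(", ");
    throw new Error(`LLM extraction did not match the extraction schema. AJV error: ${details}`);
  }
}

// Example: passes for a conforming object, throws otherwise.
assertMatchesSchema(
  { type: "object", properties: { companies: { type: "array" } }, required: ["companies"] },
  { companies: [] }
);
```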
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 4ceab63..4008785 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -57,6 +57,7 @@ export class Document {
   url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
+  html?: string;
   llm_extraction?: Record<string, any>;
   createdAt?: Date;
   updatedAt?: Date;
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index b278e38..0bd1a82 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -40,8 +40,7 @@ export class WebScraperDataProvider {
   ): Promise<Document[]> {
     const totalUrls = urls.length;
     let processedUrls = 0;
-    console.log("Converting urls to documents");
-    console.log("Total urls", urls);
+
     const results: (Document | null)[] = new Array(urls.length).fill(null);
     for (let i = 0; i < urls.length; i += this.concurrentRequests) {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
@@ -195,7 +194,6 @@ export class WebScraperDataProvider {
       documents = await this.getSitemapData(baseUrl, documents);
 
       documents = documents.concat(pdfDocuments);
-      console.log("extraction mode ", this.extractorOptions.mode)
 
       if (this.extractorOptions.mode === "llm-extraction") {
         const llm = new OpenAI()
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index af215ce..12ff9c5 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -106,7 +106,6 @@ export async function scrapSingleUrl(
   toMarkdown: boolean = true,
   pageOptions: PageOptions = { onlyMainContent: true }
 ): Promise<Document> {
-  console.log(`Scraping URL: ${urlToScrap}`);
   urlToScrap = urlToScrap.trim();
 
   const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
@@ -217,6 +216,7 @@ export async function scrapSingleUrl(
     return {
       content: text,
       markdown: text,
+      html: html,
       metadata: { ...metadata, sourceURL: urlToScrap },
     } as Document;
   } catch (error) {
diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts
index 639b3a8..965ac29 100644
--- a/apps/api/src/services/logging/log_job.ts
+++ b/apps/api/src/services/logging/log_job.ts
@@ -8,6 +8,8 @@ export async function logJob(job: FirecrawlJob) {
   if (process.env.ENV !== "production") {
     return;
   }
+
+  // console.log("logg")
   const { data, error } = await supabase_service
     .from("firecrawl_jobs")
     .insert([
@@ -23,6 +25,8 @@ export async function logJob(job: FirecrawlJob) {
         crawler_options: job.crawlerOptions,
         page_options: job.pageOptions,
         origin: job.origin,
+        extractor_options: job.extractor_options,
+        num_tokens: job.num_tokens
       },
     ]);
   if (error) {
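With `html` added to `Document` and filled in by `scrapSingleUrl`, the raw HTML now travels alongside the markdown so `prepareOpenAIDoc` can feed it to the model. A sketch of the shape in play; `DocumentLike` is a local stand-in for the real `Document` class, shown only so the snippet compiles on its own:

```ts
// Stand-in for the fields of Document that this patch touches.
interface DocumentLike {
  content: string;
  markdown?: string;
  html?: string;
  llm_extraction?: Record<string, any>;
}

const doc: DocumentLike = {
  content: "Example page text",
  markdown: "# Example page text",
  html: "<h1>Example page text</h1>",
};

// prepareOpenAIDoc now selects the raw HTML rather than the markdown.
const llmInput = [{ type: "text", text: doc.html }];
```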
diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts
index c65140c..c1858f1 100644
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@@ -1,3 +1,5 @@
+import { ExtractorOptions } from "./lib/entities";
+
 export interface CrawlResult {
   source: string;
   content: string;
@@ -37,6 +39,8 @@ export interface FirecrawlJob {
   crawlerOptions?: any;
   pageOptions?: any;
   origin: string;
+  extractor_options?: ExtractorOptions;
+  num_tokens?: number;
 }
 
 export enum RateLimiterMode {
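The logging payload grows by two columns: `extractor_options`, echoing the request's extractor settings, and `num_tokens`, the tiktoken count of the scraped markdown. A sketch of just the additions, with a local stand-in for `ExtractorOptions` so the snippet is self-contained; the values are invented examples, not from the patch:

```ts
// Local stand-in for ExtractorOptions from lib/entities; the real interface
// also carries extractionPrompt and extractionSchema.
type ExtractorOptionsLike = {
  mode: string;
  extractionPrompt?: string;
  extractionSchema?: object;
};

// The two fields added to FirecrawlJob and inserted into firecrawl_jobs.
type FirecrawlJobAdditions = {
  extractor_options?: ExtractorOptionsLike;
  num_tokens?: number;
};

const logged: FirecrawlJobAdditions = {
  extractor_options: { mode: "llm-extraction" },
  num_tokens: 512, // example value
};
```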