diff --git a/apps/api/package.json b/apps/api/package.json
index 00ce1bb..047feaf 100644
--- a/apps/api/package.json
+++ b/apps/api/package.json
@@ -46,7 +46,7 @@
     "@bull-board/api": "^5.14.2",
     "@bull-board/express": "^5.8.0",
     "@devil7softwares/pos": "^1.0.2",
-    "@dqbd/tiktoken": "^1.0.7",
+    "@dqbd/tiktoken": "^1.0.13",
     "@logtail/node": "^0.4.12",
     "@nangohq/node": "^0.36.33",
     "@sentry/node": "^7.48.0",
diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml
index 8062354..bd5e37b 100644
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
@@ -21,7 +21,7 @@ dependencies:
     specifier: ^1.0.2
     version: 1.0.2
   '@dqbd/tiktoken':
-    specifier: ^1.0.7
+    specifier: ^1.0.13
    version: 1.0.13
   '@logtail/node':
     specifier: ^0.4.12
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 4a47638..fb9d8af 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -252,7 +252,7 @@ describe("E2E Tests for API Routes", () => {
     }, 60000); // 60 seconds
   });
 
-  describe.only("POST /v0/scrape with LLM Extraction", () => {
+  describe("POST /v0/scrape with LLM Extraction", () => {
     it("should extract data using LLM extraction mode", async () => {
       const response = await request(TEST_URL)
         .post("/v0/scrape")
@@ -293,16 +293,6 @@ describe("E2E Tests for API Routes", () => {
 
       // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
       let llmExtraction = response.body.data.llm_extraction;
-
-      // // Check if llm_extraction is a string and parse it if necessary
-      // if (typeof llmExtraction === 'string') {
-      //   llmExtraction = JSON.parse(llmExtraction);
-      // }
-
-      // Print the keys of the response.body for debugging purposes
-
-
-
       // Check if the llm_extraction object has the required properties with correct types and values
       expect(llmExtraction).toHaveProperty("company_mission");
       expect(typeof llmExtraction.company_mission).toBe("string");
@@ -315,6 +305,64 @@ describe("E2E Tests for API Routes", () => {
     }, 60000); // 60 secs
   });
 
+  describe("POST /v0/scrape for Top 20 Companies", () => {
+    it("should extract data for the top 20 companies", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://companiesmarketcap.com/",
+          pageOptions: {
+            onlyMainContent: true
+          },
+          extractorOptions: {
+            mode: "llm-extraction",
+            extractionPrompt: "Extract the rank, name, market cap, price, and today's change for the top 20 companies listed on the page.",
+            extractionSchema: {
+              type: "object",
+              properties: {
+                companies: {
+                  type: "array",
+                  items: {
+                    type: "object",
+                    properties: {
+                      rank: { type: "number" },
+                      name: { type: "string" },
+                      marketCap: { type: "string" },
+                      price: { type: "string" },
+                      todayChange: { type: "string" }
+                    },
+                    required: ["rank", "name", "marketCap", "price", "todayChange"]
+                  }
+                }
+              },
+              required: ["companies"]
+            }
+          }
+        });
+
+      // Print the extracted companies to the console for debugging purposes
+      console.log("Response companies:", response.body.data.llm_extraction.companies);
+
+      // Check if the response has the correct structure and data types
+      expect(response.status).toBe(200);
+      expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true);
+      expect(response.body.data.llm_extraction.companies.length).toBe(20);
+
+      // Sample check for the first company
+      const firstCompany = response.body.data.llm_extraction.companies[0];
+      expect(firstCompany).toHaveProperty("name");
+      expect(typeof firstCompany.name).toBe("string");
+      expect(firstCompany).toHaveProperty("marketCap");
+      expect(typeof firstCompany.marketCap).toBe("string");
+      expect(firstCompany).toHaveProperty("price");
+      expect(typeof firstCompany.price).toBe("string");
+      expect(firstCompany).toHaveProperty("todayChange");
+      expect(typeof firstCompany.todayChange).toBe("string");
+    }, 120000); // 120 secs
+  });
+
   describe("GET /is-production", () => {
     it("should return the production status", async () => {
       const response = await request(TEST_URL).get("/is-production");
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index 43b8ca4..d2340e8 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -8,6 +8,7 @@ import { logJob } from "../services/logging/log_job";
 import { Document } from "../lib/entities";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
 import Ajv from 'ajv';
+import { numTokensFromString } from '../lib/LLM-extraction/helpers';
 
 export async function scrapeHelper(
   req: Request,
@@ -51,9 +52,17 @@ export async function scrapeHelper(
     return { success: true, error: "No page found", returnCode: 200 };
   }
 
+  let creditsToBeBilled = filteredDocs.length;
+  const creditsPerLLMExtract = 4;
+
+  if (extractorOptions.mode === "llm-extraction") {
+    creditsToBeBilled += creditsPerLLMExtract * filteredDocs.length;
+  }
+
   const billingResult = await billTeam(
     team_id,
-    filteredDocs.length
+    creditsToBeBilled
   );
   if (!billingResult.success) {
     return {
@@ -109,6 +118,8 @@ export async function scrapeController(req: Request, res: Response) {
   );
   const endTime = new Date().getTime();
   const timeTakenInSeconds = (endTime - startTime) / 1000;
+  const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
+
   logJob({
     success: result.success,
     message: result.error,
@@ -120,7 +131,9 @@
     url: req.body.url,
     crawlerOptions: crawlerOptions,
     pageOptions: pageOptions,
-    origin: origin,
+    origin: origin,
+    extractor_options: extractorOptions,
+    num_tokens: numTokens,
   });
   return res.status(result.returnCode).json(result);
 } catch (error) {
diff --git a/apps/api/src/lib/LLM-extraction/helpers.ts b/apps/api/src/lib/LLM-extraction/helpers.ts
new file mode 100644
index 0000000..2535964
--- /dev/null
+++ b/apps/api/src/lib/LLM-extraction/helpers.ts
@@ -0,0 +1,15 @@
+import { encoding_for_model, TiktokenModel } from "@dqbd/tiktoken";
+
+// Calculates the number of tokens in a text string using the encoding for the given model
+export function numTokensFromString(message: string, model: string): number {
+  const encoder = encoding_for_model(model as TiktokenModel);
+
+  // Encode the message into tokens
+  const tokens = encoder.encode(message);
+
+  // Free the encoder resources after use
+  encoder.free();
+
+  // Return the number of tokens
+  return tokens.length;
+}
diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts
index 237fdbe..9fae79d 100644
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -38,6 +38,7 @@ export async function generateCompletions(
   // Validate the JSON output against the schema using AJV
   const validate = ajv.compile(schema);
   if (!validate(completionResult.llm_extraction)) {
+    // TODO: add custom error-handling middleware that bubbles this up with a proper error code, etc.
     throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
   }
 
diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
index 9114511..177fe64 100644
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -1,6 +1,7 @@
 import OpenAI from 'openai'
 import { z } from 'zod'
 import { Document, ExtractorOptions } from "../../lib/entities";
+import { numTokensFromString } from './helpers';
 
 // import {
 //   LlamaModel,
@@ -17,7 +18,7 @@ export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
 }
 
 const defaultPrompt =
-  'You are a satistified web scraper. Extract the contents of the webpage'
+  'You are a professional web scraper. Extract the contents of the webpage'
 
 function prepareOpenAIDoc(
   document: Document
@@ -28,12 +29,12 @@ function prepareOpenAIDoc(
     throw new Error("Markdown content is missing in the document.");
   }
 
-  return [{ type: 'text', text: document.markdown }]
+  return [{ type: 'text', text: document.html }]
 }
 
 export async function generateOpenAICompletions({
   client,
-  model = 'gpt-3.5-turbo',
+  model = 'gpt-4-turbo',
   document,
   schema, //TODO - add zod dynamic type checking
   prompt = defaultPrompt,
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 4ceab63..4008785 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -57,6 +57,7 @@ export class Document {
   url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
+  html?: string;
   llm_extraction?: Record<string, any>;
   createdAt?: Date;
   updatedAt?: Date;
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index b278e38..0bd1a82 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -40,8 +40,7 @@ export class WebScraperDataProvider {
   ): Promise<Document[]> {
     const totalUrls = urls.length;
     let processedUrls = 0;
-    console.log("Converting urls to documents");
-    console.log("Total urls", urls);
+
     const results: (Document | null)[] = new Array(urls.length).fill(null);
     for (let i = 0; i < urls.length; i += this.concurrentRequests) {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
@@ -195,7 +194,6 @@ export class WebScraperDataProvider {
 
     documents = await this.getSitemapData(baseUrl, documents);
     documents = documents.concat(pdfDocuments);
-    console.log("extraction mode ", this.extractorOptions.mode)
 
     if(this.extractorOptions.mode === "llm-extraction") {
       const llm = new OpenAI()
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index af215ce..12ff9c5 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -106,7 +106,6 @@ export async function scrapSingleUrl(
   toMarkdown: boolean = true,
   pageOptions: PageOptions = { onlyMainContent: true }
 ): Promise<Document> {
-  console.log(`Scraping URL: ${urlToScrap}`);
   urlToScrap = urlToScrap.trim();
 
   const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
@@ -217,6 +216,7 @@ export async function scrapSingleUrl(
     return {
       content: text,
       markdown: text,
+      html: html,
       metadata: { ...metadata, sourceURL: urlToScrap },
     } as Document;
   } catch (error) {
diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts
index 639b3a8..965ac29 100644
--- a/apps/api/src/services/logging/log_job.ts
+++ b/apps/api/src/services/logging/log_job.ts
@@ -23,6 +23,8 @@ export async function logJob(job: FirecrawlJob) {
       crawler_options: job.crawlerOptions,
       page_options: job.pageOptions,
       origin: job.origin,
+      extractor_options: job.extractor_options,
+      num_tokens: job.num_tokens,
     },
   ]);
   if (error) {
diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts
index c65140c..c1858f1 100644
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@@ -1,3 +1,5 @@
+import { ExtractorOptions } from "./lib/entities";
+
 export interface CrawlResult {
   source: string;
   content: string;
@@ -37,6 +39,8 @@ export interface FirecrawlJob {
   crawlerOptions?: any;
   pageOptions?: any;
   origin: string;
+  extractor_options?: ExtractorOptions;
+  num_tokens?: number;
 }
 
 export enum RateLimiterMode {
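For reference, a minimal sketch of how the new `numTokensFromString` helper and the LLM-extraction billing math introduced in `scrapeHelper` fit together. The import path, sample text, and document count below are hypothetical; the helper itself is the one added in `apps/api/src/lib/LLM-extraction/helpers.ts` above.

```ts
import { numTokensFromString } from "./lib/LLM-extraction/helpers"; // hypothetical path, relative to apps/api/src

// Count tokens the way scrapeController now does before logging the job
const markdown = "# Example Page\n\nSome scraped markdown content."; // hypothetical sample
const numTokens = numTokensFromString(markdown, "gpt-3.5-turbo");
console.log(`num_tokens: ${numTokens}`);

// Billing math mirrored from scrapeHelper: each document costs 1 credit,
// plus creditsPerLLMExtract (4) extra credits per document when LLM extraction runs
const filteredDocsLength = 3; // hypothetical number of scraped documents
const creditsPerLLMExtract = 4;
const creditsToBeBilled = filteredDocsLength + creditsPerLLMExtract * filteredDocsLength;
console.log(`credits billed: ${creditsToBeBilled}`); // 15 credits for 3 docs
```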
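Similarly, a sketch of the AJV check that `generateCompletions` applies to the parsed LLM output. The schema and sample payload here are hypothetical, shaped like the `extractionSchema` used in the new e2e test.

```ts
import Ajv from "ajv";

const ajv = new Ajv();

// Hypothetical schema, shaped like the extractionSchema in the new e2e test
const schema = {
  type: "object",
  properties: {
    companies: { type: "array", items: { type: "object" } },
  },
  required: ["companies"],
};

const validate = ajv.compile(schema);
const llmExtraction = { companies: [{ name: "Acme" }] }; // hypothetical LLM output

if (!validate(llmExtraction)) {
  // Mirrors the error raised in apps/api/src/lib/LLM-extraction/index.ts
  throw new Error(
    `LLM extraction did not match the extraction schema you provided. AJV error: ${validate.errors
      ?.map((err) => err.message)
      .join(", ")}`
  );
}
```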