
Caleb: trying to get logging working

Caleb Peffer 2024-04-30 09:20:15 -07:00
parent 79cd7d2ebc
commit 3ca9e5153f
12 changed files with 118 additions and 22 deletions

View File

@@ -46,7 +46,7 @@
     "@bull-board/api": "^5.14.2",
     "@bull-board/express": "^5.8.0",
     "@devil7softwares/pos": "^1.0.2",
-    "@dqbd/tiktoken": "^1.0.7",
+    "@dqbd/tiktoken": "^1.0.13",
    "@logtail/node": "^0.4.12",
    "@nangohq/node": "^0.36.33",
    "@sentry/node": "^7.48.0",

View File

@@ -21,7 +21,7 @@ dependencies:
     specifier: ^1.0.2
     version: 1.0.2
   '@dqbd/tiktoken':
-    specifier: ^1.0.7
+    specifier: ^1.0.13
     version: 1.0.13
   '@logtail/node':
     specifier: ^0.4.12

View File

@@ -252,7 +252,7 @@ describe("E2E Tests for API Routes", () => {
     }, 60000); // 60 seconds
   });

-  describe.only("POST /v0/scrape with LLM Extraction", () => {
+  describe("POST /v0/scrape with LLM Extraction", () => {
     it("should extract data using LLM extraction mode", async () => {
       const response = await request(TEST_URL)
         .post("/v0/scrape")
@@ -293,16 +293,6 @@
       // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
       let llmExtraction = response.body.data.llm_extraction;

-      // // Check if llm_extraction is a string and parse it if necessary
-      // if (typeof llmExtraction === 'string') {
-      //   llmExtraction = JSON.parse(llmExtraction);
-      // }
-
-      // Print the keys of the response.body for debugging purposes
-
       // Check if the llm_extraction object has the required properties with correct types and values
       expect(llmExtraction).toHaveProperty("company_mission");
       expect(typeof llmExtraction.company_mission).toBe("string");
@@ -315,6 +305,68 @@
     }, 60000); // 60 secs
   });

+  describe.only("POST /v0/scrape for Top 100 Companies", () => {
+    it("should extract data for the top 100 companies", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://companiesmarketcap.com/",
+          pageOptions: {
+            onlyMainContent: true
+          },
+          extractorOptions: {
+            mode: "llm-extraction",
+            extractionPrompt: "Extract the name, market cap, price, and today's change for the top 20 companies listed on the page.",
+            extractionSchema: {
+              type: "object",
+              properties: {
+                companies: {
+                  type: "array",
+                  items: {
+                    type: "object",
+                    properties: {
+                      rank: { type: "number" },
+                      name: { type: "string" },
+                      marketCap: { type: "string" },
+                      price: { type: "string" },
+                      todayChange: { type: "string" }
+                    },
+                    required: ["rank", "name", "marketCap", "price", "todayChange"]
+                  }
+                }
+              },
+              required: ["companies"]
+            }
+          }
+        });
+
+      // Print the response body to the console for debugging purposes
+      console.log("Response companies:", response.body.data.llm_extraction.companies);
+
+      // Check if the response has the correct structure and data types
+      expect(response.status).toBe(200);
+      expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true);
+      expect(response.body.data.llm_extraction.companies.length).toBe(40);
+
+      // Sample check for the first company
+      const firstCompany = response.body.data.llm_extraction.companies[0];
+      expect(firstCompany).toHaveProperty("name");
+      expect(typeof firstCompany.name).toBe("string");
+      expect(firstCompany).toHaveProperty("marketCap");
+      expect(typeof firstCompany.marketCap).toBe("string");
+      expect(firstCompany).toHaveProperty("price");
+      expect(typeof firstCompany.price).toBe("string");
+      expect(firstCompany).toHaveProperty("todayChange");
+      expect(typeof firstCompany.todayChange).toBe("string");
+    }, 120000); // 120 secs
+  });
+
   describe("GET /is-production", () => {
     it("should return the production status", async () => {
       const response = await request(TEST_URL).get("/is-production");
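Note: the shape these tests assert against, for a successful llm-extraction scrape, looks roughly like the sketch below. It is inferred from the assertions above, and every value is a placeholder, not real output.

    // Approximate /v0/scrape response body in llm-extraction mode (inferred, illustrative).
    const exampleResponse = {
      data: {
        content: "<scraped page text>",
        markdown: "<scraped page markdown>",
        llm_extraction: {
          companies: [
            { rank: 1, name: "Microsoft", marketCap: "$3.0 T", price: "$402", todayChange: "+0.5%" }
            // ...one entry per extracted company
          ]
        },
        metadata: { sourceURL: "https://companiesmarketcap.com/" }
      }
    };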

View File

@@ -8,6 +8,7 @@ import { logJob } from "../services/logging/log_job";
 import { Document } from "../lib/entities";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
 import Ajv from 'ajv';
+import { numTokensFromString } from '../lib/LLM-extraction/helpers';

 export async function scrapeHelper(
   req: Request,
@@ -51,9 +52,18 @@ export async function scrapeHelper(
     return { success: true, error: "No page found", returnCode: 200 };
   }

+  let creditsToBeBilled = filteredDocs.length;
+  const creditsPerLLMExtract = 4;
+
+  if (extractorOptions.mode === "llm-extraction") {
+    creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length)
+  }
+
+  // console.log("credits to be billed, ", creditsToBeBilled);
+
   const billingResult = await billTeam(
     team_id,
-    filteredDocs.length
+    creditsToBeBilled
   );
   if (!billingResult.success) {
     return {
@@ -109,6 +119,8 @@ export async function scrapeController(req: Request, res: Response) {
   );
   const endTime = new Date().getTime();
   const timeTakenInSeconds = (endTime - startTime) / 1000;
+
+  const numTokens = numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
   logJob({
     success: result.success,
     message: result.error,
@@ -120,7 +132,9 @@ export async function scrapeController(req: Request, res: Response) {
     url: req.body.url,
     crawlerOptions: crawlerOptions,
     pageOptions: pageOptions,
-    origin: origin,
+    origin: origin,
+    extractor_options: extractorOptions,
+    num_tokens: numTokens
   });
   return res.status(result.returnCode).json(result);
 } catch (error) {
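Note: the billing rule introduced above is that each returned page costs 1 credit, and llm-extraction adds creditsPerLLMExtract (4) more per page, so an LLM scrape bills at 5x. A minimal sketch of that arithmetic, reusing the names from the diff:

    // Sketch of the billing rule above: "llm-extraction" pages bill 1 + 4 = 5 credits each.
    const creditsPerLLMExtract = 4;

    function creditsForScrape(pageCount: number, mode: string): number {
      let credits = pageCount; // base: 1 credit per returned page
      if (mode === "llm-extraction") {
        credits += creditsPerLLMExtract * pageCount;
      }
      return credits;
    }

    console.log(creditsForScrape(3, "llm-extraction")); // 15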

View File

@@ -0,0 +1,18 @@
+import { encoding_for_model } from "@dqbd/tiktoken";
+import { TiktokenModel } from "@dqbd/tiktoken";
+
+// Calculates the number of tokens in a text string using the given model's encoding
+export function numTokensFromString(message: string, model: string): number {
+  const encoder = encoding_for_model(model as TiktokenModel);
+
+  // Encode the message into tokens
+  const tokens = encoder.encode(message);
+
+  // Free the encoder resources after use
+  encoder.free();
+
+  // Return the number of tokens
+  return tokens.length;
+}
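Note: a quick usage sketch for the helper above. The model string is cast to TiktokenModel inside the helper, so a model name tiktoken does not know will throw at runtime.

    import { numTokensFromString } from "./helpers";

    // gpt-3.5-turbo resolves to the cl100k_base encoding.
    const tokens = numTokensFromString("# Pricing\nFirecrawl bills per page scraped.", "gpt-3.5-turbo");
    console.log(tokens); // a small integer, e.g. around a dozen tokens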

View File

@@ -38,6 +38,7 @@ export async function generateCompletions(
   // Validate the JSON output against the schema using AJV
   const validate = ajv.compile(schema);
   if (!validate(completionResult.llm_extraction)) {
+    // TODO: add custom error-handling middleware that bubbles this up with a proper error code, etc.
     throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an error on our side. Try adjusting your prompt, and if it doesn't work, reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
   }
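Note: for context, this is the Ajv flow the hunk sits in, reduced to a standalone sketch; the schema and payload here are invented stand-ins for the caller's extractionSchema and the model's JSON output.

    import Ajv from "ajv";

    const ajv = new Ajv();

    // Illustrative stand-ins, not real request data.
    const schema = {
      type: "object",
      properties: { company_mission: { type: "string" } },
      required: ["company_mission"],
    };
    const llmExtraction = { company_mission: "Make web data LLM-ready" };

    const validate = ajv.compile(schema);
    if (!validate(llmExtraction)) {
      // validate.errors holds the per-keyword failures joined into the thrown message above
      console.error(validate.errors?.map((err) => err.message).join(", "));
    }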

View File

@@ -1,6 +1,7 @@
 import OpenAI from 'openai'
 import { z } from 'zod'
 import { Document, ExtractorOptions } from "../../lib/entities";
+import { numTokensFromString } from './helpers';

 // import {
 //   LlamaModel,
@@ -17,7 +18,7 @@ export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
 }

 const defaultPrompt =
-  'You are a satistified web scraper. Extract the contents of the webpage'
+  'You are a professional web scraper. Extract the contents of the webpage'

 function prepareOpenAIDoc(
   document: Document
@@ -28,12 +29,12 @@ function prepareOpenAIDoc(
     throw new Error("Markdown content is missing in the document.");
   }

-  return [{ type: 'text', text: document.markdown }]
+  return [{ type: 'text', text: document.html }]
 }

 export async function generateOpenAICompletions({
   client,
-  model = 'gpt-3.5-turbo',
+  model = 'gpt-4-turbo',
   document,
   schema, //TODO - add zod dynamic type checking
   prompt = defaultPrompt,
@@ -49,6 +50,7 @@ export async function generateOpenAICompletions({
   const openai = client as OpenAI
   const content = prepareOpenAIDoc(document)
+
   const completion = await openai.chat.completions.create({
     model,
     messages: [
@@ -77,6 +79,8 @@ export async function generateOpenAICompletions({
   // Extract the LLM extraction content from the completion response
   const llmExtraction = JSON.parse(c);

+  // console.log("llm extraction: ", llmExtraction);
+
   // Return the document with the LLM extraction content added
   return {
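Note: two effects of this file's changes are worth spelling out: extraction now reads document.html instead of document.markdown, and the default model moves from gpt-3.5-turbo to gpt-4-turbo. A stripped-down sketch of the resulting call shape; the system prompt and message layout are simplified, not the repo's exact code.

    import OpenAI from "openai";

    const openai = new OpenAI();

    // Sends raw page HTML as a text content part, mirroring prepareOpenAIDoc's output.
    async function sketchCompletion(html: string, prompt: string) {
      const completion = await openai.chat.completions.create({
        model: "gpt-4-turbo",
        messages: [
          { role: "system", content: prompt },
          { role: "user", content: [{ type: "text", text: html }] },
        ],
      });
      return completion.choices[0].message.content;
    }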

View File

@@ -57,6 +57,7 @@ export class Document {
   url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
+  html?: string;
   llm_extraction?: Record<string, any>;
   createdAt?: Date;
   updatedAt?: Date;

View File

@@ -40,8 +40,7 @@ export class WebScraperDataProvider {
   ): Promise<Document[]> {
     const totalUrls = urls.length;
     let processedUrls = 0;
-    console.log("Converting urls to documents");
-    console.log("Total urls", urls);
+
     const results: (Document | null)[] = new Array(urls.length).fill(null);
     for (let i = 0; i < urls.length; i += this.concurrentRequests) {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
@@ -195,7 +194,6 @@ export class WebScraperDataProvider {
     documents = await this.getSitemapData(baseUrl, documents);
     documents = documents.concat(pdfDocuments);

-    console.log("extraction mode ", this.extractorOptions.mode)
     if(this.extractorOptions.mode === "llm-extraction") {
       const llm = new OpenAI()

View File

@@ -106,7 +106,6 @@ export async function scrapSingleUrl(
   toMarkdown: boolean = true,
   pageOptions: PageOptions = { onlyMainContent: true }
 ): Promise<Document> {
-  console.log(`Scraping URL: ${urlToScrap}`);
   urlToScrap = urlToScrap.trim();

   const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
@@ -217,6 +216,7 @@ export async function scrapSingleUrl(
     return {
       content: text,
       markdown: text,
+      html: html,
       metadata: { ...metadata, sourceURL: urlToScrap },
     } as Document;
   } catch (error) {
} catch (error) {

View File

@@ -8,6 +8,8 @@ export async function logJob(job: FirecrawlJob) {
   if (process.env.ENV !== "production") {
     return;
   }
+
+  // console.log("logg")
   const { data, error } = await supabase_service
     .from("firecrawl_jobs")
     .insert([
@@ -23,6 +25,8 @@ export async function logJob(job: FirecrawlJob) {
         crawler_options: job.crawlerOptions,
         page_options: job.pageOptions,
         origin: job.origin,
+        extractor_options: job.extractor_options,
+        num_tokens: job.num_tokens
       },
     ]);
   if (error) {

View File

@@ -1,3 +1,5 @@
+import { ExtractorOptions } from "./lib/entities";
+
 export interface CrawlResult {
   source: string;
   content: string;
@@ -37,6 +39,8 @@ export interface FirecrawlJob {
   crawlerOptions?: any;
   pageOptions?: any;
   origin: string;
+  extractor_options?: ExtractorOptions,
+  num_tokens?: number
 }

 export enum RateLimiterMode {
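Note: to close the loop on the commit's goal, here is a hypothetical job record showing only the two fields this commit threads through to logJob. All values are invented, and Partial<> elides the other FirecrawlJob members, whose full shape is not visible in this diff.

    import { FirecrawlJob } from "./types";

    // Only the new logging fields; real jobs carry the full FirecrawlJob shape.
    const jobPatch: Partial<FirecrawlJob> = {
      origin: "api",
      extractor_options: { mode: "llm-extraction" },
      num_tokens: 1874,
    };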