Caleb: trying to get logging working
This commit is contained in:
parent
79cd7d2ebc
commit
3ca9e5153f
@ -46,7 +46,7 @@
|
|||||||
"@bull-board/api": "^5.14.2",
|
"@bull-board/api": "^5.14.2",
|
||||||
"@bull-board/express": "^5.8.0",
|
"@bull-board/express": "^5.8.0",
|
||||||
"@devil7softwares/pos": "^1.0.2",
|
"@devil7softwares/pos": "^1.0.2",
|
||||||
"@dqbd/tiktoken": "^1.0.7",
|
"@dqbd/tiktoken": "^1.0.13",
|
||||||
"@logtail/node": "^0.4.12",
|
"@logtail/node": "^0.4.12",
|
||||||
"@nangohq/node": "^0.36.33",
|
"@nangohq/node": "^0.36.33",
|
||||||
"@sentry/node": "^7.48.0",
|
"@sentry/node": "^7.48.0",
|
||||||
|
@ -21,7 +21,7 @@ dependencies:
|
|||||||
specifier: ^1.0.2
|
specifier: ^1.0.2
|
||||||
version: 1.0.2
|
version: 1.0.2
|
||||||
'@dqbd/tiktoken':
|
'@dqbd/tiktoken':
|
||||||
specifier: ^1.0.7
|
specifier: ^1.0.13
|
||||||
version: 1.0.13
|
version: 1.0.13
|
||||||
'@logtail/node':
|
'@logtail/node':
|
||||||
specifier: ^0.4.12
|
specifier: ^0.4.12
|
||||||
|
@ -252,7 +252,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
}, 60000); // 60 seconds
|
}, 60000); // 60 seconds
|
||||||
});
|
});
|
||||||
|
|
||||||
describe.only("POST /v0/scrape with LLM Extraction", () => {
|
describe("POST /v0/scrape with LLM Extraction", () => {
|
||||||
it("should extract data using LLM extraction mode", async () => {
|
it("should extract data using LLM extraction mode", async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v0/scrape")
|
.post("/v0/scrape")
|
||||||
@ -293,16 +293,6 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
// Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
|
// Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
|
||||||
let llmExtraction = response.body.data.llm_extraction;
|
let llmExtraction = response.body.data.llm_extraction;
|
||||||
|
|
||||||
|
|
||||||
// // Check if llm_extraction is a string and parse it if necessary
|
|
||||||
// if (typeof llmExtraction === 'string') {
|
|
||||||
// llmExtraction = JSON.parse(llmExtraction);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// Print the keys of the response.body for debugging purposes
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Check if the llm_extraction object has the required properties with correct types and values
|
// Check if the llm_extraction object has the required properties with correct types and values
|
||||||
expect(llmExtraction).toHaveProperty("company_mission");
|
expect(llmExtraction).toHaveProperty("company_mission");
|
||||||
expect(typeof llmExtraction.company_mission).toBe("string");
|
expect(typeof llmExtraction.company_mission).toBe("string");
|
||||||
@ -315,6 +305,68 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
}, 60000); // 60 secs
|
}, 60000); // 60 secs
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// E2E check that LLM extraction over a listing page returns a structured
// array of companies matching the requested schema.
// NOTE(review): `.only` was removed so the remaining suites in this file still
// run; re-add it locally while debugging, but do not commit it.
describe("POST /v0/scrape for Top 20 Companies", () => {
  it("should extract data for the top 20 companies", async () => {
    // Single source of truth for the count requested in the prompt and
    // asserted on below (the original asked for 20 but expected 40).
    const expectedCompanyCount = 20;

    const response = await request(TEST_URL)
      .post("/v0/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        url: "https://companiesmarketcap.com/",
        pageOptions: {
          onlyMainContent: true
        },
        extractorOptions: {
          mode: "llm-extraction",
          extractionPrompt: `Extract the name, market cap, price, and today's change for the top ${expectedCompanyCount} companies listed on the page.`,
          extractionSchema: {
            type: "object",
            properties: {
              companies: {
                type: "array",
                items: {
                  type: "object",
                  properties: {
                    rank: { type: "number" },
                    name: { type: "string" },
                    marketCap: { type: "string" },
                    price: { type: "string" },
                    todayChange: { type: "string" }
                  },
                  required: ["rank", "name", "marketCap", "price", "todayChange"]
                }
              }
            },
            required: ["companies"]
          }
        }
      });

    // Assert the status before touching the body so a failed request is
    // reported as a status mismatch instead of a TypeError on `undefined`.
    expect(response.status).toBe(200);

    // Check that the response has the correct structure and data types
    const companies = response.body.data.llm_extraction.companies;
    expect(Array.isArray(companies)).toBe(true);
    expect(companies.length).toBe(expectedCompanyCount);

    // Sample check for the first company
    const firstCompany = companies[0];
    expect(firstCompany).toHaveProperty("name");
    expect(typeof firstCompany.name).toBe("string");
    expect(firstCompany).toHaveProperty("marketCap");
    expect(typeof firstCompany.marketCap).toBe("string");
    expect(firstCompany).toHaveProperty("price");
    expect(typeof firstCompany.price).toBe("string");
    expect(firstCompany).toHaveProperty("todayChange");
    expect(typeof firstCompany.todayChange).toBe("string");
  }, 120000); // 120 secs
});
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
describe("GET /is-production", () => {
|
describe("GET /is-production", () => {
|
||||||
it("should return the production status", async () => {
|
it("should return the production status", async () => {
|
||||||
const response = await request(TEST_URL).get("/is-production");
|
const response = await request(TEST_URL).get("/is-production");
|
||||||
|
@ -8,6 +8,7 @@ import { logJob } from "../services/logging/log_job";
|
|||||||
import { Document } from "../lib/entities";
|
import { Document } from "../lib/entities";
|
||||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||||
import Ajv from 'ajv';
|
import Ajv from 'ajv';
|
||||||
|
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
|
||||||
|
|
||||||
export async function scrapeHelper(
|
export async function scrapeHelper(
|
||||||
req: Request,
|
req: Request,
|
||||||
@ -51,9 +52,18 @@ export async function scrapeHelper(
|
|||||||
return { success: true, error: "No page found", returnCode: 200 };
|
return { success: true, error: "No page found", returnCode: 200 };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
let creditsToBeBilled = filteredDocs.length;
|
||||||
|
const creditsPerLLMExtract = 4;
|
||||||
|
|
||||||
|
if (extractorOptions.mode === "llm-extraction"){
|
||||||
|
creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length)
|
||||||
|
}
|
||||||
|
// console.log("credits to be billed, ", creditsToBeBilled);
|
||||||
|
|
||||||
const billingResult = await billTeam(
|
const billingResult = await billTeam(
|
||||||
team_id,
|
team_id,
|
||||||
filteredDocs.length
|
creditsToBeBilled
|
||||||
);
|
);
|
||||||
if (!billingResult.success) {
|
if (!billingResult.success) {
|
||||||
return {
|
return {
|
||||||
@ -109,6 +119,8 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
);
|
);
|
||||||
const endTime = new Date().getTime();
|
const endTime = new Date().getTime();
|
||||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||||
|
const numTokens = numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
|
||||||
|
|
||||||
logJob({
|
logJob({
|
||||||
success: result.success,
|
success: result.success,
|
||||||
message: result.error,
|
message: result.error,
|
||||||
@ -120,7 +132,9 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
url: req.body.url,
|
url: req.body.url,
|
||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions: crawlerOptions,
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
origin: origin,
|
origin: origin,
|
||||||
|
extractor_options: extractorOptions,
|
||||||
|
num_tokens: numTokens
|
||||||
});
|
});
|
||||||
return res.status(result.returnCode).json(result);
|
return res.status(result.returnCode).json(result);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
18
apps/api/src/lib/LLM-extraction/helpers.ts
Normal file
18
apps/api/src/lib/LLM-extraction/helpers.ts
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
|
||||||
|
|
||||||
|
import { encoding_for_model } from "@dqbd/tiktoken";
|
||||||
|
import { TiktokenModel } from "@dqbd/tiktoken";
|
||||||
|
|
||||||
|
/**
 * Counts the tokens in `message` using the tiktoken encoding selected for `model`.
 *
 * @param message - Text to tokenize. An empty (or missing) message yields 0
 *   without creating an encoder; callers pass `Document.markdown`, which is optional.
 * @param model - Model name; it is cast to `TiktokenModel` and handed to
 *   `encoding_for_model`. NOTE(review): presumably an unsupported name makes
 *   `encoding_for_model` throw — confirm against the library.
 * @returns The number of tokens the encoder produced for `message`.
 */
export function numTokensFromString(message: string, model: string): number {
  if (!message) {
    return 0;
  }

  const encoder = encoding_for_model(model as TiktokenModel);
  try {
    // Encode the message into tokens and report how many there are
    return encoder.encode(message).length;
  } finally {
    // Always release the encoder, even when encoding throws
    encoder.free();
  }
}
|
@ -38,6 +38,7 @@ export async function generateCompletions(
|
|||||||
// Validate the JSON output against the schema using AJV
|
// Validate the JSON output against the schema using AJV
|
||||||
const validate = ajv.compile(schema);
|
const validate = ajv.compile(schema);
|
||||||
if (!validate(completionResult.llm_extraction)) {
|
if (!validate(completionResult.llm_extraction)) {
|
||||||
|
//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
|
||||||
throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
|
throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import OpenAI from 'openai'
|
import OpenAI from 'openai'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { Document, ExtractorOptions } from "../../lib/entities";
|
import { Document, ExtractorOptions } from "../../lib/entities";
|
||||||
|
import { numTokensFromString } from './helpers';
|
||||||
|
|
||||||
// import {
|
// import {
|
||||||
// LlamaModel,
|
// LlamaModel,
|
||||||
@ -17,7 +18,7 @@ export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const defaultPrompt =
|
const defaultPrompt =
|
||||||
'You are a satistified web scraper. Extract the contents of the webpage'
|
'You are a professional web scraper. Extract the contents of the webpage'
|
||||||
|
|
||||||
function prepareOpenAIDoc(
|
function prepareOpenAIDoc(
|
||||||
document: Document
|
document: Document
|
||||||
@ -28,12 +29,12 @@ function prepareOpenAIDoc(
|
|||||||
throw new Error("Markdown content is missing in the document.");
|
throw new Error("Markdown content is missing in the document.");
|
||||||
}
|
}
|
||||||
|
|
||||||
return [{ type: 'text', text: document.markdown }]
|
return [{ type: 'text', text: document.html}]
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function generateOpenAICompletions({
|
export async function generateOpenAICompletions({
|
||||||
client,
|
client,
|
||||||
model = 'gpt-3.5-turbo',
|
model = 'gpt-4-turbo',
|
||||||
document,
|
document,
|
||||||
schema, //TODO - add zod dynamic type checking
|
schema, //TODO - add zod dynamic type checking
|
||||||
prompt = defaultPrompt,
|
prompt = defaultPrompt,
|
||||||
@ -49,6 +50,7 @@ export async function generateOpenAICompletions({
|
|||||||
const openai = client as OpenAI
|
const openai = client as OpenAI
|
||||||
const content = prepareOpenAIDoc(document)
|
const content = prepareOpenAIDoc(document)
|
||||||
|
|
||||||
|
|
||||||
const completion = await openai.chat.completions.create({
|
const completion = await openai.chat.completions.create({
|
||||||
model,
|
model,
|
||||||
messages: [
|
messages: [
|
||||||
@ -77,6 +79,8 @@ export async function generateOpenAICompletions({
|
|||||||
// Extract the LLM extraction content from the completion response
|
// Extract the LLM extraction content from the completion response
|
||||||
const llmExtraction = JSON.parse(c);
|
const llmExtraction = JSON.parse(c);
|
||||||
|
|
||||||
|
// console.log("llm extraction: ", llmExtraction);
|
||||||
|
|
||||||
|
|
||||||
// Return the document with the LLM extraction content added
|
// Return the document with the LLM extraction content added
|
||||||
return {
|
return {
|
||||||
|
@ -57,6 +57,7 @@ export class Document {
|
|||||||
url?: string; // Used only in /search for now
|
url?: string; // Used only in /search for now
|
||||||
content: string;
|
content: string;
|
||||||
markdown?: string;
|
markdown?: string;
|
||||||
|
html?: string;
|
||||||
llm_extraction?: Record<string, any>;
|
llm_extraction?: Record<string, any>;
|
||||||
createdAt?: Date;
|
createdAt?: Date;
|
||||||
updatedAt?: Date;
|
updatedAt?: Date;
|
||||||
|
@ -40,8 +40,7 @@ export class WebScraperDataProvider {
|
|||||||
): Promise<Document[]> {
|
): Promise<Document[]> {
|
||||||
const totalUrls = urls.length;
|
const totalUrls = urls.length;
|
||||||
let processedUrls = 0;
|
let processedUrls = 0;
|
||||||
console.log("Converting urls to documents");
|
|
||||||
console.log("Total urls", urls);
|
|
||||||
const results: (Document | null)[] = new Array(urls.length).fill(null);
|
const results: (Document | null)[] = new Array(urls.length).fill(null);
|
||||||
for (let i = 0; i < urls.length; i += this.concurrentRequests) {
|
for (let i = 0; i < urls.length; i += this.concurrentRequests) {
|
||||||
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
||||||
@ -195,7 +194,6 @@ export class WebScraperDataProvider {
|
|||||||
documents = await this.getSitemapData(baseUrl, documents);
|
documents = await this.getSitemapData(baseUrl, documents);
|
||||||
documents = documents.concat(pdfDocuments);
|
documents = documents.concat(pdfDocuments);
|
||||||
|
|
||||||
console.log("extraction mode ", this.extractorOptions.mode)
|
|
||||||
if(this.extractorOptions.mode === "llm-extraction") {
|
if(this.extractorOptions.mode === "llm-extraction") {
|
||||||
|
|
||||||
const llm = new OpenAI()
|
const llm = new OpenAI()
|
||||||
|
@ -106,7 +106,6 @@ export async function scrapSingleUrl(
|
|||||||
toMarkdown: boolean = true,
|
toMarkdown: boolean = true,
|
||||||
pageOptions: PageOptions = { onlyMainContent: true }
|
pageOptions: PageOptions = { onlyMainContent: true }
|
||||||
): Promise<Document> {
|
): Promise<Document> {
|
||||||
console.log(`Scraping URL: ${urlToScrap}`);
|
|
||||||
urlToScrap = urlToScrap.trim();
|
urlToScrap = urlToScrap.trim();
|
||||||
|
|
||||||
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
|
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
|
||||||
@ -217,6 +216,7 @@ export async function scrapSingleUrl(
|
|||||||
return {
|
return {
|
||||||
content: text,
|
content: text,
|
||||||
markdown: text,
|
markdown: text,
|
||||||
|
html: html,
|
||||||
metadata: { ...metadata, sourceURL: urlToScrap },
|
metadata: { ...metadata, sourceURL: urlToScrap },
|
||||||
} as Document;
|
} as Document;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
@ -8,6 +8,8 @@ export async function logJob(job: FirecrawlJob) {
|
|||||||
if (process.env.ENV !== "production") {
|
if (process.env.ENV !== "production") {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// console.log("logg")
|
||||||
const { data, error } = await supabase_service
|
const { data, error } = await supabase_service
|
||||||
.from("firecrawl_jobs")
|
.from("firecrawl_jobs")
|
||||||
.insert([
|
.insert([
|
||||||
@ -23,6 +25,8 @@ export async function logJob(job: FirecrawlJob) {
|
|||||||
crawler_options: job.crawlerOptions,
|
crawler_options: job.crawlerOptions,
|
||||||
page_options: job.pageOptions,
|
page_options: job.pageOptions,
|
||||||
origin: job.origin,
|
origin: job.origin,
|
||||||
|
extractor_options: job.extractor_options,
|
||||||
|
num_tokens: job.num_tokens
|
||||||
},
|
},
|
||||||
]);
|
]);
|
||||||
if (error) {
|
if (error) {
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
import { ExtractorOptions } from "./lib/entities";
|
||||||
|
|
||||||
export interface CrawlResult {
|
export interface CrawlResult {
|
||||||
source: string;
|
source: string;
|
||||||
content: string;
|
content: string;
|
||||||
@ -37,6 +39,8 @@ export interface FirecrawlJob {
|
|||||||
crawlerOptions?: any;
|
crawlerOptions?: any;
|
||||||
pageOptions?: any;
|
pageOptions?: any;
|
||||||
origin: string;
|
origin: string;
|
||||||
|
extractor_options?: ExtractorOptions,
|
||||||
|
num_tokens?: number
|
||||||
}
|
}
|
||||||
|
|
||||||
export enum RateLimiterMode {
|
export enum RateLimiterMode {
|
||||||
|
Loading…
Reference in New Issue
Block a user