Caleb: got it to a testable state I believe
commit 06497729e2
parent 6ee1f2d3bc
@@ -8,7 +8,7 @@ dotenv.config();
 const TEST_URL = "http://127.0.0.1:3002";


-describe("E2E Tests for API Routes", () => {
+describe.only("E2E Tests for API Routes", () => {
   beforeAll(() => {
     process.env.USE_DB_AUTHENTICATION = "true";
   });
@@ -252,6 +252,48 @@ const TEST_URL = "http://127.0.0.1:3002";
     }, 60000); // 60 seconds
   });

+  describe("POST /v0/scrape with LLM Extraction", () => {
+    it("should extract data using LLM extraction mode", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai",
+          pageOptions: {
+            onlyMainContent: true
+          },
+          extractorOptions: {
+            extractorMode: "llm-extract",
+            extractor_prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+            extractorSchema: {
+              type: "object",
+              properties: {
+                company_mission: {
+                  type: "string"
+                },
+                supports_sso: {
+                  type: "boolean"
+                },
+                is_open_source: {
+                  type: "boolean"
+                }
+              },
+              required: ["company_mission", "supports_sso", "is_open_source"]
+            }
+          }
+        });
+
+      console.log("Response:", response.body);
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("company_mission");
+      expect(response.body.data).toHaveProperty("supports_sso");
+      expect(response.body.data).toHaveProperty("is_open_source");
+    });
+  });
+
   describe("GET /is-production", () => {
     it("should return the production status", async () => {
       const response = await request(TEST_URL).get("/is-production");
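Note: describe.only restricts the Jest run to this suite, so the other e2e suites in this file are skipped while this change is in place. Note also that the payload above uses extractorMode / extractor_prompt / extractorSchema (with the value "llm-extract"), while the ExtractorOptions type added to lib/entities in this same commit expects mode / extractionPrompt / extractionSchema with mode "llm-extraction". A request body aligned with the committed type would look roughly like the following; the field names come from lib/entities, the values from the test, and the whole object is illustrative rather than part of the diff:

    const body = {
      url: "https://mendable.ai",
      pageOptions: { onlyMainContent: true },
      extractorOptions: {
        mode: "llm-extraction",
        extractionPrompt:
          "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
        extractionSchema: {
          type: "object",
          properties: {
            company_mission: { type: "string" },
            supports_sso: { type: "boolean" },
            is_open_source: { type: "boolean" },
          },
          required: ["company_mission", "supports_sso", "is_open_source"],
        },
      },
    };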
@@ -1,3 +1,4 @@
+import { ExtractorOptions } from './../lib/entities';
 import { Request, Response } from "express";
 import { WebScraperDataProvider } from "../scraper/WebScraper";
 import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
@@ -11,7 +12,8 @@ export async function scrapeHelper(
   req: Request,
   team_id: string,
   crawlerOptions: any,
-  pageOptions: any
+  pageOptions: any,
+  extractorOptions: any
 ): Promise<{
   success: boolean;
   error?: string;
@@ -35,6 +37,7 @@ export async function scrapeHelper(
       ...crawlerOptions,
     },
     pageOptions: pageOptions,
+    extractorOptions: extractorOptions
   });

   const docs = await a.getDocuments(false);
@@ -79,6 +82,9 @@ export async function scrapeController(req: Request, res: Response) {
     }
     const crawlerOptions = req.body.crawlerOptions ?? {};
     const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+    const extractorOptions = req.body.extractorOptions ?? {
+      mode: "markdown"
+    }
     const origin = req.body.origin ?? "api";

     try {
@@ -96,7 +102,8 @@ export async function scrapeController(req: Request, res: Response) {
       req,
       team_id,
       crawlerOptions,
-      pageOptions
+      pageOptions,
+      extractorOptions
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
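The controller now defaults extractorOptions to markdown mode and threads it through scrapeHelper into the WebScraperDataProvider options. A minimal sketch of the resulting call site, with the ExtractorOptions type from lib/entities substituted for the any used in the commit (names taken from the diff, the sketch itself is illustrative):

    import { ExtractorOptions } from "../lib/entities";

    // Default applied when the request body carries no extractorOptions.
    const extractorOptions: ExtractorOptions =
      req.body.extractorOptions ?? { mode: "markdown" };

    const result = await scrapeHelper(
      req,
      team_id,
      crawlerOptions,
      pageOptions,
      extractorOptions
    );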
apps/api/src/lib/LLM-extraction/index.ts (new file, 48 lines)
@@ -0,0 +1,48 @@
+import Turndown from 'turndown'
+import OpenAI from 'openai'
+// import { LlamaModel } from 'node-llama-cpp'
+import { z } from 'zod'
+import { zodToJsonSchema } from 'zod-to-json-schema'
+import {
+  ScraperCompletionResult,
+  generateOpenAICompletions,
+} from './models.js'
+import { ExtractorOptions } from '../entities.js'
+
+// Generate completion using OpenAI
+export function generateCompletions(
+  documents: Document[],
+  extractionOptions: ExtractorOptions
+): Promise < ScraperCompletionResult < T >> [] {
+  // const schema = zodToJsonSchema(options.schema)
+
+  const schema = extractionOptions.extractionSchema;
+  const prompt = extractionOptions.extractionPrompt;
+
+  const loader = documents.map(async (document, i) => {
+    switch (this.client.constructor) {
+      case true:
+        return generateOpenAICompletions<T>(
+          this.client as OpenAI,
+
+          schema,
+          options?.prompt,
+          options?.temperature
+        )
+
+      //TODO add other models
+      // case LlamaModel:
+      //   return generateLlamaCompletions<T>(
+      //     this.client,
+      //     await page,
+      //     schema,
+      //     options?.prompt,
+      //     options?.temperature
+      //   )
+      default:
+        throw new Error('Invalid client')
+    }
+  })
+
+  return loader
+}
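As committed, generateCompletions will not compile: Document is never imported, the return type annotation is malformed, and this.client, T, and options are referenced but never defined, so the OpenAI branch can never be selected. A minimal sketch of the apparent intent, assuming an OpenAI client built from OPENAI_API_KEY and the object-style generateOpenAICompletions defined in models.ts below; the client construction and the Promise.all batching are assumptions, not part of the commit:

    import OpenAI from 'openai'
    import { Document, ExtractorOptions } from '../entities.js'
    import { generateOpenAICompletions } from './models.js'

    // Run LLM extraction over each scraped document and return the documents
    // with their llm_extraction field populated.
    export async function generateCompletions(
      documents: Document[],
      extractionOptions: ExtractorOptions
    ): Promise<Document[]> {
      const client = new OpenAI({ apiKey: process.env.OPENAI_API_KEY })
      const schema = extractionOptions.extractionSchema
      const prompt = extractionOptions.extractionPrompt

      return Promise.all(
        documents.map((document) =>
          generateOpenAICompletions({ client, document, schema, prompt })
        )
      )
    }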
@@ -1,6 +1,8 @@
 import OpenAI from 'openai'
 import { z } from 'zod'
 import { ScraperLoadResult } from './types'
+import { Document, ExtractorOptions } from "../../lib/entities";
+
 // import {
 //   LlamaModel,
 //   LlamaJsonSchemaGrammar,
@@ -8,41 +10,45 @@ import { ScraperLoadResult } from './types'
 //   LlamaChatSession,
 //   GbnfJsonSchema,
 // } from 'node-llama-cpp'
-import { JsonSchema7Type } from 'zod-to-json-schema'
+// import { JsonSchema7Type } from 'zod-to-json-schema'

 export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
-  data: z.infer<T> | null
+  data: any | null
   url: string
 }

 const defaultPrompt =
   'You are a satistified web scraper. Extract the contents of the webpage'

-function prepareOpenAIPage(
-  page: ScraperLoadResult
+function prepareOpenAIDoc(
+  document: Document
 ): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
-  if (page.mode === 'image') {
-    return [
-      {
-        type: 'image_url',
-        image_url: { url: `data:image/jpeg;base64,${page.content}` },
-      },
-    ]
+  // Check if the markdown content exists in the document
+  if (!document.markdown) {
+    throw new Error("Markdown content is missing in the document.");
   }

-  return [{ type: 'text', text: page.content }]
+  return [{ type: 'text', text: document.markdown }]
 }

-export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
+export async function generateOpenAICompletions<T>({
+  client,
+  model = 'gpt-3.5-turbo',
+  document,
+  schema, //TODO - add zod dynamic type checking
+  prompt = defaultPrompt,
+  temperature
+}: {
   client: OpenAI,
-  model: string = 'gpt-3.5-turbo',
-  page: ScraperLoadResult,
-  schema: JsonSchema7Type,
-  prompt: string = defaultPrompt,
+  model?: string,
+  document: Document,
+  schema: any, // This should be replaced with a proper Zod schema type when available
+  prompt?: string,
   temperature?: number
-): Promise<ScraperCompletionResult<T>> {
+}): Promise<Document> {
   const openai = client as OpenAI
-  const content = prepareOpenAIPage(page)
+  const content = prepareOpenAIDoc(document)

   const completion = await openai.chat.completions.create({
     model,
@@ -68,10 +74,16 @@ export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
   })

   const c = completion.choices[0].message.tool_calls[0].function.arguments
+
+  // Extract the LLM extraction content from the completion response
+  const llmExtraction = c;
+
+  // Return the document with the LLM extraction content added
   return {
-    data: JSON.parse(c),
-    url: page.url,
-  }
+    ...document,
+    llm_extraction: llmExtraction
+  };
+
 }

 // export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
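The middle of the chat.completions.create call (the messages and tool definition) falls outside this hunk. For context only, a typical OpenAI function-calling request that yields choices[0].message.tool_calls[0].function.arguments looks roughly like the following; the tool name, description, and message layout here are illustrative and not taken from the commit:

    const completion = await openai.chat.completions.create({
      model,
      messages: [
        { role: 'system', content: prompt },
        { role: 'user', content }, // content built by prepareOpenAIDoc
      ],
      tools: [
        {
          type: 'function',
          function: {
            name: 'extract_content',
            description: 'Extracts structured content from the webpage',
            parameters: schema, // JSON Schema supplied by the caller
          },
        },
      ],
      tool_choice: { type: 'function', function: { name: 'extract_content' } },
      temperature,
    })

Note that llm_extraction now stores the raw arguments string as returned by the model; the previous code parsed it with JSON.parse before returning.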
@@ -3,8 +3,3 @@ export type ScraperLoadOptions = {
   closeOnFinish?: boolean
 }

-export type ScraperLoadResult = {
-  url: string
-  content: string
-  mode: ScraperLoadOptions['mode']
-}
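Removing ScraperLoadResult here leaves models.ts (above) with a dangling import from './types', since prepareOpenAIDoc and generateOpenAICompletions now operate on Document instead. A likely follow-up, not part of this commit, is simply to drop that import:

    // models.ts imports after removing the now-unused ScraperLoadResult
    import OpenAI from 'openai'
    import { z } from 'zod'
    import { Document, ExtractorOptions } from "../../lib/entities";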
@@ -16,6 +16,12 @@ export type PageOptions = {

 };

+export type ExtractorOptions = {
+  mode: "markdown" | "llm-extraction";
+  extractionPrompt?: string;
+  extractionSchema?: Record<string, any>;
+}
+
 export type SearchOptions = {
   limit?: number;
   tbs?: string;
@@ -38,6 +44,7 @@ export type WebScraperOptions = {
     replaceAllPathsWithAbsolutePaths?: boolean;
   };
   pageOptions?: PageOptions;
+  extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
 };

@@ -50,6 +57,7 @@ export class Document {
   url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
+  llm_extraction?: string;
   createdAt?: Date;
   updatedAt?: Date;
   type?: string;
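WebScraperOptions now carries extractorOptions alongside pageOptions. A minimal sketch of an options object with LLM extraction enabled; the crawl mode and url fields are not shown in this diff and are omitted here, and the prompt and schema values are illustrative:

    import { WebScraperOptions } from "../../lib/entities";

    const options: Partial<WebScraperOptions> = {
      pageOptions: { onlyMainContent: true },
      extractorOptions: {
        mode: "llm-extraction",
        extractionPrompt: "What is the company's mission?",
        extractionSchema: {
          type: "object",
          properties: { company_mission: { type: "string" } },
          required: ["company_mission"],
        },
      },
      concurrentRequests: 20,
    };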
@@ -1,4 +1,4 @@
-import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
+import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities";
 import { Progress } from "../../lib/entities";
 import { scrapSingleUrl } from "./single_url";
 import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
@@ -7,6 +7,8 @@ import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/imageDescription";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
+import OpenAI from 'openai'
+

 export class WebScraperDataProvider {
   private urls: string[] = [""];
@@ -19,6 +21,7 @@ export class WebScraperDataProvider {
   private concurrentRequests: number = 20;
   private generateImgAltText: boolean = false;
   private pageOptions?: PageOptions;
+  private extractorOptions?: ExtractorOptions;
   private replaceAllPathsWithAbsolutePaths?: boolean = false;
   private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";

@@ -191,6 +194,22 @@ export class WebScraperDataProvider {
       documents = await this.getSitemapData(baseUrl, documents);
       documents = documents.concat(pdfDocuments);
+
+
+
+
+      if(this.extractorOptions.mode === "llm-extraction") {
+
+        // const llm = new OpenAI()
+        // generateCompletions(
+        //   client=llm,
+        //   page =,
+        //   schema=
+
+        // )
+
+
+      }

       await this.setCachedDocuments(documents);
       documents = this.removeChildLinks(documents);
       documents = documents.splice(0, this.limit);
@@ -376,6 +395,7 @@ export class WebScraperDataProvider {
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
     this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
+    this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;

     //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
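The llm-extraction branch in this hunk is still a stub: the generateCompletions call is commented out, and this.extractorOptions is dereferenced without a guard even though the field is optional. A hedged sketch of the eventual wiring, assuming the corrected generateCompletions(documents, extractorOptions) signature sketched for lib/LLM-extraction/index.ts above:

    if (this.extractorOptions?.mode === "llm-extraction") {
      // Enrich each document with an llm_extraction field before caching.
      documents = await generateCompletions(documents, this.extractorOptions);
    }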