From 6ee1f2d3bc954189f83040f89e070d8b69ff9fb7 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Sun, 28 Apr 2024 13:59:35 -0700 Subject: [PATCH 01/15] Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper --- apps/api/package.json | 5 +- apps/api/pnpm-lock.yaml | 42 +++++--- apps/api/src/lib/LLM-extraction/models.ts | 99 +++++++++++++++++++ apps/api/src/lib/LLM-extraction/types.ts | 10 ++ apps/api/src/scraper/WebScraper/single_url.ts | 2 + 5 files changed, 145 insertions(+), 13 deletions(-) create mode 100644 apps/api/src/lib/LLM-extraction/models.ts create mode 100644 apps/api/src/lib/LLM-extraction/types.ts diff --git a/apps/api/package.json b/apps/api/package.json index 078c6b6..0f826da 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -68,6 +68,7 @@ "gpt3-tokenizer": "^1.1.5", "ioredis": "^5.3.2", "joplin-turndown-plugin-gfm": "^1.0.12", + "json-schema-to-zod": "^2.1.0", "keyword-extractor": "^0.0.25", "langchain": "^0.1.25", "languagedetect": "^2.0.0", @@ -93,7 +94,9 @@ "unstructured-client": "^0.9.4", "uuid": "^9.0.1", "wordpos": "^2.1.0", - "xml2js": "^0.6.2" + "xml2js": "^0.6.2", + "zod": "^3.23.4", + "zod-to-json-schema": "^3.23.0" }, "nodemonConfig": { "ignore": [ diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 2b61222..d72dad0 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -86,6 +86,9 @@ dependencies: joplin-turndown-plugin-gfm: specifier: ^1.0.12 version: 1.0.12 + json-schema-to-zod: + specifier: ^2.1.0 + version: 2.1.0 keyword-extractor: specifier: ^0.0.25 version: 0.0.25 @@ -164,6 +167,12 @@ dependencies: xml2js: specifier: ^0.6.2 version: 0.6.2 + zod: + specifier: ^3.23.4 + version: 3.23.4 + zod-to-json-schema: + specifier: ^3.23.0 + version: 3.23.0(zod@3.23.4) devDependencies: '@flydotio/dockerfile': @@ -1200,7 +1209,7 @@ packages: redis: 4.6.13 typesense: 1.7.2(@babel/runtime@7.24.0) uuid: 9.0.1 - zod: 3.22.4 + zod: 3.23.4 transitivePeerDependencies: - encoding dev: false @@ -1218,8 +1227,8 @@ packages: p-queue: 6.6.2 p-retry: 4.6.2 uuid: 9.0.1 - zod: 3.22.4 - zod-to-json-schema: 3.22.4(zod@3.22.4) + zod: 3.23.4 + zod-to-json-schema: 3.23.0(zod@3.23.4) dev: false /@langchain/openai@0.0.18: @@ -1229,8 +1238,8 @@ packages: '@langchain/core': 0.1.43 js-tiktoken: 1.0.10 openai: 4.28.4 - zod: 3.22.4 - zod-to-json-schema: 3.22.4(zod@3.22.4) + zod: 3.23.4 + zod-to-json-schema: 3.23.0(zod@3.23.4) transitivePeerDependencies: - encoding dev: false @@ -3985,6 +3994,11 @@ packages: /json-parse-even-better-errors@2.3.1: resolution: {integrity: sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==} + /json-schema-to-zod@2.1.0: + resolution: {integrity: sha512-7ishNgYY+AbIKeeHcp5xCOdJbdVwSfDx/4V2ktc16LUusCJJbz2fEKdWUmAxhKIiYzhZ9Fp4E8OsAoM/h9cOLA==} + hasBin: true + dev: false + /json5@2.2.3: resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==} engines: {node: '>=6'} @@ -4209,8 +4223,8 @@ packages: redis: 4.6.13 uuid: 9.0.1 yaml: 2.4.1 - zod: 3.22.4 - zod-to-json-schema: 3.22.4(zod@3.22.4) + zod: 3.23.4 + zod-to-json-schema: 3.23.0(zod@3.23.4) transitivePeerDependencies: - '@aws-crypto/sha256-js' - '@aws-sdk/client-bedrock-agent-runtime' @@ -5069,7 +5083,7 @@ packages: sbd: 1.0.19 typescript: 5.4.5 uuid: 9.0.1 - zod: 3.22.4 + zod: 3.23.4 transitivePeerDependencies: - debug dev: false @@ -6185,14 +6199,18 @@ packages: engines: {node: '>=10'} 
     dev: true

-  /zod-to-json-schema@3.22.4(zod@3.22.4):
-    resolution: {integrity: sha512-2Ed5dJ+n/O3cU383xSY28cuVi0BCQhF8nYqWU5paEpl7fVdqdAmiLdqLyfblbNdfOFwFfi/mqU4O1pwc60iBhQ==}
+  /zod-to-json-schema@3.23.0(zod@3.23.4):
+    resolution: {integrity: sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==}
     peerDependencies:
-      zod: ^3.22.4
+      zod: ^3.23.3
     dependencies:
-      zod: 3.22.4
+      zod: 3.23.4
     dev: false

   /zod@3.22.4:
     resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==}
     dev: false
+
+  /zod@3.23.4:
+    resolution: {integrity: sha512-/AtWOKbBgjzEYYQRNfoGKHObgfAZag6qUJX1VbHo2PRBgS+wfWagEY2mizjfyAPcGesrJOcx/wcl0L9WnVrHFw==}
+    dev: false
diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
new file mode 100644
index 0000000..6e57024
--- /dev/null
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -0,0 +1,99 @@
+import OpenAI from 'openai'
+import { z } from 'zod'
+import { ScraperLoadResult } from './types'
+// import {
+//   LlamaModel,
+//   LlamaJsonSchemaGrammar,
+//   LlamaContext,
+//   LlamaChatSession,
+//   GbnfJsonSchema,
+// } from 'node-llama-cpp'
+import { JsonSchema7Type } from 'zod-to-json-schema'
+
+export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
+  data: z.infer<T> | null
+  url: string
+}
+
+const defaultPrompt =
+  'You are a satisfied web scraper. Extract the contents of the webpage'
+
+function prepareOpenAIPage(
+  page: ScraperLoadResult
+): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
+  if (page.mode === 'image') {
+    return [
+      {
+        type: 'image_url',
+        image_url: { url: `data:image/jpeg;base64,${page.content}` },
+      },
+    ]
+  }
+
+  return [{ type: 'text', text: page.content }]
+}
+
+export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
+  client: OpenAI,
+  model: string = 'gpt-3.5-turbo',
+  page: ScraperLoadResult,
+  schema: JsonSchema7Type,
+  prompt: string = defaultPrompt,
+  temperature?: number
+): Promise<ScraperCompletionResult<T>> {
+  const openai = client as OpenAI
+  const content = prepareOpenAIPage(page)
+
+  const completion = await openai.chat.completions.create({
+    model,
+    messages: [
+      {
+        role: 'system',
+        content: prompt,
+      },
+      { role: 'user', content },
+    ],
+    tools: [
+      {
+        type: 'function',
+        function: {
+          name: 'extract_content',
+          description: 'Extracts the content from the given webpage(s)',
+          parameters: schema,
+        },
+      },
+    ],
+    tool_choice: 'auto',
+    temperature,
+  })
+
+  const c = completion.choices[0].message.tool_calls[0].function.arguments
+  return {
+    data: JSON.parse(c),
+    url: page.url,
+  }
+}
+
+// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
+//   model: LlamaModel,
+//   page: ScraperLoadResult,
+//   schema: JsonSchema7Type,
+//   prompt: string = defaultPrompt,
+//   temperature?: number
+// ): Promise<ScraperCompletionResult<T>> {
+//   const grammar = new LlamaJsonSchemaGrammar(schema as GbnfJsonSchema) as any // any, because it has weird type inference going on
+//   const context = new LlamaContext({ model })
+//   const session = new LlamaChatSession({ context })
+//   const pagePrompt = `${prompt}\n${page.content}`
+
+//   const result = await session.prompt(pagePrompt, {
+//     grammar,
+//     temperature,
+//   })
+
+//   const parsed = grammar.parse(result)
+//   return {
+//     data: parsed,
+//     url: page.url,
+//   }
+// }
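
The mechanism models.ts builds on is OpenAI function/tool calling with a JSON Schema contract: the caller's schema is passed through as the tool's `parameters`, and the model's answer comes back as a JSON string in `tool_calls[0].function.arguments`. A minimal self-contained sketch of that flow, assuming the `openai` v4 SDK and an illustrative schema (not the one used elsewhere in this series):

    import OpenAI from 'openai'

    const openai = new OpenAI() // reads OPENAI_API_KEY from the environment

    // A stand-in schema; any JSON Schema object can be passed through.
    const exampleSchema = {
      type: 'object',
      properties: { title: { type: 'string' } },
      required: ['title'],
    }

    async function extract(pageText: string) {
      const completion = await openai.chat.completions.create({
        model: 'gpt-3.5-turbo',
        messages: [
          { role: 'system', content: 'Extract the contents of the webpage' },
          { role: 'user', content: pageText },
        ],
        tools: [
          {
            type: 'function',
            function: {
              name: 'extract_content',
              description: 'Extracts the content from the given webpage(s)',
              parameters: exampleSchema,
            },
          },
        ],
        tool_choice: 'auto',
      })

      // With tool_choice 'auto' the model may answer in plain text instead,
      // so check for the tool call before parsing its arguments.
      const call = completion.choices[0].message.tool_calls?.[0]
      if (!call) throw new Error('model did not call the extraction tool')
      return JSON.parse(call.function.arguments)
    }

Note that the parse can still throw and the parsed object can still violate the schema; later commits in this series (JSON.parse of the arguments in PATCH 04, AJV validation in PATCH 05) close exactly those gaps.
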
diff --git a/apps/api/src/lib/LLM-extraction/types.ts b/apps/api/src/lib/LLM-extraction/types.ts
new file mode 100644
index 0000000..6f3a543
--- /dev/null
+++ b/apps/api/src/lib/LLM-extraction/types.ts
@@ -0,0 +1,10 @@
+export type ScraperLoadOptions = {
+  mode?: 'html' | 'text' | 'markdown' | 'image'
+  closeOnFinish?: boolean
+}
+
+export type ScraperLoadResult = {
+  url: string
+  content: string
+  mode: ScraperLoadOptions['mode']
+}
\ No newline at end of file
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 6ab3003..80d7fa2 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -140,6 +140,8 @@ export async function scrapSingleUrl(
       }
       break;
   }
+
+  //* TODO: add an option to return markdown or structured/extracted content
   let cleanedHtml = removeUnwantedElements(text, pageOptions);

   return [await parseMarkdown(cleanedHtml), text];

From 06497729e2f97ba1449bf218eabbe96b3ad8f877 Mon Sep 17 00:00:00 2001
From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com>
Date: Sun, 28 Apr 2024 15:52:09 -0700
Subject: [PATCH 02/15] Caleb: got it to a testable state I believe

---
 .../src/__tests__/e2e_withAuth/index.test.ts  | 44 ++++++++++++++-
 apps/api/src/controllers/scrape.ts            | 11 +++-
 apps/api/src/lib/LLM-extraction/index.ts      | 48 ++++++++++++++++
 apps/api/src/lib/LLM-extraction/models.ts     | 56 +++++++++++--------
 apps/api/src/lib/LLM-extraction/types.ts      |  5 --
 apps/api/src/lib/entities.ts                  |  8 +++
 apps/api/src/scraper/WebScraper/index.ts      | 22 +++++++-
 7 files changed, 163 insertions(+), 31 deletions(-)
 create mode 100644 apps/api/src/lib/LLM-extraction/index.ts

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 2b4c7e9..fcc7062 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -8,7 +8,7 @@ dotenv.config();

 const TEST_URL = "http://127.0.0.1:3002";

-  describe("E2E Tests for API Routes", () => {
+  describe.only("E2E Tests for API Routes", () => {
     beforeAll(() => {
       process.env.USE_DB_AUTHENTICATION = "true";
     });
@@ -252,6 +252,48 @@ const TEST_URL = "http://127.0.0.1:3002";
     }, 60000); // 60 seconds
   });

+  describe("POST /v0/scrape with LLM Extraction", () => {
+    it("should extract data using LLM extraction mode", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai",
+          pageOptions: {
+            onlyMainContent: true
+          },
+          extractorOptions: {
+            extractorMode: "llm-extract",
+            extractor_prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+            extractorSchema: {
+              type: "object",
+              properties: {
+                company_mission: {
+                  type: "string"
+                },
+                supports_sso: {
+                  type: "boolean"
+                },
+                is_open_source: {
+                  type: "boolean"
+                }
+              },
+              required: ["company_mission", "supports_sso", "is_open_source"]
+            }
+          }
+        });
+
+      console.log("Response:", response.body);
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("company_mission");
+      expect(response.body.data).toHaveProperty("supports_sso");
+      expect(response.body.data).toHaveProperty("is_open_source");
+    });
+  });
+
   describe("GET /is-production", () => {
     it("should return the production status", async () => {
       const response = await request(TEST_URL).get("/is-production");
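
One thing to watch in the test above: it sends `extractorMode`, `extractor_prompt`, and `extractorSchema`, while the `ExtractorOptions` type introduced later in this patch reads `mode`, `extractionPrompt`, and `extractionSchema`, so `extractorOptions.mode` is undefined and the llm-extraction branch never runs. PATCH 03 rewrites the test with the matching names; the request shape the controller actually consumes looks like this (URL and prompt are illustrative only):

    // Body shape read by the scrape controller, field names per entities.ts below.
    const body = {
      url: "https://example.com",
      pageOptions: { onlyMainContent: true },
      extractorOptions: {
        mode: "llm-extraction",
        extractionPrompt: "Summarize what this page is about",
        extractionSchema: {
          type: "object",
          properties: { summary: { type: "string" } },
          required: ["summary"],
        },
      },
    };
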
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index eebdcb4..13c4dd2 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -1,3 +1,4 @@
+import { ExtractorOptions } from './../lib/entities';
 import { Request, Response } from "express";
 import { WebScraperDataProvider } from "../scraper/WebScraper";
 import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
@@ -11,7 +12,8 @@ export async function scrapeHelper(
   req: Request,
   team_id: string,
   crawlerOptions: any,
-  pageOptions: any
+  pageOptions: any,
+  extractorOptions: any
 ): Promise<{
   success: boolean;
   error?: string;
@@ -35,6 +37,7 @@ export async function scrapeHelper(
       ...crawlerOptions,
     },
     pageOptions: pageOptions,
+    extractorOptions: extractorOptions
   });

   const docs = await a.getDocuments(false);
@@ -79,6 +82,9 @@ export async function scrapeController(req: Request, res: Response) {
     }
     const crawlerOptions = req.body.crawlerOptions ?? {};
     const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+    const extractorOptions = req.body.extractorOptions ?? {
+      mode: "markdown"
+    }
     const origin = req.body.origin ?? "api";

     try {
@@ -96,7 +102,8 @@ export async function scrapeController(req: Request, res: Response) {
       req,
       team_id,
       crawlerOptions,
-      pageOptions
+      pageOptions,
+      extractorOptions
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts
new file mode 100644
index 0000000..b52c931
--- /dev/null
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -0,0 +1,48 @@
+import Turndown from 'turndown'
+import OpenAI from 'openai'
+// import { LlamaModel } from 'node-llama-cpp'
+import { z } from 'zod'
+import { zodToJsonSchema } from 'zod-to-json-schema'
+import {
+  ScraperCompletionResult,
+  generateOpenAICompletions,
+} from './models.js'
+import { ExtractorOptions } from '../entities.js'
+
+ // Generate completion using OpenAI
+export function generateCompletions(
+  documents: Document[],
+  extractionOptions: ExtractorOptions
+): Promise < ScraperCompletionResult < T >> [] {
+  // const schema = zodToJsonSchema(options.schema)
+
+  const schema = extractionOptions.extractionSchema;
+  const prompt = extractionOptions.extractionPrompt;
+
+  const loader = documents.map(async (document, i) => {
+    switch (this.client.constructor) {
+      case true:
+        return generateOpenAICompletions(
+          this.client as OpenAI,
+
+          schema,
+          options?.prompt,
+          options?.temperature
+        )
+
+      //TODO add other models
+      // case LlamaModel:
+      //   return generateLlamaCompletions(
+      //     this.client,
+      //     await page,
+      //     schema,
+      //     options?.prompt,
+      //     options?.temperature
+      //   )
+      default:
+        throw new Error('Invalid client')
+    }
+  })
+
+  return loader
+}
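
As committed, `generateCompletions` above cannot run: it references `this.client` and `options`, which no free function has, the `switch (this.client.constructor) case true` arm never matches, and the return type names a `T` the function never declares. PATCH 03 below replaces it with a `Promise.all` fan-out over the scraped documents; the working shape is roughly this sketch (not the final code):

    import OpenAI from 'openai'
    import { generateOpenAICompletions } from './models'
    import { Document, ExtractorOptions } from '../entities'

    export async function generateCompletions(
      documents: Document[],
      extractionOptions: ExtractorOptions
    ): Promise<Document[]> {
      const client = new OpenAI()
      // One tool-calling completion per scraped document, in parallel.
      return Promise.all(
        documents.map((document) =>
          generateOpenAICompletions({
            client,
            document,
            schema: extractionOptions.extractionSchema,
            prompt: extractionOptions.extractionPrompt,
          })
        )
      )
    }
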
diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
index 6e57024..7f50f72 100644
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -1,6 +1,8 @@
 import OpenAI from 'openai'
 import { z } from 'zod'
 import { ScraperLoadResult } from './types'
+import { Document, ExtractorOptions } from "../../lib/entities";
+
 // import {
 //   LlamaModel,
 //   LlamaJsonSchemaGrammar,
@@ -8,41 +10,45 @@ import { ScraperLoadResult } from './types'
 //   LlamaChatSession,
 //   GbnfJsonSchema,
 // } from 'node-llama-cpp'
-import { JsonSchema7Type } from 'zod-to-json-schema'
+// import { JsonSchema7Type } from 'zod-to-json-schema'

 export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
-  data: z.infer<T> | null
+  data: any | null
   url: string
 }

 const defaultPrompt =
   'You are a satisfied web scraper. Extract the contents of the webpage'

-function prepareOpenAIPage(
-  page: ScraperLoadResult
+function prepareOpenAIDoc(
+  document: Document
 ): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
-  if (page.mode === 'image') {
-    return [
-      {
-        type: 'image_url',
-        image_url: { url: `data:image/jpeg;base64,${page.content}` },
-      },
-    ]
+
+  // Check if the markdown content exists in the document
+  if (!document.markdown) {
+    throw new Error("Markdown content is missing in the document.");
   }

-  return [{ type: 'text', text: page.content }]
+  return [{ type: 'text', text: document.markdown }]
 }

-export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
+export async function generateOpenAICompletions({
+  client,
+  model = 'gpt-3.5-turbo',
+  document,
+  schema, //TODO - add zod dynamic type checking
+  prompt = defaultPrompt,
+  temperature
+}: {
   client: OpenAI,
-  model: string = 'gpt-3.5-turbo',
-  page: ScraperLoadResult,
-  schema: JsonSchema7Type,
-  prompt: string = defaultPrompt,
+  model?: string,
+  document: Document,
+  schema: any, // This should be replaced with a proper Zod schema type when available
+  prompt?: string,
   temperature?: number
-): Promise<ScraperCompletionResult<T>> {
+}): Promise<Document> {
   const openai = client as OpenAI
-  const content = prepareOpenAIPage(page)
+  const content = prepareOpenAIDoc(document)

   const completion = await openai.chat.completions.create({
     model,
@@ -68,10 +74,16 @@ export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
   })

   const c = completion.choices[0].message.tool_calls[0].function.arguments
+
+  // Extract the LLM extraction content from the completion response
+  const llmExtraction = c;
+
+  // Return the document with the LLM extraction content added
   return {
-    data: JSON.parse(c),
-    url: page.url,
-  }
+    ...document,
+    llm_extraction: llmExtraction
+  };
+
 }

 // export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
diff --git a/apps/api/src/lib/LLM-extraction/types.ts b/apps/api/src/lib/LLM-extraction/types.ts
index 6f3a543..2112189 100644
--- a/apps/api/src/lib/LLM-extraction/types.ts
+++ b/apps/api/src/lib/LLM-extraction/types.ts
@@ -3,8 +3,3 @@ export type ScraperLoadOptions = {
   closeOnFinish?: boolean
 }

-export type ScraperLoadResult = {
-  url: string
-  content: string
-  mode: ScraperLoadOptions['mode']
-}
\ No newline at end of file
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 7b46305..c492c4d 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -16,6 +16,12 @@ export type PageOptions = {
 };

+export type ExtractorOptions = {
+  mode: "markdown" | "llm-extraction";
+  extractionPrompt?: string;
+  extractionSchema?: Record<string, any>;
+}
+
 export type SearchOptions = {
   limit?: number;
   tbs?: string;
@@ -38,6 +44,7 @@ export type WebScraperOptions = {
     replaceAllPathsWithAbsolutePaths?: boolean;
   };
   pageOptions?: PageOptions;
+  extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
 };

@@ -50,6 +57,7 @@ export class Document {
   url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
+  llm_extraction?: string;
   createdAt?: Date;
   updatedAt?: Date;
   type?: string;
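
`extractionSchema` is kept as a raw JSON Schema in a plain `Record<string, any>`, even though PATCH 01 added zod, zod-to-json-schema, and json-schema-to-zod (the `zodToJsonSchema` call in index.ts stays commented out). The round-trip those dependencies point at would look like this; a sketch of the apparent intent, none of it is wired up in this series yet:

    import { z } from 'zod'
    import { zodToJsonSchema } from 'zod-to-json-schema'

    // Declare the extraction contract once, in zod...
    const CompanyInfo = z.object({
      company_mission: z.string(),
      supports_sso: z.boolean(),
      is_open_source: z.boolean(),
    })

    // ...derive the JSON Schema handed to the OpenAI tool definition...
    const extractionSchema = zodToJsonSchema(CompanyInfo)

    // ...and re-validate whatever JSON string the model sends back.
    function parseExtraction(args: string): z.infer<typeof CompanyInfo> {
      return CompanyInfo.parse(JSON.parse(args))
    }
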
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 1904ef9..fd22ef8 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -1,4 +1,4 @@
-import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
+import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities";
 import { Progress } from "../../lib/entities";
 import { scrapSingleUrl } from "./single_url";
 import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
@@ -7,6 +7,8 @@ import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/imageDescription";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
+import OpenAI from 'openai'
+

 export class WebScraperDataProvider {
   private urls: string[] = [""];
@@ -19,6 +21,7 @@ export class WebScraperDataProvider {
   private concurrentRequests: number = 20;
   private generateImgAltText: boolean = false;
   private pageOptions?: PageOptions;
+  private extractorOptions?: ExtractorOptions;
   private replaceAllPathsWithAbsolutePaths?: boolean = false;
   private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";

@@ -191,6 +194,22 @@ export class WebScraperDataProvider {
       documents = await this.getSitemapData(baseUrl, documents);
       documents = documents.concat(pdfDocuments);

+
+
+
+      if(this.extractorOptions.mode === "llm-extraction") {
+
+        // const llm = new OpenAI()
+        // generateCompletions(
+        //   client=llm,
+        //   page =,
+        //   schema=
+
+        // )
+
+
+      }
+
       await this.setCachedDocuments(documents);
       documents = this.removeChildLinks(documents);
       documents = documents.splice(0, this.limit);
@@ -376,6 +395,7 @@ export class WebScraperDataProvider {
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
     this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
+    this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check

From 2ad7a58eb76083198e1326ed9e548a5ded672f01 Mon Sep 17 00:00:00 2001
From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com>
Date: Sun, 28 Apr 2024 17:38:20 -0700
Subject: [PATCH 03/15] Caleb: first test passing

---
 .../src/__tests__/e2e_withAuth/index.test.ts  | 599 +++++++++---------
 apps/api/src/lib/LLM-extraction/index.ts      |  51 +-
 apps/api/src/scraper/WebScraper/index.ts      |  19 +-
 3 files changed, 338 insertions(+), 331 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index fcc7062..ad4910a 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -8,297 +8,316 @@ dotenv.config();

 const TEST_URL = "http://127.0.0.1:3002";

-  describe.only("E2E Tests for API Routes", () => {
-    beforeAll(() => {
-      process.env.USE_DB_AUTHENTICATION = "true";
-    });
+describe("E2E Tests for API Routes", () => {
+  beforeAll(() => {
+    process.env.USE_DB_AUTHENTICATION = "true";
+  });

-    afterAll(() => {
-      delete process.env.USE_DB_AUTHENTICATION;
-    });
-    describe("GET /", () => {
-      it("should return Hello, world! message", async () => {
-        const response = await request(TEST_URL).get("/");
+  afterAll(() => {
+    delete process.env.USE_DB_AUTHENTICATION;
+  });
+  describe("GET /", () => {
+    it("should return Hello, world! message", async () => {
+      const response = await request(TEST_URL).get("/");

-        expect(response.statusCode).toBe(200);
-        expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io");
-      });
-    });
-
-    describe("GET /test", () => {
-      it("should return Hello, world! 
message", async () => { - const response = await request(TEST_URL).get("/test"); - expect(response.statusCode).toBe(200); - expect(response.text).toContain("Hello, world!"); - }); - }); - - describe("POST /v0/scrape", () => { - it("should require authorization", async () => { - const response = await request(app).post("/v0/scrape"); - expect(response.statusCode).toBe(401); - }); - - it("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); - - it("should return an error for a blocklisted URL", async () => { - const blocklistedUrl = "https://facebook.com/fake-test"; - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: blocklistedUrl }); - expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); - }); - - it("should return a successful response with a valid preview token", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer this_is_just_a_preview_token`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(200); - }, 10000); // 10 seconds timeout - - it("should return a successful response with a valid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("content"); - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data.content).toContain("🔥 FireCrawl"); - }, 30000); // 30 seconds timeout - }); - - describe("POST /v0/crawl", () => { - it("should require authorization", async () => { - const response = await request(TEST_URL).post("/v0/crawl"); - expect(response.statusCode).toBe(401); - }); - - it("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); - - it("should return an error for a blocklisted URL", async () => { - const blocklistedUrl = "https://twitter.com/fake-test"; - const response = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: blocklistedUrl }); - expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. 
We're actively working on building support for it."); - }); - - it("should return a successful response with a valid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("jobId"); - expect(response.body.jobId).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ - ); - }); - - - // Additional tests for insufficient credits? - }); - - describe("POST /v0/crawlWebsitePreview", () => { - it("should require authorization", async () => { - const response = await request(TEST_URL).post( - "/v0/crawlWebsitePreview" - ); - expect(response.statusCode).toBe(401); - }); - - it("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/crawlWebsitePreview") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); - - it("should return an error for a blocklisted URL", async () => { - const blocklistedUrl = "https://instagram.com/fake-test"; - const response = await request(TEST_URL) - .post("/v0/crawlWebsitePreview") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: blocklistedUrl }); - expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); - }); - - it("should return a successful response with a valid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/crawlWebsitePreview") - .set("Authorization", `Bearer this_is_just_a_preview_token`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("jobId"); - expect(response.body.jobId).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ - ); - }); - }); - - describe("POST /v0/search", () => { - it("should require authorization", async () => { - const response = await request(TEST_URL).post("/v0/search"); - expect(response.statusCode).toBe(401); - }); - - it("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/search") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ query: "test" }); - expect(response.statusCode).toBe(401); - }); - - - - it("should return a successful response with a valid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/search") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ query: "test" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success"); - expect(response.body.success).toBe(true); - expect(response.body).toHaveProperty("data"); - }, 30000); // 30 seconds timeout - }); - - describe("GET /v0/crawl/status/:jobId", () => { - it("should require authorization", async () => { - const response = await request(TEST_URL).get("/v0/crawl/status/123"); - 
expect(response.statusCode).toBe(401); - }); - - it("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .get("/v0/crawl/status/123") - .set("Authorization", `Bearer invalid-api-key`); - expect(response.statusCode).toBe(401); - }); - - it("should return Job not found for invalid job ID", async () => { - const response = await request(TEST_URL) - .get("/v0/crawl/status/invalidJobId") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(404); - }); - - it("should return a successful response for a valid crawl job", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(crawlResponse.statusCode).toBe(200); - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); - - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 30000)); - - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain( - "🔥 FireCrawl" - ); - }, 60000); // 60 seconds - }); - - describe("POST /v0/scrape with LLM Extraction", () => { - it("should extract data using LLM extraction mode", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - pageOptions: { - onlyMainContent: true - }, - extractorOptions: { - extractorMode: "llm-extract", - extractor_prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", - extractorSchema: { - type: "object", - properties: { - company_mission: { - type: "string" - }, - supports_sso: { - type: "boolean" - }, - is_open_source: { - type: "boolean" - } - }, - required: ["company_mission", "supports_sso", "is_open_source"] - } - } - }); - - console.log("Response:", response.body); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("company_mission"); - expect(response.body.data).toHaveProperty("supports_sso"); - expect(response.body.data).toHaveProperty("is_open_source"); - }); - }); - - describe("GET /is-production", () => { - it("should return the production status", async () => { - const response = await request(TEST_URL).get("/is-production"); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("isProduction"); - }); + expect(response.statusCode).toBe(200); + expect(response.text).toContain("SCRAPERS-JS: 
Hello, world! Fly.io"); }); }); + + describe("GET /test", () => { + it("should return Hello, world! message", async () => { + const response = await request(TEST_URL).get("/test"); + expect(response.statusCode).toBe(200); + expect(response.text).toContain("Hello, world!"); + }); + }); + + describe("POST /v0/scrape", () => { + it("should require authorization", async () => { + const response = await request(app).post("/v0/scrape"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://facebook.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + + it("should return a successful response with a valid preview token", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer this_is_just_a_preview_token`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + }, 10000); // 10 seconds timeout + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain("🔥 FireCrawl"); + }, 30000); // 30 seconds timeout + }); + + describe("POST /v0/crawl", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).post("/v0/crawl"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://twitter.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. 
We're actively working on building support for it."); + }); + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + }); + + + // Additional tests for insufficient credits? + }); + + describe("POST /v0/crawlWebsitePreview", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).post( + "/v0/crawlWebsitePreview" + ); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://instagram.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer this_is_just_a_preview_token`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + }); + }); + + describe("POST /v0/search", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).post("/v0/search"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(401); + }); + + + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success"); + expect(response.body.success).toBe(true); + expect(response.body).toHaveProperty("data"); + }, 30000); // 30 seconds timeout + }); + + describe("GET /v0/crawl/status/:jobId", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).get("/v0/crawl/status/123"); + 
expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .get("/v0/crawl/status/123") + .set("Authorization", `Bearer invalid-api-key`); + expect(response.statusCode).toBe(401); + }); + + it("should return Job not found for invalid job ID", async () => { + const response = await request(TEST_URL) + .get("/v0/crawl/status/invalidJobId") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(404); + }); + + it("should return a successful response for a valid crawl job", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 30000)); + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain( + "🔥 FireCrawl" + ); + }, 60000); // 60 seconds + }); + + describe.only("POST /v0/scrape with LLM Extraction", () => { + it("should extract data using LLM extraction mode", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + pageOptions: { + onlyMainContent: true + }, + extractorOptions: { + mode: "llm-extraction", + extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", + extractionSchema: { + type: "object", + properties: { + company_mission: { + type: "string" + }, + supports_sso: { + type: "boolean" + }, + is_open_source: { + type: "boolean" + } + }, + required: ["company_mission", "supports_sso", "is_open_source"] + } + } + }); + + + // Assuming the LLM extraction object is available in the response body under `data.llm_extraction` + let llmExtraction = response.body.data.llm_extraction; + + + // Check if llm_extraction is a string and parse it if necessary + if (typeof llmExtraction === 'string') { + llmExtraction = JSON.parse(llmExtraction); + } + + + console.log('llm extraction', llmExtraction); + + // Print the keys of the response.body for debugging purposes + + + + // Check if the llm_extraction object has the required properties with correct types and values + expect(llmExtraction).toHaveProperty("company_mission"); + expect(typeof llmExtraction.company_mission).toBe("string"); + 
expect(llmExtraction).toHaveProperty("supports_sso");
+      expect(llmExtraction.supports_sso).toBe(true);
+      expect(typeof llmExtraction.supports_sso).toBe("boolean");
+      expect(llmExtraction).toHaveProperty("is_open_source");
+      expect(llmExtraction.is_open_source).toBe(false);
+      expect(typeof llmExtraction.is_open_source).toBe("boolean");
+    }, 60000); // 60 secs
+  });
+
+  describe("GET /is-production", () => {
+    it("should return the production status", async () => {
+      const response = await request(TEST_URL).get("/is-production");
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("isProduction");
+    });
+  });
+});
diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts
index b52c931..d221498 100644
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -3,46 +3,39 @@ import OpenAI from 'openai'
 // import { LlamaModel } from 'node-llama-cpp'
 import { z } from 'zod'
 import { zodToJsonSchema } from 'zod-to-json-schema'
+
 import {
   ScraperCompletionResult,
   generateOpenAICompletions,
-} from './models.js'
+} from './models'
-import { ExtractorOptions } from '../entities.js'
+import { Document, ExtractorOptions } from '../entities'

  // Generate completion using OpenAI
-export function generateCompletions(
+export async function generateCompletions(
   documents: Document[],
   extractionOptions: ExtractorOptions
-): Promise < ScraperCompletionResult < T >> [] {
+): Promise<Document[]> {
   // const schema = zodToJsonSchema(options.schema)

   const schema = extractionOptions.extractionSchema;
   const prompt = extractionOptions.extractionPrompt;

-  const loader = documents.map(async (document, i) => {
-    switch (this.client.constructor) {
-      case true:
-        return generateOpenAICompletions(
-          this.client as OpenAI,
-
-          schema,
-          options?.prompt,
-          options?.temperature
-        )
-
-      //TODO add other models
-      // case LlamaModel:
-      //   return generateLlamaCompletions(
-      //     this.client,
-      //     await page,
-      //     schema,
-      //     options?.prompt,
-      //     options?.temperature
-      //   )
-      default:
-        throw new Error('Invalid client')
-    }
-  })
+  const switchVariable = "openAI" // Placeholder, want to think more about how we abstract the model provider

-  return loader
+  const completions = await Promise.all(documents.map(async (document: Document) => {
+    switch (switchVariable) {
+      case "openAI":
+        const llm = new OpenAI();
+        return await generateOpenAICompletions({
+          client: llm,
+          document: document,
+          schema: schema,
+          prompt: prompt
+        });
+      default:
+        throw new Error('Invalid client');
+    }
+  }));
+
+  return completions;
 }
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index fd22ef8..b278e38 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -8,6 +8,7 @@ import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/imageDescription";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
 import OpenAI from 'openai'
+import { generateCompletions } from "../../lib/LLM-extraction";


 export class WebScraperDataProvider {
@@ -194,20 +195,14 @@ export class WebScraperDataProvider {
       documents = await this.getSitemapData(baseUrl, documents);
       documents = documents.concat(pdfDocuments);

-
-
-
+      console.log("extraction mode ", this.extractorOptions.mode)
       if(this.extractorOptions.mode === "llm-extraction") {

-        // const llm = new OpenAI()
-        // generateCompletions(
-        //   
client=llm, - // page =, - // schema= - - // ) - - + const llm = new OpenAI() + documents = await generateCompletions( + documents, + this.extractorOptions + ) } await this.setCachedDocuments(documents); From 667f740315a076338cf4b30cd424cf6fb6797f3c Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Sun, 28 Apr 2024 19:28:28 -0700 Subject: [PATCH 04/15] Caleb: converted llm response to json --- apps/api/src/__tests__/e2e_noAuth/index.test.ts | 3 ++- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 11 ++++------- apps/api/src/lib/LLM-extraction/models.ts | 4 ++-- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index 271e848..356fe76 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -199,7 +199,8 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl"); + + }, 60000); // 60 seconds }); diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index ad4910a..9e3fed1 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -289,13 +289,10 @@ describe("E2E Tests for API Routes", () => { let llmExtraction = response.body.data.llm_extraction; - // Check if llm_extraction is a string and parse it if necessary - if (typeof llmExtraction === 'string') { - llmExtraction = JSON.parse(llmExtraction); - } - - - console.log('llm extraction', llmExtraction); + // // Check if llm_extraction is a string and parse it if necessary + // if (typeof llmExtraction === 'string') { + // llmExtraction = JSON.parse(llmExtraction); + // } // Print the keys of the response.body for debugging purposes diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts index 7f50f72..71daa27 100644 --- a/apps/api/src/lib/LLM-extraction/models.ts +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -1,6 +1,5 @@ import OpenAI from 'openai' import { z } from 'zod' -import { ScraperLoadResult } from './types' import { Document, ExtractorOptions } from "../../lib/entities"; // import { @@ -76,7 +75,8 @@ export async function generateOpenAICompletions({ const c = completion.choices[0].message.tool_calls[0].function.arguments // Extract the LLM extraction content from the completion response - const llmExtraction = c; + const llmExtraction = JSON.parse(c); + // Return the document with the LLM extraction content added return { From 4f7737c922d05e7a34c83468796ec5959c9528c5 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Mon, 29 Apr 2024 12:12:55 -0700 Subject: [PATCH 05/15] Caleb: added ajv json schema validation. 
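
Tool calling constrains the shape the model tries to produce, but nothing
guarantees the returned JSON actually conforms to the user's schema. Compiling
that schema with AJV and checking the parsed arguments catches hallucinated,
missing, or mistyped fields before the result is returned and billed. The
check, reduced to a standalone sketch (assuming ajv v8):

    import Ajv from 'ajv'

    const ajv = new Ajv()

    function assertMatchesSchema(schema: Record<string, any>, extraction: unknown) {
      const validate = ajv.compile(schema)
      if (!validate(extraction)) {
        const details = (validate.errors ?? []).map((err) => err.message).join(', ')
        throw new Error(`LLM extraction did not match the provided schema: ${details}`)
      }
    }
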
--- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 31 +++++++++++++++++++ .../src/__tests__/e2e_withAuth/index.test.ts | 5 +++ apps/api/src/controllers/scrape.ts | 4 ++- apps/api/src/lib/LLM-extraction/index.ts | 13 +++++++- apps/api/src/lib/LLM-extraction/models.ts | 2 +- apps/api/src/lib/entities.ts | 2 +- 7 files changed, 54 insertions(+), 4 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index 0f826da..00ce1bb 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -51,6 +51,7 @@ "@nangohq/node": "^0.36.33", "@sentry/node": "^7.48.0", "@supabase/supabase-js": "^2.7.1", + "ajv": "^8.12.0", "async": "^3.2.5", "async-mutex": "^0.4.0", "axios": "^1.3.4", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index d72dad0..8062354 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -35,6 +35,9 @@ dependencies: '@supabase/supabase-js': specifier: ^2.7.1 version: 2.39.7 + ajv: + specifier: ^8.12.0 + version: 8.12.0 async: specifier: ^3.2.5 version: 3.2.5 @@ -1820,6 +1823,15 @@ packages: humanize-ms: 1.2.1 dev: false + /ajv@8.12.0: + resolution: {integrity: sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==} + dependencies: + fast-deep-equal: 3.1.3 + json-schema-traverse: 1.0.0 + require-from-string: 2.0.2 + uri-js: 4.4.1 + dev: false + /ansi-escapes@4.3.2: resolution: {integrity: sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==} engines: {node: '>=8'} @@ -2926,6 +2938,10 @@ packages: - supports-color dev: false + /fast-deep-equal@3.1.3: + resolution: {integrity: sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==} + dev: false + /fast-fifo@1.3.2: resolution: {integrity: sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==} dev: false @@ -3999,6 +4015,10 @@ packages: hasBin: true dev: false + /json-schema-traverse@1.0.0: + resolution: {integrity: sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==} + dev: false + /json5@2.2.3: resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==} engines: {node: '>=6'} @@ -5264,6 +5284,11 @@ packages: resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==} engines: {node: '>=0.10.0'} + /require-from-string@2.0.2: + resolution: {integrity: sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==} + engines: {node: '>=0.10.0'} + dev: false + /resolve-cwd@3.0.0: resolution: {integrity: sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==} engines: {node: '>=8'} @@ -5970,6 +5995,12 @@ packages: picocolors: 1.0.0 dev: true + /uri-js@4.4.1: + resolution: {integrity: sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==} + dependencies: + punycode: 2.3.1 + dev: false + /urlpattern-polyfill@10.0.0: resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==} dev: false diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 9e3fed1..4a47638 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -285,6 +285,11 @@ describe("E2E Tests for API 
Routes", () => {
       });

+      // Ensure that the job was successfully created before proceeding with LLM extraction
+      expect(response.statusCode).toBe(200);
+
+
+
       // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
       let llmExtraction = response.body.data.llm_extraction;

diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index 13c4dd2..43b8ca4 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -7,13 +7,14 @@ import { RateLimiterMode } from "../types";
 import { logJob } from "../services/logging/log_job";
 import { Document } from "../lib/entities";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
+import Ajv from 'ajv';

 export async function scrapeHelper(
   req: Request,
   team_id: string,
   crawlerOptions: any,
   pageOptions: any,
-  extractorOptions: any
+  extractorOptions: ExtractorOptions
 ): Promise<{
   success: boolean;
   error?: string;
@@ -29,6 +30,7 @@ export async function scrapeHelper(
     return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
   }

+
   const a = new WebScraperDataProvider();
   await a.setOptions({
     mode: "single_urls",
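
A note on the validation below: `ajv.compile` re-parses the schema for every document, and Ajv does not deduplicate schemas passed by value. For a crawl where every page shares one schema, the compiled validator could be cached; a possible refinement, not what this patch does:

    import Ajv, { ValidateFunction } from 'ajv'

    const ajv = new Ajv()
    const validators = new Map<string, ValidateFunction>()

    // Reuse one compiled validator per distinct schema.
    function compiledFor(schema: Record<string, any>): ValidateFunction {
      const key = JSON.stringify(schema)
      let validate = validators.get(key)
      if (!validate) {
        validate = ajv.compile(schema)
        validators.set(key, validate)
      }
      return validate
    }
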
diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts
index d221498..237fdbe 100644
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -3,6 +3,8 @@ import OpenAI from 'openai'
 // import { LlamaModel } from 'node-llama-cpp'
 import { z } from 'zod'
 import { zodToJsonSchema } from 'zod-to-json-schema'
+import Ajv from 'ajv';
+const ajv = new Ajv(); // Initialize AJV for JSON schema validation

 import {
   ScraperCompletionResult,
   generateOpenAICompletions,
 } from './models'
 import { Document, ExtractorOptions } from '../entities'

  // Generate completion using OpenAI
 export async function generateCompletions(
   documents: Document[],
   extractionOptions: ExtractorOptions
 ): Promise<Document[]> {
   // const schema = zodToJsonSchema(options.schema)

   const schema = extractionOptions.extractionSchema;
   const prompt = extractionOptions.extractionPrompt;

   const switchVariable = "openAI" // Placeholder, want to think more about how we abstract the model provider

+
   const completions = await Promise.all(documents.map(async (document: Document) => {
     switch (switchVariable) {
       case "openAI":
         const llm = new OpenAI();
-        return await generateOpenAICompletions({
+        const completionResult = await generateOpenAICompletions({
           client: llm,
           document: document,
           schema: schema,
           prompt: prompt
         });
+        // Validate the JSON output against the schema using AJV
+        const validate = ajv.compile(schema);
+        if (!validate(completionResult.llm_extraction)) {
+          throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
+        }
+
+        return completionResult;
       default:
         throw new Error('Invalid client');
     }
   }));

+
   return completions;
 }
diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
index 71daa27..9114511 100644
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -31,7 +31,7 @@ function prepareOpenAIDoc(
   return [{ type: 'text', text: document.markdown }]
 }

-export async function generateOpenAICompletions({
+export async function generateOpenAICompletions({
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index c492c4d..4ceab63 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -57,7 +57,7 @@ export class Document {
   url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
-  llm_extraction?: string;
+  llm_extraction?: Record<string, any>;
   createdAt?: Date;
   updatedAt?: Date;
   type?: string;

From 3ca9e5153f99da20b13478811ec0587eb003d434 Mon Sep 17 00:00:00 2001
From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com>
Date: Tue, 30 Apr 2024 09:20:15 -0700
Subject: [PATCH 06/15] Caleb: trying to get logging working

---
 apps/api/package.json                         |  2 +-
 apps/api/pnpm-lock.yaml                       |  2 +-
 .../src/__tests__/e2e_withAuth/index.test.ts  | 74 ++++++++++++++++---
 apps/api/src/controllers/scrape.ts            | 18 ++++-
 apps/api/src/lib/LLM-extraction/helpers.ts    | 18 +++++
 apps/api/src/lib/LLM-extraction/index.ts      |  1 +
 apps/api/src/lib/LLM-extraction/models.ts     | 10 ++-
 apps/api/src/lib/entities.ts                  |  1 +
 apps/api/src/scraper/WebScraper/index.ts      |  4 +-
 apps/api/src/scraper/WebScraper/single_url.ts |  2 +-
 apps/api/src/services/logging/log_job.ts      |  4 +
 apps/api/src/types.ts                         |  4 +
 12 files changed, 118 insertions(+), 22 deletions(-)
 create mode 100644 apps/api/src/lib/LLM-extraction/helpers.ts

diff --git a/apps/api/package.json b/apps/api/package.json
index 00ce1bb..047feaf 100644
--- a/apps/api/package.json
+++ b/apps/api/package.json
@@ -46,7 +46,7 @@
     "@bull-board/api": "^5.14.2",
     "@bull-board/express": "^5.8.0",
     "@devil7softwares/pos": "^1.0.2",
-    "@dqbd/tiktoken": "^1.0.7",
+    "@dqbd/tiktoken": "^1.0.13",
     "@logtail/node": "^0.4.12",
     "@nangohq/node": "^0.36.33",
     "@sentry/node": "^7.48.0",
diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml
index 8062354..bd5e37b 100644
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
@@ -21,7 +21,7 @@ dependencies:
     specifier: ^1.0.2
     version: 1.0.2
   '@dqbd/tiktoken':
-    specifier: ^1.0.7
+    specifier: ^1.0.13
     version: 1.0.13
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 4a47638..fb9d8af 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -252,7 +252,7 @@ describe("E2E Tests for API Routes", () => {
     }, 60000); // 60 seconds
   });

-  describe.only("POST /v0/scrape with LLM Extraction", () => {
+  describe("POST /v0/scrape with LLM Extraction", () => {
     it("should extract data using LLM extraction mode", async () => {
       const response = await request(TEST_URL)
         .post("/v0/scrape")
@@ -293,16 +293,6 @@ describe("E2E Tests for API Routes", () => {
       // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
       let llmExtraction = response.body.data.llm_extraction;

-
-      // // Check if llm_extraction is a string and parse it if 
necessary - // if (typeof llmExtraction === 'string') { - // llmExtraction = JSON.parse(llmExtraction); - // } - - // Print the keys of the response.body for debugging purposes - - - // Check if the llm_extraction object has the required properties with correct types and values expect(llmExtraction).toHaveProperty("company_mission"); expect(typeof llmExtraction.company_mission).toBe("string"); @@ -315,6 +305,68 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 secs }); + describe.only("POST /v0/scrape for Top 100 Companies", () => { + it("should extract data for the top 100 companies", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://companiesmarketcap.com/", + pageOptions: { + onlyMainContent: true + }, + extractorOptions: { + mode: "llm-extraction", + extractionPrompt: "Extract the name, market cap, price, and today's change for the top 20 companies listed on the page.", + extractionSchema: { + type: "object", + properties: { + companies: { + type: "array", + items: { + type: "object", + properties: { + rank: { type: "number" }, + name: { type: "string" }, + marketCap: { type: "string" }, + price: { type: "string" }, + todayChange: { type: "string" } + }, + required: ["rank", "name", "marketCap", "price", "todayChange"] + } + } + }, + required: ["companies"] + } + } + }); + + + // Print the response body to the console for debugging purposes + console.log("Response companies:", response.body.data.llm_extraction.companies); + + // Check if the response has the correct structure and data types + expect(response.status).toBe(200); + expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true); + expect(response.body.data.llm_extraction.companies.length).toBe(40); + + // Sample check for the first company + const firstCompany = response.body.data.llm_extraction.companies[0]; + expect(firstCompany).toHaveProperty("name"); + expect(typeof firstCompany.name).toBe("string"); + expect(firstCompany).toHaveProperty("marketCap"); + expect(typeof firstCompany.marketCap).toBe("string"); + expect(firstCompany).toHaveProperty("price"); + expect(typeof firstCompany.price).toBe("string"); + expect(firstCompany).toHaveProperty("todayChange"); + expect(typeof firstCompany.todayChange).toBe("string"); + }, 120000); // 120 secs + }); + + + + describe("GET /is-production", () => { it("should return the production status", async () => { const response = await request(TEST_URL).get("/is-production"); diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 43b8ca4..d2340e8 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -8,6 +8,7 @@ import { logJob } from "../services/logging/log_job"; import { Document } from "../lib/entities"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import Ajv from 'ajv'; +import { numTokensFromString } from '../lib/LLM-extraction/helpers'; export async function scrapeHelper( req: Request, @@ -51,9 +52,18 @@ export async function scrapeHelper( return { success: true, error: "No page found", returnCode: 200 }; } + + let creditsToBeBilled = filteredDocs.length; + const creditsPerLLMExtract = 4; + + if (extractorOptions.mode === "llm-extraction"){ + creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length) + } + // console.log("credits to 
be billed, ", creditsToBeBilled);
+
   const billingResult = await billTeam(
     team_id,
-    filteredDocs.length
+    creditsToBeBilled
   );
   if (!billingResult.success) {
     return {
@@ -109,6 +119,8 @@ export async function scrapeController(req: Request, res: Response) {
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
+    const numTokens = numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
+
     logJob({
       success: result.success,
       message: result.error,
@@ -120,7 +132,9 @@ export async function scrapeController(req: Request, res: Response) {
       url: req.body.url,
       crawlerOptions: crawlerOptions,
       pageOptions: pageOptions,
-      origin: origin,
+      origin: origin, 
+      extractor_options: extractorOptions,
+      num_tokens: numTokens
     });
     return res.status(result.returnCode).json(result);
   } catch (error) {
diff --git a/apps/api/src/lib/LLM-extraction/helpers.ts b/apps/api/src/lib/LLM-extraction/helpers.ts
new file mode 100644
index 0000000..2535964
--- /dev/null
+++ b/apps/api/src/lib/LLM-extraction/helpers.ts
@@ -0,0 +1,18 @@
+
+
+import { encoding_for_model } from "@dqbd/tiktoken";
+import { TiktokenModel } from "@dqbd/tiktoken";
+
+// This function calculates the number of tokens in a text string using GPT-3.5-turbo model
+export function numTokensFromString(message: string, model: string): number {
+  const encoder = encoding_for_model(model as TiktokenModel);
+
+  // Encode the message into tokens
+  const tokens = encoder.encode(message);
+
+  // Free the encoder resources after use
+  encoder.free();
+
+  // Return the number of tokens
+  return tokens.length;
+}
\ No newline at end of file
diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts
index 237fdbe..9fae79d 100644
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -38,6 +38,7 @@ export async function generateCompletions(
       // Validate the JSON output against the schema using AJV
       const validate = ajv.compile(schema);
       if (!validate(completionResult.llm_extraction)) {
+        //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
         throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
       }
 
diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
index 9114511..177fe64 100644
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -1,6 +1,7 @@
 import OpenAI from 'openai'
 import { z } from 'zod'
 import { Document, ExtractorOptions } from "../../lib/entities";
+import { numTokensFromString } from './helpers';
 
 // import {
 //   LlamaModel,
@@ -17,7 +18,7 @@ export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
 }
 
 const defaultPrompt =
-  'You are a satistified web scraper. Extract the contents of the webpage'
+  'You are a professional web scraper. 
Extract the contents of the webpage'
 
 function prepareOpenAIDoc(
   document: Document
@@ -28,12 +29,12 @@ function prepareOpenAIDoc(
     throw new Error("Markdown content is missing in the document.");
   }
 
-  return [{ type: 'text', text: document.markdown }]
+  return [{ type: 'text', text: document.html}]
 }
 
 export async function generateOpenAICompletions({
   client,
-  model = 'gpt-3.5-turbo',
+  model = 'gpt-4-turbo',
   document,
   schema, //TODO - add zod dynamic type checking
   prompt = defaultPrompt,
   temperature
@@ -49,6 +50,7 @@ export async function generateOpenAICompletions({
   const openai = client as OpenAI
   const content = prepareOpenAIDoc(document)
 
+
   const completion = await openai.chat.completions.create({
     model,
     messages: [
@@ -77,6 +79,8 @@ export async function generateOpenAICompletions({
   // Extract the LLM extraction content from the completion response
   const llmExtraction = JSON.parse(c);
 
+// console.log("llm extraction: ", llmExtraction);
+
 
   // Return the document with the LLM extraction content added
   return {
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 4ceab63..4008785 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -57,6 +57,7 @@ export class Document {
   url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
+  html?: string;
   llm_extraction?: Record<string, any>;
   createdAt?: Date;
   updatedAt?: Date;
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index b278e38..0bd1a82 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -40,8 +40,7 @@ export class WebScraperDataProvider {
   ): Promise<Document[]> {
     const totalUrls = urls.length;
     let processedUrls = 0;
-    console.log("Converting urls to documents");
-    console.log("Total urls", urls);
+
     const results: (Document | null)[] = new Array(urls.length).fill(null);
     for (let i = 0; i < urls.length; i += this.concurrentRequests) {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
@@ -195,7 +194,6 @@ export class WebScraperDataProvider {
     documents = await this.getSitemapData(baseUrl, documents);
 
     documents = documents.concat(pdfDocuments);
-    console.log("extraction mode ", this.extractorOptions.mode)
 
     if(this.extractorOptions.mode === "llm-extraction") {
 
       const llm = new OpenAI()
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index af215ce..12ff9c5 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -106,7 +106,6 @@ export async function scrapSingleUrl(
   toMarkdown: boolean = true,
   pageOptions: PageOptions = { onlyMainContent: true }
 ): Promise<Document> {
-  console.log(`Scraping URL: ${urlToScrap}`);
   urlToScrap = urlToScrap.trim();
 
   const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
@@ -217,6 +216,7 @@ export async function scrapSingleUrl(
   return {
     content: text,
     markdown: text,
+    html: html,
     metadata: { ...metadata, sourceURL: urlToScrap },
   } as Document;
 } catch (error) {
diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts
index 639b3a8..965ac29 100644
--- a/apps/api/src/services/logging/log_job.ts
+++ b/apps/api/src/services/logging/log_job.ts
@@ -8,6 +8,8 @@ export async function logJob(job: FirecrawlJob) {
   if (process.env.ENV !== "production") {
     return;
   }
+
+  // console.log("logg")
   const { data, error } = await supabase_service
     .from("firecrawl_jobs")
     .insert([
@@ -23,6 +25,8 @@ export async function logJob(job: 
FirecrawlJob) { crawler_options: job.crawlerOptions, page_options: job.pageOptions, origin: job.origin, + extractor_options: job.extractor_options, + num_tokens: job.num_tokens }, ]); if (error) { diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index c65140c..c1858f1 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -1,3 +1,5 @@ +import { ExtractorOptions } from "./lib/entities"; + export interface CrawlResult { source: string; content: string; @@ -37,6 +39,8 @@ export interface FirecrawlJob { crawlerOptions?: any; pageOptions?: any; origin: string; + extractor_options?: ExtractorOptions, + num_tokens?: number } export enum RateLimiterMode { From a32f2b37b637532d792c27b57c708c7fd1591766 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Tue, 30 Apr 2024 10:21:41 -0700 Subject: [PATCH 07/15] Caleb: logs work --- apps/api/src/services/logging/log_job.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 965ac29..92a1dc1 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -1,3 +1,4 @@ +import { ExtractorOptions } from './../../lib/entities'; import { supabase_service } from "../supabase"; import { FirecrawlJob } from "../../types"; import "dotenv/config"; @@ -9,7 +10,7 @@ export async function logJob(job: FirecrawlJob) { return; } - // console.log("logg") + const { data, error } = await supabase_service .from("firecrawl_jobs") .insert([ From ad9c8e77d10be20302344cb57cbdfc98b4e8f1cb Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Tue, 30 Apr 2024 10:22:09 -0700 Subject: [PATCH 08/15] Caleb: commented out massive test --- .../src/__tests__/e2e_withAuth/index.test.ts | 108 +++++++++--------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index fb9d8af..c6c59bc 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -305,64 +305,64 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 secs }); - describe.only("POST /v0/scrape for Top 100 Companies", () => { - it("should extract data for the top 100 companies", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://companiesmarketcap.com/", - pageOptions: { - onlyMainContent: true - }, - extractorOptions: { - mode: "llm-extraction", - extractionPrompt: "Extract the name, market cap, price, and today's change for the top 20 companies listed on the page.", - extractionSchema: { - type: "object", - properties: { - companies: { - type: "array", - items: { - type: "object", - properties: { - rank: { type: "number" }, - name: { type: "string" }, - marketCap: { type: "string" }, - price: { type: "string" }, - todayChange: { type: "string" } - }, - required: ["rank", "name", "marketCap", "price", "todayChange"] - } - } - }, - required: ["companies"] - } - } - }); + // describe("POST /v0/scrape for Top 100 Companies", () => { + // it("should extract data for the top 100 companies", async () => { + // const response = await request(TEST_URL) + // .post("/v0/scrape") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // 
.set("Content-Type", "application/json") + // .send({ + // url: "https://companiesmarketcap.com/", + // pageOptions: { + // onlyMainContent: true + // }, + // extractorOptions: { + // mode: "llm-extraction", + // extractionPrompt: "Extract the name, market cap, price, and today's change for the top 20 companies listed on the page.", + // extractionSchema: { + // type: "object", + // properties: { + // companies: { + // type: "array", + // items: { + // type: "object", + // properties: { + // rank: { type: "number" }, + // name: { type: "string" }, + // marketCap: { type: "string" }, + // price: { type: "string" }, + // todayChange: { type: "string" } + // }, + // required: ["rank", "name", "marketCap", "price", "todayChange"] + // } + // } + // }, + // required: ["companies"] + // } + // } + // }); - // Print the response body to the console for debugging purposes - console.log("Response companies:", response.body.data.llm_extraction.companies); + // // Print the response body to the console for debugging purposes + // console.log("Response companies:", response.body.data.llm_extraction.companies); - // Check if the response has the correct structure and data types - expect(response.status).toBe(200); - expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true); - expect(response.body.data.llm_extraction.companies.length).toBe(40); + // // Check if the response has the correct structure and data types + // expect(response.status).toBe(200); + // expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true); + // expect(response.body.data.llm_extraction.companies.length).toBe(40); - // Sample check for the first company - const firstCompany = response.body.data.llm_extraction.companies[0]; - expect(firstCompany).toHaveProperty("name"); - expect(typeof firstCompany.name).toBe("string"); - expect(firstCompany).toHaveProperty("marketCap"); - expect(typeof firstCompany.marketCap).toBe("string"); - expect(firstCompany).toHaveProperty("price"); - expect(typeof firstCompany.price).toBe("string"); - expect(firstCompany).toHaveProperty("todayChange"); - expect(typeof firstCompany.todayChange).toBe("string"); - }, 120000); // 120 secs - }); + // // Sample check for the first company + // const firstCompany = response.body.data.llm_extraction.companies[0]; + // expect(firstCompany).toHaveProperty("name"); + // expect(typeof firstCompany.name).toBe("string"); + // expect(firstCompany).toHaveProperty("marketCap"); + // expect(typeof firstCompany.marketCap).toBe("string"); + // expect(firstCompany).toHaveProperty("price"); + // expect(typeof firstCompany.price).toBe("string"); + // expect(firstCompany).toHaveProperty("todayChange"); + // expect(typeof firstCompany.todayChange).toBe("string"); + // }, 120000); // 120 secs + // }); From d1235a0029b685f8e7a0f6a3e9218516b00fca97 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Tue, 30 Apr 2024 10:23:12 -0700 Subject: [PATCH 09/15] Caleb: switched back to markdown for extraction --- apps/api/src/lib/LLM-extraction/models.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts index 177fe64..df1b6d1 100644 --- a/apps/api/src/lib/LLM-extraction/models.ts +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -29,7 +29,7 @@ function prepareOpenAIDoc( throw new Error("Markdown content is missing in the document."); } - return [{ type: 'text', text: document.html}] + return [{ type: 
'text', text: document.markdown}] } export async function generateOpenAICompletions({ From d9d206aff61db16582bc7489d1857a7a80c52d8c Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Tue, 30 Apr 2024 10:27:39 -0700 Subject: [PATCH 10/15] Caleb: --- apps/api/src/controllers/scrape.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index d2340e8..c42f451 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -59,7 +59,6 @@ export async function scrapeHelper( if (extractorOptions.mode === "llm-extraction"){ creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length) } - // console.log("credits to be billed, ", creditsToBeBilled); const billingResult = await billTeam( team_id, From 4f526cff9212c6cc58917884a268c1d687957965 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 30 Apr 2024 12:19:43 -0700 Subject: [PATCH 11/15] Nick: cleanup --- apps/api/src/controllers/scrape.ts | 1 - apps/api/src/lib/LLM-extraction/helpers.ts | 18 ++-- apps/api/src/lib/LLM-extraction/index.ts | 82 +++++++++--------- apps/api/src/lib/LLM-extraction/models.ts | 97 +++++++--------------- apps/api/src/scraper/WebScraper/index.ts | 2 - 5 files changed, 76 insertions(+), 124 deletions(-) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index c42f451..852d9b0 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -7,7 +7,6 @@ import { RateLimiterMode } from "../types"; import { logJob } from "../services/logging/log_job"; import { Document } from "../lib/entities"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function -import Ajv from 'ajv'; import { numTokensFromString } from '../lib/LLM-extraction/helpers'; export async function scrapeHelper( diff --git a/apps/api/src/lib/LLM-extraction/helpers.ts b/apps/api/src/lib/LLM-extraction/helpers.ts index 2535964..f47a6b3 100644 --- a/apps/api/src/lib/LLM-extraction/helpers.ts +++ b/apps/api/src/lib/LLM-extraction/helpers.ts @@ -1,18 +1,16 @@ - - import { encoding_for_model } from "@dqbd/tiktoken"; import { TiktokenModel } from "@dqbd/tiktoken"; // This function calculates the number of tokens in a text string using GPT-3.5-turbo model export function numTokensFromString(message: string, model: string): number { - const encoder = encoding_for_model(model as TiktokenModel); + const encoder = encoding_for_model(model as TiktokenModel); - // Encode the message into tokens - const tokens = encoder.encode(message); + // Encode the message into tokens + const tokens = encoder.encode(message); - // Free the encoder resources after use - encoder.free(); + // Free the encoder resources after use + encoder.free(); - // Return the number of tokens - return tokens.length; -} \ No newline at end of file + // Return the number of tokens + return tokens.length; +} diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index 9fae79d..0f156d2 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -1,53 +1,51 @@ -import Turndown from 'turndown' -import OpenAI from 'openai' -// import { LlamaModel } from 'node-llama-cpp' -import { z } from 'zod' -import { zodToJsonSchema } from 'zod-to-json-schema' -import Ajv from 'ajv'; +import Turndown from "turndown"; +import OpenAI from "openai"; +import Ajv from 
"ajv"; const ajv = new Ajv(); // Initialize AJV for JSON schema validation -import { - ScraperCompletionResult, - generateOpenAICompletions, -} from './models' -import { Document, ExtractorOptions } from '../entities' +import { generateOpenAICompletions } from "./models"; +import { Document, ExtractorOptions } from "../entities"; - // Generate completion using OpenAI +// Generate completion using OpenAI export async function generateCompletions( - documents: Document[], - extractionOptions: ExtractorOptions + documents: Document[], + extractionOptions: ExtractorOptions ): Promise { - // const schema = zodToJsonSchema(options.schema) + // const schema = zodToJsonSchema(options.schema) - const schema = extractionOptions.extractionSchema; - const prompt = extractionOptions.extractionPrompt; + const schema = extractionOptions.extractionSchema; + const prompt = extractionOptions.extractionPrompt; - const switchVariable = "openAI" // Placholder, want to think more about how we abstract the model provider + const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider + const completions = await Promise.all( + documents.map(async (document: Document) => { + switch (switchVariable) { + case "openAI": + const llm = new OpenAI(); + const completionResult = await generateOpenAICompletions({ + client: llm, + document: document, + schema: schema, + prompt: prompt, + }); + // Validate the JSON output against the schema using AJV + const validate = ajv.compile(schema); + if (!validate(completionResult.llm_extraction)) { + //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc. + throw new Error( + `LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. JSON parsing error(s): ${validate.errors + ?.map((err) => err.message) + .join(", ")}` + ); + } - const completions = await Promise.all(documents.map(async (document: Document) => { - switch (switchVariable) { - case "openAI": - const llm = new OpenAI(); - const completionResult = await generateOpenAICompletions({ - client: llm, - document: document, - schema: schema, - prompt: prompt - }); - // Validate the JSON output against the schema using AJV - const validate = ajv.compile(schema); - if (!validate(completionResult.llm_extraction)) { - //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc. - throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. 
AJV error: ${validate.errors?.map(err => err.message).join(', ')}`); - } + return completionResult; + default: + throw new Error("Invalid client"); + } + }) + ); - return completionResult; - default: - throw new Error('Invalid client'); - } - })); - - - return completions; + return completions; } diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts index df1b6d1..d60979e 100644 --- a/apps/api/src/lib/LLM-extraction/models.ts +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -1,115 +1,74 @@ -import OpenAI from 'openai' -import { z } from 'zod' -import { Document, ExtractorOptions } from "../../lib/entities"; -import { numTokensFromString } from './helpers'; +import OpenAI from "openai"; +import { Document } from "../../lib/entities"; -// import { -// LlamaModel, -// LlamaJsonSchemaGrammar, -// LlamaContext, -// LlamaChatSession, -// GbnfJsonSchema, -// } from 'node-llama-cpp' -// import { JsonSchema7Type } from 'zod-to-json-schema' - -export type ScraperCompletionResult> = { - data: any | null - url: string -} +export type ScraperCompletionResult = { + data: any | null; + url: string; +}; const defaultPrompt = - 'You are a professional web scraper. Extract the contents of the webpage' + "You are a professional web scraper. Extract the contents of the webpage"; function prepareOpenAIDoc( document: Document ): OpenAI.Chat.Completions.ChatCompletionContentPart[] { - // Check if the markdown content exists in the document if (!document.markdown) { throw new Error("Markdown content is missing in the document."); } - return [{ type: 'text', text: document.markdown}] + return [{ type: "text", text: document.markdown }]; } export async function generateOpenAICompletions({ client, - model = 'gpt-4-turbo', + model = "gpt-4-turbo", document, schema, //TODO - add zod dynamic type checking prompt = defaultPrompt, - temperature + temperature, }: { - client: OpenAI, - model?: string, - document: Document, - schema: any, // This should be replaced with a proper Zod schema type when available - prompt?: string, - temperature?: number + client: OpenAI; + model?: string; + document: Document; + schema: any; // This should be replaced with a proper Zod schema type when available + prompt?: string; + temperature?: number; }): Promise { - const openai = client as OpenAI - const content = prepareOpenAIDoc(document) - + const openai = client as OpenAI; + const content = prepareOpenAIDoc(document); const completion = await openai.chat.completions.create({ model, messages: [ { - role: 'system', + role: "system", content: prompt, }, - { role: 'user', content }, + { role: "user", content }, ], tools: [ { - type: 'function', + type: "function", function: { - name: 'extract_content', - description: 'Extracts the content from the given webpage(s)', + name: "extract_content", + description: "Extracts the content from the given webpage(s)", parameters: schema, }, }, ], - tool_choice: 'auto', + tool_choice: "auto", temperature, - }) + }); + + const c = completion.choices[0].message.tool_calls[0].function.arguments; - const c = completion.choices[0].message.tool_calls[0].function.arguments - // Extract the LLM extraction content from the completion response const llmExtraction = JSON.parse(c); -// console.log("llm extraction: ", llmExtraction); - - // Return the document with the LLM extraction content added return { ...document, - llm_extraction: llmExtraction + llm_extraction: llmExtraction, }; - } - -// export async function generateLlamaCompletions>( -// model: LlamaModel, 
-// page: ScraperLoadResult, -// schema: JsonSchema7Type, -// prompt: string = defaultPrompt, -// temperature?: number -// ): Promise> { -// const grammar = new LlamaJsonSchemaGrammar(schema as GbnfJsonSchema) as any // any, because it has weird type inference going on -// const context = new LlamaContext({ model }) -// const session = new LlamaChatSession({ context }) -// const pagePrompt = `${prompt}\n${page.content}` - -// const result = await session.prompt(pagePrompt, { -// grammar, -// temperature, -// }) - -// const parsed = grammar.parse(result) -// return { -// data: parsed, -// url: page.url, -// } -// } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 0bd1a82..a56f8ff 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -195,8 +195,6 @@ export class WebScraperDataProvider { documents = documents.concat(pdfDocuments); if(this.extractorOptions.mode === "llm-extraction") { - - const llm = new OpenAI() documents = await generateCompletions( documents, this.extractorOptions From 3c7030dbb14df294cd5bd3e1ba4149d34d1733cc Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 30 Apr 2024 16:19:32 -0700 Subject: [PATCH 12/15] Nick: improvements --- apps/api/src/lib/LLM-extraction/index.ts | 4 ++-- apps/api/src/lib/LLM-extraction/models.ts | 4 +++- apps/api/src/lib/LLM-extraction/types.ts | 5 ----- 3 files changed, 5 insertions(+), 8 deletions(-) delete mode 100644 apps/api/src/lib/LLM-extraction/types.ts diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index 0f156d2..86e2f90 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -34,9 +34,9 @@ export async function generateCompletions( if (!validate(completionResult.llm_extraction)) { //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc. throw new Error( - `LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. JSON parsing error(s): ${validate.errors + `JSON parsing error(s): ${validate.errors ?.map((err) => err.message) - .join(", ")}` + .join(", ")}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.` ); } diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts index d60979e..ec8a710 100644 --- a/apps/api/src/lib/LLM-extraction/models.ts +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -14,7 +14,9 @@ function prepareOpenAIDoc( ): OpenAI.Chat.Completions.ChatCompletionContentPart[] { // Check if the markdown content exists in the document if (!document.markdown) { - throw new Error("Markdown content is missing in the document."); + throw new Error( + "Markdown content is missing in the document. This is likely due to an error in the scraping process. 
Please try again or reach out to help@mendable.ai" + ); } return [{ type: "text", text: document.markdown }]; diff --git a/apps/api/src/lib/LLM-extraction/types.ts b/apps/api/src/lib/LLM-extraction/types.ts deleted file mode 100644 index 2112189..0000000 --- a/apps/api/src/lib/LLM-extraction/types.ts +++ /dev/null @@ -1,5 +0,0 @@ -export type ScraperLoadOptions = { - mode?: 'html' | 'text' | 'markdown' | 'image' - closeOnFinish?: boolean -} - From dfcf39f4c0b411139a81663ea6f02782e3f261a4 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 30 Apr 2024 16:19:59 -0700 Subject: [PATCH 13/15] Update scrape.ts --- apps/api/src/controllers/scrape.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 852d9b0..de02f4d 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -53,7 +53,7 @@ export async function scrapeHelper( let creditsToBeBilled = filteredDocs.length; - const creditsPerLLMExtract = 4; + const creditsPerLLMExtract = 5; if (extractorOptions.mode === "llm-extraction"){ creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length) From a38625951107a279fff7a843a394d8047a086e56 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 30 Apr 2024 16:35:44 -0700 Subject: [PATCH 14/15] Update scrape.ts --- apps/api/src/controllers/scrape.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index de02f4d..849500a 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -117,7 +117,7 @@ export async function scrapeController(req: Request, res: Response) { ); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; - const numTokens = numTokensFromString(result.data.markdown, "gpt-3.5-turbo") + const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0; logJob({ success: result.success, From 768166b066fa14f27e44d84195facf70166767f8 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 30 Apr 2024 16:57:44 -0700 Subject: [PATCH 15/15] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 12ff9c5..fab54bd 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -216,7 +216,6 @@ export async function scrapSingleUrl( return { content: text, markdown: text, - html: html, metadata: { ...metadata, sourceURL: urlToScrap }, } as Document; } catch (error) {
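
---
Editor's note: the series above wires an llm-extraction mode end to end (request option -> OpenAI tool call -> AJV validation -> billing and logging). A minimal client-side sketch of the finished flow follows. The base URL, target URL, and schema fields are illustrative assumptions; the request and response field names (extractorOptions, extractionPrompt, extractionSchema, data.llm_extraction) are taken from the e2e tests and controller changes in these patches.

    async function scrapeWithLLMExtraction(): Promise<void> {
      // Base URL and API key handling are assumptions; only the payload shape
      // mirrors the patches above.
      const res = await fetch("http://localhost:3002/v0/scrape", {
        method: "POST",
        headers: {
          Authorization: `Bearer ${process.env.TEST_API_KEY}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          url: "https://mendable.ai", // illustrative target
          pageOptions: { onlyMainContent: true },
          extractorOptions: {
            mode: "llm-extraction",
            extractionPrompt:
              "Extract the company mission and whether the product is open source.",
            extractionSchema: {
              type: "object",
              properties: {
                company_mission: { type: "string" },
                is_open_source: { type: "boolean" },
              },
              required: ["company_mission", "is_open_source"],
            },
          },
        }),
      });
      const body = await res.json();
      // llm_extraction has already been validated against the schema with AJV
      // server-side, so these reads are safe on a 200 response.
      console.log(body.data.llm_extraction.company_mission);
    }

    scrapeWithLLMExtraction().catch(console.error);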
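For reviewers tracing the billing and logging math: after patches 06, 13, and 14, a scrape bills one credit per returned document plus five extra credits per document when llm-extraction is used, and the logged token count falls back to zero when no markdown came back. A compact restatement under those assumptions (the helper names here are invented for illustration, not part of the codebase):

    // Mirrors scrapeHelper's billing branch; creditsPerLLMExtract was bumped
    // from 4 to 5 in PATCH 13.
    function creditsForScrape(docCount: number, mode?: string): number {
      const creditsPerLLMExtract = 5;
      return mode === "llm-extraction"
        ? docCount + creditsPerLLMExtract * docCount
        : docCount;
    }

    // Mirrors PATCH 14's guard in scrapeController: jobs without markdown log
    // 0 tokens instead of throwing inside numTokensFromString.
    function tokensForLog(
      markdown: string | undefined,
      count: (text: string, model: string) => number
    ): number {
      return markdown ? count(markdown, "gpt-3.5-turbo") : 0;
    }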
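The schema check that patches 06, 11, and 12 keep reworking is standard AJV usage: compile the user-supplied JSON schema once, run the compiled validator over the model's tool-call output, and surface validate.errors in the thrown message. A self-contained sketch with an invented schema and payload:

    import Ajv from "ajv";

    const ajv = new Ajv();
    const schema = {
      type: "object",
      properties: { company_mission: { type: "string" } },
      required: ["company_mission"],
    };

    // compile() returns a validating function; errors from the most recent
    // run are exposed on validate.errors, which the patches join into the
    // thrown message.
    const validate = ajv.compile(schema);
    const payload = JSON.parse('{"company_mission": "example"}'); // invented sample

    if (!validate(payload)) {
      throw new Error(validate.errors?.map((err) => err.message).join(", "));
    }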