From 2ad7a58eb76083198e1326ed9e548a5ded672f01 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Sun, 28 Apr 2024 17:38:20 -0700 Subject: [PATCH] Caleb: first test passing --- .../src/__tests__/e2e_withAuth/index.test.ts | 599 +++++++++--------- apps/api/src/lib/LLM-extraction/index.ts | 51 +- apps/api/src/scraper/WebScraper/index.ts | 19 +- 3 files changed, 338 insertions(+), 331 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index fcc7062..ad4910a 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -8,297 +8,316 @@ dotenv.config(); const TEST_URL = "http://127.0.0.1:3002"; - describe.only("E2E Tests for API Routes", () => { - beforeAll(() => { - process.env.USE_DB_AUTHENTICATION = "true"; - }); +describe("E2E Tests for API Routes", () => { + beforeAll(() => { + process.env.USE_DB_AUTHENTICATION = "true"; + }); - afterAll(() => { - delete process.env.USE_DB_AUTHENTICATION; - }); - describe("GET /", () => { - it("should return Hello, world! message", async () => { - const response = await request(TEST_URL).get("/"); + afterAll(() => { + delete process.env.USE_DB_AUTHENTICATION; + }); + describe("GET /", () => { + it("should return Hello, world! message", async () => { + const response = await request(TEST_URL).get("/"); - expect(response.statusCode).toBe(200); - expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io"); - }); - }); - - describe("GET /test", () => { - it("should return Hello, world! message", async () => { - const response = await request(TEST_URL).get("/test"); - expect(response.statusCode).toBe(200); - expect(response.text).toContain("Hello, world!"); - }); - }); - - describe("POST /v0/scrape", () => { - it("should require authorization", async () => { - const response = await request(app).post("/v0/scrape"); - expect(response.statusCode).toBe(401); - }); - - it("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); - - it("should return an error for a blocklisted URL", async () => { - const blocklistedUrl = "https://facebook.com/fake-test"; - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: blocklistedUrl }); - expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); - }); - - it("should return a successful response with a valid preview token", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer this_is_just_a_preview_token`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(200); - }, 10000); // 10 seconds timeout - - it("should return a successful response with a valid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("content"); - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data.content).toContain("🔥 FireCrawl"); - }, 30000); // 30 seconds timeout - }); - - describe("POST /v0/crawl", () => { - it("should require authorization", async () => { - const response = await request(TEST_URL).post("/v0/crawl"); - expect(response.statusCode).toBe(401); - }); - - it("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); - - it("should return an error for a blocklisted URL", async () => { - const blocklistedUrl = "https://twitter.com/fake-test"; - const response = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: blocklistedUrl }); - expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); - }); - - it("should return a successful response with a valid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("jobId"); - expect(response.body.jobId).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ - ); - }); - - - // Additional tests for insufficient credits? - }); - - describe("POST /v0/crawlWebsitePreview", () => { - it("should require authorization", async () => { - const response = await request(TEST_URL).post( - "/v0/crawlWebsitePreview" - ); - expect(response.statusCode).toBe(401); - }); - - it("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/crawlWebsitePreview") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); - - it("should return an error for a blocklisted URL", async () => { - const blocklistedUrl = "https://instagram.com/fake-test"; - const response = await request(TEST_URL) - .post("/v0/crawlWebsitePreview") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: blocklistedUrl }); - expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); - }); - - it("should return a successful response with a valid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/crawlWebsitePreview") - .set("Authorization", `Bearer this_is_just_a_preview_token`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("jobId"); - expect(response.body.jobId).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ - ); - }); - }); - - describe("POST /v0/search", () => { - it("should require authorization", async () => { - const response = await request(TEST_URL).post("/v0/search"); - expect(response.statusCode).toBe(401); - }); - - it("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/search") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ query: "test" }); - expect(response.statusCode).toBe(401); - }); - - - - it("should return a successful response with a valid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/search") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ query: "test" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success"); - expect(response.body.success).toBe(true); - expect(response.body).toHaveProperty("data"); - }, 30000); // 30 seconds timeout - }); - - describe("GET /v0/crawl/status/:jobId", () => { - it("should require authorization", async () => { - const response = await request(TEST_URL).get("/v0/crawl/status/123"); - expect(response.statusCode).toBe(401); - }); - - it("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .get("/v0/crawl/status/123") - .set("Authorization", `Bearer invalid-api-key`); - expect(response.statusCode).toBe(401); - }); - - it("should return Job not found for invalid job ID", async () => { - const response = await request(TEST_URL) - .get("/v0/crawl/status/invalidJobId") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(404); - }); - - it("should return a successful response for a valid crawl job", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(crawlResponse.statusCode).toBe(200); - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); - - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 30000)); - - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain( - "🔥 FireCrawl" - ); - }, 60000); // 60 seconds - }); - - describe("POST /v0/scrape with LLM Extraction", () => { - it("should extract data using LLM extraction mode", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - pageOptions: { - onlyMainContent: true - }, - extractorOptions: { - extractorMode: "llm-extract", - extractor_prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", - extractorSchema: { - type: "object", - properties: { - company_mission: { - type: "string" - }, - supports_sso: { - type: "boolean" - }, - is_open_source: { - type: "boolean" - } - }, - required: ["company_mission", "supports_sso", "is_open_source"] - } - } - }); - - console.log("Response:", response.body); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("company_mission"); - expect(response.body.data).toHaveProperty("supports_sso"); - expect(response.body.data).toHaveProperty("is_open_source"); - }); - }); - - describe("GET /is-production", () => { - it("should return the production status", async () => { - const response = await request(TEST_URL).get("/is-production"); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("isProduction"); - }); + expect(response.statusCode).toBe(200); + expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io"); }); }); + + describe("GET /test", () => { + it("should return Hello, world! message", async () => { + const response = await request(TEST_URL).get("/test"); + expect(response.statusCode).toBe(200); + expect(response.text).toContain("Hello, world!"); + }); + }); + + describe("POST /v0/scrape", () => { + it("should require authorization", async () => { + const response = await request(app).post("/v0/scrape"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://facebook.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + + it("should return a successful response with a valid preview token", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer this_is_just_a_preview_token`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + }, 10000); // 10 seconds timeout + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain("🔥 FireCrawl"); + }, 30000); // 30 seconds timeout + }); + + describe("POST /v0/crawl", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).post("/v0/crawl"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://twitter.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + }); + + + // Additional tests for insufficient credits? + }); + + describe("POST /v0/crawlWebsitePreview", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).post( + "/v0/crawlWebsitePreview" + ); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://instagram.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer this_is_just_a_preview_token`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + }); + }); + + describe("POST /v0/search", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).post("/v0/search"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(401); + }); + + + + it("should return a successful response with a valid API key", async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success"); + expect(response.body.success).toBe(true); + expect(response.body).toHaveProperty("data"); + }, 30000); // 30 seconds timeout + }); + + describe("GET /v0/crawl/status/:jobId", () => { + it("should require authorization", async () => { + const response = await request(TEST_URL).get("/v0/crawl/status/123"); + expect(response.statusCode).toBe(401); + }); + + it("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .get("/v0/crawl/status/123") + .set("Authorization", `Bearer invalid-api-key`); + expect(response.statusCode).toBe(401); + }); + + it("should return Job not found for invalid job ID", async () => { + const response = await request(TEST_URL) + .get("/v0/crawl/status/invalidJobId") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(404); + }); + + it("should return a successful response for a valid crawl job", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 30000)); + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain( + "🔥 FireCrawl" + ); + }, 60000); // 60 seconds + }); + + describe.only("POST /v0/scrape with LLM Extraction", () => { + it("should extract data using LLM extraction mode", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + pageOptions: { + onlyMainContent: true + }, + extractorOptions: { + mode: "llm-extraction", + extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", + extractionSchema: { + type: "object", + properties: { + company_mission: { + type: "string" + }, + supports_sso: { + type: "boolean" + }, + is_open_source: { + type: "boolean" + } + }, + required: ["company_mission", "supports_sso", "is_open_source"] + } + } + }); + + + // Assuming the LLM extraction object is available in the response body under `data.llm_extraction` + let llmExtraction = response.body.data.llm_extraction; + + + // Check if llm_extraction is a string and parse it if necessary + if (typeof llmExtraction === 'string') { + llmExtraction = JSON.parse(llmExtraction); + } + + + console.log('llm extraction', llmExtraction); + + // Print the keys of the response.body for debugging purposes + + + + // Check if the llm_extraction object has the required properties with correct types and values + expect(llmExtraction).toHaveProperty("company_mission"); + expect(typeof llmExtraction.company_mission).toBe("string"); + expect(llmExtraction).toHaveProperty("supports_sso"); + expect(llmExtraction.supports_sso).toBe(true); + expect(typeof llmExtraction.supports_sso).toBe("boolean"); + expect(llmExtraction).toHaveProperty("is_open_source"); + expect(llmExtraction.is_open_source).toBe(false); + expect(typeof llmExtraction.is_open_source).toBe("boolean"); + }, 60000); // 60 secs + }); + + describe("GET /is-production", () => { + it("should return the production status", async () => { + const response = await request(TEST_URL).get("/is-production"); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("isProduction"); + }); + }); +}); diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index b52c931..d221498 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -3,46 +3,39 @@ import OpenAI from 'openai' // import { LlamaModel } from 'node-llama-cpp' import { z } from 'zod' import { zodToJsonSchema } from 'zod-to-json-schema' + import { ScraperCompletionResult, generateOpenAICompletions, -} from './models.js' -import { ExtractorOptions } from '../entities.js' +} from './models' +import { Document, ExtractorOptions } from '../entities' // Generate completion using OpenAI -export function generateCompletions( +export async function generateCompletions( documents: Document[], extractionOptions: ExtractorOptions -): Promise < ScraperCompletionResult < T >> [] { +): Promise { // const schema = zodToJsonSchema(options.schema) const schema = extractionOptions.extractionSchema; const prompt = extractionOptions.extractionPrompt; - const loader = documents.map(async (document, i) => { - switch (this.client.constructor) { - case true: - return generateOpenAICompletions( - this.client as OpenAI, - - schema, - options?.prompt, - options?.temperature - ) - - //TODO add other models - // case LlamaModel: - // return generateLlamaCompletions( - // this.client, - // await page, - // schema, - // options?.prompt, - // options?.temperature - // ) - default: - throw new Error('Invalid client') - } - }) + const switchVariable = "openAI" // Placholder, want to think more about how we abstract the model provider - return loader + const completions = await Promise.all(documents.map(async (document: Document) => { + switch (switchVariable) { + case "openAI": + const llm = new OpenAI(); + return await generateOpenAICompletions({ + client: llm, + document: document, + schema: schema, + prompt: prompt + }); + default: + throw new Error('Invalid client'); + } + })); + + return completions; } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index fd22ef8..b278e38 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -8,6 +8,7 @@ import { getImageDescription } from "./utils/imageDescription"; import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths"; import OpenAI from 'openai' +import { generateCompletions } from "../../lib/LLM-extraction"; export class WebScraperDataProvider { @@ -194,20 +195,14 @@ export class WebScraperDataProvider { documents = await this.getSitemapData(baseUrl, documents); documents = documents.concat(pdfDocuments); - - - + console.log("extraction mode ", this.extractorOptions.mode) if(this.extractorOptions.mode === "llm-extraction") { - // const llm = new OpenAI() - // generateCompletions( - // client=llm, - // page =, - // schema= - - // ) - - + const llm = new OpenAI() + documents = await generateCompletions( + documents, + this.extractorOptions + ) } await this.setCachedDocuments(documents);