Merge pull request #90 from mendableai/llm-extraction

feat: LLM Extraction (mvp)

Commit 2f2b83b5ee
package.json

@@ -46,11 +46,12 @@
     "@bull-board/api": "^5.14.2",
     "@bull-board/express": "^5.8.0",
     "@devil7softwares/pos": "^1.0.2",
-    "@dqbd/tiktoken": "^1.0.7",
+    "@dqbd/tiktoken": "^1.0.13",
     "@logtail/node": "^0.4.12",
     "@nangohq/node": "^0.36.33",
     "@sentry/node": "^7.48.0",
     "@supabase/supabase-js": "^2.7.1",
+    "ajv": "^8.12.0",
     "async": "^3.2.5",
     "async-mutex": "^0.4.0",
     "axios": "^1.3.4",
@@ -68,6 +69,7 @@
     "gpt3-tokenizer": "^1.1.5",
     "ioredis": "^5.3.2",
     "joplin-turndown-plugin-gfm": "^1.0.12",
+    "json-schema-to-zod": "^2.1.0",
     "keyword-extractor": "^0.0.25",
     "langchain": "^0.1.25",
     "languagedetect": "^2.0.0",
@@ -93,7 +95,9 @@
     "unstructured-client": "^0.9.4",
     "uuid": "^9.0.1",
     "wordpos": "^2.1.0",
-    "xml2js": "^0.6.2"
+    "xml2js": "^0.6.2",
+    "zod": "^3.23.4",
+    "zod-to-json-schema": "^3.23.0"
   },
   "nodemonConfig": {
     "ignore": [
pnpm-lock.yaml

@@ -21,7 +21,7 @@ dependencies:
     specifier: ^1.0.2
     version: 1.0.2
   '@dqbd/tiktoken':
-    specifier: ^1.0.7
+    specifier: ^1.0.13
     version: 1.0.13
   '@logtail/node':
     specifier: ^0.4.12
@@ -35,6 +35,9 @@ dependencies:
   '@supabase/supabase-js':
     specifier: ^2.7.1
     version: 2.39.7
+  ajv:
+    specifier: ^8.12.0
+    version: 8.12.0
   async:
     specifier: ^3.2.5
     version: 3.2.5
@@ -86,6 +89,9 @@ dependencies:
   joplin-turndown-plugin-gfm:
     specifier: ^1.0.12
     version: 1.0.12
+  json-schema-to-zod:
+    specifier: ^2.1.0
+    version: 2.1.0
   keyword-extractor:
     specifier: ^0.0.25
     version: 0.0.25
@@ -164,6 +170,12 @@ dependencies:
   xml2js:
     specifier: ^0.6.2
     version: 0.6.2
+  zod:
+    specifier: ^3.23.4
+    version: 3.23.4
+  zod-to-json-schema:
+    specifier: ^3.23.0
+    version: 3.23.0(zod@3.23.4)

 devDependencies:
   '@flydotio/dockerfile':
@@ -1200,7 +1212,7 @@ packages:
       redis: 4.6.13
       typesense: 1.7.2(@babel/runtime@7.24.0)
       uuid: 9.0.1
-      zod: 3.22.4
+      zod: 3.23.4
     transitivePeerDependencies:
       - encoding
     dev: false
@@ -1218,8 +1230,8 @@ packages:
      p-queue: 6.6.2
      p-retry: 4.6.2
      uuid: 9.0.1
-     zod: 3.22.4
-     zod-to-json-schema: 3.22.4(zod@3.22.4)
+     zod: 3.23.4
+     zod-to-json-schema: 3.23.0(zod@3.23.4)
     dev: false

  /@langchain/openai@0.0.18:
@@ -1229,8 +1241,8 @@ packages:
      '@langchain/core': 0.1.43
      js-tiktoken: 1.0.10
      openai: 4.28.4
-     zod: 3.22.4
-     zod-to-json-schema: 3.22.4(zod@3.22.4)
+     zod: 3.23.4
+     zod-to-json-schema: 3.23.0(zod@3.23.4)
     transitivePeerDependencies:
       - encoding
     dev: false
@@ -1811,6 +1823,15 @@ packages:
       humanize-ms: 1.2.1
     dev: false

+  /ajv@8.12.0:
+    resolution: {integrity: sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==}
+    dependencies:
+      fast-deep-equal: 3.1.3
+      json-schema-traverse: 1.0.0
+      require-from-string: 2.0.2
+      uri-js: 4.4.1
+    dev: false
+
   /ansi-escapes@4.3.2:
     resolution: {integrity: sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==}
     engines: {node: '>=8'}
@@ -2917,6 +2938,10 @@ packages:
       - supports-color
     dev: false

+  /fast-deep-equal@3.1.3:
+    resolution: {integrity: sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==}
+    dev: false
+
   /fast-fifo@1.3.2:
     resolution: {integrity: sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==}
     dev: false
@@ -3985,6 +4010,15 @@ packages:
   /json-parse-even-better-errors@2.3.1:
     resolution: {integrity: sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==}

+  /json-schema-to-zod@2.1.0:
+    resolution: {integrity: sha512-7ishNgYY+AbIKeeHcp5xCOdJbdVwSfDx/4V2ktc16LUusCJJbz2fEKdWUmAxhKIiYzhZ9Fp4E8OsAoM/h9cOLA==}
+    hasBin: true
+    dev: false
+
+  /json-schema-traverse@1.0.0:
+    resolution: {integrity: sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==}
+    dev: false
+
   /json5@2.2.3:
     resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==}
     engines: {node: '>=6'}
@@ -4209,8 +4243,8 @@ packages:
       redis: 4.6.13
       uuid: 9.0.1
       yaml: 2.4.1
-      zod: 3.22.4
-      zod-to-json-schema: 3.22.4(zod@3.22.4)
+      zod: 3.23.4
+      zod-to-json-schema: 3.23.0(zod@3.23.4)
     transitivePeerDependencies:
       - '@aws-crypto/sha256-js'
       - '@aws-sdk/client-bedrock-agent-runtime'
@@ -5069,7 +5103,7 @@ packages:
       sbd: 1.0.19
       typescript: 5.4.5
       uuid: 9.0.1
-      zod: 3.22.4
+      zod: 3.23.4
     transitivePeerDependencies:
       - debug
     dev: false
@@ -5250,6 +5284,11 @@ packages:
     resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==}
     engines: {node: '>=0.10.0'}

+  /require-from-string@2.0.2:
+    resolution: {integrity: sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==}
+    engines: {node: '>=0.10.0'}
+    dev: false
+
   /resolve-cwd@3.0.0:
     resolution: {integrity: sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==}
     engines: {node: '>=8'}
@@ -5956,6 +5995,12 @@ packages:
       picocolors: 1.0.0
     dev: true

+  /uri-js@4.4.1:
+    resolution: {integrity: sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==}
+    dependencies:
+      punycode: 2.3.1
+    dev: false
+
   /urlpattern-polyfill@10.0.0:
     resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==}
     dev: false
@@ -6185,14 +6230,18 @@ packages:
     engines: {node: '>=10'}
     dev: true

-  /zod-to-json-schema@3.22.4(zod@3.22.4):
-    resolution: {integrity: sha512-2Ed5dJ+n/O3cU383xSY28cuVi0BCQhF8nYqWU5paEpl7fVdqdAmiLdqLyfblbNdfOFwFfi/mqU4O1pwc60iBhQ==}
+  /zod-to-json-schema@3.23.0(zod@3.23.4):
+    resolution: {integrity: sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==}
     peerDependencies:
-      zod: ^3.22.4
+      zod: ^3.23.3
     dependencies:
-      zod: 3.22.4
+      zod: 3.23.4
     dev: false

   /zod@3.22.4:
     resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==}
     dev: false

+  /zod@3.23.4:
+    resolution: {integrity: sha512-/AtWOKbBgjzEYYQRNfoGKHObgfAZag6qUJX1VbHo2PRBgS+wfWagEY2mizjfyAPcGesrJOcx/wcl0L9WnVrHFw==}
+    dev: false
@@ -199,7 +199,8 @@ describe("E2E Tests for API Routes with No Authentication", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("content");
      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+     expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");

    }, 60000); // 60 seconds
  });
@@ -8,7 +8,7 @@ dotenv.config();

const TEST_URL = "http://127.0.0.1:3002";

describe("E2E Tests for API Routes", () => {
  beforeAll(() => {
    process.env.USE_DB_AUTHENTICATION = "true";
  });
@@ -252,6 +252,121 @@ const TEST_URL = "http://127.0.0.1:3002";
    }, 60000); // 60 seconds
  });

+  describe("POST /v0/scrape with LLM Extraction", () => {
+    it("should extract data using LLM extraction mode", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai",
+          pageOptions: {
+            onlyMainContent: true
+          },
+          extractorOptions: {
+            mode: "llm-extraction",
+            extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+            extractionSchema: {
+              type: "object",
+              properties: {
+                company_mission: {
+                  type: "string"
+                },
+                supports_sso: {
+                  type: "boolean"
+                },
+                is_open_source: {
+                  type: "boolean"
+                }
+              },
+              required: ["company_mission", "supports_sso", "is_open_source"]
+            }
+          }
+        });
+
+      // Ensure that the job was successfully created before proceeding with LLM extraction
+      expect(response.statusCode).toBe(200);
+
+      // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
+      let llmExtraction = response.body.data.llm_extraction;
+
+      // Check if the llm_extraction object has the required properties with correct types and values
+      expect(llmExtraction).toHaveProperty("company_mission");
+      expect(typeof llmExtraction.company_mission).toBe("string");
+      expect(llmExtraction).toHaveProperty("supports_sso");
+      expect(llmExtraction.supports_sso).toBe(true);
+      expect(typeof llmExtraction.supports_sso).toBe("boolean");
+      expect(llmExtraction).toHaveProperty("is_open_source");
+      expect(llmExtraction.is_open_source).toBe(false);
+      expect(typeof llmExtraction.is_open_source).toBe("boolean");
+    }, 60000); // 60 secs
+  });
+
+  // describe("POST /v0/scrape for Top 100 Companies", () => {
+  //   it("should extract data for the top 100 companies", async () => {
+  //     const response = await request(TEST_URL)
+  //       .post("/v0/scrape")
+  //       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+  //       .set("Content-Type", "application/json")
+  //       .send({
+  //         url: "https://companiesmarketcap.com/",
+  //         pageOptions: {
+  //           onlyMainContent: true
+  //         },
+  //         extractorOptions: {
+  //           mode: "llm-extraction",
+  //           extractionPrompt: "Extract the name, market cap, price, and today's change for the top 20 companies listed on the page.",
+  //           extractionSchema: {
+  //             type: "object",
+  //             properties: {
+  //               companies: {
+  //                 type: "array",
+  //                 items: {
+  //                   type: "object",
+  //                   properties: {
+  //                     rank: { type: "number" },
+  //                     name: { type: "string" },
+  //                     marketCap: { type: "string" },
+  //                     price: { type: "string" },
+  //                     todayChange: { type: "string" }
+  //                   },
+  //                   required: ["rank", "name", "marketCap", "price", "todayChange"]
+  //                 }
+  //               }
+  //             },
+  //             required: ["companies"]
+  //           }
+  //         }
+  //       });
+
+  //     // Print the response body to the console for debugging purposes
+  //     console.log("Response companies:", response.body.data.llm_extraction.companies);
+
+  //     // Check if the response has the correct structure and data types
+  //     expect(response.status).toBe(200);
+  //     expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true);
+  //     expect(response.body.data.llm_extraction.companies.length).toBe(40);
+
+  //     // Sample check for the first company
+  //     const firstCompany = response.body.data.llm_extraction.companies[0];
+  //     expect(firstCompany).toHaveProperty("name");
+  //     expect(typeof firstCompany.name).toBe("string");
+  //     expect(firstCompany).toHaveProperty("marketCap");
+  //     expect(typeof firstCompany.marketCap).toBe("string");
+  //     expect(firstCompany).toHaveProperty("price");
+  //     expect(typeof firstCompany.price).toBe("string");
+  //     expect(firstCompany).toHaveProperty("todayChange");
+  //     expect(typeof firstCompany.todayChange).toBe("string");
+  //   }, 120000); // 120 secs
+  // });
+
  describe("GET /is-production", () => {
    it("should return the production status", async () => {
      const response = await request(TEST_URL).get("/is-production");
@@ -259,4 +374,4 @@ const TEST_URL = "http://127.0.0.1:3002";
      expect(response.body).toHaveProperty("isProduction");
    });
  });
});
|
|||||||
|
import { ExtractorOptions } from './../lib/entities';
|
||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
||||||
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
||||||
@ -6,12 +7,14 @@ import { RateLimiterMode } from "../types";
|
|||||||
import { logJob } from "../services/logging/log_job";
|
import { logJob } from "../services/logging/log_job";
|
||||||
import { Document } from "../lib/entities";
|
import { Document } from "../lib/entities";
|
||||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||||
|
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
|
||||||
|
|
||||||
export async function scrapeHelper(
|
export async function scrapeHelper(
|
||||||
req: Request,
|
req: Request,
|
||||||
team_id: string,
|
team_id: string,
|
||||||
crawlerOptions: any,
|
crawlerOptions: any,
|
||||||
pageOptions: any
|
pageOptions: any,
|
||||||
|
extractorOptions: ExtractorOptions
|
||||||
): Promise<{
|
): Promise<{
|
||||||
success: boolean;
|
success: boolean;
|
||||||
error?: string;
|
error?: string;
|
||||||
@ -27,6 +30,7 @@ export async function scrapeHelper(
|
|||||||
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
|
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
const a = new WebScraperDataProvider();
|
const a = new WebScraperDataProvider();
|
||||||
await a.setOptions({
|
await a.setOptions({
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
@ -35,6 +39,7 @@ export async function scrapeHelper(
|
|||||||
...crawlerOptions,
|
...crawlerOptions,
|
||||||
},
|
},
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
|
extractorOptions: extractorOptions
|
||||||
});
|
});
|
||||||
|
|
||||||
const docs = await a.getDocuments(false);
|
const docs = await a.getDocuments(false);
|
||||||
@ -46,9 +51,17 @@ export async function scrapeHelper(
|
|||||||
return { success: true, error: "No page found", returnCode: 200 };
|
return { success: true, error: "No page found", returnCode: 200 };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
let creditsToBeBilled = filteredDocs.length;
|
||||||
|
const creditsPerLLMExtract = 5;
|
||||||
|
|
||||||
|
if (extractorOptions.mode === "llm-extraction"){
|
||||||
|
creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length)
|
||||||
|
}
|
||||||
|
|
||||||
const billingResult = await billTeam(
|
const billingResult = await billTeam(
|
||||||
team_id,
|
team_id,
|
||||||
filteredDocs.length
|
creditsToBeBilled
|
||||||
);
|
);
|
||||||
if (!billingResult.success) {
|
if (!billingResult.success) {
|
||||||
return {
|
return {
|
||||||
@ -79,6 +92,9 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
|
||||||
|
const extractorOptions = req.body.extractorOptions ?? {
|
||||||
|
mode: "markdown"
|
||||||
|
}
|
||||||
const origin = req.body.origin ?? "api";
|
const origin = req.body.origin ?? "api";
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@ -96,10 +112,13 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
req,
|
req,
|
||||||
team_id,
|
team_id,
|
||||||
crawlerOptions,
|
crawlerOptions,
|
||||||
pageOptions
|
pageOptions,
|
||||||
|
extractorOptions
|
||||||
);
|
);
|
||||||
const endTime = new Date().getTime();
|
const endTime = new Date().getTime();
|
||||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||||
|
const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
|
||||||
|
|
||||||
logJob({
|
logJob({
|
||||||
success: result.success,
|
success: result.success,
|
||||||
message: result.error,
|
message: result.error,
|
||||||
@ -112,6 +131,8 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions: crawlerOptions,
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
origin: origin,
|
origin: origin,
|
||||||
|
extractor_options: extractorOptions,
|
||||||
|
num_tokens: numTokens
|
||||||
});
|
});
|
||||||
return res.status(result.returnCode).json(result);
|
return res.status(result.returnCode).json(result);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
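The billing change above implies a simple surcharge: each scraped document still costs one credit, and llm-extraction adds creditsPerLLMExtract (5) more per document, so a single-page scrape with extraction bills 1 + 5 * 1 = 6 credits. A minimal restatement of that arithmetic, with names taken from the diff:

// Credit arithmetic as implemented in scrapeHelper above (values from the diff).
const creditsPerLLMExtract = 5;
const docCount = 1;                     // e.g. one scraped page
let creditsToBeBilled = docCount;       // base: one credit per document
creditsToBeBilled += creditsPerLLMExtract * docCount; // llm-extraction surcharge -> 6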
apps/api/src/lib/LLM-extraction/helpers.ts (new file)
@@ -0,0 +1,16 @@
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";

// This function calculates the number of tokens in a text string using GPT-3.5-turbo model
export function numTokensFromString(message: string, model: string): number {
  const encoder = encoding_for_model(model as TiktokenModel);

  // Encode the message into tokens
  const tokens = encoder.encode(message);

  // Free the encoder resources after use
  encoder.free();

  // Return the number of tokens
  return tokens.length;
}
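A usage sketch for the helper above; the sample string is made up, while the "gpt-3.5-turbo" model name mirrors the call added to the scrape controller earlier in this diff.

import { numTokensFromString } from "./helpers";

// Count tokens the same way the controller does before logging num_tokens.
const markdown = "Firecrawl turns websites into LLM-ready markdown."; // placeholder content
const numTokens = numTokensFromString(markdown, "gpt-3.5-turbo");
console.log(numTokens); // token count as reported by @dqbd/tiktoken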
apps/api/src/lib/LLM-extraction/index.ts (new file)
@@ -0,0 +1,51 @@
import Turndown from "turndown";
import OpenAI from "openai";
import Ajv from "ajv";
const ajv = new Ajv(); // Initialize AJV for JSON schema validation

import { generateOpenAICompletions } from "./models";
import { Document, ExtractorOptions } from "../entities";

// Generate completion using OpenAI
export async function generateCompletions(
  documents: Document[],
  extractionOptions: ExtractorOptions
): Promise<Document[]> {
  // const schema = zodToJsonSchema(options.schema)

  const schema = extractionOptions.extractionSchema;
  const prompt = extractionOptions.extractionPrompt;

  const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider

  const completions = await Promise.all(
    documents.map(async (document: Document) => {
      switch (switchVariable) {
        case "openAI":
          const llm = new OpenAI();
          const completionResult = await generateOpenAICompletions({
            client: llm,
            document: document,
            schema: schema,
            prompt: prompt,
          });
          // Validate the JSON output against the schema using AJV
          const validate = ajv.compile(schema);
          if (!validate(completionResult.llm_extraction)) {
            //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
            throw new Error(
              `JSON parsing error(s): ${validate.errors
                ?.map((err) => err.message)
                .join(", ")}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
            );
          }

          return completionResult;
        default:
          throw new Error("Invalid client");
      }
    })
  );

  return completions;
}
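A driving sketch for generateCompletions above. The document literal is hypothetical and an OPENAI_API_KEY environment variable is assumed (the OpenAI client created inside the switch reads it by default); the option names mirror the ExtractorOptions type added to lib/entities later in this diff.

import { generateCompletions } from "./index";
import { Document, ExtractorOptions } from "../entities";

// Hypothetical scraped document; only the markdown field is read by the extractor.
const docs = [
  { content: "Mendable ...", markdown: "# Mendable\nBuild AI chat apps on your docs..." } as Document,
];

const options: ExtractorOptions = {
  mode: "llm-extraction",
  extractionPrompt: "Extract the company mission",
  extractionSchema: {
    type: "object",
    properties: { company_mission: { type: "string" } },
    required: ["company_mission"],
  },
};

async function run() {
  // Each returned document gains llm_extraction; Ajv rejects output that does not
  // match extractionSchema and the "JSON parsing error(s)" error above is thrown.
  const enriched = await generateCompletions(docs, options);
  console.log(enriched[0].llm_extraction);
}

run();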
apps/api/src/lib/LLM-extraction/models.ts (new file)
@@ -0,0 +1,76 @@
import OpenAI from "openai";
import { Document } from "../../lib/entities";

export type ScraperCompletionResult = {
  data: any | null;
  url: string;
};

const defaultPrompt =
  "You are a professional web scraper. Extract the contents of the webpage";

function prepareOpenAIDoc(
  document: Document
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
  // Check if the markdown content exists in the document
  if (!document.markdown) {
    throw new Error(
      "Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
    );
  }

  return [{ type: "text", text: document.markdown }];
}

export async function generateOpenAICompletions({
  client,
  model = "gpt-4-turbo",
  document,
  schema, //TODO - add zod dynamic type checking
  prompt = defaultPrompt,
  temperature,
}: {
  client: OpenAI;
  model?: string;
  document: Document;
  schema: any; // This should be replaced with a proper Zod schema type when available
  prompt?: string;
  temperature?: number;
}): Promise<Document> {
  const openai = client as OpenAI;
  const content = prepareOpenAIDoc(document);

  const completion = await openai.chat.completions.create({
    model,
    messages: [
      {
        role: "system",
        content: prompt,
      },
      { role: "user", content },
    ],
    tools: [
      {
        type: "function",
        function: {
          name: "extract_content",
          description: "Extracts the content from the given webpage(s)",
          parameters: schema,
        },
      },
    ],
    tool_choice: "auto",
    temperature,
  });

  const c = completion.choices[0].message.tool_calls[0].function.arguments;

  // Extract the LLM extraction content from the completion response
  const llmExtraction = JSON.parse(c);

  // Return the document with the LLM extraction content added
  return {
    ...document,
    llm_extraction: llmExtraction,
  };
}
@@ -16,6 +16,12 @@ export type PageOptions = {

 };

+export type ExtractorOptions = {
+  mode: "markdown" | "llm-extraction";
+  extractionPrompt?: string;
+  extractionSchema?: Record<string, any>;
+}
+
 export type SearchOptions = {
   limit?: number;
   tbs?: string;
@@ -38,6 +44,7 @@ export type WebScraperOptions = {
     replaceAllPathsWithAbsolutePaths?: boolean;
   };
   pageOptions?: PageOptions;
+  extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
 };

@@ -50,6 +57,8 @@ export class Document {
   url?: string; // Used only in /search for now
   content: string;
   markdown?: string;
+  html?: string;
+  llm_extraction?: Record<string, any>;
   createdAt?: Date;
   updatedAt?: Date;
   type?: string;
|
@ -1,4 +1,4 @@
|
|||||||
import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
|
import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities";
|
||||||
import { Progress } from "../../lib/entities";
|
import { Progress } from "../../lib/entities";
|
||||||
import { scrapSingleUrl } from "./single_url";
|
import { scrapSingleUrl } from "./single_url";
|
||||||
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
||||||
@ -7,6 +7,9 @@ import { getValue, setValue } from "../../services/redis";
|
|||||||
import { getImageDescription } from "./utils/imageDescription";
|
import { getImageDescription } from "./utils/imageDescription";
|
||||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||||
import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
|
import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
|
||||||
|
import OpenAI from 'openai'
|
||||||
|
import { generateCompletions } from "../../lib/LLM-extraction";
|
||||||
|
|
||||||
|
|
||||||
export class WebScraperDataProvider {
|
export class WebScraperDataProvider {
|
||||||
private urls: string[] = [""];
|
private urls: string[] = [""];
|
||||||
@ -19,6 +22,7 @@ export class WebScraperDataProvider {
|
|||||||
private concurrentRequests: number = 20;
|
private concurrentRequests: number = 20;
|
||||||
private generateImgAltText: boolean = false;
|
private generateImgAltText: boolean = false;
|
||||||
private pageOptions?: PageOptions;
|
private pageOptions?: PageOptions;
|
||||||
|
private extractorOptions?: ExtractorOptions;
|
||||||
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
||||||
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";
|
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";
|
||||||
|
|
||||||
@ -36,8 +40,7 @@ export class WebScraperDataProvider {
|
|||||||
): Promise<Document[]> {
|
): Promise<Document[]> {
|
||||||
const totalUrls = urls.length;
|
const totalUrls = urls.length;
|
||||||
let processedUrls = 0;
|
let processedUrls = 0;
|
||||||
console.log("Converting urls to documents");
|
|
||||||
console.log("Total urls", urls);
|
|
||||||
const results: (Document | null)[] = new Array(urls.length).fill(null);
|
const results: (Document | null)[] = new Array(urls.length).fill(null);
|
||||||
for (let i = 0; i < urls.length; i += this.concurrentRequests) {
|
for (let i = 0; i < urls.length; i += this.concurrentRequests) {
|
||||||
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
||||||
@ -192,6 +195,13 @@ export class WebScraperDataProvider {
|
|||||||
documents = await this.getSitemapData(baseUrl, documents);
|
documents = await this.getSitemapData(baseUrl, documents);
|
||||||
documents = documents.concat(pdfDocuments);
|
documents = documents.concat(pdfDocuments);
|
||||||
|
|
||||||
|
if(this.extractorOptions.mode === "llm-extraction") {
|
||||||
|
documents = await generateCompletions(
|
||||||
|
documents,
|
||||||
|
this.extractorOptions
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
await this.setCachedDocuments(documents);
|
await this.setCachedDocuments(documents);
|
||||||
documents = this.removeChildLinks(documents);
|
documents = this.removeChildLinks(documents);
|
||||||
documents = documents.splice(0, this.limit);
|
documents = documents.splice(0, this.limit);
|
||||||
@ -377,6 +387,7 @@ export class WebScraperDataProvider {
|
|||||||
this.generateImgAltText =
|
this.generateImgAltText =
|
||||||
options.crawlerOptions?.generateImgAltText ?? false;
|
options.crawlerOptions?.generateImgAltText ?? false;
|
||||||
this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
|
this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
|
||||||
|
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
||||||
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||||
|
|
||||||
//! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
|
//! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
|
||||||
|
@@ -106,7 +106,6 @@ export async function scrapSingleUrl(
   toMarkdown: boolean = true,
   pageOptions: PageOptions = { onlyMainContent: true }
 ): Promise<Document> {
-  console.log(`Scraping URL: ${urlToScrap}`);
   urlToScrap = urlToScrap.trim();

   const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
@@ -170,6 +169,8 @@ export async function scrapSingleUrl(
     }
     break;
   }
+
+  //* TODO: add an optional to return markdown or structured/extracted content
   let cleanedHtml = removeUnwantedElements(text, pageOptions);

   return [await parseMarkdown(cleanedHtml), text];
@@ -1,3 +1,4 @@
+import { ExtractorOptions } from './../../lib/entities';
 import { supabase_service } from "../supabase";
 import { FirecrawlJob } from "../../types";
 import "dotenv/config";
@@ -8,6 +9,8 @@ export async function logJob(job: FirecrawlJob) {
   if (process.env.ENV !== "production") {
     return;
   }
+
+
   const { data, error } = await supabase_service
     .from("firecrawl_jobs")
     .insert([
@@ -23,6 +26,8 @@ export async function logJob(job: FirecrawlJob) {
       crawler_options: job.crawlerOptions,
       page_options: job.pageOptions,
       origin: job.origin,
+      extractor_options: job.extractor_options,
+      num_tokens: job.num_tokens
     },
   ]);
   if (error) {
@@ -1,3 +1,5 @@
+import { ExtractorOptions } from "./lib/entities";
+
 export interface CrawlResult {
   source: string;
   content: string;
@@ -37,6 +39,8 @@ export interface FirecrawlJob {
   crawlerOptions?: any;
   pageOptions?: any;
   origin: string;
+  extractor_options?: ExtractorOptions,
+  num_tokens?: number
 }

 export enum RateLimiterMode {