Merge pull request #137 from mendableai/nsc/llm-extraction-zod-integration

[Docs] Updated examples

commit 832a4f53e0
@@ -1,7 +1,13 @@
 import FirecrawlApp from '@mendable/firecrawl-js';
+import { z } from "zod";
 
-const app = new FirecrawlApp({apiKey: "YOUR_API_KEY"});
+const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
 
+// Scrape a website:
+const scrapeResult = await app.scrapeUrl('firecrawl.dev');
+console.log(scrapeResult.data.content)
+
+// Crawl a website:
 const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
 console.log(crawlResult)
 
@@ -17,4 +23,61 @@ while (true) {
   await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
 }
 
 console.log(job.data[0].content);
+
+// Search for a query:
+const query = 'what is mendable?'
+const searchResult = await app.search(query)
+console.log(searchResult)
+
+// LLM Extraction:
+// Define schema to extract contents into using zod schema
+const zodSchema = z.object({
+  top: z
+    .array(
+      z.object({
+        title: z.string(),
+        points: z.number(),
+        by: z.string(),
+        commentsURL: z.string(),
+      })
+    )
+    .length(5)
+    .describe("Top 5 stories on Hacker News"),
+});
+
+let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: zodSchema },
+});
+
+console.log(llmExtractionResult.data.llm_extraction);
+
+// Define schema to extract contents into using json schema
+const jsonSchema = {
+  "type": "object",
+  "properties": {
+    "top": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "title": {"type": "string"},
+          "points": {"type": "number"},
+          "by": {"type": "string"},
+          "commentsURL": {"type": "string"}
+        },
+        "required": ["title", "points", "by", "commentsURL"]
+      },
+      "minItems": 5,
+      "maxItems": 5,
+      "description": "Top 5 stories on Hacker News"
+    }
+  },
+  "required": ["top"]
+}
+
+llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: jsonSchema },
+});
+
+console.log(llmExtractionResult.data.llm_extraction);
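For readers adapting the polling pattern shown in the hunk context above, here is a minimal sketch of the same loop factored into a reusable helper. It assumes only the `checkCrawlStatus` / `JobStatusResponse` API that appears in this diff; the `waitForCrawl` name and the deadline handling are our additions, not part of the SDK.

```ts
import FirecrawlApp, { JobStatusResponse } from '@mendable/firecrawl-js';

// Poll checkCrawlStatus until the crawl finishes; fail after a deadline
// instead of spinning forever like the bare while (true) example.
async function waitForCrawl(
  app: FirecrawlApp,
  jobId: string,
  timeoutMs = 60_000,
  intervalMs = 1_000
): Promise<JobStatusResponse> {
  const deadline = Date.now() + timeoutMs;
  while (true) {
    const job = await app.checkCrawlStatus(jobId);
    if (job.status === 'completed') {
      return job;
    }
    if (Date.now() > deadline) {
      throw new Error(`Crawl job ${jobId} did not complete within ${timeoutMs} ms`);
    }
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
}
```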
apps/js-sdk/example.ts (new file, 83 lines)
@@ -0,0 +1,83 @@
+import FirecrawlApp, { JobStatusResponse } from '@mendable/firecrawl-js';
+import { z } from "zod";
+
+const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
+
+// Scrape a website:
+const scrapeResult = await app.scrapeUrl('firecrawl.dev');
+console.log(scrapeResult.data.content)
+
+// Crawl a website:
+const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
+console.log(crawlResult)
+
+const jobId: string = await crawlResult['jobId'];
+console.log(jobId);
+
+let job: JobStatusResponse;
+while (true) {
+  job = await app.checkCrawlStatus(jobId);
+  if (job.status === 'completed') {
+    break;
+  }
+  await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
+}
+
+console.log(job.data[0].content);
+
+// Search for a query:
+const query = 'what is mendable?'
+const searchResult = await app.search(query)
+console.log(searchResult)
+
+// LLM Extraction:
+// Define schema to extract contents into using zod schema
+const zodSchema = z.object({
+  top: z
+    .array(
+      z.object({
+        title: z.string(),
+        points: z.number(),
+        by: z.string(),
+        commentsURL: z.string(),
+      })
+    )
+    .length(5)
+    .describe("Top 5 stories on Hacker News"),
+});
+
+let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: zodSchema },
+});
+
+console.log(llmExtractionResult.data.llm_extraction);
+
+// Define schema to extract contents into using json schema
+const jsonSchema = {
+  "type": "object",
+  "properties": {
+    "top": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "title": {"type": "string"},
+          "points": {"type": "number"},
+          "by": {"type": "string"},
+          "commentsURL": {"type": "string"}
+        },
+        "required": ["title", "points", "by", "commentsURL"]
+      },
+      "minItems": 5,
+      "maxItems": 5,
+      "description": "Top 5 stories on Hacker News"
+    }
+  },
+  "required": ["top"]
+}
+
+llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: jsonSchema },
+});
+
+console.log(llmExtractionResult.data.llm_extraction);
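One thing example.ts leaves implicit: because the extraction schema is a zod schema, the same object can also type the result. A minimal sketch of that pattern, assuming the `data.llm_extraction` payload shape shown above (the `TopStories` alias and `asTopStories` helper are hypothetical, not SDK exports):

```ts
import { z } from "zod";

// Same shape as the zodSchema in example.ts above.
const zodSchema = z.object({
  top: z
    .array(
      z.object({
        title: z.string(),
        points: z.number(),
        by: z.string(),
        commentsURL: z.string(),
      })
    )
    .length(5)
    .describe("Top 5 stories on Hacker News"),
});

// Derive the static type from the schema instead of writing it twice.
type TopStories = z.infer<typeof zodSchema>;

// parse() validates an untyped payload (e.g. llmExtractionResult.data.llm_extraction)
// and narrows it to TopStories in one step.
function asTopStories(payload: unknown): TopStories {
  return zodSchema.parse(payload);
}
```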
@@ -77,6 +77,42 @@ To scrape a single URL with error handling, use the `scrapeUrl` method. It takes
 scrapeExample();
 ```
+
+### Extracting structured data from a URL
+
+With LLM extraction, you can easily extract structured data from any URL. We support zod schemas to make it easier for you too. Here is how to use it:
+
+```js
+import { z } from "zod";
+
+const zodSchema = z.object({
+  top: z
+    .array(
+      z.object({
+        title: z.string(),
+        points: z.number(),
+        by: z.string(),
+        commentsURL: z.string(),
+      })
+    )
+    .length(5)
+    .describe("Top 5 stories on Hacker News"),
+});
+
+let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: zodSchema },
+});
+
+console.log(llmExtractionResult.data.llm_extraction);
+```
+
+### Search for a query
+
+Used to search the web, get the most relevant results, scrape each page and return the markdown.
+
+```js
+const query = 'what is mendable?';
+const searchResult = await app.search(query);
+```
+
 ### Crawling a Website
 
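Since the README's scrape example wraps its call in a try/catch, the new search snippet could follow suit. A sketch in that style, assuming `search` rejects with an `Error` on failure the way `scrapeUrl` does (the `searchExample` wrapper is ours, not part of the README):

```js
import FirecrawlApp from '@mendable/firecrawl-js';

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

// Mirror the scrapeExample() error-handling pattern from earlier in the README.
async function searchExample() {
  try {
    const query = 'what is mendable?';
    const searchResult = await app.search(query);
    console.log(searchResult);
  } catch (error) {
    // Assumption: failures surface as rejected promises carrying an Error.
    console.error(`Search failed: ${error.message}`);
  }
}

searchExample();
```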
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "0.0.19",
+  "version": "0.0.20",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "build/index.js",
   "types": "types/index.d.ts",
@@ -1,20 +1,19 @@
 from firecrawl import FirecrawlApp
 
-
 app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
 
+# Scrape a website:
+scrape_result = app.scrape_url('firecrawl.dev')
+print(scrape_result['markdown'])
+
+# Crawl a website:
 crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
+print(crawl_result)
 
-print(crawl_result[0]['markdown'])
-
-job_id = crawl_result['jobId']
-print(job_id)
-
-status = app.check_crawl_status(job_id)
-print(status)
-
+# LLM Extraction:
+# Define schema to extract contents into using pydantic
 from pydantic import BaseModel, Field
-from typing import List, Optional
+from typing import List
 
 class ArticleSchema(BaseModel):
     title: str
@@ -25,7 +24,7 @@ class ArticleSchema(BaseModel):
 class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
 
-a = app.scrape_url('https://news.ycombinator.com', {
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
     'extractorOptions': {
         'extractionSchema': TopArticlesSchema.model_json_schema(),
         'mode': 'llm-extraction'
@@ -35,3 +34,40 @@ a = app.scrape_url('https://news.ycombinator.com', {
     }
 })
+
+print(llm_extraction_result['llm_extraction'])
+
+# Define schema to extract contents into using json schema
+json_schema = {
+    "type": "object",
+    "properties": {
+        "top": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "title": {"type": "string"},
+                    "points": {"type": "number"},
+                    "by": {"type": "string"},
+                    "commentsURL": {"type": "string"}
+                },
+                "required": ["title", "points", "by", "commentsURL"]
+            },
+            "minItems": 5,
+            "maxItems": 5,
+            "description": "Top 5 stories on Hacker News"
+        }
+    },
+    "required": ["top"]
+}
+
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+    'extractorOptions': {
+        'extractionSchema': json_schema,
+        'mode': 'llm-extraction'
+    },
+    'pageOptions':{
+        'onlyMainContent': True
+    }
+})
+
+print(llm_extraction_result['llm_extraction'])