0

73 lines
1.8 KiB
Python
Raw Normal View History

2024-04-15 17:01:47 -04:00
from firecrawl import FirecrawlApp
# Create the Firecrawl client; replace the placeholder with your own API key.
app = FirecrawlApp(
    api_key="fc-YOUR_API_KEY",
)
2024-04-15 17:01:47 -04:00
2024-05-09 10:36:56 -03:00
# Scrape a website:
# Request a single URL through the client, then print the 'markdown' entry
# of the returned result.
scrape_result = app.scrape_url(
    'firecrawl.dev',
)
print(scrape_result['markdown'])
2024-05-08 17:36:40 -07:00
2024-05-09 10:36:56 -03:00
# Crawl a website:
# The second argument carries crawler options; here the 'excludes' list is
# given the single pattern 'blog/*'. The whole result is printed as returned.
crawl_result = app.crawl_url(
    'mendable.ai',
    {
        'crawlerOptions': {
            'excludes': ['blog/*'],
        },
    },
)
print(crawl_result)
2024-05-09 10:36:56 -03:00
# LLM Extraction:
# Define the schema to extract content into, using pydantic
from pydantic import BaseModel, Field
2024-05-09 10:36:56 -03:00
from typing import List
class ArticleSchema(BaseModel):
    # Shape of one extracted story entry. Documented with comments rather than
    # a docstring on purpose: pydantic copies a class docstring into the
    # generated JSON schema as its "description", which would change the
    # schema handed to the extractor.
    title: str
    points: int
    by: str  # presumably the submitter's name -- confirm against the page
    commentsURL: str
class TopArticlesSchema(BaseModel):
    # Top-level extraction target: a required list of at most five
    # ArticleSchema entries. Documented with comments rather than a docstring
    # so the generated JSON schema does not gain a "description" for the model.
    #
    # `max_length` is the Pydantic v2 keyword for bounding a list; the v1
    # spelling `max_items` is deprecated under v2 (which this file requires,
    # since it calls `model_json_schema()`) and raises a DeprecationWarning.
    # The emitted JSON schema still carries `maxItems: 5`.
    top: List[ArticleSchema] = Field(
        ...,
        max_length=5,
        description="Top 5 stories",
    )
2024-05-09 10:36:56 -03:00
# Scrape the page in 'llm-extraction' mode, passing the JSON schema generated
# from TopArticlesSchema as the extraction schema and enabling the
# 'onlyMainContent' page option.
llm_extraction_result = app.scrape_url(
    'https://news.ycombinator.com',
    {
        'extractorOptions': {
            'extractionSchema': TopArticlesSchema.model_json_schema(),
            'mode': 'llm-extraction',
        },
        'pageOptions': {
            'onlyMainContent': True,
        },
    },
)
2024-04-15 17:01:47 -04:00
2024-05-09 10:36:56 -03:00
# Print the 'llm_extraction' entry of the scrape result.
print(llm_extraction_result['llm_extraction'])
# Define schema to extract contents into using json schema
# Each article property maps to its JSON-schema type; insertion order here
# fixes both the "properties" order and the "required" list below.
_article_field_types = {
    "title": "string",
    "points": "number",
    "by": "string",
    "commentsURL": "string",
}

json_schema = {
    "type": "object",
    "properties": {
        "top": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    field: {"type": kind}
                    for field, kind in _article_field_types.items()
                },
                # Every listed article property is mandatory.
                "required": list(_article_field_types),
            },
            # Exactly five entries: min and max are both 5.
            "minItems": 5,
            "maxItems": 5,
            "description": "Top 5 stories on Hacker News",
        },
    },
    "required": ["top"],
}
# Run the same 'llm-extraction' scrape, this time passing the hand-written
# json_schema dict as the extraction schema, then print the 'llm_extraction'
# entry of the result.
llm_extraction_result = app.scrape_url(
    'https://news.ycombinator.com',
    {
        'extractorOptions': {
            'extractionSchema': json_schema,
            'mode': 'llm-extraction',
        },
        'pageOptions': {
            'onlyMainContent': True,
        },
    },
)
print(llm_extraction_result['llm_extraction'])