From aa6b84c5fa591900c855a0419d5be9b3ca14f08b Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Wed, 8 May 2024 17:41:15 -0700
Subject: [PATCH] Nick: readme

---
 README.md                 | 29 +++++++++++++++++++++++++++++
 apps/python-sdk/README.md | 28 ++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/README.md b/README.md
index 9ac5636..17ba373 100644
--- a/README.md
+++ b/README.md
@@ -248,6 +248,35 @@ url = 'https://example.com'
 scraped_data = app.scrape_url(url)
 ```
 
+### Extracting structured data from a URL
+
+With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it even easier. Here is how to use it:
+
+```python
+from pydantic import BaseModel, Field
+from typing import List
+
+class ArticleSchema(BaseModel):
+    title: str
+    points: int
+    by: str
+    commentsURL: str
+
+class TopArticlesSchema(BaseModel):
+    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
+
+data = app.scrape_url('https://news.ycombinator.com', {
+    'extractorOptions': {
+        'extractionSchema': TopArticlesSchema.model_json_schema(),
+        'mode': 'llm-extraction'
+    },
+    'pageOptions': {
+        'onlyMainContent': True
+    }
+})
+print(data["llm_extraction"])
+```
+
 ### Search for a query
 
 Performs a web search, retrieve the top results, extract data from each page, and returns their markdown.
diff --git a/apps/python-sdk/README.md b/apps/python-sdk/README.md
index 02ad307..38ca843 100644
--- a/apps/python-sdk/README.md
+++ b/apps/python-sdk/README.md
@@ -46,6 +46,34 @@ To scrape a single URL, use the `scrape_url` method. It takes the URL as a param
 url = 'https://example.com'
 scraped_data = app.scrape_url(url)
 ```
+### Extracting structured data from a URL
+
+With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it even easier. Here is how to use it:
+
+```python
+from pydantic import BaseModel, Field
+from typing import List
+
+class ArticleSchema(BaseModel):
+    title: str
+    points: int
+    by: str
+    commentsURL: str
+
+class TopArticlesSchema(BaseModel):
+    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
+
+data = app.scrape_url('https://news.ycombinator.com', {
+    'extractorOptions': {
+        'extractionSchema': TopArticlesSchema.model_json_schema(),
+        'mode': 'llm-extraction'
+    },
+    'pageOptions': {
+        'onlyMainContent': True
+    }
+})
+print(data["llm_extraction"])
+```
 ### Search for a query