From 9390816c1b7975b3349f402f562d1846e6845e2a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 10 Jun 2024 18:26:25 -0700 Subject: [PATCH] Update openapi.json --- apps/api/openapi.json | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index ab452ff..55bfe1c 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -51,10 +51,19 @@ "description": "Include the raw HTML content of the page. Will output a html key in the response.", "default": false }, + "screenshot": { + "type": "boolean", + "description": "Include a screenshot of the top of the page that you are scraping.", + "default": false + }, "waitFor": { "type": "integer", "description": "Wait x amount of milliseconds for the page to load to fetch content", "default": 0 + }, + "headers": { + "type": "object", + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." } } }, @@ -176,6 +185,11 @@ "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.", "default": "default" }, + "ignoreSitemap": { + "type": "boolean", + "description": "Ignore the website sitemap when crawling", + "default": false + }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl", @@ -195,6 +209,15 @@ "type": "boolean", "description": "Include the raw HTML content of the page. Will output a html key in the response.", "default": false + }, + "screenshot": { + "type": "boolean", + "description": "Include a screenshot of the top of the page that you are scraping.", + "default": false + }, + "headers": { + "type": "object", + "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc." 
} } } @@ -368,7 +391,7 @@ "items": { "$ref": "#/components/schemas/CrawlStatusResponseObj" }, - "description": "Partial documents returned as it is being crawls (streaming). When a page is ready it will append to the parial_data array - so no need to wait for all the website to be crawled." + "description": "Partial documents returned as the website is being crawled (streaming). **This feature is currently in alpha - expect breaking changes.** When a page is ready, it is appended to the partial_data array, so there is no need to wait for the entire website to be crawled. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when a new item is added to the array." } } } @@ -513,6 +536,10 @@ "nullable": true, "description": "Raw HTML content of the page if `includeHtml` is true" }, + "index": { + "type": "integer", + "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from." + }, "metadata": { "type": "object", "properties": {