Nick
This commit is contained in:
parent e26008a833
commit fd82982a31
@@ -18,8 +18,8 @@
 "paths": {
 "/scrape": {
 "post": {
-"summary": "Scrape a single URL",
-"operationId": "scrapeSingleUrl",
+"summary": "Scrape a single URL and optionally extract information using an LLM",
+"operationId": "scrapeAndExtractFromUrl",
 "tags": ["Scraping"],
 "security": [
 {
@@ -45,8 +45,43 @@
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
}
}
},
"extractorOptions": {
"type": "object",
"description": "Options for LLM-based extraction of structured information from the page content",
"properties": {
"mode": {
"type": "string",
"enum": ["llm-extraction"],
"description": "The extraction mode to use, currently supports 'llm-extraction'"
},
"extractionPrompt": {
"type": "string",
"description": "A prompt describing what information to extract from the page"
},
"extractionSchema": {
"type": "object",
"additionalProperties": true,
"description": "The schema for the data to be extracted",
"required": [
"company_mission",
"supports_sso",
"is_open_source"
]
}
}
},
"timeout": {
"type": "integer",
"description": "Timeout in milliseconds for the request",
"default": 30000
}
},
"required": ["url"]
@@ -126,6 +161,16 @@
"description": "If true, returns only the URLs as a list in the crawl status. Note: the response will be a list of URLs inside the data field, not a list of documents.",
"default": false
},
"maxDepth": {
"type": "integer",
"description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on."
},
"mode": {
"type": "string",
"enum": ["default", "fast"],
"description": "The crawling mode to use. Fast mode crawls websites without a sitemap 4x faster, but may be less accurate and shouldn't be used on heavily JS-rendered websites.",
"default": "default"
},
"limit": {
"type": "integer",
"description": "Maximum number of pages to crawl",
@@ -140,6 +185,11 @@
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
}
}
}
@@ -206,6 +256,11 @@
"type": "boolean",
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
"default": true
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
}
}
},
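This hunk adds the same pair of flags to the search endpoint's pageOptions. A minimal sketch of how they could be used follows; the /search path and the query field are assumptions, only fetchPageContent and includeHtml come from the lines above.

```ts
// Sketch only: search with page-content fetching turned off (falls back to a fast SERP).
// The /search path and the `query` field are assumed; pageOptions names come from the hunk above.
async function search(query: string) {
  const res = await fetch("https://api.firecrawl.dev/v0/search", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    },
    body: JSON.stringify({
      query,
      pageOptions: {
        fetchPageContent: false, // false = basic, fast SERP results only
        includeHtml: false,      // true would add an html key to each fetched page
      },
    }),
  });
  return res.json();
}
```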
@@ -302,6 +357,63 @@
"$ref": "#/components/schemas/ScrapeResponse"
},
"description": "Data returned from the job (null when it is in progress)"
},
"partial_data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ScrapeResponse"
},
"description": "Partial documents returned while the site is being crawled (streaming). When a page is ready it is appended to the partial_data array, so there is no need to wait for the entire website to be crawled."
}
}
}
}
}
},
"402": {
"description": "Payment required"
},
"429": {
"description": "Too many requests"
},
"500": {
"description": "Server error"
}
}
}
},
"/crawl/cancel/{jobId}": {
"delete": {
"tags": ["Crawl"],
"summary": "Cancel a crawl job",
"operationId": "cancelCrawlJob",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"status": {
"type": "string",
"description": "Returns cancelled."
}
}
}
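Two things land in this hunk: the partial_data array on the crawl status response, which lets callers consume documents while the crawl is still running, and the DELETE /crawl/cancel/{jobId} operation. The sketch below shows one way to combine them; the GET /crawl/status/{jobId} path and the "completed" status value are assumptions, while partial_data, data and the cancel route come from the lines above.

```ts
// Sketch only: poll a crawl job, read streamed partial results, cancel if it takes too long.
// GET /crawl/status/{jobId} and the "completed" status value are assumptions;
// partial_data, data and DELETE /crawl/cancel/{jobId} come from the spec changes above.
const BASE = "https://api.firecrawl.dev/v0";
const HEADERS = { Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}` };

async function followCrawl(jobId: string, maxPolls = 20) {
  for (let i = 0; i < maxPolls; i++) {
    const res = await fetch(`${BASE}/crawl/status/${jobId}`, { headers: HEADERS });
    const body = await res.json();

    // partial_data grows as pages finish, so results can be used before the crawl ends.
    console.log(`pages ready so far: ${(body.partial_data ?? []).length}`);

    if (body.status === "completed") return body.data; // full documents (null while in progress)
    await new Promise((resolve) => setTimeout(resolve, 3000));
  }

  // Took too long: cancel the job. The response's status field returns "cancelled".
  const cancel = await fetch(`${BASE}/crawl/cancel/${jobId}`, {
    method: "DELETE",
    headers: HEADERS,
  });
  console.log(await cancel.json());
  return null;
}
```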
@@ -344,6 +456,11 @@
"content": {
"type": "string"
},
"html": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeHtml` is true"
},
"metadata": {
"type": "object",
"properties": {
@@ -183,7 +183,7 @@ describe("Scraping/Crawling Checkup (E2E)", () => {
 }


-expect(score).toBeGreaterThanOrEqual(75);
+expect(score).toBeGreaterThanOrEqual(70);
 }, 350000); // 150 seconds timeout
 });
 });