0
This commit is contained in:
Nicolas 2024-05-15 12:11:16 -07:00
parent e26008a833
commit fd82982a31
2 changed files with 120 additions and 3 deletions

View File

@ -18,8 +18,8 @@
"paths": { "paths": {
"/scrape": { "/scrape": {
"post": { "post": {
"summary": "Scrape a single URL", "summary": "Scrape a single URL and optionally extract information using an LLM",
"operationId": "scrapeSingleUrl", "operationId": "scrapeAndExtractFromUrl",
"tags": ["Scraping"], "tags": ["Scraping"],
"security": [ "security": [
{ {
@ -45,8 +45,43 @@
"type": "boolean", "type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.", "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false "default": false
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
} }
} }
},
"extractorOptions": {
"type": "object",
"description": "Options for LLM-based extraction of structured information from the page content",
"properties": {
"mode": {
"type": "string",
"enum": ["llm-extraction"],
"description": "The extraction mode to use, currently supports 'llm-extraction'"
},
"extractionPrompt": {
"type": "string",
"description": "A prompt describing what information to extract from the page"
},
"extractionSchema": {
"type": "object",
"additionalProperties": true,
"description": "The schema for the data to be extracted",
"required": [
"company_mission",
"supports_sso",
"is_open_source"
]
}
}
},
"timeout": {
"type": "integer",
"description": "Timeout in milliseconds for the request",
"default": 30000
} }
}, },
"required": ["url"] "required": ["url"]
@ -126,6 +161,16 @@
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.", "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
"default": false "default": false
}, },
"maxDepth": {
"type": "integer",
"description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on."
},
"mode": {
"type": "string",
"enum": ["default", "fast"],
"description": "The crawling mode to use. Fast mode crawls websites without a sitemap 4x faster, but may not be as accurate and shouldn't be used on heavily JS-rendered websites.",
"default": "default"
},
"limit": { "limit": {
"type": "integer", "type": "integer",
"description": "Maximum number of pages to crawl", "description": "Maximum number of pages to crawl",
@ -140,6 +185,11 @@
"type": "boolean", "type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.", "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false "default": false
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
} }
} }
} }
@ -206,6 +256,11 @@
"type": "boolean", "type": "boolean",
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.", "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
"default": true "default": true
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
} }
} }
}, },
@ -302,6 +357,63 @@
"$ref": "#/components/schemas/ScrapeResponse" "$ref": "#/components/schemas/ScrapeResponse"
}, },
"description": "Data returned from the job (null when it is in progress)" "description": "Data returned from the job (null when it is in progress)"
},
"partial_data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ScrapeResponse"
},
"description": "Partial documents returned while the website is being crawled (streaming). When a page is ready it is appended to the partial_data array, so there is no need to wait for the whole website to be crawled."
}
}
}
}
}
},
"402": {
"description": "Payment required"
},
"429": {
"description": "Too many requests"
},
"500": {
"description": "Server error"
}
}
}
},
"/crawl/cancel/{jobId}": {
"delete": {
"tags": ["Crawl"],
"summary": "Cancel a crawl job",
"operationId": "cancelCrawlJob",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"status": {
"type": "string",
"description": "Returns cancelled."
} }
} }
} }
@ -344,6 +456,11 @@
"content": { "content": {
"type": "string" "type": "string"
}, },
"html": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeHtml` is true"
},
"metadata": { "metadata": {
"type": "object", "type": "object",
"properties": { "properties": {

View File

@ -183,7 +183,7 @@ describe("Scraping/Crawling Checkup (E2E)", () => {
} }
expect(score).toBeGreaterThanOrEqual(75); expect(score).toBeGreaterThanOrEqual(70);
}, 350000); // 350 seconds timeout }, 350000); // 350 seconds timeout
}); });
}); });