commit ddf9ff9c9a
parent 39dca60241
Author: Nicolas
Date:   2024-04-20 11:46:06 -07:00

3 changed files with 16 additions and 7 deletions


@@ -13,12 +13,15 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1
 ### Scrape Website
-POST https://api.firecrawl.dev/v0/scrape HTTP/1.1
+POST http://localhost:3002/v0/crawl HTTP/1.1
 Authorization: Bearer
 content-type: application/json
 
 {
-    "url":"https://www.mendable.ai"
+    "url":"https://www.mendable.ai",
+    "crawlerOptions": {
+        "returnOnlyUrls": true
+    }
 }
@@ -34,7 +37,7 @@ content-type: application/json
 ### Check Job Status
-GET http://localhost:3002/v0/crawl/status/333ab225-dc3e-418b-9d4b-8fb833cbaf89 HTTP/1.1
+GET http://localhost:3002/v0/crawl/status/4dbf2b62-487d-45d7-a4f7-8f5e883dfecd HTTP/1.1
 Authorization: Bearer
 
 ### Get Job Result
@@ -48,5 +51,5 @@ content-type: application/json
 }
 
 ### Check Job Status
-GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66
+GET https://api.firecrawl.dev/v0/crawl/status/abd12f69-06b2-4378-8753-118b811df59d
 Authorization: Bearer
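
Taken together, the request-file changes exercise the new returnOnlyUrls path end to end: start a crawl that returns only discovered links, then poll the job status. Below is a minimal TypeScript sketch of that flow. It assumes POST /v0/crawl responds with { jobId } and the status endpoint with { status, data } — field names inferred from the requests above, not confirmed by this diff.

// Sketch only: BASE, TOKEN, and the response shapes are assumptions.
const BASE = "http://localhost:3002";
const TOKEN = "<api-key>";

async function crawlUrlsOnly(url: string): Promise<string[]> {
  // Start a crawl that returns discovered URLs instead of scraped content.
  const res = await fetch(`${BASE}/v0/crawl`, {
    method: "POST",
    headers: { Authorization: `Bearer ${TOKEN}`, "content-type": "application/json" },
    body: JSON.stringify({ url, crawlerOptions: { returnOnlyUrls: true } }),
  });
  const { jobId } = (await res.json()) as { jobId: string };

  // Poll the status endpoint until the job reports completion.
  for (;;) {
    const r = await fetch(`${BASE}/v0/crawl/status/${jobId}`, {
      headers: { Authorization: `Bearer ${TOKEN}` },
    });
    const job = (await r.json()) as {
      status: string;
      data?: { metadata: { sourceURL: string } }[];
    };
    if (job.status === "completed" && job.data) {
      // URL-only documents carry the link in metadata.sourceURL.
      return job.data.map((doc) => doc.metadata.sourceURL);
    }
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }
}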


@@ -66,6 +66,7 @@ export async function runWebScraper({
     inProgress(progress);
   })) as CrawlResult[];
+
   if (docs.length === 0) {
     return {
       success: true,
@@ -75,7 +76,7 @@ export async function runWebScraper({
   }
 
   // remove docs with empty content
-  const filteredDocs = docs.filter((doc) => doc.content.trim().length > 0);
+  const filteredDocs = crawlerOptions.returnOnlyUrls ? docs : docs.filter((doc) => doc.content.trim().length > 0);
 
   onSuccess(filteredDocs);
   const { success, credit_usage } = await billTeam(
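
The guard above is the crux of the fix: URL-only documents have empty content by construction, so the old unconditional filter discarded every result of a returnOnlyUrls crawl. A tiny illustrative snippet (hypothetical data, not from the codebase):

// Each URL-only document deliberately has content: "".
const docs = [
  { content: "", metadata: { sourceURL: "https://www.mendable.ai" } },
  { content: "", metadata: { sourceURL: "https://www.mendable.ai/blog" } },
];

// Old behavior: the empty-content filter drops everything.
const oldResult = docs.filter((doc) => doc.content.trim().length > 0); // []

// New behavior: skip the filter when only URLs were requested.
const returnOnlyUrls = true;
const filteredDocs = returnOnlyUrls ? docs : oldResult; // both documents survive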


@@ -80,11 +80,16 @@ export class WebScraperDataProvider {
       });
       let links = await crawler.start(inProgress, 5, this.limit);
       if (this.returnOnlyUrls) {
+        inProgress({
+          current: links.length,
+          total: links.length,
+          status: "COMPLETED",
+          currentDocumentUrl: this.urls[0],
+        });
         return links.map((url) => ({
           content: "",
+          markdown: "",
           metadata: { sourceURL: url },
-          provider: "web",
-          type: "text",
         }));
       }
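
The other half of the fix is the inProgress call added before the early return: it emits a terminal progress update, so callers polling a returnOnlyUrls crawl see a COMPLETED state instead of a job that never appears to finish. A hedged sketch of the callback's payload and a consumer — the real Progress type lives elsewhere in the codebase and may differ:

// Assumed shape of the progress payload, based only on the fields used above.
type Progress = {
  current: number;
  total: number;
  status: string; // e.g. "SCRAPING" | "COMPLETED"
  currentDocumentUrl?: string;
};

// A consumer wired into the crawl now observes completion even when
// no documents are scraped and only links are returned.
const inProgress = (progress: Progress): void => {
  if (progress.status === "COMPLETED") {
    console.log(`Crawl finished: ${progress.current}/${progress.total} links found`);
  }
};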