From ddf9ff9c9acc9a6d9bc5003b95eafe3c54f25d2c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 11:46:06 -0700 Subject: [PATCH] Nick: --- apps/api/requests.http | 11 +++++++---- apps/api/src/main/runWebScraper.ts | 3 ++- apps/api/src/scraper/WebScraper/index.ts | 9 +++++++-- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index 2350136..f8d87c2 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -13,12 +13,15 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1 ### Scrape Website -POST https://api.firecrawl.dev/v0/scrape HTTP/1.1 +POST http://localhost:3002/v0/crawl HTTP/1.1 Authorization: Bearer content-type: application/json { - "url":"https://www.mendable.ai" + "url":"https://www.mendable.ai", + "crawlerOptions": { + "returnOnlyUrls": true + } } @@ -34,7 +37,7 @@ content-type: application/json ### Check Job Status -GET http://localhost:3002/v0/crawl/status/333ab225-dc3e-418b-9d4b-8fb833cbaf89 HTTP/1.1 +GET http://localhost:3002/v0/crawl/status/4dbf2b62-487d-45d7-a4f7-8f5e883dfecd HTTP/1.1 Authorization: Bearer ### Get Job Result @@ -48,5 +51,5 @@ content-type: application/json } ### Check Job Status -GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66 +GET https://api.firecrawl.dev/v0/crawl/status/abd12f69-06b2-4378-8753-118b811df59d Authorization: Bearer \ No newline at end of file diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index c43b1b3..1cc5ab0 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -66,6 +66,7 @@ export async function runWebScraper({ inProgress(progress); })) as CrawlResult[]; + if (docs.length === 0) { return { success: true, @@ -75,7 +76,7 @@ export async function runWebScraper({ } // remove docs with empty content - const filteredDocs = docs.filter((doc) => doc.content.trim().length > 0); + const filteredDocs = crawlerOptions.returnOnlyUrls ? docs : docs.filter((doc) => doc.content.trim().length > 0); onSuccess(filteredDocs); const { success, credit_usage } = await billTeam( diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index c2146be..47d18e8 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -80,11 +80,16 @@ export class WebScraperDataProvider { }); let links = await crawler.start(inProgress, 5, this.limit); if (this.returnOnlyUrls) { + inProgress({ + current: links.length, + total: links.length, + status: "COMPLETED", + currentDocumentUrl: this.urls[0], + }); return links.map((url) => ({ content: "", + markdown: "", metadata: { sourceURL: url }, - provider: "web", - type: "text", })); }