From ddf9ff9c9acc9a6d9bc5003b95eafe3c54f25d2c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 11:46:06 -0700 Subject: [PATCH 1/4] Nick: --- apps/api/requests.http | 11 +++++++---- apps/api/src/main/runWebScraper.ts | 3 ++- apps/api/src/scraper/WebScraper/index.ts | 9 +++++++-- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index 2350136..f8d87c2 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -13,12 +13,15 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1 ### Scrape Website -POST https://api.firecrawl.dev/v0/scrape HTTP/1.1 +POST http://localhost:3002/v0/crawl HTTP/1.1 Authorization: Bearer content-type: application/json { - "url":"https://www.mendable.ai" + "url":"https://www.mendable.ai", + "crawlerOptions": { + "returnOnlyUrls": true + } } @@ -34,7 +37,7 @@ content-type: application/json ### Check Job Status -GET http://localhost:3002/v0/crawl/status/333ab225-dc3e-418b-9d4b-8fb833cbaf89 HTTP/1.1 +GET http://localhost:3002/v0/crawl/status/4dbf2b62-487d-45d7-a4f7-8f5e883dfecd HTTP/1.1 Authorization: Bearer ### Get Job Result @@ -48,5 +51,5 @@ content-type: application/json } ### Check Job Status -GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66 +GET https://api.firecrawl.dev/v0/crawl/status/abd12f69-06b2-4378-8753-118b811df59d Authorization: Bearer \ No newline at end of file diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index c43b1b3..1cc5ab0 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -66,6 +66,7 @@ export async function runWebScraper({ inProgress(progress); })) as CrawlResult[]; + if (docs.length === 0) { return { success: true, @@ -75,7 +76,7 @@ export async function runWebScraper({ } // remove docs with empty content - const filteredDocs = docs.filter((doc) => doc.content.trim().length > 0); + const filteredDocs = crawlerOptions.returnOnlyUrls ? docs : docs.filter((doc) => doc.content.trim().length > 0); onSuccess(filteredDocs); const { success, credit_usage } = await billTeam( diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index c2146be..47d18e8 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -80,11 +80,16 @@ export class WebScraperDataProvider { }); let links = await crawler.start(inProgress, 5, this.limit); if (this.returnOnlyUrls) { + inProgress({ + current: links.length, + total: links.length, + status: "COMPLETED", + currentDocumentUrl: this.urls[0], + }); return links.map((url) => ({ content: "", + markdown: "", metadata: { sourceURL: url }, - provider: "web", - type: "text", })); } From 1a3aa2999d2d88ff8ff8034d22b3a5bcbc39c295 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 11:59:42 -0700 Subject: [PATCH 2/4] Nick: return the only list of urls --- apps/api/src/lib/entities.ts | 4 ++++ apps/api/src/main/runWebScraper.ts | 23 +++++++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index e261dd4..ac2d731 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -28,6 +28,10 @@ export type WebScraperOptions = { concurrentRequests?: number; }; +export interface DocumentUrl { + url: string; +} + export class Document { id?: string; content: string; diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 1cc5ab0..23dd55b 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -1,8 +1,9 @@ import { Job } from "bull"; import { CrawlResult, WebScraperOptions } from "../types"; import { WebScraperDataProvider } from "../scraper/WebScraper"; -import { Progress } from "../lib/entities"; +import { DocumentUrl, Progress } from "../lib/entities"; import { billTeam } from "../services/billing/credit_billing"; +import { Document } from "../lib/entities"; export async function startWebScraperPipeline({ job, @@ -44,7 +45,11 @@ export async function runWebScraper({ onSuccess: (result: any) => void; onError: (error: any) => void; team_id: string; -}): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> { +}): Promise<{ + success: boolean; + message: string; + docs: Document[] | DocumentUrl[]; +}> { try { const provider = new WebScraperDataProvider(); if (mode === "crawl") { @@ -64,8 +69,7 @@ export async function runWebScraper({ } const docs = (await provider.getDocuments(false, (progress: Progress) => { inProgress(progress); - })) as CrawlResult[]; - + })) as Document[]; if (docs.length === 0) { return { @@ -76,7 +80,14 @@ export async function runWebScraper({ } // remove docs with empty content - const filteredDocs = crawlerOptions.returnOnlyUrls ? docs : docs.filter((doc) => doc.content.trim().length > 0); + const filteredDocs = crawlerOptions.returnOnlyUrls + ? docs.map((doc) => { + if (doc.metadata.sourceURL) { + return { url: doc.metadata.sourceURL }; + } + }) + : docs.filter((doc) => doc.content.trim().length > 0); + onSuccess(filteredDocs); const { success, credit_usage } = await billTeam( @@ -92,7 +103,7 @@ export async function runWebScraper({ }; } - return { success: true, message: "", docs: filteredDocs as CrawlResult[] }; + return { success: true, message: "", docs: filteredDocs }; } catch (error) { console.error("Error running web scraper", error); onError(error); From 3b5b868d0da4a55afa9e50f3b34dc7d02d4f3a16 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 23 Apr 2024 18:13:58 -0700 Subject: [PATCH 3/4] Update requests.http --- apps/api/requests.http | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index 9a972de..1dbeaeb 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -14,7 +14,7 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1 ### Scrape Website POST http://localhost:3002/v0/crawl HTTP/1.1 -Authorization: Bearer +Authorization: Bearer fc-879f515fdd5b418b8d55ec6ccb1acd46 content-type: application/json { @@ -25,6 +25,10 @@ content-type: application/json } + + + + ### Scrape Website POST http://localhost:3002/v0/scrape HTTP/1.1 Authorization: Bearer @@ -37,8 +41,8 @@ content-type: application/json ### Check Job Status -GET http://localhost:3002/v0/crawl/status/4dbf2b62-487d-45d7-a4f7-8f5e883dfecd HTTP/1.1 -Authorization: Bearer +GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1 +Authorization: Bearer fc-879f515fdd5b418b8d55ec6ccb1acd46 ### Get Job Result From 07e93ee5fd5bee4cb7d54f825bccd5cd1574a7ae Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 24 Apr 2024 10:32:35 -0300 Subject: [PATCH 4/4] Update requests.http --- apps/api/requests.http | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index 1dbeaeb..495df97 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -14,7 +14,7 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1 ### Scrape Website POST http://localhost:3002/v0/crawl HTTP/1.1 -Authorization: Bearer fc-879f515fdd5b418b8d55ec6ccb1acd46 +Authorization: Bearer content-type: application/json { @@ -29,6 +29,8 @@ content-type: application/json + + ### Scrape Website POST http://localhost:3002/v0/scrape HTTP/1.1 Authorization: Bearer @@ -42,7 +44,7 @@ content-type: application/json ### Check Job Status GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1 -Authorization: Bearer fc-879f515fdd5b418b8d55ec6ccb1acd46 +Authorization: Bearer ### Get Job Result