From 890bde686f5bb7e94137a2a5b5aa51f1d999994d Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 19 Apr 2024 19:10:05 -0300 Subject: [PATCH 01/10] added type declarations --- apps/js-sdk/firecrawl/package.json | 3 +- apps/js-sdk/firecrawl/tsconfig.json | 8 +- apps/js-sdk/firecrawl/types/index.d.ts | 107 +++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 4 deletions(-) create mode 100644 apps/js-sdk/firecrawl/types/index.d.ts diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 58aa5ac..811f87f 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,8 +1,9 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.10", + "version": "0.0.11", "description": "JavaScript SDK for Firecrawl API", "main": "build/index.js", + "types": "types/index.d.ts", "type": "module", "scripts": { "test": "echo \"Error: no test specified\" && exit 1" diff --git a/apps/js-sdk/firecrawl/tsconfig.json b/apps/js-sdk/firecrawl/tsconfig.json index 5bca86d..d7764a4 100644 --- a/apps/js-sdk/firecrawl/tsconfig.json +++ b/apps/js-sdk/firecrawl/tsconfig.json @@ -49,7 +49,7 @@ // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */ /* Emit */ - // "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ + "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ // "declarationMap": true, /* Create sourcemaps for d.ts files. */ // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */ // "sourceMap": true, /* Create source map files for emitted JavaScript files. */ @@ -70,7 +70,7 @@ // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */ // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */ // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */ - // "declarationDir": "./", /* Specify the output directory for generated declaration files. */ + "declarationDir": "./types", /* Specify the output directory for generated declaration files. */ // "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */ /* Interop Constraints */ @@ -105,5 +105,7 @@ /* Completeness */ // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ "skipLibCheck": true /* Skip type checking all .d.ts files. */ - } + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist", "**/__tests__/*"] } diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts new file mode 100644 index 0000000..a9d04ba --- /dev/null +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -0,0 +1,107 @@ +import { AxiosResponse, AxiosRequestHeaders } from 'axios'; +/** + * Configuration interface for FirecrawlApp. + */ +export interface FirecrawlAppConfig { + apiKey?: string | null; +} +/** + * Generic parameter interface. + */ +export interface Params { + [key: string]: any; +} +/** + * Response interface for scraping operations. + */ +export interface ScrapeResponse { + success: boolean; + data?: any; + error?: string; +} +/** + * Response interface for crawling operations. + */ +export interface CrawlResponse { + success: boolean; + jobId?: string; + data?: any; + error?: string; +} +/** + * Response interface for job status checks. + */ +export interface JobStatusResponse { + success: boolean; + status: string; + jobId?: string; + data?: any; + error?: string; +} +/** + * Main class for interacting with the Firecrawl API. + */ +export default class FirecrawlApp { + private apiKey; + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ + constructor({ apiKey }: FirecrawlAppConfig); + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ + scrapeUrl(url: string, params?: Params | null): Promise; + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The response from the crawl operation. + */ + crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, timeout?: number): Promise; + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + checkCrawlStatus(jobId: string): Promise; + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ + prepareHeaders(): AxiosRequestHeaders; + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ + postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise; + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ + getRequest(url: string, headers: AxiosRequestHeaders): Promise; + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ + monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, timeout: number): Promise; + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ + handleError(response: AxiosResponse, action: string): void; +} From 6aa3cc3ce85c0d71fe6e0ae0e6f92fb007f04431 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 13:53:11 -0700 Subject: [PATCH 02/10] Nick: --- apps/api/src/main/runWebScraper.ts | 12 ++++++--- apps/api/src/services/logging/log_job.ts | 33 ++++++++++++++++++++++++ apps/api/src/services/queue-worker.ts | 19 +++++++++++++- apps/api/src/types.ts | 14 ++++++++++ 4 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 apps/api/src/services/logging/log_job.ts diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index c43b1b3..0f562a0 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -3,7 +3,7 @@ import { CrawlResult, WebScraperOptions } from "../types"; import { WebScraperDataProvider } from "../scraper/WebScraper"; import { Progress } from "../lib/entities"; import { billTeam } from "../services/billing/credit_billing"; - +import { Document } from "../lib/entities"; export async function startWebScraperPipeline({ job, }: { @@ -24,7 +24,7 @@ export async function startWebScraperPipeline({ job.moveToFailed(error); }, team_id: job.data.team_id, - })) as { success: boolean; message: string; docs: CrawlResult[] }; + })) as { success: boolean; message: string; docs: Document[] }; } export async function runWebScraper({ url, @@ -76,12 +76,12 @@ export async function runWebScraper({ // remove docs with empty content const filteredDocs = docs.filter((doc) => doc.content.trim().length > 0); - onSuccess(filteredDocs); const { success, credit_usage } = await billTeam( team_id, filteredDocs.length ); + if (!success) { // throw new Error("Failed to bill team, no subscription was found"); return { @@ -91,7 +91,11 @@ export async function runWebScraper({ }; } - return { success: true, message: "", docs: filteredDocs as CrawlResult[] }; + // This is where the returnvalue from the job is set + onSuccess(filteredDocs); + + // this return doesn't matter too much for the job completion result + return { success: true, message: "", docs: filteredDocs }; } catch (error) { console.error("Error running web scraper", error); onError(error); diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts new file mode 100644 index 0000000..cb7e648 --- /dev/null +++ b/apps/api/src/services/logging/log_job.ts @@ -0,0 +1,33 @@ +import { supabase_service } from "../supabase"; +import { FirecrawlJob } from "../../types"; +import "dotenv/config"; + +export async function logJob(job: FirecrawlJob) { + try { + // Only log jobs in production + if (process.env.ENV !== "production") { + return; + } + const { data, error } = await supabase_service + .from("firecrawl_jobs") + .insert([ + { + success: job.success, + message: job.message, + num_docs: job.num_docs, + docs: job.docs, + time_taken: job.time_taken, + team_id: job.team_id, + mode: job.mode, + url: job.url, + crawler_options: job.crawlerOptions, + page_options: job.pageOptions, + }, + ]); + if (error) { + console.error("Error logging job:\n", error); + } + } catch (error) { + console.error("Error logging job:\n", error); + } +} diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index c9c5f73..d436401 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -4,6 +4,7 @@ import "dotenv/config"; import { logtail } from "./logtail"; import { startWebScraperPipeline } from "../main/runWebScraper"; import { callWebhook } from "./webhook"; +import { logJob } from "./logging/log_job"; getWebScraperQueue().process( Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)), @@ -15,8 +16,11 @@ getWebScraperQueue().process( current_step: "SCRAPING", current_url: "", }); + const start = Date.now(); const { success, message, docs } = await startWebScraperPipeline({ job }); - + const end = Date.now(); + const timeTakenInSeconds = (end - start) / 1000; + const data = { success: success, result: { @@ -29,6 +33,19 @@ getWebScraperQueue().process( }; await callWebhook(job.data.team_id, data); + + await logJob({ + success: success, + message: message, + num_docs: docs.length, + docs: docs, + time_taken: timeTakenInSeconds, + team_id: job.data.team_id, + mode: "crawl", + url: job.data.url, + crawlerOptions: job.data.crawlerOptions, + pageOptions: job.data.pageOptions, + }); done(null, data); } catch (error) { if (error instanceof CustomError) { diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 2123e0c..7803d93 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -25,4 +25,18 @@ export interface WebScraperOptions { } +export interface FirecrawlJob { + success: boolean; + message: string; + num_docs: number; + docs: any[]; + time_taken: number; + team_id: string; + mode: string; + url: string; + crawlerOptions?: any; + pageOptions?: any; +} + + From 408c7a479f62dd0a50c72481c524a6a18d95432f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 14:02:22 -0700 Subject: [PATCH 03/10] Nick: rate limit fixes --- apps/api/src/index.ts | 16 +++++++++------- apps/api/src/services/rate-limiter.ts | 19 +++++++++++++++++-- apps/api/src/types.ts | 8 ++++++++ 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 98be945..fcd26b7 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -9,6 +9,7 @@ import { WebScraperDataProvider } from "./scraper/WebScraper"; import { billTeam, checkTeamCredits } from "./services/billing/credit_billing"; import { getRateLimiter, redisClient } from "./services/rate-limiter"; import { parseApi } from "./lib/parseApi"; +import { RateLimiterMode } from "./types"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -46,7 +47,7 @@ app.get("/test", async (req, res) => { res.send("Hello, world!"); }); -async function authenticateUser(req, res, mode?: string): Promise<{ success: boolean, team_id?: string, error?: string, status?: number }> { +async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<{ success: boolean, team_id?: string, error?: string, status?: number }> { const authHeader = req.headers.authorization; if (!authHeader) { return { success: false, error: "Unauthorized", status: 401 }; @@ -56,12 +57,13 @@ async function authenticateUser(req, res, mode?: string): Promise<{ success: boo return { success: false, error: "Unauthorized: Token missing", status: 401 }; } + + try { const incomingIP = (req.headers["x-forwarded-for"] || req.socket.remoteAddress) as string; const iptoken = incomingIP + token; - await getRateLimiter( - token === "this_is_just_a_preview_token" ? true : false + await getRateLimiter((token === "this_is_just_a_preview_token") ? RateLimiterMode.Preview : mode ).consume(iptoken); } catch (rateLimiterRes) { console.error(rateLimiterRes); @@ -88,7 +90,7 @@ async function authenticateUser(req, res, mode?: string): Promise<{ success: boo app.post("/v0/scrape", async (req, res) => { try { // make sure to authenticate user first, Bearer - const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); + const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Scrape); if (!success) { return res.status(status).json({ error }); } @@ -164,7 +166,7 @@ app.post("/v0/scrape", async (req, res) => { app.post("/v0/crawl", async (req, res) => { try { - const { success, team_id, error, status } = await authenticateUser(req, res, "crawl"); + const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Crawl); if (!success) { return res.status(status).json({ error }); } @@ -230,7 +232,7 @@ app.post("/v0/crawl", async (req, res) => { }); app.post("/v0/crawlWebsitePreview", async (req, res) => { try { - const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); + const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Crawl); if (!success) { return res.status(status).json({ error }); } @@ -259,7 +261,7 @@ app.post("/v0/crawlWebsitePreview", async (req, res) => { app.get("/v0/crawl/status/:jobId", async (req, res) => { try { - const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); + const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.CrawlStatus); if (!success) { return res.status(status).json({ error }); } diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 5812f5d..dcd05da 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -1,5 +1,6 @@ import { RateLimiterRedis } from "rate-limiter-flexible"; import * as redis from "redis"; +import { RateLimiterMode } from "../../src/types"; const MAX_REQUESTS_PER_MINUTE_PREVIEW = 5; const MAX_CRAWLS_PER_MINUTE_STARTER = 2; @@ -8,6 +9,9 @@ const MAX_CRAWLS_PER_MINUTE_SCALE = 20; const MAX_REQUESTS_PER_MINUTE_ACCOUNT = 20; +const MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS = 120; + + export const redisClient = redis.createClient({ @@ -29,6 +33,13 @@ export const serverRateLimiter = new RateLimiterRedis({ duration: 60, // Duration in seconds }); +export const crawlStatusRateLimiter = new RateLimiterRedis({ + storeClient: redisClient, + keyPrefix: "middleware", + points: MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS, + duration: 60, // Duration in seconds +}); + export function crawlRateLimit(plan: string){ if(plan === "standard"){ @@ -56,9 +67,13 @@ export function crawlRateLimit(plan: string){ } -export function getRateLimiter(preview: boolean){ - if(preview){ + + +export function getRateLimiter(mode: RateLimiterMode){ + if(mode === RateLimiterMode.Preview){ return previewRateLimiter; + }else if(mode === RateLimiterMode.CrawlStatus){ + return crawlStatusRateLimiter; }else{ return serverRateLimiter; } diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 2123e0c..9442176 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -26,3 +26,11 @@ export interface WebScraperOptions { +export enum RateLimiterMode { + Crawl = "crawl", + CrawlStatus = "crawl-status", + Scrape = "scrape", + Preview = "preview", +} + + From 43c2e877e7a40add2a20bf86603bd7e27b668249 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 14:05:01 -0700 Subject: [PATCH 04/10] Update index.ts --- apps/api/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index fcd26b7..271d96d 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -232,7 +232,7 @@ app.post("/v0/crawl", async (req, res) => { }); app.post("/v0/crawlWebsitePreview", async (req, res) => { try { - const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Crawl); + const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Preview); if (!success) { return res.status(status).json({ error }); } From 5b3c75b06e3756bfc09a469ee9f029582bbc16c7 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 14:10:29 -0700 Subject: [PATCH 05/10] Nick: --- apps/api/src/index.ts | 2 +- apps/api/src/services/rate-limiter.ts | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 271d96d..0fbd91e 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -70,7 +70,7 @@ async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<{ suc return { success: false, error: "Rate limit exceeded. Too many requests, try again in 1 minute.", status: 429 }; } - if (token === "this_is_just_a_preview_token" && mode === "scrape") { + if (token === "this_is_just_a_preview_token" && (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview)) { return { success: true, team_id: "preview" }; } diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index dcd05da..b1ee562 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -70,11 +70,12 @@ export function crawlRateLimit(plan: string){ export function getRateLimiter(mode: RateLimiterMode){ - if(mode === RateLimiterMode.Preview){ - return previewRateLimiter; - }else if(mode === RateLimiterMode.CrawlStatus){ - return crawlStatusRateLimiter; - }else{ - return serverRateLimiter; + switch(mode) { + case RateLimiterMode.Preview: + return previewRateLimiter; + case RateLimiterMode.CrawlStatus: + return crawlStatusRateLimiter; + default: + return serverRateLimiter; } } From 23b2190e5df0b7559a634b412b97a6a23152eeaa Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 16:38:05 -0700 Subject: [PATCH 06/10] Nick: --- apps/api/jest.config.js | 3 + apps/api/src/controllers/auth.ts | 67 ++++++ apps/api/src/controllers/crawl-status.ts | 36 +++ apps/api/src/controllers/crawl.ts | 77 +++++++ apps/api/src/controllers/crawlPreview.ts | 37 ++++ apps/api/src/controllers/scrape.ts | 104 +++++++++ apps/api/src/controllers/status.ts | 25 +++ apps/api/src/index.ts | 270 +---------------------- apps/api/src/routes/v0.ts | 14 ++ 9 files changed, 369 insertions(+), 264 deletions(-) create mode 100644 apps/api/src/controllers/auth.ts create mode 100644 apps/api/src/controllers/crawl-status.ts create mode 100644 apps/api/src/controllers/crawl.ts create mode 100644 apps/api/src/controllers/crawlPreview.ts create mode 100644 apps/api/src/controllers/scrape.ts create mode 100644 apps/api/src/controllers/status.ts create mode 100644 apps/api/src/routes/v0.ts diff --git a/apps/api/jest.config.js b/apps/api/jest.config.js index c099257..2854452 100644 --- a/apps/api/jest.config.js +++ b/apps/api/jest.config.js @@ -2,4 +2,7 @@ module.exports = { preset: "ts-jest", testEnvironment: "node", setupFiles: ["./jest.setup.js"], + // ignore dist folder root dir + modulePathIgnorePatterns: ["/dist/"], + }; diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts new file mode 100644 index 0000000..76bacbe --- /dev/null +++ b/apps/api/src/controllers/auth.ts @@ -0,0 +1,67 @@ +import { parseApi } from "../../src/lib/parseApi"; +import { getRateLimiter } from "../../src/services/rate-limiter"; +import { RateLimiterMode } from "../../src/types"; +import { supabase_service } from "../../src/services/supabase"; + +export async function authenticateUser( + req, + res, + mode?: RateLimiterMode +): Promise<{ + success: boolean; + team_id?: string; + error?: string; + status?: number; +}> { + const authHeader = req.headers.authorization; + if (!authHeader) { + return { success: false, error: "Unauthorized", status: 401 }; + } + const token = authHeader.split(" ")[1]; // Extract the token from "Bearer " + if (!token) { + return { + success: false, + error: "Unauthorized: Token missing", + status: 401, + }; + } + + try { + const incomingIP = (req.headers["x-forwarded-for"] || + req.socket.remoteAddress) as string; + const iptoken = incomingIP + token; + await getRateLimiter( + token === "this_is_just_a_preview_token" ? RateLimiterMode.Preview : mode + ).consume(iptoken); + } catch (rateLimiterRes) { + console.error(rateLimiterRes); + return { + success: false, + error: "Rate limit exceeded. Too many requests, try again in 1 minute.", + status: 429, + }; + } + + if ( + token === "this_is_just_a_preview_token" && + (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview) + ) { + return { success: true, team_id: "preview" }; + } + + const normalizedApi = parseApi(token); + // make sure api key is valid, based on the api_keys table in supabase + const { data, error } = await supabase_service + .from("api_keys") + .select("*") + .eq("key", normalizedApi); + if (error || !data || data.length === 0) { + return { + success: false, + error: "Unauthorized: Invalid token", + status: 401, + }; + } + + return { success: true, team_id: data[0].team_id }; +} diff --git a/apps/api/src/controllers/crawl-status.ts b/apps/api/src/controllers/crawl-status.ts new file mode 100644 index 0000000..3534cd1 --- /dev/null +++ b/apps/api/src/controllers/crawl-status.ts @@ -0,0 +1,36 @@ +import { Request, Response } from "express"; +import { authenticateUser } from "./auth"; +import { RateLimiterMode } from "../../src/types"; +import { addWebScraperJob } from "../../src/services/queue-jobs"; +import { getWebScraperQueue } from "../../src/services/queue-service"; + +export async function crawlStatusController(req: Request, res: Response) { + try { + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.CrawlStatus + ); + if (!success) { + return res.status(status).json({ error }); + } + const job = await getWebScraperQueue().getJob(req.params.jobId); + if (!job) { + return res.status(404).json({ error: "Job not found" }); + } + + const { current, current_url, total, current_step } = await job.progress(); + res.json({ + status: await job.getState(), + // progress: job.progress(), + current: current, + current_url: current_url, + current_step: current_step, + total: total, + data: job.returnvalue, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts new file mode 100644 index 0000000..2f7f842 --- /dev/null +++ b/apps/api/src/controllers/crawl.ts @@ -0,0 +1,77 @@ +import { Request, Response } from "express"; +import { WebScraperDataProvider } from "../../src/scraper/WebScraper"; +import { billTeam } from "../../src/services/billing/credit_billing"; +import { checkTeamCredits } from "../../src/services/billing/credit_billing"; +import { authenticateUser } from "./auth"; +import { RateLimiterMode } from "../../src/types"; +import { addWebScraperJob } from "../../src/services/queue-jobs"; + +export async function crawlController(req: Request, res: Response) { + try { + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.Crawl + ); + if (!success) { + return res.status(status).json({ error }); + } + + const { success: creditsCheckSuccess, message: creditsCheckMessage } = + await checkTeamCredits(team_id, 1); + if (!creditsCheckSuccess) { + return res.status(402).json({ error: "Insufficient credits" }); + } + + // authenticate on supabase + const url = req.body.url; + if (!url) { + return res.status(400).json({ error: "Url is required" }); + } + const mode = req.body.mode ?? "crawl"; + const crawlerOptions = req.body.crawlerOptions ?? {}; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + + if (mode === "single_urls" && !url.includes(",")) { + try { + const a = new WebScraperDataProvider(); + await a.setOptions({ + mode: "single_urls", + urls: [url], + crawlerOptions: { + returnOnlyUrls: true, + }, + pageOptions: pageOptions, + }); + + const docs = await a.getDocuments(false, (progress) => { + job.progress({ + current: progress.current, + total: progress.total, + current_step: "SCRAPING", + current_url: progress.currentDocumentUrl, + }); + }); + return res.json({ + success: true, + documents: docs, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } + } + const job = await addWebScraperJob({ + url: url, + mode: mode ?? "crawl", // fix for single urls not working + crawlerOptions: { ...crawlerOptions }, + team_id: team_id, + pageOptions: pageOptions, + }); + + res.json({ jobId: job.id }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts new file mode 100644 index 0000000..641468c --- /dev/null +++ b/apps/api/src/controllers/crawlPreview.ts @@ -0,0 +1,37 @@ +import { Request, Response } from "express"; +import { authenticateUser } from "./auth"; +import { RateLimiterMode } from "../../src/types"; +import { addWebScraperJob } from "../../src/services/queue-jobs"; + +export async function crawlPreviewController(req: Request, res: Response) { + try { + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.Preview + ); + if (!success) { + return res.status(status).json({ error }); + } + // authenticate on supabase + const url = req.body.url; + if (!url) { + return res.status(400).json({ error: "Url is required" }); + } + const mode = req.body.mode ?? "crawl"; + const crawlerOptions = req.body.crawlerOptions ?? {}; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const job = await addWebScraperJob({ + url: url, + mode: mode ?? "crawl", // fix for single urls not working + crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 }, + team_id: "preview", + pageOptions: pageOptions, + }); + + res.json({ jobId: job.id }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts new file mode 100644 index 0000000..9173533 --- /dev/null +++ b/apps/api/src/controllers/scrape.ts @@ -0,0 +1,104 @@ +import { Request, Response } from "express"; +import { WebScraperDataProvider } from "../../src/scraper/WebScraper"; +import { billTeam } from "../../src/services/billing/credit_billing"; +import { checkTeamCredits } from "../../src/services/billing/credit_billing"; +import { authenticateUser } from "./auth"; +import { RateLimiterMode } from "../../src/types"; +import { logJob } from "../../src/services/logging/log_job"; +import { Document } from "../../src/lib/entities"; + +export async function scrapeHelper( + req: Request, + team_id: string, + crawlerOptions: any, + pageOptions: any +) : Promise<{ success: boolean; error?: string; data?: Document }> { + const url = req.body.url; + if (!url) { + throw new Error("Url is required"); + } + + const a = new WebScraperDataProvider(); + await a.setOptions({ + mode: "single_urls", + urls: [url], + crawlerOptions: { + ...crawlerOptions, + }, + pageOptions: pageOptions, + }); + + const docs = await a.getDocuments(false); + // make sure doc.content is not empty + const filteredDocs = docs.filter( + (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 + ); + if (filteredDocs.length === 0) { + return { success: true, error: "No pages found" }; + } + const { success, credit_usage } = await billTeam( + team_id, + filteredDocs.length + ); + if (!success) { + return { + success: false, + error: "Failed to bill team. Insufficient credits or subscription not found.", + }; + } + return { + success: true, + data: filteredDocs[0], + }; +} + +export async function scrapeController(req: Request, res: Response) { + try { + // make sure to authenticate user first, Bearer + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.Scrape + ); + if (!success) { + return res.status(status).json({ error }); + } + const crawlerOptions = req.body.crawlerOptions ?? {}; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + + try { + const { success: creditsCheckSuccess, message: creditsCheckMessage } = + await checkTeamCredits(team_id, 1); + if (!creditsCheckSuccess) { + return res.status(402).json({ error: "Insufficient credits" }); + } + } catch (error) { + console.error(error); + return res.status(500).json({ error: "Internal server error" }); + } + + const result = await scrapeHelper( + req, + team_id, + crawlerOptions, + pageOptions + ); + logJob({ + success: result.success, + message: result.error, + num_docs: result.data.length, + docs: result.data, + time_taken: 0, + team_id: team_id, + mode: "scrape", + url: req.body.url, + crawlerOptions: crawlerOptions, + pageOptions: pageOptions, + }); + return res.json(result); + + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/controllers/status.ts b/apps/api/src/controllers/status.ts new file mode 100644 index 0000000..bd1d2ea --- /dev/null +++ b/apps/api/src/controllers/status.ts @@ -0,0 +1,25 @@ +import { Request, Response } from "express"; +import { getWebScraperQueue } from "../../src/services/queue-service"; + +export async function crawlJobStatusPreviewController(req: Request, res: Response) { + try { + const job = await getWebScraperQueue().getJob(req.params.jobId); + if (!job) { + return res.status(404).json({ error: "Job not found" }); + } + + const { current, current_url, total, current_step } = await job.progress(); + res.json({ + status: await job.getState(), + // progress: job.progress(), + current: current, + current_url: current_url, + current_step: current_step, + total: total, + data: job.returnvalue, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 0fbd91e..57a05f2 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -10,6 +10,7 @@ import { billTeam, checkTeamCredits } from "./services/billing/credit_billing"; import { getRateLimiter, redisClient } from "./services/rate-limiter"; import { parseApi } from "./lib/parseApi"; import { RateLimiterMode } from "./types"; +import { v0Router } from "./routes/v0"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -17,7 +18,6 @@ const { ExpressAdapter } = require("@bull-board/express"); export const app = express(); - global.isProduction = process.env.IS_PRODUCTION === "true"; app.use(bodyParser.urlencoded({ extended: true })); @@ -47,267 +47,8 @@ app.get("/test", async (req, res) => { res.send("Hello, world!"); }); -async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<{ success: boolean, team_id?: string, error?: string, status?: number }> { - const authHeader = req.headers.authorization; - if (!authHeader) { - return { success: false, error: "Unauthorized", status: 401 }; - } - const token = authHeader.split(" ")[1]; // Extract the token from "Bearer " - if (!token) { - return { success: false, error: "Unauthorized: Token missing", status: 401 }; - } - - - - try { - const incomingIP = (req.headers["x-forwarded-for"] || - req.socket.remoteAddress) as string; - const iptoken = incomingIP + token; - await getRateLimiter((token === "this_is_just_a_preview_token") ? RateLimiterMode.Preview : mode - ).consume(iptoken); - } catch (rateLimiterRes) { - console.error(rateLimiterRes); - return { success: false, error: "Rate limit exceeded. Too many requests, try again in 1 minute.", status: 429 }; - } - - if (token === "this_is_just_a_preview_token" && (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview)) { - return { success: true, team_id: "preview" }; - } - - const normalizedApi = parseApi(token); - // make sure api key is valid, based on the api_keys table in supabase - const { data, error } = await supabase_service - .from("api_keys") - .select("*") - .eq("key", normalizedApi); - if (error || !data || data.length === 0) { - return { success: false, error: "Unauthorized: Invalid token", status: 401 }; - } - - return { success: true, team_id: data[0].team_id }; -} - -app.post("/v0/scrape", async (req, res) => { - try { - // make sure to authenticate user first, Bearer - const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Scrape); - if (!success) { - return res.status(status).json({ error }); - } - const crawlerOptions = req.body.crawlerOptions ?? {}; - - try { - const { success: creditsCheckSuccess, message: creditsCheckMessage } = - await checkTeamCredits(team_id, 1); - if (!creditsCheckSuccess) { - return res.status(402).json({ error: "Insufficient credits" }); - } - } catch (error) { - console.error(error); - return res.status(500).json({ error: "Internal server error" }); - } - - // authenticate on supabase - const url = req.body.url; - if (!url) { - return res.status(400).json({ error: "Url is required" }); - } - - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; - - try { - const a = new WebScraperDataProvider(); - await a.setOptions({ - mode: "single_urls", - urls: [url], - crawlerOptions: { - ...crawlerOptions, - }, - pageOptions: pageOptions, - }); - - const docs = await a.getDocuments(false); - // make sure doc.content is not empty - const filteredDocs = docs.filter( - (doc: { content?: string }) => - doc.content && doc.content.trim().length > 0 - ); - if (filteredDocs.length === 0) { - return res.status(200).json({ success: true, data: [] }); - } - const { success, credit_usage } = await billTeam( - team_id, - filteredDocs.length - ); - if (!success) { - // throw new Error("Failed to bill team, no subscription was found"); - // return { - // success: false, - // message: "Failed to bill team, no subscription was found", - // docs: [], - // }; - return res - .status(402) - .json({ error: "Failed to bill, no subscription was found" }); - } - return res.json({ - success: true, - data: filteredDocs[0], - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } -}); - -app.post("/v0/crawl", async (req, res) => { - try { - const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Crawl); - if (!success) { - return res.status(status).json({ error }); - } - - const { success: creditsCheckSuccess, message: creditsCheckMessage } = - await checkTeamCredits(team_id, 1); - if (!creditsCheckSuccess) { - return res.status(402).json({ error: "Insufficient credits" }); - } - - // authenticate on supabase - const url = req.body.url; - if (!url) { - return res.status(400).json({ error: "Url is required" }); - } - const mode = req.body.mode ?? "crawl"; - const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; - - if (mode === "single_urls" && !url.includes(",")) { - try { - const a = new WebScraperDataProvider(); - await a.setOptions({ - mode: "single_urls", - urls: [url], - crawlerOptions: { - returnOnlyUrls: true, - }, - pageOptions: pageOptions, - }); - - const docs = await a.getDocuments(false, (progress) => { - job.progress({ - current: progress.current, - total: progress.total, - current_step: "SCRAPING", - current_url: progress.currentDocumentUrl, - }); - }); - return res.json({ - success: true, - documents: docs, - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } - } - const job = await addWebScraperJob({ - url: url, - mode: mode ?? "crawl", // fix for single urls not working - crawlerOptions: { ...crawlerOptions }, - team_id: team_id, - pageOptions: pageOptions, - - }); - - res.json({ jobId: job.id }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } -}); -app.post("/v0/crawlWebsitePreview", async (req, res) => { - try { - const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.Preview); - if (!success) { - return res.status(status).json({ error }); - } - // authenticate on supabase - const url = req.body.url; - if (!url) { - return res.status(400).json({ error: "Url is required" }); - } - const mode = req.body.mode ?? "crawl"; - const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; - const job = await addWebScraperJob({ - url: url, - mode: mode ?? "crawl", // fix for single urls not working - crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 }, - team_id: "preview", - pageOptions: pageOptions, - }); - - res.json({ jobId: job.id }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } -}); - -app.get("/v0/crawl/status/:jobId", async (req, res) => { - try { - const { success, team_id, error, status } = await authenticateUser(req, res, RateLimiterMode.CrawlStatus); - if (!success) { - return res.status(status).json({ error }); - } - const job = await getWebScraperQueue().getJob(req.params.jobId); - if (!job) { - return res.status(404).json({ error: "Job not found" }); - } - - const { current, current_url, total, current_step } = await job.progress(); - res.json({ - status: await job.getState(), - // progress: job.progress(), - current: current, - current_url: current_url, - current_step: current_step, - total: total, - data: job.returnvalue, - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } -}); - -app.get("/v0/checkJobStatus/:jobId", async (req, res) => { - try { - const job = await getWebScraperQueue().getJob(req.params.jobId); - if (!job) { - return res.status(404).json({ error: "Job not found" }); - } - - const { current, current_url, total, current_step } = await job.progress(); - res.json({ - status: await job.getState(), - // progress: job.progress(), - current: current, - current_url: current_url, - current_step: current_step, - total: total, - data: job.returnvalue, - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } -}); +// register router +app.use(v0Router); const DEFAULT_PORT = process.env.PORT ?? 3002; const HOST = process.env.HOST ?? "localhost"; @@ -316,7 +57,9 @@ redisClient.connect(); export function startServer(port = DEFAULT_PORT) { const server = app.listen(Number(port), HOST, () => { console.log(`Server listening on port ${port}`); - console.log(`For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`); + console.log( + `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` + ); console.log(""); console.log("1. Make sure Redis is running on port 6379 by default"); console.log( @@ -353,4 +96,3 @@ app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => { app.get("/is-production", (req, res) => { res.send({ isProduction: global.isProduction }); }); - diff --git a/apps/api/src/routes/v0.ts b/apps/api/src/routes/v0.ts new file mode 100644 index 0000000..023282a --- /dev/null +++ b/apps/api/src/routes/v0.ts @@ -0,0 +1,14 @@ +import express from "express"; +import { crawlController } from "../../src/controllers/crawl"; +import { crawlStatusController } from "../../src/controllers/crawl-status"; +import { scrapeController } from "../../src/controllers/scrape"; +import { crawlPreviewController } from "../../src/controllers/crawlPreview"; +import { crawlJobStatusPreviewController } from "../../src/controllers/status"; + +export const v0Router = express.Router(); + +v0Router.post("/v0/scrape", scrapeController); +v0Router.post("/v0/crawl", crawlController); +v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController); +v0Router.get("/v0/crawl/status/:jobId", crawlStatusController); +v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController); From 5b8aed26dd85a9d5f23e0fd865882dfd5b14a865 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 18:55:39 -0700 Subject: [PATCH 07/10] Update scrape.ts --- apps/api/src/controllers/scrape.ts | 57 +++++++++++++++++------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 9173533..04fe525 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -5,17 +5,22 @@ import { checkTeamCredits } from "../../src/services/billing/credit_billing"; import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../../src/types"; import { logJob } from "../../src/services/logging/log_job"; -import { Document } from "../../src/lib/entities"; +import { Document } from "../../src/lib/entities"; export async function scrapeHelper( req: Request, team_id: string, crawlerOptions: any, pageOptions: any -) : Promise<{ success: boolean; error?: string; data?: Document }> { +): Promise<{ + success: boolean; + error?: string; + data?: Document; + returnCode?: number; +}> { const url = req.body.url; if (!url) { - throw new Error("Url is required"); + return { success: false, error: "Url is required", returnCode: 400 }; } const a = new WebScraperDataProvider(); @@ -34,7 +39,7 @@ export async function scrapeHelper( (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 ); if (filteredDocs.length === 0) { - return { success: true, error: "No pages found" }; + return { success: true, error: "No page found", returnCode: 200 }; } const { success, credit_usage } = await billTeam( team_id, @@ -43,12 +48,15 @@ export async function scrapeHelper( if (!success) { return { success: false, - error: "Failed to bill team. Insufficient credits or subscription not found.", + error: + "Failed to bill team. Insufficient credits or subscription not found.", + returnCode: 402, }; } return { success: true, data: filteredDocs[0], + returnCode: 200, }; } @@ -77,26 +85,25 @@ export async function scrapeController(req: Request, res: Response) { return res.status(500).json({ error: "Internal server error" }); } - const result = await scrapeHelper( - req, - team_id, - crawlerOptions, - pageOptions - ); - logJob({ - success: result.success, - message: result.error, - num_docs: result.data.length, - docs: result.data, - time_taken: 0, - team_id: team_id, - mode: "scrape", - url: req.body.url, - crawlerOptions: crawlerOptions, - pageOptions: pageOptions, - }); - return res.json(result); - + const result = await scrapeHelper( + req, + team_id, + crawlerOptions, + pageOptions + ); + logJob({ + success: result.success, + message: result.error, + num_docs: 1, + docs: [result.data], + time_taken: 0, + team_id: team_id, + mode: "scrape", + url: req.body.url, + crawlerOptions: crawlerOptions, + pageOptions: pageOptions, + }); + return res.json(result); } catch (error) { console.error(error); return res.status(500).json({ error: error.message }); From 4543c57e4e70dfe072c86c01c77f90a4df535979 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 19:04:27 -0700 Subject: [PATCH 08/10] Nick: --- apps/api/.env.local | 1 + apps/api/src/controllers/scrape.ts | 15 +++++++-------- apps/api/src/index.ts | 8 +------- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/apps/api/.env.local b/apps/api/.env.local index f5c625f..6c58f19 100644 --- a/apps/api/.env.local +++ b/apps/api/.env.local @@ -1,3 +1,4 @@ +ENV= NUM_WORKERS_PER_QUEUE=8 PORT= HOST= diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 04fe525..51d14f2 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -1,11 +1,10 @@ import { Request, Response } from "express"; -import { WebScraperDataProvider } from "../../src/scraper/WebScraper"; -import { billTeam } from "../../src/services/billing/credit_billing"; -import { checkTeamCredits } from "../../src/services/billing/credit_billing"; +import { WebScraperDataProvider } from "../scraper/WebScraper"; +import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; import { authenticateUser } from "./auth"; -import { RateLimiterMode } from "../../src/types"; -import { logJob } from "../../src/services/logging/log_job"; -import { Document } from "../../src/lib/entities"; +import { RateLimiterMode } from "../types"; +import { logJob } from "../services/logging/log_job"; +import { Document } from "../lib/entities"; export async function scrapeHelper( req: Request, @@ -16,7 +15,7 @@ export async function scrapeHelper( success: boolean; error?: string; data?: Document; - returnCode?: number; + returnCode: number; }> { const url = req.body.url; if (!url) { @@ -103,7 +102,7 @@ export async function scrapeController(req: Request, res: Response) { crawlerOptions: crawlerOptions, pageOptions: pageOptions, }); - return res.json(result); + return res.status(result.returnCode).json(result); } catch (error) { console.error(error); return res.status(500).json({ error: error.message }); diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 57a05f2..1a42eb4 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -3,13 +3,7 @@ import bodyParser from "body-parser"; import cors from "cors"; import "dotenv/config"; import { getWebScraperQueue } from "./services/queue-service"; -import { addWebScraperJob } from "./services/queue-jobs"; -import { supabase_service } from "./services/supabase"; -import { WebScraperDataProvider } from "./scraper/WebScraper"; -import { billTeam, checkTeamCredits } from "./services/billing/credit_billing"; -import { getRateLimiter, redisClient } from "./services/rate-limiter"; -import { parseApi } from "./lib/parseApi"; -import { RateLimiterMode } from "./types"; +import { redisClient } from "./services/rate-limiter"; import { v0Router } from "./routes/v0"; const { createBullBoard } = require("@bull-board/api"); From 0db0874b00742e7e7a6439a975501a397da5d6b8 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 19:37:45 -0700 Subject: [PATCH 09/10] Nick: --- apps/api/src/controllers/crawl.ts | 2 ++ apps/api/src/controllers/crawlPreview.ts | 2 ++ apps/api/src/controllers/scrape.ts | 8 ++++++-- apps/api/src/main/runWebScraper.ts | 10 +++++++--- apps/api/src/services/logging/log_job.ts | 3 ++- apps/api/src/services/queue-worker.ts | 6 ++++-- apps/api/src/services/webhook.ts | 9 +++++++-- apps/api/src/types.ts | 2 ++ 8 files changed, 32 insertions(+), 10 deletions(-) diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 2f7f842..17cfa62 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -42,6 +42,7 @@ export async function crawlController(req: Request, res: Response) { returnOnlyUrls: true, }, pageOptions: pageOptions, + }); const docs = await a.getDocuments(false, (progress) => { @@ -67,6 +68,7 @@ export async function crawlController(req: Request, res: Response) { crawlerOptions: { ...crawlerOptions }, team_id: team_id, pageOptions: pageOptions, + origin: req.body.origin ?? "api", }); res.json({ jobId: job.id }); diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index 641468c..3f28ef6 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -21,12 +21,14 @@ export async function crawlPreviewController(req: Request, res: Response) { const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const job = await addWebScraperJob({ url: url, mode: mode ?? "crawl", // fix for single urls not working crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 }, team_id: "preview", pageOptions: pageOptions, + origin: "website-preview", }); res.json({ jobId: job.id }); diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 51d14f2..632fff5 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -72,6 +72,7 @@ export async function scrapeController(req: Request, res: Response) { } const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const origin = req.body.origin ?? "api"; try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = @@ -83,24 +84,27 @@ export async function scrapeController(req: Request, res: Response) { console.error(error); return res.status(500).json({ error: "Internal server error" }); } - + const startTime = new Date().getTime(); const result = await scrapeHelper( req, team_id, crawlerOptions, pageOptions ); + const endTime = new Date().getTime(); + const timeTakenInSeconds = (endTime - startTime) / 1000; logJob({ success: result.success, message: result.error, num_docs: 1, docs: [result.data], - time_taken: 0, + time_taken: timeTakenInSeconds, team_id: team_id, mode: "scrape", url: req.body.url, crawlerOptions: crawlerOptions, pageOptions: pageOptions, + origin: origin, }); return res.status(result.returnCode).json(result); } catch (error) { diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 0f562a0..d943429 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -44,7 +44,11 @@ export async function runWebScraper({ onSuccess: (result: any) => void; onError: (error: any) => void; team_id: string; -}): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> { +}): Promise<{ + success: boolean; + message: string; + docs: CrawlResult[]; +}> { try { const provider = new WebScraperDataProvider(); if (mode === "crawl") { @@ -70,7 +74,7 @@ export async function runWebScraper({ return { success: true, message: "No pages found", - docs: [], + docs: [] }; } @@ -87,7 +91,7 @@ export async function runWebScraper({ return { success: false, message: "Failed to bill team, no subscription was found", - docs: [], + docs: [] }; } diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index cb7e648..639b3a8 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -17,11 +17,12 @@ export async function logJob(job: FirecrawlJob) { num_docs: job.num_docs, docs: job.docs, time_taken: job.time_taken, - team_id: job.team_id, + team_id: job.team_id === "preview" ? null : job.team_id, mode: job.mode, url: job.url, crawler_options: job.crawlerOptions, page_options: job.pageOptions, + origin: job.origin, }, ]); if (error) { diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index d436401..dda876a 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -17,10 +17,11 @@ getWebScraperQueue().process( current_url: "", }); const start = Date.now(); + console.log("Processing job", job.data); const { success, message, docs } = await startWebScraperPipeline({ job }); const end = Date.now(); const timeTakenInSeconds = (end - start) / 1000; - + const data = { success: success, result: { @@ -33,7 +34,7 @@ getWebScraperQueue().process( }; await callWebhook(job.data.team_id, data); - + await logJob({ success: success, message: message, @@ -45,6 +46,7 @@ getWebScraperQueue().process( url: job.data.url, crawlerOptions: job.data.crawlerOptions, pageOptions: job.data.pageOptions, + origin: job.data.origin, }); done(null, data); } catch (error) { diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index a086425..ab1f90e 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -1,6 +1,7 @@ import { supabase_service } from "./supabase"; export const callWebhook = async (teamId: string, data: any) => { + try { const { data: webhooksData, error } = await supabase_service .from('webhooks') .select('url') @@ -37,5 +38,9 @@ export const callWebhook = async (teamId: string, data: any) => { data: dataToSend, error: data.error || undefined, }), - }); -} \ No newline at end of file + }); + } catch (error) { + console.error(`Error sending webhook for team ID: ${teamId}`, error.message); + } +}; + diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index e3fc5dc..f9e5c73 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -22,6 +22,7 @@ export interface WebScraperOptions { crawlerOptions: any; pageOptions: any; team_id: string; + origin?: string; } @@ -36,6 +37,7 @@ export interface FirecrawlJob { url: string; crawlerOptions?: any; pageOptions?: any; + origin: string; } From 9b31e68a7ef64ededa0531bece1fb340e72a9e70 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 20 Apr 2024 19:38:44 -0700 Subject: [PATCH 10/10] Update queue-worker.ts --- apps/api/src/services/queue-worker.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index dda876a..8d7a7bd 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -17,7 +17,7 @@ getWebScraperQueue().process( current_url: "", }); const start = Date.now(); - console.log("Processing job", job.data); + const { success, message, docs } = await startWebScraperPipeline({ job }); const end = Date.now(); const timeTakenInSeconds = (end - start) / 1000;