diff --git a/apps/api/.env.example b/apps/api/.env.example index 392db9a..9a4541c 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -1,6 +1,6 @@ -# Required -NUM_WORKERS_PER_QUEUE=8 -PORT= +ENV= +NUM_WORKERS_PER_QUEUE=8 +PORT= HOST= SUPABASE_ANON_TOKEN= SUPABASE_URL= diff --git a/apps/api/jest.config.js b/apps/api/jest.config.js index c099257..2854452 100644 --- a/apps/api/jest.config.js +++ b/apps/api/jest.config.js @@ -2,4 +2,7 @@ module.exports = { preset: "ts-jest", testEnvironment: "node", setupFiles: ["./jest.setup.js"], + // ignore dist folder root dir + modulePathIgnorePatterns: ["/dist/"], + }; diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts new file mode 100644 index 0000000..76bacbe --- /dev/null +++ b/apps/api/src/controllers/auth.ts @@ -0,0 +1,67 @@ +import { parseApi } from "../../src/lib/parseApi"; +import { getRateLimiter } from "../../src/services/rate-limiter"; +import { RateLimiterMode } from "../../src/types"; +import { supabase_service } from "../../src/services/supabase"; + +export async function authenticateUser( + req, + res, + mode?: RateLimiterMode +): Promise<{ + success: boolean; + team_id?: string; + error?: string; + status?: number; +}> { + const authHeader = req.headers.authorization; + if (!authHeader) { + return { success: false, error: "Unauthorized", status: 401 }; + } + const token = authHeader.split(" ")[1]; // Extract the token from "Bearer " + if (!token) { + return { + success: false, + error: "Unauthorized: Token missing", + status: 401, + }; + } + + try { + const incomingIP = (req.headers["x-forwarded-for"] || + req.socket.remoteAddress) as string; + const iptoken = incomingIP + token; + await getRateLimiter( + token === "this_is_just_a_preview_token" ? RateLimiterMode.Preview : mode + ).consume(iptoken); + } catch (rateLimiterRes) { + console.error(rateLimiterRes); + return { + success: false, + error: "Rate limit exceeded. 
Too many requests, try again in 1 minute.", + status: 429, + }; + } + + if ( + token === "this_is_just_a_preview_token" && + (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview) + ) { + return { success: true, team_id: "preview" }; + } + + const normalizedApi = parseApi(token); + // make sure api key is valid, based on the api_keys table in supabase + const { data, error } = await supabase_service + .from("api_keys") + .select("*") + .eq("key", normalizedApi); + if (error || !data || data.length === 0) { + return { + success: false, + error: "Unauthorized: Invalid token", + status: 401, + }; + } + + return { success: true, team_id: data[0].team_id }; +} diff --git a/apps/api/src/controllers/crawl-status.ts b/apps/api/src/controllers/crawl-status.ts new file mode 100644 index 0000000..3534cd1 --- /dev/null +++ b/apps/api/src/controllers/crawl-status.ts @@ -0,0 +1,36 @@ +import { Request, Response } from "express"; +import { authenticateUser } from "./auth"; +import { RateLimiterMode } from "../../src/types"; +import { addWebScraperJob } from "../../src/services/queue-jobs"; +import { getWebScraperQueue } from "../../src/services/queue-service"; + +export async function crawlStatusController(req: Request, res: Response) { + try { + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.CrawlStatus + ); + if (!success) { + return res.status(status).json({ error }); + } + const job = await getWebScraperQueue().getJob(req.params.jobId); + if (!job) { + return res.status(404).json({ error: "Job not found" }); + } + + const { current, current_url, total, current_step } = await job.progress(); + res.json({ + status: await job.getState(), + // progress: job.progress(), + current: current, + current_url: current_url, + current_step: current_step, + total: total, + data: job.returnvalue, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git 
a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts new file mode 100644 index 0000000..17cfa62 --- /dev/null +++ b/apps/api/src/controllers/crawl.ts @@ -0,0 +1,79 @@ +import { Request, Response } from "express"; +import { WebScraperDataProvider } from "../../src/scraper/WebScraper"; +import { billTeam } from "../../src/services/billing/credit_billing"; +import { checkTeamCredits } from "../../src/services/billing/credit_billing"; +import { authenticateUser } from "./auth"; +import { RateLimiterMode } from "../../src/types"; +import { addWebScraperJob } from "../../src/services/queue-jobs"; + +export async function crawlController(req: Request, res: Response) { + try { + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.Crawl + ); + if (!success) { + return res.status(status).json({ error }); + } + + const { success: creditsCheckSuccess, message: creditsCheckMessage } = + await checkTeamCredits(team_id, 1); + if (!creditsCheckSuccess) { + return res.status(402).json({ error: "Insufficient credits" }); + } + + // authenticate on supabase + const url = req.body.url; + if (!url) { + return res.status(400).json({ error: "Url is required" }); + } + const mode = req.body.mode ?? "crawl"; + const crawlerOptions = req.body.crawlerOptions ?? {}; + const pageOptions = req.body.pageOptions ?? 
{ onlyMainContent: false }; + + if (mode === "single_urls" && !url.includes(",")) { + try { + const a = new WebScraperDataProvider(); + await a.setOptions({ + mode: "single_urls", + urls: [url], + crawlerOptions: { + returnOnlyUrls: true, + }, + pageOptions: pageOptions, + + }); + + const docs = await a.getDocuments(false, (progress) => { + job.progress({ + current: progress.current, + total: progress.total, + current_step: "SCRAPING", + current_url: progress.currentDocumentUrl, + }); + }); + return res.json({ + success: true, + documents: docs, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } + } + const job = await addWebScraperJob({ + url: url, + mode: mode ?? "crawl", // fix for single urls not working + crawlerOptions: { ...crawlerOptions }, + team_id: team_id, + pageOptions: pageOptions, + origin: req.body.origin ?? "api", + }); + + res.json({ jobId: job.id }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts new file mode 100644 index 0000000..3f28ef6 --- /dev/null +++ b/apps/api/src/controllers/crawlPreview.ts @@ -0,0 +1,39 @@ +import { Request, Response } from "express"; +import { authenticateUser } from "./auth"; +import { RateLimiterMode } from "../../src/types"; +import { addWebScraperJob } from "../../src/services/queue-jobs"; + +export async function crawlPreviewController(req: Request, res: Response) { + try { + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.Preview + ); + if (!success) { + return res.status(status).json({ error }); + } + // authenticate on supabase + const url = req.body.url; + if (!url) { + return res.status(400).json({ error: "Url is required" }); + } + const mode = req.body.mode ?? "crawl"; + const crawlerOptions = req.body.crawlerOptions ?? 
{}; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + + const job = await addWebScraperJob({ + url: url, + mode: mode ?? "crawl", // fix for single urls not working + crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 }, + team_id: "preview", + pageOptions: pageOptions, + origin: "website-preview", + }); + + res.json({ jobId: job.id }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts new file mode 100644 index 0000000..632fff5 --- /dev/null +++ b/apps/api/src/controllers/scrape.ts @@ -0,0 +1,114 @@ +import { Request, Response } from "express"; +import { WebScraperDataProvider } from "../scraper/WebScraper"; +import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; +import { authenticateUser } from "./auth"; +import { RateLimiterMode } from "../types"; +import { logJob } from "../services/logging/log_job"; +import { Document } from "../lib/entities"; + +export async function scrapeHelper( + req: Request, + team_id: string, + crawlerOptions: any, + pageOptions: any +): Promise<{ + success: boolean; + error?: string; + data?: Document; + returnCode: number; +}> { + const url = req.body.url; + if (!url) { + return { success: false, error: "Url is required", returnCode: 400 }; + } + + const a = new WebScraperDataProvider(); + await a.setOptions({ + mode: "single_urls", + urls: [url], + crawlerOptions: { + ...crawlerOptions, + }, + pageOptions: pageOptions, + }); + + const docs = await a.getDocuments(false); + // make sure doc.content is not empty + const filteredDocs = docs.filter( + (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 + ); + if (filteredDocs.length === 0) { + return { success: true, error: "No page found", returnCode: 200 }; + } + const { success, credit_usage } = await billTeam( + team_id, + filteredDocs.length + ); + if 
(!success) { + return { + success: false, + error: + "Failed to bill team. Insufficient credits or subscription not found.", + returnCode: 402, + }; + } + return { + success: true, + data: filteredDocs[0], + returnCode: 200, + }; +} + +export async function scrapeController(req: Request, res: Response) { + try { + // make sure to authenticate user first, Bearer + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.Scrape + ); + if (!success) { + return res.status(status).json({ error }); + } + const crawlerOptions = req.body.crawlerOptions ?? {}; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const origin = req.body.origin ?? "api"; + + try { + const { success: creditsCheckSuccess, message: creditsCheckMessage } = + await checkTeamCredits(team_id, 1); + if (!creditsCheckSuccess) { + return res.status(402).json({ error: "Insufficient credits" }); + } + } catch (error) { + console.error(error); + return res.status(500).json({ error: "Internal server error" }); + } + const startTime = new Date().getTime(); + const result = await scrapeHelper( + req, + team_id, + crawlerOptions, + pageOptions + ); + const endTime = new Date().getTime(); + const timeTakenInSeconds = (endTime - startTime) / 1000; + logJob({ + success: result.success, + message: result.error, + num_docs: 1, + docs: [result.data], + time_taken: timeTakenInSeconds, + team_id: team_id, + mode: "scrape", + url: req.body.url, + crawlerOptions: crawlerOptions, + pageOptions: pageOptions, + origin: origin, + }); + return res.status(result.returnCode).json(result); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/controllers/status.ts b/apps/api/src/controllers/status.ts new file mode 100644 index 0000000..bd1d2ea --- /dev/null +++ b/apps/api/src/controllers/status.ts @@ -0,0 +1,25 @@ +import { Request, Response } from "express"; +import { 
getWebScraperQueue } from "../../src/services/queue-service"; + +export async function crawlJobStatusPreviewController(req: Request, res: Response) { + try { + const job = await getWebScraperQueue().getJob(req.params.jobId); + if (!job) { + return res.status(404).json({ error: "Job not found" }); + } + + const { current, current_url, total, current_step } = await job.progress(); + res.json({ + status: await job.getState(), + // progress: job.progress(), + current: current, + current_url: current_url, + current_step: current_step, + total: total, + data: job.returnvalue, + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 98be945..1a42eb4 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -3,12 +3,8 @@ import bodyParser from "body-parser"; import cors from "cors"; import "dotenv/config"; import { getWebScraperQueue } from "./services/queue-service"; -import { addWebScraperJob } from "./services/queue-jobs"; -import { supabase_service } from "./services/supabase"; -import { WebScraperDataProvider } from "./scraper/WebScraper"; -import { billTeam, checkTeamCredits } from "./services/billing/credit_billing"; -import { getRateLimiter, redisClient } from "./services/rate-limiter"; -import { parseApi } from "./lib/parseApi"; +import { redisClient } from "./services/rate-limiter"; +import { v0Router } from "./routes/v0"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -16,7 +12,6 @@ const { ExpressAdapter } = require("@bull-board/express"); export const app = express(); - global.isProduction = process.env.IS_PRODUCTION === "true"; app.use(bodyParser.urlencoded({ extended: true })); @@ -46,266 +41,8 @@ app.get("/test", async (req, res) => { res.send("Hello, world!"); }); -async function authenticateUser(req, res, mode?: string): Promise<{ success: boolean, 
team_id?: string, error?: string, status?: number }> { - const authHeader = req.headers.authorization; - if (!authHeader) { - return { success: false, error: "Unauthorized", status: 401 }; - } - const token = authHeader.split(" ")[1]; // Extract the token from "Bearer " - if (!token) { - return { success: false, error: "Unauthorized: Token missing", status: 401 }; - } - - try { - const incomingIP = (req.headers["x-forwarded-for"] || - req.socket.remoteAddress) as string; - const iptoken = incomingIP + token; - await getRateLimiter( - token === "this_is_just_a_preview_token" ? true : false - ).consume(iptoken); - } catch (rateLimiterRes) { - console.error(rateLimiterRes); - return { success: false, error: "Rate limit exceeded. Too many requests, try again in 1 minute.", status: 429 }; - } - - if (token === "this_is_just_a_preview_token" && mode === "scrape") { - return { success: true, team_id: "preview" }; - } - - const normalizedApi = parseApi(token); - // make sure api key is valid, based on the api_keys table in supabase - const { data, error } = await supabase_service - .from("api_keys") - .select("*") - .eq("key", normalizedApi); - if (error || !data || data.length === 0) { - return { success: false, error: "Unauthorized: Invalid token", status: 401 }; - } - - return { success: true, team_id: data[0].team_id }; -} - -app.post("/v0/scrape", async (req, res) => { - try { - // make sure to authenticate user first, Bearer - const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); - if (!success) { - return res.status(status).json({ error }); - } - const crawlerOptions = req.body.crawlerOptions ?? 
{}; - - try { - const { success: creditsCheckSuccess, message: creditsCheckMessage } = - await checkTeamCredits(team_id, 1); - if (!creditsCheckSuccess) { - return res.status(402).json({ error: "Insufficient credits" }); - } - } catch (error) { - console.error(error); - return res.status(500).json({ error: "Internal server error" }); - } - - // authenticate on supabase - const url = req.body.url; - if (!url) { - return res.status(400).json({ error: "Url is required" }); - } - - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; - - try { - const a = new WebScraperDataProvider(); - await a.setOptions({ - mode: "single_urls", - urls: [url], - crawlerOptions: { - ...crawlerOptions, - }, - pageOptions: pageOptions, - }); - - const docs = await a.getDocuments(false); - // make sure doc.content is not empty - const filteredDocs = docs.filter( - (doc: { content?: string }) => - doc.content && doc.content.trim().length > 0 - ); - if (filteredDocs.length === 0) { - return res.status(200).json({ success: true, data: [] }); - } - const { success, credit_usage } = await billTeam( - team_id, - filteredDocs.length - ); - if (!success) { - // throw new Error("Failed to bill team, no subscription was found"); - // return { - // success: false, - // message: "Failed to bill team, no subscription was found", - // docs: [], - // }; - return res - .status(402) - .json({ error: "Failed to bill, no subscription was found" }); - } - return res.json({ - success: true, - data: filteredDocs[0], - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } -}); - -app.post("/v0/crawl", async (req, res) => { - try { - const { success, team_id, error, status } = await authenticateUser(req, res, "crawl"); - if (!success) { - return res.status(status).json({ error }); - } - - const { success: creditsCheckSuccess, message: 
creditsCheckMessage } = - await checkTeamCredits(team_id, 1); - if (!creditsCheckSuccess) { - return res.status(402).json({ error: "Insufficient credits" }); - } - - // authenticate on supabase - const url = req.body.url; - if (!url) { - return res.status(400).json({ error: "Url is required" }); - } - const mode = req.body.mode ?? "crawl"; - const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; - - if (mode === "single_urls" && !url.includes(",")) { - try { - const a = new WebScraperDataProvider(); - await a.setOptions({ - mode: "single_urls", - urls: [url], - crawlerOptions: { - returnOnlyUrls: true, - }, - pageOptions: pageOptions, - }); - - const docs = await a.getDocuments(false, (progress) => { - job.progress({ - current: progress.current, - total: progress.total, - current_step: "SCRAPING", - current_url: progress.currentDocumentUrl, - }); - }); - return res.json({ - success: true, - documents: docs, - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } - } - const job = await addWebScraperJob({ - url: url, - mode: mode ?? "crawl", // fix for single urls not working - crawlerOptions: { ...crawlerOptions }, - team_id: team_id, - pageOptions: pageOptions, - - }); - - res.json({ jobId: job.id }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } -}); -app.post("/v0/crawlWebsitePreview", async (req, res) => { - try { - const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); - if (!success) { - return res.status(status).json({ error }); - } - // authenticate on supabase - const url = req.body.url; - if (!url) { - return res.status(400).json({ error: "Url is required" }); - } - const mode = req.body.mode ?? "crawl"; - const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? 
{ onlyMainContent: false }; - const job = await addWebScraperJob({ - url: url, - mode: mode ?? "crawl", // fix for single urls not working - crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 }, - team_id: "preview", - pageOptions: pageOptions, - }); - - res.json({ jobId: job.id }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } -}); - -app.get("/v0/crawl/status/:jobId", async (req, res) => { - try { - const { success, team_id, error, status } = await authenticateUser(req, res, "scrape"); - if (!success) { - return res.status(status).json({ error }); - } - const job = await getWebScraperQueue().getJob(req.params.jobId); - if (!job) { - return res.status(404).json({ error: "Job not found" }); - } - - const { current, current_url, total, current_step } = await job.progress(); - res.json({ - status: await job.getState(), - // progress: job.progress(), - current: current, - current_url: current_url, - current_step: current_step, - total: total, - data: job.returnvalue, - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } -}); - -app.get("/v0/checkJobStatus/:jobId", async (req, res) => { - try { - const job = await getWebScraperQueue().getJob(req.params.jobId); - if (!job) { - return res.status(404).json({ error: "Job not found" }); - } - - const { current, current_url, total, current_step } = await job.progress(); - res.json({ - status: await job.getState(), - // progress: job.progress(), - current: current, - current_url: current_url, - current_step: current_step, - total: total, - data: job.returnvalue, - }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } -}); +// register router +app.use(v0Router); const DEFAULT_PORT = process.env.PORT ?? 3002; const HOST = process.env.HOST ?? 
"localhost"; @@ -314,7 +51,9 @@ redisClient.connect(); export function startServer(port = DEFAULT_PORT) { const server = app.listen(Number(port), HOST, () => { console.log(`Server listening on port ${port}`); - console.log(`For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`); + console.log( + `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` + ); console.log(""); console.log("1. Make sure Redis is running on port 6379 by default"); console.log( @@ -351,4 +90,3 @@ app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => { app.get("/is-production", (req, res) => { res.send({ isProduction: global.isProduction }); }); - diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index c43b1b3..d943429 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -3,7 +3,7 @@ import { CrawlResult, WebScraperOptions } from "../types"; import { WebScraperDataProvider } from "../scraper/WebScraper"; import { Progress } from "../lib/entities"; import { billTeam } from "../services/billing/credit_billing"; - +import { Document } from "../lib/entities"; export async function startWebScraperPipeline({ job, }: { @@ -24,7 +24,7 @@ export async function startWebScraperPipeline({ job.moveToFailed(error); }, team_id: job.data.team_id, - })) as { success: boolean; message: string; docs: CrawlResult[] }; + })) as { success: boolean; message: string; docs: Document[] }; } export async function runWebScraper({ url, @@ -44,7 +44,11 @@ export async function runWebScraper({ onSuccess: (result: any) => void; onError: (error: any) => void; team_id: string; -}): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> { +}): Promise<{ + success: boolean; + message: string; + docs: CrawlResult[]; +}> { try { const provider = new WebScraperDataProvider(); if (mode === "crawl") { @@ -70,28 +74,32 @@ export async function runWebScraper({ return { 
success: true, message: "No pages found", - docs: [], + docs: [] }; } // remove docs with empty content const filteredDocs = docs.filter((doc) => doc.content.trim().length > 0); - onSuccess(filteredDocs); const { success, credit_usage } = await billTeam( team_id, filteredDocs.length ); + if (!success) { // throw new Error("Failed to bill team, no subscription was found"); return { success: false, message: "Failed to bill team, no subscription was found", - docs: [], + docs: [] }; } - return { success: true, message: "", docs: filteredDocs as CrawlResult[] }; + // This is where the returnvalue from the job is set + onSuccess(filteredDocs); + + // this return doesn't matter too much for the job completion result + return { success: true, message: "", docs: filteredDocs }; } catch (error) { console.error("Error running web scraper", error); onError(error); diff --git a/apps/api/src/routes/v0.ts b/apps/api/src/routes/v0.ts new file mode 100644 index 0000000..023282a --- /dev/null +++ b/apps/api/src/routes/v0.ts @@ -0,0 +1,14 @@ +import express from "express"; +import { crawlController } from "../../src/controllers/crawl"; +import { crawlStatusController } from "../../src/controllers/crawl-status"; +import { scrapeController } from "../../src/controllers/scrape"; +import { crawlPreviewController } from "../../src/controllers/crawlPreview"; +import { crawlJobStatusPreviewController } from "../../src/controllers/status"; + +export const v0Router = express.Router(); + +v0Router.post("/v0/scrape", scrapeController); +v0Router.post("/v0/crawl", crawlController); +v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController); +v0Router.get("/v0/crawl/status/:jobId", crawlStatusController); +v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController); diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts new file mode 100644 index 0000000..639b3a8 --- /dev/null +++ b/apps/api/src/services/logging/log_job.ts @@ -0,0 
+1,34 @@ +import { supabase_service } from "../supabase"; +import { FirecrawlJob } from "../../types"; +import "dotenv/config"; + +export async function logJob(job: FirecrawlJob) { + try { + // Only log jobs in production + if (process.env.ENV !== "production") { + return; + } + const { data, error } = await supabase_service + .from("firecrawl_jobs") + .insert([ + { + success: job.success, + message: job.message, + num_docs: job.num_docs, + docs: job.docs, + time_taken: job.time_taken, + team_id: job.team_id === "preview" ? null : job.team_id, + mode: job.mode, + url: job.url, + crawler_options: job.crawlerOptions, + page_options: job.pageOptions, + origin: job.origin, + }, + ]); + if (error) { + console.error("Error logging job:\n", error); + } + } catch (error) { + console.error("Error logging job:\n", error); + } +} diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index c9c5f73..8d7a7bd 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -4,6 +4,7 @@ import "dotenv/config"; import { logtail } from "./logtail"; import { startWebScraperPipeline } from "../main/runWebScraper"; import { callWebhook } from "./webhook"; +import { logJob } from "./logging/log_job"; getWebScraperQueue().process( Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 
8)), @@ -15,7 +16,11 @@ getWebScraperQueue().process( current_step: "SCRAPING", current_url: "", }); + const start = Date.now(); + const { success, message, docs } = await startWebScraperPipeline({ job }); + const end = Date.now(); + const timeTakenInSeconds = (end - start) / 1000; const data = { success: success, @@ -29,6 +34,20 @@ getWebScraperQueue().process( }; await callWebhook(job.data.team_id, data); + + await logJob({ + success: success, + message: message, + num_docs: docs.length, + docs: docs, + time_taken: timeTakenInSeconds, + team_id: job.data.team_id, + mode: "crawl", + url: job.data.url, + crawlerOptions: job.data.crawlerOptions, + pageOptions: job.data.pageOptions, + origin: job.data.origin, + }); done(null, data); } catch (error) { if (error instanceof CustomError) { diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 5812f5d..b1ee562 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -1,5 +1,6 @@ import { RateLimiterRedis } from "rate-limiter-flexible"; import * as redis from "redis"; +import { RateLimiterMode } from "../../src/types"; const MAX_REQUESTS_PER_MINUTE_PREVIEW = 5; const MAX_CRAWLS_PER_MINUTE_STARTER = 2; @@ -8,6 +9,9 @@ const MAX_CRAWLS_PER_MINUTE_SCALE = 20; const MAX_REQUESTS_PER_MINUTE_ACCOUNT = 20; +const MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS = 120; + + export const redisClient = redis.createClient({ @@ -29,6 +33,13 @@ export const serverRateLimiter = new RateLimiterRedis({ duration: 60, // Duration in seconds }); +export const crawlStatusRateLimiter = new RateLimiterRedis({ + storeClient: redisClient, + keyPrefix: "middleware", + points: MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS, + duration: 60, // Duration in seconds +}); + export function crawlRateLimit(plan: string){ if(plan === "standard"){ @@ -56,10 +67,15 @@ export function crawlRateLimit(plan: string){ } -export function getRateLimiter(preview: boolean){ - if(preview){ - return 
previewRateLimiter; - }else{ - return serverRateLimiter; + + +export function getRateLimiter(mode: RateLimiterMode){ + switch(mode) { + case RateLimiterMode.Preview: + return previewRateLimiter; + case RateLimiterMode.CrawlStatus: + return crawlStatusRateLimiter; + default: + return serverRateLimiter; } } diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index a086425..ab1f90e 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -1,6 +1,7 @@ import { supabase_service } from "./supabase"; export const callWebhook = async (teamId: string, data: any) => { + try { const { data: webhooksData, error } = await supabase_service .from('webhooks') .select('url') @@ -37,5 +38,9 @@ export const callWebhook = async (teamId: string, data: any) => { data: dataToSend, error: data.error || undefined, }), - }); -} \ No newline at end of file + }); + } catch (error) { + console.error(`Error sending webhook for team ID: ${teamId}`, error.message); + } +}; + diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 2123e0c..f9e5c73 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -22,7 +22,31 @@ export interface WebScraperOptions { crawlerOptions: any; pageOptions: any; team_id: string; + origin?: string; +} + + +export interface FirecrawlJob { + success: boolean; + message: string; + num_docs: number; + docs: any[]; + time_taken: number; + team_id: string; + mode: string; + url: string; + crawlerOptions?: any; + pageOptions?: any; + origin: string; } +export enum RateLimiterMode { + Crawl = "crawl", + CrawlStatus = "crawl-status", + Scrape = "scrape", + Preview = "preview", +} + + diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 58aa5ac..811f87f 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,8 +1,9 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.10", + "version": "0.0.11", "description": "JavaScript 
SDK for Firecrawl API", "main": "build/index.js", + "types": "types/index.d.ts", "type": "module", "scripts": { "test": "echo \"Error: no test specified\" && exit 1" diff --git a/apps/js-sdk/firecrawl/tsconfig.json b/apps/js-sdk/firecrawl/tsconfig.json index 5bca86d..d7764a4 100644 --- a/apps/js-sdk/firecrawl/tsconfig.json +++ b/apps/js-sdk/firecrawl/tsconfig.json @@ -49,7 +49,7 @@ // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */ /* Emit */ - // "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ + "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ // "declarationMap": true, /* Create sourcemaps for d.ts files. */ // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */ // "sourceMap": true, /* Create source map files for emitted JavaScript files. */ @@ -70,7 +70,7 @@ // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */ // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */ // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */ - // "declarationDir": "./", /* Specify the output directory for generated declaration files. */ + "declarationDir": "./types", /* Specify the output directory for generated declaration files. */ // "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */ /* Interop Constraints */ @@ -105,5 +105,7 @@ /* Completeness */ // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ "skipLibCheck": true /* Skip type checking all .d.ts files. 
*/ - } + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist", "**/__tests__/*"] } diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts new file mode 100644 index 0000000..a9d04ba --- /dev/null +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -0,0 +1,107 @@ +import { AxiosResponse, AxiosRequestHeaders } from 'axios'; +/** + * Configuration interface for FirecrawlApp. + */ +export interface FirecrawlAppConfig { + apiKey?: string | null; +} +/** + * Generic parameter interface. + */ +export interface Params { + [key: string]: any; +} +/** + * Response interface for scraping operations. + */ +export interface ScrapeResponse { + success: boolean; + data?: any; + error?: string; +} +/** + * Response interface for crawling operations. + */ +export interface CrawlResponse { + success: boolean; + jobId?: string; + data?: any; + error?: string; +} +/** + * Response interface for job status checks. + */ +export interface JobStatusResponse { + success: boolean; + status: string; + jobId?: string; + data?: any; + error?: string; +} +/** + * Main class for interacting with the Firecrawl API. + */ +export default class FirecrawlApp { + private apiKey; + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ + constructor({ apiKey }: FirecrawlAppConfig); + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise<ScrapeResponse>} The response from the scrape operation. + */ + scrapeUrl(url: string, params?: Params | null): Promise<ScrapeResponse>; + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. 
+ * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise<CrawlResponse | any>} The response from the crawl operation. + */ + crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, timeout?: number): Promise<CrawlResponse | any>; + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise<JobStatusResponse>} The response containing the job status. + */ + checkCrawlStatus(jobId: string): Promise<JobStatusResponse>; + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ + prepareHeaders(): AxiosRequestHeaders; + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise<AxiosResponse>} The response from the POST request. + */ + postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse>; + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise<AxiosResponse>} The response from the GET request. + */ + getRequest(url: string, headers: AxiosRequestHeaders): Promise<AxiosResponse>; + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise<any>} The final job status or data. + */ + monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, timeout: number): Promise<any>; + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. 
+ */ + handleError(response: AxiosResponse, action: string): void; +}