diff --git a/apps/api/fly.toml b/apps/api/fly.toml
index 6bc8266..468695d 100644
--- a/apps/api/fly.toml
+++ b/apps/api/fly.toml
@@ -54,7 +54,7 @@ kill_timeout = '5s'
 
     soft_limit = 12
 
 [[vm]]
-  size = 'performance-8x'
+  size = 'performance-4x'
   processes = ['app']
 
diff --git a/apps/api/openapi.json b/apps/api/openapi.json
index a755e37..b07e43f 100644
--- a/apps/api/openapi.json
+++ b/apps/api/openapi.json
@@ -61,6 +61,13 @@
           "description": "Wait x amount of milliseconds for the page to load to fetch content",
           "default": 0
         },
+        "removeTags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+        },
         "headers": {
           "type": "object",
           "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
@@ -194,6 +201,11 @@
           "type": "integer",
           "description": "Maximum number of pages to crawl",
           "default": 10000
+        },
+        "allowBackwardCrawling": {
+          "type": "boolean",
+          "description": "Allow backward crawling (crawl from the base URL to the previous URLs)",
+          "default": false
         }
       }
     },
@@ -219,6 +231,13 @@
           "type": "object",
           "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
         },
+        "removeTags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+        },
         "replaceAllPathsWithAbsolutePaths": {
           "type": "boolean",
           "description": "Replace all relative paths with absolute paths for images and links",
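The removeTags schema above is added twice so both the scrape and the crawl page options document it. A minimal sketch of a client request exercising the new option (the body shape follows the e2e test added below; the localhost URL and the FIRECRAWL_API_KEY variable name are assumptions for illustration):

// Sketch: POST /v0/scrape with the new removeTags page option.
// Assumes a locally running API and an API key in FIRECRAWL_API_KEY.
async function scrapeWithRemoveTags(): Promise<void> {
  const res = await fetch("http://localhost:3002/v0/scrape", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    },
    body: JSON.stringify({
      url: "https://www.scrapethissite.com/",
      // An array of selectors; PageOptions also accepts a comma-separated
      // string such as "script, .ad, #footer" (see single_url.ts below).
      pageOptions: { removeTags: [".nav", "#footer", "strong"] },
    }),
  });
  const { data } = await res.json();
  console.log(data.content); // content with the matched elements stripped
}
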
diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts
index c443e71..acb2278 100644
--- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts
@@ -1,5 +1,4 @@
 import request from "supertest";
-import { app } from "../../index";
 import dotenv from "dotenv";
 const fs = require("fs");
 const path = require("path");
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 4a1609b..780ad39 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -1,5 +1,4 @@
 import request from "supertest";
-import { app } from "../../index";
 import dotenv from "dotenv";
 import { v4 as uuidv4 } from "uuid";
 
@@ -35,7 +34,7 @@ describe("E2E Tests for API Routes", () => {
 
   describe("POST /v0/scrape", () => {
     it.concurrent("should require authorization", async () => {
-      const response = await request(app).post("/v0/scrape");
+      const response = await request(TEST_URL).post("/v0/scrape");
       expect(response.statusCode).toBe(401);
     });
 
@@ -151,6 +150,40 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
     }, 60000); // 60 seconds
 
+    it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
+      const responseWithoutRemoveTags = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/" });
+      expect(responseWithoutRemoveTags.statusCode).toBe(200);
+      expect(responseWithoutRemoveTags.body).toHaveProperty("data");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
+      expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
+      expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
+      expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
+
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data).not.toHaveProperty("html");
+      expect(response.body.data.content).toContain("Scrape This Site");
+      expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
+      expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
+      expect(response.body.data.content).not.toContain("web scraping"); // strong
+    }, 30000); // 30 seconds timeout
+
     // TODO: add this test back once we nail the waitFor option to be more deterministic
     // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
     //   const startTime = Date.now();
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index fc3fe28..8fd876d 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -55,10 +55,14 @@ export async function crawlController(req: Request, res: Response) {
   }
 
   const mode = req.body.mode ?? "crawl";
-  const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
+
+  const crawlerOptions = req.body.crawlerOptions ?? {
+    allowBackwardCrawling: false
+  };
   const pageOptions = req.body.pageOptions ?? {
     onlyMainContent: false,
     includeHtml: false,
+    removeTags: [],
     parsePDF: true
   };
 
diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts
index d3e9afe..2c3dc4e 100644
--- a/apps/api/src/controllers/crawlPreview.ts
+++ b/apps/api/src/controllers/crawlPreview.ts
@@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
 
   const mode = req.body.mode ?? "crawl";
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
 
   const job = await addWebScraperJob({
     url: url,
diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts
index 7474aae..abbc357 100644
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@@ -85,6 +85,7 @@ export async function searchHelper(
       onlyMainContent: pageOptions?.onlyMainContent ?? true,
       fetchPageContent: pageOptions?.fetchPageContent ?? true,
       includeHtml: pageOptions?.includeHtml ?? false,
+      removeTags: pageOptions?.removeTags ?? [],
       fallback: false,
     },
   });
@@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) {
     includeHtml: false,
     onlyMainContent: true,
     fetchPageContent: true,
+    removeTags: [],
     fallback: false,
   };
   const origin = req.body.origin ?? "api";
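With the controller defaults in place, a crawl request can opt into the new crawler option explicitly. A sketch under the same assumptions as the scrape example above (the { jobId } response shape is an assumption here, inferred from the job id threaded through the webhook changes below):

// Sketch: POST /v0/crawl with the new allowBackwardCrawling crawler option.
async function startBackwardCrawl(): Promise<void> {
  const res = await fetch("http://localhost:3002/v0/crawl", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    },
    body: JSON.stringify({
      url: "https://example.com/blog/a-post",
      crawlerOptions: { allowBackwardCrawling: true }, // defaults to false
      pageOptions: { removeTags: ["#footer"] },        // defaults to []
    }),
  });
  const { jobId } = await res.json(); // the crawl itself runs asynchronously
  console.log(`Started crawl job ${jobId}`);
}
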
diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts
index cc8376b..494b4d5 100644
--- a/apps/api/src/index.ts
+++ b/apps/api/src/index.ts
@@ -5,190 +5,215 @@ import "dotenv/config";
 import { getWebScraperQueue } from "./services/queue-service";
 import { redisClient } from "./services/rate-limiter";
 import { v0Router } from "./routes/v0";
-import { initSDK } from '@hyperdx/node-opentelemetry';
+import { initSDK } from "@hyperdx/node-opentelemetry";
+import cluster from "cluster";
+import os from "os";
 const { createBullBoard } = require("@bull-board/api");
 const { BullAdapter } = require("@bull-board/api/bullAdapter");
 const { ExpressAdapter } = require("@bull-board/express");
 
-export const app = express();
+const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
+console.log(`Number of CPUs: ${numCPUs} available`);
 
-global.isProduction = process.env.IS_PRODUCTION === "true";
+if (cluster.isMaster) {
+  console.log(`Master ${process.pid} is running`);
 
-app.use(bodyParser.urlencoded({ extended: true }));
-app.use(bodyParser.json({ limit: "10mb" }));
+  // Fork workers.
+  for (let i = 0; i < numCPUs; i++) {
+    cluster.fork();
+  }
 
-app.use(cors()); // Add this line to enable CORS
-
-const serverAdapter = new ExpressAdapter();
-serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
-
-const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
-  queues: [new BullAdapter(getWebScraperQueue())],
-  serverAdapter: serverAdapter,
-});
-
-app.use(
-  `/admin/${process.env.BULL_AUTH_KEY}/queues`,
-  serverAdapter.getRouter()
-);
-
-app.get("/", (req, res) => {
-  res.send("SCRAPERS-JS: Hello, world! Fly.io");
-});
-
-//write a simple test function
-app.get("/test", async (req, res) => {
-  res.send("Hello, world!");
-});
-
-// register router
-app.use(v0Router);
-
-const DEFAULT_PORT = process.env.PORT ?? 3002;
-const HOST = process.env.HOST ?? "localhost";
-redisClient.connect();
-
-// HyperDX OpenTelemetry
-if(process.env.ENV === 'production') {
-  initSDK({ consoleCapture: true, additionalInstrumentations: []});
-}
-
-
-export function startServer(port = DEFAULT_PORT) {
-  const server = app.listen(Number(port), HOST, () => {
-    console.log(`Server listening on port ${port}`);
-    console.log(
-      `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
-    );
-    console.log("");
-    console.log("1. Make sure Redis is running on port 6379 by default");
-    console.log(
-      "2. If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 "
-    );
-  });
-  return server;
-}
+  cluster.on("exit", (worker, code, signal) => {
+    console.log(`Worker ${worker.process.pid} exited`);
+    console.log("Starting a new worker");
+    cluster.fork();
+  });
+} else {
+  const app = express();
 
-if (require.main === module) {
-  startServer();
-}
+  global.isProduction = process.env.IS_PRODUCTION === "true";
 
-// Use this as a "health check" that way we dont destroy the server
-app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => {
-  try {
-    const webScraperQueue = getWebScraperQueue();
-    const [webScraperActive] = await Promise.all([
-      webScraperQueue.getActiveCount(),
-    ]);
+  app.use(bodyParser.urlencoded({ extended: true }));
+  app.use(bodyParser.json({ limit: "10mb" }));
 
-    const noActiveJobs = webScraperActive === 0;
-    // 200 if no active jobs, 503 if there are active jobs
-    return res.status(noActiveJobs ? 200 : 500).json({
-      webScraperActive,
-      noActiveJobs,
-    });
-  } catch (error) {
-    console.error(error);
-    return res.status(500).json({ error: error.message });
-  }
-});
+  app.use(cors()); // Add this line to enable CORS
+
+  const serverAdapter = new ExpressAdapter();
+  serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
+
+  const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
+    queues: [new BullAdapter(getWebScraperQueue())],
+    serverAdapter: serverAdapter,
+  });
+
+  app.use(
+    `/admin/${process.env.BULL_AUTH_KEY}/queues`,
+    serverAdapter.getRouter()
+  );
+
+  app.get("/", (req, res) => {
+    res.send("SCRAPERS-JS: Hello, world! Fly.io");
+  });
+
+  // write a simple test function
+  app.get("/test", async (req, res) => {
+    res.send("Hello, world!");
+  });
+
+  // register router
+  app.use(v0Router);
+
+  const DEFAULT_PORT = process.env.PORT ?? 3002;
+  const HOST = process.env.HOST ?? "localhost";
+  redisClient.connect();
+
+  // HyperDX OpenTelemetry
+  if (process.env.ENV === "production") {
+    initSDK({ consoleCapture: true, additionalInstrumentations: [] });
+  }
 
-app.get(`/serverHealthCheck`, async (req, res) => {
-  try {
-    const webScraperQueue = getWebScraperQueue();
-    const [waitingJobs] = await Promise.all([
-      webScraperQueue.getWaitingCount(),
-    ]);
-
-    const noWaitingJobs = waitingJobs === 0;
-    // 200 if no active jobs, 503 if there are active jobs
-    return res.status(noWaitingJobs ? 200 : 500).json({
-      waitingJobs,
-    });
-  } catch (error) {
-    console.error(error);
-    return res.status(500).json({ error: error.message });
-  }
-});
+  function startServer(port = DEFAULT_PORT) {
+    const server = app.listen(Number(port), HOST, () => {
+      console.log(`Worker ${process.pid} listening on port ${port}`);
+      console.log(
+        `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
+      );
+      console.log("");
+      console.log("1. Make sure Redis is running on port 6379 by default");
+      console.log(
+        "2. If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 "
+      );
+    });
+    return server;
+  }
+
+  if (require.main === module) {
+    startServer();
+  }
+
+  // Use this as a "health check" that way we don't destroy the server
+  app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => {
+    try {
+      const webScraperQueue = getWebScraperQueue();
+      const [webScraperActive] = await Promise.all([
+        webScraperQueue.getActiveCount(),
+      ]);
+
+      const noActiveJobs = webScraperActive === 0;
+      // 200 if no active jobs, 500 if there are active jobs
+      return res.status(noActiveJobs ? 200 : 500).json({
+        webScraperActive,
+        noActiveJobs,
+      });
+    } catch (error) {
+      console.error(error);
+      return res.status(500).json({ error: error.message });
+    }
+  });
+
+  app.get(`/serverHealthCheck`, async (req, res) => {
+    try {
+      const webScraperQueue = getWebScraperQueue();
+      const [waitingJobs] = await Promise.all([
+        webScraperQueue.getWaitingCount(),
+      ]);
+
+      const noWaitingJobs = waitingJobs === 0;
+      // 200 if no waiting jobs, 500 if there are waiting jobs
+      return res.status(noWaitingJobs ? 200 : 500).json({
+        waitingJobs,
+      });
+    } catch (error) {
+      console.error(error);
+      return res.status(500).json({ error: error.message });
+    }
+  });
-app.get('/serverHealthCheck/notify', async (req, res) => {
-  if (process.env.SLACK_WEBHOOK_URL) {
-    const treshold = 1; // The treshold value for the active jobs
-    const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds
-
-    const getWaitingJobsCount = async () => {
-      const webScraperQueue = getWebScraperQueue();
-      const [waitingJobsCount] = await Promise.all([
-        webScraperQueue.getWaitingCount(),
-      ]);
-
-      return waitingJobsCount;
-    };
-
-    res.status(200).json({ message: "Check initiated" });
-
-    const checkWaitingJobs = async () => {
-      try {
-        let waitingJobsCount = await getWaitingJobsCount();
-        if (waitingJobsCount >= treshold) {
-          setTimeout(async () => {
-            // Re-check the waiting jobs count after the timeout
-            waitingJobsCount = await getWaitingJobsCount();
-            if (waitingJobsCount >= treshold) {
-              const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL;
-              const message = {
-                text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${timeout/60000} minute(s).`,
-              };
-
-              const response = await fetch(slackWebhookUrl, {
-                method: 'POST',
-                headers: {
-                  'Content-Type': 'application/json',
-                },
-                body: JSON.stringify(message),
-              })
-
-              if (!response.ok) {
-                console.error('Failed to send Slack notification')
-              }
-            }
-          }, timeout);
-        }
-      } catch (error) {
-        console.error(error);
-      }
-    };
-
-    checkWaitingJobs();
-  }
-});
+  app.get("/serverHealthCheck/notify", async (req, res) => {
+    if (process.env.SLACK_WEBHOOK_URL) {
+      const threshold = 1; // The threshold value for the waiting jobs
+      const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds
+
+      const getWaitingJobsCount = async () => {
+        const webScraperQueue = getWebScraperQueue();
+        const [waitingJobsCount] = await Promise.all([
+          webScraperQueue.getWaitingCount(),
+        ]);
+
+        return waitingJobsCount;
+      };
+
+      res.status(200).json({ message: "Check initiated" });
+
+      const checkWaitingJobs = async () => {
+        try {
+          let waitingJobsCount = await getWaitingJobsCount();
+          if (waitingJobsCount >= threshold) {
+            setTimeout(async () => {
+              // Re-check the waiting jobs count after the timeout
+              waitingJobsCount = await getWaitingJobsCount();
+              if (waitingJobsCount >= threshold) {
+                const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL;
+                const message = {
+                  text: `⚠️ Warning: The number of waiting jobs (${waitingJobsCount}) has exceeded the threshold (${threshold}) for more than ${
+                    timeout / 60000
+                  } minute(s).`,
+                };
+
+                const response = await fetch(slackWebhookUrl, {
+                  method: "POST",
+                  headers: {
+                    "Content-Type": "application/json",
+                  },
+                  body: JSON.stringify(message),
+                });
+
+                if (!response.ok) {
+                  console.error("Failed to send Slack notification");
+                }
+              }
+            }, timeout);
+          }
+        } catch (error) {
+          console.error(error);
+        }
+      };
+
+      checkWaitingJobs();
+    }
+  });
 
-app.get(`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, async (req, res) => {
-  try {
-    const webScraperQueue = getWebScraperQueue();
-    const completedJobs = await webScraperQueue.getJobs(['completed']);
-    const before24hJobs = completedJobs.filter(job => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000);
-    const jobIds = before24hJobs.map(job => job.id) as string[];
-    let count = 0;
-    for (const jobId of jobIds) {
-      try {
-        await webScraperQueue.removeJobs(jobId);
-        count++;
-      } catch (jobError) {
-        console.error(`Failed to remove job with ID ${jobId}:`, jobError);
-      }
-    }
-    res.status(200).send(`Removed ${count} completed jobs.`);
-  } catch (error) {
-    console.error('Failed to clean last 24h complete jobs:', error);
-    res.status(500).send('Failed to clean jobs');
-  }
-});
+  app.get(
+    `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
+    async (req, res) => {
+      try {
+        const webScraperQueue = getWebScraperQueue();
+        const completedJobs = await webScraperQueue.getJobs(["completed"]);
+        const before24hJobs = completedJobs.filter(
+          (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
+        );
+        const jobIds = before24hJobs.map((job) => job.id) as string[];
+        let count = 0;
+        for (const jobId of jobIds) {
+          try {
+            await webScraperQueue.removeJobs(jobId);
+            count++;
+          } catch (jobError) {
+            console.error(`Failed to remove job with ID ${jobId}:`, jobError);
+          }
+        }
+        res.status(200).send(`Removed ${count} completed jobs.`);
+      } catch (error) {
+        console.error("Failed to clean last 24h complete jobs:", error);
+        res.status(500).send("Failed to clean jobs");
+      }
+    }
+  );
 
-app.get("/is-production", (req, res) => {
-  res.send({ isProduction: global.isProduction });
-});
+  app.get("/is-production", (req, res) => {
+    res.send({ isProduction: global.isProduction });
+  });
 
-
-// /workers health check, cant act as load balancer, just has to be a pre deploy thing
\ No newline at end of file
+  console.log(`Worker ${process.pid} started`);
+}
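Since every worker registers the same routes, the health endpoints behave identically regardless of which worker answers. A small probe sketch against /serverHealthCheck, which per the handler above returns 200 only when the queue has no waiting jobs and 500 otherwise (local URL assumed):

// Sketch: treat the waiting-jobs health endpoint as a drained-queue probe.
async function queueIsDrained(): Promise<boolean> {
  const res = await fetch("http://localhost:3002/serverHealthCheck");
  return res.status === 200; // 500 means jobs are still waiting
}
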
{mode: "markdown"} this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false; diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 8fa268f..4723a56 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -309,6 +309,19 @@ export async function scrapSingleUrl( const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { const soup = cheerio.load(html); soup("script, style, iframe, noscript, meta, head").remove(); + + if (pageOptions.removeTags) { + if (typeof pageOptions.removeTags === 'string') { + pageOptions.removeTags.split(',').forEach((tag) => { + soup(tag.trim()).remove(); + }); + } else if (Array.isArray(pageOptions.removeTags)) { + pageOptions.removeTags.forEach((tag) => { + soup(tag).remove(); + }); + } + } + if (pageOptions.onlyMainContent) { // remove any other tags that are not in the main content excludeNonMainTags.forEach((tag) => { diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 6772c57..a42b3e8 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -38,7 +38,7 @@ getWebScraperQueue().process( error: message /* etc... */, }; - await callWebhook(job.data.team_id, data); + await callWebhook(job.data.team_id, job.id as string, data); await logJob({ success: success, @@ -78,7 +78,7 @@ getWebScraperQueue().process( error: "Something went wrong... Contact help@mendable.ai or try again." /* etc... */, }; - await callWebhook(job.data.team_id, data); + await callWebhook(job.data.team_id, job.id as string, data); await logJob({ success: false, message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"), diff --git a/apps/api/src/services/redis.ts b/apps/api/src/services/redis.ts index f2cedd1..491eeb1 100644 --- a/apps/api/src/services/redis.ts +++ b/apps/api/src/services/redis.ts @@ -1,8 +1,35 @@ -import Redis from 'ioredis'; +import Redis from "ioredis"; // Initialize Redis client const redis = new Redis(process.env.REDIS_URL); +// Listen to 'error' events to the Redis connection +redis.on("error", (error) => { + try { + if (error.message === "ECONNRESET") { + console.log("Connection to Redis Session Store timed out."); + } else if (error.message === "ECONNREFUSED") { + console.log("Connection to Redis Session Store refused!"); + } else console.log(error); + } catch (error) {} +}); + +// Listen to 'reconnecting' event to Redis +redis.on("reconnecting", (err) => { + try { + if (redis.status === "reconnecting") + console.log("Reconnecting to Redis Session Store..."); + else console.log("Error reconnecting to Redis Session Store."); + } catch (error) {} +}); + +// Listen to the 'connect' event to Redis +redis.on("connect", (err) => { + try { + if (!err) console.log("Connected to Redis Session Store!"); + } catch (error) {} +}); + /** * Set a value in Redis with an optional expiration time. * @param {string} key The key under which to store the value. 
diff --git a/apps/api/src/services/redis.ts b/apps/api/src/services/redis.ts
index f2cedd1..491eeb1 100644
--- a/apps/api/src/services/redis.ts
+++ b/apps/api/src/services/redis.ts
@@ -1,8 +1,35 @@
-import Redis from 'ioredis';
+import Redis from "ioredis";
 
 // Initialize Redis client
 const redis = new Redis(process.env.REDIS_URL);
 
+// Listen for 'error' events from the Redis connection
+redis.on("error", (error) => {
+  try {
+    if (error.message === "ECONNRESET") {
+      console.log("Connection to Redis Session Store timed out.");
+    } else if (error.message === "ECONNREFUSED") {
+      console.log("Connection to Redis Session Store refused!");
+    } else console.log(error);
+  } catch (error) {}
+});
+
+// Listen for the 'reconnecting' event from Redis
+redis.on("reconnecting", (err) => {
+  try {
+    if (redis.status === "reconnecting")
+      console.log("Reconnecting to Redis Session Store...");
+    else console.log("Error reconnecting to Redis Session Store.");
+  } catch (error) {}
+});
+
+// Listen for the 'connect' event from Redis
+redis.on("connect", (err) => {
+  try {
+    if (!err) console.log("Connected to Redis Session Store!");
+  } catch (error) {}
+});
+
 /**
  * Set a value in Redis with an optional expiration time.
  * @param {string} key The key under which to store the value.
@@ -11,7 +38,7 @@ const redis = new Redis(process.env.REDIS_URL);
 */
 const setValue = async (key: string, value: string, expire?: number) => {
   if (expire) {
-    await redis.set(key, value, 'EX', expire);
+    await redis.set(key, value, "EX", expire);
   } else {
     await redis.set(key, value);
   }
diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts
index 1f8d647..fc5962b 100644
--- a/apps/api/src/services/webhook.ts
+++ b/apps/api/src/services/webhook.ts
@@ -1,6 +1,6 @@
 import { supabase_service } from "./supabase";
 
-export const callWebhook = async (teamId: string, data: any) => {
+export const callWebhook = async (teamId: string, jobId: string, data: any) => {
   try {
     const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL;
     const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
@@ -47,6 +47,7 @@ export const callWebhook = async (teamId: string, data: any) => {
       },
       body: JSON.stringify({
         success: data.success,
+        jobId: jobId,
         data: dataToSend,
         error: data.error || undefined,
       }),
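Webhook consumers now receive the job id alongside the result. A sketch of a receiving endpoint (Express; the /firecrawl-webhook path is hypothetical, and the field types are inferred from the JSON.stringify call above rather than a published schema):

import express from "express";

// Field names come from the payload callWebhook builds above.
interface FirecrawlWebhookBody {
  success: boolean;
  jobId: string; // newly included in this change
  data: unknown; // scraped documents, when present
  error?: string;
}

const receiver = express();
receiver.use(express.json());
receiver.post("/firecrawl-webhook", (req, res) => {
  const { success, jobId, error } = req.body as FirecrawlWebhookBody;
  console.log(`Job ${jobId} finished: success=${success}`, error ?? "");
  res.sendStatus(200);
});
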
diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index ecb017f..fbb2bdb 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -1,3 +1,57 @@
+"""
+This is the Firecrawl package.
+
+This package provides a Python SDK for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs.
+
+For more information visit https://github.com/firecrawl/
+"""
+
+import logging
+import os
+
 from .firecrawl import FirecrawlApp
 
-__version__ = "0.0.14"
+__version__ = "0.0.16"
+
+# Define the logger for the Firecrawl project
+logger: logging.Logger = logging.getLogger("firecrawl")
+
+
+def _basic_config() -> None:
+    """Set up basic configuration for logging with a specific format and date format."""
+    try:
+        logging.basicConfig(
+            format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+        )
+    except Exception as e:
+        logger.error("Failed to configure logging: %s", e)
+
+
+def setup_logging() -> None:
+    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
+    env = os.environ.get(
+        "FIRECRAWL_LOGGING_LEVEL", "INFO"
+    ).upper()  # Default to 'INFO' level
+    _basic_config()
+
+    if env == "DEBUG":
+        logger.setLevel(logging.DEBUG)
+    elif env == "INFO":
+        logger.setLevel(logging.INFO)
+    elif env == "WARNING":
+        logger.setLevel(logging.WARNING)
+    elif env == "ERROR":
+        logger.setLevel(logging.ERROR)
+    elif env == "CRITICAL":
+        logger.setLevel(logging.CRITICAL)
+    else:
+        logger.setLevel(logging.INFO)
+        logger.warning("Unknown logging level: %s, defaulting to INFO", env)
+
+
+# Initialize logging configuration when the module is imported
+setup_logging()
+logger.debug("Debugging logger setup")
diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc
new file mode 100644
index 0000000..5ba1f13
Binary files /dev/null and b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc differ
diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
index 90a6498..452d498 100644
--- a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
+++ b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
@@ -27,14 +27,14 @@ def test_scrape_url_invalid_api_key():
     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
     with pytest.raises(Exception) as excinfo:
         invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Failed to scrape URL. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
 def test_blocklisted_url():
     blocklisted_url = "https://facebook.com/fake-test"
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
     with pytest.raises(Exception) as excinfo:
         app.scrape_url(blocklisted_url)
-    assert "Failed to scrape URL. Status code: 403" in str(excinfo.value)
+    assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
 
 def test_successful_response_with_valid_preview_token():
     app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
@@ -86,14 +86,14 @@ def test_crawl_url_invalid_api_key():
     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
     with pytest.raises(Exception) as excinfo:
         invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unexpected error occurred while trying to start crawl job. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
 def test_should_return_error_for_blocklisted_url():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
     blocklisted_url = "https://twitter.com/fake-test"
     with pytest.raises(Exception) as excinfo:
         app.crawl_url(blocklisted_url)
-    assert "Unexpected error occurred while trying to start crawl job. Status code: 403" in str(excinfo.value)
+    assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
 
 def test_crawl_url_wait_for_completion_e2e():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -114,7 +114,7 @@ def test_crawl_url_with_idempotency_key_e2e():
 
     with pytest.raises(Exception) as excinfo:
         app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert "Failed to start crawl job. Status code: 409. Error: Idempotency key already used" in str(excinfo.value)
+    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
 
 def test_check_crawl_status_e2e():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -141,7 +141,7 @@ def test_search_invalid_api_key():
     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
     with pytest.raises(Exception) as excinfo:
         invalid_app.search("test query")
-    assert "Failed to search. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
 def test_llm_extraction():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
""" - +import logging import os import time from typing import Any, Dict, Optional import requests +logger : logging.Logger = logging.getLogger("firecrawl") class FirecrawlApp: """ @@ -28,8 +29,15 @@ class FirecrawlApp: def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') if self.api_key is None: + logger.warning("No API key provided") raise ValueError('No API key provided') + else: + logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key) + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') + if self.api_url != 'https://api.firecrawl.dev': + logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url) + def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: """ Scrape the specified URL using the Firecrawl API. @@ -45,10 +53,8 @@ class FirecrawlApp: Exception: If the scrape request fails. """ - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}' - } + headers = self._prepare_headers() + # Prepare the base scrape parameters with the URL scrape_params = {'url': url} @@ -81,13 +87,10 @@ class FirecrawlApp: return response['data'] else: raise Exception(f'Failed to scrape URL. Error: {response["error"]}') - elif response.status_code in [402, 408, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') else: - raise Exception(f'Failed to scrape URL. Status code: {response.status_code}') + self._handle_error(response, 'scrape URL') - def search(self, query, params=None): + def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any: """ Perform a search using the Firecrawl API. @@ -101,10 +104,7 @@ class FirecrawlApp: Raises: Exception: If the search request fails. """ - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}' - } + headers = self._prepare_headers() json_data = {'query': query} if params: json_data.update(params) @@ -121,13 +121,14 @@ class FirecrawlApp: else: raise Exception(f'Failed to search. Error: {response["error"]}') - elif response.status_code in [402, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}') else: - raise Exception(f'Failed to search. Status code: {response.status_code}') + self._handle_error(response, 'search') - def crawl_url(self, url, params=None, wait_until_done=True, poll_interval=2, idempotency_key=None): + def crawl_url(self, url: str, + params: Optional[Dict[str, Any]] = None, + wait_until_done: bool = True, + poll_interval: int = 2, + idempotency_key: Optional[str] = None) -> Any: """ Initiate a crawl job for the specified URL using the Firecrawl API. @@ -158,7 +159,7 @@ class FirecrawlApp: else: self._handle_error(response, 'start crawl job') - def check_crawl_status(self, job_id): + def check_crawl_status(self, job_id: str) -> Any: """ Check the status of a crawl job using the Firecrawl API. @@ -178,7 +179,7 @@ class FirecrawlApp: else: self._handle_error(response, 'check crawl status') - def _prepare_headers(self, idempotency_key=None): + def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]: """ Prepare the headers for API requests. 
@@ -200,7 +201,11 @@ class FirecrawlApp:
                 'Authorization': f'Bearer {self.api_key}',
             }
 
-    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+    def _post_request(self, url: str,
+                      data: Dict[str, Any],
+                      headers: Dict[str, str],
+                      retries: int = 3,
+                      backoff_factor: float = 0.5) -> requests.Response:
         """
         Make a POST request with retries.
 
@@ -225,7 +230,10 @@ class FirecrawlApp:
                 return response
         return response
 
-    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+    def _get_request(self, url: str,
+                     headers: Dict[str, str],
+                     retries: int = 3,
+                     backoff_factor: float = 0.5) -> requests.Response:
         """
         Make a GET request with retries.
 
@@ -249,7 +257,7 @@ class FirecrawlApp:
                 return response
         return response
 
-    def _monitor_job_status(self, job_id, headers, poll_interval):
+    def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any:
         """
         Monitor the status of a crawl job until completion.
 
@@ -281,7 +289,7 @@ class FirecrawlApp:
         else:
             self._handle_error(status_response, 'check crawl status')
 
-    def _handle_error(self, response, action):
+    def _handle_error(self, response: requests.Response, action: str) -> None:
         """
         Handle errors from API responses.
 
@@ -292,8 +300,19 @@ class FirecrawlApp:
         Raises:
             Exception: An exception with a message containing the status code and error details from the response.
         """
-        if response.status_code in [402, 408, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
+        error_message = response.json().get('error', 'No additional error details provided.')
+
+        if response.status_code == 402:
+            message = f"Payment Required: Failed to {action}. {error_message}"
+        elif response.status_code == 408:
+            message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}"
+        elif response.status_code == 409:
+            message = f"Conflict: Failed to {action} due to a conflict. {error_message}"
+        elif response.status_code == 500:
+            message = f"Internal Server Error: Failed to {action}. {error_message}"
         else:
-            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
+            message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}"
+
+        # Raise an HTTPError with the custom message and attach the response
+        raise requests.exceptions.HTTPError(message, response=response)
\ No newline at end of file