Merge branch 'main' into feat/issue-205

2024-06-14 11:25:20 -03:00 · 2024-06-14 11:25:20 -03:00 · 3e2e76311c
commit 3e2e76311c
parent e37d151404 5fd228f9ec
18 changed files with 400 additions and 202 deletions
--- a/apps/api/fly.toml
+++ b/apps/api/fly.toml
@ -54,7 +54,7 @@ kill_timeout = '5s'
    soft_limit = 12

 [[vm]]
-  size = 'performance-8x'
+  size = 'performance-4x'
  processes = ['app']


--- a/apps/api/openapi.json
+++ b/apps/api/openapi.json
@ -61,6 +61,13 @@
                        "description": "Wait x amount of milliseconds for the page to load to fetch content",
                        "default": 0
                      },
+                      "removeTags": {
+                        "type": "array",
+                        "items": {
+                          "type": "string"
+                        },
+                        "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+                      },
                      "headers": {
                        "type": "object",
                        "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
@ -194,6 +201,11 @@
                        "type": "integer",
                        "description": "Maximum number of pages to crawl",
                        "default": 10000
+                      },
+                      "allowBackwardCrawling": {
+                        "type": "boolean",
+                        "description": "Allow backward crawling (crawl from the base URL to the previous URLs)",
+                        "default": false
                      }
                    }
                  },
@ -219,6 +231,13 @@
                        "type": "object",
                        "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
                      },
+                      "removeTags": {
+                        "type": "array",
+                        "items": {
+                          "type": "string"
+                        },
+                        "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+                      },
                      "replaceAllPathsWithAbsolutePaths": {
                        "type": "boolean",
                        "description": "Replace all relative paths with absolute paths for images and links",
--- a/apps/api/src/tests/e2e_noAuth/index.test.ts
+++ b/apps/api/src/tests/e2e_noAuth/index.test.ts
@ -1,5 +1,4 @@
 import request from "supertest";
-import { app } from "../../index";
 import dotenv from "dotenv";
 const fs = require("fs");
 const path = require("path");
--- a/apps/api/src/tests/e2e_withAuth/index.test.ts
+++ b/apps/api/src/tests/e2e_withAuth/index.test.ts
@ -1,5 +1,4 @@
 import request from "supertest";
-import { app } from "../../index";
 import dotenv from "dotenv";
 import { v4 as uuidv4 } from "uuid";

@ -35,7 +34,7 @@ describe("E2E Tests for API Routes", () => {

  describe("POST /v0/scrape", () => {
    it.concurrent("should require authorization", async () => {
-      const response = await request(app).post("/v0/scrape");
+      const response = await request(TEST_URL).post("/v0/scrape");
      expect(response.statusCode).toBe(401);
    });

@ -151,6 +150,40 @@ describe("E2E Tests for API Routes", () => {
      expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1  7 Jan 1993)>>endobj');
    }, 60000); // 60 seconds

+    it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
+      const responseWithoutRemoveTags = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/" });
+      expect(responseWithoutRemoveTags.statusCode).toBe(200);
+      expect(responseWithoutRemoveTags.body).toHaveProperty("data");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
+      expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
+      expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
+      expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
+
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data).not.toHaveProperty("html");
+      expect(response.body.data.content).toContain("Scrape This Site");
+      expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
+      expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
+      expect(response.body.data.content).not.toContain("web scraping"); // strong
+    }, 30000); // 30 seconds timeout
+
    // TODO: add this test back once we nail the waitFor option to be more deterministic
    // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
    //   const startTime = Date.now();
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@ -55,10 +55,14 @@ export async function crawlController(req: Request, res: Response) {
    }

    const mode = req.body.mode ?? "crawl";
-    const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
+
+    const crawlerOptions = req.body.crawlerOptions ?? {
+      allowBackwardCrawling: false
+    };
    const pageOptions = req.body.pageOptions ?? {
      onlyMainContent: false,
      includeHtml: false,
+      removeTags: [],
      parsePDF: true
    };

--- a/apps/api/src/controllers/crawlPreview.ts
+++ b/apps/api/src/controllers/crawlPreview.ts
@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {

    const mode = req.body.mode ?? "crawl";
    const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };

    const job = await addWebScraperJob({
      url: url,
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@ -85,6 +85,7 @@ export async function searchHelper(
      onlyMainContent: pageOptions?.onlyMainContent ?? true,
      fetchPageContent: pageOptions?.fetchPageContent ?? true,
      includeHtml: pageOptions?.includeHtml ?? false,
+      removeTags: pageOptions?.removeTags ?? [],
      fallback: false,
    },
  });
@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) {
      includeHtml: false,
      onlyMainContent: true,
      fetchPageContent: true,
+      removeTags: [],
      fallback: false,
    };
    const origin = req.body.origin ?? "api";
--- a/apps/api/src/index.ts
+++ b/apps/api/src/index.ts
@ -5,59 +5,77 @@ import "dotenv/config";
 import { getWebScraperQueue } from "./services/queue-service";
 import { redisClient } from "./services/rate-limiter";
 import { v0Router } from "./routes/v0";
-import { initSDK } from '@hyperdx/node-opentelemetry';
+import { initSDK } from "@hyperdx/node-opentelemetry";
+import cluster from "cluster";
+import os from "os";

 const { createBullBoard } = require("@bull-board/api");
 const { BullAdapter } = require("@bull-board/api/bullAdapter");
 const { ExpressAdapter } = require("@bull-board/express");

-export const app = express();
+const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
+console.log(`Number of CPUs: ${numCPUs} available`);

-global.isProduction = process.env.IS_PRODUCTION === "true";
+if (cluster.isMaster) {
+  console.log(`Master ${process.pid} is running`);

-app.use(bodyParser.urlencoded({ extended: true }));
-app.use(bodyParser.json({ limit: "10mb" }));
+  // Fork workers.
+  for (let i = 0; i < numCPUs; i++) {
+    cluster.fork();
+  }

-app.use(cors()); // Add this line to enable CORS
+  cluster.on("exit", (worker, code, signal) => {
+    console.log(`Worker ${worker.process.pid} exited`);
+    console.log("Starting a new worker");
+    cluster.fork();
+  });
+} else {
+  const app = express();

-const serverAdapter = new ExpressAdapter();
-serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
+  global.isProduction = process.env.IS_PRODUCTION === "true";

-const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
+  app.use(bodyParser.urlencoded({ extended: true }));
+  app.use(bodyParser.json({ limit: "10mb" }));
+
+  app.use(cors()); // Add this line to enable CORS
+
+  const serverAdapter = new ExpressAdapter();
+  serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
+
+  const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
    queues: [new BullAdapter(getWebScraperQueue())],
    serverAdapter: serverAdapter,
-});
+  });

-app.use(
+  app.use(
    `/admin/${process.env.BULL_AUTH_KEY}/queues`,
    serverAdapter.getRouter()
-);
+  );

-app.get("/", (req, res) => {
+  app.get("/", (req, res) => {
    res.send("SCRAPERS-JS: Hello, world! Fly.io");
-});
+  });

-//write a simple test function
-app.get("/test", async (req, res) => {
+  //write a simple test function
+  app.get("/test", async (req, res) => {
    res.send("Hello, world!");
-});
+  });

-// register router
-app.use(v0Router);
+  // register router
+  app.use(v0Router);

-const DEFAULT_PORT = process.env.PORT ?? 3002;
-const HOST = process.env.HOST ?? "localhost";
-redisClient.connect();
+  const DEFAULT_PORT = process.env.PORT ?? 3002;
+  const HOST = process.env.HOST ?? "localhost";
+  redisClient.connect();

-// HyperDX OpenTelemetry
-if(process.env.ENV === 'production') {
-  initSDK({ consoleCapture: true, additionalInstrumentations: []});
-}
+  // HyperDX OpenTelemetry
+  if (process.env.ENV === "production") {
+    initSDK({ consoleCapture: true, additionalInstrumentations: [] });
+  }

-
-export function startServer(port = DEFAULT_PORT) {
+  function startServer(port = DEFAULT_PORT) {
    const server = app.listen(Number(port), HOST, () => {
-    console.log(`Server listening on port ${port}`);
+      console.log(`Worker ${process.pid} listening on port ${port}`);
      console.log(
        `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
      );
@ -68,14 +86,14 @@ export function startServer(port = DEFAULT_PORT) {
      );
    });
    return server;
-}
+  }

-if (require.main === module) {
+  if (require.main === module) {
    startServer();
-}
+  }

-// Use this as a "health check" that way we dont destroy the server
-app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => {
+  // Use this as a "health check" that way we dont destroy the server
+  app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => {
    try {
      const webScraperQueue = getWebScraperQueue();
      const [webScraperActive] = await Promise.all([
@ -92,9 +110,9 @@ app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => {
      console.error(error);
      return res.status(500).json({ error: error.message });
    }
-});
+  });

-app.get(`/serverHealthCheck`, async (req, res) => {
+  app.get(`/serverHealthCheck`, async (req, res) => {
    try {
      const webScraperQueue = getWebScraperQueue();
      const [waitingJobs] = await Promise.all([
@ -110,9 +128,9 @@ app.get(`/serverHealthCheck`, async (req, res) => {
      console.error(error);
      return res.status(500).json({ error: error.message });
    }
-});
+  });

-app.get('/serverHealthCheck/notify', async (req, res) => {
+  app.get("/serverHealthCheck/notify", async (req, res) => {
    if (process.env.SLACK_WEBHOOK_URL) {
      const treshold = 1; // The treshold value for the active jobs
      const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds
@ -138,19 +156,21 @@ app.get('/serverHealthCheck/notify', async (req, res) => {
              if (waitingJobsCount >= treshold) {
                const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL;
                const message = {
-                text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${timeout/60000} minute(s).`,
+                  text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
+                    timeout / 60000
+                  } minute(s).`,
                };

                const response = await fetch(slackWebhookUrl, {
-                method: 'POST',
+                  method: "POST",
                  headers: {
-                  'Content-Type': 'application/json',
+                    "Content-Type": "application/json",
                  },
                  body: JSON.stringify(message),
-              })
+                });

                if (!response.ok) {
-                console.error('Failed to send Slack notification')
+                  console.error("Failed to send Slack notification");
                }
              }
            }, timeout);
@ -162,14 +182,18 @@ app.get('/serverHealthCheck/notify', async (req, res) => {

      checkWaitingJobs();
    }
-});
+  });

-app.get(`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, async (req, res) => {
+  app.get(
+    `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
+    async (req, res) => {
      try {
        const webScraperQueue = getWebScraperQueue();
-    const completedJobs = await webScraperQueue.getJobs(['completed']);
-    const before24hJobs = completedJobs.filter(job => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000);
-    const jobIds = before24hJobs.map(job => job.id) as string[];
+        const completedJobs = await webScraperQueue.getJobs(["completed"]);
+        const before24hJobs = completedJobs.filter(
+          (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
+        );
+        const jobIds = before24hJobs.map((job) => job.id) as string[];
        let count = 0;
        for (const jobId of jobIds) {
          try {
@ -181,14 +205,15 @@ app.get(`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, as
        }
        res.status(200).send(`Removed ${count} completed jobs.`);
      } catch (error) {
-    console.error('Failed to clean last 24h complete jobs:', error);
-    res.status(500).send('Failed to clean jobs');
+        console.error("Failed to clean last 24h complete jobs:", error);
+        res.status(500).send("Failed to clean jobs");
      }
-});
+    }
+  );

-app.get("/is-production", (req, res) => {
+  app.get("/is-production", (req, res) => {
    res.send({ isProduction: global.isProduction });
-});
+  });

-
-// /workers health check, cant act as load balancer, just has to be a pre deploy thing
+  console.log(`Worker ${process.pid} started`);
+}
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@ -19,7 +19,8 @@ export type PageOptions = {
  screenshot?: boolean;
  headers?: Record<string, string>;
  replaceAllPathsWithAbsolutePaths?: boolean;
-  parsePDF?: boolean
+  parsePDF?: boolean;
+  removeTags?: string | string[];
 };

 export type ExtractorOptions = {
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@ -479,7 +479,8 @@ export class WebScraperDataProvider {
      onlyMainContent: false,
      includeHtml: false,
      replaceAllPathsWithAbsolutePaths: false,
-      parsePDF: true
+      parsePDF: true,
+      removeTags: []
    };
    this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
    this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@ -309,6 +309,19 @@ export async function scrapSingleUrl(
  const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
    const soup = cheerio.load(html);
    soup("script, style, iframe, noscript, meta, head").remove();
+
+    if (pageOptions.removeTags) {
+      if (typeof pageOptions.removeTags === 'string') {
+        pageOptions.removeTags.split(',').forEach((tag) => {
+          soup(tag.trim()).remove();
+        });
+      } else if (Array.isArray(pageOptions.removeTags)) {
+        pageOptions.removeTags.forEach((tag) => {
+          soup(tag).remove();
+        });
+      }
+    }
+    
    if (pageOptions.onlyMainContent) {
      // remove any other tags that are not in the main content
      excludeNonMainTags.forEach((tag) => {
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@ -38,7 +38,7 @@ getWebScraperQueue().process(
        error: message /* etc... */,
      };

-      await callWebhook(job.data.team_id, data);
+      await callWebhook(job.data.team_id, job.id as string, data);

      await logJob({
        success: success,
@ -78,7 +78,7 @@ getWebScraperQueue().process(
        error:
          "Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
      };
-      await callWebhook(job.data.team_id, data);
+      await callWebhook(job.data.team_id, job.id as string, data);
      await logJob({
        success: false,
        message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"),
--- a/apps/api/src/services/redis.ts
+++ b/apps/api/src/services/redis.ts
@ -1,8 +1,35 @@
-import Redis from 'ioredis';
+import Redis from "ioredis";

 // Initialize Redis client
 const redis = new Redis(process.env.REDIS_URL);

+// Listen to 'error' events to the Redis connection
+redis.on("error", (error) => {
+  try {
+    if (error.message === "ECONNRESET") {
+      console.log("Connection to Redis Session Store timed out.");
+    } else if (error.message === "ECONNREFUSED") {
+      console.log("Connection to Redis Session Store refused!");
+    } else console.log(error);
+  } catch (error) {}
+});
+
+// Listen to 'reconnecting' event to Redis
+redis.on("reconnecting", (err) => {
+  try {
+    if (redis.status === "reconnecting")
+      console.log("Reconnecting to Redis Session Store...");
+    else console.log("Error reconnecting to Redis Session Store.");
+  } catch (error) {}
+});
+
+// Listen to the 'connect' event to Redis
+redis.on("connect", (err) => {
+  try {
+    if (!err) console.log("Connected to Redis Session Store!");
+  } catch (error) {}
+});
+
 /**
 * Set a value in Redis with an optional expiration time.
 * @param {string} key The key under which to store the value.
@ -11,7 +38,7 @@ const redis = new Redis(process.env.REDIS_URL);
 */
 const setValue = async (key: string, value: string, expire?: number) => {
  if (expire) {
-    await redis.set(key, value, 'EX', expire);
+    await redis.set(key, value, "EX", expire);
  } else {
    await redis.set(key, value);
  }
--- a/apps/api/src/services/webhook.ts
+++ b/apps/api/src/services/webhook.ts
@ -1,6 +1,6 @@
 import { supabase_service } from "./supabase";

-export const callWebhook = async (teamId: string, data: any) => {
+export const callWebhook = async (teamId: string, jobId: string,data: any) => {
  try {
    const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL;
    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
@ -47,6 +47,7 @@ export const callWebhook = async (teamId: string, data: any) => {
      },
      body: JSON.stringify({
        success: data.success,
+        jobId: jobId,
        data: dataToSend,
        error: data.error || undefined,
      }),
--- a/apps/python-sdk/firecrawl/init.py
+++ b/apps/python-sdk/firecrawl/init.py
@ -1,3 +1,57 @@
+"""
+This is the Firecrawl package.
+
+This package provides a Python SDK for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs.
+
+For more information visit https://github.com/firecrawl/
+"""
+
+import logging
+import os
+
 from .firecrawl import FirecrawlApp

-__version__ = "0.0.14"
+__version__ = "0.0.16"
+
+# Define the logger for the Firecrawl project
+logger: logging.Logger = logging.getLogger("firecrawl")
+
+
+def _basic_config() -> None:
+    """Set up basic configuration for logging with a specific format and date format."""
+    try:
+        logging.basicConfig(
+            format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+        )
+    except Exception as e:
+        logger.error("Failed to configure logging: %s", e)
+
+
+def setup_logging() -> None:
+    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
+    env = os.environ.get(
+        "FIRECRAWL_LOGGING_LEVEL", "INFO"
+    ).upper()  # Default to 'INFO' level
+    _basic_config()
+
+    if env == "DEBUG":
+        logger.setLevel(logging.DEBUG)
+    elif env == "INFO":
+        logger.setLevel(logging.INFO)
+    elif env == "WARNING":
+        logger.setLevel(logging.WARNING)
+    elif env == "ERROR":
+        logger.setLevel(logging.ERROR)
+    elif env == "CRITICAL":
+        logger.setLevel(logging.CRITICAL)
+    else:
+        logger.setLevel(logging.INFO)
+        logger.warning("Unknown logging level: %s, defaulting to INFO", env)
+
+
+# Initialize logging configuration when the module is imported
+setup_logging()
+logger.debug("Debugging logger setup")
--- a/apps/python-sdk/firecrawl/tests/e2e_withAuth/pycache/test.cpython-311-pytest-8.2.1.pyc
+++ b/apps/python-sdk/firecrawl/tests/e2e_withAuth/pycache/test.cpython-311-pytest-8.2.1.pyc
--- a/apps/python-sdk/firecrawl/tests/e2e_withAuth/test.py
+++ b/apps/python-sdk/firecrawl/tests/e2e_withAuth/test.py
@ -27,14 +27,14 @@ def test_scrape_url_invalid_api_key():
    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as excinfo:
        invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Failed to scrape URL. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

 def test_blocklisted_url():
    blocklisted_url = "https://facebook.com/fake-test"
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    with pytest.raises(Exception) as excinfo:
        app.scrape_url(blocklisted_url)
-    assert "Failed to scrape URL. Status code: 403" in str(excinfo.value)
+    assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)

 def test_successful_response_with_valid_preview_token():
    app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
@ -86,14 +86,14 @@ def test_crawl_url_invalid_api_key():
    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as excinfo:
        invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unexpected error occurred while trying to start crawl job. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

 def test_should_return_error_for_blocklisted_url():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    blocklisted_url = "https://twitter.com/fake-test"
    with pytest.raises(Exception) as excinfo:
        app.crawl_url(blocklisted_url)
-    assert "Unexpected error occurred while trying to start crawl job. Status code: 403" in str(excinfo.value)
+    assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)

 def test_crawl_url_wait_for_completion_e2e():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@ -114,7 +114,7 @@ def test_crawl_url_with_idempotency_key_e2e():

    with pytest.raises(Exception) as excinfo:
        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert "Failed to start crawl job. Status code: 409. Error: Idempotency key already used" in str(excinfo.value) 
+    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value) 

 def test_check_crawl_status_e2e():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@ -141,7 +141,7 @@ def test_search_invalid_api_key():
    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as excinfo:
        invalid_app.search("test query")
-    assert "Failed to search. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

 def test_llm_extraction():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@ -9,13 +9,14 @@ and handles retries for certain HTTP status codes.
 Classes:
    - FirecrawlApp: Main class for interacting with the Firecrawl API.
 """
-
+import logging
 import os
 import time
 from typing import Any, Dict, Optional

 import requests

+logger : logging.Logger = logging.getLogger("firecrawl")

 class FirecrawlApp:
    """
@ -28,8 +29,15 @@ class FirecrawlApp:
    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
        if self.api_key is None:
+            logger.warning("No API key provided")
            raise ValueError('No API key provided')
+        else:
+            logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)
+
        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
+        if self.api_url != 'https://api.firecrawl.dev':
+            logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)
+
    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
        """
        Scrape the specified URL using the Firecrawl API.
@ -45,10 +53,8 @@ class FirecrawlApp:
            Exception: If the scrape request fails.
        """

-        headers = {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+        headers = self._prepare_headers()
+
        # Prepare the base scrape parameters with the URL
        scrape_params = {'url': url}

@ -81,13 +87,10 @@ class FirecrawlApp:
                return response['data']
            else:
                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
-        elif response.status_code in [402, 408, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
        else:
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
+            self._handle_error(response, 'scrape URL')

-    def search(self, query, params=None):
+    def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
        """
        Perform a search using the Firecrawl API.

@ -101,10 +104,7 @@ class FirecrawlApp:
        Raises:
            Exception: If the search request fails.
        """
-        headers = {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+        headers = self._prepare_headers()
        json_data = {'query': query}
        if params:
            json_data.update(params)
@ -121,13 +121,14 @@ class FirecrawlApp:
            else:
                raise Exception(f'Failed to search. Error: {response["error"]}')

-        elif response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
        else:
-            raise Exception(f'Failed to search. Status code: {response.status_code}')
+            self._handle_error(response, 'search')

-    def crawl_url(self, url, params=None, wait_until_done=True, poll_interval=2, idempotency_key=None):
+    def crawl_url(self, url: str,
+                  params: Optional[Dict[str, Any]] = None,
+                  wait_until_done: bool = True,
+                  poll_interval: int = 2,
+                  idempotency_key: Optional[str] = None) -> Any:
        """
        Initiate a crawl job for the specified URL using the Firecrawl API.

@ -158,7 +159,7 @@ class FirecrawlApp:
        else:
            self._handle_error(response, 'start crawl job')

-    def check_crawl_status(self, job_id):
+    def check_crawl_status(self, job_id: str) -> Any:
        """
        Check the status of a crawl job using the Firecrawl API.

@ -178,7 +179,7 @@ class FirecrawlApp:
        else:
            self._handle_error(response, 'check crawl status')

-    def _prepare_headers(self, idempotency_key=None):
+    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
        """
        Prepare the headers for API requests.

@ -200,7 +201,11 @@ class FirecrawlApp:
            'Authorization': f'Bearer {self.api_key}',
        }

-    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+    def _post_request(self, url: str,
+                      data: Dict[str, Any],
+                      headers: Dict[str, str],
+                      retries: int = 3,
+                      backoff_factor: float = 0.5) -> requests.Response:
        """
        Make a POST request with retries.

@ -225,7 +230,10 @@ class FirecrawlApp:
                return response
        return response

-    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+    def _get_request(self, url: str,
+                     headers: Dict[str, str],
+                     retries: int = 3,
+                     backoff_factor: float = 0.5) -> requests.Response:
        """
        Make a GET request with retries.

@ -249,7 +257,7 @@ class FirecrawlApp:
                return response
        return response

-    def _monitor_job_status(self, job_id, headers, poll_interval):
+    def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any:
        """
        Monitor the status of a crawl job until completion.

@ -281,7 +289,7 @@ class FirecrawlApp:
            else:
                self._handle_error(status_response, 'check crawl status')

-    def _handle_error(self, response, action):
+    def _handle_error(self, response: requests.Response, action: str) -> None:
        """
        Handle errors from API responses.

@ -292,8 +300,19 @@ class FirecrawlApp:
        Raises:
            Exception: An exception with a message containing the status code and error details from the response.
        """
-        if response.status_code in [402, 408, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
+        error_message = response.json().get('error', 'No additional error details provided.')
+
+        if response.status_code == 402:
+            message = f"Payment Required: Failed to {action}. {error_message}"
+        elif response.status_code == 408:
+            message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}"
+        elif response.status_code == 409:
+            message = f"Conflict: Failed to {action} due to a conflict. {error_message}"
+        elif response.status_code == 500:
+            message = f"Internal Server Error: Failed to {action}. {error_message}"
        else:
-            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
+            message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}"
+
+        # Raise an HTTPError with the custom message and attach the response
+        raise requests.exceptions.HTTPError(message, response=response)
+