Merge branch 'main' into feat/maxDepthRelative

2024-06-14 11:49:12 -04:00 · 2024-06-14 11:49:12 -04:00 · 2c5f5c0ea2
commit 2c5f5c0ea2
parent 80c10393b4 52d6201c42
22 changed files with 824 additions and 128 deletions
--- a/apps/api/openapi.json
+++ b/apps/api/openapi.json
@ -61,6 +61,13 @@
                        "description": "Wait x amount of milliseconds for the page to load to fetch content",
                        "default": 0
                      },
+                      "removeTags": {
+                        "type": "array",
+                        "items": {
+                          "type": "string"
+                        },
+                        "description": "Tags, classes and ids to remove from the page, given as an array of selector strings. Example: ['script', '.ad', '#footer']"
+                      },
                      "headers": {
                        "type": "object",
                        "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
@ -194,6 +201,11 @@
                        "type": "integer",
                        "description": "Maximum number of pages to crawl",
                        "default": 10000
+                      },
+                      "allowBackwardCrawling": {
+                        "type": "boolean",
+                        "description": "Allow backward crawling (crawl from the base URL to the previous URLs)",
+                        "default": false
                      }
                    }
                  },
@ -219,6 +231,13 @@
                        "type": "object",
                        "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
                      },
+                      "removeTags": {
+                        "type": "array",
+                        "items": {
+                          "type": "string"
+                        },
+                        "description": "Tags, classes and ids to remove from the page, given as an array of selector strings. Example: ['script', '.ad', '#footer']"
+                      },
                      "replaceAllPathsWithAbsolutePaths": {
                        "type": "boolean",
                        "description": "Replace all relative paths with absolute paths for images and links",
@ -492,7 +511,7 @@
              "html": {
                "type": "string",
                "nullable": true,
-                "description": "Raw HTML content of the page if `includeHtml`  is true"
+                "description": "Raw HTML content of the page if `includeHtml` is true"
              },
              "metadata": {
                "type": "object",
@ -507,9 +526,126 @@
                    "type": "string",
                    "nullable": true
                  },
+                  "keywords": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "robots": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "ogTitle": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "ogDescription": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "ogUrl": {
+                    "type": "string",
+                    "format": "uri",
+                    "nullable": true
+                  },
+                  "ogImage": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "ogAudio": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "ogDeterminer": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "ogLocale": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "ogLocaleAlternate": {
+                    "type": "array",
+                    "items": {
+                      "type": "string"
+                    },
+                    "nullable": true
+                  },
+                  "ogSiteName": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "ogVideo": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "dctermsCreated": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "dcDateCreated": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "dcDate": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "dctermsType": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "dcType": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "dctermsAudience": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "dctermsSubject": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "dcSubject": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "dcDescription": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "dctermsKeywords": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "modifiedTime": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "publishedTime": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "articleTag": {
+                    "type": "string",
+                    "nullable": true
+                  },
+                  "articleSection": {
+                    "type": "string",
+                    "nullable": true
+                  },
                  "sourceURL": {
                    "type": "string",
                    "format": "uri"
+                  },
+                  "pageStatusCode": {
+                    "type": "integer",
+                    "description": "The status code of the page"
+                  },
+                  "pageError": {
+                    "type": "string",
+                    "nullable": true,
+                    "description": "The error message of the page"
                  }
                }
              },
@ -558,9 +694,126 @@
                "type": "string",
                "nullable": true
              },
+              "keywords": {
+                "type": "string",
+                "nullable": true
+              },
+              "robots": {
+                "type": "string",
+                "nullable": true
+              },
+              "ogTitle": {
+                "type": "string",
+                "nullable": true
+              },
+              "ogDescription": {
+                "type": "string",
+                "nullable": true
+              },
+              "ogUrl": {
+                "type": "string",
+                "format": "uri",
+                "nullable": true
+              },
+              "ogImage": {
+                "type": "string",
+                "nullable": true
+              },
+              "ogAudio": {
+                "type": "string",
+                "nullable": true
+              },
+              "ogDeterminer": {
+                "type": "string",
+                "nullable": true
+              },
+              "ogLocale": {
+                "type": "string",
+                "nullable": true
+              },
+              "ogLocaleAlternate": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                },
+                "nullable": true
+              },
+              "ogSiteName": {
+                "type": "string",
+                "nullable": true
+              },
+              "ogVideo": {
+                "type": "string",
+                "nullable": true
+              },
+              "dctermsCreated": {
+                "type": "string",
+                "nullable": true
+              },
+              "dcDateCreated": {
+                "type": "string",
+                "nullable": true
+              },
+              "dcDate": {
+                "type": "string",
+                "nullable": true
+              },
+              "dctermsType": {
+                "type": "string",
+                "nullable": true
+              },
+              "dcType": {
+                "type": "string",
+                "nullable": true
+              },
+              "dctermsAudience": {
+                "type": "string",
+                "nullable": true
+              },
+              "dctermsSubject": {
+                "type": "string",
+                "nullable": true
+              },
+              "dcSubject": {
+                "type": "string",
+                "nullable": true
+              },
+              "dcDescription": {
+                "type": "string",
+                "nullable": true
+              },
+              "dctermsKeywords": {
+                "type": "string",
+                "nullable": true
+              },
+              "modifiedTime": {
+                "type": "string",
+                "nullable": true
+              },
+              "publishedTime": {
+                "type": "string",
+                "nullable": true
+              },
+              "articleTag": {
+                "type": "string",
+                "nullable": true
+              },
+              "articleSection": {
+                "type": "string",
+                "nullable": true
+              },
              "sourceURL": {
                "type": "string",
                "format": "uri"
+              },
+              "pageStatusCode": {
+                "type": "integer",
+                "description": "The status code of the page"
+              },
+              "pageError": {
+                "type": "string",
+                "nullable": true,
+                "description": "The error message of the page"
              }
            }
          }
--- a/apps/api/src/tests/e2e_withAuth/index.test.ts
+++ b/apps/api/src/tests/e2e_withAuth/index.test.ts
@ -83,6 +83,8 @@ describe("E2E Tests for API Routes", () => {
      expect(response.body.data).toHaveProperty("metadata");
      expect(response.body.data).not.toHaveProperty("html");
      expect(response.body.data.content).toContain("_Roast_");
+      expect(response.body.data.metadata.pageStatusCode).toBe(200);
+      expect(response.body.data.metadata.pageError).toBeUndefined();
    }, 30000); // 30 seconds timeout

    it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => {
@ -103,6 +105,8 @@ describe("E2E Tests for API Routes", () => {
      expect(response.body.data.content).toContain("_Roast_");
      expect(response.body.data.markdown).toContain("_Roast_");
      expect(response.body.data.html).toContain("<h1");
+      expect(response.body.data.metadata.pageStatusCode).toBe(200);
+      expect(response.body.data.metadata.pageError).toBeUndefined();
    }, 30000); // 30 seconds timeout
    
   it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
@ -118,6 +122,8 @@ describe("E2E Tests for API Routes", () => {
      expect(response.body.data).toHaveProperty('content');
      expect(response.body.data).toHaveProperty('metadata');
      expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+      expect(response.body.data.metadata.pageStatusCode).toBe(200);
+      expect(response.body.data.metadata.pageError).toBeUndefined();
    }, 60000); // 60 seconds
  
    it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
@ -133,8 +139,59 @@ describe("E2E Tests for API Routes", () => {
      expect(response.body.data).toHaveProperty('content');
      expect(response.body.data).toHaveProperty('metadata');
      expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+      expect(response.body.data.metadata.pageStatusCode).toBe(200);
+      expect(response.body.data.metadata.pageError).toBeUndefined();
    }, 60000); // 60 seconds

+    it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1  7 Jan 1993)>>endobj');
+    }, 60000); // 60 seconds
+
+    it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
+      const responseWithoutRemoveTags = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/" });
+      expect(responseWithoutRemoveTags.statusCode).toBe(200);
+      expect(responseWithoutRemoveTags.body).toHaveProperty("data");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
+      expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
+      expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
+      expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
+
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data).not.toHaveProperty("html");
+      expect(response.body.data.content).toContain("Scrape This Site");
+      expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
+      expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
+      expect(response.body.data.content).not.toContain("web scraping"); // strong
+    }, 30000); // 30 seconds timeout
+
    // TODO: add this test back once we nail the waitFor option to be more deterministic
    // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
    //   const startTime = Date.now();
@ -155,6 +212,102 @@ describe("E2E Tests for API Routes", () => {
    //   expect(response.body.data.content).toContain("🔥 Firecrawl");
    //   expect(duration).toBeGreaterThanOrEqual(7000);
    // }, 12000); // 12 seconds timeout
+
+    it.concurrent('should return a successful response for a scrape with 400 page', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/400' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(400);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request");
+    }, 60000); // 60 seconds
+
+    it.concurrent('should return a successful response for a scrape with 401 page', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/401' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(401);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized");
+    }, 60000); // 60 seconds
+
+    it.concurrent("should return a successful response for a scrape with 403 page", async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/403' });
+
+      await new Promise((r) => setTimeout(r, 5000));
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(403);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden");
+    }, 60000); // 60 seconds
+
+    it.concurrent('should return a successful response for a scrape with 404 page', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/404' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(404);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found");
+    }, 60000); // 60 seconds
+
+    it.concurrent('should return a successful response for a scrape with 405 page', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/405' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(405);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("method not allowed");
+    }, 60000); // 60 seconds
+
+    it.concurrent('should return a successful response for a scrape with 500 page', async () => {
+      const response = await request(TEST_URL)
+        .post('/v0/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send({ url: 'https://httpstat.us/500' });
+      await new Promise((r) => setTimeout(r, 5000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      expect(response.body.data).toHaveProperty('content');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.metadata.pageStatusCode).toBe(500);
+      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("internal server error");
+    }, 60000); // 60 seconds
  });

  describe("POST /v0/crawl", () => {
@ -270,6 +423,8 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
      expect(completedResponse.body.data[0].content).toContain("Mendable");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
    }, 60000); // 60 seconds

    it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => {
@ -351,6 +506,8 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
      expect(completedResponse.body.data[0].content).toContain("Mendable");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
    }, 60000); // 60 seconds
  
    it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
@ -393,6 +550,8 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("content");
      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
      const urls = completedResponse.body.data.map(
        (item: any) => item.metadata?.sourceURL
      );
@ -651,6 +810,8 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("content");
      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();

      // 120 seconds
      expect(completedResponse.body.data[0]).toHaveProperty("html");
@ -658,7 +819,11 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0].content).toContain("_Roast_");
      expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
      expect(completedResponse.body.data[0].html).toContain("<h1");
+
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
    }, 180000);
+
  });

  describe("POST /v0/crawlWebsitePreview", () => {
@ -792,6 +957,8 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
      expect(completedResponse.body.data[0].content).toContain("Mendable");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();

      const childrenLinks = completedResponse.body.data.filter(doc => 
        doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
@ -835,8 +1002,13 @@ describe("E2E Tests for API Routes", () => {
            })
          ])
        );
+
+        expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+        expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+        expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
    }, 180000); // 180 seconds

+
    it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
      const crawlResponse = await request(TEST_URL)
        .post("/v0/crawl")
@ -870,6 +1042,9 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0]).toHaveProperty("content");
      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
+
      const urls = completedResponse.body.data.map(
        (item: any) => item.metadata?.sourceURL
      );
@ -930,6 +1105,8 @@ describe("E2E Tests for API Routes", () => {
      expect(completedResponse.body.data[0].content).toContain("_Roast_");
      expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
      expect(completedResponse.body.data[0].html).toContain("<h1");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
    }, 60000);
  }); // 60 seconds

@ -973,6 +1150,8 @@ describe("E2E Tests for API Routes", () => {
    expect(completedResponse.body.data[0]).toHaveProperty("html");
    expect(completedResponse.body.data[0].content).toContain("Mendable");
    expect(completedResponse.body.data[0].markdown).toContain("Mendable");
+    expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+    expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();

    const onlyChildrenLinks = completedResponse.body.data.filter(doc => {
      return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
@ -1013,7 +1192,8 @@ describe("E2E Tests for API Routes", () => {
    expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
    expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
    expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata");
-    
+    expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200);
+    expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined();
  }, 60000); // 60 seconds

  describe("POST /v0/scrape with LLM Extraction", () => {
@ -1168,6 +1348,10 @@ describe("E2E Tests for API Routes", () => {
      expect(statusResponse.body).toHaveProperty("data");
      expect(statusResponse.body.data[0]).toHaveProperty("content");
      expect(statusResponse.body.data[0]).toHaveProperty("markdown");
+      expect(statusResponse.body.data[0]).toHaveProperty("metadata");
+      expect(statusResponse.body.data[0].metadata.pageStatusCode).toBe(200);
+      expect(statusResponse.body.data[0].metadata.pageError).toBeUndefined();
+
      const results = statusResponse.body.data;
      // results.forEach((result, i) => {
      //   console.log(result.metadata.sourceURL);
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@ -55,8 +55,16 @@ export async function crawlController(req: Request, res: Response) {
    }

    const mode = req.body.mode ?? "crawl";
-    const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+
+    const crawlerOptions = req.body.crawlerOptions ?? {
+      allowBackwardCrawling: false
+    };
+    const pageOptions = req.body.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      removeTags: [],
+      parsePDF: true
+    };

    if (mode === "single_urls" && !url.includes(",")) {
      try {
--- a/apps/api/src/controllers/crawlPreview.ts
+++ b/apps/api/src/controllers/crawlPreview.ts
@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {

    const mode = req.body.mode ?? "crawl";
    const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };

    const job = await addWebScraperJob({
      url: url,
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@ -61,7 +61,7 @@ export async function scrapeHelper(
    (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
  );
  if (filteredDocs.length === 0) {
-    return { success: true, error: "No page found", returnCode: 200 };
+    return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
  }

  let creditsToBeBilled = filteredDocs.length;
@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
      return res.status(status).json({ error });
    }
    const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
+    const pageOptions = req.body.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      waitFor: 0,
+      screenshot: false,
+      parsePDF: true
+    };
    const extractorOptions = req.body.extractorOptions ?? {
      mode: "markdown"
    }
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@ -85,6 +85,7 @@ export async function searchHelper(
      onlyMainContent: pageOptions?.onlyMainContent ?? true,
      fetchPageContent: pageOptions?.fetchPageContent ?? true,
      includeHtml: pageOptions?.includeHtml ?? false,
+      removeTags: pageOptions?.removeTags ?? [],
      fallback: false,
    },
  });
@ -100,7 +101,7 @@ export async function searchHelper(
  );

  if (filteredDocs.length === 0) {
-    return { success: true, error: "No page found", returnCode: 200 };
+    return { success: true, error: "No page found", returnCode: 200, data: docs };
  }

  const billingResult = await billTeam(
@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) {
      includeHtml: false,
      onlyMainContent: true,
      fetchPageContent: true,
+      removeTags: [],
      fallback: false,
    };
    const origin = req.body.origin ?? "api";
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@ -19,6 +19,8 @@ export type PageOptions = {
  screenshot?: boolean;
  headers?: Record<string, string>;
  replaceAllPathsWithAbsolutePaths?: boolean;
+  parsePDF?: boolean;
+  removeTags?: string | string[];
 };

 export type ExtractorOptions = {
@ -119,4 +121,7 @@ export class SearchResult {
 export interface FireEngineResponse {
  html: string;
  screenshot: string;
-}
+  pageStatusCode?: number;
+  pageError?: string;
+}
+
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@ -223,7 +223,7 @@ export class WebCrawler {
    return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
  }

-  async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
+  async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
    const normalizedUrl = this.normalizeCrawlUrl(url);
    if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
      return [];
@ -243,20 +243,27 @@ export class WebCrawler {

    try {
      let content: string = "";
+      let pageStatusCode: number;
+      let pageError: string | undefined = undefined;
+
      // If it is the first link, fetch with single url
      if (this.visited.size === 1) {
        const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
        content = page.html ?? "";
+        pageStatusCode = page.metadata?.pageStatusCode;
+        pageError = page.metadata?.pageError || undefined;
      } else {
        const response = await axios.get(url);
        content = response.data ?? "";
+        pageStatusCode = response.status;
+        pageError = response.statusText != "OK" ? response.statusText : undefined;
      }
      const $ = load(content);
-      let links: { url: string, html: string }[] = [];
+      let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];

      // Add the initial URL to the list of links
      if (this.visited.size === 1) {
-        links.push({ url, html: content });
+        links.push({ url, html: content, pageStatusCode, pageError });
      }

      $("a").each((_, element) => {
@ -278,7 +285,7 @@ export class WebCrawler {
            !this.matchesExcludes(path) &&
            this.robots.isAllowed(fullUrl, "FireCrawlAgent")
          ) {
-            links.push({ url: fullUrl, html: content });
+            links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
          }
        }
      });
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@ -1,5 +1,3 @@
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-
 export async function handleCustomScraping(
  text: string,
  url: string
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@ -245,7 +245,7 @@ export class WebScraperDataProvider {
      content: "",
      html: this.pageOptions?.includeHtml ? "" : undefined,
      markdown: "",
-      metadata: { sourceURL: url },
+      metadata: { sourceURL: url, pageStatusCode: 200 },
    }));
  }

@ -284,10 +284,10 @@ export class WebScraperDataProvider {
  private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
    return Promise.all(
      pdfLinks.map(async (pdfLink) => {
-        const pdfContent = await fetchAndProcessPdf(pdfLink);
+        const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
        return {
-          content: pdfContent,
-          metadata: { sourceURL: pdfLink },
+          content: content,
+          metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
          provider: "web-scraper",
        };
      })
@ -296,10 +296,10 @@ export class WebScraperDataProvider {
  private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
    return Promise.all(
      docxLinks.map(async (p) => {
-        const docXDocument = await fetchAndProcessDocx(p);
+        const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(p);
        return {
-          content: docXDocument,
-          metadata: { sourceURL: p },
+          content,
+          metadata: { sourceURL: p, pageStatusCode, pageError },
          provider: "web-scraper",
        };
      })
@ -479,7 +479,13 @@ export class WebScraperDataProvider {
    this.limit = options.crawlerOptions?.limit ?? 10000;
    this.generateImgAltText =
      options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
+    this.pageOptions = options.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      replaceAllPathsWithAbsolutePaths: false,
+      parsePDF: true,
+      removeTags: []
+    };
    this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
    this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
    //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
  url: string,
  waitFor: number = 0,
  screenshot: boolean = false,
-  pageOptions: { scrollXPaths?: string[] } = {},
+  pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
  headers?: Record<string, string>,
  options?: any
 ): Promise<FireEngineResponse> {
@ -83,17 +83,18 @@ export async function scrapWithFireEngine(
      console.error(
        `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
      );
-      return { html: "", screenshot: "" };
+      return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
    }

    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return { html: await fetchAndProcessPdf(url), screenshot: "" };
+      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      return { html: content, screenshot: "", pageStatusCode, pageError };
    } else {
      const data = response.data;
      const html = data.content;
      const screenshot = data.screenshot;
-      return { html: html ?? "", screenshot: screenshot ?? "" };
+      return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
    }
  } catch (error) {
    if (error.code === 'ECONNABORTED') {
@ -108,44 +109,51 @@ export async function scrapWithFireEngine(
 export async function scrapWithScrapingBee(
  url: string,
  wait_browser: string = "domcontentloaded",
-  timeout: number = universalTimeout
-): Promise<string> {
+  timeout: number = universalTimeout,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+  // Fetches `url` through ScrapingBee and returns the page body together with
+  // the HTTP status code/text of the response. Failures are logged and reported
+  // as an empty `content` rather than thrown.
  try {
    const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
    const clientParams = await generateRequestParams(
      url,
      wait_browser,
-      timeout
+      timeout,
    );

-    const response = await client.get(clientParams);
-
-    if (response.status !== 200 && response.status !== 404) {
-      console.error(
-        `[ScrapingBee] Error fetching url: ${url} with status code ${response.status}`
-      );
-      return "";
-    }
+    // NOTE(review): `transparent_status_code` presumably makes ScrapingBee relay
+    // the target page's own status code - confirm against the ScrapingBee docs.
+    const response = await client.get({
+      ...clientParams,
+      params: {
+        ...clientParams.params,
+        'transparent_status_code': 'True'
+      }
+    });

    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+
    } else {
-      const decoder = new TextDecoder();
-      const text = decoder.decode(response.data);
-      return text;
+      let text = "";
+      try {
+        const decoder = new TextDecoder();
+        text = decoder.decode(response.data);
+      } catch (decodeError) {
+        console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`);
+      }
+      return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined };
    }
  } catch (error) {
    console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
-    return "";
+    // Not every error caught here carries a `response` (e.g. one thrown before
+    // any reply was received). Reading `error.response.status` directly would
+    // throw from inside this handler and reject the promise, so use optional
+    // chaining and leave the status fields undefined in that case.
+    return { content: "", pageStatusCode: error.response?.status, pageError: error.response?.statusText };
  }
 }

 export async function scrapWithPlaywright(
  url: string,
  waitFor: number = 0,
-  headers?: Record<string, string>
-): Promise<string> {
+  headers?: Record<string, string>,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
  try {
    const reqParams = await generateRequestParams(url);
    // If the user has passed a wait parameter in the request, use that
@ -167,21 +175,21 @@ export async function scrapWithPlaywright(
      console.error(
        `[Playwright] Error fetching url: ${url} with status: ${response.status}`
      );
-      return "";
+      return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
    }

    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
    } else {
      const textData = response.data;
      try {
        const data = JSON.parse(textData);
        const html = data.content;
-        return html ?? "";
+        return { content: html ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
      } catch (jsonError) {
        console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`);
-        return "";
+        return { content: "" };
      }
    }
  } catch (error) {
@ -190,11 +198,14 @@ export async function scrapWithPlaywright(
    } else {
      console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
    }
-    return "";
+    return { content: "" };
  }
 }

-export async function scrapWithFetch(url: string): Promise<string> {
+export async function scrapWithFetch(
+  url: string,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
  try {
    const response = await axios.get(url, {
      headers: {
@ -208,15 +219,15 @@ export async function scrapWithFetch(url: string): Promise<string> {
      console.error(
        `[Axios] Error fetching url: ${url} with status: ${response.status}`
      );
-      return "";
+      return { content: "", pageStatusCode: response.status, pageError: response.statusText };
    }

    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
    } else {
      const text = response.data;
-      return text;
+      return { content: text, pageStatusCode: 200 };
    }
  } catch (error) {
    if (error.code === 'ECONNABORTED') {
@ -224,7 +235,7 @@ export async function scrapWithFetch(url: string): Promise<string> {
    } else {
      console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
    }
-    return "";
+    return { content: "" };
  }
 }

@ -304,6 +315,19 @@ export async function scrapSingleUrl(
  const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
    const soup = cheerio.load(html);
    soup("script, style, iframe, noscript, meta, head").remove();
+
+    if (pageOptions.removeTags) {
+      if (typeof pageOptions.removeTags === 'string') {
+        pageOptions.removeTags.split(',').forEach((tag) => {
+          soup(tag.trim()).remove();
+        });
+      } else if (Array.isArray(pageOptions.removeTags)) {
+        pageOptions.removeTags.forEach((tag) => {
+          soup(tag).remove();
+        });
+      }
+    }
+    
    if (pageOptions.onlyMainContent) {
      // remove any other tags that are not in the main content
      excludeNonMainTags.forEach((tag) => {
@ -317,7 +341,7 @@ export async function scrapSingleUrl(
    url: string,
    method: (typeof baseScrapers)[number]
  ) => {
-    let text = "";
+    let scraperResponse: { text: string, screenshot: string, metadata: { pageStatusCode?: number, pageError?: string | null } } = { text: "", screenshot: "", metadata: {} };
    let screenshot = "";
    switch (method) {
      case "fire-engine":
@ -329,38 +353,52 @@ export async function scrapSingleUrl(
            pageOptions.screenshot,
            pageOptions.headers
          );
-          text = response.html;
-          screenshot = response.screenshot;
+          scraperResponse.text = response.html;
+          scraperResponse.screenshot = response.screenshot;
+          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      case "scrapingBee":
        if (process.env.SCRAPING_BEE_API_KEY) {
-          text = await scrapWithScrapingBee(
+          const response = await scrapWithScrapingBee(
            url,
            "domcontentloaded",
            pageOptions.fallback === false ? 7000 : 15000
          );
+          scraperResponse.text = response.content;
+          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      case "playwright":
        if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
-          text = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
+          const response = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
+          scraperResponse.text = response.content;
+          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      case "scrapingBeeLoad":
        if (process.env.SCRAPING_BEE_API_KEY) {
-          text = await scrapWithScrapingBee(url, "networkidle2");
+          const response = await scrapWithScrapingBee(url, "networkidle2");
+          scraperResponse.text = response.content;
+          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      case "fetch":
-        text = await scrapWithFetch(url);
+        const response = await scrapWithFetch(url);
+        scraperResponse.text = response.content;
+        scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
+        scraperResponse.metadata.pageError = response.pageError;
        break;
    }

    let customScrapedContent : FireEngineResponse | null = null;

    // Check for custom scraping conditions
-    const customScraperResult = await handleCustomScraping(text, url);
+    const customScraperResult = await handleCustomScraping(scraperResponse.text, url);

    if (customScraperResult){
      switch (customScraperResult.scraper) {
@ -371,23 +409,30 @@ export async function scrapSingleUrl(
          }
          break;
        case "pdf":
-          customScrapedContent  = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
+          const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF);
+          customScrapedContent  = { html: content, screenshot, pageStatusCode, pageError }
          break;
      }
    }

    if (customScrapedContent) {
-      text = customScrapedContent.html;
+      scraperResponse.text = customScrapedContent.html;
      screenshot = customScrapedContent.screenshot;
    }

    //* TODO: add an optional to return markdown or structured/extracted content
-    let cleanedHtml = removeUnwantedElements(text, pageOptions);
+    let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);

-    return [await parseMarkdown(cleanedHtml), text, screenshot];
+    return {
+      text: await parseMarkdown(cleanedHtml),
+      html: scraperResponse.text,
+      screenshot: scraperResponse.screenshot,
+      pageStatusCode: scraperResponse.metadata.pageStatusCode,
+      pageError: scraperResponse.metadata.pageError || undefined
+    };
  };
+  let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
  try {
-    let [text, html, screenshot] = ["", "", ""];
    let urlKey = urlToScrap;
    try {
      urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
@ -410,8 +455,21 @@ export async function scrapSingleUrl(
        html = existingHtml;
        break;
      }
-      [text, html, screenshot] = await attemptScraping(urlToScrap, scraper);
+
+      const attempt = await attemptScraping(urlToScrap, scraper);
+      text = attempt.text ?? '';
+      html = attempt.html ?? '';
+      screenshot = attempt.screenshot ?? '';
+      if (attempt.pageStatusCode) {
+        pageStatusCode = attempt.pageStatusCode;
+      }
+      if (attempt.pageError) {
+        pageError = attempt.pageError;
+      }
+      
+      
      if (text && text.trim().length >= 100) break;
+      if (pageStatusCode && pageStatusCode == 404) break;
      const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
      if (nextScraperIndex < scrapersInOrder.length) {
        console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
@ -435,6 +493,8 @@ export async function scrapSingleUrl(
          ...metadata,
          screenshot: screenshot,
          sourceURL: urlToScrap,
+          pageStatusCode: pageStatusCode,
+          pageError: pageError
        },
      };
    } else {
@ -442,7 +502,12 @@ export async function scrapSingleUrl(
        content: text,
        markdown: text,
        html: pageOptions.includeHtml ? html : undefined,
-        metadata: { ...metadata, sourceURL: urlToScrap },
+        metadata: {
+          ...metadata,
+          sourceURL: urlToScrap,
+          pageStatusCode: pageStatusCode,
+          pageError: pageError
+        },
      };
    }

@ -453,7 +518,11 @@ export async function scrapSingleUrl(
      content: "",
      markdown: "",
      html: "",
-      metadata: { sourceURL: urlToScrap },
+      metadata: {
+        sourceURL: urlToScrap,
+        pageStatusCode: pageStatusCode,
+        pageError: pageError
+      },
    } as Document;
  }
 }
--- a/apps/api/src/scraper/WebScraper/utils/tests/docxProcessor.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/tests/docxProcessor.test.ts
@ -3,11 +3,13 @@ import * as docxProcessor from "../docxProcessor";
 describe("DOCX Processing Module - Integration Test", () => {
  it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
+    // Unset the key so the run matches the "without the LLAMAPARSE_API_KEY"
+    // scenario named in the test title.
    delete process.env.LLAMAPARSE_API_KEY;
-    const docxContent = await docxProcessor.fetchAndProcessDocx(
+    // Integration test: this downloads a real DOCX from the URL below.
+    const { content, pageStatusCode, pageError } = await docxProcessor.fetchAndProcessDocx(
      "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
    );
-    expect(docxContent.trim()).toContain(
+    expect(content.trim()).toContain(
      "SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
    );
+    // A successful download reports status 200 and no error text.
+    expect(pageStatusCode).toBe(200);
+    expect(pageError).toBeUndefined();
  });
 });
--- a/apps/api/src/scraper/WebScraper/utils/tests/pdfProcessor.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/tests/pdfProcessor.test.ts
@ -3,8 +3,10 @@ import * as pdfProcessor from '../pdfProcessor';
 describe('PDF Processing Module - Integration Test', () => {
  it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
    delete process.env.LLAMAPARSE_API_KEY;
-    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
-    expect(pdfContent.trim()).toEqual("Dummy PDF file");
+    const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
+    expect(content.trim()).toEqual("Dummy PDF file");
+    expect(pageStatusCode).toEqual(200);
+    expect(pageError).toBeUndefined();
  });

 // We're hitting the LLAMAPARSE rate limit 🫠
--- a/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
@ -5,14 +5,14 @@ import path from "path";
 import os from "os";
 import mammoth from "mammoth";

-export async function fetchAndProcessDocx(url: string): Promise<string> {
-  const tempFilePath = await downloadDocx(url);
-  const content = await processDocxToText(tempFilePath);
-  fs.unlinkSync(tempFilePath); // Clean up the temporary file
-  return content;
+/**
+ * Downloads the DOCX at `url` to a temporary file, extracts its text and
+ * returns it together with the HTTP status of the download.
+ *
+ * @param url Location of the DOCX document.
+ * @returns `content` (extracted text), `pageStatusCode` (download status) and
+ *   `pageError` (status text, left undefined when the status text is "OK").
+ */
+export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError?: string }> {
+  const { tempFilePath, pageStatusCode, pageError } = await downloadDocx(url);
+  try {
+    const content = await processDocxToText(tempFilePath);
+    return { content, pageStatusCode, pageError };
+  } finally {
+    // Clean up the temporary file even when text extraction throws.
+    fs.unlinkSync(tempFilePath);
+  }
 }

-async function downloadDocx(url: string): Promise<string> {
+async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
  const response = await axios({
    url,
    method: "GET",
@ -25,7 +25,7 @@ async function downloadDocx(url: string): Promise<string> {
  response.data.pipe(writer);

  return new Promise((resolve, reject) => {
-    writer.on("finish", () => resolve(tempFilePath));
+    writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
    writer.on("error", reject);
  });
 }
--- a/apps/api/src/scraper/WebScraper/utils/metadata.ts
+++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts
@ -29,6 +29,9 @@ interface Metadata {
  publishedTime?: string;
  articleTag?: string;
  articleSection?: string;
+  sourceURL?: string;
+  pageStatusCode?: number;
+  pageError?: string;
 }

 export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
@ -61,6 +64,9 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
  let publishedTime: string | null = null;
  let articleTag: string | null = null;
  let articleSection: string | null = null;
+  let sourceURL: string | null = null;
+  let pageStatusCode: number | null = null;
+  let pageError: string | null = null;

  try {
    title = soup("title").text() || null;
@ -132,5 +138,8 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
    ...(publishedTime ? { publishedTime } : {}),
    ...(articleTag ? { articleTag } : {}),
    ...(articleSection ? { articleSection } : {}),
+    ...(sourceURL ? { sourceURL } : {}),
+    ...(pageStatusCode ? { pageStatusCode } : {}),
+    ...(pageError ? { pageError } : {}),
  };
 }
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@ -9,14 +9,14 @@ import os from "os";

 dotenv.config();

-export async function fetchAndProcessPdf(url: string): Promise<string> {
-  const tempFilePath = await downloadPdf(url);
-  const content = await processPdfToText(tempFilePath);
-  fs.unlinkSync(tempFilePath); // Clean up the temporary file
-  return content;
+/**
+ * Downloads the PDF at `url` to a temporary file, converts it to text and
+ * returns it together with the HTTP status of the download.
+ *
+ * @param url Location of the PDF document.
+ * @param parsePDF Whether to extract text from the PDF. Defaults to `true`, so
+ *   callers forwarding an unset `pageOptions.parsePDF` (undefined) still get
+ *   parsed text instead of the raw file contents.
+ * @returns `content`, plus `pageStatusCode` / `pageError` from the download.
+ */
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean = true): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+  const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
+  try {
+    const content = await processPdfToText(tempFilePath, parsePDF);
+    return { content, pageStatusCode, pageError };
+  } finally {
+    // Clean up the temporary file even when processing throws.
+    fs.unlinkSync(tempFilePath);
+  }
 }

-async function downloadPdf(url: string): Promise<string> {
+async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> {
  const response = await axios({
    url,
    method: "GET",
@ -29,15 +29,15 @@ async function downloadPdf(url: string): Promise<string> {
  response.data.pipe(writer);

  return new Promise((resolve, reject) => {
-    writer.on("finish", () => resolve(tempFilePath));
+    writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
    writer.on("error", reject);
  });
 }

-export async function processPdfToText(filePath: string): Promise<string> {
+export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
  let content = "";

-  if (process.env.LLAMAPARSE_API_KEY) {
+  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
    const apiKey = process.env.LLAMAPARSE_API_KEY;
    const headers = {
      Authorization: `Bearer ${apiKey}`,
@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
      console.error("Error processing pdf document w/ LlamaIndex(2)");
      content = await processPdf(filePath);
    }
-  } else {
+  } else if (parsePDF) {
    content = await processPdf(filePath);
+  } else {
+    content = fs.readFileSync(filePath, "utf-8");
  }
  return content;
 }
--- a/apps/playwright-service/get_error.py
+++ b/apps/playwright-service/get_error.py
@ -0,0 +1,63 @@
+from typing import Optional
+
+# Reason phrases for redirect, client-error and server-error status codes.
+# Only registered HTTP status codes are listed, plus the unofficial 599;
+# any other code >= 300 is reported as "Unknown Error" by get_error().
+_ERROR_MESSAGES = {
+    300: "Multiple Choices",
+    301: "Moved Permanently",
+    302: "Found",
+    303: "See Other",
+    304: "Not Modified",
+    305: "Use Proxy",
+    307: "Temporary Redirect",
+    308: "Permanent Redirect",
+    400: "Bad Request",
+    401: "Unauthorized",
+    402: "Payment Required",
+    403: "Forbidden",
+    404: "Not Found",
+    405: "Method Not Allowed",
+    406: "Not Acceptable",
+    407: "Proxy Authentication Required",
+    408: "Request Timeout",
+    409: "Conflict",
+    410: "Gone",
+    411: "Length Required",
+    412: "Precondition Failed",
+    413: "Payload Too Large",
+    414: "URI Too Long",
+    415: "Unsupported Media Type",
+    416: "Range Not Satisfiable",
+    417: "Expectation Failed",
+    418: "I'm a teapot",
+    421: "Misdirected Request",
+    422: "Unprocessable Entity",
+    423: "Locked",
+    424: "Failed Dependency",
+    425: "Too Early",
+    426: "Upgrade Required",
+    428: "Precondition Required",
+    429: "Too Many Requests",
+    431: "Request Header Fields Too Large",
+    451: "Unavailable For Legal Reasons",
+    500: "Internal Server Error",
+    501: "Not Implemented",
+    502: "Bad Gateway",
+    503: "Service Unavailable",
+    504: "Gateway Timeout",
+    505: "HTTP Version Not Supported",
+    506: "Variant Also Negotiates",
+    507: "Insufficient Storage",
+    508: "Loop Detected",
+    510: "Not Extended",
+    511: "Network Authentication Required",
+    599: "Network Connect Timeout Error",
+}
+
+
+def get_error(status_code: int) -> Optional[str]:
+    """Return a human-readable error message for an HTTP status code.
+
+    Args:
+        status_code: The HTTP status code of a response.
+
+    Returns:
+        ``None`` when ``status_code`` is below 300 (not an error), the
+        matching reason phrase for a known code, or ``"Unknown Error"`` for
+        any other code of 300 or above.
+    """
+    if status_code < 300:
+        return None
+    return _ERROR_MESSAGES.get(status_code, "Unknown Error")
--- a/apps/playwright-service/main.py
+++ b/apps/playwright-service/main.py
@ -9,6 +9,7 @@ from fastapi import FastAPI
 from fastapi.responses import JSONResponse
 from playwright.async_api import Browser, async_playwright
 from pydantic import BaseModel
+from get_error import get_error

 PROXY_SERVER = environ.get("PROXY_SERVER", None)
 PROXY_USERNAME = environ.get("PROXY_USERNAME", None)
@ -73,16 +74,22 @@ async def root(body: UrlModel):
    if body.headers:
        await page.set_extra_http_headers(body.headers)

-    await page.goto(
+    response = await page.goto(
        body.url,
        wait_until="load",
        timeout=body.timeout,
    )
+    page_status_code = response.status
+    page_error = get_error(page_status_code)
    # Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases were "load" / "networkidle" is not enough
    if body.wait_after_load > 0:
        await page.wait_for_timeout(body.wait_after_load)

    page_content = await page.content()
    await context.close()
-    json_compatible_item_data = {"content": page_content}
-    return JSONResponse(content=json_compatible_item_data)
+    json_compatible_item_data = {
+        "content": page_content,
+        "pageStatusCode": page_status_code,
+        "pageError": page_error
+      }
+    return JSONResponse(content=json_compatible_item_data)
--- a/apps/python-sdk/firecrawl/init.py
+++ b/apps/python-sdk/firecrawl/init.py
@ -1,3 +1,57 @@
+"""
+This is the Firecrawl package.
+
+This package provides a Python SDK for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs.
+
+For more information visit https://github.com/firecrawl/
+"""
+
+import logging
+import os
+
 from .firecrawl import FirecrawlApp

-__version__ = "0.0.14"
+__version__ = "0.0.16"
+
+# Define the logger for the Firecrawl project
+logger: logging.Logger = logging.getLogger("firecrawl")
+
+
+def _basic_config() -> None:
+    """Apply the package's log line and timestamp formats via ``logging.basicConfig``.
+
+    Any exception raised while configuring is reported through the package
+    logger instead of propagating.
+    """
+    line_format = "[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s"
+    timestamp_format = "%Y-%m-%d %H:%M:%S"
+    try:
+        logging.basicConfig(format=line_format, datefmt=timestamp_format)
+    except Exception as exc:
+        logger.error("Failed to configure logging: %s", exc)
+
+
+def setup_logging() -> None:
+    """Set the package logger's level from ``FIRECRAWL_LOGGING_LEVEL``.
+
+    The variable is upper-cased before matching and defaults to ``INFO`` when
+    unset. A value that is not one of DEBUG, INFO, WARNING, ERROR or CRITICAL
+    also selects ``INFO`` and emits a warning naming the rejected value.
+    """
+    known_levels = {
+        "DEBUG": logging.DEBUG,
+        "INFO": logging.INFO,
+        "WARNING": logging.WARNING,
+        "ERROR": logging.ERROR,
+        "CRITICAL": logging.CRITICAL,
+    }
+    requested = os.environ.get("FIRECRAWL_LOGGING_LEVEL", "INFO").upper()
+    _basic_config()
+
+    if requested in known_levels:
+        logger.setLevel(known_levels[requested])
+        return
+
+    logger.setLevel(logging.INFO)
+    logger.warning("Unknown logging level: %s, defaulting to INFO", requested)
+
+
+# Initialize logging configuration when the module is imported
+setup_logging()
+logger.debug("Debugging logger setup")
--- a/apps/python-sdk/firecrawl/tests/e2e_withAuth/pycache/test.cpython-311-pytest-8.2.1.pyc
+++ b/apps/python-sdk/firecrawl/tests/e2e_withAuth/pycache/test.cpython-311-pytest-8.2.1.pyc
--- a/apps/python-sdk/firecrawl/tests/e2e_withAuth/test.py
+++ b/apps/python-sdk/firecrawl/tests/e2e_withAuth/test.py
@ -27,14 +27,14 @@ def test_scrape_url_invalid_api_key():
    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as excinfo:
        invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Failed to scrape URL. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

 def test_blocklisted_url():
+    """Scraping a blocklisted URL must raise with the 403 status and the policy message."""
    blocklisted_url = "https://facebook.com/fake-test"
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    with pytest.raises(Exception) as excinfo:
        app.scrape_url(blocklisted_url)
-    assert "Failed to scrape URL. Status code: 403" in str(excinfo.value)
+    assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)

 def test_successful_response_with_valid_preview_token():
    app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
@ -86,14 +86,14 @@ def test_crawl_url_invalid_api_key():
    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as excinfo:
        invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unexpected error occurred while trying to start crawl job. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

 def test_should_return_error_for_blocklisted_url():
+    """Starting a crawl on a blocklisted URL must raise with the 403 status and the policy message."""
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    blocklisted_url = "https://twitter.com/fake-test"
    with pytest.raises(Exception) as excinfo:
        app.crawl_url(blocklisted_url)
-    assert "Unexpected error occurred while trying to start crawl job. Status code: 403" in str(excinfo.value)
+    assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)

 def test_crawl_url_wait_for_completion_e2e():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@ -114,7 +114,7 @@ def test_crawl_url_with_idempotency_key_e2e():

    with pytest.raises(Exception) as excinfo:
        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert "Failed to start crawl job. Status code: 409. Error: Idempotency key already used" in str(excinfo.value) 
+    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value) 

 def test_check_crawl_status_e2e():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@ -141,7 +141,7 @@ def test_search_invalid_api_key():
    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as excinfo:
        invalid_app.search("test query")
-    assert "Failed to search. Status code: 401" in str(excinfo.value)
+    assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

 def test_llm_extraction():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@ -9,13 +9,14 @@ and handles retries for certain HTTP status codes.
 Classes:
    - FirecrawlApp: Main class for interacting with the Firecrawl API.
 """
-
+import logging
 import os
 import time
 from typing import Any, Dict, Optional

 import requests

+logger : logging.Logger = logging.getLogger("firecrawl")

 class FirecrawlApp:
    """
@ -28,8 +29,15 @@ class FirecrawlApp:
    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
+        """
+        Initialize the client with an API key and a base URL.
+
+        Args:
+            api_key (Optional[str]): Key sent as the bearer token; falls back to
+                the FIRECRAWL_API_KEY environment variable.
+            api_url (Optional[str]): Base URL of the API; falls back to the
+                FIRECRAWL_API_URL environment variable, then to
+                https://api.firecrawl.dev.
+
+        Raises:
+            ValueError: If no API key is given and FIRECRAWL_API_KEY is not set.
+        """
        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
        if self.api_key is None:
+            logger.warning("No API key provided")
            raise ValueError('No API key provided')
+        else:
+            # The key is a credential: record that one was supplied, never its value,
+            # so debug logs can be shared without leaking the secret.
+            logger.debug("Initialized FirecrawlApp with API key: %s", "<redacted>")
+
        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
+        if self.api_url != 'https://api.firecrawl.dev':
+            # Only a non-default endpoint is worth noting in the logs.
+            logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)
+
    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
        """
        Scrape the specified URL using the Firecrawl API.
@ -45,10 +53,8 @@ class FirecrawlApp:
            Exception: If the scrape request fails.
        """

-        headers = {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+        headers = self._prepare_headers()
+
        # Prepare the base scrape parameters with the URL
        scrape_params = {'url': url}

@ -81,13 +87,10 @@ class FirecrawlApp:
                return response['data']
            else:
                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
-        elif response.status_code in [402, 408, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
        else:
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
+            self._handle_error(response, 'scrape URL')

-    def search(self, query, params=None):
+    def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
        """
        Perform a search using the Firecrawl API.

@ -101,10 +104,7 @@ class FirecrawlApp:
        Raises:
            Exception: If the search request fails.
        """
-        headers = {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+        headers = self._prepare_headers()
        json_data = {'query': query}
        if params:
            json_data.update(params)
@ -121,13 +121,14 @@ class FirecrawlApp:
            else:
                raise Exception(f'Failed to search. Error: {response["error"]}')

-        elif response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
        else:
-            raise Exception(f'Failed to search. Status code: {response.status_code}')
+            self._handle_error(response, 'search')

-    def crawl_url(self, url, params=None, wait_until_done=True, poll_interval=2, idempotency_key=None):
+    def crawl_url(self, url: str,
+                  params: Optional[Dict[str, Any]] = None,
+                  wait_until_done: bool = True,
+                  poll_interval: int = 2,
+                  idempotency_key: Optional[str] = None) -> Any:
        """
        Initiate a crawl job for the specified URL using the Firecrawl API.

@ -158,7 +159,7 @@ class FirecrawlApp:
        else:
            self._handle_error(response, 'start crawl job')

-    def check_crawl_status(self, job_id):
+    def check_crawl_status(self, job_id: str) -> Any:
        """
        Check the status of a crawl job using the Firecrawl API.

@ -178,7 +179,7 @@ class FirecrawlApp:
        else:
            self._handle_error(response, 'check crawl status')

-    def _prepare_headers(self, idempotency_key=None):
+    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
        """
        Prepare the headers for API requests.

@ -200,7 +201,11 @@ class FirecrawlApp:
            'Authorization': f'Bearer {self.api_key}',
        }

-    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+    def _post_request(self, url: str,
+                      data: Dict[str, Any],
+                      headers: Dict[str, str],
+                      retries: int = 3,
+                      backoff_factor: float = 0.5) -> requests.Response:
        """
        Make a POST request with retries.

@ -225,7 +230,10 @@ class FirecrawlApp:
                return response
        return response

-    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+    def _get_request(self, url: str,
+                     headers: Dict[str, str],
+                     retries: int = 3,
+                     backoff_factor: float = 0.5) -> requests.Response:
        """
        Make a GET request with retries.

@ -249,7 +257,7 @@ class FirecrawlApp:
                return response
        return response

-    def _monitor_job_status(self, job_id, headers, poll_interval):
+    def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any:
        """
        Monitor the status of a crawl job until completion.

@ -281,7 +289,7 @@ class FirecrawlApp:
            else:
                self._handle_error(status_response, 'check crawl status')

-    def _handle_error(self, response, action):
+    def _handle_error(self, response: requests.Response, action: str) -> None:
        """
        Handle errors from API responses.

@ -292,8 +300,19 @@ class FirecrawlApp:
        Raises:
            Exception: An exception with a message containing the status code and error details from the response.
        """
-        if response.status_code in [402, 408, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
+        error_message = response.json().get('error', 'No additional error details provided.')
+
+        if response.status_code == 402:
+            message = f"Payment Required: Failed to {action}. {error_message}"
+        elif response.status_code == 408:
+            message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}"
+        elif response.status_code == 409:
+            message = f"Conflict: Failed to {action} due to a conflict. {error_message}"
+        elif response.status_code == 500:
+            message = f"Internal Server Error: Failed to {action}. {error_message}"
        else:
-            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
+            message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}"
+
+        # Raise an HTTPError with the custom message and attach the response
+        raise requests.exceptions.HTTPError(message, response=response)
+