Merge pull request #145 from mendableai/nsc/timeout-scrape

Timeout on /scrape
2024-05-13 13:07:25 -07:00 · 2024-05-13 13:07:25 -07:00 · c9133f3d15
commit c9133f3d15
parent 3f090ffd7c 65d89afba9
2 changed files with 29 additions and 6 deletions
--- a/apps/api/src/tests/e2e_withAuth/index.test.ts
+++ b/apps/api/src/tests/e2e_withAuth/index.test.ts
@@ -176,6 +176,16 @@ describe("E2E Tests for API Routes", () => {
    //   expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
    // });

+    it("should return a timeout error when scraping takes longer than the specified timeout", async () => { // Posts a /v0/scrape request with a 1000 ms `timeout` in the body and expects an HTTP 408 response.
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://firecrawl.dev", timeout: 1000 }); // NOTE(review): assumes scraping this live URL takes longer than 1000 ms, otherwise no timeout occurs — confirm this is reliable in CI
+
+      expect(response.statusCode).toBe(408); // 408 = HTTP Request Timeout
+    }, 3000); // third argument: presumably the test runner's per-test time limit in ms (confirm); it is larger than the 1000 ms scrape timeout sent above
+
    it("should return a successful response with a valid API key", async () => {
      const response = await request(TEST_URL)
        .post("/v0/crawlWebsitePreview")
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -15,6 +15,7 @@ export async function scrapeHelper(
  crawlerOptions: any,
  pageOptions: PageOptions,
  extractorOptions: ExtractorOptions,
+  timeout: number
 ): Promise<{
  success: boolean;
  error?: string;
@@ -30,7 +31,6 @@ export async function scrapeHelper(
    return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
  }

-
  const a = new WebScraperDataProvider();
  await a.setOptions({
    mode: "single_urls",
@@ -42,7 +42,19 @@ export async function scrapeHelper(
    extractorOptions: extractorOptions,
  });

-  const docs = await a.getDocuments(false);
+  const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
+    setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
+  );
+
+  const docsPromise = a.getDocuments(false);
+
+  let docs;
+  try {
+    docs = await Promise.race([docsPromise, timeoutPromise]);
+  } catch (error) {
+    return error;
+  }
+
  // make sure doc.content is not empty
  const filteredDocs = docs.filter(
    (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
@@ -51,12 +63,11 @@ export async function scrapeHelper(
    return { success: true, error: "No page found", returnCode: 200 };
  }

-
  let creditsToBeBilled = filteredDocs.length;
  const creditsPerLLMExtract = 5;

  if (extractorOptions.mode === "llm-extraction") {
-    creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length)
+    creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
  }

  const billingResult = await billTeam(
@@ -96,6 +107,7 @@ export async function scrapeController(req: Request, res: Response) {
      mode: "markdown"
    }
    const origin = req.body.origin ?? "api";
+    const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds

    try {
      const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@@ -114,6 +126,7 @@ export async function scrapeController(req: Request, res: Response) {
      crawlerOptions,
      pageOptions,
      extractorOptions,
+      timeout
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;