[Test] Added integration tests suite

solves #15
2024-05-03 17:23:25 -03:00 · 2024-05-03 17:23:25 -03:00 · fbb4c63a1a
commit fbb4c63a1a
parent ef6db3b7c2
10 changed files with 3056 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,3 +8,5 @@ dump.rdb
 apps/js-sdk/node_modules/
 apps/api/.env.local
 apps/test-suite/node_modules/
--- a/apps/test-suite/.env.example
+++ b/apps/test-suite/.env.example
@ -0,0 +1,3 @@
 OPENAI_API_KEY=
 TEST_API_KEY=
 TEST_URL=http://localhost:3002
--- a/apps/test-suite/README.md
+++ b/apps/test-suite/README.md
@ -0,0 +1,43 @@
 # Test Suite for Firecrawl
 This document provides an overview of the test suite for the Firecrawl project. It includes instructions on how to run the tests and interpret the results.
 ## Overview
 The test suite is designed to ensure the reliability and performance of the Firecrawl system. It includes a series of automated tests that check various functionalities and performance metrics.
 ## Running the Tests
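 To run the tests, navigate to the `apps/test-suite` directory, copy `.env.example` to `.env` and fill in `OPENAI_API_KEY`, `TEST_API_KEY`, and `TEST_URL`, then execute the following commands: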
 ```bash
 npm install
 npx playwright install
 npm run test
 ```
 ## Test Results
 The tests are designed to cover various aspects of the system, including:
 - Crawling accuracy
 - Response time
 - Error handling
 ### Example Test Case
 - **Test Name**: Accuracy Test
 - **Description**: This test checks the accuracy of the scraping mechanism with 100 pages and a fuzzy threshold of 0.8.
 - **Expected Result**: Accuracy >= 0.9
 - **Received Result**: Accuracy between 0.2 and 0.3 (below the expected value, so this test currently fails)
 ## Troubleshooting
 If you encounter any failures or unexpected results, please check the following:
 - Ensure your network connection is stable.
 - Verify that all dependencies are correctly installed.
 - Review the error logs for any specific error messages.
 ## Contributing
 Contributions to the test suite are welcome. Please refer to the project's main [CONTRIBUTING.md](../../CONTRIBUTING.md) file for guidelines on how to contribute.
--- a/apps/test-suite/assets/test_screenshot.png
+++ b/apps/test-suite/assets/test_screenshot.png
--- a/apps/test-suite/index.test.ts
+++ b/apps/test-suite/index.test.ts
@ -0,0 +1,214 @@
 import request from "supertest";
 import dotenv from "dotenv";
 import { OpenAI } from "openai";
 import path from "path";
 import playwright from "playwright";
 const fs = require('fs').promises;
 dotenv.config();
 /**
  * End-to-end checkup of the scrape endpoint.
  *
  * Every request is a POST to `/v0/scrape` on `TEST_URL`, authenticated with
  * `TEST_API_KEY` as a bearer token. The dynamic-site test additionally needs
  * `OPENAI_API_KEY`, because page screenshots are described by a vision model.
  */
 describe("Scraping/Crawling Checkup (E2E)", () => {
  // Fail fast with a readable message instead of letting every request fail.
  beforeAll(() => {
    for (const name of ["TEST_API_KEY", "TEST_URL", "OPENAI_API_KEY"]) {
      if (!process.env[name]) {
        throw new Error(`${name} is not set`);
      }
    }
  });

  describe("Scraping static websites", () => {
    it("should scrape the content of 5 static websites", async () => {
      const urls = [
        'https://www.mendable.ai/blog/coachgtm-mongodb',
        'https://www.mendable.ai/blog/building-safe-rag',
        'https://www.mendable.ai/blog/gdpr-repository-pattern',
        'https://www.mendable.ai/blog/how-mendable-leverages-langsmith-to-debug-tools-and-actions',
        'https://www.mendable.ai/blog/european-data-storage'
      ];

      // expectedContent[i] is an excerpt that must appear in the scrape of urls[i].
      const expectedContent = [
        "CoachGTM, a Mendable AI Slack bot powered by MongoDB Atlas Vector Search, equips MongoDB’s teams with the knowledge and expertise they need to engage with customers meaningfully, reducing the risk of churn and fostering lasting relationships.",
        "You should consider security if you’re building LLM (Large Language Models) systems for enterprise. Over 67% percent of enterprise CEOs report a lack of trust in AI. An LLM system must protect sensitive data and refuse to take dangerous actions or it can’t be deployed in an enterprise.",
        "The biggest obstacle we encountered was breaking the strong dependency on a specific database throughout all our functions. This required weeks of diligent effort from our teams. Despite the hurdles, we remained committed to pushing forward, fixing bugs, and ultimately reaching our goal.",
        "It is no secret that 2024 will be the year we start seeing more LLMs baked into our workflows. This means that the way we interact with LLM models will be less just Question and Answer and more action-based.",
        "A major request from many of our enterprise customers has been the option for data storage in Europe. Although our existing Data Processing Agreement (DPA) with our current provider met the needs of many customers, the location of our data storage led to some potential clients choosing to wait until we had European storage."
      ];

      // Promise.all keeps the order of `urls`, so responses[i] belongs to urls[i].
      const responses = await Promise.all(urls.map(url =>
        request(process.env.TEST_URL || '')
          .post("/v0/scrape")
          .set("Content-Type", "application/json")
          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
          .send({ url })
      ));

      responses.forEach((response, index) => {
        expect(response.statusCode).toBe(200);
        expect(response.body.data).toHaveProperty("content");
        expect(response.body.data).toHaveProperty("markdown");
        expect(response.body.data).toHaveProperty("metadata");
        expect(response.body.data.content).toContain(expectedContent[index]);
      });
    }, 15000); // 15 seconds timeout
  });

  describe("Crawling hacker news dynamic websites", () => {
    // Upper bound on how many Hacker News links are scraped and compared.
    const numberOfPages = 100;

    it(`should return crawl hacker news, retrieve ${numberOfPages} pages, get using firecrawl vs LLM Vision and successfully compare both`, async () => {
      const hackerNewsScrape = await request(process.env.TEST_URL || '')
        .post("/v0/scrape")
        .set("Content-Type", "application/json")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .send({ url: "https://news.ycombinator.com/" });

      // Report a failed seed scrape directly rather than as a TypeError on
      // `body.data.markdown` below.
      expect(hackerNewsScrape.statusCode).toBe(200);

      const scrapeUrls = await getRandomLinksFromContent({
        content: hackerNewsScrape.body.data.markdown,
        excludes: ['ycombinator.com', '.pdf'],
        limit: numberOfPages
      });

      // With no links the success ratio below would be 0/0 (NaN).
      expect(scrapeUrls.length).toBeGreaterThan(0);

      const fireCrawlResponses = await Promise.all(scrapeUrls.map(url =>
        request(process.env.TEST_URL || '')
          .post("/v0/scrape")
          .set("Content-Type", "application/json")
          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
          .send({ url })
      ));

      const visionResponses = await Promise.all(
        scrapeUrls.map(url => getPageContentByScreenshot(url))
      );

      // A missing body/data/content counts as empty scraped text.
      const fireCrawlContents = fireCrawlResponses.map(response => response.body?.data?.content || '');

      let successCount = 0;
      scrapeUrls.forEach((url, i) => {
        const matched = fuzzyContains({
          largeText: fireCrawlContents[i],
          queryText: visionResponses[i],
          threshold: 0.8
        });
        if (matched) {
          successCount += 1;
          return;
        }
        console.log(`Failed to match content for ${url}`);
        console.log(`Firecrawl: ${fireCrawlContents[i]}`);
        console.log(`Vision: ${visionResponses[i]}`);
      });

      expect(successCount / scrapeUrls.length).toBeGreaterThanOrEqual(0.9);
    }, 120000); // 120 seconds
  });
 });
 /**
  * Asks the OpenAI "gpt-4-turbo" chat model for a short excerpt of the text
  * visible in a PNG screenshot.
  *
  * The file is read from disk, base64-encoded and sent as a data URL next to
  * the text prompt. The leading "Content: " of the answer template is removed.
  *
  * @param imagePath Path of the PNG file to describe.
  * @returns The excerpt, or '' when the API key is missing, the answer is
  *          empty, or any step throws (best effort: errors are not reported).
  */
 const getImageDescription = async (
  imagePath: string
 ): Promise<string> => {
  try {
    const prompt = `
      Get a part of the written content inside the website.
      We are going to compare if the content we retrieve contains the content of the screenshot.
      Use an easy verifiable content with close to 150 characters.
      Answer using this template: 'Content: [CONTENT]'
    `
    if (!process.env.OPENAI_API_KEY) {
      throw new Error("No OpenAI API key provided");
    }

    const screenshotBase64 = (await fs.readFile(imagePath)).toString('base64');

    // One user message carrying the instructions and the inlined image.
    const userMessage = {
      role: "user" as const,
      content: [
        { type: "text" as const, text: prompt },
        {
          type: "image_url" as const,
          image_url: { url: `data:image/png;base64,${screenshotBase64}` },
        },
      ],
    };

    const client = new OpenAI();
    const completion = await client.chat.completions.create({
      model: "gpt-4-turbo",
      messages: [userMessage],
    });

    const answer = completion.choices[0].message.content;
    return answer ? answer.replace("Content: ", "") : '';
  } catch {
    // Deliberately silent: callers treat '' as "no description available".
    return '';
  }
 }
 /**
  * Opens `url` in Chromium via Playwright, takes a screenshot and returns the
  * text excerpt that `getImageDescription` extracts from it.
  *
  * Each call writes its own uniquely named screenshot inside `assets/` and
  * removes it afterwards. A single shared file would be overwritten whenever
  * several calls run concurrently, so a description could belong to another
  * page.
  *
  * @param url Page to load and screenshot.
  * @returns The excerpt, or '' when the page could not be captured or
  *          described (best effort: the failure is logged, never thrown).
  */
 const getPageContentByScreenshot = async (url: string): Promise<string> => {
  const screenshotPath = path.join(
    __dirname,
    "assets",
    `test_screenshot_${process.pid}_${Date.now()}_${Math.random().toString(36).slice(2)}.png`
  );
  try {
    const browser = await playwright.chromium.launch();
    try {
      const page = await browser.newPage();
      await page.goto(url);
      await page.screenshot({ path: screenshotPath });
    } finally {
      // Close the browser even when navigation or the screenshot fails.
      await browser.close();
    }
    return await getImageDescription(screenshotPath);
  } catch (error) {
    console.warn(`Could not get page content by screenshot for ${url}:`, error);
    return '';
  } finally {
    // The file does not exist when the screenshot step failed; ignore that.
    await fs.unlink(screenshotPath).catch(() => {});
  }
 }
 /**
  * Picks random, distinct links out of a text.
  *
  * Only `https://` URLs that sit between an opening and a closing parenthesis
  * are considered. Links containing any of the `excludes` substrings are
  * dropped and duplicates are collapsed before sampling.
  *
  * @param options.content  Text to search for links.
  * @param options.excludes Substrings that disqualify a link.
  * @param options.limit    Maximum number of links to return.
  * @returns Up to `limit` unique links in random order.
  */
 const getRandomLinksFromContent = async (options: { content: string, excludes: string[], limit: number }): Promise<string[]> => {
  const { content, excludes, limit } = options;
  const found = content.match(/(?<=\()https:\/\/(.*?)(?=\))/g) || [];

  // A Set keeps the first occurrence of each allowed link.
  const allowed = new Set<string>();
  for (const link of found) {
    if (excludes.every(pattern => !link.includes(pattern))) {
      allowed.add(link);
    }
  }

  // Draw without replacement until the limit is reached or the pool is empty.
  const pool = Array.from(allowed);
  const picked: string[] = [];
  while (pool.length > 0 && picked.length < limit) {
    const position = Math.floor(Math.random() * pool.length);
    const [link] = pool.splice(position, 1);
    picked.push(link);
  }
  return picked;
 }
 /**
  * Reports whether a large enough share of the words in `queryText` occurs in
  * `largeText`.
  *
  * Both texts are lower-cased and every run of characters outside `a-z0-9` is
  * turned into a single space. Each query word then counts as matched when it
  * appears anywhere in the normalized large text (substring match).
  *
  * @param options.largeText Text to search in.
  * @param options.queryText Text whose words are looked up.
  * @param options.threshold Minimum matched share in [0, 1]; defaults to 0.8.
  *                          An explicit 0 is honoured.
  * @returns true when matched words / query words >= threshold. Returns false
  *          when the query has no words after normalization, because an empty
  *          query cannot confirm anything.
  */
 function fuzzyContains(options: {
  largeText: string,
  queryText: string,
  threshold?: number
 }): boolean {
  const normalize = (text: string) => text.toLowerCase().replace(/[^a-z0-9]+/g, ' ').trim();
  const normalizedLargeText = normalize(options.largeText);

  // Drop empty tokens: '' is a substring of every string and would always
  // count as a match, inflating the ratio.
  const queryWords = normalize(options.queryText).split(' ').filter(word => word.length > 0);
  if (queryWords.length === 0) {
    return false;
  }

  const matchCount = queryWords.filter(word => normalizedLargeText.includes(word)).length;
  const matchPercentage = matchCount / queryWords.length;

  // `??` rather than `||` so that a threshold of 0 is not replaced by 0.8.
  return matchPercentage >= (options.threshold ?? 0.8);
 }
--- a/apps/test-suite/jest.config.js
+++ b/apps/test-suite/jest.config.js
@ -0,0 +1,5 @@
 // Jest configuration: tests use the ts-jest preset in a Node test
 // environment, and ./jest.setup.js is listed as a setup file.
 const jestConfig = {
  testEnvironment: "node",
  preset: "ts-jest",
  setupFiles: ["./jest.setup.js"],
 };

 module.exports = jestConfig;
--- a/apps/test-suite/jest.setup.js
+++ b/apps/test-suite/jest.setup.js
--- a/apps/test-suite/package.json
+++ b/apps/test-suite/package.json
@ -0,0 +1,24 @@
 {
  "name": "test-suite",
  "version": "1.0.0",
  "description": "",
  "scripts": {
    "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "@anthropic-ai/sdk": "^0.20.8",
    "dotenv": "^16.4.5",
    "jest": "^29.7.0",
    "openai": "^4.40.2",
    "playwright": "^1.43.1",
    "supertest": "^7.0.0",
    "ts-jest": "^29.1.2"
  },
  "devDependencies": {
    "@types/jest": "^29.5.12",
    "@types/supertest": "^6.0.2",
    "typescript": "^5.4.5"
  }
 }
--- a/apps/test-suite/pnpm-lock.yaml
+++ b/apps/test-suite/pnpm-lock.yaml
--- a/apps/test-suite/tsconfig.json
+++ b/apps/test-suite/tsconfig.json
@ -0,0 +1,109 @@
 {
  "compilerOptions": {
    /* Visit https://aka.ms/tsconfig to read more about this file */
    /* Projects */
    // "incremental": true,                              /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
    // "composite": true,                                /* Enable constraints that allow a TypeScript project to be used with project references. */
    // "tsBuildInfoFile": "./.tsbuildinfo",              /* Specify the path to .tsbuildinfo incremental compilation file. */
    // "disableSourceOfProjectReferenceRedirect": true,  /* Disable preferring source files instead of declaration files when referencing composite projects. */
    // "disableSolutionSearching": true,                 /* Opt a project out of multi-project reference checking when editing. */
    // "disableReferencedProjectLoad": true,             /* Reduce the number of projects loaded automatically by TypeScript. */
    /* Language and Environment */
    "target": "es2016",                                  /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
    // "lib": [],                                        /* Specify a set of bundled library declaration files that describe the target runtime environment. */
    // "jsx": "preserve",                                /* Specify what JSX code is generated. */
    // "experimentalDecorators": true,                   /* Enable experimental support for legacy experimental decorators. */
    // "emitDecoratorMetadata": true,                    /* Emit design-type metadata for decorated declarations in source files. */
    // "jsxFactory": "",                                 /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
    // "jsxFragmentFactory": "",                         /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
    // "jsxImportSource": "",                            /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
    // "reactNamespace": "",                             /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
    // "noLib": true,                                    /* Disable including any library files, including the default lib.d.ts. */
    // "useDefineForClassFields": true,                  /* Emit ECMAScript-standard-compliant class fields. */
    // "moduleDetection": "auto",                        /* Control what method is used to detect module-format JS files. */
    /* Modules */
    "module": "commonjs",                                /* Specify what module code is generated. */
    // "rootDir": "./",                                  /* Specify the root folder within your source files. */
    // "moduleResolution": "node10",                     /* Specify how TypeScript looks up a file from a given module specifier. */
    // "baseUrl": "./",                                  /* Specify the base directory to resolve non-relative module names. */
    // "paths": {},                                      /* Specify a set of entries that re-map imports to additional lookup locations. */
    // "rootDirs": [],                                   /* Allow multiple folders to be treated as one when resolving modules. */
    // "typeRoots": [],                                  /* Specify multiple folders that act like './node_modules/@types'. */
    // "types": [],                                      /* Specify type package names to be included without being referenced in a source file. */
    // "allowUmdGlobalAccess": true,                     /* Allow accessing UMD globals from modules. */
    // "moduleSuffixes": [],                             /* List of file name suffixes to search when resolving a module. */
    // "allowImportingTsExtensions": true,               /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
    // "resolvePackageJsonExports": true,                /* Use the package.json 'exports' field when resolving package imports. */
    // "resolvePackageJsonImports": true,                /* Use the package.json 'imports' field when resolving imports. */
    // "customConditions": [],                           /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
    // "resolveJsonModule": true,                        /* Enable importing .json files. */
    // "allowArbitraryExtensions": true,                 /* Enable importing files with any extension, provided a declaration file is present. */
    // "noResolve": true,                                /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
    /* JavaScript Support */
    // "allowJs": true,                                  /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
    // "checkJs": true,                                  /* Enable error reporting in type-checked JavaScript files. */
    // "maxNodeModuleJsDepth": 1,                        /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
    /* Emit */
    // "declaration": true,                              /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
    // "declarationMap": true,                           /* Create sourcemaps for d.ts files. */
    // "emitDeclarationOnly": true,                      /* Only output d.ts files and not JavaScript files. */
    // "sourceMap": true,                                /* Create source map files for emitted JavaScript files. */
    // "inlineSourceMap": true,                          /* Include sourcemap files inside the emitted JavaScript. */
    // "outFile": "./",                                  /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
    // "outDir": "./",                                   /* Specify an output folder for all emitted files. */
    // "removeComments": true,                           /* Disable emitting comments. */
    // "noEmit": true,                                   /* Disable emitting files from a compilation. */
    // "importHelpers": true,                            /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
    // "importsNotUsedAsValues": "remove",               /* Specify emit/checking behavior for imports that are only used for types. */
    // "downlevelIteration": true,                       /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
    // "sourceRoot": "",                                 /* Specify the root path for debuggers to find the reference source code. */
    // "mapRoot": "",                                    /* Specify the location where debugger should locate map files instead of generated locations. */
    // "inlineSources": true,                            /* Include source code in the sourcemaps inside the emitted JavaScript. */
    // "emitBOM": true,                                  /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
    // "newLine": "crlf",                                /* Set the newline character for emitting files. */
    // "stripInternal": true,                            /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
    // "noEmitHelpers": true,                            /* Disable generating custom helper functions like '__extends' in compiled output. */
    // "noEmitOnError": true,                            /* Disable emitting files if any type checking errors are reported. */
    // "preserveConstEnums": true,                       /* Disable erasing 'const enum' declarations in generated code. */
    // "declarationDir": "./",                           /* Specify the output directory for generated declaration files. */
    // "preserveValueImports": true,                     /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */
    /* Interop Constraints */
    // "isolatedModules": true,                          /* Ensure that each file can be safely transpiled without relying on other imports. */
    // "verbatimModuleSyntax": true,                     /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
    // "allowSyntheticDefaultImports": true,             /* Allow 'import x from y' when a module doesn't have a default export. */
    "esModuleInterop": true,                             /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
    // "preserveSymlinks": true,                         /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
    "forceConsistentCasingInFileNames": true,            /* Ensure that casing is correct in imports. */
    /* Type Checking */
    "strict": true,                                      /* Enable all strict type-checking options. */
    // "noImplicitAny": true,                            /* Enable error reporting for expressions and declarations with an implied 'any' type. */
    // "strictNullChecks": true,                         /* When type checking, take into account 'null' and 'undefined'. */
    // "strictFunctionTypes": true,                      /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
    // "strictBindCallApply": true,                      /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
    // "strictPropertyInitialization": true,             /* Check for class properties that are declared but not set in the constructor. */
    // "noImplicitThis": true,                           /* Enable error reporting when 'this' is given the type 'any'. */
    // "useUnknownInCatchVariables": true,               /* Default catch clause variables as 'unknown' instead of 'any'. */
    // "alwaysStrict": true,                             /* Ensure 'use strict' is always emitted. */
    // "noUnusedLocals": true,                           /* Enable error reporting when local variables aren't read. */
    // "noUnusedParameters": true,                       /* Raise an error when a function parameter isn't read. */
    // "exactOptionalPropertyTypes": true,               /* Interpret optional property types as written, rather than adding 'undefined'. */
    // "noImplicitReturns": true,                        /* Enable error reporting for codepaths that do not explicitly return in a function. */
    // "noFallthroughCasesInSwitch": true,               /* Enable error reporting for fallthrough cases in switch statements. */
    // "noUncheckedIndexedAccess": true,                 /* Add 'undefined' to a type when accessed using an index. */
    // "noImplicitOverride": true,                       /* Ensure overriding members in derived classes are marked with an override modifier. */
    // "noPropertyAccessFromIndexSignature": true,       /* Enforces using indexed accessors for keys declared using an indexed type. */
    // "allowUnusedLabels": true,                        /* Disable error reporting for unused labels. */
    // "allowUnreachableCode": true,                     /* Disable error reporting for unreachable code. */
    /* Completeness */
    // "skipDefaultLibCheck": true,                      /* Skip type checking .d.ts files that are included with TypeScript. */
    "skipLibCheck": true                                 /* Skip type checking all .d.ts files. */
  }
 }