diff --git a/README.md b/README.md index 3b3968f..2aaeeeb 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ _This repository is in its early development stages. We are still merging custom ## What is Firecrawl? -[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown. We crawl all accessible subpages and give you clean markdown for each. No sitemap required. +[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required. _Pst. hey, you, join our stargazers :)_ @@ -114,7 +114,7 @@ Response: ### Search (Beta) -Used to search the web, get the most relevant results, scrap each page and return the markdown. +Used to search the web, get the most relevant results, scrape each page and return the markdown. ```bash curl -X POST https://api.firecrawl.dev/v0/search \ @@ -296,7 +296,6 @@ npm install @mendable/firecrawl-js 1. Get an API key from [firecrawl.dev](https://firecrawl.dev) 2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class. - ### Scraping a URL To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as a dictionary. diff --git a/SELF_HOST.md b/SELF_HOST.md index 8d1d490..ff5ee04 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -1,6 +1,31 @@ # Self-hosting Firecrawl +*We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version.* Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally. -*This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not ready for full self-host yet - we're working on it* +## Getting Started +First, clone this repository and copy the example env file from api folder `.env.example` to `.env`. +```bash +git clone https://github.com/mendableai/firecrawl.git +cd firecrawl +cp ./apps/api/.env.example ./.env +``` + +For running the simplest version of FireCrawl, edit the `USE_DB_AUTHENTICATION` on `.env` to not use the database authentication. +```yml +USE_DB_AUTHENTICATION=false +``` + +Update the Redis URL in the .env file to align with the Docker configuration: +```yml +REDIS_URL=redis://redis:6379 +``` + +Once that's complete, you can simply run the following commands to get started: +```bash +docker compose up +``` + + +This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`. diff --git a/apps/api/.env.example b/apps/api/.env.example index 1ba5ffe..3bdfcf1 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -3,6 +3,7 @@ NUM_WORKERS_PER_QUEUE=8 PORT=3002 HOST=0.0.0.0 REDIS_URL=redis://localhost:6379 +PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000 ## To turn on DB authentication, you need to set up supabase. USE_DB_AUTHENTICATION=true @@ -22,7 +23,6 @@ SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blockin OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) BULL_AUTH_KEY= # LOGTAIL_KEY= # Use if you're configuring basic logging with logtail -PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 127fe51..b483bc4 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -18,8 +18,8 @@ "paths": { "/scrape": { "post": { - "summary": "Scrape a single URL", - "operationId": "scrapeSingleUrl", + "summary": "Scrape a single URL and optionally extract information using an LLM", + "operationId": "scrapeAndExtractFromUrl", "tags": ["Scraping"], "security": [ { @@ -45,8 +45,43 @@ "type": "boolean", "description": "Only return the main content of the page excluding headers, navs, footers, etc.", "default": false + }, + "includeHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. Will output a html key in the response.", + "default": false } } + }, + "extractorOptions": { + "type": "object", + "description": "Options for LLM-based extraction of structured information from the page content", + "properties": { + "mode": { + "type": "string", + "enum": ["llm-extraction"], + "description": "The extraction mode to use, currently supports 'llm-extraction'" + }, + "extractionPrompt": { + "type": "string", + "description": "A prompt describing what information to extract from the page" + }, + "extractionSchema": { + "type": "object", + "additionalProperties": true, + "description": "The schema for the data to be extracted", + "required": [ + "company_mission", + "supports_sso", + "is_open_source" + ] + } + } + }, + "timeout": { + "type": "integer", + "description": "Timeout in milliseconds for the request", + "default": 30000 } }, "required": ["url"] @@ -126,6 +161,16 @@ "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.", "default": false }, + "maxDepth": { + "type": "integer", + "description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on." + }, + "mode": { + "type": "string", + "enum": ["default", "fast"], + "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.", + "default": "default" + }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl", @@ -140,6 +185,11 @@ "type": "boolean", "description": "Only return the main content of the page excluding headers, navs, footers, etc.", "default": false + }, + "includeHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. Will output a html key in the response.", + "default": false } } } @@ -192,7 +242,7 @@ "query": { "type": "string", "format": "uri", - "description": "The URL to scrape" + "description": "The query to search for" }, "pageOptions": { "type": "object", @@ -206,6 +256,11 @@ "type": "boolean", "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.", "default": true + }, + "includeHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. Will output a html key in the response.", + "default": false } } }, @@ -299,9 +354,66 @@ "data": { "type": "array", "items": { - "$ref": "#/components/schemas/ScrapeResponse" + "$ref": "#/components/schemas/CrawlStatusResponseObj" }, "description": "Data returned from the job (null when it is in progress)" + }, + "partial_data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/CrawlStatusResponseObj" + }, + "description": "Partial documents returned as it is being crawls (streaming). When a page is ready it will append to the parial_data array - so no need to wait for all the website to be crawled." + } + } + } + } + } + }, + "402": { + "description": "Payment required" + }, + "429": { + "description": "Too many requests" + }, + "500": { + "description": "Server error" + } + } + } + }, + "/crawl/cancel/{jobId}": { + "delete": { + "tags": ["Crawl"], + "summary": "Cancel a crawl job", + "operationId": "cancelCrawlJob", + "security": [ + { + "bearerAuth": [] + } + ], + "parameters": [ + { + "name": "jobId", + "in": "path", + "description": "ID of the crawl job", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "status": { + "type": "string", + "description": "Returns cancelled." } } } @@ -344,6 +456,11 @@ "content": { "type": "string" }, + "html": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeHtml` is true" + }, "metadata": { "type": "object", "properties": { @@ -362,6 +479,51 @@ "format": "uri" } } + }, + "llm_extraction": { + "type": "object", + "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.", + "nullable": true + }, + "warning": { + "type": "string", + "nullable": true, + "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction." + } + } + } + } + }, + "CrawlStatusResponseObj": { + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "content": { + "type": "string" + }, + "html": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeHtml` is true" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" } } } diff --git a/apps/api/package.json b/apps/api/package.json index 9c68e92..32a0f54 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -34,6 +34,7 @@ "express": "^4.18.2", "jest": "^29.6.3", "jest-fetch-mock": "^3.0.3", + "mammoth": "^1.7.2", "nodemon": "^2.0.20", "supabase": "^1.77.9", "supertest": "^6.3.3", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 7873375..16b2f6c 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -97,7 +97,7 @@ dependencies: version: 0.0.25 langchain: specifier: ^0.1.25 - version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2) + version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(mammoth@1.7.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2) languagedetect: specifier: ^2.0.0 version: 2.0.0 @@ -214,6 +214,9 @@ devDependencies: jest-fetch-mock: specifier: ^3.0.3 version: 3.0.3 + mammoth: + specifier: ^1.7.2 + version: 1.7.2 nodemon: specifier: ^2.0.20 version: 2.0.22 @@ -1765,6 +1768,10 @@ packages: dev: false optional: true + /@xmldom/xmldom@0.8.10: + resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==} + engines: {node: '>=10.0.0'} + /abbrev@1.1.1: resolution: {integrity: sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==} dev: true @@ -1895,7 +1902,6 @@ packages: resolution: {integrity: sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==} dependencies: sprintf-js: 1.0.3 - dev: true /argparse@2.0.1: resolution: {integrity: sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==} @@ -2071,7 +2077,6 @@ packages: /base64-js@1.5.1: resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} - dev: false /basic-ftp@5.0.5: resolution: {integrity: sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==} @@ -2096,6 +2101,9 @@ packages: resolution: {integrity: sha512-nbE1WxOTTrUWIfsfZ4aHGYu5DOuNkbxGokjV6Z2kxfJK3uaAb8zNK1muzOeipoLHZjInT4Br88BHpzevc681xA==} dev: false + /bluebird@3.4.7: + resolution: {integrity: sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==} + /body-parser@1.20.2: resolution: {integrity: sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==} engines: {node: '>= 0.8', npm: 1.2.8000 || >= 1.4.16} @@ -2421,6 +2429,9 @@ packages: resolution: {integrity: sha512-LDx6oHrK+PhzLKJU9j5S7/Y3jM/mUHvD/DeI1WQmJn652iPC5Y4TBzC9l+5OMOXlyTTA+SmVUPm0HQUwpD5Jqw==} dev: true + /core-util-is@1.0.3: + resolution: {integrity: sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==} + /cors@2.8.5: resolution: {integrity: sha512-KIHbLJqu73RGr/hnbrO9uBeixNGuvSQjul/jdFvS/KFSIH1hWVd1ng7zOHx+YrEfInLG7q4n6GHQ9cDtxv/P6g==} engines: {node: '>= 0.10'} @@ -2659,6 +2670,9 @@ packages: md5: 2.3.0 dev: false + /dingbat-to-unicode@1.0.1: + resolution: {integrity: sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w==} + /dom-serializer@2.0.0: resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==} dependencies: @@ -2695,6 +2709,11 @@ packages: engines: {node: '>=12'} dev: false + /duck@0.1.12: + resolution: {integrity: sha512-wkctla1O6VfP89gQ+J/yDesM0S7B7XLXjKGzXxMDVFg7uEn706niAtyYovKbyq1oT9YwDcly721/iUWoc8MVRg==} + dependencies: + underscore: 1.13.6 + /eastasianwidth@0.2.0: resolution: {integrity: sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==} dev: false @@ -3332,6 +3351,9 @@ packages: resolution: {integrity: sha512-Ius2VYcGNk7T90CppJqcIkS5ooHUZyIQK+ClZfMfMNFEF9VSE73Fq+906u/CWu92x4gzZMWOwfFYckPObzdEbA==} dev: true + /immediate@3.0.6: + resolution: {integrity: sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==} + /import-fresh@3.3.0: resolution: {integrity: sha512-veYYhQa+D1QBKznvhUHxb8faxlrwUnxseDAbAp457E0wLNio2bOSKnjYDhMj+YiAq61xrMGhQk9iXVk5FzgQMw==} engines: {node: '>=6'} @@ -3462,6 +3484,9 @@ packages: engines: {node: '>=8'} dev: true + /isarray@1.0.0: + resolution: {integrity: sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==} + /isexe@2.0.0: resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==} @@ -4049,6 +4074,14 @@ packages: engines: {node: '>=0.10.0'} dev: false + /jszip@3.10.1: + resolution: {integrity: sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==} + dependencies: + lie: 3.3.0 + pako: 1.0.11 + readable-stream: 2.3.8 + setimmediate: 1.0.5 + /kareem@2.5.1: resolution: {integrity: sha512-7jFxRVm+jD+rkq3kY0iZDJfsO2/t4BBPeEb2qKn2lR/9KhuksYk5hxzfRYWMPV8P/x2d0kHD306YyWLzjjH+uA==} engines: {node: '>=12.0.0'} @@ -4064,7 +4097,7 @@ packages: engines: {node: '>=6'} dev: true - /langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2): + /langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(mammoth@1.7.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2): resolution: {integrity: sha512-sfEChvr4H2CklHdSByNBbytwBrFhgtA5kPOnwcBrxuXGg1iOaTzhVxQA0QcNcQucI3hZrsNbZjxGp+Can1ooZQ==} engines: {node: '>=18'} peerDependencies: @@ -4238,6 +4271,7 @@ packages: jsonpointer: 5.0.1 langchainhub: 0.0.8 langsmith: 0.1.13 + mammoth: 1.7.2 ml-distance: 4.0.1 openapi-types: 12.1.3 p-retry: 4.6.2 @@ -4344,6 +4378,11 @@ packages: type-check: 0.3.2 dev: false + /lie@3.3.0: + resolution: {integrity: sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==} + dependencies: + immediate: 3.0.6 + /lines-and-columns@1.2.4: resolution: {integrity: sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==} @@ -4380,6 +4419,13 @@ packages: - encoding dev: false + /lop@0.4.1: + resolution: {integrity: sha512-9xyho9why2A2tzm5aIcMWKvzqKsnxrf9B5I+8O30olh6lQU8PH978LqZoI4++37RBgS1Em5i54v1TFs/3wnmXQ==} + dependencies: + duck: 0.1.12 + option: 0.2.4 + underscore: 1.13.6 + /lru-cache@10.2.0: resolution: {integrity: sha512-2bIM8x+VAf6JT4bKAljS1qUWgMsqZRPGJS6FSahIMPVvctcNhyVp7AJu7quxOW9jwkryBReKZY5tY5JYv2n/7Q==} engines: {node: 14 || >=16.14} @@ -4423,6 +4469,22 @@ packages: tmpl: 1.0.5 dev: true + /mammoth@1.7.2: + resolution: {integrity: sha512-MqWU2hcLf1I5QMKyAbfJCvrLxnv5WztrAQyorfZ+WPq7Hk82vZFmvfR2/64ajIPpM4jlq0TXp1xZvp/FFaL1Ug==} + engines: {node: '>=12.0.0'} + hasBin: true + dependencies: + '@xmldom/xmldom': 0.8.10 + argparse: 1.0.10 + base64-js: 1.5.1 + bluebird: 3.4.7 + dingbat-to-unicode: 1.0.1 + jszip: 3.10.1 + lop: 0.4.1 + path-is-absolute: 1.0.1 + underscore: 1.13.6 + xmlbuilder: 10.1.1 + /md5@2.3.0: resolution: {integrity: sha512-T1GITYmFaKuO91vxyoQMFETst+O71VUPEU3ze5GNzDm0OWdP8v1ziTaAEPUr/3kLsY3Sftgz242A1SetQiDL7g==} dependencies: @@ -4867,6 +4929,9 @@ packages: resolution: {integrity: sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==} dev: false + /option@0.2.4: + resolution: {integrity: sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==} + /optionator@0.8.3: resolution: {integrity: sha512-+IW9pACdk3XWmmTXG8m3upGUJst5XRGzxMRjXzAuJ1XnIFNvfhjjIuYkDvysnPQ7qzqVzLt78BCruntqRhWQbA==} engines: {node: '>= 0.8.0'} @@ -4957,6 +5022,9 @@ packages: netmask: 2.0.2 dev: false + /pako@1.0.11: + resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==} + /parent-module@1.0.1: resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==} engines: {node: '>=6'} @@ -5002,7 +5070,6 @@ packages: /path-is-absolute@1.0.1: resolution: {integrity: sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==} engines: {node: '>=0.10.0'} - dev: true /path-key@3.1.1: resolution: {integrity: sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==} @@ -5095,6 +5162,9 @@ packages: react-is: 18.2.0 dev: true + /process-nextick-args@2.0.1: + resolution: {integrity: sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==} + /progress@2.0.3: resolution: {integrity: sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==} engines: {node: '>=0.4.0'} @@ -5251,6 +5321,17 @@ packages: engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} dev: true + /readable-stream@2.3.8: + resolution: {integrity: sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==} + dependencies: + core-util-is: 1.0.3 + inherits: 2.0.4 + isarray: 1.0.0 + process-nextick-args: 2.0.1 + safe-buffer: 5.1.2 + string_decoder: 1.1.1 + util-deprecate: 1.0.2 + /readdirp@3.6.0: resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==} engines: {node: '>=8.10.0'} @@ -5347,6 +5428,9 @@ packages: resolution: {integrity: sha512-cLgakCUf6PedEu15t8kbsjnwIFFR2D4RfL+W3iWFJ4iac7z4B0ZI8fxy4R3J956kAI68HclCFGL8MPoUVC3qVA==} dev: false + /safe-buffer@5.1.2: + resolution: {integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==} + /safe-buffer@5.2.1: resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==} @@ -5460,6 +5544,9 @@ packages: gopd: 1.0.1 has-property-descriptors: 1.0.2 + /setimmediate@1.0.5: + resolution: {integrity: sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==} + /setprototypeof@1.2.0: resolution: {integrity: sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==} @@ -5562,7 +5649,6 @@ packages: /sprintf-js@1.0.3: resolution: {integrity: sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==} - dev: true /sprintf-js@1.1.3: resolution: {integrity: sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==} @@ -5631,6 +5717,11 @@ packages: strip-ansi: 7.1.0 dev: false + /string_decoder@1.1.1: + resolution: {integrity: sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==} + dependencies: + safe-buffer: 5.1.2 + /strip-ansi@6.0.1: resolution: {integrity: sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==} engines: {node: '>=8'} @@ -5975,7 +6066,6 @@ packages: /underscore@1.13.6: resolution: {integrity: sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==} - dev: false /undici-types@5.26.5: resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==} @@ -6022,6 +6112,9 @@ packages: resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==} dev: false + /util-deprecate@1.0.2: + resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==} + /utils-merge@1.0.1: resolution: {integrity: sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA==} engines: {node: '>= 0.4.0'} @@ -6182,6 +6275,10 @@ packages: xmlbuilder: 11.0.1 dev: false + /xmlbuilder@10.1.1: + resolution: {integrity: sha512-OyzrcFLL/nb6fMGHbiRDuPup9ljBycsdCypwuyg5AAHvyWzGfChJpCXMG88AGTIMFhGZ9RccFN1e6lhg3hkwKg==} + engines: {node: '>=4.0'} + /xmlbuilder@11.0.1: resolution: {integrity: sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==} engines: {node: '>=4.0'} diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index fcf3284..e9082ca 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -103,6 +103,36 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.markdown).toContain("🔥 FireCrawl"); expect(response.body.data.html).toContain(" { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' }); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + }, 60000); // 60 seconds + + it('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { + const response = await request(TEST_URL) + .post('/v0/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' }); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + expect(response.body.data).toHaveProperty('content'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + }, 60000); // 60 seconds }); describe("POST /v0/crawl", () => { @@ -146,7 +176,274 @@ describe("E2E Tests for API Routes", () => { ); }); - // Additional tests for insufficient credits? + it("should return a successful response with a valid API key and valid includes option", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + limit: 10, + crawlerOptions: { + includes: ["blog/*"], + }, + }); + + let response; + let isFinished = false; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const completedResponse = response; + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + console.log({url}) + expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy(); + }); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + }, 60000); // 60 seconds + + it("should return a successful response with a valid API key and valid excludes option", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + limit: 10, + crawlerOptions: { + excludes: ["blog/*"], + }, + }); + + let isFinished = false; + let response; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const completedResponse = response; + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy(); + }); + }, 60000); // 60 seconds + + it("should return a successful response with a valid API key and limit to 3", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + crawlerOptions: { limit: 3 }, + }); + + let isFinished = false; + let response; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const completedResponse = response; + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data.length).toBe(3); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + }, 60000); // 60 seconds + + it("should return a successful response with max depth option for a valid crawl job", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + crawlerOptions: { maxDepth: 2 }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + // wait for 60 seconds + await new Promise((r) => setTimeout(r, 60000)); + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(1); + + // Check if all URLs have a maximum depth of 1 + urls.forEach((url: string) => { + const depth = new URL(url).pathname.split("/").filter(Boolean).length; + expect(depth).toBeLessThanOrEqual(1); + }); + }, 120000); + + // it("should return a successful response with a valid API key and valid limit option", async () => { + // const crawlResponse = await request(TEST_URL) + // .post("/v0/crawl") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ + // url: "https://mendable.ai", + // crawlerOptions: { limit: 10 }, + // }); + + // const response = await request(TEST_URL) + // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty("status"); + // expect(response.body.status).toBe("active"); + + // let isCompleted = false; + // while (!isCompleted) { + // const statusCheckResponse = await request(TEST_URL) + // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + // expect(statusCheckResponse.statusCode).toBe(200); + // isCompleted = statusCheckResponse.body.status === "completed"; + // if (!isCompleted) { + // await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + // } + // } + + // const completedResponse = await request(TEST_URL) + // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + // expect(completedResponse.statusCode).toBe(200); + // expect(completedResponse.body).toHaveProperty("status"); + // expect(completedResponse.body.status).toBe("completed"); + // expect(completedResponse.body).toHaveProperty("data"); + // expect(completedResponse.body.data.length).toBe(10); + // expect(completedResponse.body.data[0]).toHaveProperty("content"); + // expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + // expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + // expect(completedResponse.body.data[0].content).toContain("Mendable"); + // expect(completedResponse.body.data[0].content).not.toContain("main menu"); + // }, 60000); // 60 seconds + + it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://firecrawl.dev", + pageOptions: { includeHtml: true }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + + // 120 seconds + expect(completedResponse.body.data[0]).toHaveProperty("html"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl"); + expect(completedResponse.body.data[0].markdown).toContain("FireCrawl"); + expect(completedResponse.body.data[0].html).toContain(" { @@ -248,7 +545,7 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(404); }); - it("should return a successful response for a valid crawl job", async () => { + it("should return a successful crawl status response for a valid crawl job", async () => { const crawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -256,20 +553,23 @@ describe("E2E Tests for API Routes", () => { .send({ url: "https://firecrawl.dev" }); expect(crawlResponse.statusCode).toBe(200); - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); + let isCompleted = false; + let completedResponse; - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 30000)); + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(completedResponse.statusCode).toBe(200); + if (response.body.status === "completed") { + isCompleted = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body.status).toBe("completed"); expect(completedResponse.body).toHaveProperty("data"); @@ -278,6 +578,43 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl"); }, 60000); // 60 seconds + + it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => { + const crawlResponse = await request(TEST_URL) + .post('/v0/crawl') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }}); + expect(crawlResponse.statusCode).toBe(200); + + let isCompleted = false; + let completedResponse; + + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('status'); + + if (response.body.status === 'completed') { + isCompleted = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } + expect(completedResponse.body.status).toBe('completed'); + expect(completedResponse.body).toHaveProperty('data'); + expect(completedResponse.body.data.length).toEqual(1); + expect(completedResponse.body.data).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.') + }) + ]) + ); + }, 60000); // 60 seconds it("should return a successful response with max depth option for a valid crawl job", async () => { const crawlResponse = await request(TEST_URL) @@ -290,18 +627,21 @@ describe("E2E Tests for API Routes", () => { }); expect(crawlResponse.statusCode).toBe(200); - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); - // wait for 60 seconds - await new Promise((r) => setTimeout(r, 60000)); - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + let isCompleted = false; + let completedResponse; + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + + if (response.body.status === "completed") { + isCompleted = true; + completedResponse = response; + } + } expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body.status).toBe("completed"); @@ -371,10 +711,8 @@ describe("E2E Tests for API Routes", () => { .send({ url: "https://jestjs.io" }); expect(crawlResponse.statusCode).toBe(200); - - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 10000)); + await new Promise((r) => setTimeout(r, 20000)); const response = await request(TEST_URL) .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) @@ -383,7 +721,7 @@ describe("E2E Tests for API Routes", () => { expect(response.body).toHaveProperty("status"); expect(response.body.status).toBe("cancelled"); - await new Promise((r) => setTimeout(r, 20000)); + await new Promise((r) => setTimeout(r, 10000)); const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) @@ -400,8 +738,6 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 seconds - - describe("POST /v0/scrape with LLM Extraction", () => { it("should extract data using LLM extraction mode", async () => { const response = await request(TEST_URL) @@ -511,6 +847,107 @@ describe("E2E Tests for API Routes", () => { // }, 120000); // 120 secs // }); + describe("POST /v0/crawl with fast mode", () => { + it("should complete the crawl under 20 seconds", async () => { + const startTime = Date.now(); + + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://flutterbricks.com", + crawlerOptions: { + mode: "fast" + } + }); + + expect(crawlResponse.statusCode).toBe(200); + + const jobId = crawlResponse.body.jobId; + let statusResponse; + let isFinished = false; + + while (!isFinished) { + statusResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(statusResponse.statusCode).toBe(200); + isFinished = statusResponse.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const endTime = Date.now(); + const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds + + console.log(`Time elapsed: ${timeElapsed} seconds`); + + expect(statusResponse.body.status).toBe("completed"); + expect(statusResponse.body).toHaveProperty("data"); + expect(statusResponse.body.data[0]).toHaveProperty("content"); + expect(statusResponse.body.data[0]).toHaveProperty("markdown"); + const results = statusResponse.body.data; + // results.forEach((result, i) => { + // console.log(result.metadata.sourceURL); + // }); + expect(results.length).toBeGreaterThanOrEqual(10); + expect(results.length).toBeLessThanOrEqual(15); + + }, 20000); + + // it("should complete the crawl in more than 10 seconds", async () => { + // const startTime = Date.now(); + + // const crawlResponse = await request(TEST_URL) + // .post("/v0/crawl") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ + // url: "https://flutterbricks.com", + // }); + + // expect(crawlResponse.statusCode).toBe(200); + + // const jobId = crawlResponse.body.jobId; + // let statusResponse; + // let isFinished = false; + + // while (!isFinished) { + // statusResponse = await request(TEST_URL) + // .get(`/v0/crawl/status/${jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + // expect(statusResponse.statusCode).toBe(200); + // isFinished = statusResponse.body.status === "completed"; + + // if (!isFinished) { + // await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + // } + // } + + // const endTime = Date.now(); + // const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds + + // console.log(`Time elapsed: ${timeElapsed} seconds`); + + // expect(statusResponse.body.status).toBe("completed"); + // expect(statusResponse.body).toHaveProperty("data"); + // expect(statusResponse.body.data[0]).toHaveProperty("content"); + // expect(statusResponse.body.data[0]).toHaveProperty("markdown"); + // const results = statusResponse.body.data; + // // results.forEach((result, i) => { + // // console.log(result.metadata.sourceURL); + // // }); + // expect(results.length).toBeGreaterThanOrEqual(10); + // expect(results.length).toBeLessThanOrEqual(15); + + // }, 50000);// 15 seconds timeout to account for network delays + }); + describe("GET /is-production", () => { it("should return the production status", async () => { const response = await request(TEST_URL).get("/is-production"); @@ -540,43 +977,43 @@ describe("E2E Tests for API Routes", () => { }, 60000); }); - it("should return 429 when rate limit is exceeded for API key", async () => { - for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_SCRAPE); i++) { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://www.scrapethissite.com" }); + // it("should return 429 when rate limit is exceeded for API key", async () => { + // for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_SCRAPE); i++) { + // const response = await request(TEST_URL) + // .post("/v0/scrape") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: "https://www.scrapethissite.com" }); - expect(response.statusCode).toBe(200); - } + // expect(response.statusCode).toBe(200); + // } - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://www.scrapethissite.com" }); + // const response = await request(TEST_URL) + // .post("/v0/scrape") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: "https://www.scrapethissite.com" }); - expect(response.statusCode).toBe(429); - }, 60000); + // expect(response.statusCode).toBe(429); + // }, 60000); - it("should return 429 when rate limit is exceeded for API key", async () => { - for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_CRAWL); i++) { - const response = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://www.scrapethissite.com" }); + // it("should return 429 when rate limit is exceeded for API key", async () => { + // for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_CRAWL); i++) { + // const response = await request(TEST_URL) + // .post("/v0/crawl") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: "https://www.scrapethissite.com" }); - expect(response.statusCode).toBe(200); - } + // expect(response.statusCode).toBe(200); + // } - const response = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://www.scrapethissite.com" }); + // const response = await request(TEST_URL) + // .post("/v0/crawl") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: "https://www.scrapethissite.com" }); - expect(response.statusCode).toBe(429); - }, 60000); + // expect(response.statusCode).toBe(429); + // }, 60000); }); diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index ff751ef..4009d69 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -1,5 +1,5 @@ import { parseApi } from "../../src/lib/parseApi"; -import { getRateLimiter, crawlRateLimit, scrapeRateLimit } from "../../src/services/rate-limiter"; +import { getRateLimiter, } from "../../src/services/rate-limiter"; import { AuthResponse, RateLimiterMode } from "../../src/types"; import { supabase_service } from "../../src/services/supabase"; import { withAuth } from "../../src/lib/withAuth"; @@ -68,7 +68,7 @@ export async function supaAuthenticateUser( if (error) { console.error('Error fetching key and price_id:', error); } else { - console.log('Key and Price ID:', data); + // console.log('Key and Price ID:', data); } if (error || !data || data.length === 0) { @@ -79,20 +79,27 @@ export async function supaAuthenticateUser( }; } + subscriptionData = { team_id: data[0].team_id, plan: getPlanByPriceId(data[0].price_id) } switch (mode) { case RateLimiterMode.Crawl: - rateLimiter = crawlRateLimit(subscriptionData.plan); + rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token, subscriptionData.plan); break; case RateLimiterMode.Scrape: - rateLimiter = scrapeRateLimit(subscriptionData.plan); + rateLimiter = getRateLimiter(RateLimiterMode.Scrape, token, subscriptionData.plan); break; case RateLimiterMode.CrawlStatus: rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token); break; + case RateLimiterMode.Search: + rateLimiter = getRateLimiter(RateLimiterMode.Search, token); + break; + case RateLimiterMode.Preview: + rateLimiter = getRateLimiter(RateLimiterMode.Preview, token); + break; default: rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token); break; diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 449a50f..0b3f146 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -106,6 +106,9 @@ export async function scrapeController(req: Request, res: Response) { const extractorOptions = req.body.extractorOptions ?? { mode: "markdown" } + if (extractorOptions.mode === "llm-extraction") { + pageOptions.onlyMainContent = true; + } const origin = req.body.origin ?? "api"; const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts index 4a25b43..1434e35 100644 --- a/apps/api/src/lib/LLM-extraction/models.ts +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -1,25 +1,38 @@ import OpenAI from "openai"; import { Document } from "../../lib/entities"; +import { numTokensFromString } from "./helpers"; export type ScraperCompletionResult = { data: any | null; url: string; }; +const maxTokens = 32000; +const modifier = 4; const defaultPrompt = "You are a professional web scraper. Extract the contents of the webpage"; function prepareOpenAIDoc( document: Document -): OpenAI.Chat.Completions.ChatCompletionContentPart[] { - // Check if the markdown content exists in the document - if (!document.markdown) { +): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] { + let markdown = document.markdown; + +// Check if the markdown content exists in the document + if (!markdown) { throw new Error( "Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai" ); } - return [{ type: "text", text: document.markdown }]; + // count number of tokens + const numTokens = numTokensFromString(document.markdown, "gpt-4"); + + if (numTokens > maxTokens) { + // trim the document to the maximum number of tokens, tokens != characters + markdown = markdown.slice(0, (maxTokens * modifier)); + } + + return [[{ type: "text", text: markdown }], numTokens]; } export async function generateOpenAICompletions({ @@ -38,7 +51,7 @@ export async function generateOpenAICompletions({ temperature?: number; }): Promise { const openai = client as OpenAI; - const content = prepareOpenAIDoc(document); + const [content, numTokens] = prepareOpenAIDoc(document); const completion = await openai.chat.completions.create({ model, @@ -72,6 +85,7 @@ export async function generateOpenAICompletions({ return { ...document, llm_extraction: llmExtraction, + warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined, }; } diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index a387b54..ab0a0ef 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -44,6 +44,7 @@ export type WebScraperOptions = { limit?: number; generateImgAltText?: boolean; replaceAllPathsWithAbsolutePaths?: boolean; + mode?: "default" | "fast"; // have a mode of some sort }; pageOptions?: PageOptions; extractorOptions?: ExtractorOptions; @@ -71,6 +72,7 @@ export class Document { }; childrenLinks?: string[]; provider?: string; + warning?: string; constructor(data: Partial) { if (!data.content) { diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 0248df2..f53ef22 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -4,7 +4,7 @@ import { URL } from "url"; import { getLinksFromSitemap } from "./sitemap"; import async from "async"; import { Progress } from "../../lib/entities"; -import { scrapWithScrapingBee } from "./single_url"; +import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url"; import robotsParser from "robots-parser"; export class WebCrawler { @@ -15,7 +15,7 @@ export class WebCrawler { private maxCrawledLinks: number; private maxCrawledDepth: number; private visited: Set = new Set(); - private crawledUrls: Set = new Set(); + private crawledUrls: Map = new Map(); private limit: number; private robotsTxtUrl: string; private robots: any; @@ -51,7 +51,6 @@ export class WebCrawler { this.generateImgAltText = generateImgAltText ?? false; } - private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { return sitemapLinks .filter((link) => { @@ -77,9 +76,22 @@ export class WebCrawler { // Check if the link matches the include patterns, if any are specified if (this.includes.length > 0 && this.includes[0] !== "") { - return this.includes.some((includePattern) => + if (!this.includes.some((includePattern) => new RegExp(includePattern).test(path) - ); + )) { + return false; + } + } + + // Normalize the initial URL and the link to account for www and non-www versions + const normalizedInitialUrl = new URL(this.initialUrl); + const normalizedLink = new URL(link); + const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, ''); + const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); + + // Ensure the protocol and hostname match, and the path starts with the initial URL's path + if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { + return false; } const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true; @@ -99,19 +111,21 @@ export class WebCrawler { concurrencyLimit: number = 5, limit: number = 10000, maxDepth: number = 10 - ): Promise { + ): Promise<{ url: string, html: string }[]> { // Fetch and parse robots.txt try { const response = await axios.get(this.robotsTxtUrl); this.robots = robotsParser(this.robotsTxtUrl, response.data); } catch (error) { console.error(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); + } + const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); if (sitemapLinks.length > 0) { - const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); - return filteredLinks; + let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); + return filteredLinks.map(link => ({ url: link, html: "" })); } const urls = await this.crawlUrls( @@ -123,18 +137,20 @@ export class WebCrawler { urls.length === 0 && this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0 ) { - return [this.initialUrl]; + return [{ url: this.initialUrl, html: "" }]; } + // make sure to run include exclude here again - return this.filterLinks(urls, limit, this.maxCrawledDepth); + const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth); + return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" })); } private async crawlUrls( urls: string[], concurrencyLimit: number, - inProgress?: (progress: Progress) => void - ): Promise { + inProgress?: (progress: Progress) => void, + ): Promise<{ url: string, html: string }[]> { const queue = async.queue(async (task: string, callback) => { if (this.crawledUrls.size >= this.maxCrawledLinks) { if (callback && typeof callback === "function") { @@ -143,13 +159,26 @@ export class WebCrawler { return; } const newUrls = await this.crawl(task); - newUrls.forEach((url) => this.crawledUrls.add(url)); + // add the initial url if not already added + // if (this.visited.size === 1) { + // let normalizedInitial = this.initialUrl; + // if (!normalizedInitial.endsWith("/")) { + // normalizedInitial = normalizedInitial + "/"; + // } + // if (!newUrls.some(page => page.url === this.initialUrl)) { + // newUrls.push({ url: this.initialUrl, html: "" }); + // } + // } + + + newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html)); + if (inProgress && newUrls.length > 0) { inProgress({ current: this.crawledUrls.size, total: this.maxCrawledLinks, status: "SCRAPING", - currentDocumentUrl: newUrls[newUrls.length - 1], + currentDocumentUrl: newUrls[newUrls.length - 1].url, }); } else if (inProgress) { inProgress({ @@ -159,7 +188,7 @@ export class WebCrawler { currentDocumentUrl: task, }); } - await this.crawlUrls(newUrls, concurrencyLimit, inProgress); + await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress); if (callback && typeof callback === "function") { callback(); } @@ -175,34 +204,48 @@ export class WebCrawler { } ); await queue.drain(); - return Array.from(this.crawledUrls); + return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); } - async crawl(url: string): Promise { - if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) + async crawl(url: string): Promise<{url: string, html: string}[]> { + if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){ return []; + } this.visited.add(url); + + if (!url.startsWith("http")) { url = "https://" + url; + } if (url.endsWith("/")) { url = url.slice(0, -1); + } + if (this.isFile(url) || this.isSocialMediaOrEmail(url)) { return []; } try { - let content; - // If it is the first link, fetch with scrapingbee + let content : string = ""; + // If it is the first link, fetch with single url if (this.visited.size === 1) { - content = await scrapWithScrapingBee(url, "load"); + const page = await scrapSingleUrl(url, {includeHtml: true}); + content = page.html ?? "" } else { const response = await axios.get(url); - content = response.data; + content = response.data ?? ""; } const $ = load(content); - let links: string[] = []; + let links: {url: string, html: string}[] = []; + + // Add the initial URL to the list of links + if(this.visited.size === 1) + { + links.push({url, html: content}); + } + $("a").each((_, element) => { const href = $(element).attr("href"); @@ -215,7 +258,6 @@ export class WebCrawler { const path = url.pathname; if ( - // fullUrl.startsWith(this.initialUrl) && // this condition makes it stop crawling back the url this.isInternalLink(fullUrl) && this.matchesPattern(fullUrl) && this.noSections(fullUrl) && @@ -223,12 +265,16 @@ export class WebCrawler { !this.matchesExcludes(path) && this.robots.isAllowed(fullUrl, "FireCrawlAgent") ) { - links.push(fullUrl); + links.push({url: fullUrl, html: content}); } } }); - return links.filter((link) => !this.visited.has(link)); + if(this.visited.size === 1){ + return links; + } + // Create a new list to return to avoid modifying the visited list + return links.filter((link) => !this.visited.has(link.url)); } catch (error) { return []; } @@ -275,7 +321,7 @@ export class WebCrawler { ".mp4", ".mp3", ".pptx", - ".docx", + // ".docx", ".xlsx", ".xml", ]; @@ -294,18 +340,57 @@ export class WebCrawler { return socialMediaOrEmail.some((ext) => url.includes(ext)); } + // private async tryFetchSitemapLinks(url: string): Promise { + const normalizeUrl = (url: string) => { + url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); + if (url.endsWith("/")) { + url = url.slice(0, -1); + } + return url; + }; + const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`; + + let sitemapLinks: string[] = []; + try { const response = await axios.get(sitemapUrl); if (response.status === 200) { - return await getLinksFromSitemap(sitemapUrl); + sitemapLinks = await getLinksFromSitemap(sitemapUrl); } } catch (error) { // Error handling for failed sitemap fetch + // console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`); } - return []; + + if (sitemapLinks.length === 0) { + // If the first one doesn't work, try the base URL + const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; + try { + const response = await axios.get(baseUrlSitemap); + if (response.status === 200) { + sitemapLinks = await getLinksFromSitemap(baseUrlSitemap); + } + } catch (error) { + // Error handling for failed base URL sitemap fetch + // console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); + } + } + + // Normalize and check if the URL is present in any of the sitemaps + const normalizedUrl = normalizeUrl(url); + + const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link)); + + // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl + if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) { + // do not push the normalized url + sitemapLinks.push(url); + } + + return sitemapLinks; } } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 7ef0a10..0e295ae 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -17,6 +17,7 @@ import { } from "./utils/replacePaths"; import { generateCompletions } from "../../lib/LLM-extraction"; import { getWebScraperQueue } from "../../../src/services/queue-service"; +import { fetchAndProcessDocx } from "./utils/docxProcessor"; export class WebScraperDataProvider { private bullJobId: string; @@ -35,6 +36,7 @@ export class WebScraperDataProvider { private replaceAllPathsWithAbsolutePaths?: boolean = false; private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo"; + private crawlerMode: string = "default"; authorize(): void { throw new Error("Method not implemented."); @@ -46,7 +48,8 @@ export class WebScraperDataProvider { private async convertUrlsToDocuments( urls: string[], - inProgress?: (progress: Progress) => void + inProgress?: (progress: Progress) => void, + allHtmls?: string[] ): Promise { const totalUrls = urls.length; let processedUrls = 0; @@ -56,7 +59,12 @@ export class WebScraperDataProvider { const batchUrls = urls.slice(i, i + this.concurrentRequests); await Promise.all( batchUrls.map(async (url, index) => { - const result = await scrapSingleUrl(url, this.pageOptions); + const existingHTML = allHtmls ? allHtmls[i + index] : ""; + const result = await scrapSingleUrl( + url, + this.pageOptions, + existingHTML + ); processedUrls++; if (inProgress) { inProgress({ @@ -127,9 +135,30 @@ export class WebScraperDataProvider { } } + private async cleanIrrelevantPath(links: string[]) { + return links.filter((link) => { + const normalizedInitialUrl = new URL(this.urls[0]); + const normalizedLink = new URL(link); + + // Normalize the hostname to account for www and non-www versions + const initialHostname = normalizedInitialUrl.hostname.replace( + /^www\./, + "" + ); + const linkHostname = normalizedLink.hostname.replace(/^www\./, ""); + + // Ensure the protocol and hostname match, and the path starts with the initial URL's path + return ( + linkHostname === initialHostname && + normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname) + ); + }); + } + private async handleCrawlMode( inProgress?: (progress: Progress) => void ): Promise { + const crawler = new WebCrawler({ initialUrl: this.urls[0], includes: this.includes, @@ -139,19 +168,38 @@ export class WebScraperDataProvider { limit: this.limit, generateImgAltText: this.generateImgAltText, }); - let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth); + + let links = await crawler.start( + inProgress, + 5, + this.limit, + this.maxCrawledDepth + ); + + let allLinks = links.map((e) => e.url); + const allHtmls = links.map((e) => e.html); + if (this.returnOnlyUrls) { - return this.returnOnlyUrlsResponse(links, inProgress); + return this.returnOnlyUrlsResponse(allLinks, inProgress); } - let documents = await this.processLinks(links, inProgress); - return this.cacheAndFinalizeDocuments(documents, links); + let documents = []; + // check if fast mode is enabled and there is html inside the links + if (this.crawlerMode === "fast" && links.some((link) => link.html)) { + documents = await this.processLinks(allLinks, inProgress, allHtmls); + } else { + documents = await this.processLinks(allLinks, inProgress); + } + + return this.cacheAndFinalizeDocuments(documents, allLinks); } private async handleSingleUrlsMode( inProgress?: (progress: Progress) => void ): Promise { - let documents = await this.processLinks(this.urls, inProgress); + const links = this.urls; + + let documents = await this.processLinks(links, inProgress); return documents; } @@ -159,6 +207,8 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { let links = await getLinksFromSitemap(this.urls[0]); + links = await this.cleanIrrelevantPath(links); + if (this.returnOnlyUrls) { return this.returnOnlyUrlsResponse(links, inProgress); } @@ -187,14 +237,24 @@ export class WebScraperDataProvider { private async processLinks( links: string[], - inProgress?: (progress: Progress) => void + inProgress?: (progress: Progress) => void, + allHtmls?: string[] ): Promise { - let pdfLinks = links.filter((link) => link.endsWith(".pdf")); - let pdfDocuments = await this.fetchPdfDocuments(pdfLinks); - links = links.filter((link) => !link.endsWith(".pdf")); + const pdfLinks = links.filter(link => link.endsWith(".pdf")); + const docLinks = links.filter(link => link.endsWith(".doc") || link.endsWith(".docx")); - let documents = await this.convertUrlsToDocuments(links, inProgress); + const pdfDocuments = await this.fetchPdfDocuments(pdfLinks); + const docxDocuments = await this.fetchDocxDocuments(docLinks); + + links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link)); + + let documents = await this.convertUrlsToDocuments( + links, + inProgress, + allHtmls + ); documents = await this.getSitemapData(this.urls[0], documents); + documents = this.applyPathReplacements(documents); // documents = await this.applyImgAltText(documents); @@ -204,7 +264,7 @@ export class WebScraperDataProvider { ) { documents = await generateCompletions(documents, this.extractorOptions); } - return documents.concat(pdfDocuments); + return documents.concat(pdfDocuments).concat(docxDocuments); } private async fetchPdfDocuments(pdfLinks: string[]): Promise { @@ -219,6 +279,18 @@ export class WebScraperDataProvider { }) ); } + private async fetchDocxDocuments(docxLinks: string[]): Promise { + return Promise.all( + docxLinks.map(async (p) => { + const docXDocument = await fetchAndProcessDocx(p); + return { + content: docXDocument, + metadata: { sourceURL: p }, + provider: "web-scraper", + }; + }) + ); + } private applyPathReplacements(documents: Document[]): Document[] { return this.replaceAllPathsWithAbsolutePaths @@ -395,8 +467,9 @@ export class WebScraperDataProvider { this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false }; this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; - //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check + //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); + this.crawlerMode = options.crawlerOptions?.mode ?? "default"; // make sure all urls start with https:// this.urls = this.urls.map((url) => { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c43ea40..4c08168 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -6,6 +6,7 @@ import { Document, PageOptions } from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; import { excludeNonMainTags } from "./utils/excludeTags"; import { urlSpecificParams } from "./utils/custom/website_params"; +import { fetchAndProcessPdf } from "./utils/pdfProcessor"; dotenv.config(); @@ -66,9 +67,15 @@ export async function scrapWithScrapingBee( ); return ""; } - const decoder = new TextDecoder(); - const text = decoder.decode(response.data); - return text; + + const contentType = response.headers['content-type']; + if (contentType && contentType.includes('application/pdf')) { + return fetchAndProcessPdf(url); + } else { + const decoder = new TextDecoder(); + const text = decoder.decode(response.data); + return text; + } } catch (error) { console.error(`Error scraping with Scraping Bee: ${error}`); return ""; @@ -95,9 +102,14 @@ export async function scrapWithPlaywright(url: string): Promise { return ""; } - const data = await response.json(); - const html = data.content; - return html ?? ""; + const contentType = response.headers['content-type']; + if (contentType && contentType.includes('application/pdf')) { + return fetchAndProcessPdf(url); + } else { + const data = await response.json(); + const html = data.content; + return html ?? ""; + } } catch (error) { console.error(`Error scraping with Puppeteer: ${error}`); return ""; @@ -106,7 +118,8 @@ export async function scrapWithPlaywright(url: string): Promise { export async function scrapSingleUrl( urlToScrap: string, - pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false } + pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }, + existingHtml: string = "" ): Promise { urlToScrap = urlToScrap.trim(); @@ -164,7 +177,13 @@ export async function scrapSingleUrl( ); return ""; } - text = await response.text(); + + const contentType = response.headers['content-type']; + if (contentType && contentType.includes('application/pdf')) { + return fetchAndProcessPdf(url); + } else { + text = await response.text(); + } } catch (error) { console.error(`Error scraping URL: ${error}`); return ""; @@ -197,8 +216,15 @@ export async function scrapSingleUrl( : ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"]; for (const scraper of scrapersInOrder) { + // If exists text coming from crawler, use it + if (existingHtml && existingHtml.trim().length >= 100) { + let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions); + text = await parseMarkdown(cleanedHtml); + html = existingHtml; + break; + } [text, html] = await attemptScraping(urlToScrap, scraper); - if (text && text.length >= 100) break; + if (text && text.trim().length >= 100) break; console.log(`Falling back to ${scraper}`); } diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts new file mode 100644 index 0000000..e018ffa --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts @@ -0,0 +1,13 @@ +import * as docxProcessor from "../docxProcessor"; + +describe("DOCX Processing Module - Integration Test", () => { + it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => { + delete process.env.LLAMAPARSE_API_KEY; + const docxContent = await docxProcessor.fetchAndProcessDocx( + "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx" + ); + expect(docxContent.trim()).toContain( + "SERIES A PREFERRED STOCK PURCHASE AGREEMENT" + ); + }); +}); diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts index 763a857..a798d05 100644 --- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts +++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts @@ -136,5 +136,25 @@ export const urlSpecificParams = { accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", }, + }, + "help.salesforce.com":{ + defaultScraper: "playwright", + params: { + wait_browser: "networkidle2", + block_resources: false, + wait: 2000, + }, + headers: { + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", + "sec-fetch-site": "same-origin", + "sec-fetch-mode": "cors", + "sec-fetch-dest": "empty", + referer: "https://www.google.com/", + "accept-language": "en-US,en;q=0.9", + "accept-encoding": "gzip, deflate, br", + accept: + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", + }, } }; diff --git a/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts new file mode 100644 index 0000000..38759f8 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts @@ -0,0 +1,41 @@ +import axios from "axios"; +import fs from "fs"; +import { createWriteStream } from "node:fs"; +import path from "path"; +import os from "os"; +import mammoth from "mammoth"; + +export async function fetchAndProcessDocx(url: string): Promise { + const tempFilePath = await downloadDocx(url); + const content = await processDocxToText(tempFilePath); + fs.unlinkSync(tempFilePath); // Clean up the temporary file + return content; +} + +async function downloadDocx(url: string): Promise { + const response = await axios({ + url, + method: "GET", + responseType: "stream", + }); + + const tempFilePath = path.join(os.tmpdir(), `tempDocx-${Date.now()}.docx`); + const writer = createWriteStream(tempFilePath); + + response.data.pipe(writer); + + return new Promise((resolve, reject) => { + writer.on("finish", () => resolve(tempFilePath)); + writer.on("error", reject); + }); +} + +export async function processDocxToText(filePath: string): Promise { + const content = await extractTextFromDocx(filePath); + return content; +} + +async function extractTextFromDocx(filePath: string): Promise { + const result = await mammoth.extractRawText({ path: filePath }); + return result.value; +} diff --git a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts index 142bcef..bb9c519 100644 --- a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts +++ b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts @@ -34,8 +34,6 @@ export const excludeNonMainTags = [ "#nav", ".breadcrumbs", "#breadcrumbs", - ".form", - "form", "#search-form", ".search", "#search", @@ -51,10 +49,6 @@ export const excludeNonMainTags = [ "#tag", ".category", "#category", - ".comment", - "#comment", - ".reply", - "#reply", - ".author", - "#author", + ".cookie", + "#cookie" ]; diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index fb08d9c..7c57007 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -19,8 +19,8 @@ export async function fetchAndProcessPdf(url: string): Promise { async function downloadPdf(url: string): Promise { const response = await axios({ url, - method: 'GET', - responseType: 'stream', + method: "GET", + responseType: "stream", }); const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`); @@ -29,8 +29,8 @@ async function downloadPdf(url: string): Promise { response.data.pipe(writer); return new Promise((resolve, reject) => { - writer.on('finish', () => resolve(tempFilePath)); - writer.on('error', reject); + writer.on("finish", () => resolve(tempFilePath)); + writer.on("error", reject); }); } @@ -77,12 +77,12 @@ export async function processPdfToText(filePath: string): Promise { } else { // If the status code is not 200, increment the attempt counter and wait attempt++; - await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds + await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds } } catch (error) { - console.error("Error fetching result:", error); + console.error("Error fetching result:", error || ''); attempt++; - await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds before retrying + await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying // You may want to handle specific errors differently } } @@ -101,7 +101,7 @@ export async function processPdfToText(filePath: string): Promise { return content; } -async function processPdf(file: string){ +async function processPdf(file: string) { const fileContent = fs.readFileSync(file); const data = await pdf(fileContent); return data.text; diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 78ea030..ef7bb1f 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -26,7 +26,7 @@ getWebScraperQueue().process( success: success, result: { links: docs.map((doc) => { - return { content: doc, source: doc.metadata.sourceURL }; + return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" }; }), }, project_id: job.data.project_id, diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 8fa3c93..a0db08d 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -2,17 +2,21 @@ import { RateLimiterRedis } from "rate-limiter-flexible"; import * as redis from "redis"; import { RateLimiterMode } from "../../src/types"; -const MAX_CRAWLS_PER_MINUTE_STARTER = 2; -const MAX_CRAWLS_PER_MINUTE_STANDARD = 4; +const MAX_CRAWLS_PER_MINUTE_STARTER = 3; +const MAX_CRAWLS_PER_MINUTE_STANDARD = 5; const MAX_CRAWLS_PER_MINUTE_SCALE = 20; -const MAX_SCRAPES_PER_MINUTE_STARTER = 10; -const MAX_SCRAPES_PER_MINUTE_STANDARD = 15; -const MAX_SCRAPES_PER_MINUTE_SCALE = 30; +const MAX_SCRAPES_PER_MINUTE_STARTER = 20; +const MAX_SCRAPES_PER_MINUTE_STANDARD = 40; +const MAX_SCRAPES_PER_MINUTE_SCALE = 50; + +const MAX_SEARCHES_PER_MINUTE_STARTER = 20; +const MAX_SEARCHES_PER_MINUTE_STANDARD = 40; +const MAX_SEARCHES_PER_MINUTE_SCALE = 50; const MAX_REQUESTS_PER_MINUTE_PREVIEW = 5; const MAX_REQUESTS_PER_MINUTE_ACCOUNT = 20; -const MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS = 120; +const MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS = 150; export const redisClient = redis.createClient({ url: process.env.REDIS_URL, @@ -43,60 +47,12 @@ export const crawlStatusRateLimiter = new RateLimiterRedis({ export const testSuiteRateLimiter = new RateLimiterRedis({ storeClient: redisClient, keyPrefix: "test-suite", - points: 1000, + points: 10000, duration: 60, // Duration in seconds }); -export function crawlRateLimit (plan: string){ - if (plan === "standard"){ - return new RateLimiterRedis({ - storeClient: redisClient, - keyPrefix: "crawl-standard", - points: MAX_CRAWLS_PER_MINUTE_STANDARD, - duration: 60, // Duration in seconds - }); - } else if (plan === "scale"){ - return new RateLimiterRedis({ - storeClient: redisClient, - keyPrefix: "crawl-scale", - points: MAX_CRAWLS_PER_MINUTE_SCALE, - duration: 60, // Duration in seconds - }); - } - return new RateLimiterRedis({ - storeClient: redisClient, - keyPrefix: "crawl-starter", - points: MAX_CRAWLS_PER_MINUTE_STARTER, - duration: 60, // Duration in seconds - }); -} - -export function scrapeRateLimit (plan: string){ - if (plan === "standard"){ - return new RateLimiterRedis({ - storeClient: redisClient, - keyPrefix: "scrape-standard", - points: MAX_SCRAPES_PER_MINUTE_STANDARD, - duration: 60, // Duration in seconds - }); - } else if (plan === "scale"){ - return new RateLimiterRedis({ - storeClient: redisClient, - keyPrefix: "scrape-scale", - points: MAX_SCRAPES_PER_MINUTE_SCALE, - duration: 60, // Duration in seconds - }); - } - return new RateLimiterRedis({ - storeClient: redisClient, - keyPrefix: "scrape-starter", - points: MAX_SCRAPES_PER_MINUTE_STARTER, - duration: 60, // Duration in seconds - }); -} - -export function getRateLimiter(mode: RateLimiterMode, token: string){ +export function getRateLimiter(mode: RateLimiterMode, token: string, plan?: string){ // Special test suite case. TODO: Change this later. if (token.includes("57017")){ return testSuiteRateLimiter; @@ -106,6 +62,72 @@ export function getRateLimiter(mode: RateLimiterMode, token: string){ return previewRateLimiter; case RateLimiterMode.CrawlStatus: return crawlStatusRateLimiter; + case RateLimiterMode.Crawl: + if (plan === "standard"){ + return new RateLimiterRedis({ + storeClient: redisClient, + keyPrefix: "crawl-standard", + points: MAX_CRAWLS_PER_MINUTE_STANDARD, + duration: 60, // Duration in seconds + }); + } else if (plan === "scale"){ + return new RateLimiterRedis({ + storeClient: redisClient, + keyPrefix: "crawl-scale", + points: MAX_CRAWLS_PER_MINUTE_SCALE, + duration: 60, // Duration in seconds + }); + } + return new RateLimiterRedis({ + storeClient: redisClient, + keyPrefix: "crawl-starter", + points: MAX_CRAWLS_PER_MINUTE_STARTER, + duration: 60, // Duration in seconds + }); + case RateLimiterMode.Scrape: + if (plan === "standard"){ + return new RateLimiterRedis({ + storeClient: redisClient, + keyPrefix: "scrape-standard", + points: MAX_SCRAPES_PER_MINUTE_STANDARD, + duration: 60, // Duration in seconds + }); + } else if (plan === "scale"){ + return new RateLimiterRedis({ + storeClient: redisClient, + keyPrefix: "scrape-scale", + points: MAX_SCRAPES_PER_MINUTE_SCALE, + duration: 60, // Duration in seconds + }); + } + return new RateLimiterRedis({ + storeClient: redisClient, + keyPrefix: "scrape-starter", + points: MAX_SCRAPES_PER_MINUTE_STARTER, + duration: 60, // Duration in seconds + }); + case RateLimiterMode.Search: + if (plan === "standard"){ + return new RateLimiterRedis({ + storeClient: redisClient, + keyPrefix: "search-standard", + points: MAX_SEARCHES_PER_MINUTE_STANDARD, + duration: 60, // Duration in seconds + }); + } else if (plan === "scale"){ + return new RateLimiterRedis({ + storeClient: redisClient, + keyPrefix: "search-scale", + points: MAX_SEARCHES_PER_MINUTE_SCALE, + duration: 60, // Duration in seconds + }); + } + return new RateLimiterRedis({ + storeClient: redisClient, + keyPrefix: "search-starter", + points: MAX_SEARCHES_PER_MINUTE_STARTER, + duration: 60, // Duration in seconds + }); default: return serverRateLimiter; } diff --git a/apps/python-sdk/build/lib/firecrawl/firecrawl.py b/apps/python-sdk/build/lib/firecrawl/firecrawl.py index 701810c..98cb8ed 100644 --- a/apps/python-sdk/build/lib/firecrawl/firecrawl.py +++ b/apps/python-sdk/build/lib/firecrawl/firecrawl.py @@ -4,10 +4,11 @@ import requests import time class FirecrawlApp: - def __init__(self, api_key=None): + def __init__(self, api_key=None, api_url='https://api.firecrawl.dev'): self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') if self.api_key is None: raise ValueError('No API key provided') + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL') @@ -38,7 +39,7 @@ class FirecrawlApp: scrape_params[key] = value # Make the POST request with the prepared headers and JSON data response = requests.post( - 'https://api.firecrawl.dev/v0/scrape', + f'{self.api_url}/v0/scrape', headers=headers, json=scrape_params ) @@ -48,7 +49,7 @@ class FirecrawlApp: return response['data'] else: raise Exception(f'Failed to scrape URL. Error: {response["error"]}') - elif response.status_code in [402, 409, 500]: + elif response.status_code in [402, 408, 409, 500]: error_message = response.json().get('error', 'Unknown error occurred') raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') else: @@ -63,7 +64,7 @@ class FirecrawlApp: if params: json_data.update(params) response = requests.post( - 'https://api.firecrawl.dev/v0/search', + f'{self.api_url}/v0/search', headers=headers, json=json_data ) @@ -85,7 +86,7 @@ class FirecrawlApp: json_data = {'url': url} if params: json_data.update(params) - response = self._post_request('https://api.firecrawl.dev/v0/crawl', json_data, headers) + response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers) if response.status_code == 200: job_id = response.json().get('jobId') if wait_until_done: @@ -97,7 +98,7 @@ class FirecrawlApp: def check_crawl_status(self, job_id): headers = self._prepare_headers() - response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) + response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) if response.status_code == 200: return response.json() else: @@ -130,7 +131,7 @@ class FirecrawlApp: def _monitor_job_status(self, job_id, headers, timeout): import time while True: - status_response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) + status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) if status_response.status_code == 200: status_data = status_response.json() if status_data['status'] == 'completed': @@ -148,7 +149,7 @@ class FirecrawlApp: self._handle_error(status_response, 'check crawl status') def _handle_error(self, response, action): - if response.status_code in [402, 409, 500]: + if response.status_code in [402, 408, 409, 500]: error_message = response.json().get('error', 'Unknown error occurred') raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') else: diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.8.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.8.tar.gz deleted file mode 100644 index b18dde5..0000000 Binary files a/apps/python-sdk/dist/firecrawl-py-0.0.8.tar.gz and /dev/null differ diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz new file mode 100644 index 0000000..55e0eed Binary files /dev/null and b/apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz differ diff --git a/apps/python-sdk/dist/firecrawl_py-0.0.8-py3-none-any.whl b/apps/python-sdk/dist/firecrawl_py-0.0.8-py3-none-any.whl deleted file mode 100644 index f71cb8e..0000000 Binary files a/apps/python-sdk/dist/firecrawl_py-0.0.8-py3-none-any.whl and /dev/null differ diff --git a/apps/python-sdk/dist/firecrawl_py-0.0.9-py3-none-any.whl b/apps/python-sdk/dist/firecrawl_py-0.0.9-py3-none-any.whl new file mode 100644 index 0000000..83cb5b7 Binary files /dev/null and b/apps/python-sdk/dist/firecrawl_py-0.0.9-py3-none-any.whl differ diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 701810c..98cb8ed 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -4,10 +4,11 @@ import requests import time class FirecrawlApp: - def __init__(self, api_key=None): + def __init__(self, api_key=None, api_url='https://api.firecrawl.dev'): self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') if self.api_key is None: raise ValueError('No API key provided') + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL') @@ -38,7 +39,7 @@ class FirecrawlApp: scrape_params[key] = value # Make the POST request with the prepared headers and JSON data response = requests.post( - 'https://api.firecrawl.dev/v0/scrape', + f'{self.api_url}/v0/scrape', headers=headers, json=scrape_params ) @@ -48,7 +49,7 @@ class FirecrawlApp: return response['data'] else: raise Exception(f'Failed to scrape URL. Error: {response["error"]}') - elif response.status_code in [402, 409, 500]: + elif response.status_code in [402, 408, 409, 500]: error_message = response.json().get('error', 'Unknown error occurred') raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') else: @@ -63,7 +64,7 @@ class FirecrawlApp: if params: json_data.update(params) response = requests.post( - 'https://api.firecrawl.dev/v0/search', + f'{self.api_url}/v0/search', headers=headers, json=json_data ) @@ -85,7 +86,7 @@ class FirecrawlApp: json_data = {'url': url} if params: json_data.update(params) - response = self._post_request('https://api.firecrawl.dev/v0/crawl', json_data, headers) + response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers) if response.status_code == 200: job_id = response.json().get('jobId') if wait_until_done: @@ -97,7 +98,7 @@ class FirecrawlApp: def check_crawl_status(self, job_id): headers = self._prepare_headers() - response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) + response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) if response.status_code == 200: return response.json() else: @@ -130,7 +131,7 @@ class FirecrawlApp: def _monitor_job_status(self, job_id, headers, timeout): import time while True: - status_response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) + status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) if status_response.status_code == 200: status_data = status_response.json() if status_data['status'] == 'completed': @@ -148,7 +149,7 @@ class FirecrawlApp: self._handle_error(status_response, 'check crawl status') def _handle_error(self, response, action): - if response.status_code in [402, 409, 500]: + if response.status_code in [402, 408, 409, 500]: error_message = response.json().get('error', 'Unknown error occurred') raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') else: diff --git a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO index e54fda5..c1ee531 100644 --- a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO +++ b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: firecrawl-py -Version: 0.0.8 +Version: 0.0.9 Summary: Python SDK for Firecrawl API Home-page: https://github.com/mendableai/firecrawl Author: Mendable.ai diff --git a/apps/python-sdk/setup.py b/apps/python-sdk/setup.py index 78a4d84..7df520e 100644 --- a/apps/python-sdk/setup.py +++ b/apps/python-sdk/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages setup( name='firecrawl-py', - version='0.0.8', + version='0.0.9', url='https://github.com/mendableai/firecrawl', author='Mendable.ai', author_email='nick@mendable.ai', diff --git a/apps/test-suite/data/crawl.json b/apps/test-suite/data/crawl.json new file mode 100644 index 0000000..8bc28a6 --- /dev/null +++ b/apps/test-suite/data/crawl.json @@ -0,0 +1,178 @@ +[ + { + "website": "https://www.vellum.ai/llm-leaderboard", + "expected_min_num_of_pages": 1, + "expected_crawled_pages": ["https://www.vellum.ai/llm-leaderboard"] + }, + { + "website": "https://openai.com/news", + "expected_min_num_of_pages": 4, + "expected_crawled_pages": [ + "https://openai.com/news/company/", + "https://openai.com/news/research/", + "https://openai.com/news/safety-and-alignment/", + "https://openai.com/news/stories/" + ] +}, + { + "website": "https://www.framer.com/pricing", + "expected_min_num_of_pages": 1, + "expected_not_crawled_pages": [ + "https://www.framer.com/features/navigation/", + "https://www.framer.com/contact/", + "https://www.framer.com/add-ons/", + "https://www.framer.com/free-saas-ui-kit/", + "https://www.framer.com/help/", + "https://www.framer.com/features/effects/", + "https://www.framer.com/enterprise/", + "https://www.framer.com/templates/" + ] +}, + { + "website": "https://mendable.ai/pricing", + "expected_min_num_of_pages": 1, + "expected_not_crawled_pages": [ + "https://mendable.ai/", + "https://mendable.ai/blog", + "https://mendable.ai/signin", + "https://mendable.ai/signup", + "https://mendable.ai", + "https://mendable.ai/usecases/sales-enablement", + "https://mendable.ai/usecases/documentation", + "https://mendable.ai/usecases/cs-enablement", + "https://mendable.ai/usecases/productcopilot", + "https://mendable.ai/security" + ], + "notes": "This one should not go backwards, but it does!" +}, + + { + "website": "https://agentops.ai/blog", + "expected_min_num_of_pages": 6, + "expected_crawled_pages": [ + "https://www.agentops.ai/blog/effortless-hr-management-with-saas", + "https://www.agentops.ai/blog/streamlining-hr-with-saas", + "https://www.agentops.ai/blog/simplify-hr-with-modern-saas-solutions", + "https://www.agentops.ai/blog/efficient-hr-operations-with-saas", + "https://www.agentops.ai/blog/hr-made-simple-with-saas", + "https://agentops.ai/blog" + ], + "expected_not_crawled_pages": [ + "https://agentops.ai/about-us", + "https://agentops.ai/contact-us" + ] + }, + { + "website": "https://en.wikipedia.org/wiki/T._N._Seshan", + "expected_min_num_of_pages": 1, + "expected_not_crawled_pages": [ + "https://en.wikipedia.org/wiki/Wikipedia:Contents", + "https://en.wikipedia.org/wiki/Wikipedia:Contact_us", + "https://en.wikipedia.org/wiki/V._S._Ramadevi", + "https://en.wikipedia.org/wiki/Wikipedia:About", + "https://en.wikipedia.org/wiki/Help:Introduction", + "https://en.wikipedia.org/wiki/H._D._Deve_Gowda", + "https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg" + ] + }, + + { + "website": "https://ycombinator.com/companies", + "expected_min_num_of_pages": 20, + "expected_crawled_pages": [ + "https://www.ycombinator.com/companies/industry/elearning", + "https://www.ycombinator.com/companies/industry/computer-vision", + "https://www.ycombinator.com/companies/industry/health-tech", + "https://www.ycombinator.com/companies/industry/education", + "https://www.ycombinator.com/companies/industry/robotics", + "https://www.ycombinator.com/companies/industry/hardware", + "https://www.ycombinator.com/companies/industry/saas", + "https://www.ycombinator.com/companies/industry/hard-tech", + "https://www.ycombinator.com/companies/industry/developer-tools", + "https://www.ycombinator.com/companies/industry/entertainment", + "https://www.ycombinator.com/companies/industry/finance", + "https://www.ycombinator.com/companies/industry/generative-ai", + "https://www.ycombinator.com/companies/industry/machine-learning" + ] + }, + { + "website": "https://firecrawl.dev", + "expected_min_num_of_pages": 2, + "expected_crawled_pages": [ + "https://firecrawl.dev/", + "https://firecrawl.dev/pricing" + ] + }, + + + { + "website": "https://fly.io/docs/gpus/gpu-quickstart", + "expected_min_num_of_pages": 1, + "expected_not_crawled_pages": [ + "https://fly.io/docs/getting-started/", + "https://fly.io/docs/hands-on/", + "https://fly.io/docs/about/support/", + "https://fly.io/docs/blueprints/going-to-production-with-healthcare-apps/", + "https://fly.io/docs/machines/flyctl/fly-machine-update/", + "https://fly.io/docs/blueprints/review-apps-guide/", + "https://fly.io/docs/blueprints/supercronic/" + ], + "notes": "This one should not go backwards, but it does!" + }, + + { + "website": "https://www.instructables.com/circuits", + "expected_min_num_of_pages": 12, + "expected_crawled_pages": [ + "https://www.instructables.com/circuits/", + "https://www.instructables.com/circuits/apple/projects/", + "https://www.instructables.com/circuits/art/projects/", + "https://www.instructables.com/circuits/electronics/projects/", + "https://www.instructables.com/circuits/microsoft/projects/", + "https://www.instructables.com/circuits/microcontrollers/projects/", + "https://www.instructables.com/circuits/community/", + "https://www.instructables.com/circuits/leds/projects/", + "https://www.instructables.com/circuits/gadgets/projects/", + "https://www.instructables.com/circuits/arduino/projects/", + "https://www.instructables.com/circuits/lasers/projects/", + "https://www.instructables.com/circuits/clocks/projects/" + ] + }, + { + "website": "https://richmondconfidential.org", + "expected_min_num_of_pages": 20, + "expected_crawled_pages": [ + "https://richmondconfidential.org/2009/10/13/salesians-star-guard-has-a-big-impact/", + "https://richmondconfidential.org/2009/10/13/on-team-of-beginners-oilers-old-hand-stands-out/", + "https://richmondconfidential.org/2009/10/19/point-richmond-clockmaker-turns-clutter-into-crafts/", + "https://richmondconfidential.org/2009/10/13/profile-maurice-cathy/", + "https://richmondconfidential.org/2009/10/13/soul-food-rescue-mission-rebuilds-diets-and-lives/", + "https://richmondconfidential.org/2009/10/21/in-tough-economy-pain-trickles-to-the-bottom/", + "https://richmondconfidential.org/2009/10/19/richmond-homicide-map/", + "https://richmondconfidential.org/2009/10/13/rough-roads-for-richmonds-cab-drivers/", + "https://richmondconfidential.org/2009/10/13/before-napa-there-was-winehaven/", + "https://richmondconfidential.org/2009/10/13/family-calls-for-end-to-violence-at-memorial-for-slain-woman-friend/" + ] + }, + { + "website": "https://www.boardgamegeek.com", + "expected_min_num_of_pages": 15, + "expected_crawled_pages": [ + "https://www.boardgamegeek.com/browse/boardgameartist", + "https://www.boardgamegeek.com/browse/boardgamehonor", + "https://www.boardgamegeek.com/browse/boardgamepublisher", + "https://www.boardgamegeek.com/browse/boardgamepodcast", + "https://www.boardgamegeek.com/wiki/page/Index", + "https://www.boardgamegeek.com/browse/boardgamecategory", + "https://www.boardgamegeek.com/boardgame/random", + "https://www.boardgamegeek.com/browse/boardgamemechanic", + "https://www.boardgamegeek.com/forums", + "https://www.boardgamegeek.com/gonecardboard", + "https://www.boardgamegeek.com/browse/boardgameaccessory", + "https://www.boardgamegeek.com/browse/boardgamedesigner", + "https://www.boardgamegeek.com/", + "https://www.boardgamegeek.com/previews", + "https://www.boardgamegeek.com/browse/boardgame" + ] + } +] diff --git a/apps/test-suite/data/websites.json b/apps/test-suite/data/scrape.json similarity index 100% rename from apps/test-suite/data/websites.json rename to apps/test-suite/data/scrape.json diff --git a/apps/test-suite/package.json b/apps/test-suite/package.json index 9ccb660..11c0c09 100644 --- a/apps/test-suite/package.json +++ b/apps/test-suite/package.json @@ -4,7 +4,9 @@ "description": "", "scripts": { "test:suite": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false", - "test:load": "artillery run load-test.yml" + "test:load": "artillery run load-test.yml", + "test:scrape": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/scrape.test.ts", + "test:crawl": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/crawl.test.ts" }, "author": "", "license": "ISC", diff --git a/apps/test-suite/tests/crawl.test.ts b/apps/test-suite/tests/crawl.test.ts new file mode 100644 index 0000000..577725a --- /dev/null +++ b/apps/test-suite/tests/crawl.test.ts @@ -0,0 +1,150 @@ +import request from "supertest"; +import dotenv from "dotenv"; +import { WebsiteScrapeError } from "../utils/types"; +import { logErrors } from "../utils/log"; + +import websitesData from "../data/crawl.json"; +import "dotenv/config"; + +import fs from 'fs'; +dotenv.config(); + +interface WebsiteData { + website: string; + expected_min_num_of_pages: number; + expected_crawled_pages: string[]; +} + +const TEST_URL = "http://127.0.0.1:3002"; + +describe("Crawling Checkup (E2E)", () => { + beforeAll(() => { + if (!process.env.TEST_API_KEY) { + throw new Error("TEST_API_KEY is not set"); + } + }); + + describe("Crawling website tests with a dataset", () => { + it("Should crawl the website and verify the response", async () => { + let passedTests = 0; + const startTime = new Date().getTime(); + const date = new Date(); + const logsDir = `logs/${date.getMonth() + 1}-${date.getDate()}-${date.getFullYear()}`; + + let errorLogFileName = `${logsDir}/run.log_${new Date().toTimeString().split(' ')[0]}`; + const errorLog: WebsiteScrapeError[] = []; + + for (const websiteData of websitesData) { + try { + const crawlResponse = await request(TEST_URL || "") + .post("/v0/crawl") + .set("Content-Type", "application/json") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .send({ url: websiteData.website, pageOptions: { onlyMainContent: true }, crawlerOptions: { limit: 100, returnOnlyUrls: true }}); + + const jobId = crawlResponse.body.jobId; + let completedResponse: any; + let isFinished = false; + + while (!isFinished) { + completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + isFinished = completedResponse.body.status === "completed"; + + if (!isFinished) { + await new Promise(resolve => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + if(!completedResponse) { + // fail the test + console.log('No response'); + continue; + // continue; + } + + if (!completedResponse.body || completedResponse.body.status !== "completed") { + errorLog.push({ + website: websiteData.website, + prompt: 'CRAWL', + expected_output: 'SUCCESS', + actual_output: 'FAILURE', + error: `Crawl job did not complete successfully.` + }); + continue; + } + + // check how many webpages were crawled successfully + // compares with expected_num_of_pages + if (completedResponse.body.data.length < websiteData.expected_min_num_of_pages) { + errorLog.push({ + website: websiteData.website, + prompt: 'CRAWL', + expected_output: `SUCCESS: ${websiteData.expected_min_num_of_pages}`, + actual_output: `FAILURE: ${completedResponse.body.data.length}`, + error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}` + }); + console.log('Error: ', errorLog); + continue; + } + + // checks if crawled pages contain expected_crawled_pages + if (websiteData.expected_crawled_pages && websiteData.expected_crawled_pages.length > 0 && websiteData.expected_crawled_pages.some(page => !completedResponse.body.data?.some((d: { url: string }) => d.url === page))) { + errorLog.push({ + website: websiteData.website, + prompt: 'CRAWL', + expected_output: `SUCCESS: ${websiteData.expected_crawled_pages}`, + actual_output: `FAILURE: ${completedResponse.body.data}`, + error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data}` + }); + console.log('Error: ', errorLog); + continue; + } + + // checks if crawled pages not contain expected_not_crawled_pages + if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && completedResponse.body.data && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) { + errorLog.push({ + website: websiteData.website, + prompt: 'CRAWL', + expected_output: `SUCCESS: ${websiteData.expected_not_crawled_pages}`, + actual_output: `FAILURE: ${completedResponse.body.data}`, + error: `Expected crawled pages to not contain ${websiteData.expected_not_crawled_pages}, but got ${completedResponse.body.data}` + }); + console.log('Error: ', errorLog); + continue; + } + + passedTests++; + } catch (error) { + console.error(`Error processing ${websiteData.website}: ${error}`); + errorLog.push({ + website: websiteData.website, + prompt: 'CRAWL', + expected_output: 'SUCCESS', + actual_output: 'FAILURE', + error: `Error processing ${websiteData.website}: ${error}` + }); + continue; + } + } + + const score = (passedTests / websitesData.length) * 100; + const endTime = new Date().getTime(); + const timeTaken = (endTime - startTime) / 1000; + console.log(`Score: ${score}%`); + + await logErrors(errorLog, timeTaken, 0, score, websitesData.length); + + if (process.env.ENV === "local" && errorLog.length > 0) { + if (!fs.existsSync(logsDir)){ + fs.mkdirSync(logsDir, { recursive: true }); + } + fs.writeFileSync(errorLogFileName, JSON.stringify(errorLog, null, 2)); + } + + expect(score).toBeGreaterThanOrEqual(90); + }, 350000); // 150 seconds timeout + }); +}); diff --git a/apps/test-suite/index.test.ts b/apps/test-suite/tests/scrape.test.ts similarity index 93% rename from apps/test-suite/index.test.ts rename to apps/test-suite/tests/scrape.test.ts index 8d6c31f..ec7b720 100644 --- a/apps/test-suite/index.test.ts +++ b/apps/test-suite/tests/scrape.test.ts @@ -1,16 +1,14 @@ import request from "supertest"; import dotenv from "dotenv"; -import Anthropic from "@anthropic-ai/sdk"; -import { numTokensFromString } from "./utils/tokens"; +import { numTokensFromString } from "../utils/tokens"; import OpenAI from "openai"; -import { WebsiteScrapeError } from "./utils/types"; -import { logErrors } from "./utils/log"; +import { WebsiteScrapeError } from "../utils/types"; +import { logErrors } from "../utils/log"; -const websitesData = require("./data/websites.json"); +import websitesData from "../data/scrape.json"; import "dotenv/config"; -const fs = require('fs'); - +import fs from 'fs'; dotenv.config(); interface WebsiteData { @@ -21,8 +19,7 @@ interface WebsiteData { const TEST_URL = "http://127.0.0.1:3002"; - -describe("Scraping/Crawling Checkup (E2E)", () => { +describe("Scraping Checkup (E2E)", () => { beforeAll(() => { if (!process.env.TEST_API_KEY) { throw new Error("TEST_API_KEY is not set"); @@ -72,10 +69,6 @@ describe("Scraping/Crawling Checkup (E2E)", () => { return null; } - const anthropic = new Anthropic({ - apiKey: process.env.ANTHROPIC_API_KEY, - }); - const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, }); @@ -183,7 +176,7 @@ describe("Scraping/Crawling Checkup (E2E)", () => { } - expect(score).toBeGreaterThanOrEqual(75); + expect(score).toBeGreaterThanOrEqual(70); }, 350000); // 150 seconds timeout }); }); diff --git a/apps/test-suite/tsconfig.json b/apps/test-suite/tsconfig.json index e075f97..afa29e7 100644 --- a/apps/test-suite/tsconfig.json +++ b/apps/test-suite/tsconfig.json @@ -39,7 +39,7 @@ // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */ // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */ // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */ - // "resolveJsonModule": true, /* Enable importing .json files. */ + "resolveJsonModule": true, /* Enable importing .json files. */ // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */ // "noResolve": true, /* Disallow 'import's, 'require's or ''s from expanding the number of files TypeScript should add to a project. */ diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..049672d --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,78 @@ +name: firecrawl +version: '3.9' +services: + playwright-service: + build: apps/playwright-service + environment: + - PORT=3000 + networks: + - backend + + api: + build: apps/api + environment: + - REDIS_URL=${REDIS_URL:-redis://redis:6379} + - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000} + - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} + - PORT=${PORT:-3002} + - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} + - OPENAI_API_KEY=${OPENAI_API_KEY} + - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} + - SERPER_API_KEY=${SERPER_API_KEY} + - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY} + - LOGTAIL_KEY=${LOGTAIL_KEY} + - BULL_AUTH_KEY=${BULL_AUTH_KEY} + - TEST_API_KEY=${TEST_API_KEY} + - POSTHOG_API_KEY=${POSTHOG_API_KEY} + - POSTHOG_HOST=${POSTHOG_HOST} + - SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN} + - SUPABASE_URL=${SUPABASE_URL} + - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN} + - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY} + - HOST=${HOST:-0.0.0.0} + depends_on: + - redis + - playwright-service + ports: + - "3002:3002" + command: [ "pnpm", "run", "start:production" ] + networks: + - backend + + worker: + build: apps/api + environment: + - REDIS_URL=${REDIS_URL:-redis://redis:6379} + - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000} + - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} + - PORT=${PORT:-3002} + - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} + - OPENAI_API_KEY=${OPENAI_API_KEY} + - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} + - SERPER_API_KEY=${SERPER_API_KEY} + - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY} + - LOGTAIL_KEY=${LOGTAIL_KEY} + - BULL_AUTH_KEY=${BULL_AUTH_KEY} + - TEST_API_KEY=${TEST_API_KEY} + - POSTHOG_API_KEY=${POSTHOG_API_KEY} + - POSTHOG_HOST=${POSTHOG_HOST} + - SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN} + - SUPABASE_URL=${SUPABASE_URL} + - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN} + - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY} + - HOST=${HOST:-0.0.0.0} + depends_on: + - redis + - playwright-service + - api + networks: + - backend + redis: + image: redis:alpine + networks: + - backend + command: redis-server --bind 0.0.0.0 + +networks: + backend: + driver: bridge