From 9e9d66f7a3979e3fbc179959bb35d8dc6233f091 Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Sat, 20 Apr 2024 02:27:53 +0900 Subject: [PATCH 01/47] refactor: fix typo in WebScraper/index.ts breakign -> breaking --- apps/api/src/scraper/WebScraper/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 551c8d8..efbdc6a 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -352,7 +352,7 @@ export class WebScraperDataProvider { options.crawlerOptions?.generateImgAltText ?? false; this.pageOptions = options.pageOptions ?? {onlyMainContent: false}; - //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check + //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); // make sure all urls start with https:// From 07012ca19c0e33033d30494aecefed5bfde2fd02 Mon Sep 17 00:00:00 2001 From: chand1012 Date: Sat, 4 May 2024 15:12:17 -0400 Subject: [PATCH 02/47] Add docker compose file for self hosting --- .env.example | 16 ++++++++++++ docker-compose.yaml | 64 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 .env.example create mode 100644 docker-compose.yaml diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..e95ead0 --- /dev/null +++ b/.env.example @@ -0,0 +1,16 @@ +NUM_WORKERS_PER_QUEUE=8 +OPENAI_API_KEY= +SLACK_WEBHOOK_URL= +SERPER_API_KEY= +LLAMAPARSE_API_KEY= +LOGTAIL_KEY= +BULL_AUTH_KEY= +TEST_API_KEY= +POSTHOG_API_KEY= +POSTHOG_HOST= +SUPABASE_ANON_TOKEN= +SUPABASE_URL= +SUPABASE_SERVICE_TOKEN= +SCRAPING_BEE_API_KEY= +USE_DB_AUTHENTICATION=false +SELFHOST_API_KEY= diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..c65de3f --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,64 @@ +name: firecrawl +version: '3.9' +services: + redis: + image: redis:alpine + + playwright-service: + build: apps/playwright-service + environment: + - PORT=3000 + + api: + build: apps/api + environment: + - REDIS_URL=redis://redis:6379 + - PLAYWRIGHT_SERVICE_URL=http://playwright-service:3000 + - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} + - PORT=3002 + - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} + - OPENAI_API_KEY=${OPENAI_API_KEY} + - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} + - SERPER_API_KEY=${SERPER_API_KEY} + - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY} + - LOGTAIL_KEY=${LOGTAIL_KEY} + - BULL_AUTH_KEY=${BULL_AUTH_KEY} + - TEST_API_KEY=${TEST_API_KEY} + - POSTHOG_API_KEY=${POSTHOG_API_KEY} + - POSTHOG_HOST=${POSTHOG_HOST} + - SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN} + - SUPABASE_URL=${SUPABASE_URL} + - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN} + - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY} + - HOST=0.0.0.0 + depends_on: + - redis + - playwright-service + ports: + - "3002:3002" + command: [ "pnpm", "run", "start:production" ] + + worker: + build: apps/api + environment: + - REDIS_URL=redis://redis:6379 + - PLAYWRIGHT_SERVICE_URL=http://playwright-service:3000 + - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} + - PORT=3002 + - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} + - OPENAI_API_KEY=${OPENAI_API_KEY} + - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} + - SERPER_API_KEY=${SERPER_API_KEY} + - 
LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY} + - LOGTAIL_KEY=${LOGTAIL_KEY} + - BULL_AUTH_KEY=${BULL_AUTH_KEY} + - TEST_API_KEY=${TEST_API_KEY} + - POSTHOG_API_KEY=${POSTHOG_API_KEY} + - POSTHOG_HOST=${POSTHOG_HOST} + - SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN} + - SUPABASE_URL=${SUPABASE_URL} + - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN} + - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY} + depends_on: + - redis + - playwright-service From 5a352b2b4f008b2d70178b57ebe5f771b5cc30e4 Mon Sep 17 00:00:00 2001 From: chand1012 Date: Sat, 4 May 2024 15:13:51 -0400 Subject: [PATCH 03/47] Remove selfhost api key --- .env.example | 1 - 1 file changed, 1 deletion(-) diff --git a/.env.example b/.env.example index e95ead0..e7ddc9b 100644 --- a/.env.example +++ b/.env.example @@ -13,4 +13,3 @@ SUPABASE_URL= SUPABASE_SERVICE_TOKEN= SCRAPING_BEE_API_KEY= USE_DB_AUTHENTICATION=false -SELFHOST_API_KEY= From b32057ec890dc5b79ea7b05323820cf337afe68f Mon Sep 17 00:00:00 2001 From: chand1012 Date: Sun, 5 May 2024 12:03:42 -0400 Subject: [PATCH 04/47] Update SELF_HOST.md --- SELF_HOST.md | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/SELF_HOST.md b/SELF_HOST.md index 8d1d490..0deb543 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -1,6 +1,41 @@ # Self-hosting Firecrawl -Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally. +First, clone this repository and copy `.env.example` to `.env`. +```bash +git clone https://github.com/mendableai/firecrawl.git +cd firecrawl +cp .env.example .env +``` -*This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not ready for full self-host yet - we're working on it* +Then, edit the .env.example to have the correct values for your environment. +``` +## To turn on DB authentication, you need to set up supabase. +USE_DB_AUTHENTICATION=false +# ===== Optional ENVS ====== + +# Supabase Setup (used to support DB authentication, advanced logging, etc.) +SUPABASE_ANON_TOKEN= +SUPABASE_URL= +SUPABASE_SERVICE_TOKEN= + +# Other Optionals +TEST_API_KEY= # use if you've set up authentication and want to test with a real API key +SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking +OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) +BULL_AUTH_KEY= # +LOGTAIL_KEY= # Use if you're configuring basic logging with logtail +PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback +LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs +SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api +SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages +POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs +POSTHOG_HOST= # set if you'd like to send posthog events like job logs +``` + +Once that's complete, you can simply run the following commands to get started: +```bash +docker compose up +``` + +This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`. 
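Once the containers are up, the `/v0/scrape` route (the same endpoint exercised by the e2e tests later in this series) gives a quick way to smoke-test the self-hosted instance. A minimal sketch, assuming the default port of 3002 and `USE_DB_AUTHENTICATION=false`, so the bearer token below is only a placeholder:

```bash
# Smoke-test the self-hosted API (assumes the default port 3002 and DB auth disabled,
# so the bearer token is a stand-in rather than a real key).
curl -X POST http://localhost:3002/v0/scrape \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer test' \
  -d '{"url": "https://firecrawl.dev"}'
```

If the stack is healthy, the response should include the scraped page data, mirroring the `content` and `markdown` fields the e2e tests assert on.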
From 18480b2005dcc669762294f9cf40cf8bb57f17ce Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 10 May 2024 11:38:17 -0300 Subject: [PATCH 05/47] Removed .env.example, improved docs and docker compose envs --- .env.example | 15 --------------- SELF_HOST.md | 31 ++++++------------------------- apps/api/.env.example | 2 +- docker-compose.yaml | 16 +++++++++------- 4 files changed, 16 insertions(+), 48 deletions(-) delete mode 100644 .env.example diff --git a/.env.example b/.env.example deleted file mode 100644 index e7ddc9b..0000000 --- a/.env.example +++ /dev/null @@ -1,15 +0,0 @@ -NUM_WORKERS_PER_QUEUE=8 -OPENAI_API_KEY= -SLACK_WEBHOOK_URL= -SERPER_API_KEY= -LLAMAPARSE_API_KEY= -LOGTAIL_KEY= -BULL_AUTH_KEY= -TEST_API_KEY= -POSTHOG_API_KEY= -POSTHOG_HOST= -SUPABASE_ANON_TOKEN= -SUPABASE_URL= -SUPABASE_SERVICE_TOKEN= -SCRAPING_BEE_API_KEY= -USE_DB_AUTHENTICATION=false diff --git a/SELF_HOST.md b/SELF_HOST.md index 0deb543..a695f84 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -1,36 +1,17 @@ # Self-hosting Firecrawl -First, clone this repository and copy `.env.example` to `.env`. +## Getting Started + +First, clone this repository and copy the example env file from api folder `.env.example` to `.env`. ```bash git clone https://github.com/mendableai/firecrawl.git cd firecrawl -cp .env.example .env +cp ./apps/api/.env.example ./.env ``` -Then, edit the .env.example to have the correct values for your environment. -``` -## To turn on DB authentication, you need to set up supabase. +For running the simplest version of FireCrawl, edit the `USE_DB_AUTHENTICATION` on `.env` to not use the database authentication. +```yml USE_DB_AUTHENTICATION=false - -# ===== Optional ENVS ====== - -# Supabase Setup (used to support DB authentication, advanced logging, etc.) -SUPABASE_ANON_TOKEN= -SUPABASE_URL= -SUPABASE_SERVICE_TOKEN= - -# Other Optionals -TEST_API_KEY= # use if you've set up authentication and want to test with a real API key -SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking -OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) -BULL_AUTH_KEY= # -LOGTAIL_KEY= # Use if you're configuring basic logging with logtail -PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback -LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs -SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api -SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages -POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs -POSTHOG_HOST= # set if you'd like to send posthog events like job logs ``` Once that's complete, you can simply run the following commands to get started: diff --git a/apps/api/.env.example b/apps/api/.env.example index b025326..55271ec 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -3,6 +3,7 @@ NUM_WORKERS_PER_QUEUE=8 PORT=3002 HOST=0.0.0.0 REDIS_URL=redis://localhost:6379 +PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000 ## To turn on DB authentication, you need to set up supabase. USE_DB_AUTHENTICATION=true @@ -20,7 +21,6 @@ SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blockin OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) 
BULL_AUTH_KEY= # LOGTAIL_KEY= # Use if you're configuring basic logging with logtail -PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages diff --git a/docker-compose.yaml b/docker-compose.yaml index c65de3f..af6921c 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -12,10 +12,10 @@ services: api: build: apps/api environment: - - REDIS_URL=redis://redis:6379 - - PLAYWRIGHT_SERVICE_URL=http://playwright-service:3000 + - REDIS_URL=${REDIS_URL} + - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL} - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} - - PORT=3002 + - PORT=${PORT} - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} - OPENAI_API_KEY=${OPENAI_API_KEY} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} @@ -30,7 +30,7 @@ services: - SUPABASE_URL=${SUPABASE_URL} - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN} - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY} - - HOST=0.0.0.0 + - HOST=${HOST} depends_on: - redis - playwright-service @@ -41,10 +41,10 @@ services: worker: build: apps/api environment: - - REDIS_URL=redis://redis:6379 - - PLAYWRIGHT_SERVICE_URL=http://playwright-service:3000 + - REDIS_URL=${REDIS_URL} + - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL} - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} - - PORT=3002 + - PORT=${PORT} - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} - OPENAI_API_KEY=${OPENAI_API_KEY} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} @@ -59,6 +59,8 @@ services: - SUPABASE_URL=${SUPABASE_URL} - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN} - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY} + - HOST=${HOST} depends_on: - redis - playwright-service + - api From df16890f84b2d67420fa061d5fd901f04a5160bd Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 10 May 2024 11:59:33 -0300 Subject: [PATCH 06/47] Added default value for crawlOptions.limit --- apps/api/openapi.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 7861f32..127fe51 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -128,7 +128,8 @@ }, "limit": { "type": "integer", - "description": "Maximum number of pages to crawl" + "description": "Maximum number of pages to crawl", + "default": 10000 } } }, From 02450660092e402671b36f03d5b77d349bd4b403 Mon Sep 17 00:00:00 2001 From: chand1012 Date: Fri, 10 May 2024 17:15:32 -0400 Subject: [PATCH 07/47] chore: Update docker-compose.yaml with default values for REDIS_URL and PLAYWRIGHT_MICROSERVICE_URL --- docker-compose.yaml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index af6921c..9128042 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -12,8 +12,8 @@ services: api: build: apps/api environment: - - REDIS_URL=${REDIS_URL} - - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL} + - REDIS_URL=${REDIS_URL:-redis://redis:6379} + - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000} - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} - PORT=${PORT} - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} @@ -41,8 +41,8 @@ services: worker: build: apps/api environment: - - REDIS_URL=${REDIS_URL} - - 
PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL} + - REDIS_URL=${REDIS_URL:-redis://redis:6379} + - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000} - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} - PORT=${PORT} - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} @@ -64,3 +64,7 @@ services: - redis - playwright-service - api + +networks: + default: + name: firecrawl From 2021a822ffccde73e9cefbcb2a2467179db3cb0e Mon Sep 17 00:00:00 2001 From: chand1012 Date: Fri, 10 May 2024 17:20:33 -0400 Subject: [PATCH 08/47] chore: Add firecrawl network to docker-compose.yaml --- docker-compose.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docker-compose.yaml b/docker-compose.yaml index 9128042..2daabec 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -3,11 +3,15 @@ version: '3.9' services: redis: image: redis:alpine + networks: + - firecrawl playwright-service: build: apps/playwright-service environment: - PORT=3000 + networks: + - firecrawl api: build: apps/api @@ -37,6 +41,8 @@ services: ports: - "3002:3002" command: [ "pnpm", "run", "start:production" ] + networks: + - firecrawl worker: build: apps/api @@ -64,6 +70,8 @@ services: - redis - playwright-service - api + networks: + - firecrawl networks: default: From b498e9881c5bcaf7ddad6a4d8d1e540e24d316f5 Mon Sep 17 00:00:00 2001 From: chand1012 Date: Fri, 10 May 2024 17:23:22 -0400 Subject: [PATCH 09/47] chore: Update docker-compose.yaml network configuration --- docker-compose.yaml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index 2daabec..12a8219 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -4,14 +4,14 @@ services: redis: image: redis:alpine networks: - - firecrawl + - default playwright-service: build: apps/playwright-service environment: - PORT=3000 networks: - - firecrawl + - default api: build: apps/api @@ -42,7 +42,7 @@ services: - "3002:3002" command: [ "pnpm", "run", "start:production" ] networks: - - firecrawl + - default worker: build: apps/api @@ -71,8 +71,7 @@ services: - playwright-service - api networks: - - firecrawl + - default networks: default: - name: firecrawl From 5cbce060edee4cd860f21b3a6c2d7660defda604 Mon Sep 17 00:00:00 2001 From: chand1012 Date: Fri, 10 May 2024 17:26:00 -0400 Subject: [PATCH 10/47] chore: Update docker-compose.yaml with default values for PORT and HOST --- docker-compose.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index 12a8219..0cc9d43 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -19,7 +19,7 @@ services: - REDIS_URL=${REDIS_URL:-redis://redis:6379} - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000} - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} - - PORT=${PORT} + - PORT=${PORT:-3002} - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} - OPENAI_API_KEY=${OPENAI_API_KEY} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} @@ -34,7 +34,7 @@ services: - SUPABASE_URL=${SUPABASE_URL} - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN} - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY} - - HOST=${HOST} + - HOST=${HOST:-0.0.0.0} depends_on: - redis - playwright-service @@ -50,7 +50,7 @@ services: - REDIS_URL=${REDIS_URL:-redis://redis:6379} - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000} - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} - - PORT=${PORT} + - PORT=${PORT:-3002} - 
NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} - OPENAI_API_KEY=${OPENAI_API_KEY} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} @@ -65,7 +65,7 @@ services: - SUPABASE_URL=${SUPABASE_URL} - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN} - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY} - - HOST=${HOST} + - HOST=${HOST:-0.0.0.0} depends_on: - redis - playwright-service From 4737fe871127764f2d868fa434a4249e7a8939ef Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 13 May 2024 13:47:49 -0300 Subject: [PATCH 11/47] Added missing instruction --- SELF_HOST.md | 5 +++++ docker-compose.yaml | 19 ++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/SELF_HOST.md b/SELF_HOST.md index a695f84..8c3c0aa 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -14,6 +14,11 @@ For running the simplest version of FireCrawl, edit the `USE_DB_AUTHENTICATION` USE_DB_AUTHENTICATION=false ``` +Update the Redis URL in the .env file to align with the Docker configuration: +```yml +REDIS_URL=redis://redis:6379 +``` + Once that's complete, you can simply run the following commands to get started: ```bash docker compose up diff --git a/docker-compose.yaml b/docker-compose.yaml index 0cc9d43..049672d 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,17 +1,12 @@ name: firecrawl version: '3.9' services: - redis: - image: redis:alpine - networks: - - default - playwright-service: build: apps/playwright-service environment: - PORT=3000 networks: - - default + - backend api: build: apps/api @@ -42,7 +37,7 @@ services: - "3002:3002" command: [ "pnpm", "run", "start:production" ] networks: - - default + - backend worker: build: apps/api @@ -71,7 +66,13 @@ services: - playwright-service - api networks: - - default + - backend + redis: + image: redis:alpine + networks: + - backend + command: redis-server --bind 0.0.0.0 networks: - default: + backend: + driver: bridge From 2ce045912f31202a7701c513c1ebffe8f21469f3 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 13 May 2024 10:56:08 -0700 Subject: [PATCH 12/47] Nick: disable vision right now --- apps/api/src/scraper/WebScraper/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index e3256db..7ef0a10 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -196,7 +196,7 @@ export class WebScraperDataProvider { let documents = await this.convertUrlsToDocuments(links, inProgress); documents = await this.getSitemapData(this.urls[0], documents); documents = this.applyPathReplacements(documents); - documents = await this.applyImgAltText(documents); + // documents = await this.applyImgAltText(documents); if ( this.extractorOptions.mode === "llm-extraction" && From 4cc46d4af8813e0e2411c8de56e0365e17717c0b Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Mon, 13 May 2024 15:23:31 -0400 Subject: [PATCH 13/47] Update models.ts --- apps/api/src/lib/LLM-extraction/models.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts index ff805bb..4a25b43 100644 --- a/apps/api/src/lib/LLM-extraction/models.ts +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -24,7 +24,7 @@ function prepareOpenAIDoc( export async function generateOpenAICompletions({ client, - model = "gpt-4-turbo", + model = "gpt-4o", document, schema, //TODO - add zod dynamic type checking 
prompt = defaultPrompt, From 65d89afba9081b526fb1ee03a4540f6284fe4be4 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 13 May 2024 13:01:43 -0700 Subject: [PATCH 14/47] Nick: --- .../src/__tests__/e2e_withAuth/index.test.ts | 10 ++++++++ apps/api/src/controllers/scrape.ts | 25 ++++++++++++++----- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 5e3777b..0e2caeb 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -176,6 +176,16 @@ describe("E2E Tests for API Routes", () => { // expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); // }); + it("should return a timeout error when scraping takes longer than the specified timeout", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev", timeout: 1000 }); + + expect(response.statusCode).toBe(408); + }, 3000); + it("should return a successful response with a valid API key", async () => { const response = await request(TEST_URL) .post("/v0/crawlWebsitePreview") diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 021a9d0..449a50f 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -15,6 +15,7 @@ export async function scrapeHelper( crawlerOptions: any, pageOptions: PageOptions, extractorOptions: ExtractorOptions, + timeout: number ): Promise<{ success: boolean; error?: string; @@ -30,7 +31,6 @@ export async function scrapeHelper( return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 }; } - const a = new WebScraperDataProvider(); await a.setOptions({ mode: "single_urls", @@ -42,7 +42,19 @@ export async function scrapeHelper( extractorOptions: extractorOptions, }); - const docs = await a.getDocuments(false); + const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) => + setTimeout(() => reject({ success: false, error: "Request timed out. 
Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout) + ); + + const docsPromise = a.getDocuments(false); + + let docs; + try { + docs = await Promise.race([docsPromise, timeoutPromise]); + } catch (error) { + return error; + } + // make sure doc.content is not empty const filteredDocs = docs.filter( (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 @@ -51,12 +63,11 @@ export async function scrapeHelper( return { success: true, error: "No page found", returnCode: 200 }; } - - let creditsToBeBilled = filteredDocs.length; + let creditsToBeBilled = filteredDocs.length; const creditsPerLLMExtract = 5; - if (extractorOptions.mode === "llm-extraction"){ - creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length) + if (extractorOptions.mode === "llm-extraction") { + creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length); } const billingResult = await billTeam( @@ -96,6 +107,7 @@ export async function scrapeController(req: Request, res: Response) { mode: "markdown" } const origin = req.body.origin ?? "api"; + const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = @@ -114,6 +126,7 @@ export async function scrapeController(req: Request, res: Response) { crawlerOptions, pageOptions, extractorOptions, + timeout ); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; From f3ec21d9c486a67e564e78daf140416f263a00ee Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 13 May 2024 13:57:22 -0700 Subject: [PATCH 15/47] Update runWebScraper.ts --- apps/api/src/main/runWebScraper.ts | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 3c9ea88..632d110 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -17,8 +17,10 @@ export async function startWebScraperPipeline({ crawlerOptions: job.data.crawlerOptions, pageOptions: job.data.pageOptions, inProgress: (progress) => { - partialDocs.push(progress.currentDocument); - job.progress({...progress, partialDocs: partialDocs}); + if (progress.currentDocument) { + partialDocs.push(progress.currentDocument); + job.progress({ ...progress, partialDocs: partialDocs }); + } }, onSuccess: (result) => { job.moveToCompleted(result); @@ -27,7 +29,7 @@ export async function startWebScraperPipeline({ job.moveToFailed(error); }, team_id: job.data.team_id, - bull_job_id: job.id.toString() + bull_job_id: job.id.toString(), })) as { success: boolean; message: string; docs: Document[] }; } export async function runWebScraper({ @@ -63,26 +65,25 @@ export async function runWebScraper({ urls: [url], crawlerOptions: crawlerOptions, pageOptions: pageOptions, - bullJobId: bull_job_id + bullJobId: bull_job_id, }); } else { await provider.setOptions({ mode: mode, urls: url.split(","), crawlerOptions: crawlerOptions, - pageOptions: pageOptions + pageOptions: pageOptions, }); } const docs = (await provider.getDocuments(false, (progress: Progress) => { inProgress(progress); - })) as Document[]; if (docs.length === 0) { return { success: true, message: "No pages found", - docs: [] + docs: [], }; } @@ -95,18 +96,14 @@ export async function runWebScraper({ }) : docs.filter((doc) => doc.content.trim().length > 0); - - const billingResult = await billTeam( - team_id, - filteredDocs.length - 
); + const billingResult = await billTeam(team_id, filteredDocs.length); if (!billingResult.success) { // throw new Error("Failed to bill team, no subscription was found"); return { success: false, message: "Failed to bill team, no subscription was found", - docs: [] + docs: [], }; } From aa0c8188c9d4d11c128474d3cf7f322ee72d326b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 13 May 2024 18:34:00 -0700 Subject: [PATCH 16/47] Nick: 408 handling --- apps/js-sdk/firecrawl/src/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 0319c74..7654f1b 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -109,7 +109,7 @@ export default class FirecrawlApp { const response: AxiosResponse = await axios.post( "https://api.firecrawl.dev/v0/scrape", jsonData, - { headers } + { headers }, ); if (response.status === 200) { const responseData = response.data; @@ -324,7 +324,7 @@ export default class FirecrawlApp { * @param {string} action - The action being performed when the error occurred. */ handleError(response: AxiosResponse, action: string): void { - if ([402, 409, 500].includes(response.status)) { + if ([402, 408, 409, 500].includes(response.status)) { const errorMessage: string = response.data.error || "Unknown error occurred"; throw new Error( From 512449e1aa667b18d8ca98b6718af420c15a84c5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 13 May 2024 19:54:12 -0700 Subject: [PATCH 17/47] Nick: v21 --- apps/js-sdk/firecrawl/build/index.js | 2 +- apps/js-sdk/firecrawl/package.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/index.js index 6e0f367..b850d5c 100644 --- a/apps/js-sdk/firecrawl/build/index.js +++ b/apps/js-sdk/firecrawl/build/index.js @@ -240,7 +240,7 @@ export default class FirecrawlApp { * @param {string} action - The action being performed when the error occurred. */ handleError(response, action) { - if ([402, 409, 500].includes(response.status)) { + if ([402, 408, 409, 500].includes(response.status)) { const errorMessage = response.data.error || "Unknown error occurred"; throw new Error(`Failed to ${action}. Status code: ${response.status}. 
Error: ${errorMessage}`); } diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 9e1948a..3bacdf4 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.20", + "version": "0.0.21", "description": "JavaScript SDK for Firecrawl API", "main": "build/index.js", "types": "types/index.d.ts", From a96fc5b96d4e2144ed933d8a445900ec653c208a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 13 May 2024 20:45:11 -0700 Subject: [PATCH 18/47] Nick: 4x speed --- apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/crawler.ts | 53 ++++++++-------- apps/api/src/scraper/WebScraper/index.ts | 60 ++++++++++++++++--- apps/api/src/scraper/WebScraper/single_url.ts | 10 +++- apps/api/src/services/queue-worker.ts | 2 +- 5 files changed, 90 insertions(+), 36 deletions(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index a387b54..0c34126 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -44,6 +44,7 @@ export type WebScraperOptions = { limit?: number; generateImgAltText?: boolean; replaceAllPathsWithAbsolutePaths?: boolean; + fastMode?: boolean; // have a mode of some sort }; pageOptions?: PageOptions; extractorOptions?: ExtractorOptions; diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 0248df2..25f2e9d 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -4,7 +4,7 @@ import { URL } from "url"; import { getLinksFromSitemap } from "./sitemap"; import async from "async"; import { Progress } from "../../lib/entities"; -import { scrapWithScrapingBee } from "./single_url"; +import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url"; import robotsParser from "robots-parser"; export class WebCrawler { @@ -15,11 +15,12 @@ export class WebCrawler { private maxCrawledLinks: number; private maxCrawledDepth: number; private visited: Set = new Set(); - private crawledUrls: Set = new Set(); + private crawledUrls: { url: string, html: string }[] = []; private limit: number; private robotsTxtUrl: string; private robots: any; private generateImgAltText: boolean; + private fastMode: boolean = false; constructor({ initialUrl, @@ -49,9 +50,9 @@ export class WebCrawler { this.maxCrawledLinks = maxCrawledLinks ?? limit; this.maxCrawledDepth = maxCrawledDepth ?? 10; this.generateImgAltText = generateImgAltText ?? 
false; + this.fastMode = false; } - private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { return sitemapLinks .filter((link) => { @@ -99,7 +100,7 @@ export class WebCrawler { concurrencyLimit: number = 5, limit: number = 10000, maxDepth: number = 10 - ): Promise { + ): Promise<{ url: string, html: string }[]> { // Fetch and parse robots.txt try { const response = await axios.get(this.robotsTxtUrl); @@ -111,7 +112,7 @@ export class WebCrawler { const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); if (sitemapLinks.length > 0) { const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); - return filteredLinks; + return filteredLinks.map(link => ({ url: link, html: "" })); } const urls = await this.crawlUrls( @@ -123,43 +124,44 @@ export class WebCrawler { urls.length === 0 && this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0 ) { - return [this.initialUrl]; + return [{ url: this.initialUrl, html: "" }]; } // make sure to run include exclude here again - return this.filterLinks(urls, limit, this.maxCrawledDepth); + const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth); + return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" })); } private async crawlUrls( urls: string[], concurrencyLimit: number, inProgress?: (progress: Progress) => void - ): Promise { + ): Promise<{ url: string, html: string }[]> { const queue = async.queue(async (task: string, callback) => { - if (this.crawledUrls.size >= this.maxCrawledLinks) { + if (this.crawledUrls.length >= this.maxCrawledLinks) { if (callback && typeof callback === "function") { callback(); } return; } const newUrls = await this.crawl(task); - newUrls.forEach((url) => this.crawledUrls.add(url)); + newUrls.forEach((page) => this.crawledUrls.push(page)); if (inProgress && newUrls.length > 0) { inProgress({ - current: this.crawledUrls.size, + current: this.crawledUrls.length, total: this.maxCrawledLinks, status: "SCRAPING", - currentDocumentUrl: newUrls[newUrls.length - 1], + currentDocumentUrl: newUrls[newUrls.length - 1].url, }); } else if (inProgress) { inProgress({ - current: this.crawledUrls.size, + current: this.crawledUrls.length, total: this.maxCrawledLinks, status: "SCRAPING", currentDocumentUrl: task, }); } - await this.crawlUrls(newUrls, concurrencyLimit, inProgress); + await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress); if (callback && typeof callback === "function") { callback(); } @@ -175,10 +177,10 @@ export class WebCrawler { } ); await queue.drain(); - return Array.from(this.crawledUrls); + return this.crawledUrls; } - async crawl(url: string): Promise { + async crawl(url: string): Promise<{url: string, html: string}[]> { if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) return []; this.visited.add(url); @@ -193,16 +195,17 @@ export class WebCrawler { } try { - let content; - // If it is the first link, fetch with scrapingbee + let content : string = ""; + // If it is the first link, fetch with single url if (this.visited.size === 1) { - content = await scrapWithScrapingBee(url, "load"); + const page = await scrapSingleUrl(url, {includeHtml: true}); + content = page.html ?? "" } else { const response = await axios.get(url); - content = response.data; + content = response.data ?? 
""; } const $ = load(content); - let links: string[] = []; + let links: {url: string, html: string}[] = []; $("a").each((_, element) => { const href = $(element).attr("href"); @@ -215,7 +218,6 @@ export class WebCrawler { const path = url.pathname; if ( - // fullUrl.startsWith(this.initialUrl) && // this condition makes it stop crawling back the url this.isInternalLink(fullUrl) && this.matchesPattern(fullUrl) && this.noSections(fullUrl) && @@ -223,12 +225,14 @@ export class WebCrawler { !this.matchesExcludes(path) && this.robots.isAllowed(fullUrl, "FireCrawlAgent") ) { - links.push(fullUrl); + links.push({url: fullUrl, html: content}); } } }); - return links.filter((link) => !this.visited.has(link)); + // Create a new list to return to avoid modifying the visited list + const filteredLinks = links.filter((link) => !this.visited.has(link.url)); + return filteredLinks; } catch (error) { return []; } @@ -309,3 +313,4 @@ export class WebCrawler { return []; } } + diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 7ef0a10..9221666 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -17,7 +17,20 @@ import { } from "./utils/replacePaths"; import { generateCompletions } from "../../lib/LLM-extraction"; import { getWebScraperQueue } from "../../../src/services/queue-service"; - +import { parseMarkdown } from "../../lib/html-to-markdown"; +import cheerio from "cheerio"; +import { excludeNonMainTags } from "./utils/excludeTags"; +const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { + const soup = cheerio.load(html); + soup("script, style, iframe, noscript, meta, head").remove(); + if (pageOptions.onlyMainContent) { + // remove any other tags that are not in the main content + excludeNonMainTags.forEach((tag) => { + soup(tag).remove(); + }); + } + return soup.html(); +}; export class WebScraperDataProvider { private bullJobId: string; private urls: string[] = [""]; @@ -35,6 +48,7 @@ export class WebScraperDataProvider { private replaceAllPathsWithAbsolutePaths?: boolean = false; private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo"; + private fastMode: boolean = false; authorize(): void { throw new Error("Method not implemented."); @@ -46,7 +60,8 @@ export class WebScraperDataProvider { private async convertUrlsToDocuments( urls: string[], - inProgress?: (progress: Progress) => void + inProgress?: (progress: Progress) => void, + allHtmls?: string[] ): Promise { const totalUrls = urls.length; let processedUrls = 0; @@ -56,7 +71,8 @@ export class WebScraperDataProvider { const batchUrls = urls.slice(i, i + this.concurrentRequests); await Promise.all( batchUrls.map(async (url, index) => { - const result = await scrapSingleUrl(url, this.pageOptions); + const existingText = allHtmls ? 
allHtmls[i + index] : ""; + const result = await scrapSingleUrl(url, this.pageOptions, existingText); processedUrls++; if (inProgress) { inProgress({ @@ -139,13 +155,33 @@ export class WebScraperDataProvider { limit: this.limit, generateImgAltText: this.generateImgAltText, }); + let start = Date.now(); let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth); + console.log(links.length) + let end = Date.now(); + console.log("Crawl end in seconds ", (end - start) / 1000); + const allLinks = links.map((e) => e.url); + const allHtmls = links.map((e)=> e.html); + console.log("All links", allLinks.length); + console.log("All htmls", allHtmls.length); + if (this.returnOnlyUrls) { - return this.returnOnlyUrlsResponse(links, inProgress); + return this.returnOnlyUrlsResponse(allLinks , inProgress); + } + + + let fastDocs = [] + let documents = []; + // check if fast mode is enabled and there is html inside the links + if (this.fastMode && links.some((link) => link.html)) { + console.log("Fast mode enabled"); + documents = await this.processLinks(allLinks, inProgress, allHtmls); + + }else{ + documents = await this.convertUrlsToDocuments(allLinks, inProgress, allHtmls); } - let documents = await this.processLinks(links, inProgress); - return this.cacheAndFinalizeDocuments(documents, links); + return this.cacheAndFinalizeDocuments(documents, allLinks); } private async handleSingleUrlsMode( @@ -187,14 +223,17 @@ export class WebScraperDataProvider { private async processLinks( links: string[], - inProgress?: (progress: Progress) => void + inProgress?: (progress: Progress) => void, + allHtmls?: string[] ): Promise { let pdfLinks = links.filter((link) => link.endsWith(".pdf")); let pdfDocuments = await this.fetchPdfDocuments(pdfLinks); links = links.filter((link) => !link.endsWith(".pdf")); - - let documents = await this.convertUrlsToDocuments(links, inProgress); + + let documents = await this.convertUrlsToDocuments(links, inProgress, allHtmls); documents = await this.getSitemapData(this.urls[0], documents); + + documents = this.applyPathReplacements(documents); // documents = await this.applyImgAltText(documents); @@ -238,6 +277,8 @@ export class WebScraperDataProvider { ): Promise { await this.setCachedDocuments(documents, links); documents = this.removeChildLinks(documents); + documents = this.filterDocsExcludeInclude(documents); + documents = this.filterDepth(documents); return documents.splice(0, this.limit); } @@ -397,6 +438,7 @@ export class WebScraperDataProvider { this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); + this.fastMode = options.crawlerOptions?.fastMode ?? 
false; // make sure all urls start with https:// this.urls = this.urls.map((url) => { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c43ea40..c41beb5 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -106,7 +106,8 @@ export async function scrapWithPlaywright(url: string): Promise { export async function scrapSingleUrl( urlToScrap: string, - pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false } + pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }, + existingText: string = "" ): Promise { urlToScrap = urlToScrap.trim(); @@ -197,8 +198,13 @@ export async function scrapSingleUrl( : ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"]; for (const scraper of scrapersInOrder) { + // If exists text coming from crawler, use it + if (existingText && existingText.trim().length >= 100) { + text = existingText; + break; + } [text, html] = await attemptScraping(urlToScrap, scraper); - if (text && text.length >= 100) break; + if (text && text.trim().length >= 100) break; console.log(`Falling back to ${scraper}`); } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 78ea030..ef7bb1f 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -26,7 +26,7 @@ getWebScraperQueue().process( success: success, result: { links: docs.map((doc) => { - return { content: doc, source: doc.metadata.sourceURL }; + return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" }; }), }, project_id: job.data.project_id, From 8a72cf556bf8cff1b21983a8fd50f56abc2ec8af Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 13 May 2024 21:10:58 -0700 Subject: [PATCH 19/47] Nick: --- apps/api/src/lib/entities.ts | 2 +- apps/api/src/scraper/WebScraper/crawler.ts | 5 +---- apps/api/src/scraper/WebScraper/index.ts | 6 +++--- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 0c34126..15550be 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -44,7 +44,7 @@ export type WebScraperOptions = { limit?: number; generateImgAltText?: boolean; replaceAllPathsWithAbsolutePaths?: boolean; - fastMode?: boolean; // have a mode of some sort + mode?: "default" | "fast"; // have a mode of some sort }; pageOptions?: PageOptions; extractorOptions?: ExtractorOptions; diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 25f2e9d..4509531 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -20,7 +20,6 @@ export class WebCrawler { private robotsTxtUrl: string; private robots: any; private generateImgAltText: boolean; - private fastMode: boolean = false; constructor({ initialUrl, @@ -50,7 +49,6 @@ export class WebCrawler { this.maxCrawledLinks = maxCrawledLinks ?? limit; this.maxCrawledDepth = maxCrawledDepth ?? 10; this.generateImgAltText = generateImgAltText ?? 
false; - this.fastMode = false; } private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { @@ -231,8 +229,7 @@ export class WebCrawler { }); // Create a new list to return to avoid modifying the visited list - const filteredLinks = links.filter((link) => !this.visited.has(link.url)); - return filteredLinks; + return links.filter((link) => !this.visited.has(link.url)); } catch (error) { return []; } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 9221666..1eeb65f 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -48,7 +48,7 @@ export class WebScraperDataProvider { private replaceAllPathsWithAbsolutePaths?: boolean = false; private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo"; - private fastMode: boolean = false; + private crawlerMode: string = "default"; authorize(): void { throw new Error("Method not implemented."); @@ -173,7 +173,7 @@ export class WebScraperDataProvider { let fastDocs = [] let documents = []; // check if fast mode is enabled and there is html inside the links - if (this.fastMode && links.some((link) => link.html)) { + if (this.crawlerMode === "fast" && links.some((link) => link.html)) { console.log("Fast mode enabled"); documents = await this.processLinks(allLinks, inProgress, allHtmls); @@ -438,7 +438,7 @@ export class WebScraperDataProvider { this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); - this.fastMode = options.crawlerOptions?.fastMode ?? false; + this.crawlerMode = options.crawlerOptions?.mode ?? 
"default"; // make sure all urls start with https:// this.urls = this.urls.map((url) => { From 7f31959be7a3333b32bc6b3d2dcc128fa07fb5b6 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 14 May 2024 12:04:36 -0700 Subject: [PATCH 20/47] Nick: --- apps/api/src/scraper/WebScraper/crawler.ts | 13 +++++++------ apps/api/src/scraper/WebScraper/index.ts | 2 -- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 4509531..3dc6dc4 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -15,7 +15,7 @@ export class WebCrawler { private maxCrawledLinks: number; private maxCrawledDepth: number; private visited: Set = new Set(); - private crawledUrls: { url: string, html: string }[] = []; + private crawledUrls: Set<{ url: string, html: string }> = new Set(); private limit: number; private robotsTxtUrl: string; private robots: any; @@ -136,24 +136,24 @@ export class WebCrawler { inProgress?: (progress: Progress) => void ): Promise<{ url: string, html: string }[]> { const queue = async.queue(async (task: string, callback) => { - if (this.crawledUrls.length >= this.maxCrawledLinks) { + if (this.crawledUrls.size >= this.maxCrawledLinks) { if (callback && typeof callback === "function") { callback(); } return; } const newUrls = await this.crawl(task); - newUrls.forEach((page) => this.crawledUrls.push(page)); + newUrls.forEach((page) => this.crawledUrls.add(page)); if (inProgress && newUrls.length > 0) { inProgress({ - current: this.crawledUrls.length, + current: this.crawledUrls.size, total: this.maxCrawledLinks, status: "SCRAPING", currentDocumentUrl: newUrls[newUrls.length - 1].url, }); } else if (inProgress) { inProgress({ - current: this.crawledUrls.length, + current: this.crawledUrls.size, total: this.maxCrawledLinks, status: "SCRAPING", currentDocumentUrl: task, @@ -175,7 +175,7 @@ export class WebCrawler { } ); await queue.drain(); - return this.crawledUrls; + return Array.from(this.crawledUrls); } async crawl(url: string): Promise<{url: string, html: string}[]> { @@ -311,3 +311,4 @@ export class WebCrawler { } } + diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 1eeb65f..1f5a785 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -277,8 +277,6 @@ export class WebScraperDataProvider { ): Promise { await this.setCachedDocuments(documents, links); documents = this.removeChildLinks(documents); - documents = this.filterDocsExcludeInclude(documents); - documents = this.filterDepth(documents); return documents.splice(0, this.limit); } From a0fdc6f7c6ec646f9a1627baf1afff314628b487 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 14 May 2024 12:12:40 -0700 Subject: [PATCH 21/47] Nick: --- apps/api/src/scraper/WebScraper/crawler.ts | 8 +++----- apps/api/src/scraper/WebScraper/index.ts | 3 +-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 3dc6dc4..521b1e1 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -15,7 +15,7 @@ export class WebCrawler { private maxCrawledLinks: number; private maxCrawledDepth: number; private visited: Set = new Set(); - private crawledUrls: Set<{ url: string, html: string }> = new Set(); + private crawledUrls: Map = new Map(); private limit: number; private 
robotsTxtUrl: string; private robots: any; @@ -143,7 +143,7 @@ export class WebCrawler { return; } const newUrls = await this.crawl(task); - newUrls.forEach((page) => this.crawledUrls.add(page)); + newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html)); if (inProgress && newUrls.length > 0) { inProgress({ current: this.crawledUrls.size, @@ -175,7 +175,7 @@ export class WebCrawler { } ); await queue.drain(); - return Array.from(this.crawledUrls); + return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); } async crawl(url: string): Promise<{url: string, html: string}[]> { @@ -310,5 +310,3 @@ export class WebCrawler { return []; } } - - diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 1f5a785..13f39c2 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -176,9 +176,8 @@ export class WebScraperDataProvider { if (this.crawlerMode === "fast" && links.some((link) => link.html)) { console.log("Fast mode enabled"); documents = await this.processLinks(allLinks, inProgress, allHtmls); - }else{ - documents = await this.convertUrlsToDocuments(allLinks, inProgress, allHtmls); + documents = await this.processLinks(allLinks, inProgress); } return this.cacheAndFinalizeDocuments(documents, allLinks); From 27e1e22a0abdd49ebcb9574f24c5934e19240241 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 14 May 2024 12:28:25 -0700 Subject: [PATCH 22/47] Update index.test.ts --- .../src/__tests__/e2e_withAuth/index.test.ts | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 0e2caeb..35ae746 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -511,6 +511,107 @@ describe("E2E Tests for API Routes", () => { // }, 120000); // 120 secs // }); + describe("POST /v0/crawl with fast mode", () => { + it("should complete the crawl under 20 seconds", async () => { + const startTime = Date.now(); + + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://flutterbricks.com", + crawlerOptions: { + mode: "fast" + } + }); + + expect(crawlResponse.statusCode).toBe(200); + + const jobId = crawlResponse.body.jobId; + let statusResponse; + let isFinished = false; + + while (!isFinished) { + statusResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(statusResponse.statusCode).toBe(200); + isFinished = statusResponse.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const endTime = Date.now(); + const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds + + console.log(`Time elapsed: ${timeElapsed} seconds`); + + expect(statusResponse.body.status).toBe("completed"); + expect(statusResponse.body).toHaveProperty("data"); + expect(statusResponse.body.data[0]).toHaveProperty("content"); + expect(statusResponse.body.data[0]).toHaveProperty("markdown"); + const results = statusResponse.body.data; + // results.forEach((result, i) => { + // console.log(result.metadata.sourceURL); + // }); + expect(results.length).toBeGreaterThanOrEqual(10); + 
expect(results.length).toBeLessThanOrEqual(15); + + }, 20000); + + // it("should complete the crawl in more than 10 seconds", async () => { + // const startTime = Date.now(); + + // const crawlResponse = await request(TEST_URL) + // .post("/v0/crawl") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ + // url: "https://flutterbricks.com", + // }); + + // expect(crawlResponse.statusCode).toBe(200); + + // const jobId = crawlResponse.body.jobId; + // let statusResponse; + // let isFinished = false; + + // while (!isFinished) { + // statusResponse = await request(TEST_URL) + // .get(`/v0/crawl/status/${jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + // expect(statusResponse.statusCode).toBe(200); + // isFinished = statusResponse.body.status === "completed"; + + // if (!isFinished) { + // await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + // } + // } + + // const endTime = Date.now(); + // const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds + + // console.log(`Time elapsed: ${timeElapsed} seconds`); + + // expect(statusResponse.body.status).toBe("completed"); + // expect(statusResponse.body).toHaveProperty("data"); + // expect(statusResponse.body.data[0]).toHaveProperty("content"); + // expect(statusResponse.body.data[0]).toHaveProperty("markdown"); + // const results = statusResponse.body.data; + // // results.forEach((result, i) => { + // // console.log(result.metadata.sourceURL); + // // }); + // expect(results.length).toBeGreaterThanOrEqual(10); + // expect(results.length).toBeLessThanOrEqual(15); + + // }, 50000);// 15 seconds timeout to account for network delays + }); + describe("GET /is-production", () => { it("should return the production status", async () => { const response = await request(TEST_URL).get("/is-production"); From 87570bdfa1dab843710352098d19bd687acdf3c0 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 11:06:03 -0700 Subject: [PATCH 23/47] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 13f39c2..bdc7483 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -155,22 +155,16 @@ export class WebScraperDataProvider { limit: this.limit, generateImgAltText: this.generateImgAltText, }); - let start = Date.now(); + let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth); - console.log(links.length) - let end = Date.now(); - console.log("Crawl end in seconds ", (end - start) / 1000); + const allLinks = links.map((e) => e.url); const allHtmls = links.map((e)=> e.html); - console.log("All links", allLinks.length); - console.log("All htmls", allHtmls.length); if (this.returnOnlyUrls) { return this.returnOnlyUrlsResponse(allLinks , inProgress); } - - let fastDocs = [] let documents = []; // check if fast mode is enabled and there is html inside the links if (this.crawlerMode === "fast" && links.some((link) => link.html)) { From d10f81e7feecf2250b4ca102899dcc33660468bd Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 11:28:20 -0700 Subject: [PATCH 24/47] Nick: fixes --- apps/api/src/scraper/WebScraper/index.ts | 4 ++-- apps/api/src/scraper/WebScraper/single_url.ts | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff 
--git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index bdc7483..0a86a90 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -71,8 +71,8 @@ export class WebScraperDataProvider { const batchUrls = urls.slice(i, i + this.concurrentRequests); await Promise.all( batchUrls.map(async (url, index) => { - const existingText = allHtmls ? allHtmls[i + index] : ""; - const result = await scrapSingleUrl(url, this.pageOptions, existingText); + const existingHTML = allHtmls ? allHtmls[i + index] : ""; + const result = await scrapSingleUrl(url, this.pageOptions, existingHTML); processedUrls++; if (inProgress) { inProgress({ diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c41beb5..4bbaee7 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -107,7 +107,7 @@ export async function scrapWithPlaywright(url: string): Promise { export async function scrapSingleUrl( urlToScrap: string, pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }, - existingText: string = "" + existingHtml: string = "" ): Promise { urlToScrap = urlToScrap.trim(); @@ -199,8 +199,10 @@ export async function scrapSingleUrl( for (const scraper of scrapersInOrder) { // If exists text coming from crawler, use it - if (existingText && existingText.trim().length >= 100) { - text = existingText; + if (existingHtml && existingHtml.trim().length >= 100) { + let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions); + text = await parseMarkdown(cleanedHtml); + html = existingHtml; break; } [text, html] = await attemptScraping(urlToScrap, scraper); From 1b0d6341d3e5126fd5e7dbe3e9b997becd249aae Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 11:48:12 -0700 Subject: [PATCH 25/47] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 0a86a90..c95e889 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -17,20 +17,7 @@ import { } from "./utils/replacePaths"; import { generateCompletions } from "../../lib/LLM-extraction"; import { getWebScraperQueue } from "../../../src/services/queue-service"; -import { parseMarkdown } from "../../lib/html-to-markdown"; -import cheerio from "cheerio"; -import { excludeNonMainTags } from "./utils/excludeTags"; -const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { - const soup = cheerio.load(html); - soup("script, style, iframe, noscript, meta, head").remove(); - if (pageOptions.onlyMainContent) { - // remove any other tags that are not in the main content - excludeNonMainTags.forEach((tag) => { - soup(tag).remove(); - }); - } - return soup.html(); -}; + export class WebScraperDataProvider { private bullJobId: string; private urls: string[] = [""]; From 4925ee59f60e442995fd6711aabfa1f50d8c12e9 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 15 May 2024 15:50:50 -0300 Subject: [PATCH 26/47] added crawl test suite --- .../src/__tests__/e2e_withAuth/index.test.ts | 325 +++++++++++++----- apps/test-suite/data/crawl.json | 226 ++++++++++++ .../data/{websites.json => scrape.json} | 0 apps/test-suite/package.json | 4 +- apps/test-suite/tests/crawl.test.ts | 
148 ++++++++ .../{index.test.ts => tests/scrape.test.ts} | 19 +- apps/test-suite/tsconfig.json | 2 +- 7 files changed, 621 insertions(+), 103 deletions(-) create mode 100644 apps/test-suite/data/crawl.json rename apps/test-suite/data/{websites.json => scrape.json} (100%) create mode 100644 apps/test-suite/tests/crawl.test.ts rename apps/test-suite/{index.test.ts => tests/scrape.test.ts} (93%) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 0e2caeb..e21e07d 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -146,7 +146,241 @@ describe("E2E Tests for API Routes", () => { ); }); - // Additional tests for insufficient credits? + it("should return a successful response with a valid API key and valid includes option", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + limit: 10, + crawlerOptions: { + includes: ["/blog/*"], + }, + }); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 30000)); + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + console.log({url}) + expect(url.startsWith("https://mendable.ai/blog/")).toBeTruthy(); + }); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + }, 60000); // 60 seconds + + it("should return a successful response with a valid API key and valid excludes option", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + limit: 10, + crawlerOptions: { + excludes: ["/blog/*"], + }, + }); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 30000)); + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + 
expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + expect(url.startsWith("https://mendable.ai/blog/")).toBeFalsy(); + }); + }, 60000); // 60 seconds + + it("should return a successful response with a valid API key and valid excludes option", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + limit: 3, + }); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 30000)); + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data.length).toBe(3); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + }, 60000); // 60 seconds + + it("should return a successful response with max depth option for a valid crawl job", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + crawlerOptions: { maxDepth: 2 }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + // wait for 60 seconds + await new Promise((r) => setTimeout(r, 60000)); + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(1); + + // Check if all URLs have a maximum depth of 1 + urls.forEach((url: string) => { + const depth = new URL(url).pathname.split("/").filter(Boolean).length; + expect(depth).toBeLessThanOrEqual(1); + }); + }, 120000); + + it("should return a successful response with a valid API key and valid onlyMainContent option", async () => { + const crawlResponse = await 
request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + limit: 10, + }); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 30000)); + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data.length).toBe(3); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].content).not.toContain("main menu"); + }, 60000); // 60 seconds + + it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://firecrawl.dev", + pageOptions: { includeHtml: true }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 30000)); + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + + // 120 seconds + expect(completedResponse.body.data[0]).toHaveProperty("html"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl"); + expect(completedResponse.body.data[0].markdown).toContain("FireCrawl"); + expect(completedResponse.body.data[0].html).toContain(" { @@ -248,7 +482,7 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(404); }); - it("should return a successful response for a valid crawl job", async () => { + it("should return a successful crawl status response for a valid crawl job", async () => { const crawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", 
`Bearer ${process.env.TEST_API_KEY}`) @@ -278,90 +512,7 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl"); }, 60000); // 60 seconds - - it("should return a successful response with max depth option for a valid crawl job", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 2 }, - }); - expect(crawlResponse.statusCode).toBe(200); - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); - // wait for 60 seconds - await new Promise((r) => setTimeout(r, 60000)); - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(1); - - // Check if all URLs have a maximum depth of 1 - urls.forEach((url) => { - const depth = new URL(url).pathname.split("/").filter(Boolean).length; - expect(depth).toBeLessThanOrEqual(1); - }); - }, 120000); - - it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://firecrawl.dev", - pageOptions: { includeHtml: true }, - }); - expect(crawlResponse.statusCode).toBe(200); - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); - - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 30000)); - - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - - // 120 seconds - expect(completedResponse.body.data[0]).toHaveProperty("html"); - 
expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl"); - expect(completedResponse.body.data[0].markdown).toContain("FireCrawl"); - expect(completedResponse.body.data[0].html).toContain(" { const crawlResponse = await request(TEST_URL) @@ -371,8 +522,6 @@ describe("E2E Tests for API Routes", () => { .send({ url: "https://jestjs.io" }); expect(crawlResponse.statusCode).toBe(200); - - // wait for 30 seconds await new Promise((r) => setTimeout(r, 10000)); diff --git a/apps/test-suite/data/crawl.json b/apps/test-suite/data/crawl.json new file mode 100644 index 0000000..8577a6e --- /dev/null +++ b/apps/test-suite/data/crawl.json @@ -0,0 +1,226 @@ +[ + { + "website": "https://www.anthropic.com/claude", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://mendable.ai/pricing", + "expected_min_num_of_pages": 29, + "expected_crawled_pages": [ + "https://mendable.ai/", + "https://mendable.ai/blog", + "https://mendable.ai/signin", + "https://mendable.ai/signup", + "https://mendable.ai", + "https://mendable.ai/usecases/sales-enablement", + "https://mendable.ai/usecases/documentation", + "https://mendable.ai/usecases/cs-enablement", + "https://mendable.ai/usecases/productcopilot", + "https://mendable.ai/security" + ], + "notes": "This one should not go backwards, but it does!" + }, + { + "website": "https://openai.com/news", + "expected_min_num_of_pages": 59, + "expected_crawled_pages": [ + "https://openai.com/news/company/", + "https://openai.com/news/research/", + "https://openai.com/news/safety-and-alignment/", + "https://openai.com/news/stories/" + ] + }, + { + "website": "https://agentops.ai", + "expected_min_num_of_pages": 7, + "expected_crawled_pages": [ + "https://www.agentops.ai/blog/effortless-hr-management-with-saas", + "https://www.agentops.ai/blog/streamlining-hr-with-saas", + "https://www.agentops.ai/blog/simplify-hr-with-modern-saas-solutions", + "https://www.agentops.ai/blog/efficient-hr-operations-with-saas", + "https://www.agentops.ai/blog/hr-made-simple-with-saas", + "https://www.agentops.ai/about-us", + "https://www.agentops.ai/contact-us" + ] + }, + { + "website": "https://ycombinator.com/companies", + "expected_min_num_of_pages": 45, + "expected_crawled_pages": [ + "https://www.ycombinator.com/companies/industry/elearning", + "https://www.ycombinator.com/companies/industry/computer-vision", + "https://www.ycombinator.com/companies/industry/health-tech", + "https://www.ycombinator.com/companies/industry/education", + "https://www.ycombinator.com/companies/industry/robotics", + "https://www.ycombinator.com/companies/industry/hardware", + "https://www.ycombinator.com/companies/industry/saas", + "https://www.ycombinator.com/companies/industry/hard-tech", + "https://www.ycombinator.com/companies/industry/developer-tools", + "https://www.ycombinator.com/companies/industry/entertainment", + "https://www.ycombinator.com/companies/industry/finance", + "https://www.ycombinator.com/companies/industry/generative-ai", + "https://www.ycombinator.com/companies/industry/machine-learning" + ] + }, + { + "website": "https://firecrawl.dev", + "expected_min_num_of_pages": 2, + "expected_crawled_pages": [ + "https://firecrawl.dev/", + "https://firecrawl.dev/pricing" + ] + }, + { + "website": "https://en.wikipedia.org/wiki/T._N._Seshan", + "expected_min_num_of_pages": 100, + "expected_crawled_pages": [ + "https://en.wikipedia.org/wiki/Wikipedia:Contents", + 
"https://en.wikipedia.org/wiki/Wikipedia:Contact_us", + "https://en.wikipedia.org/wiki/V._S._Ramadevi", + "https://en.wikipedia.org/wiki/Wikipedia:About", + "https://en.wikipedia.org/wiki/Help:Introduction", + "https://en.wikipedia.org/wiki/H._D._Deve_Gowda", + "https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg" + ] + }, + { + "website": "https://mendable.ai/blog", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://www.framer.com/pricing", + "expected_min_num_of_pages": 58, + "expected_crawled_pages": [ + "https://www.framer.com/features/navigation/", + "https://www.framer.com/contact/", + "https://www.framer.com/add-ons/", + "https://www.framer.com/free-saas-ui-kit/", + "https://www.framer.com/help/", + "https://www.framer.com/features/effects/", + "https://www.framer.com/enterprise/", + "https://www.framer.com/templates/" + ] + }, + { + "website": "https://fly.io/docs/gpus/gpu-quickstart", + "expected_min_num_of_pages": 39, + "expected_crawled_pages": [ + "https://fly.io/docs/getting-started/", + "https://fly.io/docs/hands-on/", + "https://fly.io/docs/about/support/", + "https://fly.io/docs/blueprints/going-to-production-with-healthcare-apps/", + "https://fly.io/docs/machines/flyctl/fly-machine-update/", + "https://fly.io/docs/blueprints/review-apps-guide/", + "https://fly.io/docs/blueprints/supercronic/" + ], + "notes": "This one should not go backwards, but it does!" + }, + { + "website": "https://news.ycombinator.com/", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://www.vellum.ai/llm-leaderboard", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://www.bigbadtoystore.com", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://www.instructables.com", + "expected_min_num_of_pages": 78, + "expected_crawled_pages": [ + "https://www.instructables.com/circuits/", + "https://www.instructables.com/circuits/apple/projects/", + "https://www.instructables.com/circuits/art/projects/", + "https://www.instructables.com/circuits/electronics/projects/", + "https://www.instructables.com/circuits/microsoft/projects/", + "https://www.instructables.com/circuits/microcontrollers/projects/", + "https://www.instructables.com/circuits/community/", + "https://www.instructables.com/circuits/leds/projects/", + "https://www.instructables.com/circuits/gadgets/projects/", + "https://www.instructables.com/circuits/arduino/projects/", + "https://www.instructables.com/circuits/lasers/projects/", + "https://www.instructables.com/circuits/clocks/projects/" + ] + }, + { + "website": "https://www.powells.com", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://www.royalacademy.org.uk", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://www.eastbaytimes.com", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://www.manchestereveningnews.co.uk", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://physicsworld.com", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + }, + { + "website": "https://richmondconfidential.org", + "expected_min_num_of_pages": 50, + "expected_crawled_pages": [ + "https://richmondconfidential.org/2009/10/13/salesians-star-guard-has-a-big-impact/", + 
"https://richmondconfidential.org/2009/10/13/on-team-of-beginners-oilers-old-hand-stands-out/", + "https://richmondconfidential.org/2009/10/19/point-richmond-clockmaker-turns-clutter-into-crafts/", + "https://richmondconfidential.org/2009/10/13/profile-maurice-cathy/", + "https://richmondconfidential.org/2009/10/13/soul-food-rescue-mission-rebuilds-diets-and-lives/", + "https://richmondconfidential.org/2009/10/21/in-tough-economy-pain-trickles-to-the-bottom/", + "https://richmondconfidential.org/2009/10/19/richmond-homicide-map/", + "https://richmondconfidential.org/2009/10/13/rough-roads-for-richmonds-cab-drivers/", + "https://richmondconfidential.org/2009/10/13/before-napa-there-was-winehaven/", + "https://richmondconfidential.org/2009/10/13/family-calls-for-end-to-violence-at-memorial-for-slain-woman-friend/" + ] + }, + { + "website": "https://www.techinasia.com", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""], + "notes": "The website has a paywall and bot detectors." + }, + { + "website": "https://www.boardgamegeek.com", + "expected_min_num_of_pages": 15, + "expected_crawled_pages": [ + "https://www.boardgamegeek.com/browse/boardgameartist", + "https://www.boardgamegeek.com/browse/boardgamehonor", + "https://www.boardgamegeek.com/browse/boardgamepublisher", + "https://www.boardgamegeek.com/browse/boardgamepodcast", + "https://www.boardgamegeek.com/wiki/page/Index", + "https://www.boardgamegeek.com/browse/boardgamecategory", + "https://www.boardgamegeek.com/boardgame/random", + "https://www.boardgamegeek.com/browse/boardgamemechanic", + "https://www.boardgamegeek.com/forums", + "https://www.boardgamegeek.com/gonecardboard", + "https://www.boardgamegeek.com/browse/boardgameaccessory", + "https://www.boardgamegeek.com/browse/boardgamedesigner", + "https://www.boardgamegeek.com/", + "https://www.boardgamegeek.com/previews", + "https://www.boardgamegeek.com/browse/boardgame" + ] + }, + { + "website": "https://www.mountainproject.com", + "expected_min_num_of_pages": 0, + "expected_crawled_pages": [""] + } +] diff --git a/apps/test-suite/data/websites.json b/apps/test-suite/data/scrape.json similarity index 100% rename from apps/test-suite/data/websites.json rename to apps/test-suite/data/scrape.json diff --git a/apps/test-suite/package.json b/apps/test-suite/package.json index 74ab7a6..33aa2cd 100644 --- a/apps/test-suite/package.json +++ b/apps/test-suite/package.json @@ -3,7 +3,9 @@ "version": "1.0.0", "description": "", "scripts": { - "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false" + "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false", + "test:scrape": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/scrape.test.ts", + "test:crawl": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/crawl.test.ts" }, "author": "", "license": "ISC", diff --git a/apps/test-suite/tests/crawl.test.ts b/apps/test-suite/tests/crawl.test.ts new file mode 100644 index 0000000..b56a76e --- /dev/null +++ b/apps/test-suite/tests/crawl.test.ts @@ -0,0 +1,148 @@ +import request from "supertest"; +import dotenv from "dotenv"; +import { WebsiteScrapeError } from "../utils/types"; +import { logErrors } from "../utils/log"; + +import websitesData from "../data/crawl.json"; +import "dotenv/config"; + +import fs from 'fs'; +dotenv.config(); + +interface WebsiteData { + website: string; + 
expected_min_num_of_pages: number; + expected_crawled_pages: string[]; +} + +const TEST_URL = "http://127.0.0.1:3002"; + +describe("Crawling Checkup (E2E)", () => { + beforeAll(() => { + if (!process.env.TEST_API_KEY) { + throw new Error("TEST_API_KEY is not set"); + } + }); + + describe("Crawling website tests with a dataset", () => { + it("Should crawl the website and verify the response", async () => { + let passedTests = 0; + const batchSize = 15; + const batchPromises = []; + const startTime = new Date().getTime(); + const date = new Date(); + const logsDir = `logs/${date.getMonth() + 1}-${date.getDate()}-${date.getFullYear()}`; + + let errorLogFileName = `${logsDir}/run.log_${new Date().toTimeString().split(' ')[0]}`; + const errorLog: WebsiteScrapeError[] = []; + + for (let i = 0; i < websitesData.length; i += batchSize) { + await new Promise(resolve => setTimeout(resolve, 10000)); + + const batch = websitesData.slice(i, i + batchSize); + const batchPromise = Promise.all( + batch.map(async (websiteData: WebsiteData) => { + try { + const crawlResponse = await request(TEST_URL || "") + .post("/v0/crawl") + .set("Content-Type", "application/json") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .send({ url: websiteData.website, pageOptions: { onlyMainContent: true }, crawlerOptions: { limit: 100 }}); + + await new Promise(resolve => setTimeout(resolve, 300000)); // wait for 300 seconds + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + console.log('-------------------') + console.log(websiteData.website); + + if (!completedResponse.body.data) { + console.log(completedResponse.body.partial_data.length); + const urls = completedResponse.body.partial_data.map((page: any) => page.metadata?.sourceURL); + console.log(urls); + } else { + console.log(completedResponse.body.data.length); + const urls = completedResponse.body.data.map((page: any) => page.metadata?.sourceURL); + console.log(urls); + } + + console.log('-------------------') + + // if (!completedResponse.body || completedResponse.body.status !== "completed") { + // errorLog.push({ + // website: websiteData.website, + // prompt: 'CRAWL', + // expected_output: 'SUCCESS', + // actual_output: 'FAILURE', + // error: `Crawl job did not complete successfully.` + // }); + // return null; + // } + + // // check how many webpages were crawled successfully + // // compares with expected_num_of_pages + // if (completedResponse.body.data.length < websiteData.expected_min_num_of_pages) { + // errorLog.push({ + // website: websiteData.website, + // prompt: 'CRAWL', + // expected_output: `SUCCESS: ${websiteData.expected_min_num_of_pages}`, + // actual_output: `FAILURE: ${completedResponse.body.data.length}`, + // error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}` + // }); + // return null; + // } + + // // checks if crawled pages contain expected_crawled_pages + // if (websiteData.expected_crawled_pages.some(page => !completedResponse.body.data.includes(page))) { + // errorLog.push({ + // website: websiteData.website, + // prompt: 'CRAWL', + // expected_output: `SUCCESS: ${websiteData.expected_crawled_pages}`, + // actual_output: `FAILURE: ${completedResponse.body.data}`, + // error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data}` + // }); + // return null; + // } + + 
passedTests++; + return { + website: websiteData.website, + statusCode: completedResponse.statusCode, + }; + } catch (error) { + console.error(`Error processing ${websiteData.website}: ${error}`); + errorLog.push({ + website: websiteData.website, + prompt: 'CRAWL', + expected_output: 'SUCCESS', + actual_output: 'FAILURE', + error: `Error processing ${websiteData.website}: ${error}` + }); + return null; + } + }) + ); + batchPromises.push(batchPromise); + } + + (await Promise.all(batchPromises)).flat(); + const score = (passedTests / websitesData.length) * 100; + const endTime = new Date().getTime(); + const timeTaken = (endTime - startTime) / 1000; + console.log(`Score: ${score}%`); + + await logErrors(errorLog, timeTaken, 0, score, websitesData.length); + + if (process.env.ENV === "local" && errorLog.length > 0) { + if (!fs.existsSync(logsDir)){ + fs.mkdirSync(logsDir, { recursive: true }); + } + fs.writeFileSync(errorLogFileName, JSON.stringify(errorLog, null, 2)); + } + + expect(score).toBeGreaterThanOrEqual(95); + }, 350000); // 150 seconds timeout + }); +}); \ No newline at end of file diff --git a/apps/test-suite/index.test.ts b/apps/test-suite/tests/scrape.test.ts similarity index 93% rename from apps/test-suite/index.test.ts rename to apps/test-suite/tests/scrape.test.ts index 8d6c31f..3f421dc 100644 --- a/apps/test-suite/index.test.ts +++ b/apps/test-suite/tests/scrape.test.ts @@ -1,16 +1,14 @@ import request from "supertest"; import dotenv from "dotenv"; -import Anthropic from "@anthropic-ai/sdk"; -import { numTokensFromString } from "./utils/tokens"; +import { numTokensFromString } from "../utils/tokens"; import OpenAI from "openai"; -import { WebsiteScrapeError } from "./utils/types"; -import { logErrors } from "./utils/log"; +import { WebsiteScrapeError } from "../utils/types"; +import { logErrors } from "../utils/log"; -const websitesData = require("./data/websites.json"); +import websitesData from "../data/scrape.json"; import "dotenv/config"; -const fs = require('fs'); - +import fs from 'fs'; dotenv.config(); interface WebsiteData { @@ -21,8 +19,7 @@ interface WebsiteData { const TEST_URL = "http://127.0.0.1:3002"; - -describe("Scraping/Crawling Checkup (E2E)", () => { +describe("Scraping Checkup (E2E)", () => { beforeAll(() => { if (!process.env.TEST_API_KEY) { throw new Error("TEST_API_KEY is not set"); @@ -72,10 +69,6 @@ describe("Scraping/Crawling Checkup (E2E)", () => { return null; } - const anthropic = new Anthropic({ - apiKey: process.env.ANTHROPIC_API_KEY, - }); - const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, }); diff --git a/apps/test-suite/tsconfig.json b/apps/test-suite/tsconfig.json index e075f97..afa29e7 100644 --- a/apps/test-suite/tsconfig.json +++ b/apps/test-suite/tsconfig.json @@ -39,7 +39,7 @@ // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */ // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */ // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */ - // "resolveJsonModule": true, /* Enable importing .json files. */ + "resolveJsonModule": true, /* Enable importing .json files. */ // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */ // "noResolve": true, /* Disallow 'import's, 'require's or ''s from expanding the number of files TypeScript should add to a project. 
*/ From fd82982a3198e68a136c2f8ce99a89639ee495d5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 12:11:16 -0700 Subject: [PATCH 27/47] Nick: --- apps/api/openapi.json | 121 +++++++++++++++++++++++++++++++++- apps/test-suite/index.test.ts | 2 +- 2 files changed, 120 insertions(+), 3 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 127fe51..b0f8b99 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -18,8 +18,8 @@ "paths": { "/scrape": { "post": { - "summary": "Scrape a single URL", - "operationId": "scrapeSingleUrl", + "summary": "Scrape a single URL and optionally extract information using an LLM", + "operationId": "scrapeAndExtractFromUrl", "tags": ["Scraping"], "security": [ { @@ -45,8 +45,43 @@ "type": "boolean", "description": "Only return the main content of the page excluding headers, navs, footers, etc.", "default": false + }, + "includeHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. Will output a html key in the response.", + "default": false } } + }, + "extractorOptions": { + "type": "object", + "description": "Options for LLM-based extraction of structured information from the page content", + "properties": { + "mode": { + "type": "string", + "enum": ["llm-extraction"], + "description": "The extraction mode to use, currently supports 'llm-extraction'" + }, + "extractionPrompt": { + "type": "string", + "description": "A prompt describing what information to extract from the page" + }, + "extractionSchema": { + "type": "object", + "additionalProperties": true, + "description": "The schema for the data to be extracted", + "required": [ + "company_mission", + "supports_sso", + "is_open_source" + ] + } + } + }, + "timeout": { + "type": "integer", + "description": "Timeout in milliseconds for the request", + "default": 30000 } }, "required": ["url"] @@ -126,6 +161,16 @@ "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.", "default": false }, + "maxDepth": { + "type": "integer", + "description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on." + }, + "mode": { + "type": "string", + "enum": ["default", "fast"], + "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.", + "default": "default" + }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl", @@ -140,6 +185,11 @@ "type": "boolean", "description": "Only return the main content of the page excluding headers, navs, footers, etc.", "default": false + }, + "includeHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. Will output a html key in the response.", + "default": false } } } @@ -206,6 +256,11 @@ "type": "boolean", "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.", "default": true + }, + "includeHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. 
Will output a html key in the response.", + "default": false } } }, @@ -302,6 +357,63 @@ "$ref": "#/components/schemas/ScrapeResponse" }, "description": "Data returned from the job (null when it is in progress)" + }, + "partial_data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ScrapeResponse" + }, + "description": "Partial documents returned as it is being crawls (streaming). When a page is ready it will append to the parial_data array - so no need to wait for all the website to be crawled." + } + } + } + } + } + }, + "402": { + "description": "Payment required" + }, + "429": { + "description": "Too many requests" + }, + "500": { + "description": "Server error" + } + } + } + }, + "/crawl/cancel/{jobId}": { + "delete": { + "tags": ["Crawl"], + "summary": "Cancel a crawl job", + "operationId": "cancelCrawlJob", + "security": [ + { + "bearerAuth": [] + } + ], + "parameters": [ + { + "name": "jobId", + "in": "path", + "description": "ID of the crawl job", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "status": { + "type": "string", + "description": "Returns cancelled." } } } @@ -344,6 +456,11 @@ "content": { "type": "string" }, + "html": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeHtml` is true" + }, "metadata": { "type": "object", "properties": { diff --git a/apps/test-suite/index.test.ts b/apps/test-suite/index.test.ts index 8d6c31f..7b38791 100644 --- a/apps/test-suite/index.test.ts +++ b/apps/test-suite/index.test.ts @@ -183,7 +183,7 @@ describe("Scraping/Crawling Checkup (E2E)", () => { } - expect(score).toBeGreaterThanOrEqual(75); + expect(score).toBeGreaterThanOrEqual(70); }, 350000); // 150 seconds timeout }); }); From 4745d114be3123ff9aa1d0fb98d0e1fe41995562 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 12:42:14 -0700 Subject: [PATCH 28/47] Update crawl.test.ts --- apps/test-suite/tests/crawl.test.ts | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/apps/test-suite/tests/crawl.test.ts b/apps/test-suite/tests/crawl.test.ts index b56a76e..cdf0945 100644 --- a/apps/test-suite/tests/crawl.test.ts +++ b/apps/test-suite/tests/crawl.test.ts @@ -49,14 +49,29 @@ describe("Crawling Checkup (E2E)", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .send({ url: websiteData.website, pageOptions: { onlyMainContent: true }, crawlerOptions: { limit: 100 }}); - await new Promise(resolve => setTimeout(resolve, 300000)); // wait for 300 seconds + const jobId = crawlResponse.body.jobId; + let completedResponse; + let isFinished = false; - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + while (!isFinished) { + completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + isFinished = completedResponse.body.status === "completed"; + + if (!isFinished) { + await new Promise(resolve => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } console.log('-------------------') console.log(websiteData.website); + if(!completedResponse) { + // fail the test + console.log('No response'); + return null; + } if (!completedResponse.body.data) { 
console.log(completedResponse.body.partial_data.length); From 58053eb423335b2f3504990f6f95ec16f02b8dd8 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 12:47:35 -0700 Subject: [PATCH 29/47] Update rate-limiter.ts --- apps/api/src/services/rate-limiter.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 5bc9acb..34c243b 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -43,7 +43,7 @@ export const crawlStatusRateLimiter = new RateLimiterRedis({ export const testSuiteRateLimiter = new RateLimiterRedis({ storeClient: redisClient, keyPrefix: "middleware", - points: 1000, + points: 100000, duration: 60, // Duration in seconds }); From 499671c87f2cbb560a8c783c0b1bd27af2640fd1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 12:50:13 -0700 Subject: [PATCH 30/47] Update crawl.test.ts --- apps/test-suite/tests/crawl.test.ts | 152 ++++++++++------------------ 1 file changed, 51 insertions(+), 101 deletions(-) diff --git a/apps/test-suite/tests/crawl.test.ts b/apps/test-suite/tests/crawl.test.ts index cdf0945..ff9c212 100644 --- a/apps/test-suite/tests/crawl.test.ts +++ b/apps/test-suite/tests/crawl.test.ts @@ -27,8 +27,6 @@ describe("Crawling Checkup (E2E)", () => { describe("Crawling website tests with a dataset", () => { it("Should crawl the website and verify the response", async () => { let passedTests = 0; - const batchSize = 15; - const batchPromises = []; const startTime = new Date().getTime(); const date = new Date(); const logsDir = `logs/${date.getMonth() + 1}-${date.getDate()}-${date.getFullYear()}`; @@ -36,113 +34,65 @@ describe("Crawling Checkup (E2E)", () => { let errorLogFileName = `${logsDir}/run.log_${new Date().toTimeString().split(' ')[0]}`; const errorLog: WebsiteScrapeError[] = []; - for (let i = 0; i < websitesData.length; i += batchSize) { + for (const websiteData of websitesData) { await new Promise(resolve => setTimeout(resolve, 10000)); - const batch = websitesData.slice(i, i + batchSize); - const batchPromise = Promise.all( - batch.map(async (websiteData: WebsiteData) => { - try { - const crawlResponse = await request(TEST_URL || "") - .post("/v0/crawl") - .set("Content-Type", "application/json") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .send({ url: websiteData.website, pageOptions: { onlyMainContent: true }, crawlerOptions: { limit: 100 }}); + try { + const crawlResponse = await request(TEST_URL || "") + .post("/v0/crawl") + .set("Content-Type", "application/json") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .send({ url: websiteData.website, pageOptions: { onlyMainContent: true }, crawlerOptions: { limit: 100 }}); - const jobId = crawlResponse.body.jobId; - let completedResponse; - let isFinished = false; + const jobId = crawlResponse.body.jobId; + let completedResponse; + let isFinished = false; - while (!isFinished) { - completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + while (!isFinished) { + completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - isFinished = completedResponse.body.status === "completed"; + isFinished = completedResponse.body.status === "completed"; - if (!isFinished) { - await new Promise(resolve => setTimeout(resolve, 1000)); // Wait for 1 second before 
checking again - } - } - - console.log('-------------------') - console.log(websiteData.website); - if(!completedResponse) { - // fail the test - console.log('No response'); - return null; - } - - if (!completedResponse.body.data) { - console.log(completedResponse.body.partial_data.length); - const urls = completedResponse.body.partial_data.map((page: any) => page.metadata?.sourceURL); - console.log(urls); - } else { - console.log(completedResponse.body.data.length); - const urls = completedResponse.body.data.map((page: any) => page.metadata?.sourceURL); - console.log(urls); - } - - console.log('-------------------') - - // if (!completedResponse.body || completedResponse.body.status !== "completed") { - // errorLog.push({ - // website: websiteData.website, - // prompt: 'CRAWL', - // expected_output: 'SUCCESS', - // actual_output: 'FAILURE', - // error: `Crawl job did not complete successfully.` - // }); - // return null; - // } - - // // check how many webpages were crawled successfully - // // compares with expected_num_of_pages - // if (completedResponse.body.data.length < websiteData.expected_min_num_of_pages) { - // errorLog.push({ - // website: websiteData.website, - // prompt: 'CRAWL', - // expected_output: `SUCCESS: ${websiteData.expected_min_num_of_pages}`, - // actual_output: `FAILURE: ${completedResponse.body.data.length}`, - // error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}` - // }); - // return null; - // } - - // // checks if crawled pages contain expected_crawled_pages - // if (websiteData.expected_crawled_pages.some(page => !completedResponse.body.data.includes(page))) { - // errorLog.push({ - // website: websiteData.website, - // prompt: 'CRAWL', - // expected_output: `SUCCESS: ${websiteData.expected_crawled_pages}`, - // actual_output: `FAILURE: ${completedResponse.body.data}`, - // error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data}` - // }); - // return null; - // } - - passedTests++; - return { - website: websiteData.website, - statusCode: completedResponse.statusCode, - }; - } catch (error) { - console.error(`Error processing ${websiteData.website}: ${error}`); - errorLog.push({ - website: websiteData.website, - prompt: 'CRAWL', - expected_output: 'SUCCESS', - actual_output: 'FAILURE', - error: `Error processing ${websiteData.website}: ${error}` - }); - return null; + if (!isFinished) { + await new Promise(resolve => setTimeout(resolve, 1000)); // Wait for 1 second before checking again } - }) - ); - batchPromises.push(batchPromise); + } + + console.log('-------------------') + console.log(websiteData.website); + if(!completedResponse) { + // fail the test + console.log('No response'); + continue; + } + + if (!completedResponse.body.data) { + console.log(completedResponse.body.partial_data.length); + const urls = completedResponse.body.partial_data.map((page: any) => page.metadata?.sourceURL); + console.log(urls); + } else { + console.log(completedResponse.body.data.length); + const urls = completedResponse.body.data.map((page: any) => page.metadata?.sourceURL); + console.log(urls); + } + + console.log('-------------------') + + passedTests++; + } catch (error) { + console.error(`Error processing ${websiteData.website}: ${error}`); + errorLog.push({ + website: websiteData.website, + prompt: 'CRAWL', + expected_output: 'SUCCESS', + actual_output: 'FAILURE', + error: `Error processing ${websiteData.website}: ${error}` + }); + } 
} - (await Promise.all(batchPromises)).flat(); const score = (passedTests / websitesData.length) * 100; const endTime = new Date().getTime(); const timeTaken = (endTime - startTime) / 1000; @@ -160,4 +110,4 @@ describe("Crawling Checkup (E2E)", () => { expect(score).toBeGreaterThanOrEqual(95); }, 350000); // 150 seconds timeout }); -}); \ No newline at end of file +}); From 98dd672d0a06700b9a517be53410f2f0731e6f7c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 12:55:04 -0700 Subject: [PATCH 31/47] Update crawl.json --- apps/test-suite/data/crawl.json | 46 --------------------------------- 1 file changed, 46 deletions(-) diff --git a/apps/test-suite/data/crawl.json b/apps/test-suite/data/crawl.json index 8577a6e..28d436b 100644 --- a/apps/test-suite/data/crawl.json +++ b/apps/test-suite/data/crawl.json @@ -117,21 +117,11 @@ ], "notes": "This one should not go backwards, but it does!" }, - { - "website": "https://news.ycombinator.com/", - "expected_min_num_of_pages": 0, - "expected_crawled_pages": [""] - }, { "website": "https://www.vellum.ai/llm-leaderboard", "expected_min_num_of_pages": 0, "expected_crawled_pages": [""] }, - { - "website": "https://www.bigbadtoystore.com", - "expected_min_num_of_pages": 0, - "expected_crawled_pages": [""] - }, { "website": "https://www.instructables.com", "expected_min_num_of_pages": 78, @@ -150,31 +140,6 @@ "https://www.instructables.com/circuits/clocks/projects/" ] }, - { - "website": "https://www.powells.com", - "expected_min_num_of_pages": 0, - "expected_crawled_pages": [""] - }, - { - "website": "https://www.royalacademy.org.uk", - "expected_min_num_of_pages": 0, - "expected_crawled_pages": [""] - }, - { - "website": "https://www.eastbaytimes.com", - "expected_min_num_of_pages": 0, - "expected_crawled_pages": [""] - }, - { - "website": "https://www.manchestereveningnews.co.uk", - "expected_min_num_of_pages": 0, - "expected_crawled_pages": [""] - }, - { - "website": "https://physicsworld.com", - "expected_min_num_of_pages": 0, - "expected_crawled_pages": [""] - }, { "website": "https://richmondconfidential.org", "expected_min_num_of_pages": 50, @@ -191,12 +156,6 @@ "https://richmondconfidential.org/2009/10/13/family-calls-for-end-to-violence-at-memorial-for-slain-woman-friend/" ] }, - { - "website": "https://www.techinasia.com", - "expected_min_num_of_pages": 0, - "expected_crawled_pages": [""], - "notes": "The website has a paywall and bot detectors." 
- }, { "website": "https://www.boardgamegeek.com", "expected_min_num_of_pages": 15, @@ -217,10 +176,5 @@ "https://www.boardgamegeek.com/previews", "https://www.boardgamegeek.com/browse/boardgame" ] - }, - { - "website": "https://www.mountainproject.com", - "expected_min_num_of_pages": 0, - "expected_crawled_pages": [""] } ] From f15b8f855e7152f7672ebce57fc42f43c81aaf4e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 12:57:24 -0700 Subject: [PATCH 32/47] Update crawl.json --- apps/test-suite/data/crawl.json | 5 ----- 1 file changed, 5 deletions(-) diff --git a/apps/test-suite/data/crawl.json b/apps/test-suite/data/crawl.json index 28d436b..3a56131 100644 --- a/apps/test-suite/data/crawl.json +++ b/apps/test-suite/data/crawl.json @@ -1,9 +1,4 @@ [ - { - "website": "https://www.anthropic.com/claude", - "expected_min_num_of_pages": 0, - "expected_crawled_pages": [""] - }, { "website": "https://mendable.ai/pricing", "expected_min_num_of_pages": 29, From 95ffaa22368371f4430440427b9cb507178d4ff9 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 12:58:02 -0700 Subject: [PATCH 33/47] Update crawl.test.ts --- apps/test-suite/tests/crawl.test.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/apps/test-suite/tests/crawl.test.ts b/apps/test-suite/tests/crawl.test.ts index ff9c212..bbf4d4c 100644 --- a/apps/test-suite/tests/crawl.test.ts +++ b/apps/test-suite/tests/crawl.test.ts @@ -35,8 +35,6 @@ describe("Crawling Checkup (E2E)", () => { const errorLog: WebsiteScrapeError[] = []; for (const websiteData of websitesData) { - await new Promise(resolve => setTimeout(resolve, 10000)); - try { const crawlResponse = await request(TEST_URL || "") .post("/v0/crawl") From da8d94105de5a56c04ac98e09308872c53f4e4e3 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 15 May 2024 17:16:03 -0300 Subject: [PATCH 34/47] fixed for testing the crawl algorithm only --- apps/test-suite/tests/crawl.test.ts | 48 +++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/apps/test-suite/tests/crawl.test.ts b/apps/test-suite/tests/crawl.test.ts index bbf4d4c..85bcabe 100644 --- a/apps/test-suite/tests/crawl.test.ts +++ b/apps/test-suite/tests/crawl.test.ts @@ -40,10 +40,10 @@ describe("Crawling Checkup (E2E)", () => { .post("/v0/crawl") .set("Content-Type", "application/json") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .send({ url: websiteData.website, pageOptions: { onlyMainContent: true }, crawlerOptions: { limit: 100 }}); + .send({ url: websiteData.website, pageOptions: { onlyMainContent: true }, crawlerOptions: { limit: 100, returnOnlyUrls: true }}); const jobId = crawlResponse.body.jobId; - let completedResponse; + let completedResponse: any; let isFinished = false; while (!isFinished) { @@ -58,25 +58,47 @@ describe("Crawling Checkup (E2E)", () => { } } - console.log('-------------------') - console.log(websiteData.website); if(!completedResponse) { // fail the test console.log('No response'); continue; } - if (!completedResponse.body.data) { - console.log(completedResponse.body.partial_data.length); - const urls = completedResponse.body.partial_data.map((page: any) => page.metadata?.sourceURL); - console.log(urls); - } else { - console.log(completedResponse.body.data.length); - const urls = completedResponse.body.data.map((page: any) => page.metadata?.sourceURL); - console.log(urls); + if (!completedResponse.body || completedResponse.body.status !== "completed") { + errorLog.push({ 
+ website: websiteData.website, + prompt: 'CRAWL', + expected_output: 'SUCCESS', + actual_output: 'FAILURE', + error: `Crawl job did not complete successfully.` + }); + return null; } - console.log('-------------------') + // check how many webpages were crawled successfully + // compares with expected_num_of_pages + if (completedResponse.body.data.length < websiteData.expected_min_num_of_pages) { + errorLog.push({ + website: websiteData.website, + prompt: 'CRAWL', + expected_output: `SUCCESS: ${websiteData.expected_min_num_of_pages}`, + actual_output: `FAILURE: ${completedResponse.body.data.length}`, + error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}` + }); + return null; + } + + // checks if crawled pages contain expected_crawled_pages + if (websiteData.expected_crawled_pages.some(page => !completedResponse.body.data.some((d: { url: string }) => d.url === page))) { + errorLog.push({ + website: websiteData.website, + prompt: 'CRAWL', + expected_output: `SUCCESS: ${websiteData.expected_crawled_pages}`, + actual_output: `FAILURE: ${completedResponse.body.data}`, + error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data}` + }); + return null; + } passedTests++; } catch (error) { From eb36d4b3bdcaa2475c846af7ca5a217070993963 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 13:25:39 -0700 Subject: [PATCH 35/47] Update SELF_HOST.md --- SELF_HOST.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/SELF_HOST.md b/SELF_HOST.md index 8c3c0aa..bbce267 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -1,4 +1,7 @@ # Self-hosting Firecrawl +*We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version.* + +Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally. 
## Getting Started From fa014defc733c00ee200d064813cf51a0d7d7be4 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 15 May 2024 18:35:09 -0300 Subject: [PATCH 36/47] Fixing child links only bug --- apps/api/src/scraper/WebScraper/crawler.ts | 6 +++++- apps/api/src/scraper/WebScraper/index.ts | 14 +++++++++++++- apps/test-suite/data/crawl.json | 21 +++++++++------------ apps/test-suite/tests/crawl.test.ts | 22 ++++++++++++++++++---- 4 files changed, 45 insertions(+), 18 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 521b1e1..7cfd1be 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -88,6 +88,10 @@ export class WebCrawler { return false; } + if (!this.initialUrl.includes(link)) { + return false; + } + return true; }) .slice(0, limit); @@ -109,7 +113,7 @@ export class WebCrawler { const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); if (sitemapLinks.length > 0) { - const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); + let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); return filteredLinks.map(link => ({ url: link, html: "" })); } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index c95e889..cf074ec 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -145,12 +145,18 @@ export class WebScraperDataProvider { let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth); - const allLinks = links.map((e) => e.url); + let allLinks = links.map((e) => e.url); const allHtmls = links.map((e)=> e.html); if (this.returnOnlyUrls) { return this.returnOnlyUrlsResponse(allLinks , inProgress); } + + allLinks = allLinks.filter(link => { + const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`; + const normalizedLink = link.endsWith('/') ? link : `${link}/`; + return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl; + }); let documents = []; // check if fast mode is enabled and there is html inside the links @@ -175,6 +181,12 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { let links = await getLinksFromSitemap(this.urls[0]); + links = links.filter(link => { + const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`; + const normalizedLink = link.endsWith('/') ? 
link : `${link}/`; + return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl; + }); + if (this.returnOnlyUrls) { return this.returnOnlyUrlsResponse(links, inProgress); } diff --git a/apps/test-suite/data/crawl.json b/apps/test-suite/data/crawl.json index 3a56131..d729644 100644 --- a/apps/test-suite/data/crawl.json +++ b/apps/test-suite/data/crawl.json @@ -2,7 +2,7 @@ { "website": "https://mendable.ai/pricing", "expected_min_num_of_pages": 29, - "expected_crawled_pages": [ + "expected_not_crawled_pages": [ "https://mendable.ai/", "https://mendable.ai/blog", "https://mendable.ai/signin", @@ -34,7 +34,9 @@ "https://www.agentops.ai/blog/streamlining-hr-with-saas", "https://www.agentops.ai/blog/simplify-hr-with-modern-saas-solutions", "https://www.agentops.ai/blog/efficient-hr-operations-with-saas", - "https://www.agentops.ai/blog/hr-made-simple-with-saas", + "https://www.agentops.ai/blog/hr-made-simple-with-saas" + ], + "expected_not_crawled_pages": [ "https://www.agentops.ai/about-us", "https://www.agentops.ai/contact-us" ] @@ -69,7 +71,7 @@ { "website": "https://en.wikipedia.org/wiki/T._N._Seshan", "expected_min_num_of_pages": 100, - "expected_crawled_pages": [ + "expected_not_crawled_pages": [ "https://en.wikipedia.org/wiki/Wikipedia:Contents", "https://en.wikipedia.org/wiki/Wikipedia:Contact_us", "https://en.wikipedia.org/wiki/V._S._Ramadevi", @@ -79,15 +81,10 @@ "https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg" ] }, - { - "website": "https://mendable.ai/blog", - "expected_min_num_of_pages": 0, - "expected_crawled_pages": [""] - }, { "website": "https://www.framer.com/pricing", "expected_min_num_of_pages": 58, - "expected_crawled_pages": [ + "expected_not_crawled_pages": [ "https://www.framer.com/features/navigation/", "https://www.framer.com/contact/", "https://www.framer.com/add-ons/", @@ -101,7 +98,7 @@ { "website": "https://fly.io/docs/gpus/gpu-quickstart", "expected_min_num_of_pages": 39, - "expected_crawled_pages": [ + "expected_not_crawled_pages": [ "https://fly.io/docs/getting-started/", "https://fly.io/docs/hands-on/", "https://fly.io/docs/about/support/", @@ -118,8 +115,8 @@ "expected_crawled_pages": [""] }, { - "website": "https://www.instructables.com", - "expected_min_num_of_pages": 78, + "website": "https://www.instructables.com/circuits", + "expected_min_num_of_pages": 12, "expected_crawled_pages": [ "https://www.instructables.com/circuits/", "https://www.instructables.com/circuits/apple/projects/", diff --git a/apps/test-suite/tests/crawl.test.ts b/apps/test-suite/tests/crawl.test.ts index 85bcabe..3a4a35e 100644 --- a/apps/test-suite/tests/crawl.test.ts +++ b/apps/test-suite/tests/crawl.test.ts @@ -62,6 +62,7 @@ describe("Crawling Checkup (E2E)", () => { // fail the test console.log('No response'); continue; + // continue; } if (!completedResponse.body || completedResponse.body.status !== "completed") { @@ -72,7 +73,7 @@ describe("Crawling Checkup (E2E)", () => { actual_output: 'FAILURE', error: `Crawl job did not complete successfully.` }); - return null; + continue; } // check how many webpages were crawled successfully @@ -85,11 +86,11 @@ describe("Crawling Checkup (E2E)", () => { actual_output: `FAILURE: ${completedResponse.body.data.length}`, error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}` }); - return null; + continue; } // checks if crawled pages contain expected_crawled_pages - if (websiteData.expected_crawled_pages.some(page => 
!completedResponse.body.data.some((d: { url: string }) => d.url === page))) { + if (websiteData.expected_crawled_pages && websiteData.expected_crawled_pages.length > 0 && websiteData.expected_crawled_pages.some(page => !completedResponse.body.data?.some((d: { url: string }) => d.url === page))) { errorLog.push({ website: websiteData.website, prompt: 'CRAWL', @@ -97,7 +98,19 @@ describe("Crawling Checkup (E2E)", () => { actual_output: `FAILURE: ${completedResponse.body.data}`, error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data}` }); - return null; + continue; + } + + // checks if crawled pages not contain expected_not_crawled_pages + if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) { + errorLog.push({ + website: websiteData.website, + prompt: 'CRAWL', + expected_output: `SUCCESS: ${websiteData.expected_not_crawled_pages}`, + actual_output: `FAILURE: ${completedResponse.body.data}`, + error: `Expected crawled pages to not contain ${websiteData.expected_not_crawled_pages}, but got ${completedResponse.body.data}` + }); + continue; } passedTests++; @@ -110,6 +123,7 @@ describe("Crawling Checkup (E2E)", () => { actual_output: 'FAILURE', error: `Error processing ${websiteData.website}: ${error}` }); + continue; } } From d91043376ce01b1ef8469bf3037cfe220452c5d4 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 15 May 2024 18:54:40 -0300 Subject: [PATCH 37/47] not working yet --- apps/api/src/scraper/WebScraper/index.ts | 16 ++++++++++------ apps/test-suite/tests/crawl.test.ts | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index cf074ec..7e19357 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -133,6 +133,7 @@ export class WebScraperDataProvider { private async handleCrawlMode( inProgress?: (progress: Progress) => void ): Promise { + console.log('??? >>>', this.urls[0]) const crawler = new WebCrawler({ initialUrl: this.urls[0], includes: this.includes, @@ -148,15 +149,16 @@ export class WebScraperDataProvider { let allLinks = links.map((e) => e.url); const allHtmls = links.map((e)=> e.html); - if (this.returnOnlyUrls) { - return this.returnOnlyUrlsResponse(allLinks , inProgress); - } - allLinks = allLinks.filter(link => { const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`; const normalizedLink = link.endsWith('/') ? link : `${link}/`; - return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl; + return normalizedLink.startsWith(normalizedInitialUrl); }); + console.log('>>>>>??>?>?>?>?.', {allLinks}) + + if (this.returnOnlyUrls) { + return this.returnOnlyUrlsResponse(allLinks , inProgress); + } let documents = []; // check if fast mode is enabled and there is html inside the links @@ -184,9 +186,11 @@ export class WebScraperDataProvider { links = links.filter(link => { const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`; const normalizedLink = link.endsWith('/') ? 
link : `${link}/`; - return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl; + return normalizedLink.startsWith(normalizedInitialUrl); }); + console.log('>>>>>??>?>?>?>?.', {links}) + if (this.returnOnlyUrls) { return this.returnOnlyUrlsResponse(links, inProgress); } diff --git a/apps/test-suite/tests/crawl.test.ts b/apps/test-suite/tests/crawl.test.ts index 3a4a35e..853379b 100644 --- a/apps/test-suite/tests/crawl.test.ts +++ b/apps/test-suite/tests/crawl.test.ts @@ -102,7 +102,7 @@ describe("Crawling Checkup (E2E)", () => { } // checks if crawled pages not contain expected_not_crawled_pages - if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) { + if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && completedResponse.body.data && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) { errorLog.push({ website: websiteData.website, prompt: 'CRAWL', From bfccaf670d3ea00e6460c015b50367d019e322aa Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 15:30:37 -0700 Subject: [PATCH 38/47] Nick: fixes most of it --- apps/api/src/scraper/WebScraper/crawler.ts | 39 ++++++++++++++++++---- apps/api/src/scraper/WebScraper/index.ts | 33 +++++++++++------- apps/test-suite/data/crawl.json | 2 +- 3 files changed, 55 insertions(+), 19 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 7cfd1be..98a0738 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -76,9 +76,22 @@ export class WebCrawler { // Check if the link matches the include patterns, if any are specified if (this.includes.length > 0 && this.includes[0] !== "") { - return this.includes.some((includePattern) => + if (!this.includes.some((includePattern) => new RegExp(includePattern).test(path) - ); + )) { + return false; + } + } + + // Normalize the initial URL and the link to account for www and non-www versions + const normalizedInitialUrl = new URL(this.initialUrl); + const normalizedLink = new URL(link); + const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, ''); + const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); + + // Ensure the protocol and hostname match, and the path starts with the initial URL's path + if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { + return false; } const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? 
true; @@ -88,10 +101,6 @@ export class WebCrawler { return false; } - if (!this.initialUrl.includes(link)) { - return false; - } - return true; }) .slice(0, limit); @@ -109,11 +118,15 @@ export class WebCrawler { this.robots = robotsParser(this.robotsTxtUrl, response.data); } catch (error) { console.error(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); + } + console.log("Initial URL: ", this.initialUrl); + const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); if (sitemapLinks.length > 0) { let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); + console.log("Filtered links: ", filteredLinks.length); return filteredLinks.map(link => ({ url: link, html: "" })); } @@ -310,7 +323,21 @@ export class WebCrawler { } } catch (error) { // Error handling for failed sitemap fetch + // console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`); } + + // If the first one doesn't work, try the base URL + const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; + try { + const response = await axios.get(baseUrlSitemap); + if (response.status === 200) { + return await getLinksFromSitemap(baseUrlSitemap); + } + } catch (error) { + // Error handling for failed base URL sitemap fetch + console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); + } + return []; } } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 7e19357..3ba5a1d 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -130,6 +130,21 @@ export class WebScraperDataProvider { } } + private async cleanIrrelevantPath(links: string[]){ + return links.filter(link => { + const normalizedInitialUrl = new URL(this.urls[0]); + const normalizedLink = new URL(link); + + // Normalize the hostname to account for www and non-www versions + const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, ''); + const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); + + // Ensure the protocol and hostname match, and the path starts with the initial URL's path + return linkHostname === initialHostname && + normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname); + }); + } + private async handleCrawlMode( inProgress?: (progress: Progress) => void ): Promise { @@ -149,11 +164,11 @@ export class WebScraperDataProvider { let allLinks = links.map((e) => e.url); const allHtmls = links.map((e)=> e.html); - allLinks = allLinks.filter(link => { - const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`; - const normalizedLink = link.endsWith('/') ? link : `${link}/`; - return normalizedLink.startsWith(normalizedInitialUrl); - }); + console.log(">>>>>> all links >>>>", {allLinks}) + // allLinks = await this.cleanIrrelevantPath(allLinks); + + + console.log('>>>>>??>?>?>?>?.', {allLinks}) if (this.returnOnlyUrls) { @@ -183,13 +198,7 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { let links = await getLinksFromSitemap(this.urls[0]); - links = links.filter(link => { - const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`; - const normalizedLink = link.endsWith('/') ? 
link : `${link}/`; - return normalizedLink.startsWith(normalizedInitialUrl); - }); - - console.log('>>>>>??>?>?>?>?.', {links}) + links = await this.cleanIrrelevantPath(links); if (this.returnOnlyUrls) { return this.returnOnlyUrlsResponse(links, inProgress); diff --git a/apps/test-suite/data/crawl.json b/apps/test-suite/data/crawl.json index d729644..651468a 100644 --- a/apps/test-suite/data/crawl.json +++ b/apps/test-suite/data/crawl.json @@ -27,7 +27,7 @@ ] }, { - "website": "https://agentops.ai", + "website": "https://agentops.ai/blog", "expected_min_num_of_pages": 7, "expected_crawled_pages": [ "https://www.agentops.ai/blog/effortless-hr-management-with-saas", From ade4e05cffefd6bf5e0be73a2b4e0afa7ebe3273 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 17:13:04 -0700 Subject: [PATCH 39/47] Nick: working --- apps/api/src/scraper/WebScraper/crawler.ts | 84 +++++++++++--- apps/api/src/scraper/WebScraper/index.ts | 67 ++++++----- apps/python-sdk/firecrawl/firecrawl.py | 4 +- apps/test-suite/data/crawl.json | 126 +++++++++++---------- apps/test-suite/tests/crawl.test.ts | 5 +- 5 files changed, 181 insertions(+), 105 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 98a0738..8449efb 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -121,12 +121,10 @@ export class WebCrawler { } - console.log("Initial URL: ", this.initialUrl); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); if (sitemapLinks.length > 0) { let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); - console.log("Filtered links: ", filteredLinks.length); return filteredLinks.map(link => ({ url: link, html: "" })); } @@ -142,6 +140,7 @@ export class WebCrawler { return [{ url: this.initialUrl, html: "" }]; } + // make sure to run include exclude here again const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth); return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" })); @@ -150,8 +149,9 @@ export class WebCrawler { private async crawlUrls( urls: string[], concurrencyLimit: number, - inProgress?: (progress: Progress) => void + inProgress?: (progress: Progress) => void, ): Promise<{ url: string, html: string }[]> { + console.log("Crawling URLs: ", urls); const queue = async.queue(async (task: string, callback) => { if (this.crawledUrls.size >= this.maxCrawledLinks) { if (callback && typeof callback === "function") { @@ -160,7 +160,20 @@ export class WebCrawler { return; } const newUrls = await this.crawl(task); + // add the initial url if not already added + // if (this.visited.size === 1) { + // let normalizedInitial = this.initialUrl; + // if (!normalizedInitial.endsWith("/")) { + // normalizedInitial = normalizedInitial + "/"; + // } + // if (!newUrls.some(page => page.url === this.initialUrl)) { + // newUrls.push({ url: this.initialUrl, html: "" }); + // } + // } + + newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html)); + if (inProgress && newUrls.length > 0) { inProgress({ current: this.crawledUrls.size, @@ -196,15 +209,21 @@ export class WebCrawler { } async crawl(url: string): Promise<{url: string, html: string}[]> { - if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) + if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){ return []; + } this.visited.add(url); + + if (!url.startsWith("http")) { url = 
"https://" + url; + } if (url.endsWith("/")) { url = url.slice(0, -1); + } + if (this.isFile(url) || this.isSocialMediaOrEmail(url)) { return []; } @@ -222,6 +241,13 @@ export class WebCrawler { const $ = load(content); let links: {url: string, html: string}[] = []; + // Add the initial URL to the list of links + if(this.visited.size === 1) + { + links.push({url, html: content}); + } + + $("a").each((_, element) => { const href = $(element).attr("href"); if (href) { @@ -245,6 +271,9 @@ export class WebCrawler { } }); + if(this.visited.size === 1){ + return links; + } // Create a new list to return to avoid modifying the visited list return links.filter((link) => !this.visited.has(link.url)); } catch (error) { @@ -312,32 +341,57 @@ export class WebCrawler { return socialMediaOrEmail.some((ext) => url.includes(ext)); } + // private async tryFetchSitemapLinks(url: string): Promise { + const normalizeUrl = (url: string) => { + url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); + if (url.endsWith("/")) { + url = url.slice(0, -1); + } + return url; + }; + const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`; + + let sitemapLinks: string[] = []; + try { const response = await axios.get(sitemapUrl); if (response.status === 200) { - return await getLinksFromSitemap(sitemapUrl); + sitemapLinks = await getLinksFromSitemap(sitemapUrl); } } catch (error) { // Error handling for failed sitemap fetch // console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`); } - // If the first one doesn't work, try the base URL - const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; - try { - const response = await axios.get(baseUrlSitemap); - if (response.status === 200) { - return await getLinksFromSitemap(baseUrlSitemap); + if (sitemapLinks.length === 0) { + // If the first one doesn't work, try the base URL + const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; + try { + const response = await axios.get(baseUrlSitemap); + if (response.status === 200) { + sitemapLinks = await getLinksFromSitemap(baseUrlSitemap); + } + } catch (error) { + // Error handling for failed base URL sitemap fetch + // console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); } - } catch (error) { - // Error handling for failed base URL sitemap fetch - console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); } - return []; + // Normalize and check if the URL is present in any of the sitemaps + const normalizedUrl = normalizeUrl(url); + + const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link)); + + // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl + if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) { + // do not push the normalized url + sitemapLinks.push(url); + } + + return sitemapLinks; } } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 3ba5a1d..8bc33eb 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -59,7 +59,11 @@ export class WebScraperDataProvider { await Promise.all( batchUrls.map(async (url, index) => { const existingHTML = allHtmls ? 
allHtmls[i + index] : ""; - const result = await scrapSingleUrl(url, this.pageOptions, existingHTML); + const result = await scrapSingleUrl( + url, + this.pageOptions, + existingHTML + ); processedUrls++; if (inProgress) { inProgress({ @@ -130,25 +134,30 @@ export class WebScraperDataProvider { } } - private async cleanIrrelevantPath(links: string[]){ - return links.filter(link => { + private async cleanIrrelevantPath(links: string[]) { + return links.filter((link) => { const normalizedInitialUrl = new URL(this.urls[0]); const normalizedLink = new URL(link); // Normalize the hostname to account for www and non-www versions - const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, ''); - const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); + const initialHostname = normalizedInitialUrl.hostname.replace( + /^www\./, + "" + ); + const linkHostname = normalizedLink.hostname.replace(/^www\./, ""); // Ensure the protocol and hostname match, and the path starts with the initial URL's path - return linkHostname === initialHostname && - normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname); + return ( + linkHostname === initialHostname && + normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname) + ); }); } private async handleCrawlMode( inProgress?: (progress: Progress) => void ): Promise { - console.log('??? >>>', this.urls[0]) + console.log("??? >>>", this.urls[0]); const crawler = new WebCrawler({ initialUrl: this.urls[0], includes: this.includes, @@ -159,28 +168,25 @@ export class WebScraperDataProvider { generateImgAltText: this.generateImgAltText, }); - let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth); + let links = await crawler.start( + inProgress, + 5, + this.limit, + this.maxCrawledDepth + ); let allLinks = links.map((e) => e.url); - const allHtmls = links.map((e)=> e.html); - - console.log(">>>>>> all links >>>>", {allLinks}) - // allLinks = await this.cleanIrrelevantPath(allLinks); - - - - console.log('>>>>>??>?>?>?>?.', {allLinks}) + const allHtmls = links.map((e) => e.html); if (this.returnOnlyUrls) { - return this.returnOnlyUrlsResponse(allLinks , inProgress); + return this.returnOnlyUrlsResponse(allLinks, inProgress); } - + let documents = []; // check if fast mode is enabled and there is html inside the links if (this.crawlerMode === "fast" && links.some((link) => link.html)) { - console.log("Fast mode enabled"); documents = await this.processLinks(allLinks, inProgress, allHtmls); - }else{ + } else { documents = await this.processLinks(allLinks, inProgress); } @@ -234,10 +240,13 @@ export class WebScraperDataProvider { let pdfLinks = links.filter((link) => link.endsWith(".pdf")); let pdfDocuments = await this.fetchPdfDocuments(pdfLinks); links = links.filter((link) => !link.endsWith(".pdf")); - - let documents = await this.convertUrlsToDocuments(links, inProgress, allHtmls); - documents = await this.getSitemapData(this.urls[0], documents); + let documents = await this.convertUrlsToDocuments( + links, + inProgress, + allHtmls + ); + documents = await this.getSitemapData(this.urls[0], documents); documents = this.applyPathReplacements(documents); // documents = await this.applyImgAltText(documents); @@ -436,9 +445,13 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? 
{ onlyMainContent: false, includeHtml: false }; - this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} - this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; + this.pageOptions = options.pageOptions ?? { + onlyMainContent: false, + includeHtml: false, + }; + this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; + this.replaceAllPathsWithAbsolutePaths = + options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); this.crawlerMode = options.crawlerOptions?.mode ?? "default"; diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 701810c..7483ea5 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -48,7 +48,7 @@ class FirecrawlApp: return response['data'] else: raise Exception(f'Failed to scrape URL. Error: {response["error"]}') - elif response.status_code in [402, 409, 500]: + elif response.status_code in [402, 408, 409, 500]: error_message = response.json().get('error', 'Unknown error occurred') raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') else: @@ -148,7 +148,7 @@ class FirecrawlApp: self._handle_error(status_response, 'check crawl status') def _handle_error(self, response, action): - if response.status_code in [402, 409, 500]: + if response.status_code in [402, 408, 409, 500]: error_message = response.json().get('error', 'Unknown error occurred') raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') else: diff --git a/apps/test-suite/data/crawl.json b/apps/test-suite/data/crawl.json index 651468a..59cfa9f 100644 --- a/apps/test-suite/data/crawl.json +++ b/apps/test-suite/data/crawl.json @@ -1,49 +1,80 @@ -[ +[{ + "website": "https://openai.com/news", + "expected_min_num_of_pages": 4, + "expected_crawled_pages": [ + "https://openai.com/news/company/", + "https://openai.com/news/research/", + "https://openai.com/news/safety-and-alignment/", + "https://openai.com/news/stories/" + ] +}, { - "website": "https://mendable.ai/pricing", - "expected_min_num_of_pages": 29, - "expected_not_crawled_pages": [ - "https://mendable.ai/", - "https://mendable.ai/blog", - "https://mendable.ai/signin", - "https://mendable.ai/signup", - "https://mendable.ai", - "https://mendable.ai/usecases/sales-enablement", - "https://mendable.ai/usecases/documentation", - "https://mendable.ai/usecases/cs-enablement", - "https://mendable.ai/usecases/productcopilot", - "https://mendable.ai/security" - ], - "notes": "This one should not go backwards, but it does!" 
- }, + "website": "https://www.framer.com/pricing", + "expected_min_num_of_pages": 1, + "expected_not_crawled_pages": [ + "https://www.framer.com/features/navigation/", + "https://www.framer.com/contact/", + "https://www.framer.com/add-ons/", + "https://www.framer.com/free-saas-ui-kit/", + "https://www.framer.com/help/", + "https://www.framer.com/features/effects/", + "https://www.framer.com/enterprise/", + "https://www.framer.com/templates/" + ] +}, { - "website": "https://openai.com/news", - "expected_min_num_of_pages": 59, - "expected_crawled_pages": [ - "https://openai.com/news/company/", - "https://openai.com/news/research/", - "https://openai.com/news/safety-and-alignment/", - "https://openai.com/news/stories/" - ] - }, + "website": "https://mendable.ai/pricing", + "expected_min_num_of_pages": 1, + "expected_not_crawled_pages": [ + "https://mendable.ai/", + "https://mendable.ai/blog", + "https://mendable.ai/signin", + "https://mendable.ai/signup", + "https://mendable.ai", + "https://mendable.ai/usecases/sales-enablement", + "https://mendable.ai/usecases/documentation", + "https://mendable.ai/usecases/cs-enablement", + "https://mendable.ai/usecases/productcopilot", + "https://mendable.ai/security" + ], + "notes": "This one should not go backwards, but it does!" +}, + { "website": "https://agentops.ai/blog", - "expected_min_num_of_pages": 7, + "expected_min_num_of_pages": 6, "expected_crawled_pages": [ "https://www.agentops.ai/blog/effortless-hr-management-with-saas", "https://www.agentops.ai/blog/streamlining-hr-with-saas", "https://www.agentops.ai/blog/simplify-hr-with-modern-saas-solutions", "https://www.agentops.ai/blog/efficient-hr-operations-with-saas", - "https://www.agentops.ai/blog/hr-made-simple-with-saas" + "https://www.agentops.ai/blog/hr-made-simple-with-saas", + "https://agentops.ai/blog" ], "expected_not_crawled_pages": [ - "https://www.agentops.ai/about-us", - "https://www.agentops.ai/contact-us" + "https://agentops.ai/about-us", + "https://agentops.ai/contact-us" ] }, + { + "website": "https://en.wikipedia.org/wiki/T._N._Seshan", + "expected_min_num_of_pages": 1, + "expected_not_crawled_pages": [ + "https://en.wikipedia.org/wiki/Wikipedia:Contents", + "https://en.wikipedia.org/wiki/Wikipedia:Contact_us", + "https://en.wikipedia.org/wiki/V._S._Ramadevi", + "https://en.wikipedia.org/wiki/Wikipedia:About", + "https://en.wikipedia.org/wiki/Help:Introduction", + "https://en.wikipedia.org/wiki/H._D._Deve_Gowda", + "https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg" + ] + }, + + + { "website": "https://ycombinator.com/companies", - "expected_min_num_of_pages": 45, + "expected_min_num_of_pages": 20, "expected_crawled_pages": [ "https://www.ycombinator.com/companies/industry/elearning", "https://www.ycombinator.com/companies/industry/computer-vision", @@ -68,36 +99,11 @@ "https://firecrawl.dev/pricing" ] }, - { - "website": "https://en.wikipedia.org/wiki/T._N._Seshan", - "expected_min_num_of_pages": 100, - "expected_not_crawled_pages": [ - "https://en.wikipedia.org/wiki/Wikipedia:Contents", - "https://en.wikipedia.org/wiki/Wikipedia:Contact_us", - "https://en.wikipedia.org/wiki/V._S._Ramadevi", - "https://en.wikipedia.org/wiki/Wikipedia:About", - "https://en.wikipedia.org/wiki/Help:Introduction", - "https://en.wikipedia.org/wiki/H._D._Deve_Gowda", - "https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg" - ] - }, - { - "website": "https://www.framer.com/pricing", - "expected_min_num_of_pages": 58, - "expected_not_crawled_pages": [ - 
"https://www.framer.com/features/navigation/", - "https://www.framer.com/contact/", - "https://www.framer.com/add-ons/", - "https://www.framer.com/free-saas-ui-kit/", - "https://www.framer.com/help/", - "https://www.framer.com/features/effects/", - "https://www.framer.com/enterprise/", - "https://www.framer.com/templates/" - ] - }, + + { "website": "https://fly.io/docs/gpus/gpu-quickstart", - "expected_min_num_of_pages": 39, + "expected_min_num_of_pages": 1, "expected_not_crawled_pages": [ "https://fly.io/docs/getting-started/", "https://fly.io/docs/hands-on/", @@ -134,7 +140,7 @@ }, { "website": "https://richmondconfidential.org", - "expected_min_num_of_pages": 50, + "expected_min_num_of_pages": 20, "expected_crawled_pages": [ "https://richmondconfidential.org/2009/10/13/salesians-star-guard-has-a-big-impact/", "https://richmondconfidential.org/2009/10/13/on-team-of-beginners-oilers-old-hand-stands-out/", diff --git a/apps/test-suite/tests/crawl.test.ts b/apps/test-suite/tests/crawl.test.ts index 853379b..577725a 100644 --- a/apps/test-suite/tests/crawl.test.ts +++ b/apps/test-suite/tests/crawl.test.ts @@ -86,6 +86,7 @@ describe("Crawling Checkup (E2E)", () => { actual_output: `FAILURE: ${completedResponse.body.data.length}`, error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}` }); + console.log('Error: ', errorLog); continue; } @@ -98,6 +99,7 @@ describe("Crawling Checkup (E2E)", () => { actual_output: `FAILURE: ${completedResponse.body.data}`, error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data}` }); + console.log('Error: ', errorLog); continue; } @@ -110,6 +112,7 @@ describe("Crawling Checkup (E2E)", () => { actual_output: `FAILURE: ${completedResponse.body.data}`, error: `Expected crawled pages to not contain ${websiteData.expected_not_crawled_pages}, but got ${completedResponse.body.data}` }); + console.log('Error: ', errorLog); continue; } @@ -141,7 +144,7 @@ describe("Crawling Checkup (E2E)", () => { fs.writeFileSync(errorLogFileName, JSON.stringify(errorLog, null, 2)); } - expect(score).toBeGreaterThanOrEqual(95); + expect(score).toBeGreaterThanOrEqual(90); }, 350000); // 150 seconds timeout }); }); From 24be4866c56d6c660ba170bf5a7088f6e9f9e1f1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 17:16:20 -0700 Subject: [PATCH 40/47] Nick: --- apps/api/src/scraper/WebScraper/crawler.ts | 1 - apps/test-suite/data/crawl.json | 16 ++++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 8449efb..9e080d7 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -151,7 +151,6 @@ export class WebCrawler { concurrencyLimit: number, inProgress?: (progress: Progress) => void, ): Promise<{ url: string, html: string }[]> { - console.log("Crawling URLs: ", urls); const queue = async.queue(async (task: string, callback) => { if (this.crawledUrls.size >= this.maxCrawledLinks) { if (callback && typeof callback === "function") { diff --git a/apps/test-suite/data/crawl.json b/apps/test-suite/data/crawl.json index 59cfa9f..8bc28a6 100644 --- a/apps/test-suite/data/crawl.json +++ b/apps/test-suite/data/crawl.json @@ -1,4 +1,10 @@ -[{ +[ + { + "website": "https://www.vellum.ai/llm-leaderboard", + "expected_min_num_of_pages": 1, + "expected_crawled_pages": 
["https://www.vellum.ai/llm-leaderboard"] + }, + { "website": "https://openai.com/news", "expected_min_num_of_pages": 4, "expected_crawled_pages": [ @@ -70,8 +76,6 @@ ] }, - - { "website": "https://ycombinator.com/companies", "expected_min_num_of_pages": 20, @@ -115,11 +119,7 @@ ], "notes": "This one should not go backwards, but it does!" }, - { - "website": "https://www.vellum.ai/llm-leaderboard", - "expected_min_num_of_pages": 0, - "expected_crawled_pages": [""] - }, + { "website": "https://www.instructables.com/circuits", "expected_min_num_of_pages": 12, From 4a6cfb6097be2c32ddc4f750a962177914f529cb Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 17:22:29 -0700 Subject: [PATCH 41/47] Update index.test.ts --- .../src/__tests__/e2e_withAuth/index.test.ts | 136 +++++++++++------- 1 file changed, 86 insertions(+), 50 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 2590592..c748a6d 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -159,21 +159,26 @@ describe("E2E Tests for API Routes", () => { }, }); - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); + let response; + let isFinished = false; - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 30000)); + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; - const urls = completedResponse.body.data.map( + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const completedResponse = response; + + const urls = completedResponse.body.data.map( (item: any) => item.metadata?.sourceURL ); expect(urls.length).toBeGreaterThan(5); @@ -205,19 +210,24 @@ describe("E2E Tests for API Routes", () => { }, }); - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); + let isFinished = false; + let response; - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 30000)); + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking 
again + } + } + + const completedResponse = response; const urls = completedResponse.body.data.map( (item: any) => item.metadata?.sourceURL @@ -238,19 +248,24 @@ describe("E2E Tests for API Routes", () => { limit: 3, }); - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); + let isFinished = false; + let response; - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 30000)); + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const completedResponse = response; expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); @@ -322,8 +337,17 @@ describe("E2E Tests for API Routes", () => { expect(response.body).toHaveProperty("status"); expect(response.body.status).toBe("active"); - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 30000)); + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) @@ -359,8 +383,17 @@ describe("E2E Tests for API Routes", () => { expect(response.body).toHaveProperty("status"); expect(response.body.status).toBe("active"); - // wait for 30 seconds - await new Promise((r) => setTimeout(r, 30000)); + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) @@ -490,20 +523,23 @@ describe("E2E Tests for API Routes", () => { .send({ url: "https://firecrawl.dev" }); expect(crawlResponse.statusCode).toBe(200); - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); + let isCompleted = false; + let completedResponse; - // wait for 30 
seconds - await new Promise((r) => setTimeout(r, 30000)); + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(completedResponse.statusCode).toBe(200); + if (response.body.status === "completed") { + isCompleted = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body.status).toBe("completed"); expect(completedResponse.body).toHaveProperty("data"); From 123fb784cab8337df8f191762066f280a61f938c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 17:29:22 -0700 Subject: [PATCH 42/47] Update index.test.ts --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index c748a6d..24b4fd0 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -155,7 +155,7 @@ describe("E2E Tests for API Routes", () => { url: "https://mendable.ai", limit: 10, crawlerOptions: { - includes: ["/blog/*"], + includes: ["blog/*"], }, }); @@ -184,7 +184,7 @@ describe("E2E Tests for API Routes", () => { expect(urls.length).toBeGreaterThan(5); urls.forEach((url: string) => { console.log({url}) - expect(url.startsWith("https://mendable.ai/blog/")).toBeTruthy(); + expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy(); }); expect(completedResponse.statusCode).toBe(200); @@ -206,7 +206,7 @@ describe("E2E Tests for API Routes", () => { url: "https://mendable.ai", limit: 10, crawlerOptions: { - excludes: ["/blog/*"], + excludes: ["blog/*"], }, }); @@ -234,7 +234,7 @@ describe("E2E Tests for API Routes", () => { ); expect(urls.length).toBeGreaterThan(5); urls.forEach((url: string) => { - expect(url.startsWith("https://mendable.ai/blog/")).toBeFalsy(); + expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy(); }); }, 60000); // 60 seconds @@ -357,7 +357,7 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body.status).toBe("completed"); expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data.length).toBe(3); + expect(completedResponse.body.data.length).toBe(10); expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); From 93b1f0334ea736a2facb4eebe00f42fafaf3f324 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 17:35:06 -0700 Subject: [PATCH 43/47] Update index.test.ts --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 24b4fd0..3c031a1 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ 
b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -238,14 +238,14 @@ describe("E2E Tests for API Routes", () => { }); }, 60000); // 60 seconds - it("should return a successful response with a valid API key and valid excludes option", async () => { + it("should return a successful response with a valid API key and limit to 3", async () => { const crawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send({ url: "https://mendable.ai", - limit: 3, + crawlerOptions: { limit: 3 }, }); let isFinished = false; @@ -327,7 +327,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://mendable.ai", - limit: 10, + crawlerOptions: { onlyMainContent: true, limit: 10 }, }); const response = await request(TEST_URL) From 098db17913bda755a9f32c93ddc956b1cac8126b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 17:37:09 -0700 Subject: [PATCH 44/47] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index d7870c2..a0f719a 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -157,7 +157,7 @@ export class WebScraperDataProvider { private async handleCrawlMode( inProgress?: (progress: Progress) => void ): Promise { - console.log("??? >>>", this.urls[0]); + const crawler = new WebCrawler({ initialUrl: this.urls[0], includes: this.includes, From 80250fb54fae15c4c822e7e7b52398afb3d6220c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 15 May 2024 17:40:46 -0700 Subject: [PATCH 45/47] Update index.test.ts --- .../src/__tests__/e2e_withAuth/index.test.ts | 80 +++++++++---------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 3c031a1..8106ae1 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -320,50 +320,50 @@ describe("E2E Tests for API Routes", () => { }); }, 120000); - it("should return a successful response with a valid API key and valid onlyMainContent option", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - crawlerOptions: { onlyMainContent: true, limit: 10 }, - }); + // it("should return a successful response with a valid API key and valid limit option", async () => { + // const crawlResponse = await request(TEST_URL) + // .post("/v0/crawl") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ + // url: "https://mendable.ai", + // crawlerOptions: { limit: 10 }, + // }); - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); + // const response = await request(TEST_URL) + // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + // expect(response.statusCode).toBe(200); + // 
expect(response.body).toHaveProperty("status"); + // expect(response.body.status).toBe("active"); - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } + // let isCompleted = false; + // while (!isCompleted) { + // const statusCheckResponse = await request(TEST_URL) + // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + // expect(statusCheckResponse.statusCode).toBe(200); + // isCompleted = statusCheckResponse.body.status === "completed"; + // if (!isCompleted) { + // await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + // } + // } - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + // const completedResponse = await request(TEST_URL) + // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data.length).toBe(10); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("Mendable"); - expect(completedResponse.body.data[0].content).not.toContain("main menu"); - }, 60000); // 60 seconds + // expect(completedResponse.statusCode).toBe(200); + // expect(completedResponse.body).toHaveProperty("status"); + // expect(completedResponse.body.status).toBe("completed"); + // expect(completedResponse.body).toHaveProperty("data"); + // expect(completedResponse.body.data.length).toBe(10); + // expect(completedResponse.body.data[0]).toHaveProperty("content"); + // expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + // expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + // expect(completedResponse.body.data[0].content).toContain("Mendable"); + // expect(completedResponse.body.data[0].content).not.toContain("main menu"); + // }, 60000); // 60 seconds it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => { const crawlResponse = await request(TEST_URL) From bcce0544e78285d5615528c031be8fb24c0017bf Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 16 May 2024 11:03:32 -0700 Subject: [PATCH 46/47] Update openapi.json --- apps/api/openapi.json | 41 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index b0f8b99..98acbbb 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -242,7 +242,7 @@ "query": { "type": "string", "format": "uri", - "description": "The URL to scrape" + "description": "The query to search for" }, "pageOptions": { "type": 
"object", @@ -354,14 +354,14 @@ "data": { "type": "array", "items": { - "$ref": "#/components/schemas/ScrapeResponse" + "$ref": "#/components/schemas/CrawlStatusResponseObj" }, "description": "Data returned from the job (null when it is in progress)" }, "partial_data": { "type": "array", "items": { - "$ref": "#/components/schemas/ScrapeResponse" + "$ref": "#/components/schemas/CrawlStatusResponseObj" }, "description": "Partial documents returned as it is being crawls (streaming). When a page is ready it will append to the parial_data array - so no need to wait for all the website to be crawled." } @@ -484,6 +484,41 @@ } } }, + "CrawlStatusResponseObj": { + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "content": { + "type": "string" + }, + "html": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeHtml` is true" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + } + } + } + } + }, "SearchResponse": { "type": "object", "properties": { From 9d635cb2a3d21041da1cc624251601422b3ff75b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 16 May 2024 11:48:02 -0700 Subject: [PATCH 47/47] Nick: docx support --- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 111 ++++++++++++++++-- apps/api/src/scraper/WebScraper/crawler.ts | 2 +- apps/api/src/scraper/WebScraper/index.ts | 27 ++++- .../utils/__tests__/docxProcessor.test.ts | 13 ++ .../scraper/WebScraper/utils/docxProcessor.ts | 41 +++++++ 6 files changed, 182 insertions(+), 13 deletions(-) create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/docxProcessor.ts diff --git a/apps/api/package.json b/apps/api/package.json index a79e3dc..ad99c5e 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -33,6 +33,7 @@ "express": "^4.18.2", "jest": "^29.6.3", "jest-fetch-mock": "^3.0.3", + "mammoth": "^1.7.2", "nodemon": "^2.0.20", "supabase": "^1.77.9", "supertest": "^6.3.3", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 7873375..16b2f6c 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -97,7 +97,7 @@ dependencies: version: 0.0.25 langchain: specifier: ^0.1.25 - version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2) + version: 0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(mammoth@1.7.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2) languagedetect: specifier: ^2.0.0 version: 2.0.0 @@ -214,6 +214,9 @@ devDependencies: jest-fetch-mock: specifier: ^3.0.3 version: 3.0.3 + mammoth: + specifier: ^1.7.2 + version: 1.7.2 nodemon: specifier: ^2.0.20 version: 2.0.22 @@ -1765,6 +1768,10 @@ packages: dev: false optional: true + /@xmldom/xmldom@0.8.10: + resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==} + engines: {node: '>=10.0.0'} + /abbrev@1.1.1: resolution: {integrity: sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==} dev: true @@ -1895,7 +1902,6 @@ packages: resolution: {integrity: 
sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==} dependencies: sprintf-js: 1.0.3 - dev: true /argparse@2.0.1: resolution: {integrity: sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==} @@ -2071,7 +2077,6 @@ packages: /base64-js@1.5.1: resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} - dev: false /basic-ftp@5.0.5: resolution: {integrity: sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==} @@ -2096,6 +2101,9 @@ packages: resolution: {integrity: sha512-nbE1WxOTTrUWIfsfZ4aHGYu5DOuNkbxGokjV6Z2kxfJK3uaAb8zNK1muzOeipoLHZjInT4Br88BHpzevc681xA==} dev: false + /bluebird@3.4.7: + resolution: {integrity: sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==} + /body-parser@1.20.2: resolution: {integrity: sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==} engines: {node: '>= 0.8', npm: 1.2.8000 || >= 1.4.16} @@ -2421,6 +2429,9 @@ packages: resolution: {integrity: sha512-LDx6oHrK+PhzLKJU9j5S7/Y3jM/mUHvD/DeI1WQmJn652iPC5Y4TBzC9l+5OMOXlyTTA+SmVUPm0HQUwpD5Jqw==} dev: true + /core-util-is@1.0.3: + resolution: {integrity: sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==} + /cors@2.8.5: resolution: {integrity: sha512-KIHbLJqu73RGr/hnbrO9uBeixNGuvSQjul/jdFvS/KFSIH1hWVd1ng7zOHx+YrEfInLG7q4n6GHQ9cDtxv/P6g==} engines: {node: '>= 0.10'} @@ -2659,6 +2670,9 @@ packages: md5: 2.3.0 dev: false + /dingbat-to-unicode@1.0.1: + resolution: {integrity: sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w==} + /dom-serializer@2.0.0: resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==} dependencies: @@ -2695,6 +2709,11 @@ packages: engines: {node: '>=12'} dev: false + /duck@0.1.12: + resolution: {integrity: sha512-wkctla1O6VfP89gQ+J/yDesM0S7B7XLXjKGzXxMDVFg7uEn706niAtyYovKbyq1oT9YwDcly721/iUWoc8MVRg==} + dependencies: + underscore: 1.13.6 + /eastasianwidth@0.2.0: resolution: {integrity: sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==} dev: false @@ -3332,6 +3351,9 @@ packages: resolution: {integrity: sha512-Ius2VYcGNk7T90CppJqcIkS5ooHUZyIQK+ClZfMfMNFEF9VSE73Fq+906u/CWu92x4gzZMWOwfFYckPObzdEbA==} dev: true + /immediate@3.0.6: + resolution: {integrity: sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==} + /import-fresh@3.3.0: resolution: {integrity: sha512-veYYhQa+D1QBKznvhUHxb8faxlrwUnxseDAbAp457E0wLNio2bOSKnjYDhMj+YiAq61xrMGhQk9iXVk5FzgQMw==} engines: {node: '>=6'} @@ -3462,6 +3484,9 @@ packages: engines: {node: '>=8'} dev: true + /isarray@1.0.0: + resolution: {integrity: sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==} + /isexe@2.0.0: resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==} @@ -4049,6 +4074,14 @@ packages: engines: {node: '>=0.10.0'} dev: false + /jszip@3.10.1: + resolution: {integrity: sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==} + dependencies: + lie: 3.3.0 + pako: 1.0.11 + readable-stream: 2.3.8 + setimmediate: 1.0.5 + /kareem@2.5.1: resolution: {integrity: 
sha512-7jFxRVm+jD+rkq3kY0iZDJfsO2/t4BBPeEb2qKn2lR/9KhuksYk5hxzfRYWMPV8P/x2d0kHD306YyWLzjjH+uA==} engines: {node: '>=12.0.0'} @@ -4064,7 +4097,7 @@ packages: engines: {node: '>=6'} dev: true - /langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2): + /langchain@0.1.25(@supabase/supabase-js@2.39.7)(axios@1.6.7)(cheerio@1.0.0-rc.12)(ioredis@5.3.2)(mammoth@1.7.2)(pdf-parse@1.1.1)(puppeteer@22.6.3)(redis@4.6.13)(typesense@1.7.2): resolution: {integrity: sha512-sfEChvr4H2CklHdSByNBbytwBrFhgtA5kPOnwcBrxuXGg1iOaTzhVxQA0QcNcQucI3hZrsNbZjxGp+Can1ooZQ==} engines: {node: '>=18'} peerDependencies: @@ -4238,6 +4271,7 @@ packages: jsonpointer: 5.0.1 langchainhub: 0.0.8 langsmith: 0.1.13 + mammoth: 1.7.2 ml-distance: 4.0.1 openapi-types: 12.1.3 p-retry: 4.6.2 @@ -4344,6 +4378,11 @@ packages: type-check: 0.3.2 dev: false + /lie@3.3.0: + resolution: {integrity: sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==} + dependencies: + immediate: 3.0.6 + /lines-and-columns@1.2.4: resolution: {integrity: sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==} @@ -4380,6 +4419,13 @@ packages: - encoding dev: false + /lop@0.4.1: + resolution: {integrity: sha512-9xyho9why2A2tzm5aIcMWKvzqKsnxrf9B5I+8O30olh6lQU8PH978LqZoI4++37RBgS1Em5i54v1TFs/3wnmXQ==} + dependencies: + duck: 0.1.12 + option: 0.2.4 + underscore: 1.13.6 + /lru-cache@10.2.0: resolution: {integrity: sha512-2bIM8x+VAf6JT4bKAljS1qUWgMsqZRPGJS6FSahIMPVvctcNhyVp7AJu7quxOW9jwkryBReKZY5tY5JYv2n/7Q==} engines: {node: 14 || >=16.14} @@ -4423,6 +4469,22 @@ packages: tmpl: 1.0.5 dev: true + /mammoth@1.7.2: + resolution: {integrity: sha512-MqWU2hcLf1I5QMKyAbfJCvrLxnv5WztrAQyorfZ+WPq7Hk82vZFmvfR2/64ajIPpM4jlq0TXp1xZvp/FFaL1Ug==} + engines: {node: '>=12.0.0'} + hasBin: true + dependencies: + '@xmldom/xmldom': 0.8.10 + argparse: 1.0.10 + base64-js: 1.5.1 + bluebird: 3.4.7 + dingbat-to-unicode: 1.0.1 + jszip: 3.10.1 + lop: 0.4.1 + path-is-absolute: 1.0.1 + underscore: 1.13.6 + xmlbuilder: 10.1.1 + /md5@2.3.0: resolution: {integrity: sha512-T1GITYmFaKuO91vxyoQMFETst+O71VUPEU3ze5GNzDm0OWdP8v1ziTaAEPUr/3kLsY3Sftgz242A1SetQiDL7g==} dependencies: @@ -4867,6 +4929,9 @@ packages: resolution: {integrity: sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==} dev: false + /option@0.2.4: + resolution: {integrity: sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==} + /optionator@0.8.3: resolution: {integrity: sha512-+IW9pACdk3XWmmTXG8m3upGUJst5XRGzxMRjXzAuJ1XnIFNvfhjjIuYkDvysnPQ7qzqVzLt78BCruntqRhWQbA==} engines: {node: '>= 0.8.0'} @@ -4957,6 +5022,9 @@ packages: netmask: 2.0.2 dev: false + /pako@1.0.11: + resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==} + /parent-module@1.0.1: resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==} engines: {node: '>=6'} @@ -5002,7 +5070,6 @@ packages: /path-is-absolute@1.0.1: resolution: {integrity: sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==} engines: {node: '>=0.10.0'} - dev: true /path-key@3.1.1: resolution: {integrity: sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==} @@ -5095,6 +5162,9 @@ packages: react-is: 18.2.0 dev: true + 
/process-nextick-args@2.0.1: + resolution: {integrity: sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==} + /progress@2.0.3: resolution: {integrity: sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==} engines: {node: '>=0.4.0'} @@ -5251,6 +5321,17 @@ packages: engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} dev: true + /readable-stream@2.3.8: + resolution: {integrity: sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==} + dependencies: + core-util-is: 1.0.3 + inherits: 2.0.4 + isarray: 1.0.0 + process-nextick-args: 2.0.1 + safe-buffer: 5.1.2 + string_decoder: 1.1.1 + util-deprecate: 1.0.2 + /readdirp@3.6.0: resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==} engines: {node: '>=8.10.0'} @@ -5347,6 +5428,9 @@ packages: resolution: {integrity: sha512-cLgakCUf6PedEu15t8kbsjnwIFFR2D4RfL+W3iWFJ4iac7z4B0ZI8fxy4R3J956kAI68HclCFGL8MPoUVC3qVA==} dev: false + /safe-buffer@5.1.2: + resolution: {integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==} + /safe-buffer@5.2.1: resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==} @@ -5460,6 +5544,9 @@ packages: gopd: 1.0.1 has-property-descriptors: 1.0.2 + /setimmediate@1.0.5: + resolution: {integrity: sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==} + /setprototypeof@1.2.0: resolution: {integrity: sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==} @@ -5562,7 +5649,6 @@ packages: /sprintf-js@1.0.3: resolution: {integrity: sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==} - dev: true /sprintf-js@1.1.3: resolution: {integrity: sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==} @@ -5631,6 +5717,11 @@ packages: strip-ansi: 7.1.0 dev: false + /string_decoder@1.1.1: + resolution: {integrity: sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==} + dependencies: + safe-buffer: 5.1.2 + /strip-ansi@6.0.1: resolution: {integrity: sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==} engines: {node: '>=8'} @@ -5975,7 +6066,6 @@ packages: /underscore@1.13.6: resolution: {integrity: sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==} - dev: false /undici-types@5.26.5: resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==} @@ -6022,6 +6112,9 @@ packages: resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==} dev: false + /util-deprecate@1.0.2: + resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==} + /utils-merge@1.0.1: resolution: {integrity: sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA==} engines: {node: '>= 0.4.0'} @@ -6182,6 +6275,10 @@ packages: xmlbuilder: 11.0.1 dev: false + /xmlbuilder@10.1.1: + resolution: {integrity: sha512-OyzrcFLL/nb6fMGHbiRDuPup9ljBycsdCypwuyg5AAHvyWzGfChJpCXMG88AGTIMFhGZ9RccFN1e6lhg3hkwKg==} + engines: {node: '>=4.0'} + /xmlbuilder@11.0.1: resolution: {integrity: 
sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==}
     engines: {node: '>=4.0'}
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 9e080d7..f53ef22 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -321,7 +321,7 @@ export class WebCrawler {
       ".mp4",
       ".mp3",
       ".pptx",
-      ".docx",
+      // ".docx",
       ".xlsx",
       ".xml",
     ];
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index a0f719a..d244993 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -17,6 +17,7 @@ import {
 } from "./utils/replacePaths";
 import { generateCompletions } from "../../lib/LLM-extraction";
 import { getWebScraperQueue } from "../../../src/services/queue-service";
+import { fetchAndProcessDocx } from "./utils/docxProcessor";
 
 export class WebScraperDataProvider {
   private bullJobId: string;
@@ -157,7 +158,7 @@ export class WebScraperDataProvider {
   private async handleCrawlMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
-    
+
     const crawler = new WebCrawler({
       initialUrl: this.urls[0],
       includes: this.includes,
@@ -237,9 +238,13 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void,
     allHtmls?: string[]
   ): Promise<Document[]> {
-    let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
-    let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
-    links = links.filter((link) => !link.endsWith(".pdf"));
+    const pdfLinks = links.filter(link => link.endsWith(".pdf"));
+    const docLinks = links.filter(link => link.endsWith(".doc") || link.endsWith(".docx"));
+
+    const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
+    const docxDocuments = await this.fetchDocxDocuments(docLinks);
+
+    links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));
 
     let documents = await this.convertUrlsToDocuments(
       links,
@@ -257,7 +262,7 @@ export class WebScraperDataProvider {
     ) {
       documents = await generateCompletions(documents, this.extractorOptions);
     }
-    return documents.concat(pdfDocuments);
+    return documents.concat(pdfDocuments).concat(docxDocuments);
   }
 
   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
@@ -272,6 +277,18 @@ export class WebScraperDataProvider {
       })
     );
   }
+  private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
+    return Promise.all(
+      docxLinks.map(async (p) => {
+        const docXDocument = await fetchAndProcessDocx(p);
+        return {
+          content: docXDocument,
+          metadata: { sourceURL: p },
+          provider: "web-scraper",
+        };
+      })
+    );
+  }
 
   private applyPathReplacements(documents: Document[]): Document[] {
     return this.replaceAllPathsWithAbsolutePaths
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts
new file mode 100644
index 0000000..e018ffa
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts
@@ -0,0 +1,13 @@
+import * as docxProcessor from "../docxProcessor";
+
+describe("DOCX Processing Module - Integration Test", () => {
+  it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
+    delete process.env.LLAMAPARSE_API_KEY;
+    const docxContent = await docxProcessor.fetchAndProcessDocx(
+      "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
+    );
+    expect(docxContent.trim()).toContain(
+      "SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
+    );
+  });
+});
diff --git a/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
new file mode 100644
index 0000000..38759f8
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
@@ -0,0 +1,41 @@
+import axios from "axios";
+import fs from "fs";
+import { createWriteStream } from "node:fs";
+import path from "path";
+import os from "os";
+import mammoth from "mammoth";
+
+export async function fetchAndProcessDocx(url: string): Promise<string> {
+  const tempFilePath = await downloadDocx(url);
+  const content = await processDocxToText(tempFilePath);
+  fs.unlinkSync(tempFilePath); // Clean up the temporary file
+  return content;
+}
+
+async function downloadDocx(url: string): Promise<string> {
+  const response = await axios({
+    url,
+    method: "GET",
+    responseType: "stream",
+  });
+
+  const tempFilePath = path.join(os.tmpdir(), `tempDocx-${Date.now()}.docx`);
+  const writer = createWriteStream(tempFilePath);
+
+  response.data.pipe(writer);
+
+  return new Promise((resolve, reject) => {
+    writer.on("finish", () => resolve(tempFilePath));
+    writer.on("error", reject);
+  });
+}
+
+export async function processDocxToText(filePath: string): Promise<string> {
+  const content = await extractTextFromDocx(filePath);
+  return content;
+}
+
+async function extractTextFromDocx(filePath: string): Promise<string> {
+  const result = await mammoth.extractRawText({ path: filePath });
+  return result.value;
+}
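
Reviewer note: the snippet below is a minimal, hypothetical sketch (not part of this patch) showing how the new fetchAndProcessDocx helper could be exercised on its own, for example with ts-node from inside apps/api. The relative import path and the reuse of the NVCA sample document URL from the integration test above are assumptions; adjust them to your checkout.

import { fetchAndProcessDocx } from "./src/scraper/WebScraper/utils/docxProcessor";

async function main() {
  // Downloads the .docx to a temp file, extracts raw text via mammoth, then deletes the temp file.
  const text = await fetchAndProcessDocx(
    "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
  );
  // Print only a short preview so the console output stays readable.
  console.log(text.slice(0, 200));
}

main().catch((err) => {
  console.error("DOCX processing failed:", err);
  process.exit(1);
});

One thing worth flagging in review: handleProcessDocuments also routes legacy .doc links through this helper, while mammoth itself targets .docx, so true .doc binaries may not extract cleanly.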