From 32e814bedcc63350e63aa4c7ed3288eb29e368d6 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Thu, 13 Jun 2024 17:02:30 -0400 Subject: [PATCH 01/17] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index f432f43..dfb52c4 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -163,12 +163,15 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { + const baseURLDepth = new URL(this.urls[0]).pathname.split('/').length - 1; + const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth; + const crawler = new WebCrawler({ initialUrl: this.urls[0], includes: this.includes, excludes: this.excludes, maxCrawledLinks: this.maxCrawledLinks, - maxCrawledDepth: this.maxCrawledDepth, + maxCrawledDepth: adjustedMaxDepth, limit: this.limit, generateImgAltText: this.generateImgAltText, allowBackwardCrawling: this.allowBackwardCrawling, From 65d63bae4554da28751ac688c162ac13e667ce75 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Thu, 13 Jun 2024 17:17:44 -0400 Subject: [PATCH 02/17] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index dfb52c4..e8ce813 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -163,9 +163,12 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { - const baseURLDepth = new URL(this.urls[0]).pathname.split('/').length - 1; + const baseURLDepth = new URL(this.urls[0]).pathname.split('/').length - 2; const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth; + + + const crawler = new WebCrawler({ initialUrl: this.urls[0], includes: this.includes, From bf10e9d392d009942fa17c0ed439d5ba437ed3ac Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Thu, 13 Jun 2024 17:28:59 -0400 Subject: [PATCH 03/17] Update index.test.ts --- .../src/__tests__/e2e_withAuth/index.test.ts | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 431c7d1..39635cb 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -395,6 +395,48 @@ describe("E2E Tests for API Routes", () => { }); }, 120000); + it.concurrent("should return a successful response with relative max depth option for a valid crawl job", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://docs.dify.ai/v/zh-hans", + crawlerOptions: { maxDepth: 1 }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + // wait for 60 seconds + await new Promise((r) => setTimeout(r, 60000)); + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(1); + + // Check if all URLs have a maximum depth of 1 + urls.forEach((url: string) => { + const depth = new URL(url).pathname.split("/").filter(Boolean).length; + expect(depth).toBeLessThanOrEqual(4); + }); + }, 120000); + // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => { // const crawlResponse = await request(TEST_URL) // .post("/v0/crawl") From 5e8aa927881062896ec651e5324a1178a901c84b Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Thu, 13 Jun 2024 17:33:13 -0400 Subject: [PATCH 04/17] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index e8ce813..ef0c839 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -163,7 +163,7 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { - const baseURLDepth = new URL(this.urls[0]).pathname.split('/').length - 2; + const baseURLDepth = new URL(this.urls[0]).pathname.split('/').length - 1; const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth; From 095951aa4df5a47a55fc5fdc0c599f1ecfdb2b95 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Thu, 13 Jun 2024 17:40:00 -0400 Subject: [PATCH 05/17] Update test --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 4 ++-- apps/api/src/scraper/WebScraper/index.ts | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 39635cb..b0642cf 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -401,7 +401,7 @@ describe("E2E Tests for API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send({ - url: "https://docs.dify.ai/v/zh-hans", + url: "https://www.scrapethissite.com/pages/", crawlerOptions: { maxDepth: 1 }, }); expect(crawlResponse.statusCode).toBe(200); @@ -433,7 +433,7 @@ describe("E2E Tests for API Routes", () => { // Check if all URLs have a maximum depth of 1 urls.forEach((url: string) => { const depth = new URL(url).pathname.split("/").filter(Boolean).length; - expect(depth).toBeLessThanOrEqual(4); + expect(depth).toBeLessThanOrEqual(3); }); }, 120000); diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index ef0c839..e8ce813 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -163,7 +163,7 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { - const baseURLDepth = new URL(this.urls[0]).pathname.split('/').length - 1; + const baseURLDepth = new URL(this.urls[0]).pathname.split('/').length - 2; const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth; From 71c98d8b80a9487dff3e5e8ac2e40967fd9a8a0c Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Thu, 13 Jun 2024 18:00:52 -0400 Subject: [PATCH 06/17] Update logic --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 2 +- apps/api/src/scraper/WebScraper/index.ts | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index b0642cf..4ac57e6 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -430,7 +430,7 @@ describe("E2E Tests for API Routes", () => { ); expect(urls.length).toBeGreaterThan(1); - // Check if all URLs have a maximum depth of 1 + // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 urls.forEach((url: string) => { const depth = new URL(url).pathname.split("/").filter(Boolean).length; expect(depth).toBeLessThanOrEqual(3); diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index e8ce813..67d73d4 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -163,11 +163,10 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { - const baseURLDepth = new URL(this.urls[0]).pathname.split('/').length - 2; + const pathSplits = new URL(this.urls[0]).pathname.split('/'); + const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth; - - const crawler = new WebCrawler({ initialUrl: this.urls[0], From 393bd45237a4cacf71dd2d8b2bd57aca8716812b Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Thu, 13 Jun 2024 18:13:15 -0400 Subject: [PATCH 07/17] Update index.test.ts --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 4ac57e6..0ae5ad9 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -360,7 +360,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 2 }, + crawlerOptions: { maxDepth: 1 }, }); expect(crawlResponse.statusCode).toBe(200); @@ -391,7 +391,7 @@ describe("E2E Tests for API Routes", () => { // Check if all URLs have a maximum depth of 1 urls.forEach((url: string) => { const depth = new URL(url).pathname.split("/").filter(Boolean).length; - expect(depth).toBeLessThanOrEqual(1); + expect(depth).toBeLessThanOrEqual(2); }); }, 120000); @@ -716,7 +716,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 2 }, + crawlerOptions: { maxDepth: 1 }, }); expect(crawlResponse.statusCode).toBe(200); @@ -750,7 +750,7 @@ describe("E2E Tests for API Routes", () => { // Check if all URLs have a maximum depth of 1 urls.forEach((url) => { const depth = new URL(url).pathname.split("/").filter(Boolean).length; - expect(depth).toBeLessThanOrEqual(1); + expect(depth).toBeLessThanOrEqual(2); }); }, 180000); From ab9de0f5ab3e0d0056ed5c5e3228b841c34afa64 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Thu, 13 Jun 2024 18:46:30 -0400 Subject: [PATCH 08/17] Update maxDepth tests --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 9 ++++++--- apps/api/src/scraper/WebScraper/crawler.ts | 1 - 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 0ae5ad9..ef4f3e1 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -390,7 +390,8 @@ describe("E2E Tests for API Routes", () => { // Check if all URLs have a maximum depth of 1 urls.forEach((url: string) => { - const depth = new URL(url).pathname.split("/").filter(Boolean).length; + const pathSplits = new URL(url).pathname.split('/'); + const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(2); }); }, 120000); @@ -432,7 +433,8 @@ describe("E2E Tests for API Routes", () => { // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 urls.forEach((url: string) => { - const depth = new URL(url).pathname.split("/").filter(Boolean).length; + const pathSplits = new URL(url).pathname.split('/'); + const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(3); }); }, 120000); @@ -749,7 +751,8 @@ describe("E2E Tests for API Routes", () => { // Check if all URLs have a maximum depth of 1 urls.forEach((url) => { - const depth = new URL(url).pathname.split("/").filter(Boolean).length; + const pathSplits = new URL(url).pathname.split('/'); + const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(2); }); }, 180000); diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 7720991..33a643b 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -158,7 +158,6 @@ export class WebCrawler { // make sure to run include exclude here again const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth); - return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" })); } From 9aba451b188b333a05c1996b76cc9376b72631d9 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 14 Jun 2024 09:33:43 -0400 Subject: [PATCH 09/17] Update index.test.ts --- .../src/__tests__/e2e_withAuth/index.test.ts | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index ef4f3e1..b7f03da 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -439,6 +439,49 @@ describe("E2E Tests for API Routes", () => { }); }, 120000); + it.concurrent("should return a successful response with relative max depth option for a valid crawl job different depths (0)", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + crawlerOptions: { maxDepth: 0 }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + // wait for 60 seconds + await new Promise((r) => setTimeout(r, 60000)); + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThanOrEqual(1); + + // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split('/'); + const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); + expect(depth).toBeLessThanOrEqual(0); + }); + }, 120000); + // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => { // const crawlResponse = await request(TEST_URL) // .post("/v0/crawl") From 9b254c1cd0d9aa22d6bea59d24d6969bceef228f Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 14 Jun 2024 09:48:14 -0400 Subject: [PATCH 10/17] Update index.test.ts --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index b7f03da..a4a427c 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -478,7 +478,7 @@ describe("E2E Tests for API Routes", () => { urls.forEach((url: string) => { const pathSplits = new URL(url).pathname.split('/'); const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); - expect(depth).toBeLessThanOrEqual(0); + expect(depth).toBeLessThanOrEqual(1); }); }, 120000); From 59451754f52743b1ba598d51ce3db3c2cd595bdb Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 14 Jun 2024 10:14:07 -0400 Subject: [PATCH 11/17] Add tests --- .../src/__tests__/e2e_withAuth/index.test.ts | 43 +++++++++++++++++++ apps/api/src/scraper/WebScraper/index.ts | 3 +- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index a4a427c..38ea631 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -482,6 +482,49 @@ describe("E2E Tests for API Routes", () => { }); }, 120000); + it.concurrent("should return a successful response with relative max depth option for a valid crawl job different depths (2)", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + crawlerOptions: { maxDepth: 2 }, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + // wait for 60 seconds + await new Promise((r) => setTimeout(r, 60000)); + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThanOrEqual(1); + + // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split('/'); + const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); + expect(depth).toBeLessThanOrEqual(3); + }); + }, 120000); + // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => { // const crawlResponse = await request(TEST_URL) // .post("/v0/crawl") diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 67d73d4..250931b 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -166,8 +166,7 @@ export class WebScraperDataProvider { const pathSplits = new URL(this.urls[0]).pathname.split('/'); const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth; - - + const crawler = new WebCrawler({ initialUrl: this.urls[0], includes: this.includes, From 36a62727b84c891c4b95e692f49eb6271fd58f7f Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 14 Jun 2024 10:52:43 -0400 Subject: [PATCH 12/17] Update index.test.ts --- .../src/__tests__/e2e_withAuth/index.test.ts | 64 +++++++++++++++---- 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 38ea631..6ba9aa5 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -369,9 +369,19 @@ describe("E2E Tests for API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); + expect(["active", "waiting"]).toContain(response.body.status); // wait for 60 seconds - await new Promise((r) => setTimeout(r, 60000)); + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); @@ -412,9 +422,19 @@ describe("E2E Tests for API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); + expect(["active", "waiting"]).toContain(response.body.status); // wait for 60 seconds - await new Promise((r) => setTimeout(r, 60000)); + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); @@ -455,9 +475,19 @@ describe("E2E Tests for API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); + expect(["active", "waiting"]).toContain(response.body.status); // wait for 60 seconds - await new Promise((r) => setTimeout(r, 60000)); + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); @@ -498,9 +528,19 @@ describe("E2E Tests for API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); + expect(["active", "waiting"]).toContain(response.body.status); // wait for 60 seconds - await new Promise((r) => setTimeout(r, 60000)); + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); @@ -570,7 +610,7 @@ describe("E2E Tests for API Routes", () => { // expect(completedResponse.body.data[0].content).not.toContain("main menu"); // }, 60000); // 60 seconds - it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option", async () => { + it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (1)", async () => { const crawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -586,7 +626,7 @@ describe("E2E Tests for API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); + expect(["active", "waiting"]).toContain(response.body.status); let isCompleted = false; while (!isCompleted) { @@ -843,7 +883,7 @@ describe("E2E Tests for API Routes", () => { }); }, 180000); - it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option", async () => { + it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (2)", async () => { const crawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -859,7 +899,7 @@ describe("E2E Tests for API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("status"); - expect(response.body.status).toBe("active"); + expect(["active", "waiting"]).toContain(response.body.status); let isFinished = false; let completedResponse; From 278bb311cbc7f4b5fb31ac08962e6ca8cfd104e2 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 14 Jun 2024 11:02:39 -0400 Subject: [PATCH 13/17] Update index.test.ts --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 6ba9aa5..db08752 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -563,7 +563,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(3); }); - }, 120000); + }, 180000); // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => { // const crawlResponse = await request(TEST_URL) From 8830acce0744e42d6cf5b527a63c9828e982cb3a Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 14 Jun 2024 11:11:58 -0400 Subject: [PATCH 14/17] Update index.test.ts --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index db08752..4c07c87 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -519,7 +519,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 2 }, + crawlerOptions: { maxDepth: 2, limit: 5 }, }); expect(crawlResponse.statusCode).toBe(200); From 42ed1f447946fffd0fe23fb83ff3703aab5b9f52 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 14 Jun 2024 11:20:24 -0400 Subject: [PATCH 15/17] Update index.test.ts --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 4c07c87..155017d 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -404,7 +404,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(2); }); - }, 120000); + }, 180000); it.concurrent("should return a successful response with relative max depth option for a valid crawl job", async () => { const crawlResponse = await request(TEST_URL) @@ -457,7 +457,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(3); }); - }, 120000); + }, 180000); it.concurrent("should return a successful response with relative max depth option for a valid crawl job different depths (0)", async () => { const crawlResponse = await request(TEST_URL) @@ -510,7 +510,7 @@ describe("E2E Tests for API Routes", () => { const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); expect(depth).toBeLessThanOrEqual(1); }); - }, 120000); + }, 180000); it.concurrent("should return a successful response with relative max depth option for a valid crawl job different depths (2)", async () => { const crawlResponse = await request(TEST_URL) @@ -658,7 +658,7 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0].content).toContain("_Roast_"); expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); expect(completedResponse.body.data[0].html).toContain(" { @@ -798,7 +798,7 @@ describe("E2E Tests for API Routes", () => { ); expect(childrenLinks.length).toBe(completedResponse.body.data.length); - }, 120000); // 120 seconds + }, 180000); // 120 seconds it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => { const crawlResponse = await request(TEST_URL) @@ -835,7 +835,7 @@ describe("E2E Tests for API Routes", () => { }) ]) ); - }, 120000); // 120 seconds + }, 180000); // 120 seconds it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { const crawlResponse = await request(TEST_URL) From 80c10393b41f587df283456f51393b1c496dc6b8 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 14 Jun 2024 11:32:30 -0400 Subject: [PATCH 16/17] Update index.test.ts --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 155017d..a225a80 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -800,7 +800,7 @@ describe("E2E Tests for API Routes", () => { expect(childrenLinks.length).toBe(completedResponse.body.data.length); }, 180000); // 120 seconds - it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => { + it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => { const crawlResponse = await request(TEST_URL) .post('/v0/crawl') .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) From 354712a8a352a0487d718cfff24703d3ced344d2 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 14 Jun 2024 13:02:04 -0300 Subject: [PATCH 17/17] just changed the name for the test? --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index adce022..7c234ef 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -618,7 +618,7 @@ describe("E2E Tests for API Routes", () => { }); }, 180000); - it.concurrent("should return a successful response with relative max depth option for a valid crawl job different depths (0)", async () => { + it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => { const crawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -671,7 +671,7 @@ describe("E2E Tests for API Routes", () => { }); }, 180000); - it.concurrent("should return a successful response with relative max depth option for a valid crawl job different depths (2)", async () => { + it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepth equals to 2", async () => { const crawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)