diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 215e1d1..5e3777b 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -7,7 +7,6 @@ dotenv.config();
 // const TEST_URL = 'http://localhost:3002'
 const TEST_URL = "http://127.0.0.1:3002";
 
-
 describe("E2E Tests for API Routes", () => {
   beforeAll(() => {
     process.env.USE_DB_AUTHENTICATION = "true";
@@ -56,7 +55,9 @@ describe("E2E Tests for API Routes", () => {
         .set("Content-Type", "application/json")
         .send({ url: blocklistedUrl });
       expect(response.statusCode).toBe(403);
-      expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
+      expect(response.body.error).toContain(
+        "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
+      );
     });
 
     it("should return a successful response with a valid preview token", async () => {
@@ -88,7 +89,10 @@ describe("E2E Tests for API Routes", () => {
         .post("/v0/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
-        .send({ url: "https://firecrawl.dev", pageOptions: { includeHtml: true }});
+        .send({
+          url: "https://firecrawl.dev",
+          pageOptions: { includeHtml: true },
+        });
       expect(response.statusCode).toBe(200);
       expect(response.body).toHaveProperty("data");
       expect(response.body.data).toHaveProperty("content");
@@ -124,7 +128,9 @@ describe("E2E Tests for API Routes", () => {
         .set("Content-Type", "application/json")
         .send({ url: blocklistedUrl });
       expect(response.statusCode).toBe(403);
-      expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
+      expect(response.body.error).toContain(
+        "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
+      );
     });
 
     it("should return a successful response with a valid API key", async () => {
@@ -140,15 +146,12 @@ describe("E2E Tests for API Routes", () => {
       );
     });
 
-    // Additional tests for insufficient credits?
   });
 
   describe("POST /v0/crawlWebsitePreview", () => {
     it("should require authorization", async () => {
-      const response = await request(TEST_URL).post(
-        "/v0/crawlWebsitePreview"
-      );
+      const response = await request(TEST_URL).post("/v0/crawlWebsitePreview");
       expect(response.statusCode).toBe(401);
     });
@@ -202,8 +205,6 @@ describe("E2E Tests for API Routes", () => {
       expect(response.statusCode).toBe(401);
     });
 
-
-
     it("should return a successful response with a valid API key", async () => {
       const response = await request(TEST_URL)
         .post("/v0/search")
@@ -265,17 +266,60 @@ describe("E2E Tests for API Routes", () => {
       expect(completedResponse.body.data[0]).toHaveProperty("content");
       expect(completedResponse.body.data[0]).toHaveProperty("markdown");
       expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-      expect(completedResponse.body.data[0].content).toContain(
-        "🔥 FireCrawl"
-      );
+      expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
     }, 60000); // 60 seconds
 
+    it("should return a successful response with max depth option for a valid crawl job", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://www.scrapethissite.com",
+          crawlerOptions: { maxDepth: 2 },
+        });
+      expect(crawlResponse.statusCode).toBe(200);
+
+      const response = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("status");
+      expect(response.body.status).toBe("active");
+      // wait for 60 seconds
+      await new Promise((r) => setTimeout(r, 60000));
+      const completedResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      const urls = completedResponse.body.data.map(
+        (item: any) => item.metadata?.sourceURL
+      );
+      expect(urls.length).toBeGreaterThan(1);
+
+      // Check if all URLs have a maximum depth of 1
+      urls.forEach((url) => {
+        const depth = new URL(url).pathname.split("/").filter(Boolean).length;
+        expect(depth).toBeLessThanOrEqual(1);
+      });
+    }, 120000);
+
     it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
       const crawlResponse = await request(TEST_URL)
         .post("/v0/crawl")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
-        .send({ url: "https://firecrawl.dev", pageOptions: { includeHtml: true } });
+        .send({
+          url: "https://firecrawl.dev",
+          pageOptions: { includeHtml: true },
+        });
       expect(crawlResponse.statusCode).toBe(200);
 
       const response = await request(TEST_URL)
@@ -291,25 +335,23 @@ describe("E2E Tests for API Routes", () => {
       const completedResponse = await request(TEST_URL)
         .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(completedResponse.statusCode).toBe(200);
       expect(completedResponse.body).toHaveProperty("status");
       expect(completedResponse.body.status).toBe("completed");
       expect(completedResponse.body).toHaveProperty("data");
       expect(completedResponse.body.data[0]).toHaveProperty("content");
       expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+
+      // 120 seconds
       expect(completedResponse.body.data[0]).toHaveProperty("html");
       expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-      expect(completedResponse.body.data[0].content).toContain(
-        "🔥 FireCrawl"
-      );
-      expect(completedResponse.body.data[0].markdown).toContain(
-        "FireCrawl"
-      );
-      expect(completedResponse.body.data[0].html).toContain(
-        "<h1"
-      );
+      expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl");
+      expect(completedResponse.body.data[0].markdown).toContain("FireCrawl");
+      expect(completedResponse.body.data[0].html).toContain("<h1");
     }, 60000); // 60 seconds
 
       const crawlResponse = await request(TEST_URL)
@@ -359,35 +401,33 @@ describe("E2E Tests for API Routes", () => {
         .send({
           url: "https://mendable.ai",
           pageOptions: {
-            onlyMainContent: true
+            onlyMainContent: true,
           },
           extractorOptions: {
             mode: "llm-extraction",
-            extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+            extractionPrompt:
+              "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
             extractionSchema: {
               type: "object",
               properties: {
                 company_mission: {
-                  type: "string"
+                  type: "string",
                 },
                 supports_sso: {
-                  type: "boolean"
+                  type: "boolean",
                 },
                 is_open_source: {
-                  type: "boolean"
-                }
+                  type: "boolean",
+                },
               },
-              required: ["company_mission", "supports_sso", "is_open_source"]
-            }
-          }
+              required: ["company_mission", "supports_sso", "is_open_source"],
+            },
+          },
         });
 
-      // Ensure that the job was successfully created before proceeding with LLM extraction
       expect(response.statusCode).toBe(200);
-
-      // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
       let llmExtraction = response.body.data.llm_extraction;
@@ -440,7 +480,6 @@ describe("E2E Tests for API Routes", () => {
   //     }
   //   });
 
-
   //   // Print the response body to the console for debugging purposes
   //   console.log("Response companies:", response.body.data.llm_extraction.companies);
@@ -462,9 +501,6 @@ describe("E2E Tests for API Routes", () => {
   //   }, 120000); // 120 secs
   // });
 
-
-
-
   describe("GET /is-production", () => {
     it("should return the production status", async () => {
       const response = await request(TEST_URL).get("/is-production");
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 7cf48cb..a387b54 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -40,6 +40,7 @@ export type WebScraperOptions = {
     includes?: string[];
     excludes?: string[];
     maxCrawledLinks?: number;
+    maxDepth?: number;
     limit?: number;
     generateImgAltText?: boolean;
     replaceAllPathsWithAbsolutePaths?: boolean;
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 23cb629..0248df2 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -13,6 +13,7 @@ export class WebCrawler {
   private includes: string[];
   private excludes: string[];
   private maxCrawledLinks: number;
+  private maxCrawledDepth: number;
   private visited: Set<string> = new Set();
   private crawledUrls: Set<string> = new Set();
   private limit: number;
@@ -27,6 +28,7 @@ export class WebCrawler {
     maxCrawledLinks,
     limit = 10000,
     generateImgAltText = false,
+    maxCrawledDepth = 10,
   }: {
     initialUrl: string;
     includes?: string[];
@@ -34,6 +36,7 @@ export class WebCrawler {
     maxCrawledLinks?: number;
     limit?: number;
     generateImgAltText?: boolean;
+    maxCrawledDepth?: number;
   }) {
     this.initialUrl = initialUrl;
     this.baseUrl = new URL(initialUrl).origin;
@@ -44,15 +47,22 @@ export class WebCrawler {
     this.robots = robotsParser(this.robotsTxtUrl, "");
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
+    this.maxCrawledDepth = maxCrawledDepth ?? 10;
     this.generateImgAltText = generateImgAltText ?? false;
   }
 
-  private filterLinks(sitemapLinks: string[], limit: number): string[] {
+  private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
     return sitemapLinks
       .filter((link) => {
         const url = new URL(link);
         const path = url.pathname;
+        const depth = url.pathname.split('/').length - 1;
+
+        // Check if the link exceeds the maximum depth allowed
+        if (depth > maxDepth) {
+          return false;
+        }
 
         // Check if the link should be excluded
         if (this.excludes.length > 0 && this.excludes[0] !== "") {
@@ -87,7 +97,8 @@ export class WebCrawler {
   public async start(
     inProgress?: (progress: Progress) => void,
     concurrencyLimit: number = 5,
-    limit: number = 10000
+    limit: number = 10000,
+    maxDepth: number = 10
   ): Promise<string[]> {
     // Fetch and parse robots.txt
     try {
@@ -99,7 +110,7 @@ export class WebCrawler {
 
     const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
     if (sitemapLinks.length > 0) {
-      const filteredLinks = this.filterLinks(sitemapLinks, limit);
+      const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
       return filteredLinks;
     }
 
@@ -110,13 +121,13 @@ export class WebCrawler {
     );
     if (
       urls.length === 0 &&
-      this.filterLinks([this.initialUrl], limit).length > 0
+      this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
     ) {
       return [this.initialUrl];
     }
 
     // make sure to run include exclude here again
-    return this.filterLinks(urls, limit);
+    return this.filterLinks(urls, limit, this.maxCrawledDepth);
   }
 
   private async crawlUrls(
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index c127433..e3256db 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -25,6 +25,7 @@ export class WebScraperDataProvider {
   private includes: string[];
   private excludes: string[];
   private maxCrawledLinks: number;
+  private maxCrawledDepth: number = 10;
   private returnOnlyUrls: boolean;
   private limit: number = 10000;
   private concurrentRequests: number = 20;
@@ -134,10 +135,11 @@ export class WebScraperDataProvider {
       includes: this.includes,
       excludes: this.excludes,
       maxCrawledLinks: this.maxCrawledLinks,
+      maxCrawledDepth: this.maxCrawledDepth,
       limit: this.limit,
       generateImgAltText: this.generateImgAltText,
     });
-    let links = await crawler.start(inProgress, 5, this.limit);
+    let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
     if (this.returnOnlyUrls) {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
@@ -253,6 +255,7 @@ export class WebScraperDataProvider {
       documents = this.mergeNewDocuments(documents, newDocuments);
     }
     documents = this.filterDocsExcludeInclude(documents);
+    documents = this.filterDepth(documents);
     documents = this.removeChildLinks(documents);
     return documents.splice(0, this.limit);
   }
@@ -384,6 +387,7 @@ export class WebScraperDataProvider {
     this.includes = options.crawlerOptions?.includes ?? [];
     this.excludes = options.crawlerOptions?.excludes ?? [];
     this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
+    this.maxCrawledDepth = options.crawlerOptions?.maxDepth ?? 10;
     this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
@@ -476,4 +480,12 @@ export class WebScraperDataProvider {
 
     return documents;
   };
+
+  filterDepth(documents: Document[]): Document[] {
+    return documents.filter((document) => {
+      const url = new URL(document.metadata.sourceURL);
+      const path = url.pathname;
+      return path.split("/").length <= this.maxCrawledDepth;
+    });
+  }
 }
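
Note (not part of the patch): the three depth checks in this diff count URL path segments in slightly different ways. A minimal standalone TypeScript sketch of the two conventions used above; the example.com URLs are illustrative only.

// Depth as computed in WebCrawler.filterLinks: pathname.split("/").length - 1,
// so the root path "/" counts as depth 1 and "/a/b" as depth 2.
const depthInCrawler = (link: string): number =>
  new URL(link).pathname.split("/").length - 1;

// Depth as asserted in the new e2e test: filter(Boolean) drops empty segments,
// so "/" counts as depth 0 and "/a/b" as depth 2. (filterDepth in index.ts uses
// path.split("/").length, i.e. 2 for "/" and 3 for "/a/b".)
const depthInTest = (link: string): number =>
  new URL(link).pathname.split("/").filter(Boolean).length;

console.log(depthInCrawler("https://example.com/"));    // 1
console.log(depthInCrawler("https://example.com/a/b")); // 2
console.log(depthInTest("https://example.com/"));       // 0
console.log(depthInTest("https://example.com/a/b"));    // 2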