diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index c6c59bc..169c75b 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -250,6 +250,47 @@ describe("E2E Tests for API Routes", () => {
         "🔥 FireCrawl"
       );
     }, 60000); // 60 seconds
+
+    it("should return a successful response with max depth option for a valid crawl job", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com", crawlerOptions: { maxDepth: 2 }});
+      expect(crawlResponse.statusCode).toBe(200);
+
+      const response = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("status");
+      expect(response.body.status).toBe("active");
+
+      // wait for 60 seconds
+      await new Promise((r) => setTimeout(r, 60000));
+
+      const completedResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+
+      const urls = completedResponse.body.data.map((item: any) => item.metadata?.sourceURL);
+      expect(urls.length).toBeGreaterThan(1);
+
+      // Check if all URLs have a maximum depth of 1
+      urls.forEach((url) => {
+        const depth = new URL(url).pathname.split('/').filter(Boolean).length;
+        expect(depth).toBeLessThanOrEqual(1);
+      });
+
+    }, 120000); // 120 seconds
   });
 
   describe("POST /v0/scrape with LLM Extraction", () => {
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 5b663f2..6a58de5 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -40,6 +40,7 @@ export type WebScraperOptions = {
     includes?: string[];
     excludes?: string[];
     maxCrawledLinks?: number;
+    maxDepth?: number;
     limit?: number;
     generateImgAltText?: boolean;
     replaceAllPathsWithAbsolutePaths?: boolean;
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 23cb629..0248df2 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -13,6 +13,7 @@ export class WebCrawler {
   private includes: string[];
   private excludes: string[];
   private maxCrawledLinks: number;
+  private maxCrawledDepth: number;
   private visited: Set<string> = new Set();
   private crawledUrls: Set<string> = new Set();
   private limit: number;
@@ -27,6 +28,7 @@ export class WebCrawler {
     maxCrawledLinks,
     limit = 10000,
     generateImgAltText = false,
+    maxCrawledDepth = 10,
   }: {
     initialUrl: string;
     includes?: string[];
@@ -34,6 +36,7 @@ export class WebCrawler {
     maxCrawledLinks?: number;
     limit?: number;
     generateImgAltText?: boolean;
+    maxCrawledDepth?: number;
   }) {
     this.initialUrl = initialUrl;
     this.baseUrl = new URL(initialUrl).origin;
@@ -44,15 +47,22 @@ export class WebCrawler {
     this.robots = robotsParser(this.robotsTxtUrl, "");
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
+    this.maxCrawledDepth = maxCrawledDepth ?? 10;
     this.generateImgAltText = generateImgAltText ?? false;
   }
 
-  private filterLinks(sitemapLinks: string[], limit: number): string[] {
+  private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
     return sitemapLinks
       .filter((link) => {
         const url = new URL(link);
         const path = url.pathname;
+        const depth = url.pathname.split('/').length - 1;
+
+        // Check if the link exceeds the maximum depth allowed
+        if (depth > maxDepth) {
+          return false;
+        }
 
         // Check if the link should be excluded
         if (this.excludes.length > 0 && this.excludes[0] !== "") {
@@ -87,7 +97,8 @@ export class WebCrawler {
   public async start(
     inProgress?: (progress: Progress) => void,
     concurrencyLimit: number = 5,
-    limit: number = 10000
+    limit: number = 10000,
+    maxDepth: number = 10
   ): Promise<string[]> {
     // Fetch and parse robots.txt
     try {
@@ -99,7 +110,7 @@
     const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
     if (sitemapLinks.length > 0) {
-      const filteredLinks = this.filterLinks(sitemapLinks, limit);
+      const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
       return filteredLinks;
     }
@@ -110,13 +121,13 @@
     );
     if (
       urls.length === 0 &&
-      this.filterLinks([this.initialUrl], limit).length > 0
+      this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
     ) {
       return [this.initialUrl];
     }
 
     // make sure to run include exclude here again
-    return this.filterLinks(urls, limit);
+    return this.filterLinks(urls, limit, this.maxCrawledDepth);
   }
 
   private async crawlUrls(
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 1e28552..38ff47b 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -16,6 +16,7 @@ export class WebScraperDataProvider {
   private includes: string[];
   private excludes: string[];
   private maxCrawledLinks: number;
+  private maxCrawledDepth: number = 10;
   private returnOnlyUrls: boolean;
   private limit: number = 10000;
   private concurrentRequests: number = 20;
@@ -106,10 +107,11 @@ export class WebScraperDataProvider {
       includes: this.includes,
       excludes: this.excludes,
       maxCrawledLinks: this.maxCrawledLinks,
+      maxCrawledDepth: this.maxCrawledDepth,
       limit: this.limit,
       generateImgAltText: this.generateImgAltText,
     });
-    let links = await crawler.start(inProgress, 5, this.limit);
+    let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
     if (this.returnOnlyUrls) {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
@@ -198,6 +200,7 @@ export class WebScraperDataProvider {
       documents = this.mergeNewDocuments(documents, newDocuments);
     }
     documents = this.filterDocsExcludeInclude(documents);
+    documents = this.filterDepth(documents);
     documents = this.removeChildLinks(documents);
     return documents.splice(0, this.limit);
   }
@@ -319,6 +322,7 @@ export class WebScraperDataProvider {
     this.includes = options.crawlerOptions?.includes ?? [];
     this.excludes = options.crawlerOptions?.excludes ?? [];
     this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
+    this.maxCrawledDepth = options.crawlerOptions?.maxDepth ?? 10;
     this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
@@ -327,6 +331,8 @@ export class WebScraperDataProvider {
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
+    console.log("maxDepth:", this.maxCrawledDepth, options.crawlerOptions?.maxDepth);
+
     //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
@@ -411,4 +417,12 @@ export class WebScraperDataProvider {
     return documents;
   };
 
+
+  filterDepth(documents: Document[]): Document[] {
+    return documents.filter((document) => {
+      const url = new URL(document.metadata.sourceURL);
+      const path = url.pathname;
+      return path.split("/").length <= this.maxCrawledDepth;
+    });
+  }
 }
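
Note on the depth metric: the check added in WebCrawler.filterLinks counts slash-separated segments of the URL pathname (url.pathname.split('/').length - 1), whereas the new e2e test counts only non-empty segments via filter(Boolean), so the two values differ by one for URLs with a trailing slash. A minimal standalone TypeScript sketch of the crawler-side metric follows; the helper name pathDepth is illustrative only and is not part of this diff.

// Illustrative sketch, not part of the diff: mirrors the depth metric
// used by the maxDepth check in WebCrawler.filterLinks.
function pathDepth(link: string): number {
  const url = new URL(link);
  // "/" -> 1, "/docs" -> 1, "/docs/" -> 2, "/docs/a/b" -> 3
  return url.pathname.split("/").length - 1;
}

// With maxDepth = 2, "/docs/a/b" (depth 3) would be filtered out, while
// "/docs" (depth 1) and "/docs/" (depth 2) would be kept.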