0

Merge pull request #143 from mendableai/bug/crawl-limit

[Bug] Fixing /crawl limit
This commit is contained in:
Rafael Miller 2024-05-22 14:51:54 -03:00 committed by GitHub
commit df0550d2f6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 5 additions and 5 deletions

View File

@@ -266,7 +266,7 @@ describe("E2E Tests for API Routes", () => {
urls.forEach((url: string) => { urls.forEach((url: string) => {
expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy(); expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
}); });
}, 60000); // 60 seconds }, 90000); // 90 seconds
it("should return a successful response with a valid API key and limit to 3", async () => { it("should return a successful response with a valid API key and limit to 3", async () => {
const crawlResponse = await request(TEST_URL) const crawlResponse = await request(TEST_URL)

View File

@@ -25,7 +25,7 @@ export class WebCrawler {
initialUrl, initialUrl,
includes, includes,
excludes, excludes,
maxCrawledLinks, maxCrawledLinks = 10000,
limit = 10000, limit = 10000,
generateImgAltText = false, generateImgAltText = false,
maxCrawledDepth = 10, maxCrawledDepth = 10,
@@ -152,7 +152,7 @@ export class WebCrawler {
inProgress?: (progress: Progress) => void, inProgress?: (progress: Progress) => void,
): Promise<{ url: string, html: string }[]> { ): Promise<{ url: string, html: string }[]> {
const queue = async.queue(async (task: string, callback) => { const queue = async.queue(async (task: string, callback) => {
if (this.crawledUrls.size >= this.maxCrawledLinks) { if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
if (callback && typeof callback === "function") { if (callback && typeof callback === "function") {
callback(); callback();
} }
@@ -176,14 +176,14 @@ export class WebCrawler {
if (inProgress && newUrls.length > 0) { if (inProgress && newUrls.length > 0) {
inProgress({ inProgress({
current: this.crawledUrls.size, current: this.crawledUrls.size,
total: this.maxCrawledLinks, total: Math.min(this.maxCrawledLinks, this.limit),
status: "SCRAPING", status: "SCRAPING",
currentDocumentUrl: newUrls[newUrls.length - 1].url, currentDocumentUrl: newUrls[newUrls.length - 1].url,
}); });
} else if (inProgress) { } else if (inProgress) {
inProgress({ inProgress({
current: this.crawledUrls.size, current: this.crawledUrls.size,
total: this.maxCrawledLinks, total: Math.min(this.maxCrawledLinks, this.limit),
status: "SCRAPING", status: "SCRAPING",
currentDocumentUrl: task, currentDocumentUrl: task,
}); });