Fix for maxDepth
parent 354712a8a3
commit a6b7197737
@@ -619,13 +619,14 @@ describe("E2E Tests for API Routes", () => {
     }, 180000);

     it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => {
       const crawlResponse = await request(TEST_URL)
         .post("/v0/crawl")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
         .send({
-          url: "https://www.scrapethissite.com",
-          crawlerOptions: { maxDepth: 0 },
+          url: "https://www.mendable.ai",
+          crawlerOptions: { maxDepth: 2 },
         });
       expect(crawlResponse.statusCode).toBe(200);

@@ -651,6 +652,70 @@ describe("E2E Tests for API Routes", () => {
         .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

+      const testurls = completedResponse.body.data.map(
+        (item: any) => item.metadata?.sourceURL
+      );
+      console.log(testurls)
+
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      const urls = completedResponse.body.data.map(
+        (item: any) => item.metadata?.sourceURL
+      );
+      expect(urls.length).toBeGreaterThanOrEqual(1);
+
+      // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1
+      urls.forEach((url: string) => {
+        const pathSplits = new URL(url).pathname.split('/');
+        const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
+        expect(depth).toBeLessThanOrEqual(1);
+      });
+    }, 180000);
+
+    it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => {
+
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://www.scrapethissite.com",
+          crawlerOptions: { maxDepth: 2 },
+        });
+      expect(crawlResponse.statusCode).toBe(200);
+
+      const response = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("status");
+      expect(["active", "waiting"]).toContain(response.body.status);
+      // wait for 60 seconds
+      let isCompleted = false;
+      while (!isCompleted) {
+        const statusCheckResponse = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+        expect(statusCheckResponse.statusCode).toBe(200);
+        isCompleted = statusCheckResponse.body.status === "completed";
+        if (!isCompleted) {
+          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+        }
+      }
+      const completedResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+      const testurls = completedResponse.body.data.map(
+        (item: any) => item.metadata?.sourceURL
+      );
+      console.log(testurls)
+
       expect(completedResponse.statusCode).toBe(200);
       expect(completedResponse.body).toHaveProperty("status");
       expect(completedResponse.body.status).toBe("completed");
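The depth assertion added above counts URL path segments. A minimal standalone sketch of the same calculation; the helper name and the sample URLs are illustrative only, not part of the commit:

// Illustrative helper mirroring the test's per-URL depth check (not from the repo).
// It counts path segments, discounting the extra empty segment produced when the
// path both starts and ends with "/".
const relativePathDepth = (url: string): number => {
  const pathSplits = new URL(url).pathname.split("/");
  return pathSplits.length -
    (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
};

// Hypothetical sample URLs:
console.log(relativePathDepth("https://example.com/"));     // 1, passes expect(depth).toBeLessThanOrEqual(1)
console.log(relativePathDepth("https://example.com/a"));    // 2, would fail that assertion
console.log(relativePathDepth("https://example.com/a/b/")); // 3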
@@ -60,8 +60,11 @@ export class WebCrawler {
      .filter((link) => {
        const url = new URL(link);
        const path = url.pathname;
-       const depth = url.pathname.split('/').length - 1;
+
+       const pathSplits = new URL(url).pathname.split('/');
+       const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) -1;
+

        // Check if the link exceeds the maximum depth allowed
        if (depth > maxDepth) {
          return false;
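For comparison, a small sketch of how the old and new depth formulas behave; the helper names and sample links are illustrative, not code from the repo:

// Old formula: every "/" in the pathname adds one, so the root path already
// counts as depth 1 and a trailing slash inflates the depth by one.
const oldDepth = (link: string): number =>
  new URL(link).pathname.split("/").length - 1;

// New formula from this commit: subtract the extra empty segment when the path
// both starts and ends with "/", then one more for the leading slash, so the
// root path is depth 0 and "/a/" scores the same as "/a".
const newDepth = (link: string): number => {
  const pathSplits = new URL(link).pathname.split("/");
  return pathSplits.length -
    (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) - 1;
};

console.log(oldDepth("https://example.com/"), newDepth("https://example.com/"));       // 1 vs 0
console.log(oldDepth("https://example.com/a/"), newDepth("https://example.com/a/"));   // 2 vs 1
console.log(oldDepth("https://example.com/a/b"), newDepth("https://example.com/a/b")); // 2 vs 2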
@@ -136,8 +139,10 @@ export class WebCrawler {

    if(!crawlerOptions?.ignoreSitemap){
      const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
+
      if (sitemapLinks.length > 0) {
        let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
+
        return filteredLinks.map(link => ({ url: link, html: "" }));
      }
    }
@@ -148,6 +153,7 @@ export class WebCrawler {
      concurrencyLimit,
      inProgress
    );
+

    if (
      urls.length === 0 &&
@@ -164,9 +164,9 @@ export class WebScraperDataProvider {
  ): Promise<Document[]> {

    const pathSplits = new URL(this.urls[0]).pathname.split('/');
-   const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
+   const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) -1;
    const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth;

    const crawler = new WebCrawler({
      initialUrl: this.urls[0],
      includes: this.includes,
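Here the relative depth limit is turned into an absolute one by adding the depth of the base URL's own path. A rough sketch of that adjustment, assuming maxCrawledDepth reflects the crawlerOptions.maxDepth value sent by the caller; baseURLDepthOf and the sample URLs are illustrative, not names from the repo:

// Illustrative helper reproducing the baseURLDepth calculation above.
const baseURLDepthOf = (baseUrl: string): number => {
  const pathSplits = new URL(baseUrl).pathname.split("/");
  return pathSplits.length -
    (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) - 1;
};

// Assume the caller asked for a relative maxDepth of 2.
const maxCrawledDepth = 2;

// Crawling from the site root: the absolute limit stays at 2.
console.log(maxCrawledDepth + baseURLDepthOf("https://example.com/"));           // 2

// Crawling from a page two path levels deep: links up to absolute depth 4 are allowed.
console.log(maxCrawledDepth + baseURLDepthOf("https://example.com/docs/guide")); // 4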