0

Add unit tests to replace e2e

This commit is contained in:
Eric Ciarla 2024-06-15 16:43:37 -04:00
parent 2b40729cc2
commit 34e37c5671
6 changed files with 226 additions and 67 deletions

View File

@ -563,7 +563,7 @@ describe("E2E Tests for API Routes", () => {
const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
expect(depth).toBeLessThanOrEqual(2);
});
}, 240000);
}, 180000);
it.concurrent("should return a successful response with relative max depth option for a valid crawl job", async () => {
const crawlResponse = await request(TEST_URL)
@ -616,7 +616,7 @@ describe("E2E Tests for API Routes", () => {
const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
expect(depth).toBeLessThanOrEqual(3);
});
}, 240000);
}, 180000);
it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => {
@ -675,60 +675,11 @@ describe("E2E Tests for API Routes", () => {
const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
expect(depth).toBeLessThanOrEqual(1);
});
}, 240000);
}, 180000);
// E2E test: starts a crawl of the site root with maxDepth 2 and limit 5, polls the
// status endpoint until the job is completed, then checks the shape of the result
// and the path depth of every returned sourceURL.
it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepth equals to 2", async () => {
const crawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://www.scrapethissite.com",
crawlerOptions: { maxDepth: 2, limit: 5 },
});
expect(crawlResponse.statusCode).toBe(200);
// Immediately after submission the job is expected to be queued or running.
const response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
expect(["active", "waiting"]).toContain(response.body.status);
// Poll the status endpoint until the job reports "completed". The loop itself has
// no upper bound; the 240000 ms test timeout below is the only limit.
let isCompleted = false;
while (!isCompleted) {
const statusCheckResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(statusCheckResponse.statusCode).toBe(200);
isCompleted = statusCheckResponse.body.status === "completed";
if (!isCompleted) {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
// Fetch the final status once more and validate the completed payload.
const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed");
expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
expect(urls.length).toBeGreaterThanOrEqual(1);
// The crawl was started at the site root with maxDepth 2. With the formula below the
// root path "/" evaluates to depth 1 (one more than the number of path segments), so
// a limit of 3 allows URLs with at most two path segments.
urls.forEach((url: string) => {
const pathSplits = new URL(url).pathname.split('/');
// Subtract 1 only when the path both starts and ends with "/" (trailing slash or root).
const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
expect(depth).toBeLessThanOrEqual(3);
});
}, 240000);
// it.concurrent("should return a successful response with a valid API key and valid limit option", async () => {
// const crawlResponse = await request(TEST_URL)
@ -828,7 +779,7 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
}, 240000);
}, 180000);
});
@ -971,7 +922,7 @@ describe("E2E Tests for API Routes", () => {
);
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
}, 240000); // 120 seconds
}, 180000); // 180 seconds
it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => {
const crawlResponse = await request(TEST_URL)
@ -1012,7 +963,7 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
}, 240000); // 120 seconds
}, 180000); // 180 seconds
it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
@ -1062,7 +1013,7 @@ describe("E2E Tests for API Routes", () => {
const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
expect(depth).toBeLessThanOrEqual(2);
});
}, 240000);
}, 180000);
it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (2)", async () => {
const crawlResponse = await request(TEST_URL)

View File

@ -0,0 +1,163 @@
// crawler.test.ts
import { WebCrawler } from '../crawler';
import axios from 'axios';
import robotsParser from 'robots-parser';
import { getAdjustedMaxDepth } from '../utils/maxDepthUtils';
jest.mock('axios');
jest.mock('robots-parser');
/**
 * Unit tests for the depth filtering applied by WebCrawler to sitemap links.
 *
 * Network access (axios) and robots.txt parsing (robots-parser) are mocked, and the
 * crawler's `tryFetchSitemapLinks` is replaced per test so each case controls exactly
 * which links are offered to the crawler. The depth limit handed to the crawler is the
 * value returned by `getAdjustedMaxDepth` for the entry URL and the entered limit.
 */
describe('WebCrawler maxDepth and filterLinks', () => {
  const mockAxios = axios as jest.Mocked<typeof axios>;
  const mockRobotsParser = robotsParser as jest.MockedFunction<typeof robotsParser>;

  /**
   * Builds a crawler for `initialUrl`, feeds it `sitemapLinks` through a stubbed
   * `tryFetchSitemapLinks`, and returns whatever `start` resolves to.
   *
   * The adjusted depth is passed both to the constructor and as the sixth positional
   * argument of `start`, exactly as the individual tests originally did.
   */
  const crawlSitemapLinks = async (
    initialUrl: string,
    enteredMaxCrawledDepth: number,
    sitemapLinks: string[]
  ) => {
    const maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
    const crawler = new WebCrawler({
      initialUrl: initialUrl,
      includes: [],
      excludes: [],
      limit: 100,
      maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
    });

    // Mock sitemap fetching function to return controlled links
    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue(sitemapLinks);

    return crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
  };

  beforeEach(() => {
    // Setup default mocks
    mockAxios.get.mockImplementation((url) => {
      if (url.includes('robots.txt')) {
        return Promise.resolve({ data: 'User-agent: *\nAllow: /' });
      } else if (url.includes('sitemap.xml')) {
        return Promise.resolve({ data: 'sitemap content' }); // You would normally parse this to URLs
      }
      return Promise.resolve({ data: '<html></html>' });
    });

    mockRobotsParser.mockReturnValue({
      isAllowed: jest.fn().mockReturnValue(true),
      isDisallowed: jest.fn().mockReturnValue(false),
      getMatchingLineNumber: jest.fn().mockReturnValue(0),
      getCrawlDelay: jest.fn().mockReturnValue(0),
      getSitemaps: jest.fn().mockReturnValue([]),
      getPreferredHost: jest.fn().mockReturnValue('example.com')
    });
  });

  it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 0 ', async () => {
    const initialUrl = 'http://example.com'; // Set initial URL for this test

    const results = await crawlSitemapLinks(initialUrl, 2, [
      initialUrl, // relative depth 0
      initialUrl + '/page1', // relative depth 1
      initialUrl + '/page1/page2', // relative depth 2
      initialUrl + '/page1/page2/page3' // relative depth 3, should be filtered out
    ]);

    expect(results).toEqual([
      { url: initialUrl, html: '' },
      { url: initialUrl + '/page1', html: '' },
      { url: initialUrl + '/page1/page2', html: '' }
    ]);

    // Ensure that the link with depth 3 is not included
    expect(results.some(r => r.url === initialUrl + '/page1/page2/page3')).toBe(false);
  });

  it('should filter out links that exceed maxDepth param of 0 based on enterURL depth of 0 ', async () => {
    const initialUrl = 'http://example.com'; // Set initial URL for this test

    const results = await crawlSitemapLinks(initialUrl, 0, [
      initialUrl, // relative depth 0, the only link expected to survive
      initialUrl + '/page1', // relative depth 1, should be filtered out
      initialUrl + '/page1/page2', // relative depth 2, should be filtered out
      initialUrl + '/page1/page2/page3' // relative depth 3, should be filtered out
    ]);

    expect(results).toEqual([
      { url: initialUrl, html: '' },
    ]);
  });

  it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 1 ', async () => {
    const initialUrl = 'http://example.com/page1'; // Set initial URL for this test

    const results = await crawlSitemapLinks(initialUrl, 1, [
      initialUrl, // relative depth 0
      initialUrl + '/page2', // relative depth 1
      initialUrl + '/page2/page3', // relative depth 2, should be filtered out
      initialUrl + '/page2/page3/page4' // relative depth 3, should be filtered out
    ]);

    expect(results).toEqual([
      { url: initialUrl, html: '' },
      { url: initialUrl + '/page2', html: '' }
    ]);
  });

  // The entry URL here has depth 1 and the entered limit is 2; the title previously
  // had the two numbers swapped.
  it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 1 ', async () => {
    const initialUrl = 'http://example.com/page1'; // Set initial URL for this test

    const results = await crawlSitemapLinks(initialUrl, 2, [
      initialUrl, // relative depth 0
      initialUrl + '/page2', // relative depth 1
      initialUrl + '/page2/page3', // relative depth 2
      initialUrl + '/page2/page3/page4' // relative depth 3, should be filtered out
    ]);

    expect(results).toEqual([
      { url: initialUrl, html: '' },
      { url: initialUrl + '/page2', html: '' },
      { url: initialUrl + '/page2/page3', html: '' }
    ]);
  });

  // Add more tests to cover other scenarios, such as checking includes and excludes
});

View File

@ -6,6 +6,7 @@ import async from "async";
import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
export class WebCrawler {
private initialUrl: string;
@ -61,8 +62,7 @@ export class WebCrawler {
const url = new URL(link);
const path = url.pathname;
const pathSplits = new URL(url).pathname.split('/');
const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) -1;
const depth = getURLDepth(url.toString());
// Check if the link exceeds the maximum depth allowed

View File

@ -18,6 +18,7 @@ import {
import { generateCompletions } from "../../lib/LLM-extraction";
import { getWebScraperQueue } from "../../../src/services/queue-service";
import { fetchAndProcessDocx } from "./utils/docxProcessor";
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
export class WebScraperDataProvider {
private bullJobId: string;
@ -163,16 +164,12 @@ export class WebScraperDataProvider {
inProgress?: (progress: Progress) => void
): Promise<Document[]> {
const pathSplits = new URL(this.urls[0]).pathname.split('/');
const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) -1;
const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth;
const crawler = new WebCrawler({
initialUrl: this.urls[0],
includes: this.includes,
excludes: this.excludes,
maxCrawledLinks: this.maxCrawledLinks,
maxCrawledDepth: adjustedMaxDepth,
maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
limit: this.limit,
generateImgAltText: this.generateImgAltText,
allowBackwardCrawling: this.allowBackwardCrawling,
@ -580,8 +577,7 @@ export class WebScraperDataProvider {
filterDepth(documents: Document[]): Document[] {
return documents.filter((document) => {
const url = new URL(document.metadata.sourceURL);
const path = url.pathname;
return path.split("/").length <= this.maxCrawledDepth;
return getURLDepth(url.toString()) <= this.maxCrawledDepth;
});
}
}

View File

@ -0,0 +1,37 @@
import { getURLDepth, getAdjustedMaxDepth } from '../maxDepthUtils';
/**
 * Unit tests for the depth helpers in maxDepthUtils.
 *
 * getURLDepth is expected to report 0 for a site root and one more per path segment;
 * getAdjustedMaxDepth is expected to add that depth to the entered limit.
 */
describe('Testing getURLDepth and getAdjustedMaxDepth', () => {
  it('should return 0 for root - mendable.ai', () => {
    const enteredURL = "https://www.mendable.ai/";
    expect(getURLDepth(enteredURL)).toBe(0);
  });

  it('should return 0 for root - scrapethissite.com', () => {
    const enteredURL = "https://scrapethissite.com/";
    expect(getURLDepth(enteredURL)).toBe(0);
  });

  it('should return 1 for scrapethissite.com/pages', () => {
    const enteredURL = "https://scrapethissite.com/pages";
    expect(getURLDepth(enteredURL)).toBe(1);
  });

  it('should return 2 for scrapethissite.com/pages/articles', () => {
    const enteredURL = "https://scrapethissite.com/pages/articles";
    expect(getURLDepth(enteredURL)).toBe(2);
  });

  it('Adjusted maxDepth should return 1 for scrapethissite.com', () => {
    const enteredURL = "https://scrapethissite.com";
    expect(getAdjustedMaxDepth(enteredURL, 1)).toBe(1);
  });

  // The entry URL has depth 2 (see the getURLDepth case above), so an entered limit
  // of 2 must adjust to 2 + 2 = 4; the previous expectation of 5 could never hold.
  it('Adjusted maxDepth should return 4 for scrapethissite.com/pages/articles', () => {
    const enteredURL = "https://scrapethissite.com/pages/articles";
    expect(getAdjustedMaxDepth(enteredURL, 2)).toBe(4);
  });
});

View File

@ -0,0 +1,12 @@
/**
 * Turns a depth limit that is relative to the entry URL into an absolute one.
 *
 * @param url - Entry URL; its own depth, as reported by `getURLDepth`, is the offset.
 * @param maxCrawlDepth - Depth limit counted from `url`.
 * @returns `maxCrawlDepth` plus the depth of `url`.
 */
export function getAdjustedMaxDepth(url: string, maxCrawlDepth: number): number {
  return maxCrawlDepth + getURLDepth(url);
}
/**
 * Returns the depth of a URL, defined as the number of non-empty segments in its path.
 *
 * The site root (`/` or no path at all) has depth 0, `/pages` has depth 1, and
 * `/pages/articles` has depth 2. A trailing slash does not add depth. Empty segments
 * produced by duplicate slashes (e.g. `/pages//articles`) are ignored rather than
 * counted as an extra level, so such URLs get the same depth as their normalized form.
 * Query string and fragment are not part of the path and never affect the result.
 *
 * @param url - Absolute URL string; it is parsed with the `URL` constructor.
 * @returns The number of non-empty path segments.
 * @throws TypeError if `url` cannot be parsed by the `URL` constructor.
 */
export function getURLDepth(url: string): number {
  const { pathname } = new URL(url);
  // Splitting on "/" yields empty strings for the leading slash, a trailing slash and
  // any doubled slashes; none of those represent a real path level.
  return pathname.split('/').filter((segment) => segment.length > 0).length;
}