Merge branch 'main' into nsc/rate-limiter-tests

commit ab038051e9
@@ -619,12 +619,13 @@ describe("E2E Tests for API Routes", () => {
     }, 180000);

     it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => {
+
       const crawlResponse = await request(TEST_URL)
         .post("/v0/crawl")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
         .send({
-          url: "https://www.scrapethissite.com",
+          url: "https://www.mendable.ai",
           crawlerOptions: { maxDepth: 0 },
         });
       expect(crawlResponse.statusCode).toBe(200);
@@ -651,6 +652,11 @@ describe("E2E Tests for API Routes", () => {
         .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

+      const testurls = completedResponse.body.data.map(
+        (item: any) => item.metadata?.sourceURL
+      );
+      //console.log(testurls)
+
       expect(completedResponse.statusCode).toBe(200);
       expect(completedResponse.body).toHaveProperty("status");
       expect(completedResponse.body.status).toBe("completed");
@@ -671,58 +677,9 @@ describe("E2E Tests for API Routes", () => {
       });
     }, 180000);

-    it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepth equals to 2", async () => {
-      const crawlResponse = await request(TEST_URL)
-        .post("/v0/crawl")
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-        .set("Content-Type", "application/json")
-        .send({
-          url: "proxyway.com",
-          crawlerOptions: { maxDepth: 2, limit: 5 },
-        });
-      expect(crawlResponse.statusCode).toBe(200);

-      const response = await request(TEST_URL)
-        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-      expect(response.statusCode).toBe(200);
-      expect(response.body).toHaveProperty("status");
-      expect(["active", "waiting"]).toContain(response.body.status);
-      // wait for 60 seconds
-      let isCompleted = false;
-      while (!isCompleted) {
-        const statusCheckResponse = await request(TEST_URL)
-          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-        expect(statusCheckResponse.statusCode).toBe(200);
-        isCompleted = statusCheckResponse.body.status === "completed";
-        if (!isCompleted) {
-          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
-        }
-      }
-      const completedResponse = await request(TEST_URL)
-        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

-      expect(completedResponse.statusCode).toBe(200);
-      expect(completedResponse.body).toHaveProperty("status");
-      expect(completedResponse.body.status).toBe("completed");
-      expect(completedResponse.body).toHaveProperty("data");
-      expect(completedResponse.body.data[0]).toHaveProperty("content");
-      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
-      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-      const urls = completedResponse.body.data.map(
-        (item: any) => item.metadata?.sourceURL
-      );
-      expect(urls.length).toBeGreaterThanOrEqual(1);

-      // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1
-      urls.forEach((url: string) => {
-        const pathSplits = new URL(url).pathname.split('/');
-        const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
-        expect(depth).toBeLessThanOrEqual(3);
-      });
-    }, 180000);

     // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => {
     //   const crawlResponse = await request(TEST_URL)
@@ -1009,54 +966,6 @@ describe("E2E Tests for API Routes", () => {
     }, 180000); // 120 seconds


-    it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
-      const crawlResponse = await request(TEST_URL)
-        .post("/v0/crawl")
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-        .set("Content-Type", "application/json")
-        .send({
-          url: "https://www.scrapethissite.com",
-          crawlerOptions: { maxDepth: 1 },
-        });
-      expect(crawlResponse.statusCode).toBe(200);
-
-      let isCompleted = false;
-      let completedResponse;
-
-      while (!isCompleted) {
-        const response = await request(TEST_URL)
-          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-        expect(response.statusCode).toBe(200);
-        expect(response.body).toHaveProperty("status");
-
-        if (response.body.status === "completed") {
-          isCompleted = true;
-          completedResponse = response;
-        }
-      }
-      expect(completedResponse.statusCode).toBe(200);
-      expect(completedResponse.body).toHaveProperty("status");
-      expect(completedResponse.body.status).toBe("completed");
-      expect(completedResponse.body).toHaveProperty("data");
-      expect(completedResponse.body.data[0]).toHaveProperty("content");
-      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
-      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
-      expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
-
-      const urls = completedResponse.body.data.map(
-        (item: any) => item.metadata?.sourceURL
-      );
-      expect(urls.length).toBeGreaterThan(1);
-
-      // Check if all URLs have a maximum depth of 1
-      urls.forEach((url) => {
-        const pathSplits = new URL(url).pathname.split('/');
-        const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
-        expect(depth).toBeLessThanOrEqual(2);
-      });
-    }, 180000);

     it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (2)", async () => {
       const crawlResponse = await request(TEST_URL)
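
Note on the depth expression in the two tests removed above: the split on "/" leaves empty strings at both ends of the pathname, which is what the ternary compensates for. A minimal standalone sketch of that expression (the pathDepth name is illustrative, not from the diff):

// Sketch of the inline depth check used by the removed E2E tests.
function pathDepth(urlString: string): number {
  const pathSplits = new URL(urlString).pathname.split('/');
  // "/" splits into ["", ""], "/pages" into ["", "pages"]; subtract one only when both
  // the first and the last segment are empty (a bare "/" or a trailing slash).
  return pathSplits.length -
    (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
}

console.log(pathDepth("https://www.scrapethissite.com/"));      // 1 (site root)
console.log(pathDepth("https://www.scrapethissite.com/pages")); // 2

The getURLDepth helper added later in this diff uses the same split but subtracts one more, so the site root counts as depth 0.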

apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts (new file, 163 lines)
@@ -0,0 +1,163 @@
+// crawler.test.ts
+import { WebCrawler } from '../crawler';
+import axios from 'axios';
+import robotsParser from 'robots-parser';
+import { getAdjustedMaxDepth } from '../utils/maxDepthUtils';
+
+jest.mock('axios');
+jest.mock('robots-parser');
+
+describe('WebCrawler maxDepth and filterLinks', () => {
+  let crawler: WebCrawler;
+  const mockAxios = axios as jest.Mocked<typeof axios>;
+  const mockRobotsParser = robotsParser as jest.MockedFunction<typeof robotsParser>;
+
+  let maxCrawledDepth: number;
+
+  beforeEach(() => {
+    // Setup default mocks
+    mockAxios.get.mockImplementation((url) => {
+      if (url.includes('robots.txt')) {
+        return Promise.resolve({ data: 'User-agent: *\nAllow: /' });
+      } else if (url.includes('sitemap.xml')) {
+        return Promise.resolve({ data: 'sitemap content' }); // You would normally parse this to URLs
+      }
+      return Promise.resolve({ data: '<html></html>' });
+    });
+
+    mockRobotsParser.mockReturnValue({
+      isAllowed: jest.fn().mockReturnValue(true),
+      isDisallowed: jest.fn().mockReturnValue(false),
+      getMatchingLineNumber: jest.fn().mockReturnValue(0),
+      getCrawlDelay: jest.fn().mockReturnValue(0),
+      getSitemaps: jest.fn().mockReturnValue([]),
+      getPreferredHost: jest.fn().mockReturnValue('example.com')
+    });
+  });
+
+  it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 0', async () => {
+    const initialUrl = 'http://example.com'; // Set initial URL for this test
+    const enteredMaxCrawledDepth = 2;
+    maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
+
+    crawler = new WebCrawler({
+      initialUrl: initialUrl,
+      includes: [],
+      excludes: [],
+      limit: 100,
+      maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
+    });
+
+    // Mock sitemap fetching function to return controlled links
+    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
+      initialUrl, // depth 0
+      initialUrl + '/page1', // depth 1
+      initialUrl + '/page1/page2', // depth 2
+      initialUrl + '/page1/page2/page3' // depth 3, should be filtered out
+    ]);
+
+    const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
+    expect(results).toEqual([
+      { url: initialUrl, html: '' },
+      { url: initialUrl + '/page1', html: '' },
+      { url: initialUrl + '/page1/page2', html: '' }
+    ]);
+
+    // Ensure that the link with depth 3 is not included
+    expect(results.some(r => r.url === initialUrl + '/page1/page2/page3')).toBe(false);
+  });
+
+  it('should filter out links that exceed maxDepth param of 0 based on enterURL depth of 0', async () => {
+    const initialUrl = 'http://example.com'; // Set initial URL for this test
+    const enteredMaxCrawledDepth = 0;
+    maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
+
+    crawler = new WebCrawler({
+      initialUrl: initialUrl,
+      includes: [],
+      excludes: [],
+      limit: 100,
+      maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
+    });
+
+    // Mock sitemap fetching function to return controlled links
+    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
+      initialUrl, // depth 0
+      initialUrl + '/page1', // depth 1
+      initialUrl + '/page1/page2', // depth 2
+      initialUrl + '/page1/page2/page3' // depth 3, should be filtered out
+    ]);
+
+    const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
+    expect(results).toEqual([
+      { url: initialUrl, html: '' },
+    ]);
+  });
+
+  it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 1', async () => {
+    const initialUrl = 'http://example.com/page1'; // Set initial URL for this test
+    const enteredMaxCrawledDepth = 1;
+    maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
+
+    crawler = new WebCrawler({
+      initialUrl: initialUrl,
+      includes: [],
+      excludes: [],
+      limit: 100,
+      maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
+    });
+
+    // Mock sitemap fetching function to return controlled links
+    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
+      initialUrl, // depth 0
+      initialUrl + '/page2', // depth 1
+      initialUrl + '/page2/page3', // depth 2
+      initialUrl + '/page2/page3/page4' // depth 3, should be filtered out
+    ]);
+
+    const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
+    expect(results).toEqual([
+      { url: initialUrl, html: '' },
+      { url: initialUrl + '/page2', html: '' }
+    ]);
+  });
+
+  it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 1', async () => {
+    const initialUrl = 'http://example.com/page1'; // Set initial URL for this test
+    const enteredMaxCrawledDepth = 2;
+    maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
+
+    crawler = new WebCrawler({
+      initialUrl: initialUrl,
+      includes: [],
+      excludes: [],
+      limit: 100,
+      maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
+    });
+
+    // Mock sitemap fetching function to return controlled links
+    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
+      initialUrl, // depth 0
+      initialUrl + '/page2', // depth 1
+      initialUrl + '/page2/page3', // depth 2
+      initialUrl + '/page2/page3/page4' // depth 3, should be filtered out
+    ]);
+
+    const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
+    expect(results).toEqual([
+      { url: initialUrl, html: '' },
+      { url: initialUrl + '/page2', html: '' },
+      { url: initialUrl + '/page2/page3', html: '' }
+    ]);
+  });
+
+  // Add more tests to cover other scenarios, such as checking includes and excludes
+});
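
The new crawler.test.ts never touches the network: axios and robots-parser are jest-mocked, and the private sitemap fetcher is swapped out through bracket access on the instance. A minimal, self-contained sketch of that stubbing pattern, assuming the usual Jest globals (the Fetcher class and its method names are made up for illustration; they are not part of the diff):

// TypeScript only enforces `private` on dotted access, so a test can replace a private
// method through an index expression, exactly as crawler['tryFetchSitemapLinks'] does above.
class Fetcher {
  async listLinks(base: string): Promise<string[]> {
    return this.fetchSitemap(base); // would normally hit the network
  }
  private async fetchSitemap(base: string): Promise<string[]> {
    throw new Error('network disabled in tests');
  }
}

test('listLinks uses the stubbed sitemap fetcher', async () => {
  const fetcher = new Fetcher();
  fetcher['fetchSitemap'] = jest.fn().mockResolvedValue(['http://example.com/page1']);
  await expect(fetcher.listLinks('http://example.com')).resolves.toEqual([
    'http://example.com/page1',
  ]);
});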

apps/api/src/scraper/WebScraper/crawler.ts
@@ -6,6 +6,7 @@ import async from "async";
 import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
+import { getURLDepth } from "./utils/maxDepthUtils";

 export class WebCrawler {
   private initialUrl: string;
@@ -60,8 +61,10 @@ export class WebCrawler {
       .filter((link) => {
         const url = new URL(link);
         const path = url.pathname;
-        const depth = url.pathname.split('/').length - 1;
+        const depth = getURLDepth(url.toString());
+

         // Check if the link exceeds the maximum depth allowed
         if (depth > maxDepth) {
           return false;
@@ -136,8 +139,10 @@ export class WebCrawler {

     if(!crawlerOptions?.ignoreSitemap){
       const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
+
       if (sitemapLinks.length > 0) {
         let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
+
         return filteredLinks.map(link => ({ url: link, html: "" }));
       }
     }
@@ -148,6 +153,7 @@ export class WebCrawler {
       concurrencyLimit,
       inProgress
     );
+

     if (
       urls.length === 0 &&
@@ -224,11 +230,10 @@ export class WebCrawler {
   }

   async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
-    const normalizedUrl = this.normalizeCrawlUrl(url);
-    if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
+    if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
       return [];
     }
-    this.visited.add(normalizedUrl);
+    this.visited.add(url);

     if (!url.startsWith("http")) {
       url = "https://" + url;
@@ -276,15 +281,16 @@ export class WebCrawler {
           const urlObj = new URL(fullUrl);
           const path = urlObj.pathname;
+

           if (
             this.isInternalLink(fullUrl) &&
-            this.matchesPattern(fullUrl) &&
             this.noSections(fullUrl) &&
             // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
             // this.matchesIncludes(path) &&
             !this.matchesExcludes(path) &&
-            this.robots.isAllowed(fullUrl, "FireCrawlAgent")
+            this.isRobotsAllowed(fullUrl)
           ) {
+
             links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
           }
         }
@@ -294,12 +300,15 @@ export class WebCrawler {
         return links;
       }
       // Create a new list to return to avoid modifying the visited list
-      return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
+      return links.filter((link) => !this.visited.has(link.url));
     } catch (error) {
       return [];
     }
   }

+  private isRobotsAllowed(url: string): boolean {
+    return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
+  }
   private normalizeCrawlUrl(url: string): string {
     try{
       const urlObj = new URL(url);
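
The isRobotsAllowed helper added in this hunk defaults to "allowed" both when no robots parser has been initialised yet and when robots-parser returns undefined for a URL it has no rule for. A small sketch of that fallback with a made-up robots.txt (the constructor call is standard robots-parser usage, not something shown in this diff):

import robotsParser from 'robots-parser';

const robots = robotsParser(
  'http://example.com/robots.txt',
  'User-agent: *\nDisallow: /private'
);

// Mirrors the guard added above: a missing parser or an undefined answer counts as allowed.
function isRobotsAllowed(url: string): boolean {
  return robots ? (robots.isAllowed(url, 'FireCrawlAgent') ?? true) : true;
}

console.log(isRobotsAllowed('http://example.com/docs'));    // true
console.log(isRobotsAllowed('http://example.com/private')); // false
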
@@ -326,12 +335,10 @@ export class WebCrawler {

   private isInternalLink(link: string): boolean {
     const urlObj = new URL(link, this.baseUrl);
-    const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, "");
-    return urlObj.hostname === domainWithoutProtocol;
-  }
-
-  private matchesPattern(link: string): boolean {
-    return true; // Placeholder for future pattern matching implementation
+    const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
+    const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
+    return linkDomain === baseDomain;
   }

   private isFile(url: string): boolean {
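
The reworked isInternalLink treats www and non-www hosts of the same domain as internal by stripping the protocol and a leading "www." from both sides before comparing. A standalone sketch of that comparison, assuming the base URL is a bare origin such as "https://www.mendable.ai" with no path:

function isInternalLink(link: string, baseUrl: string): boolean {
  const urlObj = new URL(link, baseUrl);
  const baseDomain = baseUrl.replace(/^https?:\/\//, '').replace(/^www\./, '').trim();
  const linkDomain = urlObj.hostname.replace(/^www\./, '').trim();
  return linkDomain === baseDomain;
}

console.log(isInternalLink('https://mendable.ai/pricing', 'https://www.mendable.ai'));  // true
console.log(isInternalLink('https://github.com/anything', 'https://www.mendable.ai')); // false
console.log(isInternalLink('/blog', 'https://www.mendable.ai'));                       // true (relative links resolve against the base)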

apps/api/src/scraper/WebScraper/index.ts
@@ -18,6 +18,7 @@ import {
 import { generateCompletions } from "../../lib/LLM-extraction";
 import { getWebScraperQueue } from "../../../src/services/queue-service";
 import { fetchAndProcessDocx } from "./utils/docxProcessor";
+import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";

 export class WebScraperDataProvider {
   private bullJobId: string;
@@ -163,16 +164,12 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {

-    const pathSplits = new URL(this.urls[0]).pathname.split('/');
-    const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
-    const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth;
-
     const crawler = new WebCrawler({
       initialUrl: this.urls[0],
       includes: this.includes,
       excludes: this.excludes,
       maxCrawledLinks: this.maxCrawledLinks,
-      maxCrawledDepth: adjustedMaxDepth,
+      maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
       limit: this.limit,
       generateImgAltText: this.generateImgAltText,
       allowBackwardCrawling: this.allowBackwardCrawling,
@@ -580,8 +577,7 @@ export class WebScraperDataProvider {
   filterDepth(documents: Document[]): Document[] {
     return documents.filter((document) => {
       const url = new URL(document.metadata.sourceURL);
-      const path = url.pathname;
-      return path.split("/").length <= this.maxCrawledDepth;
+      return getURLDepth(url.toString()) <= this.maxCrawledDepth;
     });
   }
 }
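
filterDepth (and the crawler's link filter earlier in this diff) now measures depth with getURLDepth instead of counting raw pathname segments, which shifts every count down. A small sketch of the difference for one URL, with the new expression copied from maxDepthUtils.ts below:

const url = new URL('https://example.com/a/b');

// Old check in filterDepth: raw segment count ("/a/b" -> ["", "a", "b"] -> 3).
const oldDepth = url.pathname.split('/').length;

// New check: getURLDepth-style count, where the site root is 0 and "/a/b" is 2.
const pathSplits = url.pathname.split('/');
const newDepth =
  pathSplits.length -
  (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) - 1;

console.log(oldDepth, newDepth); // 3 2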

apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts (new file)
@@ -0,0 +1,47 @@
+import { getURLDepth, getAdjustedMaxDepth } from '../maxDepthUtils';
+
+describe('Testing getURLDepth and getAdjustedMaxDepth', () => {
+  it('should return 0 for root - mendable.ai', () => {
+    const enteredURL = "https://www.mendable.ai/"
+    expect(getURLDepth(enteredURL)).toBe(0);
+  });
+
+  it('should return 0 for root - scrapethissite.com', () => {
+    const enteredURL = "https://scrapethissite.com/"
+    expect(getURLDepth(enteredURL)).toBe(0);
+  });
+
+  it('should return 1 for scrapethissite.com/pages', () => {
+    const enteredURL = "https://scrapethissite.com/pages"
+    expect(getURLDepth(enteredURL)).toBe(1);
+  });
+
+  it('should return 2 for scrapethissite.com/pages/articles', () => {
+    const enteredURL = "https://scrapethissite.com/pages/articles"
+    expect(getURLDepth(enteredURL)).toBe(2);
+  });
+
+  it('Adjusted maxDepth should return 1 for scrapethissite.com and max depth param of 1', () => {
+    const enteredURL = "https://scrapethissite.com"
+    expect(getAdjustedMaxDepth(enteredURL, 1)).toBe(1);
+  });
+
+  it('Adjusted maxDepth should return 0 for scrapethissite.com and max depth param of 0', () => {
+    const enteredURL = "https://scrapethissite.com"
+    expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0);
+  });
+
+  it('Adjusted maxDepth should return 0 for mendable.ai and max depth param of 0', () => {
+    const enteredURL = "https://mendable.ai"
+    expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0);
+  });
+
+  it('Adjusted maxDepth should return 4 for scrapethissite.com/pages/articles and max depth param of 2', () => {
+    const enteredURL = "https://scrapethissite.com/pages/articles"
+    expect(getAdjustedMaxDepth(enteredURL, 2)).toBe(4);
+  });
+
+});

apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts (new file)
@@ -0,0 +1,12 @@
+
+
+export function getAdjustedMaxDepth(url: string, maxCrawlDepth: number): number {
+  const baseURLDepth = getURLDepth(url);
+  const adjustedMaxDepth = maxCrawlDepth + baseURLDepth;
+  return adjustedMaxDepth;
+}
+
+export function getURLDepth(url: string): number {
+  const pathSplits = new URL(url).pathname.split('/');
+  return pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) - 1;
+}
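
A short worked example of the two helpers, using the same URLs the unit tests above assert on (the relative import path is an assumption for the sketch):

import { getURLDepth, getAdjustedMaxDepth } from './maxDepthUtils';

// getURLDepth counts path segments, so a site root is depth 0:
console.log(getURLDepth('https://www.mendable.ai/'));                  // 0
console.log(getURLDepth('https://scrapethissite.com/pages'));          // 1
console.log(getURLDepth('https://scrapethissite.com/pages/articles')); // 2

// getAdjustedMaxDepth adds the entry URL's own depth to the requested maxDepth, so a crawl
// started at /pages/articles with maxDepth 2 keeps links up to absolute depth 4:
console.log(getAdjustedMaxDepth('https://scrapethissite.com/pages/articles', 2)); // 4
console.log(getAdjustedMaxDepth('https://scrapethissite.com', 0));                // 0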

File diff suppressed because it is too large