Nick:
This commit is contained in:
parent
8a72cf556b
commit
7f31959be7
@ -15,7 +15,7 @@ export class WebCrawler {
|
|||||||
private maxCrawledLinks: number;
|
private maxCrawledLinks: number;
|
||||||
private maxCrawledDepth: number;
|
private maxCrawledDepth: number;
|
||||||
private visited: Set<string> = new Set();
|
private visited: Set<string> = new Set();
|
||||||
private crawledUrls: { url: string, html: string }[] = [];
|
private crawledUrls: Set<{ url: string, html: string }> = new Set();
|
||||||
private limit: number;
|
private limit: number;
|
||||||
private robotsTxtUrl: string;
|
private robotsTxtUrl: string;
|
||||||
private robots: any;
|
private robots: any;
|
||||||
@ -136,24 +136,24 @@ export class WebCrawler {
|
|||||||
inProgress?: (progress: Progress) => void
|
inProgress?: (progress: Progress) => void
|
||||||
): Promise<{ url: string, html: string }[]> {
|
): Promise<{ url: string, html: string }[]> {
|
||||||
const queue = async.queue(async (task: string, callback) => {
|
const queue = async.queue(async (task: string, callback) => {
|
||||||
if (this.crawledUrls.length >= this.maxCrawledLinks) {
|
if (this.crawledUrls.size >= this.maxCrawledLinks) {
|
||||||
if (callback && typeof callback === "function") {
|
if (callback && typeof callback === "function") {
|
||||||
callback();
|
callback();
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const newUrls = await this.crawl(task);
|
const newUrls = await this.crawl(task);
|
||||||
newUrls.forEach((page) => this.crawledUrls.push(page));
|
newUrls.forEach((page) => this.crawledUrls.add(page));
|
||||||
if (inProgress && newUrls.length > 0) {
|
if (inProgress && newUrls.length > 0) {
|
||||||
inProgress({
|
inProgress({
|
||||||
current: this.crawledUrls.length,
|
current: this.crawledUrls.size,
|
||||||
total: this.maxCrawledLinks,
|
total: this.maxCrawledLinks,
|
||||||
status: "SCRAPING",
|
status: "SCRAPING",
|
||||||
currentDocumentUrl: newUrls[newUrls.length - 1].url,
|
currentDocumentUrl: newUrls[newUrls.length - 1].url,
|
||||||
});
|
});
|
||||||
} else if (inProgress) {
|
} else if (inProgress) {
|
||||||
inProgress({
|
inProgress({
|
||||||
current: this.crawledUrls.length,
|
current: this.crawledUrls.size,
|
||||||
total: this.maxCrawledLinks,
|
total: this.maxCrawledLinks,
|
||||||
status: "SCRAPING",
|
status: "SCRAPING",
|
||||||
currentDocumentUrl: task,
|
currentDocumentUrl: task,
|
||||||
@ -175,7 +175,7 @@ export class WebCrawler {
|
|||||||
}
|
}
|
||||||
);
|
);
|
||||||
await queue.drain();
|
await queue.drain();
|
||||||
return this.crawledUrls;
|
return Array.from(this.crawledUrls);
|
||||||
}
|
}
|
||||||
|
|
||||||
async crawl(url: string): Promise<{url: string, html: string}[]> {
|
async crawl(url: string): Promise<{url: string, html: string}[]> {
|
||||||
@ -311,3 +311,4 @@ export class WebCrawler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -277,8 +277,6 @@ export class WebScraperDataProvider {
|
|||||||
): Promise<Document[]> {
|
): Promise<Document[]> {
|
||||||
await this.setCachedDocuments(documents, links);
|
await this.setCachedDocuments(documents, links);
|
||||||
documents = this.removeChildLinks(documents);
|
documents = this.removeChildLinks(documents);
|
||||||
documents = this.filterDocsExcludeInclude(documents);
|
|
||||||
documents = this.filterDepth(documents);
|
|
||||||
return documents.splice(0, this.limit);
|
return documents.splice(0, this.limit);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user