Nick:
This commit is contained in:
parent
e26008a833
commit
86b8439844
@ -4,7 +4,7 @@ import { URL } from "url";
|
|||||||
import { getLinksFromSitemap } from "./sitemap";
|
import { getLinksFromSitemap } from "./sitemap";
|
||||||
import async from "async";
|
import async from "async";
|
||||||
import { Progress } from "../../lib/entities";
|
import { Progress } from "../../lib/entities";
|
||||||
import { scrapWithScrapingBee } from "./single_url";
|
import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
|
||||||
import robotsParser from "robots-parser";
|
import robotsParser from "robots-parser";
|
||||||
|
|
||||||
export class WebCrawler {
|
export class WebCrawler {
|
||||||
@ -196,7 +196,8 @@ export class WebCrawler {
|
|||||||
let content;
|
let content;
|
||||||
// If it is the first link, fetch with scrapingbee
|
// If it is the first link, fetch with scrapingbee
|
||||||
if (this.visited.size === 1) {
|
if (this.visited.size === 1) {
|
||||||
content = await scrapWithScrapingBee(url, "load");
|
const page = await scrapSingleUrl(url, {includeHtml: true});
|
||||||
|
content = page.html;
|
||||||
} else {
|
} else {
|
||||||
const response = await axios.get(url);
|
const response = await axios.get(url);
|
||||||
content = response.data;
|
content = response.data;
|
||||||
|
@ -140,6 +140,7 @@ export class WebScraperDataProvider {
|
|||||||
generateImgAltText: this.generateImgAltText,
|
generateImgAltText: this.generateImgAltText,
|
||||||
});
|
});
|
||||||
let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
|
let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
|
||||||
|
|
||||||
if (this.returnOnlyUrls) {
|
if (this.returnOnlyUrls) {
|
||||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||||
}
|
}
|
||||||
@ -163,6 +164,7 @@ export class WebScraperDataProvider {
|
|||||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
let documents = await this.processLinks(links, inProgress);
|
let documents = await this.processLinks(links, inProgress);
|
||||||
return this.cacheAndFinalizeDocuments(documents, links);
|
return this.cacheAndFinalizeDocuments(documents, links);
|
||||||
}
|
}
|
||||||
@ -237,6 +239,8 @@ export class WebScraperDataProvider {
|
|||||||
links: string[]
|
links: string[]
|
||||||
): Promise<Document[]> {
|
): Promise<Document[]> {
|
||||||
await this.setCachedDocuments(documents, links);
|
await this.setCachedDocuments(documents, links);
|
||||||
|
documents = this.filterDocsExcludeInclude(documents);
|
||||||
|
documents = this.filterDepth(documents);
|
||||||
documents = this.removeChildLinks(documents);
|
documents = this.removeChildLinks(documents);
|
||||||
return documents.splice(0, this.limit);
|
return documents.splice(0, this.limit);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user