0
This commit is contained in:
Nicolas 2024-05-13 20:51:42 -07:00
parent e26008a833
commit 86b8439844
2 changed files with 7 additions and 2 deletions

View File

@@ -4,7 +4,7 @@ import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap"; import { getLinksFromSitemap } from "./sitemap";
import async from "async"; import async from "async";
import { Progress } from "../../lib/entities"; import { Progress } from "../../lib/entities";
- import { scrapWithScrapingBee } from "./single_url";
+ import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
import robotsParser from "robots-parser"; import robotsParser from "robots-parser";
export class WebCrawler { export class WebCrawler {
@@ -196,7 +196,8 @@ export class WebCrawler {
let content; let content;
// If it is the first link, fetch with scrapingbee // If it is the first link, fetch with scrapingbee
if (this.visited.size === 1) { if (this.visited.size === 1) {
- content = await scrapWithScrapingBee(url, "load");
+ const page = await scrapSingleUrl(url, {includeHtml: true});
+ content = page.html;
} else { } else {
const response = await axios.get(url); const response = await axios.get(url);
content = response.data; content = response.data;

View File

@@ -140,6 +140,7 @@ export class WebScraperDataProvider {
generateImgAltText: this.generateImgAltText, generateImgAltText: this.generateImgAltText,
}); });
let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth); let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
if (this.returnOnlyUrls) { if (this.returnOnlyUrls) {
return this.returnOnlyUrlsResponse(links, inProgress); return this.returnOnlyUrlsResponse(links, inProgress);
} }
@@ -163,6 +164,7 @@ export class WebScraperDataProvider {
return this.returnOnlyUrlsResponse(links, inProgress); return this.returnOnlyUrlsResponse(links, inProgress);
} }
let documents = await this.processLinks(links, inProgress); let documents = await this.processLinks(links, inProgress);
return this.cacheAndFinalizeDocuments(documents, links); return this.cacheAndFinalizeDocuments(documents, links);
} }
@@ -237,6 +239,8 @@ export class WebScraperDataProvider {
links: string[] links: string[]
): Promise<Document[]> { ): Promise<Document[]> {
await this.setCachedDocuments(documents, links); await this.setCachedDocuments(documents, links);
+ documents = this.filterDocsExcludeInclude(documents);
+ documents = this.filterDepth(documents);
documents = this.removeChildLinks(documents); documents = this.removeChildLinks(documents);
return documents.splice(0, this.limit); return documents.splice(0, this.limit);
} }