0
This commit is contained in:
Nicolas 2024-05-13 21:10:58 -07:00
parent a96fc5b96d
commit 8a72cf556b
3 changed files with 5 additions and 8 deletions

View File

@ -44,7 +44,7 @@ export type WebScraperOptions = {
limit?: number;
generateImgAltText?: boolean;
replaceAllPathsWithAbsolutePaths?: boolean;
fastMode?: boolean; // have a mode of some sort
mode?: "default" | "fast"; // have a mode of some sort
};
pageOptions?: PageOptions;
extractorOptions?: ExtractorOptions;

View File

@ -20,7 +20,6 @@ export class WebCrawler {
private robotsTxtUrl: string;
private robots: any;
private generateImgAltText: boolean;
private fastMode: boolean = false;
constructor({
initialUrl,
@ -50,7 +49,6 @@ export class WebCrawler {
this.maxCrawledLinks = maxCrawledLinks ?? limit;
this.maxCrawledDepth = maxCrawledDepth ?? 10;
this.generateImgAltText = generateImgAltText ?? false;
this.fastMode = false;
}
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
@ -231,8 +229,7 @@ export class WebCrawler {
});
// Create a new list to return to avoid modifying the visited list
const filteredLinks = links.filter((link) => !this.visited.has(link.url));
return filteredLinks;
return links.filter((link) => !this.visited.has(link.url));
} catch (error) {
return [];
}

View File

@ -48,7 +48,7 @@ export class WebScraperDataProvider {
private replaceAllPathsWithAbsolutePaths?: boolean = false;
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
"gpt-4-turbo";
private fastMode: boolean = false;
private crawlerMode: string = "default";
authorize(): void {
throw new Error("Method not implemented.");
@ -173,7 +173,7 @@ export class WebScraperDataProvider {
let fastDocs = []
let documents = [];
// check if fast mode is enabled and there is html inside the links
if (this.fastMode && links.some((link) => link.html)) {
if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
console.log("Fast mode enabled");
documents = await this.processLinks(allLinks, inProgress, allHtmls);
@ -438,7 +438,7 @@ export class WebScraperDataProvider {
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason an empty string was being injected into the excludes and breaking everything. Don't have time to find the source of the issue, so adding this check
this.excludes = this.excludes.filter((item) => item !== "");
this.fastMode = options.crawlerOptions?.fastMode ?? false;
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
// make sure all urls start with https://
this.urls = this.urls.map((url) => {