Nick:
This commit is contained in:
parent
a96fc5b96d
commit
8a72cf556b
@ -44,7 +44,7 @@ export type WebScraperOptions = {
|
||||
limit?: number;
|
||||
generateImgAltText?: boolean;
|
||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||
fastMode?: boolean; // have a mode of some sort
|
||||
mode?: "default" | "fast"; // have a mode of some sort
|
||||
};
|
||||
pageOptions?: PageOptions;
|
||||
extractorOptions?: ExtractorOptions;
|
||||
|
@ -20,7 +20,6 @@ export class WebCrawler {
|
||||
private robotsTxtUrl: string;
|
||||
private robots: any;
|
||||
private generateImgAltText: boolean;
|
||||
private fastMode: boolean = false;
|
||||
|
||||
constructor({
|
||||
initialUrl,
|
||||
@ -50,7 +49,6 @@ export class WebCrawler {
|
||||
this.maxCrawledLinks = maxCrawledLinks ?? limit;
|
||||
this.maxCrawledDepth = maxCrawledDepth ?? 10;
|
||||
this.generateImgAltText = generateImgAltText ?? false;
|
||||
this.fastMode = false;
|
||||
}
|
||||
|
||||
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
||||
@ -231,8 +229,7 @@ export class WebCrawler {
|
||||
});
|
||||
|
||||
// Create a new list to return to avoid modifying the visited list
|
||||
const filteredLinks = links.filter((link) => !this.visited.has(link.url));
|
||||
return filteredLinks;
|
||||
return links.filter((link) => !this.visited.has(link.url));
|
||||
} catch (error) {
|
||||
return [];
|
||||
}
|
||||
|
@ -48,7 +48,7 @@ export class WebScraperDataProvider {
|
||||
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
||||
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
|
||||
"gpt-4-turbo";
|
||||
private fastMode: boolean = false;
|
||||
private crawlerMode: string = "default";
|
||||
|
||||
authorize(): void {
|
||||
throw new Error("Method not implemented.");
|
||||
@ -173,7 +173,7 @@ export class WebScraperDataProvider {
|
||||
let fastDocs = []
|
||||
let documents = [];
|
||||
// check if fast mode is enabled and there is html inside the links
|
||||
if (this.fastMode && links.some((link) => link.html)) {
|
||||
if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
|
||||
console.log("Fast mode enabled");
|
||||
documents = await this.processLinks(allLinks, inProgress, allHtmls);
|
||||
|
||||
@ -438,7 +438,7 @@ export class WebScraperDataProvider {
|
||||
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find the source of the issue, so adding this check
|
||||
this.excludes = this.excludes.filter((item) => item !== "");
|
||||
this.fastMode = options.crawlerOptions?.fastMode ?? false;
|
||||
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
||||
|
||||
// make sure all urls start with https://
|
||||
this.urls = this.urls.map((url) => {
|
||||
|
Loading…
Reference in New Issue
Block a user