0
This commit is contained in:
Nicolas 2024-05-13 21:10:58 -07:00
parent a96fc5b96d
commit 8a72cf556b
3 changed files with 5 additions and 8 deletions

View File

@@ -44,7 +44,7 @@ export type WebScraperOptions = {
limit?: number; limit?: number;
generateImgAltText?: boolean; generateImgAltText?: boolean;
replaceAllPathsWithAbsolutePaths?: boolean; replaceAllPathsWithAbsolutePaths?: boolean;
fastMode?: boolean; // have a mode of some sort mode?: "default" | "fast"; // have a mode of some sort
}; };
pageOptions?: PageOptions; pageOptions?: PageOptions;
extractorOptions?: ExtractorOptions; extractorOptions?: ExtractorOptions;

View File

@@ -20,7 +20,6 @@ export class WebCrawler {
private robotsTxtUrl: string; private robotsTxtUrl: string;
private robots: any; private robots: any;
private generateImgAltText: boolean; private generateImgAltText: boolean;
private fastMode: boolean = false;
constructor({ constructor({
initialUrl, initialUrl,
@@ -50,7 +49,6 @@ export class WebCrawler {
this.maxCrawledLinks = maxCrawledLinks ?? limit; this.maxCrawledLinks = maxCrawledLinks ?? limit;
this.maxCrawledDepth = maxCrawledDepth ?? 10; this.maxCrawledDepth = maxCrawledDepth ?? 10;
this.generateImgAltText = generateImgAltText ?? false; this.generateImgAltText = generateImgAltText ?? false;
this.fastMode = false;
} }
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
@@ -231,8 +229,7 @@ export class WebCrawler {
}); });
// Create a new list to return to avoid modifying the visited list // Create a new list to return to avoid modifying the visited list
const filteredLinks = links.filter((link) => !this.visited.has(link.url)); return links.filter((link) => !this.visited.has(link.url));
return filteredLinks;
} catch (error) { } catch (error) {
return []; return [];
} }

View File

@@ -48,7 +48,7 @@ export class WebScraperDataProvider {
private replaceAllPathsWithAbsolutePaths?: boolean = false; private replaceAllPathsWithAbsolutePaths?: boolean = false;
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
"gpt-4-turbo"; "gpt-4-turbo";
private fastMode: boolean = false; private crawlerMode: string = "default";
authorize(): void { authorize(): void {
throw new Error("Method not implemented."); throw new Error("Method not implemented.");
@@ -173,7 +173,7 @@ export class WebScraperDataProvider {
let fastDocs = [] let fastDocs = []
let documents = []; let documents = [];
// check if fast mode is enabled and there is html inside the links // check if fast mode is enabled and there is html inside the links
if (this.fastMode && links.some((link) => link.html)) { if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
console.log("Fast mode enabled"); console.log("Fast mode enabled");
documents = await this.processLinks(allLinks, inProgress, allHtmls); documents = await this.processLinks(allLinks, inProgress, allHtmls);
@@ -438,7 +438,7 @@ export class WebScraperDataProvider {
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
this.excludes = this.excludes.filter((item) => item !== ""); this.excludes = this.excludes.filter((item) => item !== "");
this.fastMode = options.crawlerOptions?.fastMode ?? false; this.crawlerMode = options.crawlerOptions?.mode ?? "default";
// make sure all urls start with https:// // make sure all urls start with https://
this.urls = this.urls.map((url) => { this.urls = this.urls.map((url) => {