From 8a72cf556bf8cff1b21983a8fd50f56abc2ec8af Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 13 May 2024 21:10:58 -0700 Subject: [PATCH] Nick: --- apps/api/src/lib/entities.ts | 2 +- apps/api/src/scraper/WebScraper/crawler.ts | 5 +---- apps/api/src/scraper/WebScraper/index.ts | 6 +++--- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 0c34126..15550be 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -44,7 +44,7 @@ export type WebScraperOptions = { limit?: number; generateImgAltText?: boolean; replaceAllPathsWithAbsolutePaths?: boolean; - fastMode?: boolean; // have a mode of some sort + mode?: "default" | "fast"; // have a mode of some sort }; pageOptions?: PageOptions; extractorOptions?: ExtractorOptions; diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 25f2e9d..4509531 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -20,7 +20,6 @@ export class WebCrawler { private robotsTxtUrl: string; private robots: any; private generateImgAltText: boolean; - private fastMode: boolean = false; constructor({ initialUrl, @@ -50,7 +49,6 @@ export class WebCrawler { this.maxCrawledLinks = maxCrawledLinks ?? limit; this.maxCrawledDepth = maxCrawledDepth ?? 10; this.generateImgAltText = generateImgAltText ?? false; - this.fastMode = false; } private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { @@ -231,8 +229,7 @@ export class WebCrawler { }); // Create a new list to return to avoid modifying the visited list - const filteredLinks = links.filter((link) => !this.visited.has(link.url)); - return filteredLinks; + return links.filter((link) => !this.visited.has(link.url)); } catch (error) { return []; } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 9221666..1eeb65f 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -48,7 +48,7 @@ export class WebScraperDataProvider { private replaceAllPathsWithAbsolutePaths?: boolean = false; private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo"; - private fastMode: boolean = false; + private crawlerMode: string = "default"; authorize(): void { throw new Error("Method not implemented."); @@ -173,7 +173,7 @@ export class WebScraperDataProvider { let fastDocs = [] let documents = []; // check if fast mode is enabled and there is html inside the links - if (this.fastMode && links.some((link) => link.html)) { + if (this.crawlerMode === "fast" && links.some((link) => link.html)) { console.log("Fast mode enabled"); documents = await this.processLinks(allLinks, inProgress, allHtmls); @@ -438,7 +438,7 @@ export class WebScraperDataProvider { this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); - this.fastMode = options.crawlerOptions?.fastMode ?? false; + this.crawlerMode = options.crawlerOptions?.mode ?? "default"; // make sure all urls start with https:// this.urls = this.urls.map((url) => {