Nick:
This commit is contained in:
parent
a96fc5b96d
commit
8a72cf556b
@ -44,7 +44,7 @@ export type WebScraperOptions = {
|
|||||||
limit?: number;
|
limit?: number;
|
||||||
generateImgAltText?: boolean;
|
generateImgAltText?: boolean;
|
||||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||||
fastMode?: boolean; // have a mode of some sort
|
mode?: "default" | "fast"; // have a mode of some sort
|
||||||
};
|
};
|
||||||
pageOptions?: PageOptions;
|
pageOptions?: PageOptions;
|
||||||
extractorOptions?: ExtractorOptions;
|
extractorOptions?: ExtractorOptions;
|
||||||
|
@ -20,7 +20,6 @@ export class WebCrawler {
|
|||||||
private robotsTxtUrl: string;
|
private robotsTxtUrl: string;
|
||||||
private robots: any;
|
private robots: any;
|
||||||
private generateImgAltText: boolean;
|
private generateImgAltText: boolean;
|
||||||
private fastMode: boolean = false;
|
|
||||||
|
|
||||||
constructor({
|
constructor({
|
||||||
initialUrl,
|
initialUrl,
|
||||||
@ -50,7 +49,6 @@ export class WebCrawler {
|
|||||||
this.maxCrawledLinks = maxCrawledLinks ?? limit;
|
this.maxCrawledLinks = maxCrawledLinks ?? limit;
|
||||||
this.maxCrawledDepth = maxCrawledDepth ?? 10;
|
this.maxCrawledDepth = maxCrawledDepth ?? 10;
|
||||||
this.generateImgAltText = generateImgAltText ?? false;
|
this.generateImgAltText = generateImgAltText ?? false;
|
||||||
this.fastMode = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
||||||
@ -231,8 +229,7 @@ export class WebCrawler {
|
|||||||
});
|
});
|
||||||
|
|
||||||
// Create a new list to return to avoid modifying the visited list
|
// Create a new list to return to avoid modifying the visited list
|
||||||
const filteredLinks = links.filter((link) => !this.visited.has(link.url));
|
return links.filter((link) => !this.visited.has(link.url));
|
||||||
return filteredLinks;
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
@ -48,7 +48,7 @@ export class WebScraperDataProvider {
|
|||||||
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
||||||
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
|
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
|
||||||
"gpt-4-turbo";
|
"gpt-4-turbo";
|
||||||
private fastMode: boolean = false;
|
private crawlerMode: string = "default";
|
||||||
|
|
||||||
authorize(): void {
|
authorize(): void {
|
||||||
throw new Error("Method not implemented.");
|
throw new Error("Method not implemented.");
|
||||||
@ -173,7 +173,7 @@ export class WebScraperDataProvider {
|
|||||||
let fastDocs = []
|
let fastDocs = []
|
||||||
let documents = [];
|
let documents = [];
|
||||||
// check if fast mode is enabled and there is html inside the links
|
// check if fast mode is enabled and there is html inside the links
|
||||||
if (this.fastMode && links.some((link) => link.html)) {
|
if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
|
||||||
console.log("Fast mode enabled");
|
console.log("Fast mode enabled");
|
||||||
documents = await this.processLinks(allLinks, inProgress, allHtmls);
|
documents = await this.processLinks(allLinks, inProgress, allHtmls);
|
||||||
|
|
||||||
@ -438,7 +438,7 @@ export class WebScraperDataProvider {
|
|||||||
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
||||||
this.excludes = this.excludes.filter((item) => item !== "");
|
this.excludes = this.excludes.filter((item) => item !== "");
|
||||||
this.fastMode = options.crawlerOptions?.fastMode ?? false;
|
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
||||||
|
|
||||||
// make sure all urls start with https://
|
// make sure all urls start with https://
|
||||||
this.urls = this.urls.map((url) => {
|
this.urls = this.urls.map((url) => {
|
||||||
|
Loading…
Reference in New Issue
Block a user