Update index.ts
parent 84cebf618b
commit f1dd97af0f
@@ -6,8 +6,10 @@ import { WebCrawler } from "./crawler";
 import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/gptVision";
 import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor";
-import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
+import {
+  replaceImgPathsWithAbsolutePaths,
+  replacePathsWithAbsolutePaths,
+} from "./utils/replacePaths";
 
 export class WebScraperDataProvider {
   private urls: string[] = [""];
@@ -36,8 +38,6 @@ export class WebScraperDataProvider {
   ): Promise<Document[]> {
     const totalUrls = urls.length;
     let processedUrls = 0;
-    console.log("Converting urls to documents");
-    console.log("Total urls", urls);
     const results: (Document | null)[] = new Array(urls.length).fill(null);
     for (let i = 0; i < urls.length; i += this.concurrentRequests) {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
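
Note on the surrounding context lines: the method shown here (evidently convertUrlsToDocuments, per the removed log line) walks the URL list in slices of this.concurrentRequests, so each slice runs concurrently while slices run back to back. Below is a minimal self-contained sketch of that batching pattern; the scrape callback and the Document shape are stand-ins for illustration, not the project's real API.

type Document = { content: string; metadata: { sourceURL: string } };

async function convertInBatches(
  urls: string[],
  concurrentRequests: number,
  scrape: (url: string) => Promise<Document>
): Promise<(Document | null)[]> {
  // Pre-size the results array so each URL keeps its original index.
  const results: (Document | null)[] = new Array(urls.length).fill(null);
  for (let i = 0; i < urls.length; i += concurrentRequests) {
    const batch = urls.slice(i, i + concurrentRequests);
    // Each batch runs concurrently; batches themselves run sequentially,
    // capping in-flight requests at concurrentRequests.
    await Promise.all(
      batch.map(async (url, j) => {
        try {
          results[i + j] = await scrape(url);
        } catch {
          results[i + j] = null; // keep index alignment on failure
        }
      })
    );
  }
  return results;
}
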
@@ -88,17 +88,21 @@
       }));
     }
 
-    let pdfLinks = links.filter(async (link) => await isUrlAPdf({url: link, fastMode: true}));
+    let pdfLinks = links.filter(
+      async (link) => await isUrlAPdf({ url: link, fastMode: true })
+    );
     let pdfDocuments: Document[] = [];
     for (let pdfLink of pdfLinks) {
       const pdfContent = await fetchAndProcessPdf(pdfLink);
       pdfDocuments.push({
         content: pdfContent,
         metadata: { sourceURL: pdfLink },
-        provider: "web-scraper"
+        provider: "web-scraper",
       });
     }
-    links = links.filter(async (link) => !await isUrlAPdf({url: link, fastMode: true}));
+    links = links.filter(
+      async (link) => !(await isUrlAPdf({ url: link, fastMode: true }))
+    );
 
     let documents = await this.convertUrlsToDocuments(links, inProgress);
     documents = await this.getSitemapData(this.urls[0], documents);
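
One caveat with the reformatted filter calls in this hunk (and the identical pair in the sitemap hunk below): Array.prototype.filter is synchronous, and an async predicate hands it a Promise, which is always truthy. As written, both filters keep every link: pdfLinks wrongly includes non-PDF URLs, and the negated variant removes nothing. A common async-safe alternative resolves the predicates first. This is a sketch under that assumption, with isUrlAPdf's signature inferred from the calls above rather than taken from the project.

async function partitionPdfLinks(
  links: string[],
  isUrlAPdf: (args: { url: string; fastMode: boolean }) => Promise<boolean>
): Promise<{ pdfLinks: string[]; otherLinks: string[] }> {
  // Resolve every predicate up front so the filters below see booleans,
  // not always-truthy Promises.
  const flags = await Promise.all(
    links.map((url) => isUrlAPdf({ url, fastMode: true }))
  );
  return {
    pdfLinks: links.filter((_, i) => flags[i]),
    otherLinks: links.filter((_, i) => !flags[i]),
  };
}
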
@@ -157,21 +161,18 @@
     }
 
     if (this.mode === "single_urls") {
-      console.log("Single urls mode");
       let pdfDocuments: Document[] = [];
       let nonPdfUrls: string[] = [];
       for (let url of this.urls) {
-        console.log("Checking if url is a pdf", url);
-        if (await isUrlAPdf({url: url, fastMode: false})) {
+        if (await isUrlAPdf({ url: url, fastMode: false })) {
           const pdfContent = await fetchAndProcessPdf(url);
           pdfDocuments.push({
             content: pdfContent,
             metadata: { sourceURL: url },
-            provider: "web-scraper"
+            provider: "web-scraper",
           });
         } else {
           nonPdfUrls.push(url);
-          console.log("Fetching and processing url", url);
         }
       }
 
@@ -200,17 +201,21 @@
     }
     if (this.mode === "sitemap") {
      let links = await getLinksFromSitemap(this.urls[0]);
-      let pdfLinks = links.filter(async (link) => await isUrlAPdf({url: link, fastMode: true}));
+      let pdfLinks = links.filter(
+        async (link) => await isUrlAPdf({ url: link, fastMode: true })
+      );
       let pdfDocuments: Document[] = [];
       for (let pdfLink of pdfLinks) {
         const pdfContent = await fetchAndProcessPdf(pdfLink);
         pdfDocuments.push({
           content: pdfContent,
           metadata: { sourceURL: pdfLink },
-          provider: "web-scraper"
+          provider: "web-scraper",
         });
       }
-      links = links.filter(async (link) => !await isUrlAPdf({url: link, fastMode: true}));
+      links = links.filter(
+        async (link) => !(await isUrlAPdf({ url: link, fastMode: true }))
+      );
 
       let documents = await this.convertUrlsToDocuments(
         links.slice(0, this.limit),
@@ -377,8 +382,9 @@
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
-    this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
+    this.pageOptions = options.pageOptions ?? { onlyMainContent: false };
+    this.replaceAllPathsWithAbsolutePaths =
+      options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
 
     //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
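
A side note on the ?? defaults used throughout this hunk (presumably an options setter): nullish coalescing falls back only on null or undefined, so an explicit falsy value from the caller survives, unlike with ||. Hypothetical values for illustration:

const crawlerOptions = { limit: 0, generateImgAltText: false };
const limit = crawlerOptions.limit ?? 10000;                // 0 is kept, not replaced
const altText = crawlerOptions.generateImgAltText ?? true;  // false is kept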