Update index.ts
This commit is contained in:
parent
ef6db3b7c2
commit
00373228fa
@ -67,211 +67,138 @@ export class WebScraperDataProvider {
|
||||
useCaching: boolean = false,
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
this.validateInitialUrl();
|
||||
|
||||
if (!useCaching) {
|
||||
return this.processDocumentsWithoutCache(inProgress);
|
||||
}
|
||||
|
||||
return this.processDocumentsWithCache(inProgress);
|
||||
}
|
||||
|
||||
private validateInitialUrl(): void {
|
||||
if (this.urls[0].trim() === "") {
|
||||
throw new Error("Url is required");
|
||||
}
|
||||
}
|
||||
|
||||
if (!useCaching) {
|
||||
if (this.mode === "crawl") {
|
||||
const crawler = new WebCrawler({
|
||||
initialUrl: this.urls[0],
|
||||
includes: this.includes,
|
||||
excludes: this.excludes,
|
||||
maxCrawledLinks: this.maxCrawledLinks,
|
||||
limit: this.limit,
|
||||
generateImgAltText: this.generateImgAltText,
|
||||
});
|
||||
let links = await crawler.start(inProgress, 5, this.limit);
|
||||
if (this.returnOnlyUrls) {
|
||||
inProgress({
|
||||
current: links.length,
|
||||
total: links.length,
|
||||
status: "COMPLETED",
|
||||
currentDocumentUrl: this.urls[0],
|
||||
});
|
||||
return links.map((url) => ({
|
||||
content: "",
|
||||
markdown: "",
|
||||
metadata: { sourceURL: url },
|
||||
}));
|
||||
}
|
||||
private async processDocumentsWithoutCache(inProgress?: (progress: Progress) => void): Promise<Document[]> {
|
||||
switch (this.mode) {
|
||||
case "crawl":
|
||||
return this.handleCrawlMode(inProgress);
|
||||
case "single_urls":
|
||||
return this.handleSingleUrlsMode(inProgress);
|
||||
case "sitemap":
|
||||
return this.handleSitemapMode(inProgress);
|
||||
default:
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
||||
let pdfDocuments: Document[] = [];
|
||||
for (let pdfLink of pdfLinks) {
|
||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||
pdfDocuments.push({
|
||||
content: pdfContent,
|
||||
metadata: { sourceURL: pdfLink },
|
||||
provider: "web-scraper"
|
||||
});
|
||||
}
|
||||
links = links.filter((link) => !link.endsWith(".pdf"));
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
||||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
|
||||
if (this.replaceAllPathsWithAbsolutePaths) {
|
||||
documents = replacePathsWithAbsolutePaths(documents);
|
||||
} else {
|
||||
documents = replaceImgPathsWithAbsolutePaths(documents);
|
||||
}
|
||||
|
||||
if (this.generateImgAltText) {
|
||||
documents = await this.generatesImgAltText(documents);
|
||||
}
|
||||
documents = documents.concat(pdfDocuments);
|
||||
|
||||
// CACHING DOCUMENTS
|
||||
// - parent document
|
||||
const cachedParentDocumentString = await getValue(
|
||||
"web-scraper-cache:" + this.normalizeUrl(this.urls[0])
|
||||
);
|
||||
if (cachedParentDocumentString != null) {
|
||||
let cachedParentDocument = JSON.parse(cachedParentDocumentString);
|
||||
if (
|
||||
!cachedParentDocument.childrenLinks ||
|
||||
cachedParentDocument.childrenLinks.length < links.length - 1
|
||||
) {
|
||||
cachedParentDocument.childrenLinks = links.filter(
|
||||
(link) => link !== this.urls[0]
|
||||
);
|
||||
await setValue(
|
||||
"web-scraper-cache:" + this.normalizeUrl(this.urls[0]),
|
||||
JSON.stringify(cachedParentDocument),
|
||||
60 * 60 * 24 * 10
|
||||
); // 10 days
|
||||
}
|
||||
} else {
|
||||
let parentDocument = documents.filter(
|
||||
(document) =>
|
||||
this.normalizeUrl(document.metadata.sourceURL) ===
|
||||
this.normalizeUrl(this.urls[0])
|
||||
);
|
||||
await this.setCachedDocuments(parentDocument, links);
|
||||
}
|
||||
|
||||
await this.setCachedDocuments(
|
||||
documents.filter(
|
||||
(document) =>
|
||||
this.normalizeUrl(document.metadata.sourceURL) !==
|
||||
this.normalizeUrl(this.urls[0])
|
||||
),
|
||||
[]
|
||||
);
|
||||
documents = this.removeChildLinks(documents);
|
||||
documents = documents.splice(0, this.limit);
|
||||
return documents;
|
||||
}
|
||||
|
||||
if (this.mode === "single_urls") {
|
||||
let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf"));
|
||||
let pdfDocuments: Document[] = [];
|
||||
for (let pdfLink of pdfLinks) {
|
||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||
pdfDocuments.push({
|
||||
content: pdfContent,
|
||||
metadata: { sourceURL: pdfLink },
|
||||
provider: "web-scraper"
|
||||
});
|
||||
}
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(
|
||||
this.urls.filter((link) => !link.endsWith(".pdf")),
|
||||
inProgress
|
||||
);
|
||||
|
||||
if (this.replaceAllPathsWithAbsolutePaths) {
|
||||
documents = replacePathsWithAbsolutePaths(documents);
|
||||
} else {
|
||||
documents = replaceImgPathsWithAbsolutePaths(documents);
|
||||
}
|
||||
|
||||
if (this.generateImgAltText) {
|
||||
documents = await this.generatesImgAltText(documents);
|
||||
}
|
||||
const baseUrl = new URL(this.urls[0]).origin;
|
||||
documents = await this.getSitemapData(baseUrl, documents);
|
||||
documents = documents.concat(pdfDocuments);
|
||||
|
||||
if(this.extractorOptions.mode === "llm-extraction") {
|
||||
documents = await generateCompletions(
|
||||
documents,
|
||||
this.extractorOptions
|
||||
)
|
||||
}
|
||||
|
||||
await this.setCachedDocuments(documents);
|
||||
documents = this.removeChildLinks(documents);
|
||||
documents = documents.splice(0, this.limit);
|
||||
return documents;
|
||||
}
|
||||
if (this.mode === "sitemap") {
|
||||
let links = await getLinksFromSitemap(this.urls[0]);
|
||||
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
||||
let pdfDocuments: Document[] = [];
|
||||
for (let pdfLink of pdfLinks) {
|
||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||
pdfDocuments.push({
|
||||
content: pdfContent,
|
||||
metadata: { sourceURL: pdfLink },
|
||||
provider: "web-scraper"
|
||||
});
|
||||
}
|
||||
links = links.filter((link) => !link.endsWith(".pdf"));
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(
|
||||
links.slice(0, this.limit),
|
||||
inProgress
|
||||
);
|
||||
|
||||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
|
||||
if (this.replaceAllPathsWithAbsolutePaths) {
|
||||
documents = replacePathsWithAbsolutePaths(documents);
|
||||
} else {
|
||||
documents = replaceImgPathsWithAbsolutePaths(documents);
|
||||
}
|
||||
|
||||
if (this.generateImgAltText) {
|
||||
documents = await this.generatesImgAltText(documents);
|
||||
}
|
||||
documents = documents.concat(pdfDocuments);
|
||||
|
||||
await this.setCachedDocuments(documents);
|
||||
documents = this.removeChildLinks(documents);
|
||||
documents = documents.splice(0, this.limit);
|
||||
return documents;
|
||||
}
|
||||
|
||||
return [];
|
||||
private async handleCrawlMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
|
||||
const crawler = new WebCrawler({
|
||||
initialUrl: this.urls[0],
|
||||
includes: this.includes,
|
||||
excludes: this.excludes,
|
||||
maxCrawledLinks: this.maxCrawledLinks,
|
||||
limit: this.limit,
|
||||
generateImgAltText: this.generateImgAltText,
|
||||
});
|
||||
let links = await crawler.start(inProgress, 5, this.limit);
|
||||
if (this.returnOnlyUrls) {
|
||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||
}
|
||||
|
||||
let documents = await this.getCachedDocuments(
|
||||
this.urls.slice(0, this.limit)
|
||||
);
|
||||
let documents = await this.processLinks(links, inProgress);
|
||||
return this.cacheAndFinalizeDocuments(documents, links);
|
||||
}
|
||||
|
||||
private async handleSingleUrlsMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
|
||||
let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
|
||||
documents = await this.applyPathReplacements(documents);
|
||||
documents = await this.applyImgAltText(documents);
|
||||
return documents;
|
||||
}
|
||||
|
||||
private async handleSitemapMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
|
||||
let links = await getLinksFromSitemap(this.urls[0]);
|
||||
if (this.returnOnlyUrls) {
|
||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||
}
|
||||
|
||||
let documents = await this.processLinks(links, inProgress);
|
||||
return this.cacheAndFinalizeDocuments(documents, links);
|
||||
}
|
||||
|
||||
private async returnOnlyUrlsResponse(links: string[], inProgress?: (progress: Progress) => void): Promise<Document[]> {
|
||||
inProgress?.({
|
||||
current: links.length,
|
||||
total: links.length,
|
||||
status: "COMPLETED",
|
||||
currentDocumentUrl: this.urls[0],
|
||||
});
|
||||
return links.map(url => ({
|
||||
content: "",
|
||||
markdown: "",
|
||||
metadata: { sourceURL: url },
|
||||
}));
|
||||
}
|
||||
|
||||
private async processLinks(links: string[], inProgress?: (progress: Progress) => void): Promise<Document[]> {
|
||||
let pdfLinks = links.filter(link => link.endsWith(".pdf"));
|
||||
let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
|
||||
links = links.filter(link => !link.endsWith(".pdf"));
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
||||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
documents = this.applyPathReplacements(documents);
|
||||
documents = await this.applyImgAltText(documents);
|
||||
return documents.concat(pdfDocuments);
|
||||
}
|
||||
|
||||
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
|
||||
return Promise.all(pdfLinks.map(async pdfLink => {
|
||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||
return {
|
||||
content: pdfContent,
|
||||
metadata: { sourceURL: pdfLink },
|
||||
provider: "web-scraper"
|
||||
};
|
||||
}));
|
||||
}
|
||||
|
||||
private applyPathReplacements(documents: Document[]): Document[] {
|
||||
return this.replaceAllPathsWithAbsolutePaths ? replacePathsWithAbsolutePaths(documents) : replaceImgPathsWithAbsolutePaths(documents);
|
||||
}
|
||||
|
||||
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
|
||||
return this.generateImgAltText ? this.generatesImgAltText(documents) : documents;
|
||||
}
|
||||
|
||||
private async cacheAndFinalizeDocuments(documents: Document[], links: string[]): Promise<Document[]> {
|
||||
await this.setCachedDocuments(documents, links);
|
||||
documents = this.removeChildLinks(documents);
|
||||
return documents.splice(0, this.limit);
|
||||
}
|
||||
|
||||
private async processDocumentsWithCache(inProgress?: (progress: Progress) => void): Promise<Document[]> {
|
||||
let documents = await this.getCachedDocuments(this.urls.slice(0, this.limit));
|
||||
if (documents.length < this.limit) {
|
||||
const newDocuments: Document[] = await this.getDocuments(
|
||||
false,
|
||||
inProgress
|
||||
);
|
||||
newDocuments.forEach((doc) => {
|
||||
if (
|
||||
!documents.some(
|
||||
(d) =>
|
||||
this.normalizeUrl(d.metadata.sourceURL) ===
|
||||
this.normalizeUrl(doc.metadata?.sourceURL)
|
||||
)
|
||||
) {
|
||||
documents.push(doc);
|
||||
}
|
||||
});
|
||||
const newDocuments: Document[] = await this.getDocuments(false, inProgress);
|
||||
documents = this.mergeNewDocuments(documents, newDocuments);
|
||||
}
|
||||
documents = this.filterDocsExcludeInclude(documents);
|
||||
documents = this.removeChildLinks(documents);
|
||||
documents = documents.splice(0, this.limit);
|
||||
return documents;
|
||||
return documents.splice(0, this.limit);
|
||||
}
|
||||
|
||||
private mergeNewDocuments(existingDocuments: Document[], newDocuments: Document[]): Document[] {
|
||||
newDocuments.forEach(doc => {
|
||||
if (!existingDocuments.some(d => this.normalizeUrl(d.metadata.sourceURL) === this.normalizeUrl(doc.metadata?.sourceURL))) {
|
||||
existingDocuments.push(doc);
|
||||
}
|
||||
});
|
||||
return existingDocuments;
|
||||
}
|
||||
|
||||
private filterDocsExcludeInclude(documents: Document[]): Document[] {
|
||||
|
Loading…
Reference in New Issue
Block a user