Update index.ts

Nicolas 2024-05-04 11:53:16 -07:00
parent ef6db3b7c2
commit 00373228fa
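
For orientation: the two hunks below refactor WebScraperDataProvider.getDocuments from one monolithic method into focused private helpers, without changing the public entry point. A minimal caller sketch, assuming only what is visible in this diff (the getDocuments signature and the Progress fields); the constructor call is hypothetical:

    // Hypothetical usage sketch — only getDocuments(useCaching, inProgress) and the
    // Progress fields (current, total, status, currentDocumentUrl) appear in this diff.
    const provider = new WebScraperDataProvider(/* mode, urls, limit, ... */);
    const documents: Document[] = await provider.getDocuments(false, (progress) => {
      console.log(`${progress.current}/${progress.total} ${progress.status}`);
    });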


@@ -67,13 +67,35 @@ export class WebScraperDataProvider {
     useCaching: boolean = false,
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
+    this.validateInitialUrl();
+
+    if (!useCaching) {
+      return this.processDocumentsWithoutCache(inProgress);
+    }
+
+    return this.processDocumentsWithCache(inProgress);
+  }
+
+  private validateInitialUrl(): void {
     if (this.urls[0].trim() === "") {
       throw new Error("Url is required");
     }
+  }
 
-    if (!useCaching) {
-      if (this.mode === "crawl") {
+  private async processDocumentsWithoutCache(inProgress?: (progress: Progress) => void): Promise<Document[]> {
+    switch (this.mode) {
+      case "crawl":
+        return this.handleCrawlMode(inProgress);
+      case "single_urls":
+        return this.handleSingleUrlsMode(inProgress);
+      case "sitemap":
+        return this.handleSitemapMode(inProgress);
+      default:
+        return [];
+    }
+  }
+
+  private async handleCrawlMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
     const crawler = new WebCrawler({
       initialUrl: this.urls[0],
       includes: this.includes,
@@ -84,194 +106,99 @@ export class WebScraperDataProvider {
     });
     let links = await crawler.start(inProgress, 5, this.limit);
     if (this.returnOnlyUrls) {
-      inProgress({
+      return this.returnOnlyUrlsResponse(links, inProgress);
+    }
+
+    let documents = await this.processLinks(links, inProgress);
+    return this.cacheAndFinalizeDocuments(documents, links);
+  }
+
+  private async handleSingleUrlsMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
+    let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
+    documents = await this.applyPathReplacements(documents);
+    documents = await this.applyImgAltText(documents);
+    return documents;
+  }
+
+  private async handleSitemapMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
+    let links = await getLinksFromSitemap(this.urls[0]);
+    if (this.returnOnlyUrls) {
+      return this.returnOnlyUrlsResponse(links, inProgress);
+    }
+
+    let documents = await this.processLinks(links, inProgress);
+    return this.cacheAndFinalizeDocuments(documents, links);
+  }
+
+  private async returnOnlyUrlsResponse(links: string[], inProgress?: (progress: Progress) => void): Promise<Document[]> {
+    inProgress?.({
       current: links.length,
       total: links.length,
       status: "COMPLETED",
       currentDocumentUrl: this.urls[0],
     });
-      return links.map((url) => ({
+    return links.map(url => ({
       content: "",
       markdown: "",
       metadata: { sourceURL: url },
     }));
   }
 
-    let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
-    let pdfDocuments: Document[] = [];
-    for (let pdfLink of pdfLinks) {
-      const pdfContent = await fetchAndProcessPdf(pdfLink);
-      pdfDocuments.push({
-        content: pdfContent,
-        metadata: { sourceURL: pdfLink },
-        provider: "web-scraper"
-      });
-    }
-    links = links.filter((link) => !link.endsWith(".pdf"));
+  private async processLinks(links: string[], inProgress?: (progress: Progress) => void): Promise<Document[]> {
+    let pdfLinks = links.filter(link => link.endsWith(".pdf"));
+    let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
+    links = links.filter(link => !link.endsWith(".pdf"));
 
     let documents = await this.convertUrlsToDocuments(links, inProgress);
     documents = await this.getSitemapData(this.urls[0], documents);
-    if (this.replaceAllPathsWithAbsolutePaths) {
-      documents = replacePathsWithAbsolutePaths(documents);
-    } else {
-      documents = replaceImgPathsWithAbsolutePaths(documents);
+    documents = this.applyPathReplacements(documents);
+    documents = await this.applyImgAltText(documents);
+    return documents.concat(pdfDocuments);
   }
 
-    if (this.generateImgAltText) {
-      documents = await this.generatesImgAltText(documents);
-    }
-    documents = documents.concat(pdfDocuments);
-
-    // CACHING DOCUMENTS
-    // - parent document
-    const cachedParentDocumentString = await getValue(
-      "web-scraper-cache:" + this.normalizeUrl(this.urls[0])
-    );
-    if (cachedParentDocumentString != null) {
-      let cachedParentDocument = JSON.parse(cachedParentDocumentString);
-      if (
-        !cachedParentDocument.childrenLinks ||
-        cachedParentDocument.childrenLinks.length < links.length - 1
-      ) {
-        cachedParentDocument.childrenLinks = links.filter(
-          (link) => link !== this.urls[0]
-        );
-        await setValue(
-          "web-scraper-cache:" + this.normalizeUrl(this.urls[0]),
-          JSON.stringify(cachedParentDocument),
-          60 * 60 * 24 * 10
-        ); // 10 days
-      }
-    } else {
-      let parentDocument = documents.filter(
-        (document) =>
-          this.normalizeUrl(document.metadata.sourceURL) ===
-          this.normalizeUrl(this.urls[0])
-      );
-      await this.setCachedDocuments(parentDocument, links);
-    }
-    await this.setCachedDocuments(
-      documents.filter(
-        (document) =>
-          this.normalizeUrl(document.metadata.sourceURL) !==
-          this.normalizeUrl(this.urls[0])
-      ),
-      []
-    );
-    documents = this.removeChildLinks(documents);
-    documents = documents.splice(0, this.limit);
-    return documents;
-  }
-
-  if (this.mode === "single_urls") {
-    let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf"));
-    let pdfDocuments: Document[] = [];
-    for (let pdfLink of pdfLinks) {
+  private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
+    return Promise.all(pdfLinks.map(async pdfLink => {
       const pdfContent = await fetchAndProcessPdf(pdfLink);
-      pdfDocuments.push({
+      return {
         content: pdfContent,
         metadata: { sourceURL: pdfLink },
         provider: "web-scraper"
-      });
-    }
+      };
+    }));
   }
 
-    let documents = await this.convertUrlsToDocuments(
-      this.urls.filter((link) => !link.endsWith(".pdf")),
-      inProgress
-    );
-    if (this.replaceAllPathsWithAbsolutePaths) {
-      documents = replacePathsWithAbsolutePaths(documents);
-    } else {
-      documents = replaceImgPathsWithAbsolutePaths(documents);
+  private applyPathReplacements(documents: Document[]): Document[] {
+    return this.replaceAllPathsWithAbsolutePaths ? replacePathsWithAbsolutePaths(documents) : replaceImgPathsWithAbsolutePaths(documents);
   }
 
-    if (this.generateImgAltText) {
-      documents = await this.generatesImgAltText(documents);
-    }
-    const baseUrl = new URL(this.urls[0]).origin;
-    documents = await this.getSitemapData(baseUrl, documents);
-    documents = documents.concat(pdfDocuments);
-    if (this.extractorOptions.mode === "llm-extraction") {
-      documents = await generateCompletions(
-        documents,
-        this.extractorOptions
-      )
+  private async applyImgAltText(documents: Document[]): Promise<Document[]> {
+    return this.generateImgAltText ? this.generatesImgAltText(documents) : documents;
   }
 
-    await this.setCachedDocuments(documents);
+  private async cacheAndFinalizeDocuments(documents: Document[], links: string[]): Promise<Document[]> {
+    await this.setCachedDocuments(documents, links);
     documents = this.removeChildLinks(documents);
-    documents = documents.splice(0, this.limit);
-    return documents;
-  }
-
-  if (this.mode === "sitemap") {
-    let links = await getLinksFromSitemap(this.urls[0]);
-    let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
-    let pdfDocuments: Document[] = [];
-    for (let pdfLink of pdfLinks) {
-      const pdfContent = await fetchAndProcessPdf(pdfLink);
-      pdfDocuments.push({
-        content: pdfContent,
-        metadata: { sourceURL: pdfLink },
-        provider: "web-scraper"
-      });
-    }
-    links = links.filter((link) => !link.endsWith(".pdf"));
-    let documents = await this.convertUrlsToDocuments(
-      links.slice(0, this.limit),
-      inProgress
-    );
-    documents = await this.getSitemapData(this.urls[0], documents);
-    if (this.replaceAllPathsWithAbsolutePaths) {
-      documents = replacePathsWithAbsolutePaths(documents);
-    } else {
-      documents = replaceImgPathsWithAbsolutePaths(documents);
+    return documents.splice(0, this.limit);
   }
 
-    if (this.generateImgAltText) {
-      documents = await this.generatesImgAltText(documents);
-    }
-    documents = documents.concat(pdfDocuments);
-    await this.setCachedDocuments(documents);
-    documents = this.removeChildLinks(documents);
-    documents = documents.splice(0, this.limit);
-    return documents;
-  }
-  return [];
-  }
-
-  let documents = await this.getCachedDocuments(
-    this.urls.slice(0, this.limit)
-  );
+  private async processDocumentsWithCache(inProgress?: (progress: Progress) => void): Promise<Document[]> {
+    let documents = await this.getCachedDocuments(this.urls.slice(0, this.limit));
     if (documents.length < this.limit) {
-      const newDocuments: Document[] = await this.getDocuments(
-        false,
-        inProgress
-      );
-      newDocuments.forEach((doc) => {
-        if (
-          !documents.some(
-            (d) =>
-              this.normalizeUrl(d.metadata.sourceURL) ===
-              this.normalizeUrl(doc.metadata?.sourceURL)
-          )
-        ) {
-          documents.push(doc);
-        }
-      });
+      const newDocuments: Document[] = await this.getDocuments(false, inProgress);
+      documents = this.mergeNewDocuments(documents, newDocuments);
     }
     documents = this.filterDocsExcludeInclude(documents);
     documents = this.removeChildLinks(documents);
-    documents = documents.splice(0, this.limit);
-    return documents;
+    return documents.splice(0, this.limit);
+  }
+
+  private mergeNewDocuments(existingDocuments: Document[], newDocuments: Document[]): Document[] {
+    newDocuments.forEach(doc => {
+      if (!existingDocuments.some(d => this.normalizeUrl(d.metadata.sourceURL) === this.normalizeUrl(doc.metadata?.sourceURL))) {
+        existingDocuments.push(doc);
+      }
+    });
+    return existingDocuments;
   }
 
   private filterDocsExcludeInclude(documents: Document[]): Document[] {