Merge pull request #120 from mendableai/nsc/initial-web-refac
Refactor of main web scraper + Partial data streaming
This commit is contained in:
commit
3156e0ca15
@ -19,7 +19,7 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||||||
return res.status(404).json({ error: "Job not found" });
|
return res.status(404).json({ error: "Job not found" });
|
||||||
}
|
}
|
||||||
|
|
||||||
const { current, current_url, total, current_step } = await job.progress();
|
const { current, current_url, total, current_step, partialDocs } = await job.progress();
|
||||||
res.json({
|
res.json({
|
||||||
status: await job.getState(),
|
status: await job.getState(),
|
||||||
// progress: job.progress(),
|
// progress: job.progress(),
|
||||||
@ -28,6 +28,7 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||||||
current_step: current_step,
|
current_step: current_step,
|
||||||
total: total,
|
total: total,
|
||||||
data: job.returnvalue,
|
data: job.returnvalue,
|
||||||
|
partial_data: partialDocs ?? [],
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(error);
|
console.error(error);
|
||||||
|
@ -54,10 +54,11 @@ export async function searchHelper(
|
|||||||
|
|
||||||
// filter out social media links
|
// filter out social media links
|
||||||
|
|
||||||
|
|
||||||
const a = new WebScraperDataProvider();
|
const a = new WebScraperDataProvider();
|
||||||
await a.setOptions({
|
await a.setOptions({
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
urls: res.map((r) => r.url),
|
urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
|
||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
...crawlerOptions,
|
...crawlerOptions,
|
||||||
},
|
},
|
||||||
@ -69,7 +70,7 @@ export async function searchHelper(
|
|||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
const docs = await a.getDocuments(true);
|
const docs = await a.getDocuments(false);
|
||||||
if (docs.length === 0) {
|
if (docs.length === 0) {
|
||||||
return { success: true, error: "No search results found", returnCode: 200 };
|
return { success: true, error: "No search results found", returnCode: 200 };
|
||||||
}
|
}
|
||||||
@ -147,7 +148,7 @@ export async function searchController(req: Request, res: Response) {
|
|||||||
logJob({
|
logJob({
|
||||||
success: result.success,
|
success: result.success,
|
||||||
message: result.error,
|
message: result.error,
|
||||||
num_docs: result.data.length,
|
num_docs: result.data ? result.data.length : 0,
|
||||||
docs: result.data,
|
docs: result.data,
|
||||||
time_taken: timeTakenInSeconds,
|
time_taken: timeTakenInSeconds,
|
||||||
team_id: team_id,
|
team_id: team_id,
|
||||||
|
@ -8,7 +8,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
|
|||||||
return res.status(404).json({ error: "Job not found" });
|
return res.status(404).json({ error: "Job not found" });
|
||||||
}
|
}
|
||||||
|
|
||||||
const { current, current_url, total, current_step } = await job.progress();
|
const { current, current_url, total, current_step, partialDocs } = await job.progress();
|
||||||
res.json({
|
res.json({
|
||||||
status: await job.getState(),
|
status: await job.getState(),
|
||||||
// progress: job.progress(),
|
// progress: job.progress(),
|
||||||
@ -17,6 +17,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
|
|||||||
current_step: current_step,
|
current_step: current_step,
|
||||||
total: total,
|
total: total,
|
||||||
data: job.returnvalue,
|
data: job.returnvalue,
|
||||||
|
partial_data: partialDocs ?? [],
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(error);
|
console.error(error);
|
||||||
|
@ -7,6 +7,7 @@ export interface Progress {
|
|||||||
[key: string]: any;
|
[key: string]: any;
|
||||||
};
|
};
|
||||||
currentDocumentUrl?: string;
|
currentDocumentUrl?: string;
|
||||||
|
currentDocument?: Document;
|
||||||
}
|
}
|
||||||
|
|
||||||
export type PageOptions = {
|
export type PageOptions = {
|
||||||
|
@ -10,13 +10,15 @@ export async function startWebScraperPipeline({
|
|||||||
}: {
|
}: {
|
||||||
job: Job<WebScraperOptions>;
|
job: Job<WebScraperOptions>;
|
||||||
}) {
|
}) {
|
||||||
|
let partialDocs: Document[] = [];
|
||||||
return (await runWebScraper({
|
return (await runWebScraper({
|
||||||
url: job.data.url,
|
url: job.data.url,
|
||||||
mode: job.data.mode,
|
mode: job.data.mode,
|
||||||
crawlerOptions: job.data.crawlerOptions,
|
crawlerOptions: job.data.crawlerOptions,
|
||||||
pageOptions: job.data.pageOptions,
|
pageOptions: job.data.pageOptions,
|
||||||
inProgress: (progress) => {
|
inProgress: (progress) => {
|
||||||
job.progress(progress);
|
partialDocs.push(progress.currentDocument);
|
||||||
|
job.progress({...progress, partialDocs: partialDocs});
|
||||||
},
|
},
|
||||||
onSuccess: (result) => {
|
onSuccess: (result) => {
|
||||||
job.moveToCompleted(result);
|
job.moveToCompleted(result);
|
||||||
@ -69,6 +71,7 @@ export async function runWebScraper({
|
|||||||
}
|
}
|
||||||
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
||||||
inProgress(progress);
|
inProgress(progress);
|
||||||
|
|
||||||
})) as Document[];
|
})) as Document[];
|
||||||
|
|
||||||
if (docs.length === 0) {
|
if (docs.length === 0) {
|
||||||
|
@ -7,7 +7,6 @@ import { getValue, setValue } from "../../services/redis";
|
|||||||
import { getImageDescription } from "./utils/imageDescription";
|
import { getImageDescription } from "./utils/imageDescription";
|
||||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||||
import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
|
import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
|
||||||
import OpenAI from 'openai'
|
|
||||||
import { generateCompletions } from "../../lib/LLM-extraction";
|
import { generateCompletions } from "../../lib/LLM-extraction";
|
||||||
|
|
||||||
|
|
||||||
@ -54,6 +53,7 @@ export class WebScraperDataProvider {
|
|||||||
total: totalUrls,
|
total: totalUrls,
|
||||||
status: "SCRAPING",
|
status: "SCRAPING",
|
||||||
currentDocumentUrl: url,
|
currentDocumentUrl: url,
|
||||||
|
currentDocument: result
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
results[i + index] = result;
|
results[i + index] = result;
|
||||||
@ -67,13 +67,40 @@ export class WebScraperDataProvider {
|
|||||||
useCaching: boolean = false,
|
useCaching: boolean = false,
|
||||||
inProgress?: (progress: Progress) => void
|
inProgress?: (progress: Progress) => void
|
||||||
): Promise<Document[]> {
|
): Promise<Document[]> {
|
||||||
|
this.validateInitialUrl();
|
||||||
|
|
||||||
|
if (!useCaching) {
|
||||||
|
return this.processDocumentsWithoutCache(inProgress);
|
||||||
|
}
|
||||||
|
|
||||||
|
return this.processDocumentsWithCache(inProgress);
|
||||||
|
}
|
||||||
|
|
||||||
|
private validateInitialUrl(): void {
|
||||||
if (this.urls[0].trim() === "") {
|
if (this.urls[0].trim() === "") {
|
||||||
throw new Error("Url is required");
|
throw new Error("Url is required");
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!useCaching) {
|
/**
|
||||||
if (this.mode === "crawl") {
|
* Process documents without cache handling each mode
|
||||||
|
* @param inProgress inProgress
|
||||||
|
* @returns documents
|
||||||
|
*/
|
||||||
|
private async processDocumentsWithoutCache(inProgress?: (progress: Progress) => void): Promise<Document[]> {
|
||||||
|
switch (this.mode) {
|
||||||
|
case "crawl":
|
||||||
|
return this.handleCrawlMode(inProgress);
|
||||||
|
case "single_urls":
|
||||||
|
return this.handleSingleUrlsMode(inProgress);
|
||||||
|
case "sitemap":
|
||||||
|
return this.handleSitemapMode(inProgress);
|
||||||
|
default:
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private async handleCrawlMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
|
||||||
const crawler = new WebCrawler({
|
const crawler = new WebCrawler({
|
||||||
initialUrl: this.urls[0],
|
initialUrl: this.urls[0],
|
||||||
includes: this.includes,
|
includes: this.includes,
|
||||||
@ -84,194 +111,104 @@ export class WebScraperDataProvider {
|
|||||||
});
|
});
|
||||||
let links = await crawler.start(inProgress, 5, this.limit);
|
let links = await crawler.start(inProgress, 5, this.limit);
|
||||||
if (this.returnOnlyUrls) {
|
if (this.returnOnlyUrls) {
|
||||||
inProgress({
|
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||||
|
}
|
||||||
|
|
||||||
|
let documents = await this.processLinks(links, inProgress);
|
||||||
|
return this.cacheAndFinalizeDocuments(documents, links);
|
||||||
|
}
|
||||||
|
|
||||||
|
private async handleSingleUrlsMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
|
||||||
|
let documents = await this.processLinks(this.urls, inProgress);
|
||||||
|
return documents;
|
||||||
|
}
|
||||||
|
|
||||||
|
private async handleSitemapMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
|
||||||
|
let links = await getLinksFromSitemap(this.urls[0]);
|
||||||
|
if (this.returnOnlyUrls) {
|
||||||
|
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||||
|
}
|
||||||
|
|
||||||
|
let documents = await this.processLinks(links, inProgress);
|
||||||
|
return this.cacheAndFinalizeDocuments(documents, links);
|
||||||
|
}
|
||||||
|
|
||||||
|
private async returnOnlyUrlsResponse(links: string[], inProgress?: (progress: Progress) => void): Promise<Document[]> {
|
||||||
|
inProgress?.({
|
||||||
current: links.length,
|
current: links.length,
|
||||||
total: links.length,
|
total: links.length,
|
||||||
status: "COMPLETED",
|
status: "COMPLETED",
|
||||||
currentDocumentUrl: this.urls[0],
|
currentDocumentUrl: this.urls[0],
|
||||||
});
|
});
|
||||||
return links.map((url) => ({
|
return links.map(url => ({
|
||||||
content: "",
|
content: "",
|
||||||
markdown: "",
|
markdown: "",
|
||||||
metadata: { sourceURL: url },
|
metadata: { sourceURL: url },
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
private async processLinks(links: string[], inProgress?: (progress: Progress) => void): Promise<Document[]> {
|
||||||
let pdfDocuments: Document[] = [];
|
let pdfLinks = links.filter(link => link.endsWith(".pdf"));
|
||||||
for (let pdfLink of pdfLinks) {
|
let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
|
||||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
links = links.filter(link => !link.endsWith(".pdf"));
|
||||||
pdfDocuments.push({
|
|
||||||
content: pdfContent,
|
|
||||||
metadata: { sourceURL: pdfLink },
|
|
||||||
provider: "web-scraper"
|
|
||||||
});
|
|
||||||
}
|
|
||||||
links = links.filter((link) => !link.endsWith(".pdf"));
|
|
||||||
|
|
||||||
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
let documents = await this.convertUrlsToDocuments(links, inProgress);
|
||||||
documents = await this.getSitemapData(this.urls[0], documents);
|
documents = await this.getSitemapData(this.urls[0], documents);
|
||||||
|
documents = this.applyPathReplacements(documents);
|
||||||
|
documents = await this.applyImgAltText(documents);
|
||||||
|
|
||||||
if (this.replaceAllPathsWithAbsolutePaths) {
|
if(this.extractorOptions.mode === "llm-extraction" && this.mode === "single_urls") {
|
||||||
documents = replacePathsWithAbsolutePaths(documents);
|
|
||||||
} else {
|
|
||||||
documents = replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.generateImgAltText) {
|
|
||||||
documents = await this.generatesImgAltText(documents);
|
|
||||||
}
|
|
||||||
documents = documents.concat(pdfDocuments);
|
|
||||||
|
|
||||||
// CACHING DOCUMENTS
|
|
||||||
// - parent document
|
|
||||||
const cachedParentDocumentString = await getValue(
|
|
||||||
"web-scraper-cache:" + this.normalizeUrl(this.urls[0])
|
|
||||||
);
|
|
||||||
if (cachedParentDocumentString != null) {
|
|
||||||
let cachedParentDocument = JSON.parse(cachedParentDocumentString);
|
|
||||||
if (
|
|
||||||
!cachedParentDocument.childrenLinks ||
|
|
||||||
cachedParentDocument.childrenLinks.length < links.length - 1
|
|
||||||
) {
|
|
||||||
cachedParentDocument.childrenLinks = links.filter(
|
|
||||||
(link) => link !== this.urls[0]
|
|
||||||
);
|
|
||||||
await setValue(
|
|
||||||
"web-scraper-cache:" + this.normalizeUrl(this.urls[0]),
|
|
||||||
JSON.stringify(cachedParentDocument),
|
|
||||||
60 * 60 * 24 * 10
|
|
||||||
); // 10 days
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
let parentDocument = documents.filter(
|
|
||||||
(document) =>
|
|
||||||
this.normalizeUrl(document.metadata.sourceURL) ===
|
|
||||||
this.normalizeUrl(this.urls[0])
|
|
||||||
);
|
|
||||||
await this.setCachedDocuments(parentDocument, links);
|
|
||||||
}
|
|
||||||
|
|
||||||
await this.setCachedDocuments(
|
|
||||||
documents.filter(
|
|
||||||
(document) =>
|
|
||||||
this.normalizeUrl(document.metadata.sourceURL) !==
|
|
||||||
this.normalizeUrl(this.urls[0])
|
|
||||||
),
|
|
||||||
[]
|
|
||||||
);
|
|
||||||
documents = this.removeChildLinks(documents);
|
|
||||||
documents = documents.splice(0, this.limit);
|
|
||||||
return documents;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.mode === "single_urls") {
|
|
||||||
let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf"));
|
|
||||||
let pdfDocuments: Document[] = [];
|
|
||||||
for (let pdfLink of pdfLinks) {
|
|
||||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
|
||||||
pdfDocuments.push({
|
|
||||||
content: pdfContent,
|
|
||||||
metadata: { sourceURL: pdfLink },
|
|
||||||
provider: "web-scraper"
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let documents = await this.convertUrlsToDocuments(
|
|
||||||
this.urls.filter((link) => !link.endsWith(".pdf")),
|
|
||||||
inProgress
|
|
||||||
);
|
|
||||||
|
|
||||||
if (this.replaceAllPathsWithAbsolutePaths) {
|
|
||||||
documents = replacePathsWithAbsolutePaths(documents);
|
|
||||||
} else {
|
|
||||||
documents = replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.generateImgAltText) {
|
|
||||||
documents = await this.generatesImgAltText(documents);
|
|
||||||
}
|
|
||||||
const baseUrl = new URL(this.urls[0]).origin;
|
|
||||||
documents = await this.getSitemapData(baseUrl, documents);
|
|
||||||
documents = documents.concat(pdfDocuments);
|
|
||||||
|
|
||||||
if(this.extractorOptions.mode === "llm-extraction") {
|
|
||||||
documents = await generateCompletions(
|
documents = await generateCompletions(
|
||||||
documents,
|
documents,
|
||||||
this.extractorOptions
|
this.extractorOptions
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
return documents.concat(pdfDocuments);
|
||||||
await this.setCachedDocuments(documents);
|
|
||||||
documents = this.removeChildLinks(documents);
|
|
||||||
documents = documents.splice(0, this.limit);
|
|
||||||
return documents;
|
|
||||||
}
|
}
|
||||||
if (this.mode === "sitemap") {
|
|
||||||
let links = await getLinksFromSitemap(this.urls[0]);
|
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
|
||||||
let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
return Promise.all(pdfLinks.map(async pdfLink => {
|
||||||
let pdfDocuments: Document[] = [];
|
|
||||||
for (let pdfLink of pdfLinks) {
|
|
||||||
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
const pdfContent = await fetchAndProcessPdf(pdfLink);
|
||||||
pdfDocuments.push({
|
return {
|
||||||
content: pdfContent,
|
content: pdfContent,
|
||||||
metadata: { sourceURL: pdfLink },
|
metadata: { sourceURL: pdfLink },
|
||||||
provider: "web-scraper"
|
provider: "web-scraper"
|
||||||
});
|
};
|
||||||
}
|
}));
|
||||||
links = links.filter((link) => !link.endsWith(".pdf"));
|
|
||||||
|
|
||||||
let documents = await this.convertUrlsToDocuments(
|
|
||||||
links.slice(0, this.limit),
|
|
||||||
inProgress
|
|
||||||
);
|
|
||||||
|
|
||||||
documents = await this.getSitemapData(this.urls[0], documents);
|
|
||||||
|
|
||||||
if (this.replaceAllPathsWithAbsolutePaths) {
|
|
||||||
documents = replacePathsWithAbsolutePaths(documents);
|
|
||||||
} else {
|
|
||||||
documents = replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.generateImgAltText) {
|
private applyPathReplacements(documents: Document[]): Document[] {
|
||||||
documents = await this.generatesImgAltText(documents);
|
return this.replaceAllPathsWithAbsolutePaths ? replacePathsWithAbsolutePaths(documents) : replaceImgPathsWithAbsolutePaths(documents);
|
||||||
}
|
}
|
||||||
documents = documents.concat(pdfDocuments);
|
|
||||||
|
|
||||||
await this.setCachedDocuments(documents);
|
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
|
||||||
|
return this.generateImgAltText ? this.generatesImgAltText(documents) : documents;
|
||||||
|
}
|
||||||
|
|
||||||
|
private async cacheAndFinalizeDocuments(documents: Document[], links: string[]): Promise<Document[]> {
|
||||||
|
await this.setCachedDocuments(documents, links);
|
||||||
documents = this.removeChildLinks(documents);
|
documents = this.removeChildLinks(documents);
|
||||||
documents = documents.splice(0, this.limit);
|
return documents.splice(0, this.limit);
|
||||||
return documents;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return [];
|
private async processDocumentsWithCache(inProgress?: (progress: Progress) => void): Promise<Document[]> {
|
||||||
}
|
let documents = await this.getCachedDocuments(this.urls.slice(0, this.limit));
|
||||||
|
|
||||||
let documents = await this.getCachedDocuments(
|
|
||||||
this.urls.slice(0, this.limit)
|
|
||||||
);
|
|
||||||
if (documents.length < this.limit) {
|
if (documents.length < this.limit) {
|
||||||
const newDocuments: Document[] = await this.getDocuments(
|
const newDocuments: Document[] = await this.getDocuments(false, inProgress);
|
||||||
false,
|
documents = this.mergeNewDocuments(documents, newDocuments);
|
||||||
inProgress
|
|
||||||
);
|
|
||||||
newDocuments.forEach((doc) => {
|
|
||||||
if (
|
|
||||||
!documents.some(
|
|
||||||
(d) =>
|
|
||||||
this.normalizeUrl(d.metadata.sourceURL) ===
|
|
||||||
this.normalizeUrl(doc.metadata?.sourceURL)
|
|
||||||
)
|
|
||||||
) {
|
|
||||||
documents.push(doc);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
documents = this.filterDocsExcludeInclude(documents);
|
documents = this.filterDocsExcludeInclude(documents);
|
||||||
documents = this.removeChildLinks(documents);
|
documents = this.removeChildLinks(documents);
|
||||||
documents = documents.splice(0, this.limit);
|
return documents.splice(0, this.limit);
|
||||||
return documents;
|
}
|
||||||
|
|
||||||
|
private mergeNewDocuments(existingDocuments: Document[], newDocuments: Document[]): Document[] {
|
||||||
|
newDocuments.forEach(doc => {
|
||||||
|
if (!existingDocuments.some(d => this.normalizeUrl(d.metadata.sourceURL) === this.normalizeUrl(doc.metadata?.sourceURL))) {
|
||||||
|
existingDocuments.push(doc);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return existingDocuments;
|
||||||
}
|
}
|
||||||
|
|
||||||
private filterDocsExcludeInclude(documents: Document[]): Document[] {
|
private filterDocsExcludeInclude(documents: Document[]): Document[] {
|
||||||
@ -348,7 +285,7 @@ export class WebScraperDataProvider {
|
|||||||
documents.push(cachedDocument);
|
documents.push(cachedDocument);
|
||||||
|
|
||||||
// get children documents
|
// get children documents
|
||||||
for (const childUrl of cachedDocument.childrenLinks) {
|
for (const childUrl of (cachedDocument.childrenLinks || [])) {
|
||||||
const normalizedChildUrl = this.normalizeUrl(childUrl);
|
const normalizedChildUrl = this.normalizeUrl(childUrl);
|
||||||
const childCachedDocumentString = await getValue(
|
const childCachedDocumentString = await getValue(
|
||||||
"web-scraper-cache:" + normalizedChildUrl
|
"web-scraper-cache:" + normalizedChildUrl
|
||||||
|
Loading…
Reference in New Issue
Block a user