diff --git a/apps/api/src/controllers/crawl-status.ts b/apps/api/src/controllers/crawl-status.ts
index 3534cd1..05bdb75 100644
--- a/apps/api/src/controllers/crawl-status.ts
+++ b/apps/api/src/controllers/crawl-status.ts
@@ -19,7 +19,7 @@ export async function crawlStatusController(req: Request, res: Response) {
       return res.status(404).json({ error: "Job not found" });
     }
 
-    const { current, current_url, total, current_step } = await job.progress();
+    const { current, current_url, total, current_step, partialDocs } = await job.progress();
     res.json({
       status: await job.getState(),
       // progress: job.progress(),
@@ -28,6 +28,7 @@ export async function crawlStatusController(req: Request, res: Response) {
       current_step: current_step,
       total: total,
       data: job.returnvalue,
+      partial_docs: partialDocs ?? [],
     });
   } catch (error) {
     console.error(error);
diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts
index 5c2cf80..41270cb 100644
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@@ -147,7 +147,7 @@ export async function searchController(req: Request, res: Response) {
     logJob({
       success: result.success,
       message: result.error,
-      num_docs: result.data.length,
+      num_docs: result.data ? result.data.length : 0,
       docs: result.data,
       time_taken: timeTakenInSeconds,
       team_id: team_id,
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 4008785..5b663f2 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -7,6 +7,7 @@ export interface Progress {
     [key: string]: any;
   };
   currentDocumentUrl?: string;
+  currentDocument?: Document;
 }
 
 export type PageOptions = {
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index 892a2a3..827eec5 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -10,13 +10,15 @@ export async function startWebScraperPipeline({
 }: {
   job: Job;
 }) {
+  let partialDocs: Document[] = [];
   return (await runWebScraper({
     url: job.data.url,
     mode: job.data.mode,
     crawlerOptions: job.data.crawlerOptions,
     pageOptions: job.data.pageOptions,
     inProgress: (progress) => {
-      job.progress(progress);
+      partialDocs.push(progress.currentDocument);
+      job.progress({ ...progress, partialDocs: partialDocs });
     },
     onSuccess: (result) => {
       job.moveToCompleted(result);
@@ -69,6 +71,7 @@ export async function runWebScraper({
   }
   const docs = (await provider.getDocuments(false, (progress: Progress) => {
     inProgress(progress);
+
   })) as Document[];
 
   if (docs.length === 0) {
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index ebd96d0..0cf001f 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -54,6 +54,7 @@ export class WebScraperDataProvider {
             total: totalUrls,
             status: "SCRAPING",
             currentDocumentUrl: url,
+            currentDocument: result
           });
         }
         results[i + index] = result;
@@ -114,9 +115,7 @@ export class WebScraperDataProvider {
   }
 
   private async handleSingleUrlsMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
-    let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
-    documents = await this.applyPathReplacements(documents);
-    documents = await this.applyImgAltText(documents);
+    let documents = await this.processLinks(this.urls, inProgress);
     return documents;
   }
 
@@ -153,6 +152,13 @@ export class WebScraperDataProvider {
     documents = await this.getSitemapData(this.urls[0], documents);
     documents = this.applyPathReplacements(documents);
     documents = await this.applyImgAltText(documents);
+
+    if (this.extractorOptions.mode === "llm-extraction" && this.mode === "single_urls") {
+      documents = await generateCompletions(
+        documents,
+        this.extractorOptions
+      )
+    }
     return documents.concat(pdfDocuments);
   }
 
@@ -275,7 +281,7 @@ export class WebScraperDataProvider {
       documents.push(cachedDocument);
 
       // get children documents
-      for (const childUrl of (cachedDocument.childrenLinks || [])) {
+      for (const childUrl of (cachedDocument.childrenLinks || [])) {
         const normalizedChildUrl = this.normalizeUrl(childUrl);
         const childCachedDocumentString = await getValue(
           "web-scraper-cache:" + normalizedChildUrl
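
Note for reviewers: the net effect of this diff is that startWebScraperPipeline accumulates every `progress.currentDocument` into `partialDocs` and stores it via `job.progress()`, and the crawl-status controller surfaces that array as `partial_docs` before the job completes. Below is a minimal client-side polling sketch of how a consumer might use the new field. The route path (`/v0/crawl/status/:jobId`), base URL, and Bearer auth are assumptions inferred from the controller name, not confirmed by this diff; treat them as placeholders.

// Hypothetical polling loop against the crawl-status endpoint.
// Endpoint path and auth scheme are assumed, not taken from this diff.
interface CrawlStatus {
  status: string;          // job state from job.getState()
  current?: number;
  current_url?: string;
  current_step?: string;
  total?: number;
  data?: unknown[];        // job.returnvalue, set once the job completes
  partial_docs: unknown[]; // documents accumulated while still running
}

async function pollCrawl(jobId: string, apiKey: string): Promise<unknown[]> {
  while (true) {
    const res = await fetch(`https://api.example.com/v0/crawl/status/${jobId}`, {
      headers: { Authorization: `Bearer ${apiKey}` },
    });
    const body = (await res.json()) as CrawlStatus;

    if (body.status === "completed") return body.data ?? [];
    if (body.status === "failed") throw new Error("crawl failed");

    // partial_docs grows monotonically: inProgress() appends each
    // currentDocument to the same array, so this is a cumulative snapshot.
    console.log(`scraped ${body.partial_docs.length} of ${body.total ?? "?"} pages`);
    await new Promise((r) => setTimeout(r, 2000));
  }
}

One caveat worth flagging in review: `partialDocs.push(progress.currentDocument)` will push `undefined` whenever a progress event carries no `currentDocument` (the field is optional on `Progress`), so clients may want to filter falsy entries out of `partial_docs`.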