
Nick: partial docs working, cleaner

Author: Nicolas
Date: 2024-05-04 12:30:12 -07:00
Parent: 00373228fa
Commit: 2aa09a3000
5 changed files with 18 additions and 7 deletions

View File

@@ -19,7 +19,7 @@ export async function crawlStatusController(req: Request, res: Response) {
       return res.status(404).json({ error: "Job not found" });
     }
-    const { current, current_url, total, current_step } = await job.progress();
+    const { current, current_url, total, current_step, partialDocs } = await job.progress();
     res.json({
       status: await job.getState(),
       // progress: job.progress(),
@@ -28,6 +28,7 @@ export async function crawlStatusController(req: Request, res: Response) {
       current_step: current_step,
       total: total,
       data: job.returnvalue,
+      partial_docs: partialDocs ?? [],
     });
   } catch (error) {
     console.error(error);
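
With partialDocs threaded through job.progress(), the status endpoint can now report documents scraped so far while a crawl is still running. A minimal sketch of a client polling for them; the /v0/crawl/status/:jobId route and the response field types are assumptions, not part of this diff:

// Hypothetical polling client; route and field types are assumptions.
type CrawlStatus = {
  status: string;
  data?: unknown[];        // final documents, set once the job completes
  partial_docs: unknown[]; // documents scraped so far (new in this commit)
};

async function pollCrawl(baseUrl: string, jobId: string): Promise<unknown[]> {
  while (true) {
    const res = await fetch(`${baseUrl}/v0/crawl/status/${jobId}`); // assumed route
    const body = (await res.json()) as CrawlStatus;
    console.log(`scraped so far: ${body.partial_docs.length}`);
    if (body.status === "completed") return body.data ?? [];
    if (body.status === "failed") throw new Error("crawl failed");
    await new Promise((r) => setTimeout(r, 1000)); // poll once per second
  }
}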

View File

@@ -147,7 +147,7 @@ export async function searchController(req: Request, res: Response) {
     logJob({
       success: result.success,
       message: result.error,
-      num_docs: result.data.length,
+      num_docs: result.data ? result.data.length : 0,
       docs: result.data,
       time_taken: timeTakenInSeconds,
       team_id: team_id,
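
The new ternary matters on failed searches, where result.data can be undefined and .length would throw before the failure was ever logged. The guard in isolation, with an assumed result shape modeling only the fields logJob uses:

type SearchResult = { success: boolean; error?: string; data?: unknown[] };

const failed: SearchResult = { success: false, error: "provider timeout" };
// Before: failed.data.length -> TypeError: Cannot read properties of undefined
const numDocs = failed.data ? failed.data.length : 0; // now safely 0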

View File

@@ -7,6 +7,7 @@ export interface Progress {
     [key: string]: any;
   };
   currentDocumentUrl?: string;
+  currentDocument?: Document;
 }
 
 export type PageOptions = {
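
The optional currentDocument field hands progress callbacks the full scraped document rather than just its URL. A sketch of a consumer, with Document and Progress simplified here as stand-ins for the real entities types:

interface Document { url?: string; content?: string; }
interface Progress {
  current: number;
  total: number;
  status: string;
  currentDocumentUrl?: string;
  currentDocument?: Document; // new in this commit
}

const onProgress = (p: Progress): void => {
  if (p.currentDocument) {
    const size = p.currentDocument.content?.length ?? 0;
    console.log(`${p.current}/${p.total} ${p.currentDocument.url}: ${size} chars`);
  }
};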

View File

@@ -10,13 +10,15 @@ export async function startWebScraperPipeline({
 }: {
   job: Job<WebScraperOptions>;
 }) {
+  let partialDocs: Document[] = [];
   return (await runWebScraper({
     url: job.data.url,
     mode: job.data.mode,
     crawlerOptions: job.data.crawlerOptions,
     pageOptions: job.data.pageOptions,
     inProgress: (progress) => {
-      job.progress(progress);
+      partialDocs.push(progress.currentDocument);
+      job.progress({...progress, partialDocs: partialDocs});
     },
     onSuccess: (result) => {
       job.moveToCompleted(result);
@@ -69,6 +71,7 @@ export async function runWebScraper({
   }
 
   const docs = (await provider.getDocuments(false, (progress: Progress) => {
     inProgress(progress);
   })) as Document[];
 
   if (docs.length === 0) {
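
The pipeline now keeps a local partialDocs array, appends each document reported through inProgress, and republishes the growing list via job.progress, which is exactly what crawlStatusController reads back. The same accumulation pattern in isolation (a sketch, with a plain callback standing in for Bull's job.progress):

interface Document { url?: string; }
interface Progress { current: number; total: number; currentDocument?: Document; }

// Builds an inProgress handler that accumulates documents and republishes them.
function makeInProgress(publish: (p: Progress & { partialDocs: Document[] }) => void) {
  const partialDocs: Document[] = [];
  return (progress: Progress): void => {
    if (progress.currentDocument) partialDocs.push(progress.currentDocument);
    publish({ ...progress, partialDocs });
  };
}

Unlike the diff, which pushes progress.currentDocument unconditionally, this sketch guards against undefined, since not every progress event necessarily carries a document.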

View File

@@ -54,6 +54,7 @@ export class WebScraperDataProvider {
             total: totalUrls,
             status: "SCRAPING",
             currentDocumentUrl: url,
+            currentDocument: result
           });
         }
         results[i + index] = result;
@@ -114,9 +115,7 @@ export class WebScraperDataProvider {
   }
 
   private async handleSingleUrlsMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
-    let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
-    documents = await this.applyPathReplacements(documents);
-    documents = await this.applyImgAltText(documents);
+    let documents = await this.processLinks(this.urls, inProgress);
     return documents;
   }
@@ -153,6 +152,13 @@ export class WebScraperDataProvider {
     documents = await this.getSitemapData(this.urls[0], documents);
     documents = this.applyPathReplacements(documents);
     documents = await this.applyImgAltText(documents);
+
+    if(this.extractorOptions.mode === "llm-extraction" && this.mode === "single_urls") {
+      documents = await generateCompletions(
+        documents,
+        this.extractorOptions
+      )
+    }
     return documents.concat(pdfDocuments);
   }
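
The new branch runs an LLM extraction pass only when the caller requested llm-extraction mode and the provider is scraping single URLs, so multi-page crawls skip the expensive completion step. A stubbed sketch of the gate; generateCompletions's real signature and the fields of ExtractorOptions are not shown in this diff, so both are assumptions:

interface Document { content?: string; llm_extraction?: unknown; }
interface ExtractorOptions { mode: "markdown" | "llm-extraction"; }

// Stub: a real implementation would run an LLM over each document's content.
async function generateCompletions(docs: Document[], opts: ExtractorOptions): Promise<Document[]> {
  return docs.map((d) => ({ ...d, llm_extraction: {} }));
}

async function postProcess(docs: Document[], mode: string, extractorOptions: ExtractorOptions) {
  if (extractorOptions.mode === "llm-extraction" && mode === "single_urls") {
    docs = await generateCompletions(docs, extractorOptions);
  }
  return docs;
}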
@@ -275,7 +281,7 @@ export class WebScraperDataProvider {
         documents.push(cachedDocument);
 
         // get children documents
-        for (const childUrl of cachedDocument.childrenLinks) {
+        for (const childUrl of (cachedDocument.childrenLinks || [])) {
           const normalizedChildUrl = this.normalizeUrl(childUrl);
           const childCachedDocumentString = await getValue(
             "web-scraper-cache:" + normalizedChildUrl