Nick: partial docs working, cleaner

This commit is contained in:
parent 00373228fa
commit 2aa09a3000
@@ -19,7 +19,7 @@ export async function crawlStatusController(req: Request, res: Response) {
       return res.status(404).json({ error: "Job not found" });
     }

-    const { current, current_url, total, current_step } = await job.progress();
+    const { current, current_url, total, current_step, partialDocs } = await job.progress();
     res.json({
       status: await job.getState(),
       // progress: job.progress(),
@@ -28,6 +28,7 @@ export async function crawlStatusController(req: Request, res: Response) {
       current_step: current_step,
       total: total,
       data: job.returnvalue,
+      partial_docs: partialDocs ?? [],
     });
   } catch (error) {
     console.error(error);
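With partialDocs exposed on the job's progress payload, a client polling the status endpoint can render documents as they finish instead of waiting for the crawl to complete. A minimal sketch of such a poller; the route and port are illustrative assumptions, and only the response fields (status, data, partial_docs) come from the handler above:

// Assumed client-side usage, not part of the diff.
type CrawlStatus = {
  status: string;
  current?: number;
  total?: number;
  current_step?: string;
  data?: unknown[];
  partial_docs: unknown[]; // server defaults this to [] via `partialDocs ?? []`
};

async function pollCrawl(jobId: string): Promise<unknown[]> {
  while (true) {
    // Hypothetical local endpoint; adjust host and path to your deployment.
    const res = await fetch(`http://localhost:3002/v0/crawl/status/${jobId}`);
    const body = (await res.json()) as CrawlStatus;
    console.log(`${body.status}: ${body.partial_docs.length} doc(s) so far`);
    if (body.status === "completed") return body.data ?? [];
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }
}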
@@ -147,7 +147,7 @@ export async function searchController(req: Request, res: Response) {
     logJob({
       success: result.success,
       message: result.error,
-      num_docs: result.data.length,
+      num_docs: result.data ? result.data.length : 0,
       docs: result.data,
       time_taken: timeTakenInSeconds,
       team_id: team_id,
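The guard matters on the failure path: when a search errors out, result.data can be undefined, and reading .length inside the logging call would throw and mask the original error. Isolated as a sketch:

// `data` may be undefined when success is false, so never read `.length` unconditionally.
type SearchResult = { success: boolean; error?: string; data?: unknown[] };

function numDocs(result: SearchResult): number {
  return result.data ? result.data.length : 0;
}

console.log(numDocs({ success: false, error: "timeout" })); // 0, no TypeError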
@@ -7,6 +7,7 @@ export interface Progress {
     [key: string]: any;
   };
   currentDocumentUrl?: string;
+  currentDocument?: Document;
 }

 export type PageOptions = {
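For reference, a reconstructed sketch of what Progress plausibly looks like after this hunk. Only currentDocument is new; the other fields are inferred from the progress objects reported elsewhere in this commit, so names may differ (in particular the name of the field holding the indexable member is a guess):

type Document = Record<string, any>; // stand-in for the real Document entity

interface ProgressSketch {
  current: number;            // index of the URL currently being scraped
  total: number;              // e.g. totalUrls
  status: string;             // e.g. "SCRAPING"
  metadata?: {
    [key: string]: any;       // the indexable member visible in the hunk
  };
  currentDocumentUrl?: string;
  currentDocument?: Document; // newly added: the document just scraped
}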
@@ -10,13 +10,15 @@ export async function startWebScraperPipeline({
 }: {
   job: Job<WebScraperOptions>;
 }) {
+  let partialDocs: Document[] = [];
   return (await runWebScraper({
     url: job.data.url,
     mode: job.data.mode,
     crawlerOptions: job.data.crawlerOptions,
     pageOptions: job.data.pageOptions,
     inProgress: (progress) => {
-      job.progress(progress);
+      partialDocs.push(progress.currentDocument);
+      job.progress({...progress, partialDocs: partialDocs});
     },
     onSuccess: (result) => {
       job.moveToCompleted(result);
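The shape of this change is worth noting: partialDocs lives in the closure of startWebScraperPipeline, each progress callback pushes the document that just finished, and the object spread attaches the running list without disturbing the rest of the payload. A self-contained sketch of the same pattern, with types reduced to stubs:

type Doc = { url: string; content: string };
type Prog = { current: number; total: number; currentDocument?: Doc; partialDocs?: Doc[] };

function startPipeline(report: (p: Prog) => void) {
  const partialDocs: Doc[] = [];
  return (progress: Prog) => {
    // Accumulate finished documents in the closure...
    if (progress.currentDocument) partialDocs.push(progress.currentDocument);
    // ...and attach the running list without mutating the original payload.
    report({ ...progress, partialDocs });
  };
}

// Usage: each call reports one more finished document.
const onProgress = startPipeline((p) => console.log(p.partialDocs?.length));
onProgress({ current: 1, total: 2, currentDocument: { url: "a", content: "..." } }); // 1
onProgress({ current: 2, total: 2, currentDocument: { url: "b", content: "..." } }); // 2

One caveat: the diff pushes progress.currentDocument unconditionally, so a progress event arriving without a document would append undefined to the array; the sketch guards against that case.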
@@ -69,6 +71,7 @@ export async function runWebScraper({
   }
   const docs = (await provider.getDocuments(false, (progress: Progress) => {
     inProgress(progress);
+
   })) as Document[];

   if (docs.length === 0) {
@@ -54,6 +54,7 @@ export class WebScraperDataProvider {
           total: totalUrls,
           status: "SCRAPING",
           currentDocumentUrl: url,
+          currentDocument: result
         });
       }
       results[i + index] = result;
@@ -114,9 +115,7 @@ export class WebScraperDataProvider {
   }

   private async handleSingleUrlsMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
-    let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
+    let documents = await this.processLinks(this.urls, inProgress);
-    documents = await this.applyPathReplacements(documents);
-    documents = await this.applyImgAltText(documents);
     return documents;
   }

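Single-URL mode now delegates to processLinks instead of converting URLs and re-running path replacements and image alt-text itself; the next hunk suggests those passes still run on the shared path. A sketch of the consolidation's shape, with hypothetical method bodies:

class PipelineSketch {
  private convert(urls: string[]): string[] { return urls.map((u) => `doc(${u})`); }
  private applyPathReplacements(docs: string[]): string[] { return docs; }
  private applyImgAltText(docs: string[]): string[] { return docs; }

  // Shared path: one place owns the post-processing order.
  private processLinks(urls: string[]): string[] {
    let docs = this.convert(urls);
    docs = this.applyPathReplacements(docs);
    return this.applyImgAltText(docs);
  }

  handleSingleUrlsMode(urls: string[]): string[] {
    return this.processLinks(urls); // was: convert plus two inline passes
  }
}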
@@ -153,6 +152,13 @@ export class WebScraperDataProvider {
     documents = await this.getSitemapData(this.urls[0], documents);
     documents = this.applyPathReplacements(documents);
     documents = await this.applyImgAltText(documents);
+
+    if(this.extractorOptions.mode === "llm-extraction" && this.mode === "single_urls") {
+      documents = await generateCompletions(
+        documents,
+        this.extractorOptions
+      )
+    }
     return documents.concat(pdfDocuments);
   }

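This gate runs an LLM-extraction pass only when the extractor mode asks for it and the job is scraping single URLs. The call shape generateCompletions(documents, extractorOptions) is taken from the hunk; the types and the stub below are illustrative:

type ExtractedDoc = { url: string; content: string; llm_extraction?: unknown };
type ExtractorOptions = { mode: "markdown" | "llm-extraction"; extractionPrompt?: string };

// Stub standing in for the real generateCompletions used in the diff.
async function generateCompletions(docs: ExtractedDoc[], opts: ExtractorOptions): Promise<ExtractedDoc[]> {
  return docs.map((d) => ({ ...d, llm_extraction: { promptUsed: opts.extractionPrompt } }));
}

async function finalize(documents: ExtractedDoc[], extractorOptions: ExtractorOptions, mode: string) {
  // Only run the extra LLM pass when explicitly requested and scraping single URLs.
  if (extractorOptions.mode === "llm-extraction" && mode === "single_urls") {
    documents = await generateCompletions(documents, extractorOptions);
  }
  return documents;
}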
@@ -275,7 +281,7 @@ export class WebScraperDataProvider {
         documents.push(cachedDocument);

         // get children documents
-        for (const childUrl of cachedDocument.childrenLinks) {
+        for (const childUrl of (cachedDocument.childrenLinks || [])) {
           const normalizedChildUrl = this.normalizeUrl(childUrl);
           const childCachedDocumentString = await getValue(
             "web-scraper-cache:" + normalizedChildUrl
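The || [] default is the usual guard for cache entries written before childrenLinks existed: iterating undefined throws a TypeError, while iterating an empty array is a harmless no-op. In miniature:

type CachedDocument = { url: string; childrenLinks?: string[] };

function childUrls(doc: CachedDocument): string[] {
  const urls: string[] = [];
  // Older cache entries may predate `childrenLinks`; default to an empty list.
  for (const childUrl of doc.childrenLinks || []) {
    urls.push(childUrl);
  }
  return urls;
}

console.log(childUrls({ url: "a" }));                            // []
console.log(childUrls({ url: "a", childrenLinks: ["b", "c"] })); // ["b", "c"]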