Nick: partial docs working, cleaner
parent 00373228fa
commit 2aa09a3000

@@ -19,7 +19,7 @@ export async function crawlStatusController(req: Request, res: Response) {
       return res.status(404).json({ error: "Job not found" });
     }
 
-    const { current, current_url, total, current_step } = await job.progress();
+    const { current, current_url, total, current_step, partialDocs } = await job.progress();
     res.json({
       status: await job.getState(),
       // progress: job.progress(),
@@ -28,6 +28,7 @@ export async function crawlStatusController(req: Request, res: Response) {
       current_step: current_step,
       total: total,
       data: job.returnvalue,
+      partial_docs: partialDocs ?? [],
     });
   } catch (error) {
     console.error(error);
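
With partialDocs threaded through job.progress, the status endpoint can stream results while a crawl is still running. A minimal polling sketch against this controller, assuming it is mounted at GET /v0/crawl/status/:jobId (the route path and base URL are assumptions, not shown in the diff):

// Sketch only: BASE_URL and the status route are assumed, not confirmed.
const BASE_URL = "http://localhost:3002";

async function pollCrawl(jobId: string): Promise<void> {
  while (true) {
    const res = await fetch(`${BASE_URL}/v0/crawl/status/${jobId}`);
    const body = await res.json();
    // partial_docs defaults to [] until the first document lands.
    console.log(`${body.status}: ${body.partial_docs.length} docs so far`);
    // Bull job states: "completed" and "failed" are terminal.
    if (body.status === "completed" || body.status === "failed") return;
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }
}
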
@@ -147,7 +147,7 @@ export async function searchController(req: Request, res: Response) {
     logJob({
       success: result.success,
       message: result.error,
-      num_docs: result.data.length,
+      num_docs: result.data ? result.data.length : 0,
       docs: result.data,
       time_taken: timeTakenInSeconds,
       team_id: team_id,
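
The num_docs change is a crash fix: on a failed search, result.data is undefined while logJob still runs, so .length threw. A sketch of that failure path, with the result shape assumed from the surrounding fields:

// Assumed failure shape: success false, error set, data undefined.
const result: { success: boolean; error?: string; data?: any[] } = {
  success: false,
  error: "search failed",
};

// Before: result.data.length -> TypeError on this path.
// After: falls back to 0 so the job is still logged.
const num_docs = result.data ? result.data.length : 0;
console.log(num_docs); // 0
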
@@ -7,6 +7,7 @@ export interface Progress {
     [key: string]: any;
   };
   currentDocumentUrl?: string;
+  currentDocument?: Document;
 }
 
 export type PageOptions = {
@@ -10,13 +10,15 @@ export async function startWebScraperPipeline({
 }: {
   job: Job<WebScraperOptions>;
 }) {
+  let partialDocs: Document[] = [];
   return (await runWebScraper({
     url: job.data.url,
     mode: job.data.mode,
     crawlerOptions: job.data.crawlerOptions,
     pageOptions: job.data.pageOptions,
     inProgress: (progress) => {
-      job.progress(progress);
+      partialDocs.push(progress.currentDocument);
+      job.progress({...progress, partialDocs: partialDocs});
     },
     onSuccess: (result) => {
       job.moveToCompleted(result);
@@ -69,6 +71,7 @@ export async function runWebScraper({
   }
   const docs = (await provider.getDocuments(false, (progress: Progress) => {
     inProgress(progress);
   })) as Document[];
 
   if (docs.length === 0) {
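
This pipeline change is the core of the feature: each inProgress callback appends the freshly scraped document and republishes the whole array through Bull's job.progress, which is exactly what crawlStatusController destructures back out. Since Progress.currentDocument is optional, the unguarded push can accumulate undefined entries on steps that report progress without a finished page; a guarded variant, as a sketch only (the import path is an assumption, and the shipped code pushes unconditionally):

import { Job } from "bull";
// Assumed import path; Document and Progress are the repo's entity types.
import { Document, Progress } from "./lib/entities";

// Sketch: same accumulator as the diff, but skipping empty progress ticks.
function makeProgressHandler(job: Job, partialDocs: Document[]) {
  return (progress: Progress) => {
    if (progress.currentDocument) {
      partialDocs.push(progress.currentDocument);
    }
    // Republish the full array so the status endpoint can return it.
    job.progress({ ...progress, partialDocs });
  };
}
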
@@ -54,6 +54,7 @@ export class WebScraperDataProvider {
           total: totalUrls,
           status: "SCRAPING",
           currentDocumentUrl: url,
+          currentDocument: result
         });
       }
       results[i + index] = result;
@@ -114,9 +115,7 @@ export class WebScraperDataProvider {
   }
 
   private async handleSingleUrlsMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
-    let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
-    documents = await this.applyPathReplacements(documents);
-    documents = await this.applyImgAltText(documents);
+    let documents = await this.processLinks(this.urls, inProgress);
     return documents;
   }
 
@@ -153,6 +152,13 @@ export class WebScraperDataProvider {
     documents = await this.getSitemapData(this.urls[0], documents);
     documents = this.applyPathReplacements(documents);
     documents = await this.applyImgAltText(documents);
 
+    if(this.extractorOptions.mode === "llm-extraction" && this.mode === "single_urls") {
+      documents = await generateCompletions(
+        documents,
+        this.extractorOptions
+      )
+    }
     return documents.concat(pdfDocuments);
   }
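
The new block gates an LLM extraction pass so that only single_urls jobs in llm-extraction mode pay for it; crawl batches pass through untouched. The same gate in isolation, as a sketch (the option shapes and the generateCompletions signature are assumed from this call site only):

// Assumed shapes; only the two mode checks are visible in the diff.
type ExtractorOptions = { mode: string; [key: string]: any };

async function maybeExtract<Doc>(
  documents: Doc[],
  jobMode: string,
  extractorOptions: ExtractorOptions,
  generateCompletions: (docs: Doc[], opts: ExtractorOptions) => Promise<Doc[]>
): Promise<Doc[]> {
  // Both conditions must hold, otherwise documents are returned unchanged.
  if (extractorOptions.mode === "llm-extraction" && jobMode === "single_urls") {
    return generateCompletions(documents, extractorOptions);
  }
  return documents;
}
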
@@ -275,7 +281,7 @@ export class WebScraperDataProvider {
         documents.push(cachedDocument);
 
         // get children documents
-        for (const childUrl of cachedDocument.childrenLinks) {
+        for (const childUrl of (cachedDocument.childrenLinks || [])) {
           const normalizedChildUrl = this.normalizeUrl(childUrl);
           const childCachedDocumentString = await getValue(
             "web-scraper-cache:" + normalizedChildUrl
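
The || [] fallback guards against cached documents that lack childrenLinks (presumably entries written before the field existed): for...of over undefined throws "is not iterable". The guard in isolation, with the cached shape assumed:

// Older cache entries may lack childrenLinks entirely (assumed cause);
// iterating an empty array is a safe no-op instead of a TypeError.
const cachedDocument: { childrenLinks?: string[] } = JSON.parse('{"content":"stale entry"}');
for (const childUrl of cachedDocument.childrenLinks || []) {
  console.log(childUrl); // never runs for this entry
}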