0

Nick: partial docs working, cleaner

This commit is contained in:
Nicolas 2024-05-04 12:30:12 -07:00
parent 00373228fa
commit 2aa09a3000
5 changed files with 18 additions and 7 deletions

View File

@ -19,7 +19,7 @@ export async function crawlStatusController(req: Request, res: Response) {
return res.status(404).json({ error: "Job not found" });
}
const { current, current_url, total, current_step } = await job.progress();
const { current, current_url, total, current_step, partialDocs } = await job.progress();
res.json({
status: await job.getState(),
// progress: job.progress(),
@ -28,6 +28,7 @@ export async function crawlStatusController(req: Request, res: Response) {
current_step: current_step,
total: total,
data: job.returnvalue,
partial_docs: partialDocs ?? [],
});
} catch (error) {
console.error(error);

View File

@ -147,7 +147,7 @@ export async function searchController(req: Request, res: Response) {
logJob({
success: result.success,
message: result.error,
num_docs: result.data.length,
num_docs: result.data ? result.data.length : 0,
docs: result.data,
time_taken: timeTakenInSeconds,
team_id: team_id,

View File

@ -7,6 +7,7 @@ export interface Progress {
[key: string]: any;
};
currentDocumentUrl?: string;
currentDocument?: Document;
}
export type PageOptions = {

View File

@ -10,13 +10,15 @@ export async function startWebScraperPipeline({
}: {
job: Job<WebScraperOptions>;
}) {
let partialDocs: Document[] = [];
return (await runWebScraper({
url: job.data.url,
mode: job.data.mode,
crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions,
inProgress: (progress) => {
job.progress(progress);
partialDocs.push(progress.currentDocument);
job.progress({...progress, partialDocs: partialDocs});
},
onSuccess: (result) => {
job.moveToCompleted(result);
@ -69,6 +71,7 @@ export async function runWebScraper({
}
const docs = (await provider.getDocuments(false, (progress: Progress) => {
inProgress(progress);
})) as Document[];
if (docs.length === 0) {

View File

@ -54,6 +54,7 @@ export class WebScraperDataProvider {
total: totalUrls,
status: "SCRAPING",
currentDocumentUrl: url,
currentDocument: result
});
}
results[i + index] = result;
@ -114,9 +115,7 @@ export class WebScraperDataProvider {
}
/**
 * Produces the documents for single-URL mode from `this.urls`.
 *
 * @param inProgress Optional progress callback; it is passed straight through
 *   to the URL-processing call(s) below.
 * @returns A promise resolving to the resulting `Document[]`.
 *
 * NOTE(review): this span appears to come from a diff view whose +/- markers
 * were stripped, so it shows two alternative bodies back to back and
 * `documents` ends up declared twice. Presumably the three-step pipeline
 * (convertUrlsToDocuments -> applyPathReplacements -> applyImgAltText) is the
 * removed code and the single `processLinks` call is its replacement — the
 * hunk header (9 lines before, 7 after) is consistent with that; confirm
 * against the actual file before relying on either version.
 */
private async handleSingleUrlsMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
// Variant 1 (presumably the pre-change code): convert, then post-process in two steps.
let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
documents = await this.applyPathReplacements(documents);
documents = await this.applyImgAltText(documents);
// Variant 2 (presumably the post-change code): delegate everything to processLinks.
let documents = await this.processLinks(this.urls, inProgress);
return documents;
}
@ -153,6 +152,13 @@ export class WebScraperDataProvider {
documents = await this.getSitemapData(this.urls[0], documents);
documents = this.applyPathReplacements(documents);
documents = await this.applyImgAltText(documents);
if(this.extractorOptions.mode === "llm-extraction" && this.mode === "single_urls") {
documents = await generateCompletions(
documents,
this.extractorOptions
)
}
return documents.concat(pdfDocuments);
}
@ -275,7 +281,7 @@ export class WebScraperDataProvider {
documents.push(cachedDocument);
// get children documents
for (const childUrl of cachedDocument.childrenLinks) {
for (const childUrl of (cachedDocument.childrenLinks || [])) {
const normalizedChildUrl = this.normalizeUrl(childUrl);
const childCachedDocumentString = await getValue(
"web-scraper-cache:" + normalizedChildUrl