2024-04-15 17:01:47 -04:00
|
|
|
import { CustomError } from "../lib/custom-error";
|
|
|
|
import { getWebScraperQueue } from "./queue-service";
|
|
|
|
import "dotenv/config";
|
|
|
|
import { logtail } from "./logtail";
|
|
|
|
import { startWebScraperPipeline } from "../main/runWebScraper";
|
|
|
|
import { callWebhook } from "./webhook";
|
2024-04-20 16:53:11 -04:00
|
|
|
import { logJob } from "./logging/log_job";
|
2024-04-15 17:01:47 -04:00
|
|
|
|
|
|
|
/**
 * Registers the processor for the web-scraper queue.
 *
 * Concurrency is read from NUM_WORKERS_PER_QUEUE (default 8) and floored to
 * an integer.
 *
 * For each job the handler:
 *   1. reports an initial "SCRAPING" progress record,
 *   2. runs startWebScraperPipeline and times it,
 *   3. sends the result payload to the team's webhook,
 *   4. records the run via logJob,
 *   5. settles the job through `done(null, data)`.
 *
 * If anything in steps 1-4 throws, the error is logged, a generic failure
 * payload is sent to the webhook, and the job is still settled with
 * `done(null, data)` (no error is passed to `done`).
 *
 * NOTE(review): the handler keeps two declared parameters (job, done) on
 * purpose; the queue library presumably uses the callback style when `done`
 * is declared — confirm against ./queue-service.
 */
getWebScraperQueue().process(
  Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
  async function (job, done) {
    try {
      job.progress({
        current: 1,
        total: 100,
        current_step: "SCRAPING",
        current_url: "",
      });

      // Time only the scraping pipeline itself, not webhook/log delivery.
      const start = Date.now();
      const { success, message, docs } = await startWebScraperPipeline({ job });
      const end = Date.now();
      const timeTakenInSeconds = (end - start) / 1000;

      const data = {
        success: success,
        result: {
          links: docs.map((doc) => {
            return { content: doc, source: doc.metadata.sourceURL };
          }),
        },
        project_id: job.data.project_id,
        error: message /* etc... */,
      };

      await callWebhook(job.data.team_id, data);

      await logJob({
        success: success,
        message: message,
        num_docs: docs.length,
        docs: docs,
        time_taken: timeTakenInSeconds,
        team_id: job.data.team_id,
        mode: "crawl",
        url: job.data.url,
        crawlerOptions: job.data.crawlerOptions,
        pageOptions: job.data.pageOptions,
      });

      done(null, data);
    } catch (error) {
      if (error instanceof CustomError) {
        // Here we handle the error, then save the failed job
        console.error(error.message); // or any other error handling

        logtail.error("Custom error while ingesting", {
          job_id: job.id,
          error: error.message,
          dataIngestionJob: error.dataIngestionJob,
        });
      }
      console.log(error);

      // `?.` keeps a non-Error throw (e.g. `throw undefined`) from crashing
      // the handler before the job is settled.
      logtail.error("Overall error ingesting", {
        job_id: job.id,
        error: error?.message,
      });

      const data = {
        success: false,
        project_id: job.data.project_id,
        error:
          "Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
      };

      // The failure webhook is best-effort: if it throws, the rejection must
      // not escape this handler, otherwise `done` is never called and the job
      // is left unsettled.
      try {
        await callWebhook(job.data.team_id, data);
      } catch (webhookError) {
        console.error(webhookError);
        logtail.error("Error calling webhook for failed job", {
          job_id: job.id,
          error: webhookError?.message,
        });
      }

      done(null, data);
    }
  }
);
|