0
This commit is contained in:
Nicolas 2024-04-20 19:37:45 -07:00
parent 19cba43ee4
commit 0db0874b00
8 changed files with 32 additions and 10 deletions

View File

@ -42,6 +42,7 @@ export async function crawlController(req: Request, res: Response) {
returnOnlyUrls: true, returnOnlyUrls: true,
}, },
pageOptions: pageOptions, pageOptions: pageOptions,
}); });
const docs = await a.getDocuments(false, (progress) => { const docs = await a.getDocuments(false, (progress) => {
@ -67,6 +68,7 @@ export async function crawlController(req: Request, res: Response) {
crawlerOptions: { ...crawlerOptions }, crawlerOptions: { ...crawlerOptions },
team_id: team_id, team_id: team_id,
pageOptions: pageOptions, pageOptions: pageOptions,
origin: req.body.origin ?? "api",
}); });
res.json({ jobId: job.id }); res.json({ jobId: job.id });

View File

@ -21,12 +21,14 @@ export async function crawlPreviewController(req: Request, res: Response) {
const mode = req.body.mode ?? "crawl"; const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? {}; const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
const job = await addWebScraperJob({ const job = await addWebScraperJob({
url: url, url: url,
mode: mode ?? "crawl", // fix for single urls not working mode: mode ?? "crawl", // fix for single urls not working
crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 }, crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 },
team_id: "preview", team_id: "preview",
pageOptions: pageOptions, pageOptions: pageOptions,
origin: "website-preview",
}); });
res.json({ jobId: job.id }); res.json({ jobId: job.id });

View File

@ -72,6 +72,7 @@ export async function scrapeController(req: Request, res: Response) {
} }
const crawlerOptions = req.body.crawlerOptions ?? {}; const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
const origin = req.body.origin ?? "api";
try { try {
const { success: creditsCheckSuccess, message: creditsCheckMessage } = const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@ -83,24 +84,27 @@ export async function scrapeController(req: Request, res: Response) {
console.error(error); console.error(error);
return res.status(500).json({ error: "Internal server error" }); return res.status(500).json({ error: "Internal server error" });
} }
const startTime = new Date().getTime();
const result = await scrapeHelper( const result = await scrapeHelper(
req, req,
team_id, team_id,
crawlerOptions, crawlerOptions,
pageOptions pageOptions
); );
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
logJob({ logJob({
success: result.success, success: result.success,
message: result.error, message: result.error,
num_docs: 1, num_docs: 1,
docs: [result.data], docs: [result.data],
time_taken: 0, time_taken: timeTakenInSeconds,
team_id: team_id, team_id: team_id,
mode: "scrape", mode: "scrape",
url: req.body.url, url: req.body.url,
crawlerOptions: crawlerOptions, crawlerOptions: crawlerOptions,
pageOptions: pageOptions, pageOptions: pageOptions,
origin: origin,
}); });
return res.status(result.returnCode).json(result); return res.status(result.returnCode).json(result);
} catch (error) { } catch (error) {

View File

@ -44,7 +44,11 @@ export async function runWebScraper({
onSuccess: (result: any) => void; onSuccess: (result: any) => void;
onError: (error: any) => void; onError: (error: any) => void;
team_id: string; team_id: string;
}): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> { }): Promise<{
success: boolean;
message: string;
docs: CrawlResult[];
}> {
try { try {
const provider = new WebScraperDataProvider(); const provider = new WebScraperDataProvider();
if (mode === "crawl") { if (mode === "crawl") {
@ -70,7 +74,7 @@ export async function runWebScraper({
return { return {
success: true, success: true,
message: "No pages found", message: "No pages found",
docs: [], docs: []
}; };
} }
@ -87,7 +91,7 @@ export async function runWebScraper({
return { return {
success: false, success: false,
message: "Failed to bill team, no subscription was found", message: "Failed to bill team, no subscription was found",
docs: [], docs: []
}; };
} }

View File

@ -17,11 +17,12 @@ export async function logJob(job: FirecrawlJob) {
num_docs: job.num_docs, num_docs: job.num_docs,
docs: job.docs, docs: job.docs,
time_taken: job.time_taken, time_taken: job.time_taken,
team_id: job.team_id, team_id: job.team_id === "preview" ? null : job.team_id,
mode: job.mode, mode: job.mode,
url: job.url, url: job.url,
crawler_options: job.crawlerOptions, crawler_options: job.crawlerOptions,
page_options: job.pageOptions, page_options: job.pageOptions,
origin: job.origin,
}, },
]); ]);
if (error) { if (error) {

View File

@ -17,6 +17,7 @@ getWebScraperQueue().process(
current_url: "", current_url: "",
}); });
const start = Date.now(); const start = Date.now();
console.log("Processing job", job.data);
const { success, message, docs } = await startWebScraperPipeline({ job }); const { success, message, docs } = await startWebScraperPipeline({ job });
const end = Date.now(); const end = Date.now();
const timeTakenInSeconds = (end - start) / 1000; const timeTakenInSeconds = (end - start) / 1000;
@ -45,6 +46,7 @@ getWebScraperQueue().process(
url: job.data.url, url: job.data.url,
crawlerOptions: job.data.crawlerOptions, crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions, pageOptions: job.data.pageOptions,
origin: job.data.origin,
}); });
done(null, data); done(null, data);
} catch (error) { } catch (error) {

View File

@ -1,6 +1,7 @@
import { supabase_service } from "./supabase"; import { supabase_service } from "./supabase";
export const callWebhook = async (teamId: string, data: any) => { export const callWebhook = async (teamId: string, data: any) => {
try {
const { data: webhooksData, error } = await supabase_service const { data: webhooksData, error } = await supabase_service
.from('webhooks') .from('webhooks')
.select('url') .select('url')
@ -37,5 +38,9 @@ export const callWebhook = async (teamId: string, data: any) => {
data: dataToSend, data: dataToSend,
error: data.error || undefined, error: data.error || undefined,
}), }),
}); });
} } catch (error) {
console.error(`Error sending webhook for team ID: ${teamId}`, error.message);
}
};

View File

@ -22,6 +22,7 @@ export interface WebScraperOptions {
crawlerOptions: any; crawlerOptions: any;
pageOptions: any; pageOptions: any;
team_id: string; team_id: string;
origin?: string;
} }
@ -36,6 +37,7 @@ export interface FirecrawlJob {
url: string; url: string;
crawlerOptions?: any; crawlerOptions?: any;
pageOptions?: any; pageOptions?: any;
origin: string;
} }