0
v-firecrawl/apps/api/src/controllers/crawl.ts

80 lines
2.4 KiB
TypeScript
Raw Normal View History

2024-04-20 19:38:05 -04:00
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../../src/scraper/WebScraper";
import { billTeam } from "../../src/services/billing/credit_billing";
import { checkTeamCredits } from "../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
export async function crawlController(req: Request, res: Response) {
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.Crawl
);
if (!success) {
return res.status(status).json({ error });
}
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
await checkTeamCredits(team_id, 1);
if (!creditsCheckSuccess) {
return res.status(402).json({ error: "Insufficient credits" });
}
// authenticate on supabase
const url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
if (mode === "single_urls" && !url.includes(",")) {
try {
const a = new WebScraperDataProvider();
await a.setOptions({
mode: "single_urls",
urls: [url],
crawlerOptions: {
returnOnlyUrls: true,
},
pageOptions: pageOptions,
2024-04-20 22:37:45 -04:00
2024-04-20 19:38:05 -04:00
});
const docs = await a.getDocuments(false, (progress) => {
job.progress({
current: progress.current,
total: progress.total,
current_step: "SCRAPING",
current_url: progress.currentDocumentUrl,
});
});
return res.json({
success: true,
documents: docs,
});
} catch (error) {
console.error(error);
return res.status(500).json({ error: error.message });
}
}
const job = await addWebScraperJob({
url: url,
mode: mode ?? "crawl", // fix for single urls not working
crawlerOptions: { ...crawlerOptions },
team_id: team_id,
pageOptions: pageOptions,
2024-04-20 22:37:45 -04:00
origin: req.body.origin ?? "api",
2024-04-20 19:38:05 -04:00
});
res.json({ jobId: job.id });
} catch (error) {
console.error(error);
return res.status(500).json({ error: error.message });
}
}