0
v-firecrawl/apps/api/src/controllers/crawlPreview.ts

46 lines
1.5 KiB
TypeScript
Raw Normal View History

2024-04-20 19:38:05 -04:00
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
2024-04-20 19:38:05 -04:00
export async function crawlPreviewController(req: Request, res: Response) {
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.Preview
);
if (!success) {
return res.status(status).json({ error });
}
// authenticate on supabase
const url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
if (isUrlBlocked(url)) {
2024-04-23 19:47:24 -04:00
return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
}
2024-04-20 19:38:05 -04:00
const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
2024-04-20 22:37:45 -04:00
2024-04-20 19:38:05 -04:00
const job = await addWebScraperJob({
url: url,
mode: mode ?? "crawl", // fix for single urls not working
crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 },
team_id: "preview",
pageOptions: pageOptions,
2024-04-20 22:37:45 -04:00
origin: "website-preview",
2024-04-20 19:38:05 -04:00
});
res.json({ jobId: job.id });
} catch (error) {
console.error(error);
return res.status(500).json({ error: error.message });
}
}