// 2024-04-20 19:38:05 -04:00
import { Request , Response } from "express" ;
import { authenticateUser } from "./auth" ;
import { RateLimiterMode } from "../../src/types" ;
import { addWebScraperJob } from "../../src/services/queue-jobs" ;
// 2024-04-23 17:50:35 -04:00
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist" ;
// 2024-04-20 19:38:05 -04:00
export async function crawlPreviewController ( req : Request , res : Response ) {
try {
const { success , team_id , error , status } = await authenticateUser (
req ,
res ,
RateLimiterMode . Preview
) ;
if ( ! success ) {
return res . status ( status ) . json ( { error } ) ;
}
// authenticate on supabase
const url = req . body . url ;
if ( ! url ) {
return res . status ( 400 ) . json ( { error : "Url is required" } ) ;
}
2024-04-23 17:50:35 -04:00
if ( isUrlBlocked ( url ) ) {
2024-04-23 19:47:24 -04:00
return res . status ( 403 ) . json ( { error : "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." } ) ;
2024-04-23 17:50:35 -04:00
}
2024-04-20 19:38:05 -04:00
const mode = req . body . mode ? ? "crawl" ;
const crawlerOptions = req . body . crawlerOptions ? ? { } ;
const pageOptions = req . body . pageOptions ? ? { onlyMainContent : false } ;
2024-04-20 22:37:45 -04:00
2024-04-20 19:38:05 -04:00
const job = await addWebScraperJob ( {
url : url ,
mode : mode ? ? "crawl" , // fix for single urls not working
crawlerOptions : { . . . crawlerOptions , limit : 5 , maxCrawledLinks : 5 } ,
team_id : "preview" ,
pageOptions : pageOptions ,
2024-04-20 22:37:45 -04:00
origin : "website-preview" ,
2024-04-20 19:38:05 -04:00
} ) ;
res . json ( { jobId : job.id } ) ;
} catch ( error ) {
console . error ( error ) ;
return res . status ( 500 ) . json ( { error : error.message } ) ;
}
}