2024-04-20 19:38:05 -04:00
import { Request , Response } from "express" ;
import { WebScraperDataProvider } from "../../src/scraper/WebScraper" ;
import { billTeam } from "../../src/services/billing/credit_billing" ;
import { checkTeamCredits } from "../../src/services/billing/credit_billing" ;
import { authenticateUser } from "./auth" ;
import { RateLimiterMode } from "../../src/types" ;
import { addWebScraperJob } from "../../src/services/queue-jobs" ;
2024-04-23 17:50:35 -04:00
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist" ;
2024-04-20 19:38:05 -04:00
/**
 * Express handler for crawl requests.
 *
 * Steps, in order:
 *  1. Authenticate via `authenticateUser` in `RateLimiterMode.Crawl`; on
 *     failure, reply with the status and error it returned.
 *  2. Call `checkTeamCredits(team_id, 1)`; on failure reply 402.
 *  3. Require `req.body.url` (400 when missing) and reject URLs for which
 *     `isUrlBlocked` returns true (403).
 *  4. When `mode` is "single_urls" and the URL contains no comma, scrape
 *     synchronously with `WebScraperDataProvider` and reply with
 *     `{ success: true, documents }`.
 *  5. Otherwise enqueue a job through `addWebScraperJob` and reply with
 *     `{ jobId }`.
 *
 * Any exception thrown along the way is logged and answered with a 500
 * carrying the error's message.
 *
 * @param req - Incoming request; reads `url`, `mode`, `crawlerOptions`,
 *   `pageOptions`, `includeHtml` and `origin` from `req.body`.
 * @param res - Response used to send the JSON reply.
 */
export async function crawlController(req: Request, res: Response) {
  try {
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Crawl
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    const { success: creditsCheckSuccess } = await checkTeamCredits(team_id, 1);
    if (!creditsCheckSuccess) {
      return res.status(402).json({ error: "Insufficient credits" });
    }

    const url = req.body.url;
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
    }

    if (isUrlBlocked(url)) {
      return res.status(403).json({
        error:
          "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
      });
    }

    const mode = req.body.mode ?? "crawl";
    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
    // `||` (not `??`) is intentional: any falsy value is normalised to `false`.
    const includeHtml = req.body.includeHtml || false;

    if (mode === "single_urls" && !url.includes(",")) {
      // Synchronous path: scrape the single URL in-process and return the
      // documents directly instead of queueing a job.
      const provider = new WebScraperDataProvider();
      await provider.setOptions({
        mode: "single_urls",
        urls: [url],
        crawlerOptions: {
          returnOnlyUrls: true,
        },
        pageOptions: pageOptions,
        includeHtml: includeHtml,
      });
      // No queue job exists on this path, so there is nothing to report
      // progress to. The previous callback called `job.progress(...)` on a
      // `const job` declared further down, which threw a ReferenceError
      // (temporal dead zone) as soon as the scraper reported progress.
      const docs = await provider.getDocuments(false, () => {});
      return res.json({
        success: true,
        documents: docs,
      });
    }

    const job = await addWebScraperJob({
      url: url,
      mode: mode,
      crawlerOptions: { ...crawlerOptions },
      team_id: team_id,
      pageOptions: pageOptions,
      origin: req.body.origin ?? "api",
      includeHtml: includeHtml,
    });
    return res.json({ jobId: job.id });
  } catch (error) {
    console.error(error);
    return res.status(500).json({ error: describeError(error) });
  }
}

/** Extracts a printable message from a caught value of unknown type. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}