import { Document, ExtractorOptions, PageOptions } from "../lib/entities";
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { numTokensFromString } from "../lib/LLM-extraction/helpers";
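
// Scrapes a single URL for a team: validates and blocklist-checks the URL,
// runs the scraper, bills credits for the returned documents, and returns
// the first non-empty document.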
export async function scrapeHelper(
  req: Request,
  team_id: string,
  crawlerOptions: any,
  pageOptions: PageOptions,
  extractorOptions: ExtractorOptions,
  includeHtml: boolean = false
): Promise<{
  success: boolean;
  error?: string;
  data?: Document;
  returnCode: number;
}> {
  const url = req.body.url;
  if (!url) {
    return { success: false, error: "Url is required", returnCode: 400 };
  }
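
  // Reject URLs on the blocklist (e.g. social media sites) before scraping.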
  if (isUrlBlocked(url)) {
    return {
      success: false,
      error:
        "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
      returnCode: 403,
    };
  }
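
  // Run the scraper in single-URL mode with the caller-provided options.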
  const scraper = new WebScraperDataProvider();
  await scraper.setOptions({
    mode: "single_urls",
    urls: [url],
    crawlerOptions: {
      ...crawlerOptions,
    },
    pageOptions: pageOptions,
    extractorOptions: extractorOptions,
    includeHtml: includeHtml,
  });
  const docs = await scraper.getDocuments(false);

  // Make sure doc.content is not empty.
  const filteredDocs = docs.filter(
    (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
  );
  if (filteredDocs.length === 0) {
    return { success: true, error: "No page found", returnCode: 200 };
  }
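
  // Bill one credit per returned document, plus a per-document surcharge
  // when LLM extraction is used.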
  let creditsToBeBilled = filteredDocs.length;
  const creditsPerLLMExtract = 5;

  if (extractorOptions.mode === "llm-extraction") {
    creditsToBeBilled = creditsToBeBilled + creditsPerLLMExtract * filteredDocs.length;
  }

  const billingResult = await billTeam(team_id, creditsToBeBilled);
  if (!billingResult.success) {
    return {
      success: false,
      error:
        "Failed to bill team. Insufficient credits or subscription not found.",
      returnCode: 402,
    };
  }

  return {
    success: true,
    data: filteredDocs[0],
    returnCode: 200,
  };
}
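
// Express handler for scrape requests: authenticates the caller, verifies
// credits, runs scrapeHelper, logs the job, and returns the result.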
export async function scrapeController(req: Request, res: Response) {
  try {
    // Authenticate the user first (Authorization: Bearer <token>).
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Scrape
    );
    if (!success) {
      return res.status(status).json({ error });
    }
    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
    const extractorOptions = req.body.extractorOptions ?? {
      mode: "markdown",
    };
    const origin = req.body.origin ?? "api";
    const includeHtml = req.body.includeHtml ?? false;
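
    // Pre-flight credit check before doing any scraping work.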
    try {
      const { success: creditsCheckSuccess, message: creditsCheckMessage } =
        await checkTeamCredits(team_id, 1);
      if (!creditsCheckSuccess) {
        return res.status(402).json({ error: "Insufficient credits" });
      }
    } catch (error) {
      console.error(error);
      return res.status(500).json({ error: "Internal server error" });
    }

    const startTime = new Date().getTime();
    const result = await scrapeHelper(
      req,
      team_id,
      crawlerOptions,
      pageOptions,
      extractorOptions,
      includeHtml
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
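
    // Estimate the token count of the returned markdown for job logging.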
    const numTokens =
      result.data && result.data.markdown
        ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
        : 0;
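
    // Log the job with its options, timing, and outcome.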
    logJob({
      success: result.success,
      message: result.error,
      num_docs: 1,
      docs: [result.data],
      time_taken: timeTakenInSeconds,
      team_id: team_id,
      mode: "scrape",
      url: req.body.url,
      crawlerOptions: crawlerOptions,
      pageOptions: pageOptions,
      origin: origin,
      extractor_options: extractorOptions,
      num_tokens: numTokens,
      includeHtml: includeHtml,
    });

    return res.status(result.returnCode).json(result);
  } catch (error) {
    console.error(error);
    return res.status(500).json({ error: error.message });
  }
}