2024-05-27 14:14:00 -03:00
import importlib . util
2024-05-24 17:56:27 -03:00
import pytest
2024-05-27 14:14:00 -03:00
import time
import os
from uuid import uuid4
from dotenv import load_dotenv
2024-05-24 17:56:27 -03:00
2024-05-27 14:14:00 -03:00
# Populate the process environment via python-dotenv before TEST_API_KEY is
# read below.
load_dotenv()

# Base URL of the API instance every test in this module targets.
# NOTE(review): string literals in this view carry padding spaces inside the
# quotes; they are kept verbatim here -- confirm against the canonical file.
API_URL = " http://127.0.0.1:3002 "

# Location of the SDK source file that is loaded dynamically below.
# NOTE(review): despite the name, this is a relative path, so it is resolved
# against the current working directory -- confirm where the tests are run from.
ABSOLUTE_FIRECRAWL_PATH = " firecrawl/firecrawl.py "

# API key used by the authenticated tests; None when the variable is unset.
TEST_API_KEY = os.getenv(' TEST_API_KEY ')

print(f" ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH} ")

# Load the SDK module straight from its source file rather than from an
# installed package.
spec = importlib.util.spec_from_file_location(" FirecrawlApp ", ABSOLUTE_FIRECRAWL_PATH)
if spec is None or spec.loader is None:
    # spec_from_file_location returns None when no loader matches the path;
    # fail with a clear message instead of an AttributeError further down.
    raise ImportError(
        f"Cannot load the Firecrawl SDK module from {ABSOLUTE_FIRECRAWL_PATH!r}"
    )
firecrawl = importlib.util.module_from_spec(spec)
spec.loader.exec_module(firecrawl)
FirecrawlApp = firecrawl.FirecrawlApp
def test_no_api_key():
    """Constructing FirecrawlApp without an ``api_key`` argument must raise.

    The exception message is expected to contain the "No API key provided"
    text asserted below.
    """
    with pytest.raises(Exception) as excinfo:
        # Only the constructor's failure matters, so the instance is not bound.
        FirecrawlApp(api_url=API_URL)
    # Checked after the ``with`` block: excinfo is only populated once it exits.
    assert " No API key provided " in str(excinfo.value)
def test_scrape_url_invalid_api_key():
    """Scraping with an invalid API key must raise with a 401 status message."""
    client = FirecrawlApp(api_key=" invalid_api_key ", api_url=API_URL)
    with pytest.raises(Exception) as excinfo:
        client.scrape_url(' https://firecrawl.dev ')
    error_text = str(excinfo.value)
    assert " Failed to scrape URL. Status code: 401 " in error_text
def test_blocklisted_url():
    """Scraping the blocklisted URL below must raise with a 403 status message."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    target = " https://facebook.com/fake-test "
    with pytest.raises(Exception) as excinfo:
        client.scrape_url(target)
    error_text = str(excinfo.value)
    assert " Failed to scrape URL. Status code: 403 " in error_text
def test_successful_response_with_valid_preview_token():
    """A scrape made with the preview token returns content with the marker text."""
    client = FirecrawlApp(
        api_url=API_URL,
        api_key=" this_is_just_a_preview_token ",
    )
    result = client.scrape_url(' https://roastmywebsite.ai ')

    assert result is not None
    assert ' content ' in result
    page_text = result[' content ']
    assert " _Roast_ " in page_text
2024-05-24 17:56:27 -03:00
def test_scrape_url_e2e():
    """A default scrape returns content, markdown and metadata, but no html."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    result = client.scrape_url(' https://roastmywebsite.ai ')

    assert result is not None
    # Keys that must be present, checked in the same order as before.
    for expected_key in (' content ', ' markdown ', ' metadata '):
        assert expected_key in result
    assert ' html ' not in result
    assert " _Roast_ " in result[' content ']
2024-05-24 17:56:27 -03:00
2024-05-27 14:14:00 -03:00
def test_successful_response_with_valid_api_key_and_include_html():
    """With ``includeHtml`` enabled the scrape result also carries an html field."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    scrape_params = {' pageOptions ': {' includeHtml ': True}}
    result = client.scrape_url(' https://roastmywebsite.ai ', scrape_params)

    assert result is not None
    for expected_key in (' content ', ' markdown ', ' html ', ' metadata '):
        assert expected_key in result

    # Each listed field must contain its marker text.
    expected_fragments = (
        (' content ', " _Roast_ "),
        (' markdown ', " _Roast_ "),
        (' html ', " <h1 "),
    )
    for field, fragment in expected_fragments:
        assert fragment in result[field]
2024-05-24 17:56:27 -03:00
2024-05-27 14:14:00 -03:00
def test_successful_response_for_valid_scrape_with_pdf_file():
    """Scraping a URL ending in ``.pdf`` returns the expected text from the paper."""
    expected_sentence = ' We present spectrophotometric observations of the Broad Line Radio Galaxy '
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    result = client.scrape_url(' https://arxiv.org/pdf/astro-ph/9301001.pdf ')

    assert result is not None
    assert ' content ' in result
    assert ' metadata ' in result
    assert expected_sentence in result[' content ']
def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
    """Scraping the same PDF via a URL without ``.pdf`` returns the expected text."""
    expected_sentence = ' We present spectrophotometric observations of the Broad Line Radio Galaxy '
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    result = client.scrape_url(' https://arxiv.org/pdf/astro-ph/9301001 ')

    # NOTE(review): this pause runs after the response has already been
    # returned; presumably it spaces out requests -- confirm it is still needed.
    time.sleep(6)

    assert result is not None
    assert ' content ' in result
    assert ' metadata ' in result
    assert expected_sentence in result[' content ']
2024-05-24 17:56:27 -03:00
def test_crawl_url_invalid_api_key():
    """Starting a crawl with an invalid API key must raise with a 401 message."""
    client = FirecrawlApp(api_key=" invalid_api_key ", api_url=API_URL)
    with pytest.raises(Exception) as excinfo:
        client.crawl_url(' https://firecrawl.dev ')
    error_text = str(excinfo.value)
    expected = " Unexpected error occurred while trying to start crawl job. Status code: 401 "
    assert expected in error_text
2024-05-27 14:14:00 -03:00
def test_should_return_error_for_blocklisted_url():
    """Crawling the blocklisted URL below must raise with a 403 message."""
    target = " https://twitter.com/fake-test "
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    with pytest.raises(Exception) as excinfo:
        client.crawl_url(target)
    error_text = str(excinfo.value)
    expected = " Unexpected error occurred while trying to start crawl job. Status code: 403 "
    assert expected in error_text
def test_crawl_url_wait_for_completion_e2e():
    """A crawl called with the third argument ``True`` returns a non-empty result.

    The first returned entry must have a content field containing the marker
    text.
    """
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    crawl_params = {' crawlerOptions ': {' excludes ': [' blog/* ']}}
    pages = client.crawl_url(' https://roastmywebsite.ai ', crawl_params, True)

    assert pages is not None
    assert len(pages) > 0
    first_page = pages[0]
    assert ' content ' in first_page
    assert " _Roast_ " in first_page[' content ']
2024-05-24 17:56:27 -03:00
2024-05-27 14:14:00 -03:00
def test_crawl_url_with_idempotency_key_e2e():
    """Reusing an idempotency key for a second crawl must raise with a 409 message.

    The first crawl with a fresh key is expected to succeed and return pages.
    """
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    idempotency_key = str(uuid4())
    crawl_params = {' crawlerOptions ': {' excludes ': [' blog/* ']}}

    # First use of the key: the crawl runs and returns its pages.
    pages = client.crawl_url(' https://roastmywebsite.ai ', crawl_params, True, 2, idempotency_key)
    assert pages is not None
    assert len(pages) > 0
    first_page = pages[0]
    assert ' content ' in first_page
    assert " _Roast_ " in first_page[' content ']

    # Second use of the same key: the call must be rejected.
    with pytest.raises(Exception) as excinfo:
        client.crawl_url(
            ' https://firecrawl.dev ',
            {' crawlerOptions ': {' excludes ': [' blog/* ']}},
            True,
            2,
            idempotency_key,
        )
    error_text = str(excinfo.value)
    assert " Failed to start crawl job. Status code: 409. Error: Idempotency key already used " in error_text
2024-05-24 17:56:27 -03:00
2024-05-27 14:14:00 -03:00
def test_check_crawl_status_e2e():
    """Start a crawl with the third argument ``False`` and check it completes.

    The job status is polled for up to 30 seconds rather than sleeping a fixed
    30 seconds and checking once. The test therefore finishes as soon as the
    crawl does, while the final check still happens no earlier than the
    original deadline if the job is slow.
    """
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.crawl_url(
        ' https://firecrawl.dev ',
        {' crawlerOptions ': {' excludes ': [' blog/* ']}},
        False,
    )
    assert response is not None
    assert ' jobId ' in response

    max_wait_seconds = 30
    poll_interval_seconds = 2
    deadline = time.monotonic() + max_wait_seconds

    # NOTE(review): assumes check_crawl_status may be called repeatedly while
    # the job is still running -- confirm against the SDK.
    while True:
        status_response = app.check_crawl_status(response[' jobId '])
        completed = (
            status_response is not None
            and ' status ' in status_response
            and status_response[' status '] == ' completed '
        )
        if completed or time.monotonic() >= deadline:
            break
        time.sleep(poll_interval_seconds)

    assert status_response is not None
    assert ' status ' in status_response
    assert status_response[' status '] == ' completed '
    assert ' data ' in status_response
    assert len(status_response[' data ']) > 0
2024-05-24 17:56:27 -03:00
2024-05-27 14:14:00 -03:00
def test_search_e2e():
    """A search must return more than two results, the first having content."""
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.search(" test query ")

    assert response is not None
    # Check the size before indexing so an empty result fails this assertion
    # instead of raising IndexError on response[0].
    assert len(response) > 2
    assert ' content ' in response[0]
2024-05-24 17:56:27 -03:00
2024-05-27 14:14:00 -03:00
def test_search_invalid_api_key():
    """Searching with an invalid API key must raise with a 401 status message."""
    client = FirecrawlApp(api_key=" invalid_api_key ", api_url=API_URL)
    with pytest.raises(Exception) as excinfo:
        client.search(" test query ")
    error_text = str(excinfo.value)
    assert " Failed to search. Status code: 401 " in error_text
2024-05-24 17:56:27 -03:00
def test_llm_extraction():
    """A scrape in llm-extraction mode returns the fields named in the schema.

    The result must contain ``company_mission`` and boolean values for
    ``supports_sso`` and ``is_open_source``.
    """
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)

    # Schema describing the structure the extraction should produce.
    schema = {
        ' type ': ' object ',
        ' properties ': {
            ' company_mission ': {' type ': ' string '},
            ' supports_sso ': {' type ': ' boolean '},
            ' is_open_source ': {' type ': ' boolean '},
        },
        ' required ': [' company_mission ', ' supports_sso ', ' is_open_source '],
    }
    prompt = " Based on the information on the page, find what the company ' s mission is and whether it supports SSO, and whether it is open source "
    scrape_params = {
        ' extractorOptions ': {
            ' mode ': ' llm-extraction ',
            ' extractionPrompt ': prompt,
            ' extractionSchema ': schema,
        },
    }

    result = client.scrape_url(" https://mendable.ai ", scrape_params)

    assert result is not None
    assert ' llm_extraction ' in result
    extracted = result[' llm_extraction ']
    assert ' company_mission ' in extracted
    for flag_name in (' supports_sso ', ' is_open_source '):
        assert isinstance(extracted[flag_name], bool)