0

168 lines
7.1 KiB
Python
Raw Normal View History

2024-05-27 14:14:00 -03:00
import importlib.util
import pytest
2024-05-27 14:14:00 -03:00
import time
import os
from uuid import uuid4
from dotenv import load_dotenv
2024-05-27 14:14:00 -03:00
# Pull variables from a local .env file (if present) into the environment
# before any of them are read below.
load_dotenv()

# Base URL handed to every FirecrawlApp instance created by these tests.
API_URL = "http://127.0.0.1:3002"

# Location of the SDK source file that is imported below.
# NOTE(review): despite the name, this is a relative path, so it resolves
# against the current working directory -- confirm how the suite is launched.
ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"

# API key used by the authenticated tests; None when the variable is unset.
TEST_API_KEY = os.getenv('TEST_API_KEY')

print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")

# Import the SDK module directly from its source file.
spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
if spec is None or spec.loader is None:
    # spec_from_file_location may return None; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ImportError(f"Cannot load FirecrawlApp from {ABSOLUTE_FIRECRAWL_PATH}")
firecrawl = importlib.util.module_from_spec(spec)
spec.loader.exec_module(firecrawl)
FirecrawlApp = firecrawl.FirecrawlApp
def test_no_api_key():
    """Constructing FirecrawlApp without an API key must raise.

    The raised exception's message is expected to contain
    "No API key provided".
    """
    with pytest.raises(Exception) as excinfo:
        # The instance is never used; only the constructor's failure matters,
        # so the result is deliberately not bound to a name.
        FirecrawlApp(api_url=API_URL)
    assert "No API key provided" in str(excinfo.value)
def test_scrape_url_invalid_api_key():
    """Scraping with an invalid API key must raise an error reporting 401."""
    expected_message = "Failed to scrape URL. Status code: 401"
    client = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")

    with pytest.raises(Exception) as excinfo:
        client.scrape_url('https://firecrawl.dev')

    raised_text = str(excinfo.value)
    assert expected_message in raised_text
def test_blocklisted_url():
    """Scraping a blocklisted URL must raise an error reporting 403."""
    target = "https://facebook.com/fake-test"
    expected_message = "Failed to scrape URL. Status code: 403"
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)

    with pytest.raises(Exception) as excinfo:
        client.scrape_url(target)

    raised_text = str(excinfo.value)
    assert expected_message in raised_text
def test_successful_response_with_valid_preview_token():
    """Scrape with the preview token and check the returned content."""
    preview_client = FirecrawlApp(
        api_url=API_URL,
        api_key="this_is_just_a_preview_token",
    )

    result = preview_client.scrape_url('https://roastmywebsite.ai')

    assert result is not None
    assert 'content' in result
    page_content = result['content']
    assert "_Roast_" in page_content
def test_scrape_url_e2e():
    """Scrape a page with default options and check the response fields.

    The response must carry content, markdown and metadata, and must not
    carry html.
    """
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)

    result = client.scrape_url('https://roastmywebsite.ai')

    assert result is not None
    for field in ('content', 'markdown', 'metadata'):
        assert field in result
    assert 'html' not in result
    assert "_Roast_" in result['content']
2024-05-27 14:14:00 -03:00
def test_successful_response_with_valid_api_key_and_include_html():
    """Scrape with includeHtml enabled and check the html field is returned."""
    scrape_params = {'pageOptions': {'includeHtml': True}}
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)

    result = client.scrape_url('https://roastmywebsite.ai', scrape_params)

    assert result is not None
    for field in ('content', 'markdown', 'html', 'metadata'):
        assert field in result
    # Both text renderings should mention the page's headline word.
    for text_field in ('content', 'markdown'):
        assert "_Roast_" in result[text_field]
    assert "<h1" in result['html']
2024-05-27 14:14:00 -03:00
def test_successful_response_for_valid_scrape_with_pdf_file():
    """Scrape a PDF URL ending in .pdf and check its extracted text."""
    expected_snippet = 'We present spectrophotometric observations of the Broad Line Radio Galaxy'
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)

    result = client.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')

    assert result is not None
    for field in ('content', 'metadata'):
        assert field in result
    assert expected_snippet in result['content']
def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
    """Scrape a PDF URL that lacks a .pdf suffix and check its extracted text."""
    expected_snippet = 'We present spectrophotometric observations of the Broad Line Radio Galaxy'
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)

    result = client.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
    # Pause for 6 seconds once the response has been received.
    # NOTE(review): the reason for this pause is not evident from this file
    # (possibly throttling between requests) -- confirm before removing.
    time.sleep(6)

    assert result is not None
    for field in ('content', 'metadata'):
        assert field in result
    assert expected_snippet in result['content']
def test_crawl_url_invalid_api_key():
    """Starting a crawl with an invalid API key must raise an error reporting 401."""
    expected_message = "Unexpected error occurred while trying to start crawl job. Status code: 401"
    client = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")

    with pytest.raises(Exception) as excinfo:
        client.crawl_url('https://firecrawl.dev')

    raised_text = str(excinfo.value)
    assert expected_message in raised_text
2024-05-27 14:14:00 -03:00
def test_should_return_error_for_blocklisted_url():
    """Starting a crawl on a blocklisted URL must raise an error reporting 403."""
    expected_message = "Unexpected error occurred while trying to start crawl job. Status code: 403"
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    target = "https://twitter.com/fake-test"

    with pytest.raises(Exception) as excinfo:
        client.crawl_url(target)

    raised_text = str(excinfo.value)
    assert expected_message in raised_text
def test_crawl_url_wait_for_completion_e2e():
    """Crawl a site with the third argument set to True and check the first result."""
    crawl_params = {'crawlerOptions': {'excludes': ['blog/*']}}
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)

    pages = client.crawl_url('https://roastmywebsite.ai', crawl_params, True)

    assert pages is not None
    assert len(pages) > 0
    first_page = pages[0]
    assert 'content' in first_page
    assert "_Roast_" in first_page['content']
2024-05-27 14:14:00 -03:00
def test_crawl_url_with_idempotency_key_e2e():
    """Crawl once with a fresh idempotency key, then check that reuse is rejected.

    The first crawl must return results; a second crawl passing the same key
    must raise an error reporting status 409.
    """
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    idempotency_key = str(uuid4())

    # First use of the key: the crawl is expected to succeed.
    pages = client.crawl_url(
        'https://roastmywebsite.ai',
        {'crawlerOptions': {'excludes': ['blog/*']}},
        True,
        2,
        idempotency_key,
    )
    assert pages is not None
    assert len(pages) > 0
    first_page = pages[0]
    assert 'content' in first_page
    assert "_Roast_" in first_page['content']

    # Second use of the same key: the request must be refused.
    expected_message = "Failed to start crawl job. Status code: 409. Error: Idempotency key already used"
    with pytest.raises(Exception) as excinfo:
        client.crawl_url(
            'https://firecrawl.dev',
            {'crawlerOptions': {'excludes': ['blog/*']}},
            True,
            2,
            idempotency_key,
        )
    assert expected_message in str(excinfo.value)
2024-05-27 14:14:00 -03:00
def test_check_crawl_status_e2e():
    """Start a crawl job and verify its status reaches 'completed' with data.

    The crawl is started with the third argument set to False and the
    response is expected to carry a 'jobId'. The job status is then polled
    for up to 30 seconds rather than sleeping for a fixed 30 seconds and
    checking once, so the test finishes as soon as the job completes.
    """
    max_wait_seconds = 30
    poll_interval_seconds = 5

    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
    assert response is not None
    assert 'jobId' in response

    # monotonic() is unaffected by system clock adjustments during the wait.
    deadline = time.monotonic() + max_wait_seconds
    while True:
        status_response = app.check_crawl_status(response['jobId'])
        assert status_response is not None
        assert 'status' in status_response
        if status_response['status'] == 'completed' or time.monotonic() >= deadline:
            break
        time.sleep(poll_interval_seconds)

    # Fails here (with the last observed status) if the deadline passed first.
    assert status_response['status'] == 'completed'
    assert 'data' in status_response
    assert len(status_response['data']) > 0
2024-05-27 14:14:00 -03:00
def test_search_e2e():
    """Run a search and verify it returns more than two results with content.

    The length check runs before the first result is indexed, so an empty
    result list fails with a clear assertion instead of an IndexError.
    """
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.search("test query")
    assert response is not None
    assert len(response) > 2
    assert 'content' in response[0]
2024-05-27 14:14:00 -03:00
def test_search_invalid_api_key():
    """Searching with an invalid API key must raise an error reporting 401."""
    expected_message = "Failed to search. Status code: 401"
    client = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")

    with pytest.raises(Exception) as excinfo:
        client.search("test query")

    raised_text = str(excinfo.value)
    assert expected_message in raised_text
def test_llm_extraction():
    """Scrape with LLM extraction options and check the extracted fields.

    The response must contain an 'llm_extraction' entry holding a
    'company_mission' key and boolean 'supports_sso' / 'is_open_source' values.
    """
    prompt = "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source"
    schema = {
        'type': 'object',
        'properties': {
            'company_mission': {'type': 'string'},
            'supports_sso': {'type': 'boolean'},
            'is_open_source': {'type': 'boolean'}
        },
        'required': ['company_mission', 'supports_sso', 'is_open_source']
    }
    scrape_params = {
        'extractorOptions': {
            'mode': 'llm-extraction',
            'extractionPrompt': prompt,
            'extractionSchema': schema
        }
    }

    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    result = client.scrape_url("https://mendable.ai", scrape_params)

    assert result is not None
    assert 'llm_extraction' in result
    extracted = result['llm_extraction']
    assert 'company_mission' in extracted
    for flag in ('supports_sso', 'is_open_source'):
        assert isinstance(extracted[flag], bool)