"""Tests for NBA scraper.

Covers scraper construction, Basketball-Reference HTML parsing, ESPN JSON
parsing, game normalization (canonical IDs, doubleheaders, review items),
static team/stadium data, multi-source fallback, and season month
calculation. Parsing tests run against the fixture files imported from
``sportstime_parser.tests.fixtures``; the expected counts and values below
are tied to the contents of those fixtures.
"""

import json
from datetime import datetime
from unittest.mock import MagicMock, patch

import pytest

from sportstime_parser.scrapers.nba import NBAScraper, create_nba_scraper
from sportstime_parser.scrapers.base import RawGameData
from sportstime_parser.tests.fixtures import (
    load_fixture,
    load_json_fixture,
    NBA_BR_OCTOBER_HTML,
    NBA_BR_EDGE_CASES_HTML,
    NBA_ESPN_SCOREBOARD_JSON,
)


class TestNBAScraperInit:
    """Test NBAScraper initialization."""

    def test_creates_scraper_with_season(self):
        """Test scraper initializes with correct sport and season."""
        scraper = NBAScraper(season=2025)

        assert scraper.sport == "nba"
        assert scraper.season == 2025

    def test_factory_function_creates_scraper(self):
        """Test factory function returns an NBAScraper for the given season."""
        scraper = create_nba_scraper(season=2025)

        assert isinstance(scraper, NBAScraper)
        assert scraper.season == 2025

    def test_expected_game_count(self):
        """Test expected game count is correct for NBA."""
        scraper = NBAScraper(season=2025)

        # 1230 is consistent with 30 teams x 82 games / 2 teams per game.
        assert scraper.expected_game_count == 1230

    def test_sources_in_priority_order(self):
        """Test sources are returned in correct priority order."""
        scraper = NBAScraper(season=2025)

        sources = scraper._get_sources()

        # List equality also pins the order, not just the membership.
        assert sources == ["basketball_reference", "espn", "cbs"]


class TestBasketballReferenceParsing:
    """Test Basketball-Reference HTML parsing against the October fixture."""

    def test_parses_completed_games(self):
        """Test parsing completed games with scores."""
        scraper = NBAScraper(season=2025)
        html = load_fixture(NBA_BR_OCTOBER_HTML)

        games = scraper._parse_basketball_reference(html, "http://example.com")

        # Should find all games in fixture
        assert len(games) == 7

        # Check how many of them are completed
        completed_games = [g for g in games if g.status == "final"]
        assert len(completed_games) == 2

        # Boston @ Cleveland
        bos_cle = next(g for g in games if g.away_team_raw == "Boston Celtics")
        assert bos_cle.home_team_raw == "Cleveland Cavaliers"
        assert bos_cle.away_score == 112
        assert bos_cle.home_score == 108
        assert bos_cle.stadium_raw == "Rocket Mortgage FieldHouse"
        assert bos_cle.status == "final"

    def test_parses_scheduled_games(self):
        """Test parsing scheduled games without scores."""
        scraper = NBAScraper(season=2025)
        html = load_fixture(NBA_BR_OCTOBER_HTML)

        games = scraper._parse_basketball_reference(html, "http://example.com")

        scheduled_games = [g for g in games if g.status == "scheduled"]
        assert len(scheduled_games) == 5

        # Houston @ OKC: a scheduled game must carry no scores yet.
        hou_okc = next(
            g for g in scheduled_games if g.away_team_raw == "Houston Rockets"
        )
        assert hou_okc.home_team_raw == "Oklahoma City Thunder"
        assert hou_okc.away_score is None
        assert hou_okc.home_score is None
        assert hou_okc.stadium_raw == "Paycom Center"

    def test_parses_game_dates_correctly(self):
        """Test game dates are parsed correctly."""
        scraper = NBAScraper(season=2025)
        html = load_fixture(NBA_BR_OCTOBER_HTML)

        games = scraper._parse_basketball_reference(html, "http://example.com")

        # Check first game date (relies on the parser's output order).
        first_game = games[0]
        assert first_game.game_date.year == 2025
        assert first_game.game_date.month == 10
        assert first_game.game_date.day == 22

    def test_tracks_source_url(self):
        """Test source URL is tracked for all games."""
        scraper = NBAScraper(season=2025)
        html = load_fixture(NBA_BR_OCTOBER_HTML)
        source_url = "http://basketball-reference.com/test"

        games = scraper._parse_basketball_reference(html, source_url)

        for game in games:
            assert game.source_url == source_url


class TestBasketballReferenceEdgeCases:
    """Test edge case handling in Basketball-Reference parsing."""

    def test_parses_postponed_games(self):
        """Test postponed games are identified correctly."""
        scraper = NBAScraper(season=2025)
        html = load_fixture(NBA_BR_EDGE_CASES_HTML)

        games = scraper._parse_basketball_reference(html, "http://example.com")

        postponed = [g for g in games if g.status == "postponed"]
        assert len(postponed) == 1
        assert postponed[0].away_team_raw == "Los Angeles Lakers"
        assert postponed[0].home_team_raw == "Phoenix Suns"

    def test_parses_cancelled_games(self):
        """Test cancelled games are identified correctly."""
        scraper = NBAScraper(season=2025)
        html = load_fixture(NBA_BR_EDGE_CASES_HTML)

        games = scraper._parse_basketball_reference(html, "http://example.com")

        cancelled = [g for g in games if g.status == "cancelled"]
        assert len(cancelled) == 1
        assert cancelled[0].away_team_raw == "Portland Trail Blazers"

    def test_parses_neutral_site_games(self):
        """Test neutral site games are parsed."""
        scraper = NBAScraper(season=2025)
        html = load_fixture(NBA_BR_EDGE_CASES_HTML)

        games = scraper._parse_basketball_reference(html, "http://example.com")

        # Mexico City game: located by venue name rather than by team.
        mexico = next(g for g in games if g.stadium_raw == "Arena CDMX")
        assert mexico.away_team_raw == "Miami Heat"
        assert mexico.home_team_raw == "Washington Wizards"
        assert mexico.status == "final"

    def test_parses_overtime_games(self):
        """Test overtime games with high scores."""
        scraper = NBAScraper(season=2025)
        html = load_fixture(NBA_BR_EDGE_CASES_HTML)

        games = scraper._parse_basketball_reference(html, "http://example.com")

        # High scoring OT game: located by its away score in the fixture.
        ot_game = next(g for g in games if g.away_score == 147)
        assert ot_game.home_score == 150
        assert ot_game.status == "final"


class TestESPNParsing:
    """Test ESPN API response parsing against the scoreboard fixture."""

    def test_parses_completed_games(self):
        """Test parsing completed games from ESPN."""
        scraper = NBAScraper(season=2025)
        data = load_json_fixture(NBA_ESPN_SCOREBOARD_JSON)

        games = scraper._parse_espn_response(data, "http://espn.com/api")

        completed = [g for g in games if g.status == "final"]
        assert len(completed) == 2

        # Boston @ Cleveland
        bos_cle = next(g for g in completed if g.away_team_raw == "Boston Celtics")
        assert bos_cle.home_team_raw == "Cleveland Cavaliers"
        assert bos_cle.away_score == 112
        assert bos_cle.home_score == 108
        assert bos_cle.stadium_raw == "Rocket Mortgage FieldHouse"

    def test_parses_scheduled_games(self):
        """Test parsing scheduled games from ESPN."""
        scraper = NBAScraper(season=2025)
        data = load_json_fixture(NBA_ESPN_SCOREBOARD_JSON)

        games = scraper._parse_espn_response(data, "http://espn.com/api")

        scheduled = [g for g in games if g.status == "scheduled"]
        assert len(scheduled) == 1

        hou_okc = scheduled[0]
        assert hou_okc.away_team_raw == "Houston Rockets"
        assert hou_okc.home_team_raw == "Oklahoma City Thunder"
        assert hou_okc.stadium_raw == "Paycom Center"

    def test_parses_venue_info(self):
        """Test venue information is extracted."""
        scraper = NBAScraper(season=2025)
        data = load_json_fixture(NBA_ESPN_SCOREBOARD_JSON)

        games = scraper._parse_espn_response(data, "http://espn.com/api")

        # Check all games have venue info
        for game in games:
            assert game.stadium_raw is not None


class TestGameNormalization:
    """Test game normalization and canonical ID generation."""

    def test_normalizes_games_with_canonical_ids(self):
        """Test games are normalized with correct canonical IDs."""
        scraper = NBAScraper(season=2025)
        raw_games = [
            RawGameData(
                game_date=datetime(2025, 10, 22),
                home_team_raw="Cleveland Cavaliers",
                away_team_raw="Boston Celtics",
                stadium_raw="Rocket Mortgage FieldHouse",
                home_score=108,
                away_score=112,
                status="final",
                source_url="http://example.com",
            )
        ]

        # Review items are not part of this test's assertions.
        games, _ = scraper._normalize_games(raw_games)

        assert len(games) == 1
        game = games[0]

        # Check canonical ID format; for this input it reads as
        # sport, season, away abbreviation, home abbreviation, MMDD.
        assert game.id == "nba_2025_bos_cle_1022"
        assert game.sport == "nba"
        assert game.season == 2025

        # Check team IDs
        assert game.home_team_id == "team_nba_cle"
        assert game.away_team_id == "team_nba_bos"

        # Check scores preserved
        assert game.home_score == 108
        assert game.away_score == 112

    def test_detects_doubleheaders(self):
        """Test doubleheaders get correct game numbers."""
        scraper = NBAScraper(season=2025)
        # Same matchup, same venue, same calendar day, different tip-off times.
        raw_games = [
            RawGameData(
                game_date=datetime(2025, 4, 1, 13, 0),
                home_team_raw="Boston Celtics",
                away_team_raw="New York Knicks",
                stadium_raw="TD Garden",
                status="final",
                home_score=105,
                away_score=98,
            ),
            RawGameData(
                game_date=datetime(2025, 4, 1, 19, 0),
                home_team_raw="Boston Celtics",
                away_team_raw="New York Knicks",
                stadium_raw="TD Garden",
                status="final",
                home_score=110,
                away_score=102,
            ),
        ]

        games, _ = scraper._normalize_games(raw_games)

        assert len(games) == 2

        # Sorting makes the assertions independent of output order.
        game_numbers = sorted(g.game_number for g in games)
        assert game_numbers == [1, 2]

        # Check IDs include game number
        game_ids = sorted(g.id for g in games)
        assert game_ids == ["nba_2025_nyk_bos_0401_1", "nba_2025_nyk_bos_0401_2"]

    def test_creates_review_items_for_unresolved_teams(self):
        """Test review items are created for unresolved teams."""
        scraper = NBAScraper(season=2025)
        raw_games = [
            RawGameData(
                game_date=datetime(2025, 10, 22),
                home_team_raw="Unknown Team XYZ",
                away_team_raw="Boston Celtics",
                stadium_raw="TD Garden",
                status="scheduled",
            ),
        ]

        games, review_items = scraper._normalize_games(raw_games)

        # Game should not be created due to unresolved team
        assert len(games) == 0

        # But there should be a review item
        assert len(review_items) >= 1


class TestTeamAndStadiumScraping:
    """Test team and stadium data scraping."""

    def test_scrapes_all_nba_teams(self):
        """Test all 30 NBA teams are returned."""
        scraper = NBAScraper(season=2025)

        teams = scraper.scrape_teams()

        # 30 NBA teams
        assert len(teams) == 30

        # Check team IDs are unique
        team_ids = [t.id for t in teams]
        assert len(set(team_ids)) == 30

        # Check all teams have required fields (truthy, i.e. non-empty)
        for team in teams:
            assert team.id.startswith("team_nba_")
            assert team.sport == "nba"
            assert team.city
            assert team.name
            assert team.full_name
            assert team.abbreviation

    def test_teams_have_conferences_and_divisions(self):
        """Test teams have conference info (15 Eastern, 15 Western)."""
        scraper = NBAScraper(season=2025)

        teams = scraper.scrape_teams()

        # Count teams by conference
        eastern = [t for t in teams if t.conference == "Eastern"]
        western = [t for t in teams if t.conference == "Western"]

        assert len(eastern) == 15
        assert len(western) == 15

    def test_scrapes_all_nba_stadiums(self):
        """Test all NBA stadiums are returned."""
        scraper = NBAScraper(season=2025)

        stadiums = scraper.scrape_stadiums()

        # Should have stadiums for all teams
        assert len(stadiums) == 30

        # Check stadium IDs are unique
        stadium_ids = [s.id for s in stadiums]
        assert len(set(stadium_ids)) == 30

        # Check all stadiums have required fields
        for stadium in stadiums:
            assert stadium.id.startswith("stadium_nba_")
            assert stadium.sport == "nba"
            assert stadium.name
            assert stadium.city
            assert stadium.state
            assert stadium.country in ["USA", "Canada"]
            # A zero coordinate is treated as "not filled in".
            assert stadium.latitude != 0
            assert stadium.longitude != 0


class TestScrapeFallback:
    """Test multi-source fallback behavior."""

    def test_falls_back_to_next_source_on_failure(self):
        """Test scraper tries next source when first fails."""
        scraper = NBAScraper(season=2025)

        # Only the first two sources are patched; the CBS scraper is left
        # untouched in this test.
        with patch.object(scraper, '_scrape_basketball_reference') as mock_br, \
             patch.object(scraper, '_scrape_espn') as mock_espn:

            # Make BR fail
            mock_br.side_effect = Exception("Connection failed")

            # Make ESPN return data
            mock_espn.return_value = [
                RawGameData(
                    game_date=datetime(2025, 10, 22),
                    home_team_raw="Cleveland Cavaliers",
                    away_team_raw="Boston Celtics",
                    stadium_raw="Rocket Mortgage FieldHouse",
                    status="scheduled",
                )
            ]

            result = scraper.scrape_games()

            # Should have succeeded with ESPN
            assert result.success
            assert result.source == "espn"
            assert mock_br.called
            assert mock_espn.called

    def test_returns_failure_when_all_sources_fail(self):
        """Test scraper returns failure when all sources fail."""
        scraper = NBAScraper(season=2025)

        with patch.object(scraper, '_scrape_basketball_reference') as mock_br, \
             patch.object(scraper, '_scrape_espn') as mock_espn, \
             patch.object(scraper, '_scrape_cbs') as mock_cbs:

            mock_br.side_effect = Exception("BR failed")
            mock_espn.side_effect = Exception("ESPN failed")
            mock_cbs.side_effect = Exception("CBS failed")

            result = scraper.scrape_games()

            # Failure is reported on the result object rather than raised,
            # and the message is expected to mention the CBS error.
            assert not result.success
            assert "All sources failed" in result.error_message
            assert "CBS failed" in result.error_message


class TestSeasonMonths:
    """Test season month calculation."""

    def test_gets_correct_season_months(self):
        """Test correct months are returned for NBA season."""
        scraper = NBAScraper(season=2025)

        months = scraper._get_season_months()

        # NBA season is Oct-Jun
        assert len(months) == 9  # Oct, Nov, Dec, Jan, Feb, Mar, Apr, May, Jun

        # Check first month is Oct of season year
        assert months[0] == (2025, 10)

        # Check last month is Jun of following year
        assert months[-1] == (2026, 6)

        # Check transition to new year
        assert months[2] == (2025, 12)  # December
        assert months[3] == (2026, 1)  # January