Files
Trey t 52d445bca4 feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:56:25 -06:00

429 lines
15 KiB
Python

"""Tests for NBA scraper."""
import json
from datetime import datetime
from unittest.mock import MagicMock, patch
import pytest
from sportstime_parser.scrapers.nba import NBAScraper, create_nba_scraper
from sportstime_parser.scrapers.base import RawGameData
from sportstime_parser.tests.fixtures import (
load_fixture,
load_json_fixture,
NBA_BR_OCTOBER_HTML,
NBA_BR_EDGE_CASES_HTML,
NBA_ESPN_SCOREBOARD_JSON,
)
class TestNBAScraperInit:
    """Checks on a freshly built NBAScraper and on its factory helper."""

    def test_creates_scraper_with_season(self):
        """Direct construction stores the sport key and the requested season."""
        nba = NBAScraper(season=2025)

        assert nba.sport == "nba"
        assert nba.season == 2025

    def test_factory_function_creates_scraper(self):
        """create_nba_scraper returns an NBAScraper carrying the given season."""
        built = create_nba_scraper(season=2025)

        assert isinstance(built, NBAScraper)
        assert built.season == 2025

    def test_expected_game_count(self):
        """The scraper advertises an expected game count of 1230."""
        expected_total = 1230

        assert NBAScraper(season=2025).expected_game_count == expected_total

    def test_sources_in_priority_order(self):
        """_get_sources lists Basketball-Reference first, then ESPN, then CBS."""
        expected_order = ["basketball_reference", "espn", "cbs"]

        nba = NBAScraper(season=2025)

        assert nba._get_sources() == expected_order
class TestBasketballReferenceParsing:
    """Test Basketball-Reference HTML parsing.

    Every test parses the ``NBA_BR_OCTOBER_HTML`` fixture with
    ``NBAScraper._parse_basketball_reference`` and inspects the returned
    raw game records.
    """

    def test_parses_completed_games(self):
        """Test parsing completed games with scores."""
        scraper = NBAScraper(season=2025)
        html = load_fixture(NBA_BR_OCTOBER_HTML)

        games = scraper._parse_basketball_reference(html, "http://example.com")

        # Should find all games in fixture
        assert len(games) == 7

        # Only the games flagged "final" count as completed
        completed_games = [g for g in games if g.status == "final"]
        assert len(completed_games) == 2

        # Boston @ Cleveland
        bos_cle = next(g for g in games if g.away_team_raw == "Boston Celtics")
        assert bos_cle.home_team_raw == "Cleveland Cavaliers"
        assert bos_cle.away_score == 112
        assert bos_cle.home_score == 108
        assert bos_cle.stadium_raw == "Rocket Mortgage FieldHouse"
        assert bos_cle.status == "final"

    def test_parses_scheduled_games(self):
        """Test parsing scheduled games without scores."""
        scraper = NBAScraper(season=2025)
        html = load_fixture(NBA_BR_OCTOBER_HTML)

        games = scraper._parse_basketball_reference(html, "http://example.com")

        scheduled_games = [g for g in games if g.status == "scheduled"]
        assert len(scheduled_games) == 5

        # Houston @ OKC
        hou_okc = next(
            g for g in scheduled_games if g.away_team_raw == "Houston Rockets"
        )
        assert hou_okc.home_team_raw == "Oklahoma City Thunder"
        assert hou_okc.away_score is None
        assert hou_okc.home_score is None
        assert hou_okc.stadium_raw == "Paycom Center"

    def test_parses_game_dates_correctly(self):
        """Test game dates are parsed correctly."""
        scraper = NBAScraper(season=2025)
        html = load_fixture(NBA_BR_OCTOBER_HTML)

        games = scraper._parse_basketball_reference(html, "http://example.com")

        # Check first game date
        first_game = games[0]
        assert first_game.game_date.year == 2025
        assert first_game.game_date.month == 10
        assert first_game.game_date.day == 22

    def test_tracks_source_url(self):
        """Test source URL is tracked for all games."""
        scraper = NBAScraper(season=2025)
        html = load_fixture(NBA_BR_OCTOBER_HTML)
        source_url = "http://basketball-reference.com/test"

        games = scraper._parse_basketball_reference(html, source_url)

        # Guard against a vacuous pass: without this, an empty parse result
        # would skip the loop below and the test would succeed trivially.
        assert games, "parser returned no games; source URL check would be vacuous"

        for game in games:
            assert game.source_url == source_url
class TestBasketballReferenceEdgeCases:
    """Unusual rows in the Basketball-Reference edge-case fixture."""

    @staticmethod
    def _parse_edge_case_fixture():
        """Parse NBA_BR_EDGE_CASES_HTML with a 2025 scraper and return the games."""
        parser = NBAScraper(season=2025)
        page = load_fixture(NBA_BR_EDGE_CASES_HTML)
        return parser._parse_basketball_reference(page, "http://example.com")

    def test_parses_postponed_games(self):
        """Exactly one game is postponed: Lakers at Suns."""
        parsed = self._parse_edge_case_fixture()

        matches = [game for game in parsed if game.status == "postponed"]

        assert len(matches) == 1
        only_match = matches[0]
        assert only_match.away_team_raw == "Los Angeles Lakers"
        assert only_match.home_team_raw == "Phoenix Suns"

    def test_parses_cancelled_games(self):
        """Exactly one game is cancelled, with Portland as the away side."""
        parsed = self._parse_edge_case_fixture()

        matches = [game for game in parsed if game.status == "cancelled"]

        assert len(matches) == 1
        assert matches[0].away_team_raw == "Portland Trail Blazers"

    def test_parses_neutral_site_games(self):
        """The game played at Arena CDMX keeps its teams and final status."""
        parsed = self._parse_edge_case_fixture()

        # Mexico City game, located by its venue name
        at_cdmx = next(
            game for game in parsed if game.stadium_raw == "Arena CDMX"
        )

        assert at_cdmx.away_team_raw == "Miami Heat"
        assert at_cdmx.home_team_raw == "Washington Wizards"
        assert at_cdmx.status == "final"

    def test_parses_overtime_games(self):
        """The 147-150 overtime game is parsed with both scores and final status."""
        parsed = self._parse_edge_case_fixture()

        # High scoring OT game, located by the away score
        high_scoring = next(game for game in parsed if game.away_score == 147)

        assert high_scoring.home_score == 150
        assert high_scoring.status == "final"
class TestESPNParsing:
    """Test ESPN API response parsing.

    Every test feeds the ``NBA_ESPN_SCOREBOARD_JSON`` fixture to
    ``NBAScraper._parse_espn_response`` and inspects the returned raw
    game records.
    """

    def test_parses_completed_games(self):
        """Test parsing completed games from ESPN."""
        scraper = NBAScraper(season=2025)
        data = load_json_fixture(NBA_ESPN_SCOREBOARD_JSON)

        games = scraper._parse_espn_response(data, "http://espn.com/api")

        completed = [g for g in games if g.status == "final"]
        assert len(completed) == 2

        # Boston @ Cleveland
        bos_cle = next(g for g in completed if g.away_team_raw == "Boston Celtics")
        assert bos_cle.home_team_raw == "Cleveland Cavaliers"
        assert bos_cle.away_score == 112
        assert bos_cle.home_score == 108
        assert bos_cle.stadium_raw == "Rocket Mortgage FieldHouse"

    def test_parses_scheduled_games(self):
        """Test parsing scheduled games from ESPN."""
        scraper = NBAScraper(season=2025)
        data = load_json_fixture(NBA_ESPN_SCOREBOARD_JSON)

        games = scraper._parse_espn_response(data, "http://espn.com/api")

        scheduled = [g for g in games if g.status == "scheduled"]
        assert len(scheduled) == 1

        # Houston @ OKC is the only scheduled game in the fixture
        hou_okc = scheduled[0]
        assert hou_okc.away_team_raw == "Houston Rockets"
        assert hou_okc.home_team_raw == "Oklahoma City Thunder"
        assert hou_okc.stadium_raw == "Paycom Center"

    def test_parses_venue_info(self):
        """Test venue information is extracted."""
        scraper = NBAScraper(season=2025)
        data = load_json_fixture(NBA_ESPN_SCOREBOARD_JSON)

        games = scraper._parse_espn_response(data, "http://espn.com/api")

        # Guard against a vacuous pass: without this, an empty parse result
        # would skip the loop below and the test would succeed trivially.
        assert games, "parser returned no games; venue check would be vacuous"

        # Check all games have venue info
        for game in games:
            assert game.stadium_raw is not None
class TestGameNormalization:
    """Test game normalization and canonical ID generation.

    Each test builds ``RawGameData`` records by hand and passes them to
    ``NBAScraper._normalize_games``, which returns a ``(games, review_items)``
    pair.
    """

    def test_normalizes_games_with_canonical_ids(self):
        """Test games are normalized with correct canonical IDs."""
        scraper = NBAScraper(season=2025)
        raw_games = [
            RawGameData(
                game_date=datetime(2025, 10, 22),
                home_team_raw="Cleveland Cavaliers",
                away_team_raw="Boston Celtics",
                stadium_raw="Rocket Mortgage FieldHouse",
                home_score=108,
                away_score=112,
                status="final",
                source_url="http://example.com",
            )
        ]

        # Review items are not inspected in this test.
        games, _ = scraper._normalize_games(raw_games)

        assert len(games) == 1
        game = games[0]

        # Check canonical ID format
        assert game.id == "nba_2025_bos_cle_1022"
        assert game.sport == "nba"
        assert game.season == 2025

        # Check team IDs
        assert game.home_team_id == "team_nba_cle"
        assert game.away_team_id == "team_nba_bos"

        # Check scores preserved
        assert game.home_score == 108
        assert game.away_score == 112

    def test_detects_doubleheaders(self):
        """Test doubleheaders get correct game numbers."""
        scraper = NBAScraper(season=2025)
        # Same matchup and date, different tip-off times.
        raw_games = [
            RawGameData(
                game_date=datetime(2025, 4, 1, 13, 0),
                home_team_raw="Boston Celtics",
                away_team_raw="New York Knicks",
                stadium_raw="TD Garden",
                status="final",
                home_score=105,
                away_score=98,
            ),
            RawGameData(
                game_date=datetime(2025, 4, 1, 19, 0),
                home_team_raw="Boston Celtics",
                away_team_raw="New York Knicks",
                stadium_raw="TD Garden",
                status="final",
                home_score=110,
                away_score=102,
            ),
        ]

        games, _ = scraper._normalize_games(raw_games)

        assert len(games) == 2

        # Sorted so the check does not depend on the order games come back in.
        game_numbers = sorted(g.game_number for g in games)
        assert game_numbers == [1, 2]

        # Check IDs include game number
        game_ids = sorted(g.id for g in games)
        assert game_ids == ["nba_2025_nyk_bos_0401_1", "nba_2025_nyk_bos_0401_2"]

    def test_creates_review_items_for_unresolved_teams(self):
        """Test review items are created for unresolved teams."""
        scraper = NBAScraper(season=2025)
        raw_games = [
            RawGameData(
                game_date=datetime(2025, 10, 22),
                home_team_raw="Unknown Team XYZ",
                away_team_raw="Boston Celtics",
                stadium_raw="TD Garden",
                status="scheduled",
            ),
        ]

        games, review_items = scraper._normalize_games(raw_games)

        # Game should not be created due to unresolved team
        assert len(games) == 0

        # But there should be a review item
        assert len(review_items) >= 1
class TestTeamAndStadiumScraping:
    """Checks on the team and stadium lists returned by NBAScraper."""

    def test_scrapes_all_nba_teams(self):
        """scrape_teams yields 30 distinct, fully populated NBA teams."""
        all_teams = NBAScraper(season=2025).scrape_teams()

        # 30 NBA teams
        assert len(all_teams) == 30

        # IDs must not repeat
        distinct_ids = {entry.id for entry in all_teams}
        assert len(distinct_ids) == 30

        # Every team carries the required fields
        for entry in all_teams:
            assert entry.id.startswith("team_nba_")
            assert entry.sport == "nba"
            for required in ("city", "name", "full_name", "abbreviation"):
                assert getattr(entry, required)

    def test_teams_have_conferences_and_divisions(self):
        """Teams split 15/15 between the Eastern and Western conferences."""
        all_teams = NBAScraper(season=2025).scrape_teams()

        # Tally teams per conference
        east_total = sum(1 for entry in all_teams if entry.conference == "Eastern")
        west_total = sum(1 for entry in all_teams if entry.conference == "Western")

        assert east_total == 15
        assert west_total == 15

    def test_scrapes_all_nba_stadiums(self):
        """scrape_stadiums yields 30 distinct, fully populated NBA stadiums."""
        venues = NBAScraper(season=2025).scrape_stadiums()

        # One stadium expected per team
        assert len(venues) == 30

        # IDs must not repeat
        distinct_ids = {venue.id for venue in venues}
        assert len(distinct_ids) == 30

        # Every stadium carries the required fields
        for venue in venues:
            assert venue.id.startswith("stadium_nba_")
            assert venue.sport == "nba"
            for required in ("name", "city", "state"):
                assert getattr(venue, required)
            assert venue.country in ("USA", "Canada")
            # A zero coordinate is rejected
            assert venue.latitude != 0
            assert venue.longitude != 0
class TestScrapeFallback:
    """How scrape_games behaves when individual sources raise."""

    def test_falls_back_to_next_source_on_failure(self):
        """When Basketball-Reference raises, the ESPN result is used instead."""
        nba = NBAScraper(season=2025)

        # Data the patched ESPN source hands back
        espn_games = [
            RawGameData(
                game_date=datetime(2025, 10, 22),
                home_team_raw="Cleveland Cavaliers",
                away_team_raw="Boston Celtics",
                stadium_raw="Rocket Mortgage FieldHouse",
                status="scheduled",
            )
        ]

        # BR is patched to fail; ESPN is patched to return the data above
        with patch.object(
            nba,
            "_scrape_basketball_reference",
            side_effect=Exception("Connection failed"),
        ) as br_source, patch.object(
            nba, "_scrape_espn", return_value=espn_games
        ) as espn_source:
            outcome = nba.scrape_games()

            # Should have succeeded with ESPN
            assert outcome.success
            assert outcome.source == "espn"
            assert br_source.called
            assert espn_source.called

    def test_returns_failure_when_all_sources_fail(self):
        """When every source raises, the result reports the overall failure."""
        nba = NBAScraper(season=2025)

        with patch.object(
            nba, "_scrape_basketball_reference", side_effect=Exception("BR failed")
        ), patch.object(
            nba, "_scrape_espn", side_effect=Exception("ESPN failed")
        ), patch.object(
            nba, "_scrape_cbs", side_effect=Exception("CBS failed")
        ):
            outcome = nba.scrape_games()

            assert not outcome.success
            assert "All sources failed" in outcome.error_message
            assert "CBS failed" in outcome.error_message
class TestSeasonMonths:
    """Checks on the (year, month) sequence from _get_season_months."""

    def test_gets_correct_season_months(self):
        """The 2025 season spans nine months, Oct 2025 through Jun 2026."""
        season_months = NBAScraper(season=2025)._get_season_months()

        # NBA season is Oct-Jun:
        # Oct, Nov, Dec, Jan, Feb, Mar, Apr, May, Jun
        assert len(season_months) == 9

        # Opens in October of the season year
        opening = season_months[0]
        assert opening == (2025, 10)

        # Closes in June of the following year
        closing = season_months[-1]
        assert closing == (2026, 6)

        # The year rolls over between December and January
        december = season_months[2]
        january = season_months[3]
        assert december == (2025, 12)
        assert january == (2026, 1)