Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
318 lines
10 KiB
Python
318 lines
10 KiB
Python
"""Tests for NHL scraper."""
|
|
|
|
from datetime import datetime
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
from sportstime_parser.scrapers.nhl import NHLScraper, create_nhl_scraper
|
|
from sportstime_parser.scrapers.base import RawGameData
|
|
from sportstime_parser.tests.fixtures import (
|
|
load_json_fixture,
|
|
NHL_ESPN_SCOREBOARD_JSON,
|
|
)
|
|
|
|
|
|
class TestNHLScraperInit:
    """Construction and static configuration of ``NHLScraper``."""

    def test_creates_scraper_with_season(self):
        """A directly constructed scraper reports sport ``nhl`` and its season."""
        nhl = NHLScraper(season=2025)
        assert nhl.sport == "nhl"
        assert nhl.season == 2025

    def test_factory_function_creates_scraper(self):
        """``create_nhl_scraper`` returns an ``NHLScraper`` for the given season."""
        built = create_nhl_scraper(season=2025)
        assert isinstance(built, NHLScraper)
        assert built.season == 2025

    def test_expected_game_count(self):
        """The scraper's expected game count is 1312."""
        assert NHLScraper(season=2025).expected_game_count == 1312

    def test_sources_in_priority_order(self):
        """``_get_sources`` lists hockey_reference, then nhl_api, then espn."""
        expected_order = ["hockey_reference", "nhl_api", "espn"]
        assert NHLScraper(season=2025)._get_sources() == expected_order
|
|
|
|
|
|
class TestESPNParsing:
    """Parsing of the recorded ESPN scoreboard fixture into raw games."""

    @staticmethod
    def _parse_fixture():
        """Parse the NHL ESPN scoreboard fixture with a fresh 2025 scraper.

        Returns:
            Whatever ``NHLScraper._parse_espn_response`` produces for the
            fixture payload (iterated as a collection of raw games below).
        """
        scraper = NHLScraper(season=2025)
        payload = load_json_fixture(NHL_ESPN_SCOREBOARD_JSON)
        return scraper._parse_espn_response(payload, "http://espn.com/api")

    def test_parses_completed_games(self):
        """Games with status ``final`` carry teams, scores and venue."""
        completed = [g for g in self._parse_fixture() if g.status == "final"]
        assert len(completed) == 2

        # Penguins @ Bruins. A default of None turns a missing game into an
        # assertion failure instead of a bare StopIteration error.
        pit_bos = next(
            (g for g in completed if g.away_team_raw == "Pittsburgh Penguins"),
            None,
        )
        assert pit_bos is not None, "Penguins @ Bruins game not found"
        assert pit_bos.home_team_raw == "Boston Bruins"
        assert pit_bos.away_score == 2
        assert pit_bos.home_score == 4
        assert pit_bos.stadium_raw == "TD Garden"

    def test_parses_scheduled_games(self):
        """Games with status ``scheduled`` carry teams and venue."""
        scheduled = [g for g in self._parse_fixture() if g.status == "scheduled"]
        assert len(scheduled) == 1

        vgk_lak = scheduled[0]
        assert vgk_lak.away_team_raw == "Vegas Golden Knights"
        assert vgk_lak.home_team_raw == "Los Angeles Kings"
        assert vgk_lak.stadium_raw == "Crypto.com Arena"

    def test_parses_venue_info(self):
        """Every parsed game has a non-``None`` raw stadium name."""
        games = self._parse_fixture()

        # Without this guard the loop below would pass vacuously if the
        # parser returned nothing at all.
        assert games, "fixture produced no games"

        for game in games:
            assert game.stadium_raw is not None
|
|
|
|
|
|
class TestGameNormalization:
    """Turning raw scraped games into normalized records with canonical IDs."""

    def test_normalizes_games_with_canonical_ids(self):
        """A resolvable final game yields one record with canonical IDs and scores."""
        final_game = RawGameData(
            game_date=datetime(2025, 10, 8),
            home_team_raw="Boston Bruins",
            away_team_raw="Pittsburgh Penguins",
            stadium_raw="TD Garden",
            home_score=4,
            away_score=2,
            status="final",
            source_url="http://example.com",
        )

        normalized, _review = NHLScraper(season=2025)._normalize_games([final_game])

        assert len(normalized) == 1
        record = normalized[0]

        # Canonical game ID plus sport/season metadata
        assert record.id == "nhl_2025_pit_bos_1008"
        assert record.sport == "nhl"
        assert record.season == 2025

        # Canonical team IDs
        assert record.home_team_id == "team_nhl_bos"
        assert record.away_team_id == "team_nhl_pit"

        # Scores pass through unchanged
        assert record.home_score == 4
        assert record.away_score == 2

    def test_creates_review_items_for_unresolved_teams(self):
        """An unresolvable home team yields no game but at least one review item."""
        unknown_home = RawGameData(
            game_date=datetime(2025, 10, 8),
            home_team_raw="Unknown Team XYZ",
            away_team_raw="Boston Bruins",
            stadium_raw="TD Garden",
            status="scheduled",
        )

        normalized, flagged = NHLScraper(season=2025)._normalize_games([unknown_home])

        # No game is produced for the unresolved team ...
        assert len(normalized) == 0

        # ... but the problem is surfaced for manual review.
        assert len(flagged) >= 1
|
|
|
|
|
|
class TestTeamAndStadiumScraping:
    """Team and stadium data returned by the scraper."""

    def test_scrapes_all_nhl_teams(self):
        """``scrape_teams`` returns 32 uniquely identified, fully populated teams."""
        roster = NHLScraper(season=2025).scrape_teams()

        # 32 NHL teams, each with its own ID
        assert len(roster) == 32
        distinct_ids = {club.id for club in roster}
        assert len(distinct_ids) == 32

        # Required fields are present on every team
        for club in roster:
            assert club.id.startswith("team_nhl_")
            assert club.sport == "nhl"
            assert club.city
            assert club.name
            assert club.full_name
            assert club.abbreviation

    def test_teams_have_conferences_and_divisions(self):
        """Teams split 16/16 between the Eastern and Western conferences."""
        roster = NHLScraper(season=2025).scrape_teams()

        east_count = sum(1 for club in roster if club.conference == "Eastern")
        west_count = sum(1 for club in roster if club.conference == "Western")

        assert east_count == 16
        assert west_count == 16

    def test_scrapes_all_nhl_stadiums(self):
        """``scrape_stadiums`` returns 32 uniquely identified, fully populated venues."""
        venues = NHLScraper(season=2025).scrape_stadiums()

        # One stadium per team, each with its own ID
        assert len(venues) == 32
        distinct_ids = {venue.id for venue in venues}
        assert len(distinct_ids) == 32

        # Required fields are present on every stadium
        for venue in venues:
            assert venue.id.startswith("stadium_nhl_")
            assert venue.sport == "nhl"
            assert venue.name
            assert venue.city
            assert venue.state
            assert venue.country in ("USA", "Canada")
            assert venue.latitude != 0
            assert venue.longitude != 0
|
|
|
|
|
|
class TestScrapeFallback:
    """Fallback across sources when earlier ones fail."""

    def test_falls_back_to_next_source_on_failure(self):
        """ESPN data is used once Hockey Reference and the NHL API both raise."""
        scraper = NHLScraper(season=2025)

        espn_games = [
            RawGameData(
                game_date=datetime(2025, 10, 8),
                home_team_raw="Boston Bruins",
                away_team_raw="Pittsburgh Penguins",
                stadium_raw="TD Garden",
                status="scheduled",
            )
        ]

        # The first two sources raise; only ESPN returns data.
        with patch.object(
            scraper,
            "_scrape_hockey_reference",
            side_effect=Exception("Connection failed"),
        ) as mock_hr, patch.object(
            scraper,
            "_scrape_nhl_api",
            side_effect=Exception("API error"),
        ) as mock_nhl, patch.object(
            scraper,
            "_scrape_espn",
            return_value=espn_games,
        ) as mock_espn:
            result = scraper.scrape_games()

            assert result.success
            assert result.source == "espn"
            assert mock_hr.called
            assert mock_nhl.called
            assert mock_espn.called
|
|
|
|
|
|
class TestSeasonMonths:
    """Month list produced for a season."""

    def test_gets_correct_season_months(self):
        """The 2025 season spans October 2025 through June 2026 (nine months)."""
        season_months = NHLScraper(season=2025)._get_season_months()

        # Oct, Nov, Dec, Jan, Feb, Mar, Apr, May, Jun
        assert len(season_months) == 9

        # Starts in October of the season year, ends in June of the next year
        opening, closing = season_months[0], season_months[-1]
        assert opening == (2025, 10)
        assert closing == (2026, 6)

        # The year rolls over between the third and fourth entries
        december, january = season_months[2], season_months[3]
        assert december == (2025, 12)
        assert january == (2026, 1)
|
|
|
|
|
|
class TestInternationalFiltering:
    """Handling of international versus North American games.

    Note: the filtering is done by ``_parse_espn_response``, not by
    ``_normalize_games``.
    """

    def test_filters_european_games_during_parsing(self):
        """A neutral-site game in Prague is dropped while parsing ESPN data."""
        # ESPN-like payload for a Global Series game played in Prague
        prague_venue = {
            "fullName": "O2 Arena, Prague",
            "address": {"city": "Prague", "country": "Czech Republic"},
        }
        matchup = [
            {"homeAway": "home", "team": {"displayName": "Florida Panthers"}},
            {"homeAway": "away", "team": {"displayName": "Dallas Stars"}},
        ]
        competition = {
            "neutralSite": True,
            "venue": prague_venue,
            "competitors": matchup,
        }
        espn_payload = {
            "events": [
                {
                    "date": "2025-10-10T18:00:00Z",
                    "competitions": [competition],
                }
            ]
        }

        parsed = NHLScraper(season=2025)._parse_espn_response(
            espn_payload, "http://espn.com/api"
        )

        # Nothing survives: the only event was the Prague game
        assert len(parsed) == 0

    def test_keeps_north_american_games(self):
        """A game at TD Garden passes through normalization."""
        home_game = RawGameData(
            game_date=datetime(2025, 10, 8),
            home_team_raw="Boston Bruins",
            away_team_raw="Pittsburgh Penguins",
            stadium_raw="TD Garden",
            status="scheduled",
        )

        kept, _ = NHLScraper(season=2025)._normalize_games([home_game])

        assert len(kept) == 1
|