feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
269
sportstime_parser/tests/test_alias_loader.py
Normal file
269
sportstime_parser/tests/test_alias_loader.py
Normal file
@@ -0,0 +1,269 @@
|
||||
"""Tests for alias loaders."""
|
||||
|
||||
import pytest
|
||||
import json
|
||||
import tempfile
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
from sportstime_parser.normalizers.alias_loader import (
|
||||
TeamAliasLoader,
|
||||
StadiumAliasLoader,
|
||||
)
|
||||
from sportstime_parser.models.aliases import AliasType
|
||||
|
||||
|
||||
class TestTeamAliasLoader:
    """Tests for TeamAliasLoader class."""

    @pytest.fixture
    def sample_aliases_file(self, tmp_path):
        """Write a small team-alias JSON file and return its path.

        The file contains three aliases for ``nba_okc``: the historical
        "Seattle SuperSonics" name (1967-01-01 to 2008-07-02), the
        open-ended "Oklahoma City Thunder" name, and the open-ended
        "OKC" abbreviation (both from 2008-07-03).

        The file is created under pytest's ``tmp_path`` so pytest owns its
        lifetime and cleanup, instead of leaking a ``delete=False``
        temporary file into the system temp directory on every test.
        """
        data = [
            {
                "id": "1",
                "team_canonical_id": "nba_okc",
                "alias_type": "name",
                "alias_value": "Seattle SuperSonics",
                "valid_from": "1967-01-01",
                "valid_until": "2008-07-02",
            },
            {
                "id": "2",
                "team_canonical_id": "nba_okc",
                "alias_type": "name",
                "alias_value": "Oklahoma City Thunder",
                "valid_from": "2008-07-03",
                "valid_until": None,
            },
            {
                "id": "3",
                "team_canonical_id": "nba_okc",
                "alias_type": "abbreviation",
                "alias_value": "OKC",
                "valid_from": "2008-07-03",
                "valid_until": None,
            },
        ]
        aliases_path = tmp_path / "team_aliases.json"
        aliases_path.write_text(json.dumps(data), encoding="utf-8")
        return aliases_path

    def test_load_aliases(self, sample_aliases_file):
        """Test loading aliases from file."""
        loader = TeamAliasLoader(sample_aliases_file)
        loader.load()
        assert len(loader._aliases) == 3

    def test_resolve_current_alias(self, sample_aliases_file):
        """Test resolving a current alias."""
        loader = TeamAliasLoader(sample_aliases_file)

        # No check_date is passed, so the loader's default date applies;
        # the open-ended Thunder alias is expected to resolve.
        result = loader.resolve("Oklahoma City Thunder")
        assert result == "nba_okc"

        # Abbreviation should also work
        result = loader.resolve("OKC")
        assert result == "nba_okc"

    def test_resolve_historical_alias(self, sample_aliases_file):
        """Test resolving a historical alias with date."""
        loader = TeamAliasLoader(sample_aliases_file)

        # Historical date should resolve SuperSonics
        result = loader.resolve("Seattle SuperSonics", check_date=date(2007, 1, 1))
        assert result == "nba_okc"

        # After relocation, SuperSonics shouldn't resolve
        result = loader.resolve("Seattle SuperSonics", check_date=date(2010, 1, 1))
        assert result is None

    def test_resolve_case_insensitive(self, sample_aliases_file):
        """Test case insensitive resolution."""
        loader = TeamAliasLoader(sample_aliases_file)

        result = loader.resolve("oklahoma city thunder")
        assert result == "nba_okc"

        result = loader.resolve("okc")
        assert result == "nba_okc"

    def test_resolve_with_type_filter(self, sample_aliases_file):
        """Test filtering by alias type."""
        loader = TeamAliasLoader(sample_aliases_file)

        # Should find when searching all types
        result = loader.resolve("OKC")
        assert result == "nba_okc"

        # Should not find when filtering to name only
        result = loader.resolve("OKC", alias_types=[AliasType.NAME])
        assert result is None

    def test_get_aliases_for_team(self, sample_aliases_file):
        """Test getting all aliases for a team."""
        loader = TeamAliasLoader(sample_aliases_file)

        aliases = loader.get_aliases_for_team("nba_okc")
        assert len(aliases) == 3

        # Filter by current date
        aliases = loader.get_aliases_for_team(
            "nba_okc", check_date=date(2020, 1, 1)
        )
        assert len(aliases) == 2  # Thunder name + OKC abbreviation

    def test_missing_file(self):
        """Test handling of missing file."""
        loader = TeamAliasLoader(Path("/nonexistent/file.json"))
        loader.load()  # Should not raise
        assert len(loader._aliases) == 0
|
||||
|
||||
|
||||
class TestStadiumAliasLoader:
    """Tests for StadiumAliasLoader class."""

    @pytest.fixture
    def sample_stadium_aliases(self, tmp_path):
        """Write a small stadium-alias JSON file and return its path.

        The file contains two names for ``crypto_arena_los_angeles_ca``:
        "Staples Center" (1999-10-17 to 2021-12-24) and the open-ended
        "Crypto.com Arena" (from 2021-12-25).

        The file is created under pytest's ``tmp_path`` so pytest owns its
        lifetime and cleanup, instead of leaking a ``delete=False``
        temporary file into the system temp directory on every test.
        """
        data = [
            {
                "alias_name": "Crypto.com Arena",
                "stadium_canonical_id": "crypto_arena_los_angeles_ca",
                "valid_from": "2021-12-25",
                "valid_until": None,
            },
            {
                "alias_name": "Staples Center",
                "stadium_canonical_id": "crypto_arena_los_angeles_ca",
                "valid_from": "1999-10-17",
                "valid_until": "2021-12-24",
            },
        ]
        aliases_path = tmp_path / "stadium_aliases.json"
        aliases_path.write_text(json.dumps(data), encoding="utf-8")
        return aliases_path

    def test_load_stadium_aliases(self, sample_stadium_aliases):
        """Test loading stadium aliases."""
        loader = StadiumAliasLoader(sample_stadium_aliases)
        loader.load()
        assert len(loader._aliases) == 2

    def test_resolve_current_name(self, sample_stadium_aliases):
        """Test resolving current stadium name."""
        loader = StadiumAliasLoader(sample_stadium_aliases)

        result = loader.resolve("Crypto.com Arena")
        assert result == "crypto_arena_los_angeles_ca"

    def test_resolve_historical_name(self, sample_stadium_aliases):
        """Test resolving historical stadium name."""
        loader = StadiumAliasLoader(sample_stadium_aliases)

        # Staples Center in 2020
        result = loader.resolve("Staples Center", check_date=date(2020, 1, 1))
        assert result == "crypto_arena_los_angeles_ca"

        # Staples Center after rename shouldn't resolve
        result = loader.resolve("Staples Center", check_date=date(2023, 1, 1))
        assert result is None

    def test_date_boundary(self, sample_stadium_aliases):
        """Test exact date boundaries (both endpoints are expected to match)."""
        loader = StadiumAliasLoader(sample_stadium_aliases)

        # Last day of Staples Center
        result = loader.resolve("Staples Center", check_date=date(2021, 12, 24))
        assert result == "crypto_arena_los_angeles_ca"

        # First day of Crypto.com Arena
        result = loader.resolve("Crypto.com Arena", check_date=date(2021, 12, 25))
        assert result == "crypto_arena_los_angeles_ca"

    def test_get_all_names(self, sample_stadium_aliases):
        """Test getting all stadium names."""
        loader = StadiumAliasLoader(sample_stadium_aliases)

        names = loader.get_all_names()
        assert len(names) == 2
        assert "Crypto.com Arena" in names
        assert "Staples Center" in names
|
||||
|
||||
|
||||
class TestDateRangeHandling:
    """Tests for date range edge cases in aliases."""

    @pytest.fixture
    def date_range_aliases(self, tmp_path):
        """Write team aliases covering three date-range shapes; return the path.

        All three aliases map to ``test_team``:

        * "Always Valid" -- no ``valid_from`` and no ``valid_until``.
        * "Future Only"  -- ``valid_from`` 2030-01-01, no ``valid_until``.
        * "Past Only"    -- no ``valid_from``, ``valid_until`` 2000-01-01.

        The file is created under pytest's ``tmp_path`` so pytest owns its
        lifetime and cleanup, instead of leaking a ``delete=False``
        temporary file into the system temp directory on every test.
        """
        data = [
            {
                "id": "1",
                "team_canonical_id": "test_team",
                "alias_type": "name",
                "alias_value": "Always Valid",
                "valid_from": None,
                "valid_until": None,
            },
            {
                "id": "2",
                "team_canonical_id": "test_team",
                "alias_type": "name",
                "alias_value": "Future Only",
                "valid_from": "2030-01-01",
                "valid_until": None,
            },
            {
                "id": "3",
                "team_canonical_id": "test_team",
                "alias_type": "name",
                "alias_value": "Past Only",
                "valid_from": None,
                "valid_until": "2000-01-01",
            },
        ]
        aliases_path = tmp_path / "date_range_aliases.json"
        aliases_path.write_text(json.dumps(data), encoding="utf-8")
        return aliases_path

    def test_always_valid_alias(self, date_range_aliases):
        """Test alias with no date restrictions."""
        loader = TeamAliasLoader(date_range_aliases)

        result = loader.resolve("Always Valid", check_date=date(2025, 1, 1))
        assert result == "test_team"

        result = loader.resolve("Always Valid", check_date=date(1990, 1, 1))
        assert result == "test_team"

    def test_future_only_alias(self, date_range_aliases):
        """Test alias that starts in the future."""
        loader = TeamAliasLoader(date_range_aliases)

        # Before valid_from
        result = loader.resolve("Future Only", check_date=date(2025, 1, 1))
        assert result is None

        # After valid_from
        result = loader.resolve("Future Only", check_date=date(2035, 1, 1))
        assert result == "test_team"

    def test_past_only_alias(self, date_range_aliases):
        """Test alias that expired in the past."""
        loader = TeamAliasLoader(date_range_aliases)

        # Before valid_until
        result = loader.resolve("Past Only", check_date=date(1990, 1, 1))
        assert result == "test_team"

        # After valid_until
        result = loader.resolve("Past Only", check_date=date(2025, 1, 1))
        assert result is None
|
||||
Reference in New Issue
Block a user