feat(scripts): add sportstime-parser data pipeline

Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan are complete:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-20 18:56:25 -06:00
parent ac78042a7e
commit 52d445bca4
76 changed files with 25065 additions and 0 deletions

View File

@@ -0,0 +1,269 @@
"""Tests for alias loaders."""
import pytest
import json
import tempfile
from datetime import date
from pathlib import Path
from sportstime_parser.normalizers.alias_loader import (
TeamAliasLoader,
StadiumAliasLoader,
)
from sportstime_parser.models.aliases import AliasType
class TestTeamAliasLoader:
    """Tests for TeamAliasLoader class.

    Every test reads the same three-record fixture for ``nba_okc``: a
    historical name (Seattle SuperSonics, 1967-01-01 to 2008-07-02), the
    current name (Oklahoma City Thunder, from 2008-07-03) and the current
    abbreviation (OKC, from 2008-07-03).

    NOTE(review): most tests call ``resolve()`` / ``get_aliases_for_team()``
    without calling ``load()`` first, so they presumably rely on the loader
    loading on demand -- confirm against ``TeamAliasLoader``.
    """

    @pytest.fixture
    def sample_aliases_file(self, tmp_path: Path) -> Path:
        """Write the sample team aliases to a JSON file and return its path.

        The file is created inside pytest's ``tmp_path`` directory instead of
        via ``NamedTemporaryFile(delete=False)``, so no stray file is left
        behind in the system temp directory after each test.
        """
        data = [
            {
                "id": "1",
                "team_canonical_id": "nba_okc",
                "alias_type": "name",
                "alias_value": "Seattle SuperSonics",
                "valid_from": "1967-01-01",
                "valid_until": "2008-07-02",
            },
            {
                "id": "2",
                "team_canonical_id": "nba_okc",
                "alias_type": "name",
                "alias_value": "Oklahoma City Thunder",
                "valid_from": "2008-07-03",
                "valid_until": None,
            },
            {
                "id": "3",
                "team_canonical_id": "nba_okc",
                "alias_type": "abbreviation",
                "alias_value": "OKC",
                "valid_from": "2008-07-03",
                "valid_until": None,
            },
        ]
        aliases_path = tmp_path / "team_aliases.json"
        aliases_path.write_text(json.dumps(data), encoding="utf-8")
        return aliases_path

    def test_load_aliases(self, sample_aliases_file: Path) -> None:
        """Test loading aliases from file."""
        loader = TeamAliasLoader(sample_aliases_file)
        loader.load()
        # One loaded alias per record in the fixture file.
        assert len(loader._aliases) == 3

    def test_resolve_current_alias(self, sample_aliases_file: Path) -> None:
        """Test resolving a current alias."""
        loader = TeamAliasLoader(sample_aliases_file)

        # Current date should resolve to Thunder
        result = loader.resolve("Oklahoma City Thunder")
        assert result == "nba_okc"

        # Abbreviation should also work
        result = loader.resolve("OKC")
        assert result == "nba_okc"

    def test_resolve_historical_alias(self, sample_aliases_file: Path) -> None:
        """Test resolving a historical alias with date."""
        loader = TeamAliasLoader(sample_aliases_file)

        # Historical date should resolve SuperSonics
        result = loader.resolve(
            "Seattle SuperSonics", check_date=date(2007, 1, 1)
        )
        assert result == "nba_okc"

        # After relocation, SuperSonics shouldn't resolve
        result = loader.resolve(
            "Seattle SuperSonics", check_date=date(2010, 1, 1)
        )
        assert result is None

    def test_resolve_case_insensitive(self, sample_aliases_file: Path) -> None:
        """Test case insensitive resolution."""
        loader = TeamAliasLoader(sample_aliases_file)

        result = loader.resolve("oklahoma city thunder")
        assert result == "nba_okc"

        result = loader.resolve("okc")
        assert result == "nba_okc"

    def test_resolve_with_type_filter(self, sample_aliases_file: Path) -> None:
        """Test filtering by alias type."""
        loader = TeamAliasLoader(sample_aliases_file)

        # Should find when searching all types
        result = loader.resolve("OKC")
        assert result == "nba_okc"

        # Should not find when filtering to name only
        result = loader.resolve("OKC", alias_types=[AliasType.NAME])
        assert result is None

    def test_get_aliases_for_team(self, sample_aliases_file: Path) -> None:
        """Test getting all aliases for a team."""
        loader = TeamAliasLoader(sample_aliases_file)

        aliases = loader.get_aliases_for_team("nba_okc")
        assert len(aliases) == 3

        # Filter by current date
        aliases = loader.get_aliases_for_team(
            "nba_okc", check_date=date(2020, 1, 1)
        )
        assert len(aliases) == 2  # Thunder name + OKC abbreviation

    def test_missing_file(self) -> None:
        """Test handling of missing file."""
        loader = TeamAliasLoader(Path("/nonexistent/file.json"))
        loader.load()  # Should not raise
        assert len(loader._aliases) == 0
class TestStadiumAliasLoader:
    """Tests for StadiumAliasLoader class.

    The fixture describes one venue, ``crypto_arena_los_angeles_ca``, under
    two names with adjacent validity windows: "Staples Center" (1999-10-17 to
    2021-12-24) and "Crypto.com Arena" (from 2021-12-25).

    NOTE(review): most tests call ``resolve()`` / ``get_all_names()`` without
    calling ``load()`` first, so they presumably rely on the loader loading
    on demand -- confirm against ``StadiumAliasLoader``.
    """

    @pytest.fixture
    def sample_stadium_aliases(self, tmp_path: Path) -> Path:
        """Write the sample stadium aliases to a JSON file and return its path.

        The file is created inside pytest's ``tmp_path`` directory instead of
        via ``NamedTemporaryFile(delete=False)``, so no stray file is left
        behind in the system temp directory after each test.
        """
        data = [
            {
                "alias_name": "Crypto.com Arena",
                "stadium_canonical_id": "crypto_arena_los_angeles_ca",
                "valid_from": "2021-12-25",
                "valid_until": None,
            },
            {
                "alias_name": "Staples Center",
                "stadium_canonical_id": "crypto_arena_los_angeles_ca",
                "valid_from": "1999-10-17",
                "valid_until": "2021-12-24",
            },
        ]
        aliases_path = tmp_path / "stadium_aliases.json"
        aliases_path.write_text(json.dumps(data), encoding="utf-8")
        return aliases_path

    def test_load_stadium_aliases(self, sample_stadium_aliases: Path) -> None:
        """Test loading stadium aliases."""
        loader = StadiumAliasLoader(sample_stadium_aliases)
        loader.load()
        # One loaded alias per record in the fixture file.
        assert len(loader._aliases) == 2

    def test_resolve_current_name(self, sample_stadium_aliases: Path) -> None:
        """Test resolving current stadium name."""
        loader = StadiumAliasLoader(sample_stadium_aliases)

        result = loader.resolve("Crypto.com Arena")
        assert result == "crypto_arena_los_angeles_ca"

    def test_resolve_historical_name(
        self, sample_stadium_aliases: Path
    ) -> None:
        """Test resolving historical stadium name."""
        loader = StadiumAliasLoader(sample_stadium_aliases)

        # Staples Center in 2020
        result = loader.resolve("Staples Center", check_date=date(2020, 1, 1))
        assert result == "crypto_arena_los_angeles_ca"

        # Staples Center after rename shouldn't resolve
        result = loader.resolve("Staples Center", check_date=date(2023, 1, 1))
        assert result is None

    def test_date_boundary(self, sample_stadium_aliases: Path) -> None:
        """Test exact date boundaries.

        Both checks use the boundary dates written in the fixture, so they
        pin ``valid_until`` and ``valid_from`` as inclusive.
        """
        loader = StadiumAliasLoader(sample_stadium_aliases)

        # Last day of Staples Center
        result = loader.resolve(
            "Staples Center", check_date=date(2021, 12, 24)
        )
        assert result == "crypto_arena_los_angeles_ca"

        # First day of Crypto.com Arena
        result = loader.resolve(
            "Crypto.com Arena", check_date=date(2021, 12, 25)
        )
        assert result == "crypto_arena_los_angeles_ca"

    def test_get_all_names(self, sample_stadium_aliases: Path) -> None:
        """Test getting all stadium names."""
        loader = StadiumAliasLoader(sample_stadium_aliases)

        names = loader.get_all_names()
        assert len(names) == 2
        assert "Crypto.com Arena" in names
        assert "Staples Center" in names
class TestDateRangeHandling:
    """Tests for date range edge cases in aliases.

    The fixture holds three aliases for ``test_team`` that differ only in
    their validity window: no bounds at all, a lower bound only
    (``valid_from`` 2030-01-01), and an upper bound only (``valid_until``
    2000-01-01). Every test passes an explicit ``check_date`` so results do
    not depend on the day the suite runs.
    """

    @pytest.fixture
    def date_range_aliases(self, tmp_path: Path) -> Path:
        """Write aliases with various date range scenarios to a JSON file.

        The file is created inside pytest's ``tmp_path`` directory instead of
        via ``NamedTemporaryFile(delete=False)``, so no stray file is left
        behind in the system temp directory after each test.

        Returns:
            Path to the JSON file containing the three alias records.
        """
        data = [
            {
                "id": "1",
                "team_canonical_id": "test_team",
                "alias_type": "name",
                "alias_value": "Always Valid",
                "valid_from": None,
                "valid_until": None,
            },
            {
                "id": "2",
                "team_canonical_id": "test_team",
                "alias_type": "name",
                "alias_value": "Future Only",
                "valid_from": "2030-01-01",
                "valid_until": None,
            },
            {
                "id": "3",
                "team_canonical_id": "test_team",
                "alias_type": "name",
                "alias_value": "Past Only",
                "valid_from": None,
                "valid_until": "2000-01-01",
            },
        ]
        aliases_path = tmp_path / "date_range_aliases.json"
        aliases_path.write_text(json.dumps(data), encoding="utf-8")
        return aliases_path

    def test_always_valid_alias(self, date_range_aliases: Path) -> None:
        """Test alias with no date restrictions."""
        loader = TeamAliasLoader(date_range_aliases)

        result = loader.resolve("Always Valid", check_date=date(2025, 1, 1))
        assert result == "test_team"

        result = loader.resolve("Always Valid", check_date=date(1990, 1, 1))
        assert result == "test_team"

    def test_future_only_alias(self, date_range_aliases: Path) -> None:
        """Test alias that starts in the future."""
        loader = TeamAliasLoader(date_range_aliases)

        # Before valid_from
        result = loader.resolve("Future Only", check_date=date(2025, 1, 1))
        assert result is None

        # After valid_from
        result = loader.resolve("Future Only", check_date=date(2035, 1, 1))
        assert result == "test_team"

    def test_past_only_alias(self, date_range_aliases: Path) -> None:
        """Test alias that expired in the past."""
        loader = TeamAliasLoader(date_range_aliases)

        # Before valid_until
        result = loader.resolve("Past Only", check_date=date(1990, 1, 1))
        assert result == "test_team"

        # After valid_until
        result = loader.resolve("Past Only", check_date=date(2025, 1, 1))
        assert result is None