feat(scripts): add sportstime-parser data pipeline

Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan are complete:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-20 18:56:25 -06:00
parent ac78042a7e
commit 52d445bca4
76 changed files with 25065 additions and 0 deletions

View File

@@ -0,0 +1,312 @@
"""Alias file loaders for team and stadium name resolution."""
import json
from datetime import date
from pathlib import Path
from typing import Optional
from ..config import TEAM_ALIASES_FILE, STADIUM_ALIASES_FILE
from ..models.aliases import TeamAlias, StadiumAlias, AliasType
class TeamAliasLoader:
    """Loader for team aliases with date-aware resolution.

    Reads team aliases from a JSON file and keeps two in-memory indexes:
    one keyed by the normalized alias value and one keyed by the canonical
    team ID. The file is read lazily the first time a lookup method is
    called, or eagerly via load().
    """

    def __init__(self, filepath: Optional[Path] = None):
        """Initialize the loader without reading the file.

        Args:
            filepath: Path to team_aliases.json, defaults to config value
        """
        self.filepath = filepath or TEAM_ALIASES_FILE
        self._aliases: list[TeamAlias] = []
        self._by_value: dict[str, list[TeamAlias]] = {}
        self._by_team: dict[str, list[TeamAlias]] = {}
        self._loaded = False

    @staticmethod
    def _normalize(value: str) -> str:
        """Return the index key for an alias value (lowercased, trimmed)."""
        return value.lower().strip()

    def load(self) -> None:
        """Load (or reload) aliases from the JSON file.

        A missing file is treated as an empty alias set. Errors raised
        while reading or parsing the file propagate to the caller; in that
        case the previously loaded state is left untouched.
        """
        # Build into locals and swap in at the end so a failure part-way
        # through the file never leaves half-populated indexes behind.
        aliases: list[TeamAlias] = []
        by_value: dict[str, list[TeamAlias]] = {}
        by_team: dict[str, list[TeamAlias]] = {}

        if self.filepath.exists():
            with open(self.filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
            for item in data:
                alias = TeamAlias.from_dict(item)
                aliases.append(alias)
                # Use the same normalization as resolve() so every stored
                # alias is reachable by lookup.
                value_key = self._normalize(alias.alias_value)
                by_value.setdefault(value_key, []).append(alias)
                by_team.setdefault(alias.team_canonical_id, []).append(alias)

        self._aliases = aliases
        self._by_value = by_value
        self._by_team = by_team
        self._loaded = True

    def _ensure_loaded(self) -> None:
        """Load the alias file on first use."""
        if not self._loaded:
            self.load()

    def resolve(
        self,
        value: str,
        check_date: Optional[date] = None,
        alias_types: "Optional[list[AliasType]]" = None,
    ) -> Optional[str]:
        """Resolve an alias value to a canonical team ID.

        Candidates are checked in file order and the first one that passes
        both the type filter and the date check wins.

        Args:
            value: Alias value to look up (case-insensitive, surrounding
                whitespace ignored)
            check_date: Date to check validity (None = current date)
            alias_types: Types of aliases to check (None or empty = all types)

        Returns:
            Canonical team ID if found, None otherwise
        """
        self._ensure_loaded()
        if check_date is None:
            check_date = date.today()

        for alias in self._by_value.get(self._normalize(value), []):
            if alias_types and alias.alias_type not in alias_types:
                continue
            if alias.is_valid_on(check_date):
                return alias.team_canonical_id
        return None

    def get_aliases_for_team(
        self,
        team_id: str,
        check_date: Optional[date] = None,
    ) -> "list[TeamAlias]":
        """Get all aliases for a team.

        Args:
            team_id: Canonical team ID
            check_date: Date to filter by (None = all aliases)

        Returns:
            A new list of TeamAlias objects (empty if the team is unknown).
            The list is a copy, so callers may modify it freely.
        """
        self._ensure_loaded()
        aliases = self._by_team.get(team_id, [])
        if check_date is not None:
            return [a for a in aliases if a.is_valid_on(check_date)]
        # Copy so callers cannot mutate the internal index.
        return list(aliases)

    def get_all_values(
        self,
        alias_type: "Optional[AliasType]" = None,
    ) -> list[str]:
        """Get all alias values.

        Args:
            alias_type: Filter by alias type (None = all types)

        Returns:
            List of alias values, in file order and as written in the file
        """
        self._ensure_loaded()
        return [
            alias.alias_value
            for alias in self._aliases
            if alias_type is None or alias.alias_type == alias_type
        ]
class StadiumAliasLoader:
    """Loader for stadium aliases with date-aware resolution.

    Reads stadium aliases from a JSON file and keeps two in-memory indexes:
    one keyed by the normalized alias name and one keyed by the canonical
    stadium ID. Supports historical name changes (e.g., naming rights) by
    checking each alias against a date. The file is read lazily the first
    time a lookup method is called, or eagerly via load().
    """

    def __init__(self, filepath: Optional[Path] = None):
        """Initialize the loader without reading the file.

        Args:
            filepath: Path to stadium_aliases.json, defaults to config value
        """
        self.filepath = filepath or STADIUM_ALIASES_FILE
        self._aliases: list[StadiumAlias] = []
        self._by_name: dict[str, list[StadiumAlias]] = {}
        self._by_stadium: dict[str, list[StadiumAlias]] = {}
        self._loaded = False

    @staticmethod
    def _normalize(name: str) -> str:
        """Return the index key for a stadium name (lowercased, trimmed)."""
        return name.lower().strip()

    def load(self) -> None:
        """Load (or reload) aliases from the JSON file.

        A missing file is treated as an empty alias set. Errors raised
        while reading or parsing the file propagate to the caller; in that
        case the previously loaded state is left untouched.
        """
        # Build into locals and swap in at the end so a failure part-way
        # through the file never leaves half-populated indexes behind.
        aliases: list[StadiumAlias] = []
        by_name: dict[str, list[StadiumAlias]] = {}
        by_stadium: dict[str, list[StadiumAlias]] = {}

        if self.filepath.exists():
            with open(self.filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
            for item in data:
                alias = StadiumAlias.from_dict(item)
                aliases.append(alias)
                # Use the same normalization as resolve() so every stored
                # alias is reachable by lookup.
                name_key = self._normalize(alias.alias_name)
                by_name.setdefault(name_key, []).append(alias)
                by_stadium.setdefault(alias.stadium_canonical_id, []).append(alias)

        self._aliases = aliases
        self._by_name = by_name
        self._by_stadium = by_stadium
        self._loaded = True

    def _ensure_loaded(self) -> None:
        """Load the alias file on first use."""
        if not self._loaded:
            self.load()

    def resolve(
        self,
        name: str,
        check_date: Optional[date] = None,
    ) -> Optional[str]:
        """Resolve a stadium name to a canonical stadium ID.

        Candidates are checked in file order and the first one that is
        valid on the given date wins.

        Args:
            name: Stadium name to look up (case-insensitive, surrounding
                whitespace ignored)
            check_date: Date to check validity (None = current date)

        Returns:
            Canonical stadium ID if found, None otherwise
        """
        self._ensure_loaded()
        if check_date is None:
            check_date = date.today()

        for alias in self._by_name.get(self._normalize(name), []):
            if alias.is_valid_on(check_date):
                return alias.stadium_canonical_id
        return None

    def get_aliases_for_stadium(
        self,
        stadium_id: str,
        check_date: Optional[date] = None,
    ) -> "list[StadiumAlias]":
        """Get all aliases for a stadium.

        Args:
            stadium_id: Canonical stadium ID
            check_date: Date to filter by (None = all aliases)

        Returns:
            A new list of StadiumAlias objects (empty if the stadium is
            unknown). The list is a copy, so callers may modify it freely.
        """
        self._ensure_loaded()
        aliases = self._by_stadium.get(stadium_id, [])
        if check_date is not None:
            return [a for a in aliases if a.is_valid_on(check_date)]
        # Copy so callers cannot mutate the internal index.
        return list(aliases)

    def get_all_names(self) -> list[str]:
        """Get all stadium alias names.

        Returns:
            List of stadium names, in file order and as written in the file
        """
        self._ensure_loaded()
        return [alias.alias_name for alias in self._aliases]
# Module-level shared loader instances. Both start as None and are created
# on first use by get_team_alias_loader() / get_stadium_alias_loader().
_team_alias_loader: Optional[TeamAliasLoader] = None
_stadium_alias_loader: Optional[StadiumAliasLoader] = None
def get_team_alias_loader() -> TeamAliasLoader:
    """Return the shared team alias loader, creating it on first use.

    Returns:
        The module-level TeamAliasLoader; once created, the cached
        instance is returned on subsequent calls.
    """
    global _team_alias_loader
    if _team_alias_loader is not None:
        return _team_alias_loader
    # First call: build a loader on the default alias file and cache it.
    _team_alias_loader = TeamAliasLoader()
    return _team_alias_loader
def get_stadium_alias_loader() -> StadiumAliasLoader:
    """Return the shared stadium alias loader, creating it on first use.

    Returns:
        The module-level StadiumAliasLoader; once created, the cached
        instance is returned on subsequent calls.
    """
    global _stadium_alias_loader
    if _stadium_alias_loader is not None:
        return _stadium_alias_loader
    # First call: build a loader on the default alias file and cache it.
    _stadium_alias_loader = StadiumAliasLoader()
    return _stadium_alias_loader
def resolve_team_alias(
    value: str,
    check_date: Optional[date] = None,
    alias_types: "Optional[list[AliasType]]" = None,
) -> Optional[str]:
    """Convenience function to resolve a team alias via the shared loader.

    Args:
        value: Alias value (name, abbreviation, or city)
        check_date: Date to check validity (None = current date)
        alias_types: Types of aliases to check (None = all types); passed
            through to TeamAliasLoader.resolve

    Returns:
        Canonical team ID if found, None otherwise
    """
    return get_team_alias_loader().resolve(
        value,
        check_date,
        alias_types=alias_types,
    )
def resolve_stadium_alias(
    name: str,
    check_date: Optional[date] = None,
) -> Optional[str]:
    """Look up a stadium name through the shared stadium alias loader.

    Args:
        name: Stadium name
        check_date: Date to check validity

    Returns:
        Canonical stadium ID if found, None otherwise
    """
    loader = get_stadium_alias_loader()
    canonical_id = loader.resolve(name, check_date)
    return canonical_id