Files
Sportstime/Scripts/sportstime_parser/normalizers/alias_loader.py
Trey t eeaf900e5a feat(scripts): rewrite parser as modular Python CLI
Replace monolithic scraping scripts with sportstime_parser package:

- Multi-source scrapers with automatic fallback for 7 sports
- Canonical ID generation for games, teams, and stadiums
- Fuzzy matching with configurable thresholds for name resolution
- CloudKit Web Services uploader with JWT auth, diff-based updates
- Resumable uploads with checkpoint state persistence
- Validation reports with manual review items and suggested matches
- Comprehensive test suite (249 tests)

CLI: sportstime-parser scrape|validate|upload|status|retry|clear

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 21:06:12 -06:00

313 lines
8.6 KiB
Python

"""Alias file loaders for team and stadium name resolution."""
import json
from datetime import date
from pathlib import Path
from typing import Optional
from ..config import TEAM_ALIASES_FILE, STADIUM_ALIASES_FILE
from ..models.aliases import TeamAlias, StadiumAlias, AliasType
class TeamAliasLoader:
    """Loader for team aliases with date-aware resolution.

    Reads team alias records from a JSON file and indexes them two ways:
    by normalized alias value (to resolve a free-form name to a canonical
    team ID) and by canonical team ID (to list a team's aliases). The file
    is read lazily on the first lookup unless ``load()`` is called first.
    """

    def __init__(self, filepath: Optional[Path] = None):
        """Initialize the loader without reading the file.

        Args:
            filepath: Path to team_aliases.json, defaults to config value
        """
        self.filepath = filepath or TEAM_ALIASES_FILE
        self._aliases: list[TeamAlias] = []
        self._by_value: dict[str, list[TeamAlias]] = {}
        self._by_team: dict[str, list[TeamAlias]] = {}
        self._loaded = False

    @staticmethod
    def _normalize(value: str) -> str:
        """Return the index key for an alias value (lowercased, trimmed).

        Used both when building the index and when looking a value up, so
        the two sides can never disagree about what a key looks like.
        """
        return value.lower().strip()

    def load(self) -> None:
        """Load (or reload) aliases from the JSON file.

        A missing file is not an error: the loader ends up empty and is
        marked as loaded. The new indexes are built in local variables and
        only swapped in at the end, so a reload that raises part-way keeps
        the previously loaded state intact.
        """
        aliases: list[TeamAlias] = []
        by_value: dict[str, list[TeamAlias]] = {}
        by_team: dict[str, list[TeamAlias]] = {}

        if self.filepath.exists():
            with open(self.filepath, "r", encoding="utf-8") as f:
                data = json.load(f)

            for item in data:
                alias = TeamAlias.from_dict(item)
                aliases.append(alias)
                # Index by the same normalized key that resolve() looks up.
                value_key = self._normalize(alias.alias_value)
                by_value.setdefault(value_key, []).append(alias)
                # Index by team ID
                by_team.setdefault(alias.team_canonical_id, []).append(alias)

        # Replacing all three together also drops stale entries when the
        # file has disappeared since the previous load.
        self._aliases = aliases
        self._by_value = by_value
        self._by_team = by_team
        self._loaded = True

    def _ensure_loaded(self) -> None:
        """Load the alias file on first use."""
        if not self._loaded:
            self.load()

    def resolve(
        self,
        value: str,
        check_date: Optional[date] = None,
        alias_types: Optional[list[AliasType]] = None,
    ) -> Optional[str]:
        """Resolve an alias value to a canonical team ID.

        Candidates are tried in the order they appear in the file; the
        first one that passes the type filter and is valid on the date wins.

        Args:
            value: Alias value to look up (case-insensitive, surrounding
                whitespace ignored)
            check_date: Date to check validity (None = current date)
            alias_types: Types of aliases to check (None or empty = all types)

        Returns:
            Canonical team ID if found, None otherwise
        """
        self._ensure_loaded()
        if check_date is None:
            check_date = date.today()

        for alias in self._by_value.get(self._normalize(value), ()):
            # Check type filter
            if alias_types and alias.alias_type not in alias_types:
                continue
            # Check date validity
            if alias.is_valid_on(check_date):
                return alias.team_canonical_id
        return None

    def get_aliases_for_team(
        self,
        team_id: str,
        check_date: Optional[date] = None,
    ) -> list[TeamAlias]:
        """Get all aliases for a team.

        Args:
            team_id: Canonical team ID
            check_date: Date to filter by (None = all aliases)

        Returns:
            New list of TeamAlias objects (empty if the team is unknown)
        """
        self._ensure_loaded()
        aliases = self._by_team.get(team_id, [])
        if check_date is not None:
            return [a for a in aliases if a.is_valid_on(check_date)]
        # Copy so callers cannot mutate the internal index through the result.
        return list(aliases)

    def get_all_values(
        self,
        alias_type: Optional[AliasType] = None,
    ) -> list[str]:
        """Get all alias values.

        Args:
            alias_type: Filter by alias type (None = all types)

        Returns:
            List of alias values as stored in the file, in file order
        """
        self._ensure_loaded()
        return [
            alias.alias_value
            for alias in self._aliases
            if alias_type is None or alias.alias_type == alias_type
        ]
class StadiumAliasLoader:
    """Loader for stadium aliases with date-aware resolution.

    Reads stadium alias records from a JSON file and indexes them by
    normalized alias name and by canonical stadium ID, with support for
    historical name changes (e.g., naming rights). The file is read lazily
    on the first lookup unless ``load()`` is called first.
    """

    def __init__(self, filepath: Optional[Path] = None):
        """Initialize the loader without reading the file.

        Args:
            filepath: Path to stadium_aliases.json, defaults to config value
        """
        self.filepath = filepath or STADIUM_ALIASES_FILE
        self._aliases: list[StadiumAlias] = []
        self._by_name: dict[str, list[StadiumAlias]] = {}
        self._by_stadium: dict[str, list[StadiumAlias]] = {}
        self._loaded = False

    @staticmethod
    def _normalize(name: str) -> str:
        """Return the index key for a stadium name (lowercased, trimmed).

        Used both when building the index and when looking a name up, so
        the two sides can never disagree about what a key looks like.
        """
        return name.lower().strip()

    def load(self) -> None:
        """Load (or reload) aliases from the JSON file.

        A missing file is not an error: the loader ends up empty and is
        marked as loaded. The new indexes are built in local variables and
        only swapped in at the end, so a reload that raises part-way keeps
        the previously loaded state intact.
        """
        aliases: list[StadiumAlias] = []
        by_name: dict[str, list[StadiumAlias]] = {}
        by_stadium: dict[str, list[StadiumAlias]] = {}

        if self.filepath.exists():
            with open(self.filepath, "r", encoding="utf-8") as f:
                data = json.load(f)

            for item in data:
                alias = StadiumAlias.from_dict(item)
                aliases.append(alias)
                # Index by the same normalized key that resolve() looks up.
                name_key = self._normalize(alias.alias_name)
                by_name.setdefault(name_key, []).append(alias)
                # Index by stadium ID
                by_stadium.setdefault(alias.stadium_canonical_id, []).append(alias)

        # Replacing all three together also drops stale entries when the
        # file has disappeared since the previous load.
        self._aliases = aliases
        self._by_name = by_name
        self._by_stadium = by_stadium
        self._loaded = True

    def _ensure_loaded(self) -> None:
        """Load the alias file on first use."""
        if not self._loaded:
            self.load()

    def resolve(
        self,
        name: str,
        check_date: Optional[date] = None,
    ) -> Optional[str]:
        """Resolve a stadium name to a canonical stadium ID.

        Candidates are tried in the order they appear in the file; the
        first one that is valid on the date wins.

        Args:
            name: Stadium name to look up (case-insensitive, surrounding
                whitespace ignored)
            check_date: Date to check validity (None = current date)

        Returns:
            Canonical stadium ID if found, None otherwise
        """
        self._ensure_loaded()
        if check_date is None:
            check_date = date.today()

        for alias in self._by_name.get(self._normalize(name), ()):
            if alias.is_valid_on(check_date):
                return alias.stadium_canonical_id
        return None

    def get_aliases_for_stadium(
        self,
        stadium_id: str,
        check_date: Optional[date] = None,
    ) -> list[StadiumAlias]:
        """Get all aliases for a stadium.

        Args:
            stadium_id: Canonical stadium ID
            check_date: Date to filter by (None = all aliases)

        Returns:
            New list of StadiumAlias objects (empty if the stadium is unknown)
        """
        self._ensure_loaded()
        aliases = self._by_stadium.get(stadium_id, [])
        if check_date is not None:
            return [a for a in aliases if a.is_valid_on(check_date)]
        # Copy so callers cannot mutate the internal index through the result.
        return list(aliases)

    def get_all_names(self) -> list[str]:
        """Get all stadium alias names.

        Returns:
            List of stadium names as stored in the file, in file order
        """
        self._ensure_loaded()
        return [alias.alias_name for alias in self._aliases]
# Module-level singleton slots for the shared loaders. Both start as None and
# are filled in on the first call to get_team_alias_loader() /
# get_stadium_alias_loader() respectively.
_team_alias_loader: Optional[TeamAliasLoader] = None
_stadium_alias_loader: Optional[StadiumAliasLoader] = None
def get_team_alias_loader() -> TeamAliasLoader:
    """Return the shared TeamAliasLoader, creating it on first use.

    The instance is constructed with no arguments and cached in the
    module-level ``_team_alias_loader`` variable, so every later call
    returns the same object.
    """
    global _team_alias_loader
    loader = _team_alias_loader
    if loader is not None:
        return loader
    # First call: build the loader, then publish it for subsequent callers.
    loader = TeamAliasLoader()
    _team_alias_loader = loader
    return loader
def get_stadium_alias_loader() -> StadiumAliasLoader:
    """Return the shared StadiumAliasLoader, creating it on first use.

    The instance is constructed with no arguments and cached in the
    module-level ``_stadium_alias_loader`` variable, so every later call
    returns the same object.
    """
    global _stadium_alias_loader
    loader = _stadium_alias_loader
    if loader is not None:
        return loader
    # First call: build the loader, then publish it for subsequent callers.
    loader = StadiumAliasLoader()
    _stadium_alias_loader = loader
    return loader
def resolve_team_alias(
    value: str,
    check_date: Optional[date] = None,
) -> Optional[str]:
    """Resolve a team alias through the shared team alias loader.

    Args:
        value: Alias value (name, abbreviation, or city)
        check_date: Date to check validity; passed through unchanged to
            the loader's ``resolve``

    Returns:
        Canonical team ID if found, None otherwise
    """
    shared_loader = get_team_alias_loader()
    canonical_id = shared_loader.resolve(value, check_date)
    return canonical_id
def resolve_stadium_alias(
    name: str,
    check_date: Optional[date] = None,
) -> Optional[str]:
    """Resolve a stadium alias through the shared stadium alias loader.

    Args:
        name: Stadium name
        check_date: Date to check validity; passed through unchanged to
            the loader's ``resolve``

    Returns:
        Canonical stadium ID if found, None otherwise
    """
    shared_loader = get_stadium_alias_loader()
    canonical_id = shared_loader.resolve(name, check_date)
    return canonical_id