Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan are completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
313 lines
8.6 KiB
Python
313 lines
8.6 KiB
Python
"""Alias file loaders for team and stadium name resolution."""
|
|
|
|
import json
|
|
from datetime import date
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
from ..config import TEAM_ALIASES_FILE, STADIUM_ALIASES_FILE
|
|
from ..models.aliases import TeamAlias, StadiumAlias, AliasType
|
|
|
|
|
|
class TeamAliasLoader:
    """Loader for team aliases with date-aware resolution.

    Loads team aliases from JSON and provides lookup methods
    with support for historical name changes.

    Aliases are indexed two ways: by normalized alias value (used by
    ``resolve``) and by canonical team ID (used by ``get_aliases_for_team``).
    The file is read lazily on the first lookup unless ``load`` is called
    explicitly.
    """

    def __init__(self, filepath: Optional[Path] = None):
        """Initialize the loader without reading the file.

        Args:
            filepath: Path to team_aliases.json, defaults to config value
        """
        self.filepath = filepath or TEAM_ALIASES_FILE
        self._aliases: list[TeamAlias] = []
        self._by_value: dict[str, list[TeamAlias]] = {}
        self._by_team: dict[str, list[TeamAlias]] = {}
        self._loaded = False

    @staticmethod
    def _normalize(value: str) -> str:
        """Return the index key for an alias value.

        The same normalization (lowercase, surrounding whitespace removed)
        must be used when building the index and when looking a value up;
        otherwise an alias stored with stray whitespace could never match.
        """
        return value.lower().strip()

    def load(self) -> None:
        """Load (or reload) aliases from the JSON file.

        A missing file is not an error: the loader ends up empty and is
        marked as loaded. The indexes are built in local variables and only
        published once every record has been parsed, so a failure while
        reading or parsing leaves the previously loaded state untouched.
        """
        aliases: list[TeamAlias] = []
        by_value: dict[str, list[TeamAlias]] = {}
        by_team: dict[str, list[TeamAlias]] = {}

        if self.filepath.exists():
            with open(self.filepath, "r", encoding="utf-8") as f:
                # Iterated item by item below; presumably a JSON array of
                # alias records accepted by TeamAlias.from_dict.
                data = json.load(f)

            for item in data:
                alias = TeamAlias.from_dict(item)
                aliases.append(alias)

                # Index by normalized value (same key resolve() computes)
                value_key = self._normalize(alias.alias_value)
                by_value.setdefault(value_key, []).append(alias)

                # Index by team ID
                by_team.setdefault(alias.team_canonical_id, []).append(alias)

        self._aliases = aliases
        self._by_value = by_value
        self._by_team = by_team
        self._loaded = True

    def _ensure_loaded(self) -> None:
        """Ensure aliases are loaded."""
        if not self._loaded:
            self.load()

    def resolve(
        self,
        value: str,
        check_date: Optional[date] = None,
        alias_types: Optional[list[AliasType]] = None,
    ) -> Optional[str]:
        """Resolve an alias value to a canonical team ID.

        Candidates are checked in the order they appear in the alias file;
        the first one that passes the type filter and is valid on
        ``check_date`` wins.

        Args:
            value: Alias value to look up (case-insensitive, surrounding
                whitespace ignored)
            check_date: Date to check validity (None = current date)
            alias_types: Types of aliases to check (None or empty = all types)

        Returns:
            Canonical team ID if found, None otherwise
        """
        self._ensure_loaded()

        if check_date is None:
            check_date = date.today()

        for alias in self._by_value.get(self._normalize(value), []):
            # Check type filter
            if alias_types and alias.alias_type not in alias_types:
                continue

            # Check date validity
            if alias.is_valid_on(check_date):
                return alias.team_canonical_id

        return None

    def get_aliases_for_team(
        self,
        team_id: str,
        check_date: Optional[date] = None,
    ) -> list[TeamAlias]:
        """Get all aliases for a team.

        Args:
            team_id: Canonical team ID
            check_date: Date to filter by (None = all aliases)

        Returns:
            New list of TeamAlias objects (empty if the team is unknown).
            The list is always a copy, so callers may modify it without
            affecting the loader's internal index.
        """
        self._ensure_loaded()

        aliases = self._by_team.get(team_id, [])

        if check_date is None:
            return list(aliases)

        return [a for a in aliases if a.is_valid_on(check_date)]

    def get_all_values(
        self,
        alias_type: Optional[AliasType] = None,
    ) -> list[str]:
        """Get all alias values.

        Args:
            alias_type: Filter by alias type (None = all types)

        Returns:
            List of alias values, in file order and as originally written
            (not normalized)
        """
        self._ensure_loaded()

        return [
            alias.alias_value
            for alias in self._aliases
            if alias_type is None or alias.alias_type == alias_type
        ]
|
|
|
|
|
|
class StadiumAliasLoader:
    """Loader for stadium aliases with date-aware resolution.

    Loads stadium aliases from JSON and provides lookup methods
    with support for historical name changes (e.g., naming rights).

    Aliases are indexed two ways: by normalized alias name (used by
    ``resolve``) and by canonical stadium ID (used by
    ``get_aliases_for_stadium``). The file is read lazily on the first
    lookup unless ``load`` is called explicitly.
    """

    def __init__(self, filepath: Optional[Path] = None):
        """Initialize the loader without reading the file.

        Args:
            filepath: Path to stadium_aliases.json, defaults to config value
        """
        self.filepath = filepath or STADIUM_ALIASES_FILE
        self._aliases: list[StadiumAlias] = []
        self._by_name: dict[str, list[StadiumAlias]] = {}
        self._by_stadium: dict[str, list[StadiumAlias]] = {}
        self._loaded = False

    @staticmethod
    def _normalize(name: str) -> str:
        """Return the index key for a stadium name.

        The same normalization (lowercase, surrounding whitespace removed)
        must be used when building the index and when looking a name up;
        otherwise an alias stored with stray whitespace could never match.
        """
        return name.lower().strip()

    def load(self) -> None:
        """Load (or reload) aliases from the JSON file.

        A missing file is not an error: the loader ends up empty and is
        marked as loaded. The indexes are built in local variables and only
        published once every record has been parsed, so a failure while
        reading or parsing leaves the previously loaded state untouched.
        """
        aliases: list[StadiumAlias] = []
        by_name: dict[str, list[StadiumAlias]] = {}
        by_stadium: dict[str, list[StadiumAlias]] = {}

        if self.filepath.exists():
            with open(self.filepath, "r", encoding="utf-8") as f:
                # Iterated item by item below; presumably a JSON array of
                # alias records accepted by StadiumAlias.from_dict.
                data = json.load(f)

            for item in data:
                alias = StadiumAlias.from_dict(item)
                aliases.append(alias)

                # Index by normalized name (same key resolve() computes)
                name_key = self._normalize(alias.alias_name)
                by_name.setdefault(name_key, []).append(alias)

                # Index by stadium ID
                by_stadium.setdefault(alias.stadium_canonical_id, []).append(alias)

        self._aliases = aliases
        self._by_name = by_name
        self._by_stadium = by_stadium
        self._loaded = True

    def _ensure_loaded(self) -> None:
        """Ensure aliases are loaded."""
        if not self._loaded:
            self.load()

    def resolve(
        self,
        name: str,
        check_date: Optional[date] = None,
    ) -> Optional[str]:
        """Resolve a stadium name to a canonical stadium ID.

        Candidates are checked in the order they appear in the alias file;
        the first one that is valid on ``check_date`` wins.

        Args:
            name: Stadium name to look up (case-insensitive, surrounding
                whitespace ignored)
            check_date: Date to check validity (None = current date)

        Returns:
            Canonical stadium ID if found, None otherwise
        """
        self._ensure_loaded()

        if check_date is None:
            check_date = date.today()

        for alias in self._by_name.get(self._normalize(name), []):
            if alias.is_valid_on(check_date):
                return alias.stadium_canonical_id

        return None

    def get_aliases_for_stadium(
        self,
        stadium_id: str,
        check_date: Optional[date] = None,
    ) -> list[StadiumAlias]:
        """Get all aliases for a stadium.

        Args:
            stadium_id: Canonical stadium ID
            check_date: Date to filter by (None = all aliases)

        Returns:
            New list of StadiumAlias objects (empty if the stadium is
            unknown). The list is always a copy, so callers may modify it
            without affecting the loader's internal index.
        """
        self._ensure_loaded()

        aliases = self._by_stadium.get(stadium_id, [])

        if check_date is None:
            return list(aliases)

        return [a for a in aliases if a.is_valid_on(check_date)]

    def get_all_names(self) -> list[str]:
        """Get all stadium alias names.

        Returns:
            List of stadium names, in file order and as originally written
            (not normalized)
        """
        self._ensure_loaded()

        return [alias.alias_name for alias in self._aliases]
|
|
|
|
|
|
# Global loader instances (lazy initialized).
# Both start as None; get_team_alias_loader() and get_stadium_alias_loader()
# create the instance on first call and return the same object afterwards.
_team_alias_loader: Optional[TeamAliasLoader] = None
_stadium_alias_loader: Optional[StadiumAliasLoader] = None
|
|
|
|
|
|
def get_team_alias_loader() -> TeamAliasLoader:
    """Return the shared team alias loader, creating it on first use.

    The instance is built with its default file path and cached in the
    module-level ``_team_alias_loader`` variable, so every call returns the
    same object.
    """
    global _team_alias_loader
    loader = _team_alias_loader
    if loader is None:
        # First call: construct the loader and remember it for later calls.
        loader = TeamAliasLoader()
        _team_alias_loader = loader
    return loader
|
|
|
|
|
|
def get_stadium_alias_loader() -> StadiumAliasLoader:
    """Return the shared stadium alias loader, creating it on first use.

    The instance is built with its default file path and cached in the
    module-level ``_stadium_alias_loader`` variable, so every call returns
    the same object.
    """
    global _stadium_alias_loader
    loader = _stadium_alias_loader
    if loader is None:
        # First call: construct the loader and remember it for later calls.
        loader = StadiumAliasLoader()
        _stadium_alias_loader = loader
    return loader
|
|
|
|
|
|
def resolve_team_alias(
    value: str,
    check_date: Optional[date] = None,
) -> Optional[str]:
    """Look up a team alias through the global team alias loader.

    Thin wrapper that forwards both arguments to the ``resolve`` method of
    the loader returned by ``get_team_alias_loader()``; no alias-type filter
    is passed.

    Args:
        value: Alias value (name, abbreviation, or city)
        check_date: Date to check validity

    Returns:
        Canonical team ID if found
    """
    loader = get_team_alias_loader()
    team_id = loader.resolve(value, check_date)
    return team_id
|
|
|
|
|
|
def resolve_stadium_alias(
    name: str,
    check_date: Optional[date] = None,
) -> Optional[str]:
    """Look up a stadium alias through the global stadium alias loader.

    Thin wrapper that forwards both arguments to the ``resolve`` method of
    the loader returned by ``get_stadium_alias_loader()``.

    Args:
        name: Stadium name
        check_date: Date to check validity

    Returns:
        Canonical stadium ID if found
    """
    loader = get_stadium_alias_loader()
    stadium_id = loader.resolve(name, check_date)
    return stadium_id
|