feat(scripts): rewrite parser as modular Python CLI
Replace monolithic scraping scripts with sportstime_parser package:

- Multi-source scrapers with automatic fallback for 7 sports
- Canonical ID generation for games, teams, and stadiums
- Fuzzy matching with configurable thresholds for name resolution
- CloudKit Web Services uploader with JWT auth, diff-based updates
- Resumable uploads with checkpoint state persistence
- Validation reports with manual review items and suggested matches
- Comprehensive test suite (249 tests)

CLI: sportstime-parser scrape|validate|upload|status|retry|clear

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
312
Scripts/sportstime_parser/normalizers/alias_loader.py
Normal file
312
Scripts/sportstime_parser/normalizers/alias_loader.py
Normal file
@@ -0,0 +1,312 @@
|
||||
"""Alias file loaders for team and stadium name resolution."""
|
||||
|
||||
import json
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from ..config import TEAM_ALIASES_FILE, STADIUM_ALIASES_FILE
|
||||
from ..models.aliases import TeamAlias, StadiumAlias, AliasType
|
||||
|
||||
|
||||
class TeamAliasLoader:
    """Loader for team aliases with date-aware resolution.

    Reads team aliases from a JSON file and keeps two in-memory indexes:
    one keyed by the normalized alias value (used to resolve a name to a
    canonical team ID) and one keyed by canonical team ID (used to list a
    team's aliases). Whether an alias applies on a given date is delegated
    to ``alias.is_valid_on(date)``, which is how historical name changes
    are supported.
    """

    def __init__(self, filepath: Optional[Path] = None):
        """Initialize the loader without reading the file.

        The file is read lazily on the first lookup, or explicitly via
        :meth:`load`.

        Args:
            filepath: Path to team_aliases.json, defaults to config value
        """
        self.filepath = filepath or TEAM_ALIASES_FILE
        self._aliases: list[TeamAlias] = []
        self._by_value: dict[str, list[TeamAlias]] = {}
        self._by_team: dict[str, list[TeamAlias]] = {}
        self._loaded = False

    @staticmethod
    def _normalize_key(value: str) -> str:
        """Return the index key for an alias value (lowercased, stripped).

        Both indexing and lookup go through this helper so that stored
        aliases and queried values are always normalized the same way.
        """
        return value.lower().strip()

    def load(self) -> None:
        """Load (or reload) aliases from the JSON file.

        A missing file is treated as "no aliases": the loader ends up empty
        and marked as loaded. The new indexes are built in local variables
        and only swapped in once every entry has been parsed, so an error
        while reading or parsing leaves the previous state untouched.

        Raises:
            Whatever ``open``, ``json.load`` or ``TeamAlias.from_dict``
            raise for an unreadable or malformed file.
        """
        aliases: list[TeamAlias] = []
        by_value: dict[str, list[TeamAlias]] = {}
        by_team: dict[str, list[TeamAlias]] = {}

        if self.filepath.exists():
            with open(self.filepath, "r", encoding="utf-8") as f:
                data = json.load(f)

            for item in data:
                alias = TeamAlias.from_dict(item)
                aliases.append(alias)

                # Index by normalized value (same normalization as resolve()).
                value_key = self._normalize_key(alias.alias_value)
                by_value.setdefault(value_key, []).append(alias)

                # Index by canonical team ID.
                by_team.setdefault(alias.team_canonical_id, []).append(alias)

        # Swap in the freshly built state in one step.
        self._aliases = aliases
        self._by_value = by_value
        self._by_team = by_team
        self._loaded = True

    def _ensure_loaded(self) -> None:
        """Load the alias file on first use."""
        if not self._loaded:
            self.load()

    def resolve(
        self,
        value: str,
        check_date: Optional[date] = None,
        alias_types: "Optional[list[AliasType]]" = None,
    ) -> Optional[str]:
        """Resolve an alias value to a canonical team ID.

        Candidates are checked in file order; the first one that passes the
        type filter and is valid on ``check_date`` wins.

        Args:
            value: Alias value to look up (case-insensitive, surrounding
                whitespace ignored)
            check_date: Date to check validity (None = current date)
            alias_types: Types of aliases to check (None or empty = all types)

        Returns:
            Canonical team ID if found, None otherwise
        """
        self._ensure_loaded()

        if check_date is None:
            check_date = date.today()

        candidates = self._by_value.get(self._normalize_key(value), [])

        for alias in candidates:
            # Skip aliases excluded by the type filter.
            if alias_types and alias.alias_type not in alias_types:
                continue

            # Only accept aliases that apply on the requested date.
            if alias.is_valid_on(check_date):
                return alias.team_canonical_id

        return None

    def get_aliases_for_team(
        self,
        team_id: str,
        check_date: Optional[date] = None,
    ) -> "list[TeamAlias]":
        """Get all aliases for a team.

        Args:
            team_id: Canonical team ID
            check_date: Date to filter by (None = all aliases)

        Returns:
            A new list of TeamAlias objects (empty if the team is unknown).
            The list is always a copy, so callers may mutate it without
            affecting the loader's internal index.
        """
        self._ensure_loaded()

        aliases = self._by_team.get(team_id, [])

        if check_date is not None:
            return [a for a in aliases if a.is_valid_on(check_date)]

        return list(aliases)

    def get_all_values(
        self,
        alias_type: "Optional[AliasType]" = None,
    ) -> list[str]:
        """Get all alias values.

        Args:
            alias_type: Filter by alias type (None = all types)

        Returns:
            List of alias values as stored in the file, in file order
        """
        self._ensure_loaded()

        return [
            alias.alias_value
            for alias in self._aliases
            if alias_type is None or alias.alias_type == alias_type
        ]
|
||||
|
||||
|
||||
class StadiumAliasLoader:
    """Loader for stadium aliases with date-aware resolution.

    Reads stadium aliases from a JSON file and keeps two in-memory indexes:
    one keyed by the normalized alias name (used to resolve a name to a
    canonical stadium ID) and one keyed by canonical stadium ID (used to
    list a stadium's aliases). Whether an alias applies on a given date is
    delegated to ``alias.is_valid_on(date)``, which is how historical name
    changes (e.g., naming rights) are supported.
    """

    def __init__(self, filepath: Optional[Path] = None):
        """Initialize the loader without reading the file.

        The file is read lazily on the first lookup, or explicitly via
        :meth:`load`.

        Args:
            filepath: Path to stadium_aliases.json, defaults to config value
        """
        self.filepath = filepath or STADIUM_ALIASES_FILE
        self._aliases: list[StadiumAlias] = []
        self._by_name: dict[str, list[StadiumAlias]] = {}
        self._by_stadium: dict[str, list[StadiumAlias]] = {}
        self._loaded = False

    @staticmethod
    def _normalize_key(name: str) -> str:
        """Return the index key for a stadium name (lowercased, stripped).

        Both indexing and lookup go through this helper so that stored
        aliases and queried names are always normalized the same way.
        """
        return name.lower().strip()

    def load(self) -> None:
        """Load (or reload) aliases from the JSON file.

        A missing file is treated as "no aliases": the loader ends up empty
        and marked as loaded. The new indexes are built in local variables
        and only swapped in once every entry has been parsed, so an error
        while reading or parsing leaves the previous state untouched.

        Raises:
            Whatever ``open``, ``json.load`` or ``StadiumAlias.from_dict``
            raise for an unreadable or malformed file.
        """
        aliases: list[StadiumAlias] = []
        by_name: dict[str, list[StadiumAlias]] = {}
        by_stadium: dict[str, list[StadiumAlias]] = {}

        if self.filepath.exists():
            with open(self.filepath, "r", encoding="utf-8") as f:
                data = json.load(f)

            for item in data:
                alias = StadiumAlias.from_dict(item)
                aliases.append(alias)

                # Index by normalized name (same normalization as resolve()).
                name_key = self._normalize_key(alias.alias_name)
                by_name.setdefault(name_key, []).append(alias)

                # Index by canonical stadium ID.
                by_stadium.setdefault(alias.stadium_canonical_id, []).append(alias)

        # Swap in the freshly built state in one step.
        self._aliases = aliases
        self._by_name = by_name
        self._by_stadium = by_stadium
        self._loaded = True

    def _ensure_loaded(self) -> None:
        """Load the alias file on first use."""
        if not self._loaded:
            self.load()

    def resolve(
        self,
        name: str,
        check_date: Optional[date] = None,
    ) -> Optional[str]:
        """Resolve a stadium name to a canonical stadium ID.

        Candidates are checked in file order; the first one that is valid on
        ``check_date`` wins.

        Args:
            name: Stadium name to look up (case-insensitive, surrounding
                whitespace ignored)
            check_date: Date to check validity (None = current date)

        Returns:
            Canonical stadium ID if found, None otherwise
        """
        self._ensure_loaded()

        if check_date is None:
            check_date = date.today()

        candidates = self._by_name.get(self._normalize_key(name), [])

        for alias in candidates:
            if alias.is_valid_on(check_date):
                return alias.stadium_canonical_id

        return None

    def get_aliases_for_stadium(
        self,
        stadium_id: str,
        check_date: Optional[date] = None,
    ) -> "list[StadiumAlias]":
        """Get all aliases for a stadium.

        Args:
            stadium_id: Canonical stadium ID
            check_date: Date to filter by (None = all aliases)

        Returns:
            A new list of StadiumAlias objects (empty if the stadium is
            unknown). The list is always a copy, so callers may mutate it
            without affecting the loader's internal index.
        """
        self._ensure_loaded()

        aliases = self._by_stadium.get(stadium_id, [])

        if check_date is not None:
            return [a for a in aliases if a.is_valid_on(check_date)]

        return list(aliases)

    def get_all_names(self) -> list[str]:
        """Get all stadium alias names.

        Returns:
            List of stadium names as stored in the file, in file order
        """
        self._ensure_loaded()

        return [alias.alias_name for alias in self._aliases]
|
||||
|
||||
|
||||
# Module-level singleton slots. Both start as None and are filled in on
# first request by get_team_alias_loader() / get_stadium_alias_loader().
_team_alias_loader: Optional[TeamAliasLoader] = None
_stadium_alias_loader: Optional[StadiumAliasLoader] = None
|
||||
|
||||
|
||||
def get_team_alias_loader() -> TeamAliasLoader:
    """Return the shared TeamAliasLoader, creating it on the first call.

    The instance is built with the default file path and cached in the
    module-level ``_team_alias_loader`` variable; later calls return the
    same object.
    """
    global _team_alias_loader

    loader = _team_alias_loader
    if loader is None:
        loader = TeamAliasLoader()
        _team_alias_loader = loader
    return loader
|
||||
|
||||
|
||||
def get_stadium_alias_loader() -> StadiumAliasLoader:
    """Return the shared StadiumAliasLoader, creating it on the first call.

    The instance is built with the default file path and cached in the
    module-level ``_stadium_alias_loader`` variable; later calls return the
    same object.
    """
    global _stadium_alias_loader

    loader = _stadium_alias_loader
    if loader is None:
        loader = StadiumAliasLoader()
        _stadium_alias_loader = loader
    return loader
|
||||
|
||||
|
||||
def resolve_team_alias(
    value: str,
    check_date: Optional[date] = None,
) -> Optional[str]:
    """Resolve a team alias using the shared team alias loader.

    Thin wrapper around ``get_team_alias_loader().resolve(...)`` with no
    alias-type filter.

    Args:
        value: Alias value (name, abbreviation, or city)
        check_date: Date to check validity; passed through unchanged to
            the loader's ``resolve``

    Returns:
        Canonical team ID if found, otherwise None
    """
    loader = get_team_alias_loader()
    return loader.resolve(value, check_date)
|
||||
|
||||
|
||||
def resolve_stadium_alias(
    name: str,
    check_date: Optional[date] = None,
) -> Optional[str]:
    """Resolve a stadium alias using the shared stadium alias loader.

    Thin wrapper around ``get_stadium_alias_loader().resolve(...)``.

    Args:
        name: Stadium name
        check_date: Date to check validity; passed through unchanged to
            the loader's ``resolve``

    Returns:
        Canonical stadium ID if found, otherwise None
    """
    loader = get_stadium_alias_loader()
    return loader.resolve(name, check_date)
|
||||
Reference in New Issue
Block a user