"""Canonical ID generation for games, teams, and stadiums.""" import re import unicodedata from datetime import date, datetime from typing import Optional def normalize_string(s: str) -> str: """Normalize a string for use in canonical IDs. - Convert to lowercase - Replace spaces and hyphens with underscores - Remove special characters (except underscores) - Collapse multiple underscores - Strip leading/trailing underscores Args: s: String to normalize Returns: Normalized string suitable for IDs """ # Convert to lowercase result = s.lower() # Normalize unicode (e.g., é -> e) result = unicodedata.normalize("NFKD", result) result = result.encode("ascii", "ignore").decode("ascii") # Replace spaces and hyphens with underscores result = re.sub(r"[\s\-]+", "_", result) # Remove special characters except underscores result = re.sub(r"[^a-z0-9_]", "", result) # Collapse multiple underscores result = re.sub(r"_+", "_", result) # Strip leading/trailing underscores result = result.strip("_") return result def generate_game_id( sport: str, season: int, away_abbrev: str, home_abbrev: str, game_date: date | datetime, game_number: Optional[int] = None, ) -> str: """Generate a canonical game ID. Format: game_{sport}_{season}_{YYYYMMDD}_{away}_{home}[_{game_number}] Args: sport: Sport code (e.g., 'nba', 'mlb') season: Season start year (e.g., 2025 for 2025-26) away_abbrev: Away team abbreviation (e.g., 'HOU') home_abbrev: Home team abbreviation (e.g., 'OKC') game_date: Date of the game game_number: Game number for doubleheaders (1 or 2), None for single games Returns: Canonical game ID (e.g., 'game_nba_2025_20251021_hou_okc') Examples: >>> generate_game_id('nba', 2025, 'HOU', 'OKC', date(2025, 10, 21)) 'game_nba_2025_20251021_hou_okc' >>> generate_game_id('mlb', 2026, 'NYY', 'BOS', date(2026, 4, 1), game_number=1) 'game_mlb_2026_20260401_nyy_bos_1' """ # Normalize sport and abbreviations sport_norm = sport.lower() away_norm = away_abbrev.lower() home_norm = home_abbrev.lower() # Format date as YYYYMMDD if isinstance(game_date, datetime): game_date = game_date.date() date_str = game_date.strftime("%Y%m%d") # Build ID with game_ prefix parts = ["game", sport_norm, str(season), date_str, away_norm, home_norm] # Add game number for doubleheaders if game_number is not None: parts.append(str(game_number)) return "_".join(parts) def generate_team_id(sport: str, city: str, name: str) -> str: """Generate a canonical team ID. Format: team_{sport}_{abbreviation} For most teams, we use the standard abbreviation. This function generates a fallback ID based on city and name for teams without a known abbreviation. Args: sport: Sport code (e.g., 'nba', 'mlb') city: Team city (e.g., 'Los Angeles') name: Team name (e.g., 'Lakers') Returns: Canonical team ID (e.g., 'team_nba_la_lakers') Examples: >>> generate_team_id('nba', 'Los Angeles', 'Lakers') 'team_nba_la_lakers' >>> generate_team_id('mlb', 'New York', 'Yankees') 'team_mlb_new_york_yankees' """ sport_norm = sport.lower() city_norm = normalize_string(city) name_norm = normalize_string(name) return f"team_{sport_norm}_{city_norm}_{name_norm}" def generate_team_id_from_abbrev(sport: str, abbreviation: str) -> str: """Generate a canonical team ID from abbreviation. Format: team_{sport}_{abbreviation} Args: sport: Sport code (e.g., 'nba', 'mlb') abbreviation: Team abbreviation (e.g., 'LAL', 'NYY') Returns: Canonical team ID (e.g., 'team_nba_lal') Examples: >>> generate_team_id_from_abbrev('nba', 'LAL') 'team_nba_lal' >>> generate_team_id_from_abbrev('mlb', 'NYY') 'team_mlb_nyy' """ sport_norm = sport.lower() abbrev_norm = abbreviation.lower() return f"team_{sport_norm}_{abbrev_norm}" def generate_stadium_id(sport: str, name: str) -> str: """Generate a canonical stadium ID. Format: stadium_{sport}_{normalized_name} Args: sport: Sport code (e.g., 'nba', 'mlb') name: Stadium name (e.g., 'Yankee Stadium') Returns: Canonical stadium ID (e.g., 'stadium_mlb_yankee_stadium') Examples: >>> generate_stadium_id('nba', 'Crypto.com Arena') 'stadium_nba_cryptocom_arena' >>> generate_stadium_id('mlb', 'Yankee Stadium') 'stadium_mlb_yankee_stadium' """ sport_norm = sport.lower() name_norm = normalize_string(name) return f"stadium_{sport_norm}_{name_norm}" def parse_game_id(game_id: str) -> dict: """Parse a canonical game ID into its components. Args: game_id: Canonical game ID (e.g., 'game_nba_2025_20251021_hou_okc') Returns: Dictionary with keys: sport, season, away_abbrev, home_abbrev, year, month, day, game_number (optional) Raises: ValueError: If game_id format is invalid Examples: >>> parse_game_id('game_nba_2025_20251021_hou_okc') {'sport': 'nba', 'season': 2025, 'away_abbrev': 'hou', 'home_abbrev': 'okc', 'year': 2025, 'month': 10, 'day': 21, 'game_number': None} >>> parse_game_id('game_mlb_2026_20260401_nyy_bos_1') {'sport': 'mlb', 'season': 2026, 'away_abbrev': 'nyy', 'home_abbrev': 'bos', 'year': 2026, 'month': 4, 'day': 1, 'game_number': 1} """ parts = game_id.split("_") if len(parts) < 6 or len(parts) > 7: raise ValueError(f"Invalid game ID format: {game_id}") if parts[0] != "game": raise ValueError(f"Game ID must start with 'game_': {game_id}") sport = parts[1] season = int(parts[2]) date_str = parts[3] away_abbrev = parts[4] home_abbrev = parts[5] if len(date_str) != 8: raise ValueError(f"Invalid date format in game ID: {game_id}") year = int(date_str[:4]) month = int(date_str[4:6]) day = int(date_str[6:]) game_number = None if len(parts) == 7: game_number = int(parts[6]) return { "sport": sport, "season": season, "away_abbrev": away_abbrev, "home_abbrev": home_abbrev, "year": year, "month": month, "day": day, "game_number": game_number, } def parse_team_id(team_id: str) -> dict: """Parse a canonical team ID into its components. Args: team_id: Canonical team ID (e.g., 'team_nba_lal') Returns: Dictionary with keys: sport, identifier (abbreviation or city_name) Raises: ValueError: If team_id format is invalid """ if not team_id.startswith("team_"): raise ValueError(f"Invalid team ID format: {team_id}") parts = team_id.split("_", 2) if len(parts) < 3: raise ValueError(f"Invalid team ID format: {team_id}") return { "sport": parts[1], "identifier": parts[2], } def parse_stadium_id(stadium_id: str) -> dict: """Parse a canonical stadium ID into its components. Args: stadium_id: Canonical stadium ID (e.g., 'stadium_nba_paycom_center') Returns: Dictionary with keys: sport, name Raises: ValueError: If stadium_id format is invalid """ if not stadium_id.startswith("stadium_"): raise ValueError(f"Invalid stadium ID format: {stadium_id}") parts = stadium_id.split("_", 2) if len(parts) < 3: raise ValueError(f"Invalid stadium ID format: {stadium_id}") return { "sport": parts[1], "name": parts[2], }