Files
Sportstime/Scripts/sportstime_parser/normalizers/canonical_id.py
Trey t 11adfc10dd wip
2026-01-19 23:53:37 -06:00

285 lines
7.7 KiB
Python

"""Canonical ID generation for games, teams, and stadiums."""
import re
import unicodedata
from datetime import date, datetime
from typing import Optional
def normalize_string(s: str) -> str:
    """Turn free-form text into a slug usable inside canonical IDs.

    Processing order:
    1. Lowercase the input.
    2. Decompose it with Unicode NFKD and drop every non-ASCII code point
       (so accented letters lose their accent, e.g. 'é' becomes 'e').
    3. Replace each run of whitespace and/or hyphens with one underscore.
    4. Delete every character that is not a-z, 0-9, or an underscore.
    5. Collapse repeated underscores and trim them from both ends.

    Args:
        s: String to normalize

    Returns:
        Normalized string suitable for IDs (may be empty if nothing in the
        input survives the filtering)
    """
    # Lowercase first, then fold to plain ASCII via NFKD decomposition.
    ascii_text = (
        unicodedata.normalize("NFKD", s.lower())
        .encode("ascii", "ignore")
        .decode("ascii")
    )

    # Separators become underscores before the character filter runs, so
    # word boundaries survive the removal of punctuation.
    underscored = re.sub(r"[\s\-]+", "_", ascii_text)
    filtered = re.sub(r"[^a-z0-9_]", "", underscored)

    # Removing punctuation can leave adjacent underscores ("a_&_b" -> "a__b").
    return re.sub(r"_+", "_", filtered).strip("_")
def generate_game_id(
sport: str,
season: int,
away_abbrev: str,
home_abbrev: str,
game_date: date | datetime,
game_number: Optional[int] = None,
) -> str:
"""Generate a canonical game ID.
Format: game_{sport}_{season}_{YYYYMMDD}_{away}_{home}[_{game_number}]
Args:
sport: Sport code (e.g., 'nba', 'mlb')
season: Season start year (e.g., 2025 for 2025-26)
away_abbrev: Away team abbreviation (e.g., 'HOU')
home_abbrev: Home team abbreviation (e.g., 'OKC')
game_date: Date of the game
game_number: Game number for doubleheaders (1 or 2), None for single games
Returns:
Canonical game ID (e.g., 'game_nba_2025_20251021_hou_okc')
Examples:
>>> generate_game_id('nba', 2025, 'HOU', 'OKC', date(2025, 10, 21))
'game_nba_2025_20251021_hou_okc'
>>> generate_game_id('mlb', 2026, 'NYY', 'BOS', date(2026, 4, 1), game_number=1)
'game_mlb_2026_20260401_nyy_bos_1'
"""
# Normalize sport and abbreviations
sport_norm = sport.lower()
away_norm = away_abbrev.lower()
home_norm = home_abbrev.lower()
# Format date as YYYYMMDD
if isinstance(game_date, datetime):
game_date = game_date.date()
date_str = game_date.strftime("%Y%m%d")
# Build ID with game_ prefix
parts = ["game", sport_norm, str(season), date_str, away_norm, home_norm]
# Add game number for doubleheaders
if game_number is not None:
parts.append(str(game_number))
return "_".join(parts)
def generate_team_id(sport: str, city: str, name: str) -> str:
    """Generate a fallback canonical team ID from city and team name.

    Format: team_{sport}_{normalized_city}_{normalized_name}

    Most teams are identified by their standard abbreviation (see
    generate_team_id_from_abbrev). This function is the fallback for teams
    without a known abbreviation: the city and name are each passed through
    normalize_string and appended in full. The city is NOT shortened, so
    'Los Angeles' contributes 'los_angeles', not 'la'.

    Args:
        sport: Sport code (e.g., 'nba', 'mlb'); lowercased as-is
        city: Team city (e.g., 'Los Angeles')
        name: Team name (e.g., 'Lakers')

    Returns:
        Canonical team ID (e.g., 'team_nba_los_angeles_lakers')

    Examples:
        >>> generate_team_id('nba', 'Los Angeles', 'Lakers')
        'team_nba_los_angeles_lakers'
        >>> generate_team_id('mlb', 'New York', 'Yankees')
        'team_mlb_new_york_yankees'
    """
    sport_norm = sport.lower()
    city_norm = normalize_string(city)
    name_norm = normalize_string(name)
    return f"team_{sport_norm}_{city_norm}_{name_norm}"
def generate_team_id_from_abbrev(sport: str, abbreviation: str) -> str:
    """Build the canonical team ID for a team with a known abbreviation.

    Format: team_{sport}_{abbreviation}

    Both inputs are lowercased; no other normalization is applied.

    Args:
        sport: Sport code (e.g., 'nba', 'mlb')
        abbreviation: Team abbreviation (e.g., 'LAL', 'NYY')

    Returns:
        Canonical team ID (e.g., 'team_nba_lal')

    Examples:
        >>> generate_team_id_from_abbrev('nba', 'LAL')
        'team_nba_lal'
        >>> generate_team_id_from_abbrev('mlb', 'NYY')
        'team_mlb_nyy'
    """
    return "_".join(("team", sport.lower(), abbreviation.lower()))
def generate_stadium_id(sport: str, name: str) -> str:
    """Build the canonical ID for a stadium.

    Format: stadium_{sport}_{normalized_name}

    The sport code is lowercased; the stadium name is run through
    normalize_string, which strips punctuation and joins words with
    underscores.

    Args:
        sport: Sport code (e.g., 'nba', 'mlb')
        name: Stadium name (e.g., 'Yankee Stadium')

    Returns:
        Canonical stadium ID (e.g., 'stadium_mlb_yankee_stadium')

    Examples:
        >>> generate_stadium_id('nba', 'Crypto.com Arena')
        'stadium_nba_cryptocom_arena'
        >>> generate_stadium_id('mlb', 'Yankee Stadium')
        'stadium_mlb_yankee_stadium'
    """
    return "_".join(("stadium", sport.lower(), normalize_string(name)))
def parse_game_id(game_id: str) -> dict:
    """Parse a canonical game ID into its components.

    Expected layout:
    game_{sport}_{season}_{YYYYMMDD}_{away}_{home}[_{game_number}]

    Args:
        game_id: Canonical game ID (e.g., 'game_nba_2025_20251021_hou_okc')

    Returns:
        Dictionary with keys: sport, season, away_abbrev, home_abbrev,
        year, month, day, game_number. game_number is None when the ID has
        no trailing game-number segment.

    Raises:
        ValueError: If the ID does not consist of 6 or 7 underscore-separated
            segments, does not start with 'game', has a non-integer season or
            game number, or its date segment is not eight ASCII digits
            forming a real calendar date.

    Examples:
        >>> parse_game_id('game_nba_2025_20251021_hou_okc')['month']
        10
        >>> parse_game_id('game_mlb_2026_20260401_nyy_bos_1')['game_number']
        1
    """
    parts = game_id.split("_")
    if not 6 <= len(parts) <= 7:
        raise ValueError(f"Invalid game ID format: {game_id}")
    if parts[0] != "game":
        raise ValueError(f"Game ID must start with 'game_': {game_id}")

    sport, season_str, date_str, away_abbrev, home_abbrev = parts[1:6]

    try:
        season = int(season_str)
    except ValueError as exc:
        raise ValueError(f"Invalid season in game ID: {game_id}") from exc

    if len(date_str) != 8:
        raise ValueError(f"Invalid date format in game ID: {game_id}")
    # int() tolerates whitespace, signs and non-ASCII digits, so a bare
    # length check would let values such as '2025 021' through.
    if not (date_str.isascii() and date_str.isdigit()):
        raise ValueError(f"Invalid date format in game ID: {game_id}")

    year = int(date_str[:4])
    month = int(date_str[4:6])
    day = int(date_str[6:])
    try:
        # Constructing a date rejects impossible values (month 13, Feb 30...).
        date(year, month, day)
    except ValueError as exc:
        raise ValueError(f"Invalid date in game ID: {game_id}") from exc

    game_number = None
    if len(parts) == 7:
        try:
            game_number = int(parts[6])
        except ValueError as exc:
            raise ValueError(f"Invalid game number in game ID: {game_id}") from exc

    return {
        "sport": sport,
        "season": season,
        "away_abbrev": away_abbrev,
        "home_abbrev": home_abbrev,
        "year": year,
        "month": month,
        "day": day,
        "game_number": game_number,
    }
def parse_team_id(team_id: str) -> dict:
    """Split a canonical team ID into its sport and identifier.

    The ID is cut at the first two underscores only, so an identifier that
    itself contains underscores (the city/name fallback form) is returned
    whole.

    Args:
        team_id: Canonical team ID (e.g., 'team_nba_lal')

    Returns:
        Dictionary with keys: sport, identifier (abbreviation or city_name)

    Raises:
        ValueError: If team_id does not start with 'team_' or has no
            identifier segment after the sport
    """
    pieces = team_id.split("_", 2)
    # With maxsplit=2 a well-formed ID yields exactly three pieces, the
    # first of which is the literal prefix.
    if len(pieces) != 3 or pieces[0] != "team":
        raise ValueError(f"Invalid team ID format: {team_id}")

    _, sport, identifier = pieces
    return {"sport": sport, "identifier": identifier}
def parse_stadium_id(stadium_id: str) -> dict:
    """Split a canonical stadium ID into its sport and stadium name.

    The ID is cut at the first two underscores only, so underscores inside
    the normalized stadium name are preserved.

    Args:
        stadium_id: Canonical stadium ID (e.g., 'stadium_nba_paycom_center')

    Returns:
        Dictionary with keys: sport, name

    Raises:
        ValueError: If stadium_id does not start with 'stadium_' or has no
            name segment after the sport
    """
    pieces = stadium_id.split("_", 2)
    # With maxsplit=2 a well-formed ID yields exactly three pieces, the
    # first of which is the literal prefix.
    if len(pieces) != 3 or pieces[0] != "stadium":
        raise ValueError(f"Invalid stadium ID format: {stadium_id}")

    _, sport, stadium_name = pieces
    return {"sport": sport, "name": stadium_name}