#!/usr/bin/env python3
"""
Game Canonicalization for SportsTime
====================================
Stage 3 of the canonicalization pipeline.

Resolves team and stadium references in games, generates canonical game IDs.

Usage:
    python canonicalize_games.py --games data/games.json --teams data/teams_canonical.json \
        --aliases data/stadium_aliases.json --output data/
"""

import argparse
import json
from collections import defaultdict
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional


# =============================================================================
# DATA CLASSES
# =============================================================================

@dataclass
class CanonicalGame:
    """A canonicalized game with stable ID and resolved references."""
    canonical_id: str
    sport: str
    season: str
    date: str  # YYYY-MM-DD
    time: Optional[str]
    home_team_canonical_id: str
    away_team_canonical_id: str
    stadium_canonical_id: str
    is_playoff: bool = False
    broadcast: Optional[str] = None


@dataclass
class ResolutionWarning:
    """Warning about a resolution issue."""
    game_key: str
    issue: str
    details: str


# =============================================================================
# TEAM ABBREVIATION ALIASES
# Maps alternative abbreviations to canonical team IDs
# =============================================================================

TEAM_ABBREV_ALIASES = {
    # NBA
    ('NBA', 'PHX'): 'team_nba_pho',  # Phoenix
    ('NBA', 'BKN'): 'team_nba_brk',  # Brooklyn
    ('NBA', 'CHA'): 'team_nba_cho',  # Charlotte (older abbrev)
    ('NBA', 'NOP'): 'team_nba_nop',  # New Orleans
    ('NBA', 'NO'): 'team_nba_nop',  # New Orleans alt
    ('NBA', 'NY'): 'team_nba_nyk',  # New York
    ('NBA', 'SA'): 'team_nba_sas',  # San Antonio
    ('NBA', 'GS'): 'team_nba_gsw',  # Golden State
    ('NBA', 'UTAH'): 'team_nba_uta',  # Utah

    # MLB
    ('MLB', 'AZ'): 'team_mlb_ari',  # Arizona
    ('MLB', 'CWS'): 'team_mlb_chw',  # Chicago White Sox
    ('MLB', 'KC'): 'team_mlb_kcr',  # Kansas City
    ('MLB', 'SD'): 'team_mlb_sdp',  # San Diego
    ('MLB', 'SF'): 'team_mlb_sfg',  # San Francisco
    ('MLB', 'TB'): 'team_mlb_tbr',  # Tampa Bay
    ('MLB', 'WSH'): 'team_mlb_wsn',  # Washington
    ('MLB', 'WAS'): 'team_mlb_wsn',  # Washington alt
    ('MLB', 'LA'): 'team_mlb_lad',  # Los Angeles Dodgers
    ('MLB', 'ATH'): 'team_mlb_oak',  # Oakland Athletics

    # NHL
    ('NHL', 'ARI'): 'team_nhl_ari',  # Arizona/Utah
    ('NHL', 'UTA'): 'team_nhl_ari',  # Utah Hockey Club (uses ARI code)
    ('NHL', 'VGS'): 'team_nhl_vgk',  # Vegas
    ('NHL', 'TB'): 'team_nhl_tbl',  # Tampa Bay Lightning
    ('NHL', 'NJ'): 'team_nhl_njd',  # New Jersey
    ('NHL', 'SJ'): 'team_nhl_sjs',  # San Jose
    ('NHL', 'LA'): 'team_nhl_lak',  # Los Angeles Kings
    ('NHL', 'MON'): 'team_nhl_mtl',  # Montreal

    # NFL
    ('NFL', 'JAC'): 'team_nfl_jax',  # Jacksonville (JAC vs JAX)
    ('NFL', 'OAK'): 'team_nfl_lv',  # Oakland -> Las Vegas Raiders (moved 2020)
    ('NFL', 'SD'): 'team_nfl_lac',  # San Diego -> Los Angeles Chargers (moved 2017)
    ('NFL', 'STL'): 'team_nfl_lar',  # St. Louis -> Los Angeles Rams (moved 2016)
    ('NFL', 'GNB'): 'team_nfl_gb',  # Green Bay alternate
    ('NFL', 'KAN'): 'team_nfl_kc',  # Kansas City alternate
    ('NFL', 'NWE'): 'team_nfl_ne',  # New England alternate
    ('NFL', 'NOR'): 'team_nfl_no',  # New Orleans alternate
    ('NFL', 'TAM'): 'team_nfl_tb',  # Tampa Bay alternate
    ('NFL', 'SFO'): 'team_nfl_sf',  # San Francisco alternate
    ('NFL', 'WAS'): 'team_nfl_was',  # Washington (direct match but include for completeness)

    # MLS
    ('MLS', 'LA'): 'team_mls_lag',  # LA Galaxy
    ('MLS', 'NYC'): 'team_mls_nycfc',  # NYC FC
    ('MLS', 'RBNY'): 'team_mls_nyrb',  # NY Red Bulls
    ('MLS', 'NYR'): 'team_mls_nyrb',  # NY Red Bulls alt
    ('MLS', 'SJE'): 'team_mls_sj',  # San Jose Earthquakes
    ('MLS', 'KC'): 'team_mls_skc',  # Sporting KC
    ('MLS', 'DCU'): 'team_mls_dc',  # DC United
    ('MLS', 'FCD'): 'team_mls_dal',  # FC Dallas
    ('MLS', 'MON'): 'team_mls_mtl',  # Montreal
    ('MLS', 'LAF'): 'team_mls_lafc',  # LAFC alt

    # WNBA
    ('WNBA', 'LV'): 'team_wnba_lva',  # Las Vegas Aces
    ('WNBA', 'LAS'): 'team_wnba_la',  # LA Sparks
    ('WNBA', 'NYL'): 'team_wnba_ny',  # New York Liberty
    ('WNBA', 'PHX'): 'team_wnba_pho',  # Phoenix Mercury
    ('WNBA', 'CONN'): 'team_wnba_con',  # Connecticut Sun
    ('WNBA', 'WSH'): 'team_wnba_was',  # Washington Mystics

    # NWSL
    ('NWSL', 'ANG'): 'team_nwsl_la',  # Angel City FC (uses LA abbrev)
    ('NWSL', 'ACFC'): 'team_nwsl_la',  # Angel City FC alt
    ('NWSL', 'NCC'): 'team_nwsl_nc',  # North Carolina Courage
    ('NWSL', 'GOTHAM'): 'team_nwsl_nj',  # NJ/NY Gotham FC
    ('NWSL', 'NY'): 'team_nwsl_nj',  # NJ/NY Gotham FC alt
    ('NWSL', 'BAY'): 'team_nwsl_sj',  # Bay FC (San Jose)
    # NOTE(review): the two entries below map Racing Louisville onto the Utah
    # team ID and describe it as a rebrand. Confirm this against the canonical
    # team list before relying on it; the mapping is preserved as-is here.
    ('NWSL', 'RLC'): 'team_nwsl_uta',  # Racing Louisville -> Utah Royals (rebrand)
    ('NWSL', 'LOU'): 'team_nwsl_uta',  # Louisville -> Utah alt
}


# =============================================================================
# ID GENERATION
# =============================================================================

def normalize_season(sport: str, season: str) -> str:
    """
    Normalize season format for ID generation.

    NBA/NHL: "2025-26" -> "202526"
    MLB: "2026" -> "2026"

    Args:
        sport: Sport code. Currently unused; the same rule applies to all sports.
        season: Season label, optionally containing hyphens.

    Returns:
        The season label with every hyphen removed.
    """
    return season.replace('-', '')


def generate_canonical_game_id(
    sport: str,
    season: str,
    date: str,  # YYYY-MM-DD
    away_abbrev: str,
    home_abbrev: str,
    sequence: int = 1
) -> str:
    """
    Generate deterministic canonical ID for game.

    Format: game_{sport}_{season}_{date}_{away}_{home}[_{sequence}]

    Example:
        game_nba_202526_20251021_hou_okc
        game_mlb_2026_20260615_bos_nyy_2 (doubleheader game 2)

    Args:
        sport: Sport code; lower-cased in the ID.
        season: Season label; hyphens are stripped.
        date: Game date as YYYY-MM-DD; hyphens are stripped.
        away_abbrev: Away team abbreviation; lower-cased in the ID.
        home_abbrev: Home team abbreviation; lower-cased in the ID.
        sequence: 1-based index of this game among same-day games for the
            same matchup. Only values greater than 1 add a suffix.

    Returns:
        The canonical game ID string.
    """
    normalized_season = normalize_season(sport, season)
    date_compact = date.replace('-', '')  # YYYYMMDD

    base_id = (
        f"game_{sport.lower()}_{normalized_season}_{date_compact}"
        f"_{away_abbrev.lower()}_{home_abbrev.lower()}"
    )

    if sequence > 1:
        return f"{base_id}_{sequence}"
    return base_id


def count_doubleheader_games(games: list[CanonicalGame]) -> int:
    """
    Count games whose canonical ID carries a numeric sequence suffix.

    generate_canonical_game_id() appends "_{sequence}" only for the second and
    later games of the same matchup on the same date, so the final
    underscore-separated token is numeric exactly for those games. Testing
    only that final token avoids false positives from the season and date
    segments, which also contain "_2...".

    Assumes home team abbreviations are never purely numeric.

    Args:
        games: Canonical games to inspect.

    Returns:
        Number of games that are game 2 or later of a doubleheader.
    """
    return sum(
        1 for game in games
        if game.canonical_id.rsplit('_', 1)[-1].isdigit()
    )


# =============================================================================
# RESOLUTION
# =============================================================================

def build_alias_lookup(stadium_aliases: list[dict]) -> dict[str, str]:
    """
    Build lookup from alias name to canonical stadium ID.

    Entries with a missing, null, or empty alias name or stadium ID are
    skipped. If the same normalized alias name appears more than once, the
    last entry wins.

    Args:
        stadium_aliases: Dicts with 'alias_name' and 'stadium_canonical_id'.

    Returns:
        {alias_name_lower: canonical_stadium_id}
    """
    lookup = {}
    for alias in stadium_aliases:
        # `or ''` also covers JSON nulls, which .get() would return as None.
        alias_name = (alias.get('alias_name') or '').lower().strip()
        canonical_id = alias.get('stadium_canonical_id') or ''
        if alias_name and canonical_id:
            lookup[alias_name] = canonical_id
    return lookup


def resolve_team(
    abbrev: str,
    sport: str,
    teams_by_abbrev: dict[tuple[str, str], dict],
    teams_by_id: dict[str, dict]
) -> Optional[dict]:
    """
    Resolve team abbreviation to canonical team.

    1. Try direct match by (sport, abbrev)
    2. Try alias lookup via TEAM_ABBREV_ALIASES
    3. Return None if not found

    Args:
        abbrev: Team abbreviation; upper-cased before lookup.
        sport: Sport code, used as given (callers pass it upper-cased).
        teams_by_abbrev: {(sport, abbrev): team_dict}
        teams_by_id: {canonical_id: team_dict}

    Returns:
        The matching team dict, or None when neither lookup succeeds.
    """
    key = (sport, abbrev.upper())

    # Direct match
    team = teams_by_abbrev.get(key)
    if team is not None:
        return team

    # Alias match; an alias that points at an unknown team ID yields None.
    return teams_by_id.get(TEAM_ABBREV_ALIASES.get(key))


def resolve_stadium_from_venue(
    venue: str,
    home_team: dict,
    sport: str,
    alias_lookup: dict[str, str],
    stadiums_by_id: dict[str, dict]
) -> str:
    """
    Resolve stadium canonical ID from venue name.

    Strategy:
    1. ALWAYS prefer home team's stadium (most reliable, sport-correct)
    2. Try exact sport-scoped alias match (only if home team has no stadium)
    3. Try partial sport-scoped alias match (skipped for an empty venue)
    4. Fall back to unknown stadium slug

    For multi-sport venues (MSG, Crypto.com Arena, etc.), home team's
    stadium_canonical_id is authoritative because it's already sport-scoped.

    Args:
        venue: Venue name from game data; None is treated as empty.
        home_team: Resolved home team dict
        sport: Sport code (NBA, MLB, NHL)
        alias_lookup: {alias_name_lower: canonical_stadium_id}
        stadiums_by_id: {canonical_id: stadium_dict}. Currently unused; kept
            so existing callers keep working.

    Returns:
        canonical_stadium_id, or "stadium_unknown_{slug}" when nothing matched.
    """
    # Strategy 1: Home team's stadium is most reliable (sport-scoped)
    if home_team:
        team_stadium = home_team.get('stadium_canonical_id') or ''
        if team_stadium:
            return team_stadium

    venue_lower = (venue or '').lower().strip()
    sport_prefix = f"stadium_{sport.lower()}_"

    # Strategy 2: Exact alias match, accepted only for the correct sport
    exact_id = alias_lookup.get(venue_lower)
    if exact_id and exact_id.startswith(sport_prefix):
        return exact_id

    # Strategy 3: Partial match with sport check.
    # An empty string is a substring of every alias, so without this guard a
    # game with no venue would be assigned the first same-sport alias.
    if venue_lower:
        for alias, canonical_id in alias_lookup.items():
            if len(alias) <= 3:
                continue  # too short to be a meaningful substring match
            if alias in venue_lower or venue_lower in alias:
                if canonical_id.startswith(sport_prefix):
                    return canonical_id

    # Unknown stadium
    slug = venue_lower[:30].replace(' ', '_').replace('.', '')
    return f"stadium_unknown_{slug}"


# =============================================================================
# CANONICALIZATION
# =============================================================================

def canonicalize_games(
    raw_games: list[dict],
    canonical_teams: list[dict],
    stadium_aliases: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalGame], list[ResolutionWarning]]:
    """
    Stage 3: Canonicalize games.

    1. Resolve team abbreviations to canonical IDs
    2. Resolve venues to stadium canonical IDs
    3. Generate canonical game IDs (handling doubleheaders)

    Games whose home or away team cannot be resolved are skipped and reported
    as warnings. Games whose stadium cannot be resolved are kept with a
    "stadium_unknown_*" placeholder ID and also reported.

    Args:
        raw_games: List of raw game dicts
        canonical_teams: List of canonical team dicts; each must have
            'abbreviation', 'sport' and 'canonical_id'.
        stadium_aliases: List of stadium alias dicts
        verbose: Print detailed progress

    Returns:
        (canonical_games, warnings)
    """
    games = []
    warnings = []

    # Build lookups
    teams_by_abbrev = {}  # (sport, abbrev) -> team dict
    teams_by_id = {}  # canonical_id -> team dict

    for team in canonical_teams:
        abbrev = team['abbreviation'].upper()
        sport = team['sport']
        teams_by_abbrev[(sport, abbrev)] = team
        teams_by_id[team['canonical_id']] = team

    alias_lookup = build_alias_lookup(stadium_aliases)
    stadiums_by_id = {}  # Would be populated from stadiums_canonical.json if needed

    # Track games for doubleheader detection
    game_counts = defaultdict(int)  # (date, away_id, home_id) -> count

    resolved_count = 0
    unresolved_teams = 0
    unresolved_stadiums = 0

    for raw in raw_games:
        # `or ''` guards against JSON nulls, which .get(key, '') passes through.
        sport = (raw.get('sport') or '').upper()
        season = raw.get('season') or ''
        date = raw.get('date') or ''
        home_abbrev = (raw.get('home_team_abbrev') or '').upper()
        away_abbrev = (raw.get('away_team_abbrev') or '').upper()
        venue = raw.get('venue') or ''

        game_key = f"{date}_{away_abbrev}_{home_abbrev}"

        # Resolve teams
        home_team = resolve_team(home_abbrev, sport, teams_by_abbrev, teams_by_id)
        away_team = resolve_team(away_abbrev, sport, teams_by_abbrev, teams_by_id)

        if not home_team:
            warnings.append(ResolutionWarning(
                game_key=game_key,
                issue='Unknown home team',
                details=f"Could not resolve home team '{home_abbrev}' for sport {sport}"
            ))
            unresolved_teams += 1
            if verbose:
                print(f" WARNING: {game_key} - unknown home team {home_abbrev}")
            continue

        if not away_team:
            warnings.append(ResolutionWarning(
                game_key=game_key,
                issue='Unknown away team',
                details=f"Could not resolve away team '{away_abbrev}' for sport {sport}"
            ))
            unresolved_teams += 1
            if verbose:
                print(f" WARNING: {game_key} - unknown away team {away_abbrev}")
            continue

        # Resolve stadium
        stadium_canonical_id = resolve_stadium_from_venue(
            venue, home_team, sport, alias_lookup, stadiums_by_id
        )

        if stadium_canonical_id.startswith('stadium_unknown'):
            # The resolver only returns a placeholder when the home team has no
            # stadium of its own, so there is nothing better to fall back to.
            # Keep the placeholder rather than overwriting it with an empty ID.
            warnings.append(ResolutionWarning(
                game_key=game_key,
                issue='Unknown stadium',
                details=(
                    f"Could not resolve venue '{venue}'; home team has no stadium, "
                    f"using placeholder '{stadium_canonical_id}'"
                )
            ))
            unresolved_stadiums += 1

        # Handle doubleheaders: the Nth game of a matchup on a date gets sequence N
        matchup_key = (date, away_team['canonical_id'], home_team['canonical_id'])
        game_counts[matchup_key] += 1
        sequence = game_counts[matchup_key]

        # Generate canonical ID from the canonical (not raw) abbreviations
        canonical_id = generate_canonical_game_id(
            sport, season, date,
            away_team['abbreviation'], home_team['abbreviation'],
            sequence
        )

        games.append(CanonicalGame(
            canonical_id=canonical_id,
            sport=sport,
            season=season,
            date=date,
            time=raw.get('time'),
            home_team_canonical_id=home_team['canonical_id'],
            away_team_canonical_id=away_team['canonical_id'],
            stadium_canonical_id=stadium_canonical_id,
            is_playoff=raw.get('is_playoff', False),
            broadcast=raw.get('broadcast')
        ))
        resolved_count += 1

    if verbose:
        print(f"\n Resolved: {resolved_count} games")
        print(f" Unresolved teams: {unresolved_teams}")
        print(f" Unknown stadiums (placeholder ID used): {unresolved_stadiums}")

    return games, warnings


# =============================================================================
# MAIN
# =============================================================================

def _load_json(path: Path):
    """Read and parse a UTF-8 encoded JSON file."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def _dump_json(path: Path, payload) -> None:
    """Write payload to path as indented, UTF-8 encoded JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2)


def main():
    """
    Command-line entry point.

    Loads raw games, canonical teams and stadium aliases from JSON, runs
    canonicalize_games(), writes games_canonical.json (and
    game_resolution_warnings.json when there are warnings) to the output
    directory, and prints a summary.
    """
    parser = argparse.ArgumentParser(
        description='Canonicalize game data'
    )
    parser.add_argument(
        '--games',
        type=str,
        default='./data/games.json',
        help='Input raw games JSON file'
    )
    parser.add_argument(
        '--teams',
        type=str,
        default='./data/teams_canonical.json',
        help='Input canonical teams JSON file'
    )
    parser.add_argument(
        '--aliases',
        type=str,
        default='./data/stadium_aliases.json',
        help='Input stadium aliases JSON file'
    )
    parser.add_argument(
        '--output',
        type=str,
        default='./data',
        help='Output directory for canonical files'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Verbose output'
    )

    args = parser.parse_args()

    games_path = Path(args.games)
    teams_path = Path(args.teams)
    aliases_path = Path(args.aliases)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load input files
    print(f"Loading raw games from {games_path}...")
    raw_games = _load_json(games_path)
    print(f" Loaded {len(raw_games)} raw games")

    print(f"Loading canonical teams from {teams_path}...")
    canonical_teams = _load_json(teams_path)
    print(f" Loaded {len(canonical_teams)} canonical teams")

    print(f"Loading stadium aliases from {aliases_path}...")
    stadium_aliases = _load_json(aliases_path)
    print(f" Loaded {len(stadium_aliases)} stadium aliases")

    # Canonicalize games
    print("\nCanonicalizing games...")
    canonical_games, warnings = canonicalize_games(
        raw_games, canonical_teams, stadium_aliases, verbose=args.verbose
    )
    print(f" Created {len(canonical_games)} canonical games")

    if warnings:
        print(f"\n Warnings: {len(warnings)}")
        # Group by issue type
        by_issue = defaultdict(list)
        for w in warnings:
            by_issue[w.issue].append(w)
        for issue, issue_warnings in by_issue.items():
            print(f" - {issue}: {len(issue_warnings)}")

    # Export
    games_out_path = output_dir / 'games_canonical.json'
    warnings_out_path = output_dir / 'game_resolution_warnings.json'

    _dump_json(games_out_path, [asdict(g) for g in canonical_games])
    print(f"\nExported games to {games_out_path}")

    if warnings:
        _dump_json(warnings_out_path, [asdict(w) for w in warnings])
        print(f"Exported warnings to {warnings_out_path}")

    # Summary by sport
    print("\nSummary by sport:")
    by_sport = defaultdict(int)
    for g in canonical_games:
        by_sport[g.sport] += 1
    for sport, count in sorted(by_sport.items()):
        print(f" {sport}: {count} games")

    # Check for doubleheaders (games carrying a sequence suffix)
    doubleheaders = count_doubleheader_games(canonical_games)
    if doubleheaders:
        print(f"\n Doubleheader games detected: {doubleheaders}")


if __name__ == '__main__':
    main()