Files
Sportstime/Scripts/canonicalize_games.py
Trey t 41496b6bea feat(03-01): add NFL team abbreviation aliases
Add NFL entries to TEAM_ABBREV_ALIASES dict:
- Historical relocations: OAK→LV, SD→LAC, STL→LAR
- Common 3-letter variations: JAC, GNB, KAN, NWE, NOR, TAM, SFO
- Direct match for WAS included for completeness

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 09:35:21 -06:00

476 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Game Canonicalization for SportsTime
====================================
Stage 3 of the canonicalization pipeline.
Resolves team and stadium references in games, generates canonical game IDs.
Usage:
    python canonicalize_games.py --games data/games.json --teams data/teams_canonical.json \
        --aliases data/stadium_aliases.json --output data/
"""
import argparse
import json
from collections import defaultdict
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class CanonicalGame:
    """One game after canonicalization: a stable ID plus resolved references.

    Team and stadium fields hold canonical IDs rather than the raw
    abbreviations / venue names found in the input data.
    """

    # Identity of the game
    canonical_id: str
    sport: str
    season: str
    date: str  # YYYY-MM-DD
    time: Optional[str]

    # Resolved references (canonical IDs)
    home_team_canonical_id: str
    away_team_canonical_id: str
    stadium_canonical_id: str

    # Optional extras
    is_playoff: bool = False
    broadcast: Optional[str] = None
@dataclass
class ResolutionWarning:
    """Record of a problem hit while resolving one game's references."""

    game_key: str  # identifies the affected game: "{date}_{away}_{home}"
    issue: str  # short category label, e.g. 'Unknown home team'
    details: str  # human-readable explanation of what failed
# =============================================================================
# TEAM ABBREVIATION ALIASES
# Maps alternative abbreviations to canonical team IDs
# =============================================================================
TEAM_ABBREV_ALIASES: dict[tuple[str, str], str] = {
    # --- NBA ---------------------------------------------------------------
    ('NBA', 'PHX'):  'team_nba_pho',   # Phoenix
    ('NBA', 'BKN'):  'team_nba_brk',   # Brooklyn
    ('NBA', 'CHA'):  'team_nba_cho',   # Charlotte (older abbreviation)
    ('NBA', 'NOP'):  'team_nba_nop',   # New Orleans
    ('NBA', 'NO'):   'team_nba_nop',   # New Orleans (alternate)
    ('NBA', 'NY'):   'team_nba_nyk',   # New York
    ('NBA', 'SA'):   'team_nba_sas',   # San Antonio
    ('NBA', 'GS'):   'team_nba_gsw',   # Golden State
    ('NBA', 'UTAH'): 'team_nba_uta',   # Utah

    # --- MLB ---------------------------------------------------------------
    ('MLB', 'AZ'):   'team_mlb_ari',   # Arizona
    ('MLB', 'CWS'):  'team_mlb_chw',   # Chicago White Sox
    ('MLB', 'KC'):   'team_mlb_kcr',   # Kansas City
    ('MLB', 'SD'):   'team_mlb_sdp',   # San Diego
    ('MLB', 'SF'):   'team_mlb_sfg',   # San Francisco
    ('MLB', 'TB'):   'team_mlb_tbr',   # Tampa Bay
    ('MLB', 'WSH'):  'team_mlb_wsn',   # Washington
    ('MLB', 'WAS'):  'team_mlb_wsn',   # Washington (alternate)
    ('MLB', 'LA'):   'team_mlb_lad',   # Los Angeles Dodgers
    ('MLB', 'ATH'):  'team_mlb_oak',   # Oakland Athletics

    # --- NHL ---------------------------------------------------------------
    ('NHL', 'ARI'):  'team_nhl_ari',   # Arizona/Utah
    ('NHL', 'UTA'):  'team_nhl_ari',   # Utah Hockey Club (uses ARI code)
    ('NHL', 'VGS'):  'team_nhl_vgk',   # Vegas
    ('NHL', 'TB'):   'team_nhl_tbl',   # Tampa Bay Lightning
    ('NHL', 'NJ'):   'team_nhl_njd',   # New Jersey
    ('NHL', 'SJ'):   'team_nhl_sjs',   # San Jose
    ('NHL', 'LA'):   'team_nhl_lak',   # Los Angeles Kings
    ('NHL', 'MON'):  'team_nhl_mtl',   # Montreal

    # --- NFL ---------------------------------------------------------------
    ('NFL', 'JAC'):  'team_nfl_jax',   # Jacksonville (JAC vs JAX)
    ('NFL', 'OAK'):  'team_nfl_lv',    # Oakland -> Las Vegas Raiders (moved 2020)
    ('NFL', 'SD'):   'team_nfl_lac',   # San Diego -> Los Angeles Chargers (moved 2017)
    ('NFL', 'STL'):  'team_nfl_lar',   # St. Louis -> Los Angeles Rams (moved 2016)
    ('NFL', 'GNB'):  'team_nfl_gb',    # Green Bay (alternate)
    ('NFL', 'KAN'):  'team_nfl_kc',    # Kansas City (alternate)
    ('NFL', 'NWE'):  'team_nfl_ne',    # New England (alternate)
    ('NFL', 'NOR'):  'team_nfl_no',    # New Orleans (alternate)
    ('NFL', 'TAM'):  'team_nfl_tb',    # Tampa Bay (alternate)
    ('NFL', 'SFO'):  'team_nfl_sf',    # San Francisco (alternate)
    ('NFL', 'WAS'):  'team_nfl_was',   # Washington (direct match; listed for completeness)
}
# =============================================================================
# ID GENERATION
# =============================================================================
def normalize_season(sport: str, season: str) -> str:
    """
    Collapse a season label into the hyphen-free form used inside game IDs.

    Split-year labels lose their hyphen ("2025-26" -> "202526"); single-year
    labels ("2026") come back unchanged. ``sport`` is part of the signature
    but is not consulted by the current implementation.
    """
    return ''.join(season.split('-'))
def generate_canonical_game_id(
    sport: str,
    season: str,
    date: str,  # YYYY-MM-DD
    away_abbrev: str,
    home_abbrev: str,
    sequence: int = 1
) -> str:
    """
    Build the deterministic canonical ID for a game.

    Layout: game_{sport}_{season}_{date}_{away}_{home}[_{sequence}]
    All text components are lower-cased, hyphens are dropped from the
    season and date, and the sequence suffix is only appended when
    ``sequence`` is greater than 1.

    Examples:
        game_nba_202526_20251021_hou_okc
        game_mlb_2026_20260615_bos_nyy_2   (doubleheader game 2)
    """
    season_part = normalize_season(sport, season)
    date_part = date.replace('-', '')  # YYYYMMDD

    pieces = [
        'game',
        sport.lower(),
        season_part,
        date_part,
        away_abbrev.lower(),
        home_abbrev.lower(),
    ]
    # The first game of a matchup keeps the bare ID; later ones get a suffix.
    if sequence > 1:
        pieces.append(str(sequence))
    return '_'.join(pieces)
# =============================================================================
# RESOLUTION
# =============================================================================
def build_alias_lookup(stadium_aliases: list[dict]) -> dict[str, str]:
    """
    Build a lookup from alias name to canonical stadium ID.

    Alias names are lower-cased and stripped of surrounding whitespace.
    Entries with a missing, null or empty alias name or stadium ID are
    skipped. When the same alias name occurs more than once, the last
    occurrence wins.

    Args:
        stadium_aliases: Alias dicts carrying 'alias_name' and
            'stadium_canonical_id' keys.

    Returns:
        {alias_name_lower: canonical_stadium_id}
    """
    lookup: dict[str, str] = {}
    for alias in stadium_aliases:
        # `or ''` also covers explicit JSON nulls, which a .get() default
        # does not (the key exists, its value is None).
        alias_name = (alias.get('alias_name') or '').lower().strip()
        canonical_id = alias.get('stadium_canonical_id') or ''
        if alias_name and canonical_id:
            lookup[alias_name] = canonical_id
    return lookup
def resolve_team(
    abbrev: str,
    sport: str,
    teams_by_abbrev: dict[tuple[str, str], dict],
    teams_by_id: dict[str, dict]
) -> Optional[dict]:
    """
    Look up the canonical team record for an abbreviation.

    Lookup order:
      1. (sport, ABBREV) directly in ``teams_by_abbrev``.
      2. (sport, ABBREV) in TEAM_ABBREV_ALIASES, then the aliased canonical
         ID in ``teams_by_id``.

    The abbreviation is upper-cased before lookup; ``sport`` is used as
    given. Returns the team dict, or None when neither lookup succeeds.
    """
    lookup_key = (sport, abbrev.upper())

    # Direct hit on the team's own abbreviation.
    if lookup_key in teams_by_abbrev:
        return teams_by_abbrev[lookup_key]

    # No alias registered for this abbreviation either.
    if lookup_key not in TEAM_ABBREV_ALIASES:
        return None

    # An alias only helps if the team it points at is actually loaded.
    aliased_id = TEAM_ABBREV_ALIASES[lookup_key]
    return teams_by_id[aliased_id] if aliased_id in teams_by_id else None
def resolve_stadium_from_venue(
    venue: str,
    home_team: dict,
    sport: str,
    alias_lookup: dict[str, str],
    stadiums_by_id: dict[str, dict]
) -> str:
    """
    Resolve stadium canonical ID from venue name.

    Strategy:
      1. ALWAYS prefer the home team's stadium (most reliable, sport-correct).
      2. Exact alias match on the normalized venue name, accepted only when
         the alias points at a stadium of the same sport.
      3. Partial alias match (alias contained in the venue name or vice
         versa, aliases of more than 3 characters only), same sport check.
      4. Fall back to an "unknown stadium" slug derived from the venue name.

    For multi-sport venues (MSG, Crypto.com Arena, etc.), the home team's
    stadium_canonical_id is authoritative because it's already sport-scoped.

    An empty or missing venue never takes part in alias matching: an empty
    string is a substring of every alias, so matching on it would return an
    arbitrary stadium of the sport.

    Args:
        venue: Venue name from game data (may be empty or None)
        home_team: Resolved home team dict (may be empty or None)
        sport: Sport code (e.g. NBA, MLB, NHL, NFL)
        alias_lookup: {alias_name_lower: canonical_stadium_id}
        stadiums_by_id: {canonical_id: stadium_dict}; not consulted here

    Returns:
        canonical_stadium_id, or "stadium_unknown_{slug}" when nothing matched
    """
    # Strategy 1: home team's stadium is most reliable (sport-scoped)
    if home_team:
        team_stadium = home_team.get('stadium_canonical_id', '')
        if team_stadium:
            return team_stadium

    venue_lower = (venue or '').lower().strip()
    sport_prefix = f"stadium_{sport.lower()}_"

    if venue_lower:
        # Strategy 2: exact alias match, only for the correct sport
        matched_id = alias_lookup.get(venue_lower)
        if matched_id is not None and matched_id.startswith(sport_prefix):
            return matched_id

        # Strategy 3: partial match with sport check. Very short aliases
        # are skipped because they match too many venue names.
        for alias, canonical_id in alias_lookup.items():
            if len(alias) > 3 and (alias in venue_lower or venue_lower in alias):
                if canonical_id.startswith(sport_prefix):
                    return canonical_id

    # Strategy 4: unknown stadium; slug is built from the first 30 characters
    slug = venue_lower[:30].replace(' ', '_').replace('.', '')
    return f"stadium_unknown_{slug}"
# =============================================================================
# CANONICALIZATION
# =============================================================================
def canonicalize_games(
    raw_games: list[dict],
    canonical_teams: list[dict],
    stadium_aliases: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalGame], list[ResolutionWarning]]:
    """
    Stage 3: Canonicalize games.

    1. Resolve team abbreviations to canonical IDs
    2. Resolve venues to stadium canonical IDs
    3. Generate canonical game IDs (handling doubleheaders)

    Games whose home or away team cannot be resolved are skipped and
    reported as warnings. Games whose stadium cannot be resolved are kept
    with a "stadium_unknown_..." ID and also reported as warnings.

    Raw fields that are missing or JSON null are treated as empty strings.

    Args:
        raw_games: List of raw game dicts
        canonical_teams: List of canonical team dicts (each must carry
            'abbreviation', 'sport' and 'canonical_id')
        stadium_aliases: List of stadium alias dicts
        verbose: Print detailed progress

    Returns:
        (canonical_games, warnings)
    """
    games: list[CanonicalGame] = []
    warnings: list[ResolutionWarning] = []

    # Build lookups
    teams_by_abbrev = {}  # (sport, abbrev) -> team dict
    teams_by_id = {}  # canonical_id -> team dict
    for team in canonical_teams:
        abbrev = team['abbreviation'].upper()
        teams_by_abbrev[(team['sport'], abbrev)] = team
        teams_by_id[team['canonical_id']] = team

    alias_lookup = build_alias_lookup(stadium_aliases)
    stadiums_by_id = {}  # Would be populated from stadiums_canonical.json if needed

    # Track games for doubleheader detection
    game_counts = defaultdict(int)  # (date, away_id, home_id) -> count

    resolved_count = 0
    unresolved_teams = 0
    unresolved_stadiums = 0

    for raw in raw_games:
        # `or ''` also covers explicit nulls, which a .get() default does not.
        sport = (raw.get('sport') or '').upper()
        season = raw.get('season') or ''
        date = raw.get('date') or ''
        home_abbrev = (raw.get('home_team_abbrev') or '').upper()
        away_abbrev = (raw.get('away_team_abbrev') or '').upper()
        venue = raw.get('venue') or ''

        game_key = f"{date}_{away_abbrev}_{home_abbrev}"

        # Resolve teams
        home_team = resolve_team(home_abbrev, sport, teams_by_abbrev, teams_by_id)
        away_team = resolve_team(away_abbrev, sport, teams_by_abbrev, teams_by_id)

        # Report the first unresolved side (home is checked before away)
        # and skip the game; only one warning is recorded per game.
        unresolved = None
        for side, team, side_abbrev in (
            ('home', home_team, home_abbrev),
            ('away', away_team, away_abbrev),
        ):
            if not team:
                unresolved = (side, side_abbrev)
                break
        if unresolved is not None:
            side, side_abbrev = unresolved
            warnings.append(ResolutionWarning(
                game_key=game_key,
                issue=f'Unknown {side} team',
                details=f"Could not resolve {side} team '{side_abbrev}' for sport {sport}"
            ))
            unresolved_teams += 1
            if verbose:
                print(f" WARNING: {game_key} - unknown {side} team {side_abbrev}")
            continue

        # Resolve stadium
        stadium_canonical_id = resolve_stadium_from_venue(
            venue, home_team, sport, alias_lookup, stadiums_by_id
        )
        if stadium_canonical_id.startswith('stadium_unknown'):
            warnings.append(ResolutionWarning(
                game_key=game_key,
                issue='Unknown stadium',
                details=f"Could not resolve venue '{venue}', using home team stadium"
            ))
            unresolved_stadiums += 1
            # Fall back to home team stadium, but only when it has a usable
            # value: a present-but-empty 'stadium_canonical_id' must not
            # overwrite the unknown-stadium slug with '' or None.
            stadium_canonical_id = (
                home_team.get('stadium_canonical_id') or stadium_canonical_id
            )

        # Handle doubleheaders: the Nth game of the same matchup on the
        # same date (in input order) gets sequence N.
        matchup_key = (date, away_team['canonical_id'], home_team['canonical_id'])
        game_counts[matchup_key] += 1
        sequence = game_counts[matchup_key]

        # Generate canonical ID from the teams' canonical abbreviations,
        # not from the (possibly aliased) abbreviations in the raw data.
        canonical_id = generate_canonical_game_id(
            sport, season, date,
            away_team['abbreviation'], home_team['abbreviation'],
            sequence
        )

        games.append(CanonicalGame(
            canonical_id=canonical_id,
            sport=sport,
            season=season,
            date=date,
            time=raw.get('time'),
            home_team_canonical_id=home_team['canonical_id'],
            away_team_canonical_id=away_team['canonical_id'],
            stadium_canonical_id=stadium_canonical_id,
            is_playoff=raw.get('is_playoff', False),
            broadcast=raw.get('broadcast')
        ))
        resolved_count += 1

    if verbose:
        print(f"\n Resolved: {resolved_count} games")
        print(f" Unresolved teams: {unresolved_teams}")
        print(f" Unknown stadiums (used home team): {unresolved_stadiums}")

    return games, warnings
# =============================================================================
# MAIN
# =============================================================================
def main():
    """
    Command-line entry point.

    Loads the raw games, canonical teams and stadium aliases JSON files,
    runs canonicalize_games() on them, and writes 'games_canonical.json'
    (always) and 'game_resolution_warnings.json' (only when there are
    warnings) into the output directory. Prints a summary to stdout.
    """
    parser = argparse.ArgumentParser(
        description='Canonicalize game data'
    )
    parser.add_argument(
        '--games', type=str, default='./data/games.json',
        help='Input raw games JSON file'
    )
    parser.add_argument(
        '--teams', type=str, default='./data/teams_canonical.json',
        help='Input canonical teams JSON file'
    )
    parser.add_argument(
        '--aliases', type=str, default='./data/stadium_aliases.json',
        help='Input stadium aliases JSON file'
    )
    parser.add_argument(
        '--output', type=str, default='./data',
        help='Output directory for canonical files'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )
    args = parser.parse_args()

    games_path = Path(args.games)
    teams_path = Path(args.teams)
    aliases_path = Path(args.aliases)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load input files. The encoding is explicit so the result does not
    # depend on the platform's locale default.
    print(f"Loading raw games from {games_path}...")
    with open(games_path, encoding='utf-8') as f:
        raw_games = json.load(f)
    print(f" Loaded {len(raw_games)} raw games")

    print(f"Loading canonical teams from {teams_path}...")
    with open(teams_path, encoding='utf-8') as f:
        canonical_teams = json.load(f)
    print(f" Loaded {len(canonical_teams)} canonical teams")

    print(f"Loading stadium aliases from {aliases_path}...")
    with open(aliases_path, encoding='utf-8') as f:
        stadium_aliases = json.load(f)
    print(f" Loaded {len(stadium_aliases)} stadium aliases")

    # Canonicalize games
    print("\nCanonicalizing games...")
    canonical_games, warnings = canonicalize_games(
        raw_games, canonical_teams, stadium_aliases, verbose=args.verbose
    )
    print(f" Created {len(canonical_games)} canonical games")

    if warnings:
        print(f"\n Warnings: {len(warnings)}")
        # Group by issue type
        by_issue = defaultdict(list)
        for w in warnings:
            by_issue[w.issue].append(w)
        for issue, issue_warnings in by_issue.items():
            print(f" - {issue}: {len(issue_warnings)}")

    # Export
    games_out_path = output_dir / 'games_canonical.json'
    warnings_out_path = output_dir / 'game_resolution_warnings.json'

    with open(games_out_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(g) for g in canonical_games], f, indent=2)
    print(f"\nExported games to {games_out_path}")

    if warnings:
        with open(warnings_out_path, 'w', encoding='utf-8') as f:
            json.dump([asdict(w) for w in warnings], f, indent=2)
        print(f"Exported warnings to {warnings_out_path}")

    # Summary by sport
    print("\nSummary by sport:")
    by_sport = defaultdict(int)
    for g in canonical_games:
        by_sport[g.sport] += 1
    for sport, count in sorted(by_sport.items()):
        print(f" {sport}: {count} games")

    # Check for doubleheaders. Only games after the first of a matchup get
    # a numeric sequence suffix, so test the final '_' segment; a plain
    # substring test for '_2' / '_3' would also hit the season and date
    # digits that every ID contains.
    doubleheaders = sum(
        1 for g in canonical_games
        if g.canonical_id.rsplit('_', 1)[-1].isdigit()
    )
    if doubleheaders:
        print(f"\n Doubleheader games detected: {doubleheaders}")


if __name__ == '__main__':
    main()