Add NFL entries to TEAM_ABBREV_ALIASES dict:
- Historical relocations: OAK→LV, SD→LAC, STL→LAR
- Common 3-letter variations: JAC, GNB, KAN, NWE, NOR, TAM, SFO
- Direct match for WAS included for completeness

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
476 lines
16 KiB
Python
476 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Game Canonicalization for SportsTime
|
|
====================================
|
|
Stage 3 of the canonicalization pipeline.
|
|
|
|
Resolves team and stadium references in games, generates canonical game IDs.
|
|
|
|
Usage:
|
|
python canonicalize_games.py --games data/games.json --teams data/teams_canonical.json \
|
|
--aliases data/stadium_aliases.json --output data/
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass, asdict
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
|
|
# =============================================================================
|
|
# DATA CLASSES
|
|
# =============================================================================
|
|
|
|
@dataclass
class CanonicalGame:
    """One game after canonicalization: a stable ID plus resolved references."""

    # --- identity ---
    canonical_id: str
    sport: str
    season: str

    # --- scheduling ---
    date: str  # formatted as YYYY-MM-DD
    time: Optional[str]

    # --- resolved references (canonical IDs, not raw abbreviations/venues) ---
    home_team_canonical_id: str
    away_team_canonical_id: str
    stadium_canonical_id: str

    # --- optional metadata ---
    is_playoff: bool = False
    broadcast: Optional[str] = None
|
|
|
|
|
|
@dataclass
class ResolutionWarning:
    """Record of a reference that could not be resolved cleanly for a game."""

    game_key: str  # key identifying the affected game
    issue: str     # short category label for the problem
    details: str   # longer human-readable explanation
|
|
|
|
|
|
# =============================================================================
|
|
# TEAM ABBREVIATION ALIASES
|
|
# Maps alternative abbreviations to canonical team IDs
|
|
# =============================================================================
|
|
|
|
# Keyed by (SPORT, ALTERNATE_ABBREV); the value is the canonical team ID that
# the alternate abbreviation should resolve to.
TEAM_ABBREV_ALIASES = {
    # ---- NBA ----
    ('NBA', 'PHX'): 'team_nba_pho',   # Phoenix
    ('NBA', 'BKN'): 'team_nba_brk',   # Brooklyn
    ('NBA', 'CHA'): 'team_nba_cho',   # Charlotte (older abbreviation)
    ('NBA', 'NOP'): 'team_nba_nop',   # New Orleans
    ('NBA', 'NO'): 'team_nba_nop',    # New Orleans, short form
    ('NBA', 'NY'): 'team_nba_nyk',    # New York
    ('NBA', 'SA'): 'team_nba_sas',    # San Antonio
    ('NBA', 'GS'): 'team_nba_gsw',    # Golden State
    ('NBA', 'UTAH'): 'team_nba_uta',  # Utah

    # ---- MLB ----
    ('MLB', 'AZ'): 'team_mlb_ari',    # Arizona
    ('MLB', 'CWS'): 'team_mlb_chw',   # Chicago White Sox
    ('MLB', 'KC'): 'team_mlb_kcr',    # Kansas City
    ('MLB', 'SD'): 'team_mlb_sdp',    # San Diego
    ('MLB', 'SF'): 'team_mlb_sfg',    # San Francisco
    ('MLB', 'TB'): 'team_mlb_tbr',    # Tampa Bay
    ('MLB', 'WSH'): 'team_mlb_wsn',   # Washington
    ('MLB', 'WAS'): 'team_mlb_wsn',   # Washington, alternate form
    ('MLB', 'LA'): 'team_mlb_lad',    # Los Angeles Dodgers
    ('MLB', 'ATH'): 'team_mlb_oak',   # Oakland Athletics

    # ---- NHL ----
    ('NHL', 'ARI'): 'team_nhl_ari',   # Arizona/Utah
    ('NHL', 'UTA'): 'team_nhl_ari',   # Utah Hockey Club (kept under the ARI code)
    ('NHL', 'VGS'): 'team_nhl_vgk',   # Vegas
    ('NHL', 'TB'): 'team_nhl_tbl',    # Tampa Bay Lightning
    ('NHL', 'NJ'): 'team_nhl_njd',    # New Jersey
    ('NHL', 'SJ'): 'team_nhl_sjs',    # San Jose
    ('NHL', 'LA'): 'team_nhl_lak',    # Los Angeles Kings
    ('NHL', 'MON'): 'team_nhl_mtl',   # Montreal

    # ---- NFL ----
    ('NFL', 'JAC'): 'team_nfl_jax',   # Jacksonville (JAC vs JAX)
    ('NFL', 'OAK'): 'team_nfl_lv',    # Oakland -> Las Vegas Raiders (moved 2020)
    ('NFL', 'SD'): 'team_nfl_lac',    # San Diego -> Los Angeles Chargers (moved 2017)
    ('NFL', 'STL'): 'team_nfl_lar',   # St. Louis -> Los Angeles Rams (moved 2016)
    ('NFL', 'GNB'): 'team_nfl_gb',    # Green Bay, 3-letter variant
    ('NFL', 'KAN'): 'team_nfl_kc',    # Kansas City, 3-letter variant
    ('NFL', 'NWE'): 'team_nfl_ne',    # New England, 3-letter variant
    ('NFL', 'NOR'): 'team_nfl_no',    # New Orleans, 3-letter variant
    ('NFL', 'TAM'): 'team_nfl_tb',    # Tampa Bay, 3-letter variant
    ('NFL', 'SFO'): 'team_nfl_sf',    # San Francisco, 3-letter variant
    ('NFL', 'WAS'): 'team_nfl_was',   # Washington (already a direct match; listed for completeness)
}
|
|
|
|
|
|
# =============================================================================
|
|
# ID GENERATION
|
|
# =============================================================================
|
|
|
|
def normalize_season(sport: str, season: str) -> str:
    """
    Collapse a season label into the hyphen-free form used inside game IDs.

    Every '-' is dropped, so a split-year label such as "2025-26" becomes
    "202526", while a single-year label such as "2026" passes through unchanged.

    `sport` is part of the signature but is not consulted; all sports are
    normalized the same way.
    """
    pieces = season.split('-')
    return ''.join(pieces)
|
|
|
|
|
|
def generate_canonical_game_id(
    sport: str,
    season: str,
    date: str,  # YYYY-MM-DD
    away_abbrev: str,
    home_abbrev: str,
    sequence: int = 1
) -> str:
    """
    Build the deterministic canonical ID for a single game.

    Layout: game_{sport}_{season}_{date}_{away}_{home}[_{sequence}]

    The sport and both abbreviations are lower-cased, the season goes through
    normalize_season(), and hyphens are stripped from the date. The sequence
    suffix is only appended when it is greater than 1.

    Examples:
        game_nba_202526_20251021_hou_okc
        game_mlb_2026_20260615_bos_nyy_2   (doubleheader game 2)
    """
    segments = [
        'game',
        sport.lower(),
        normalize_season(sport, season),
        date.replace('-', ''),  # YYYYMMDD
        away_abbrev.lower(),
        home_abbrev.lower(),
    ]

    # The first game of a matchup keeps the bare ID; later ones get numbered.
    if sequence > 1:
        segments.append(str(sequence))

    return '_'.join(segments)
|
|
|
|
|
|
# =============================================================================
|
|
# RESOLUTION
|
|
# =============================================================================
|
|
|
|
def build_alias_lookup(stadium_aliases: list[dict]) -> dict[str, str]:
    """
    Build a lookup from stadium alias name to canonical stadium ID.

    Alias names are lower-cased and stripped of surrounding whitespace before
    being used as keys. Entries whose alias name or canonical ID is missing,
    null, or empty are skipped. If the same normalized alias appears more than
    once, the last entry wins.

    Args:
        stadium_aliases: Alias dicts carrying 'alias_name' and
            'stadium_canonical_id' keys.

    Returns:
        {alias_name_lower: canonical_stadium_id}
    """
    lookup = {}
    for alias in stadium_aliases:
        # `or ''` (rather than a .get() default) also covers keys that are
        # present but null, which would otherwise raise on .lower().
        alias_name = (alias.get('alias_name') or '').lower().strip()
        canonical_id = alias.get('stadium_canonical_id') or ''
        if alias_name and canonical_id:
            lookup[alias_name] = canonical_id
    return lookup
|
|
|
|
|
|
def resolve_team(
    abbrev: str,
    sport: str,
    teams_by_abbrev: dict[tuple[str, str], dict],
    teams_by_id: dict[str, dict]
) -> Optional[dict]:
    """
    Look up the canonical team for an abbreviation within a sport.

    Resolution order:
      1. Exact (sport, ABBREV) hit in `teams_by_abbrev`.
      2. (sport, ABBREV) found in TEAM_ABBREV_ALIASES, and the aliased
         canonical ID present in `teams_by_id`.
      3. Otherwise None.

    The abbreviation is upper-cased before lookup; `sport` is used as given.
    """
    lookup_key = (sport, abbrev.upper())

    # Step 1: the abbreviation is already the canonical one.
    if lookup_key in teams_by_abbrev:
        return teams_by_abbrev[lookup_key]

    # Step 2: translate a known alternate abbreviation to a canonical ID.
    aliased_id = TEAM_ABBREV_ALIASES.get(lookup_key)
    if aliased_id is None:
        return None

    # An alias pointing at a team that is not loaded resolves to None.
    return teams_by_id.get(aliased_id)
|
|
|
|
|
|
def resolve_stadium_from_venue(
    venue: str,
    home_team: dict,
    sport: str,
    alias_lookup: dict[str, str],
    stadiums_by_id: dict[str, dict]
) -> str:
    """
    Resolve stadium canonical ID from venue name.

    Strategy:
    1. ALWAYS prefer the home team's stadium when it has one.
    2. Exact alias match on the normalized venue name, accepted only if the
       matched ID starts with "stadium_{sport}_".
    3. Substring alias match (either direction, aliases longer than 3 chars),
       again accepted only for the same sport prefix. Skipped when the venue
       name is empty.
    4. Fall back to "stadium_unknown_{slug}".

    For multi-sport venues (MSG, Crypto.com Arena, etc.), the home team's
    stadium_canonical_id is authoritative because it's already sport-scoped.

    Args:
        venue: Venue name from game data (None or empty is tolerated)
        home_team: Resolved home team dict (may be empty/None)
        sport: Sport code (e.g. NBA, MLB, NHL, NFL)
        alias_lookup: {alias_name_lower: canonical_stadium_id}
        stadiums_by_id: {canonical_id: stadium_dict}; currently unused, kept
            for interface compatibility

    Returns:
        canonical_stadium_id
    """
    # Strategy 1: Home team's stadium is most reliable (sport-scoped)
    if home_team:
        team_stadium = home_team.get('stadium_canonical_id', '')
        if team_stadium:
            return team_stadium

    # A null venue in the input must not crash the resolver.
    venue_lower = (venue or '').lower().strip()
    sport_prefix = f"stadium_{sport.lower()}_"

    # Strategy 2: exact alias match, only if it belongs to the correct sport
    matched_id = alias_lookup.get(venue_lower)
    if matched_id is not None and matched_id.startswith(sport_prefix):
        return matched_id

    # Strategy 3: partial match with sport check.
    # An empty string is a substring of every alias, so without this guard a
    # blank venue would "match" the first same-sport alias in the table.
    if venue_lower:
        for alias, canonical_id in alias_lookup.items():
            if len(alias) > 3 and (alias in venue_lower or venue_lower in alias):
                if canonical_id.startswith(sport_prefix):
                    return canonical_id

    # Strategy 4: unknown stadium; slug is the first 30 chars of the venue
    # with spaces turned into underscores and dots removed.
    slug = venue_lower[:30].replace(' ', '_').replace('.', '')
    return f"stadium_unknown_{slug}"
|
|
|
|
|
|
# =============================================================================
|
|
# CANONICALIZATION
|
|
# =============================================================================
|
|
|
|
def canonicalize_games(
    raw_games: list[dict],
    canonical_teams: list[dict],
    stadium_aliases: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalGame], list[ResolutionWarning]]:
    """
    Stage 3: Canonicalize games.

    1. Resolve team abbreviations to canonical IDs
    2. Resolve venues to stadium canonical IDs
    3. Generate canonical game IDs (handling doubleheaders)

    Games whose home or away team cannot be resolved are dropped and reported
    as warnings. Games whose stadium cannot be resolved are kept with a
    "stadium_unknown_*" ID (or the home team's stadium when it has one) and
    also reported as warnings.

    Args:
        raw_games: List of raw game dicts (reads 'sport', 'season', 'date',
            'home_team_abbrev', 'away_team_abbrev', 'venue', 'time',
            'is_playoff', 'broadcast')
        canonical_teams: List of canonical team dicts (reads 'abbreviation',
            'sport', 'canonical_id', and optionally 'stadium_canonical_id')
        stadium_aliases: List of stadium alias dicts
        verbose: Print detailed progress

    Returns:
        (canonical_games, warnings)
    """
    games = []
    warnings = []

    # Build lookups
    teams_by_abbrev = {}  # (sport, ABBREV) -> team dict
    teams_by_id = {}      # canonical_id -> team dict

    for team in canonical_teams:
        abbrev = team['abbreviation'].upper()
        sport = team['sport']
        teams_by_abbrev[(sport, abbrev)] = team
        teams_by_id[team['canonical_id']] = team

    alias_lookup = build_alias_lookup(stadium_aliases)
    stadiums_by_id = {}  # Would be populated from stadiums_canonical.json if needed

    # Track games for doubleheader detection: the Nth game seen for the same
    # (date, away, home) gets sequence N, in input order.
    game_counts = defaultdict(int)  # (date, away_id, home_id) -> count

    resolved_count = 0
    unresolved_teams = 0
    unresolved_stadiums = 0

    for raw in raw_games:
        sport = raw.get('sport', '').upper()
        season = raw.get('season', '')
        date = raw.get('date', '')
        home_abbrev = raw.get('home_team_abbrev', '').upper()
        away_abbrev = raw.get('away_team_abbrev', '').upper()
        venue = raw.get('venue', '')

        # Built from the raw abbreviations so warnings can be traced back to
        # the input even when resolution fails.
        game_key = f"{date}_{away_abbrev}_{home_abbrev}"

        # Resolve teams
        home_team = resolve_team(home_abbrev, sport, teams_by_abbrev, teams_by_id)
        away_team = resolve_team(away_abbrev, sport, teams_by_abbrev, teams_by_id)

        if not home_team:
            warnings.append(ResolutionWarning(
                game_key=game_key,
                issue='Unknown home team',
                details=f"Could not resolve home team '{home_abbrev}' for sport {sport}"
            ))
            unresolved_teams += 1
            if verbose:
                print(f"  WARNING: {game_key} - unknown home team {home_abbrev}")
            continue

        if not away_team:
            warnings.append(ResolutionWarning(
                game_key=game_key,
                issue='Unknown away team',
                details=f"Could not resolve away team '{away_abbrev}' for sport {sport}"
            ))
            unresolved_teams += 1
            if verbose:
                print(f"  WARNING: {game_key} - unknown away team {away_abbrev}")
            continue

        # Resolve stadium
        stadium_canonical_id = resolve_stadium_from_venue(
            venue, home_team, sport, alias_lookup, stadiums_by_id
        )

        if stadium_canonical_id.startswith('stadium_unknown'):
            warnings.append(ResolutionWarning(
                game_key=game_key,
                issue='Unknown stadium',
                details=f"Could not resolve venue '{venue}', using home team stadium"
            ))
            unresolved_stadiums += 1
            # Fall back to home team stadium. `or` (not a .get() default) so
            # that a present-but-empty value keeps the "stadium_unknown_*"
            # placeholder instead of blanking the ID.
            stadium_canonical_id = (
                home_team.get('stadium_canonical_id') or stadium_canonical_id
            )

        # Handle doubleheaders
        matchup_key = (date, away_team['canonical_id'], home_team['canonical_id'])
        game_counts[matchup_key] += 1
        sequence = game_counts[matchup_key]

        # Generate canonical ID from the canonical (not raw) abbreviations so
        # aliased inputs produce the same ID.
        canonical_id = generate_canonical_game_id(
            sport, season, date,
            away_team['abbreviation'], home_team['abbreviation'],
            sequence
        )

        game = CanonicalGame(
            canonical_id=canonical_id,
            sport=sport,
            season=season,
            date=date,
            time=raw.get('time'),
            home_team_canonical_id=home_team['canonical_id'],
            away_team_canonical_id=away_team['canonical_id'],
            stadium_canonical_id=stadium_canonical_id,
            is_playoff=raw.get('is_playoff', False),
            broadcast=raw.get('broadcast')
        )
        games.append(game)
        resolved_count += 1

    if verbose:
        print(f"\n  Resolved: {resolved_count} games")
        print(f"  Unresolved teams: {unresolved_teams}")
        print(f"  Unknown stadiums (used home team): {unresolved_stadiums}")

    return games, warnings
|
|
|
|
|
|
# =============================================================================
|
|
# MAIN
|
|
# =============================================================================
|
|
|
|
def main():
    """
    Command-line entry point for Stage 3 (game canonicalization).

    Reads the raw games, canonical teams and stadium aliases JSON files,
    runs canonicalize_games(), and writes games_canonical.json (plus
    game_resolution_warnings.json when there are warnings) into the output
    directory. A per-sport summary and a doubleheader count are printed.
    """
    parser = argparse.ArgumentParser(
        description='Canonicalize game data'
    )
    parser.add_argument(
        '--games', type=str, default='./data/games.json',
        help='Input raw games JSON file'
    )
    parser.add_argument(
        '--teams', type=str, default='./data/teams_canonical.json',
        help='Input canonical teams JSON file'
    )
    parser.add_argument(
        '--aliases', type=str, default='./data/stadium_aliases.json',
        help='Input stadium aliases JSON file'
    )
    parser.add_argument(
        '--output', type=str, default='./data',
        help='Output directory for canonical files'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )

    args = parser.parse_args()
    games_path = Path(args.games)
    teams_path = Path(args.teams)
    aliases_path = Path(args.aliases)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load input files. The encoding is pinned to UTF-8 so reads do not
    # depend on the platform's locale default.
    print(f"Loading raw games from {games_path}...")
    with open(games_path, encoding='utf-8') as f:
        raw_games = json.load(f)
    print(f"  Loaded {len(raw_games)} raw games")

    print(f"Loading canonical teams from {teams_path}...")
    with open(teams_path, encoding='utf-8') as f:
        canonical_teams = json.load(f)
    print(f"  Loaded {len(canonical_teams)} canonical teams")

    print(f"Loading stadium aliases from {aliases_path}...")
    with open(aliases_path, encoding='utf-8') as f:
        stadium_aliases = json.load(f)
    print(f"  Loaded {len(stadium_aliases)} stadium aliases")

    # Canonicalize games
    print("\nCanonicalizing games...")
    canonical_games, warnings = canonicalize_games(
        raw_games, canonical_teams, stadium_aliases, verbose=args.verbose
    )
    print(f"  Created {len(canonical_games)} canonical games")

    if warnings:
        print(f"\n  Warnings: {len(warnings)}")
        # Group by issue type
        by_issue = defaultdict(list)
        for w in warnings:
            by_issue[w.issue].append(w)
        for issue, issue_warnings in by_issue.items():
            print(f"    - {issue}: {len(issue_warnings)}")

    # Export
    games_path = output_dir / 'games_canonical.json'
    warnings_path = output_dir / 'game_resolution_warnings.json'

    with open(games_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(g) for g in canonical_games], f, indent=2)
    print(f"\nExported games to {games_path}")

    if warnings:
        with open(warnings_path, 'w', encoding='utf-8') as f:
            json.dump([asdict(w) for w in warnings], f, indent=2)
        print(f"Exported warnings to {warnings_path}")

    # Summary by sport
    print("\nSummary by sport:")
    by_sport = {}
    for g in canonical_games:
        by_sport[g.sport] = by_sport.get(g.sport, 0) + 1
    for sport, count in sorted(by_sport.items()):
        print(f"  {sport}: {count} games")

    # Check for doubleheaders. Only IDs for game 2+ of a matchup end in a
    # numeric "_{sequence}" segment; a plain substring test for '_2'/'_3'
    # would also hit the season/date segments (e.g. "_202526") of every ID.
    # NOTE(review): assumes team abbreviations are never purely numeric.
    doubleheaders = sum(
        1 for g in canonical_games
        if g.canonical_id.rsplit('_', 1)[-1].isdigit()
    )
    if doubleheaders:
        print(f"\n  Doubleheader games detected: {doubleheaders}")
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|