Add canonical ID pipeline and fix UUID consistency for CloudKit sync

- Add local canonicalization pipeline (stadiums, teams, games) that generates
  deterministic canonical IDs before CloudKit upload
- Fix CanonicalSyncService to use deterministic UUIDs from canonical IDs
  instead of random UUIDs from CloudKit records
- Add SyncStadium/SyncTeam/SyncGame types to CloudKitService that preserve
  canonical ID relationships during sync
- Add canonical ID field keys to CKModels for reading from CloudKit records
- Bundle canonical JSON files (stadiums_canonical, teams_canonical,
  games_canonical, stadium_aliases) for consistent bootstrap data
- Update BootstrapService to prefer canonical format files over legacy format

This ensures all entities use consistent deterministic UUIDs derived from
their canonical IDs, preventing duplicate records when syncing CloudKit
data with bootstrapped local data.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-09 10:30:09 -06:00
parent 1ee47df53e
commit 7efcea7bd4
31 changed files with 128868 additions and 282 deletions

View File

@@ -0,0 +1,462 @@
#!/usr/bin/env python3
"""
Game Canonicalization for SportsTime
====================================
Stage 3 of the canonicalization pipeline.
Resolves team and stadium references in games, generates canonical game IDs.
Usage:
python canonicalize_games.py --games data/games.json --teams data/teams_canonical.json \
--aliases data/stadium_aliases.json --output data/
"""
import argparse
import json
from collections import defaultdict
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class CanonicalGame:
    """One game after canonicalization.

    Teams and the stadium are referenced by canonical ID rather than by raw
    abbreviation or venue name. Field order below is also the key order that
    ``dataclasses.asdict`` produces for an instance.
    """
    canonical_id: str  # stable game identifier
    sport: str
    season: str
    date: str  # YYYY-MM-DD
    time: Optional[str]  # required field, but may be None
    home_team_canonical_id: str
    away_team_canonical_id: str
    stadium_canonical_id: str
    is_playoff: bool = False
    broadcast: Optional[str] = None
@dataclass
class ResolutionWarning:
    """A problem hit while resolving a game's team or stadium references."""
    game_key: str  # identifies the affected game
    issue: str     # short category label
    details: str   # human-readable explanation
# =============================================================================
# TEAM ABBREVIATION ALIASES
# Maps alternative abbreviations to canonical team IDs
# =============================================================================
# Alternative abbreviations grouped per sport:
# {sport: {alternative_abbrev: canonical_team_id}}
_TEAM_ALIASES_BY_SPORT = {
    'NBA': {
        'PHX': 'team_nba_pho',   # Phoenix
        'BKN': 'team_nba_brk',   # Brooklyn
        'CHA': 'team_nba_cho',   # Charlotte (older abbrev)
        'NOP': 'team_nba_nop',   # New Orleans
        'NO': 'team_nba_nop',    # New Orleans alt
        'NY': 'team_nba_nyk',    # New York
        'SA': 'team_nba_sas',    # San Antonio
        'GS': 'team_nba_gsw',    # Golden State
        'UTAH': 'team_nba_uta',  # Utah
    },
    'MLB': {
        'AZ': 'team_mlb_ari',    # Arizona
        'CWS': 'team_mlb_chw',   # Chicago White Sox
        'KC': 'team_mlb_kcr',    # Kansas City
        'SD': 'team_mlb_sdp',    # San Diego
        'SF': 'team_mlb_sfg',    # San Francisco
        'TB': 'team_mlb_tbr',    # Tampa Bay
        'WSH': 'team_mlb_wsn',   # Washington
        'WAS': 'team_mlb_wsn',   # Washington alt
        'LA': 'team_mlb_lad',    # Los Angeles Dodgers
        'ATH': 'team_mlb_oak',   # Oakland Athletics
    },
    'NHL': {
        'ARI': 'team_nhl_ari',   # Arizona/Utah
        'UTA': 'team_nhl_ari',   # Utah Hockey Club (uses ARI code)
        'VGS': 'team_nhl_vgk',   # Vegas
        'TB': 'team_nhl_tbl',    # Tampa Bay Lightning
        'NJ': 'team_nhl_njd',    # New Jersey
        'SJ': 'team_nhl_sjs',    # San Jose
        'LA': 'team_nhl_lak',    # Los Angeles Kings
        'MON': 'team_nhl_mtl',   # Montreal
    },
}

# Flattened view keyed by (sport, abbrev) -> canonical team ID.
TEAM_ABBREV_ALIASES = {
    (sport, abbrev): team_id
    for sport, sport_aliases in _TEAM_ALIASES_BY_SPORT.items()
    for abbrev, team_id in sport_aliases.items()
}
# =============================================================================
# ID GENERATION
# =============================================================================
def normalize_season(sport: str, season: str) -> str:
    """
    Strip hyphens from a season label so it can be embedded in an ID.

    Examples:
        "2025-26" -> "202526"   (NBA/NHL style)
        "2026"    -> "2026"     (MLB style)

    ``sport`` is accepted for interface symmetry but is not consulted; the
    same rule is applied to every sport.
    """
    return ''.join(season.split('-'))
def generate_canonical_game_id(
    sport: str,
    season: str,
    date: str,  # YYYY-MM-DD
    away_abbrev: str,
    home_abbrev: str,
    sequence: int = 1
) -> str:
    """
    Build the deterministic canonical ID for a game.

    Layout: game_{sport}_{season}_{date}_{away}_{home}[_{sequence}]
    where sport and both abbreviations are lower-cased, and hyphens are
    removed from the season and date.

    Examples:
        game_nba_202526_20251021_hou_okc
        game_mlb_2026_20260615_bos_nyy_2   (doubleheader game 2)

    The sequence suffix is only appended when ``sequence`` is greater than 1.
    """
    segments = [
        'game',
        sport.lower(),
        normalize_season(sport, season),
        date.replace('-', ''),  # YYYYMMDD
        away_abbrev.lower(),
        home_abbrev.lower(),
    ]
    if sequence > 1:
        segments.append(str(sequence))
    return '_'.join(segments)
# =============================================================================
# RESOLUTION
# =============================================================================
def build_alias_lookup(stadium_aliases: list[dict]) -> dict[str, str]:
    """
    Index stadium alias records by normalized alias name.

    Each record's 'alias_name' is lower-cased and stripped; records whose
    alias name or 'stadium_canonical_id' is missing/empty are dropped. When
    two records normalize to the same alias, the later one wins.

    Returns: {alias_name_lower: canonical_stadium_id}
    """
    normalized_pairs = (
        (
            entry.get('alias_name', '').lower().strip(),
            entry.get('stadium_canonical_id', ''),
        )
        for entry in stadium_aliases
    )
    return {
        name: stadium_id
        for name, stadium_id in normalized_pairs
        if name and stadium_id
    }
def resolve_team(
    abbrev: str,
    sport: str,
    teams_by_abbrev: dict[tuple[str, str], dict],
    teams_by_id: dict[str, dict]
) -> Optional[dict]:
    """
    Look up the canonical team for an abbreviation.

    Resolution order:
    1. Exact (sport, ABBREV) entry in ``teams_by_abbrev``
    2. (sport, ABBREV) entry in TEAM_ABBREV_ALIASES whose target ID exists
       in ``teams_by_id``
    3. Otherwise None

    The abbreviation is upper-cased before lookup; ``sport`` is used as given.
    """
    lookup_key = (sport, abbrev.upper())

    # 1. Direct hit on the team's own abbreviation.
    if lookup_key in teams_by_abbrev:
        return teams_by_abbrev[lookup_key]

    # 2. Known alternative abbreviation -> canonical ID -> team.
    alias_target = TEAM_ABBREV_ALIASES.get(lookup_key)
    if alias_target is None:
        return None
    return teams_by_id.get(alias_target)
def resolve_stadium_from_venue(
    venue: str,
    home_team: dict,
    sport: str,
    alias_lookup: dict[str, str],
    stadiums_by_id: dict[str, dict]
) -> str:
    """
    Resolve stadium canonical ID from venue name.

    Strategy (first hit wins):
    1. Home team's 'stadium_canonical_id', whenever it is non-empty
    2. Exact alias match on the normalized venue name, accepted only if the
       matched ID starts with "stadium_{sport}_"
    3. Partial alias match (alias contained in venue or venue contained in
       alias, aliases of 3 characters or fewer ignored), same sport check
    4. Placeholder "stadium_unknown_{slug}" built from the venue name

    For multi-sport venues (MSG, Crypto.com Arena, etc.), home team's
    stadium_canonical_id is authoritative because it's already sport-scoped.

    An empty or None venue skips strategies 2 and 3: the empty string is a
    substring of every alias, so a containment test would otherwise "match"
    the first alias of the right sport.

    Args:
        venue: Venue name from game data (None is treated as '')
        home_team: Resolved home team dict (may be falsy)
        sport: Sport code (NBA, MLB, NHL)
        alias_lookup: {alias_name_lower: canonical_stadium_id}
        stadiums_by_id: {canonical_id: stadium_dict}; currently not consulted

    Returns:
        canonical_stadium_id, or a "stadium_unknown_..." placeholder
    """
    # Strategy 1: Home team's stadium is most reliable (sport-scoped)
    if home_team:
        team_stadium = home_team.get('stadium_canonical_id', '')
        if team_stadium:
            return team_stadium

    venue_lower = (venue or '').lower().strip()
    sport_prefix = f"stadium_{sport.lower()}_"

    if venue_lower:
        # Strategy 2: exact alias, only if it belongs to the correct sport
        if venue_lower in alias_lookup:
            matched_id = alias_lookup[venue_lower]
            if matched_id.startswith(sport_prefix):
                return matched_id

        # Strategy 3: partial match with sport check
        for alias, canonical_id in alias_lookup.items():
            if len(alias) > 3 and (alias in venue_lower or venue_lower in alias):
                if canonical_id.startswith(sport_prefix):
                    return canonical_id

    # Strategy 4: unknown stadium placeholder
    slug = venue_lower[:30].replace(' ', '_').replace('.', '')
    return f"stadium_unknown_{slug}"
# =============================================================================
# CANONICALIZATION
# =============================================================================
def canonicalize_games(
    raw_games: list[dict],
    canonical_teams: list[dict],
    stadium_aliases: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalGame], list[ResolutionWarning]]:
    """
    Stage 3: Canonicalize games.

    For each raw game:
    1. Resolve team abbreviations to canonical teams; a game whose home or
       away team cannot be resolved is skipped and a warning is recorded
    2. Resolve the venue to a stadium canonical ID
    3. Generate the canonical game ID; repeated (date, away, home) matchups
       get an increasing sequence number in input order (doubleheaders)

    Args:
        raw_games: List of raw game dicts. Keys read: sport, season, date,
            home_team_abbrev, away_team_abbrev, venue, time, is_playoff,
            broadcast. Missing or null text fields are treated as ''.
        canonical_teams: List of canonical team dicts. Each must provide
            'abbreviation', 'sport' and 'canonical_id'.
        stadium_aliases: List of stadium alias dicts
        verbose: Print detailed progress

    Returns:
        (canonical_games, warnings)
    """
    games = []
    warnings = []

    # Build lookups
    teams_by_abbrev = {}  # (SPORT, ABBREV) -> team dict
    teams_by_id = {}      # canonical_id -> team dict
    for team in canonical_teams:
        # Upper-case both key parts so they line up with the upper-cased
        # sport/abbreviation taken from each raw game below.
        team_key = (team['sport'].upper(), team['abbreviation'].upper())
        teams_by_abbrev[team_key] = team
        teams_by_id[team['canonical_id']] = team

    alias_lookup = build_alias_lookup(stadium_aliases)
    stadiums_by_id = {}  # Would be populated from stadiums_canonical.json if needed

    # Track games for doubleheader detection
    game_counts = defaultdict(int)  # (date, away_id, home_id) -> count

    resolved_count = 0
    unresolved_teams = 0
    unresolved_stadiums = 0

    for raw in raw_games:
        # `or ''` also covers keys that are present but null in the JSON.
        sport = (raw.get('sport') or '').upper()
        season = raw.get('season') or ''
        date = raw.get('date') or ''
        home_abbrev = (raw.get('home_team_abbrev') or '').upper()
        away_abbrev = (raw.get('away_team_abbrev') or '').upper()
        venue = raw.get('venue') or ''

        game_key = f"{date}_{away_abbrev}_{home_abbrev}"

        # Resolve teams
        home_team = resolve_team(home_abbrev, sport, teams_by_abbrev, teams_by_id)
        away_team = resolve_team(away_abbrev, sport, teams_by_abbrev, teams_by_id)

        if not home_team:
            warnings.append(ResolutionWarning(
                game_key=game_key,
                issue='Unknown home team',
                details=f"Could not resolve home team '{home_abbrev}' for sport {sport}"
            ))
            unresolved_teams += 1
            if verbose:
                print(f" WARNING: {game_key} - unknown home team {home_abbrev}")
            continue

        if not away_team:
            warnings.append(ResolutionWarning(
                game_key=game_key,
                issue='Unknown away team',
                details=f"Could not resolve away team '{away_abbrev}' for sport {sport}"
            ))
            unresolved_teams += 1
            if verbose:
                print(f" WARNING: {game_key} - unknown away team {away_abbrev}")
            continue

        # Resolve stadium
        stadium_canonical_id = resolve_stadium_from_venue(
            venue, home_team, sport, alias_lookup, stadiums_by_id
        )

        if stadium_canonical_id.startswith('stadium_unknown'):
            warnings.append(ResolutionWarning(
                game_key=game_key,
                issue='Unknown stadium',
                details=f"Could not resolve venue '{venue}', using home team stadium"
            ))
            unresolved_stadiums += 1
            # Fall back to home team stadium, but only when it is non-empty.
            # dict.get(key, default) returns the stored value when the key
            # exists, so an empty/None 'stadium_canonical_id' would replace
            # the placeholder with '' or None; `or` keeps the placeholder.
            stadium_canonical_id = (
                home_team.get('stadium_canonical_id') or stadium_canonical_id
            )

        # Handle doubleheaders: the Nth game of the same matchup on the same
        # date gets sequence N (in input order).
        matchup_key = (date, away_team['canonical_id'], home_team['canonical_id'])
        game_counts[matchup_key] += 1
        sequence = game_counts[matchup_key]

        # Generate canonical ID from the canonical teams' own abbreviations,
        # not the raw (possibly aliased) ones.
        canonical_id = generate_canonical_game_id(
            sport, season, date,
            away_team['abbreviation'], home_team['abbreviation'],
            sequence
        )

        game = CanonicalGame(
            canonical_id=canonical_id,
            sport=sport,
            season=season,
            date=date,
            time=raw.get('time'),
            home_team_canonical_id=home_team['canonical_id'],
            away_team_canonical_id=away_team['canonical_id'],
            stadium_canonical_id=stadium_canonical_id,
            is_playoff=raw.get('is_playoff', False),
            broadcast=raw.get('broadcast')
        )
        games.append(game)
        resolved_count += 1

    if verbose:
        print(f"\n Resolved: {resolved_count} games")
        print(f" Unresolved teams: {unresolved_teams}")
        print(f" Unknown stadiums (used home team): {unresolved_stadiums}")

    return games, warnings
# =============================================================================
# MAIN
# =============================================================================
def main():
    """
    Command-line entry point.

    Loads the raw games, canonical teams and stadium aliases JSON files,
    runs canonicalize_games(), then writes games_canonical.json (always) and
    game_resolution_warnings.json (only when there are warnings) into the
    output directory, printing a summary along the way.
    """
    parser = argparse.ArgumentParser(
        description='Canonicalize game data'
    )
    parser.add_argument(
        '--games', type=str, default='./data/games.json',
        help='Input raw games JSON file'
    )
    parser.add_argument(
        '--teams', type=str, default='./data/teams_canonical.json',
        help='Input canonical teams JSON file'
    )
    parser.add_argument(
        '--aliases', type=str, default='./data/stadium_aliases.json',
        help='Input stadium aliases JSON file'
    )
    parser.add_argument(
        '--output', type=str, default='./data',
        help='Output directory for canonical files'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )
    args = parser.parse_args()

    games_path = Path(args.games)
    teams_path = Path(args.teams)
    aliases_path = Path(args.aliases)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load input files. The encoding is explicit so that parsing does not
    # depend on the platform's locale default.
    print(f"Loading raw games from {games_path}...")
    with open(games_path, encoding='utf-8') as f:
        raw_games = json.load(f)
    print(f" Loaded {len(raw_games)} raw games")

    print(f"Loading canonical teams from {teams_path}...")
    with open(teams_path, encoding='utf-8') as f:
        canonical_teams = json.load(f)
    print(f" Loaded {len(canonical_teams)} canonical teams")

    print(f"Loading stadium aliases from {aliases_path}...")
    with open(aliases_path, encoding='utf-8') as f:
        stadium_aliases = json.load(f)
    print(f" Loaded {len(stadium_aliases)} stadium aliases")

    # Canonicalize games
    print("\nCanonicalizing games...")
    canonical_games, warnings = canonicalize_games(
        raw_games, canonical_teams, stadium_aliases, verbose=args.verbose
    )
    print(f" Created {len(canonical_games)} canonical games")

    if warnings:
        print(f"\n Warnings: {len(warnings)}")
        # Group by issue type
        by_issue = defaultdict(list)
        for w in warnings:
            by_issue[w.issue].append(w)
        for issue, issue_warnings in by_issue.items():
            print(f" - {issue}: {len(issue_warnings)}")

    # Export (separate names so the input path is not rebound)
    games_out_path = output_dir / 'games_canonical.json'
    warnings_path = output_dir / 'game_resolution_warnings.json'

    with open(games_out_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(g) for g in canonical_games], f, indent=2)
    print(f"\nExported games to {games_out_path}")

    if warnings:
        with open(warnings_path, 'w', encoding='utf-8') as f:
            json.dump([asdict(w) for w in warnings], f, indent=2)
        print(f"Exported warnings to {warnings_path}")

    # Summary by sport
    print("\nSummary by sport:")
    by_sport = defaultdict(int)
    for g in canonical_games:
        by_sport[g.sport] += 1
    for sport, count in sorted(by_sport.items()):
        print(f" {sport}: {count} games")

    # Check for doubleheaders. A sequence suffix is the only case where the
    # last '_'-separated segment of the ID is numeric (otherwise it is the
    # home team abbreviation). A plain substring test for '_2' would match
    # the season/date segments ("_2025...") of every ID.
    # NOTE(review): assumes team abbreviations are never purely numeric.
    doubleheaders = sum(
        1 for g in canonical_games
        if g.canonical_id.rsplit('_', 1)[-1].isdigit()
    )
    if doubleheaders:
        print(f"\n Doubleheader games detected: {doubleheaders}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()