Add canonical ID pipeline and fix UUID consistency for CloudKit sync
- Add local canonicalization pipeline (stadiums, teams, games) that generates deterministic canonical IDs before CloudKit upload
- Fix CanonicalSyncService to use deterministic UUIDs from canonical IDs instead of random UUIDs from CloudKit records
- Add SyncStadium/SyncTeam/SyncGame types to CloudKitService that preserve canonical ID relationships during sync
- Add canonical ID field keys to CKModels for reading from CloudKit records
- Bundle canonical JSON files (stadiums_canonical, teams_canonical, games_canonical, stadium_aliases) for consistent bootstrap data
- Update BootstrapService to prefer canonical format files over legacy format

This ensures all entities use consistent deterministic UUIDs derived from their canonical IDs, preventing duplicate records when syncing CloudKit data with bootstrapped local data.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
462
Scripts/canonicalize_games.py
Normal file
462
Scripts/canonicalize_games.py
Normal file
@@ -0,0 +1,462 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Game Canonicalization for SportsTime
|
||||
====================================
|
||||
Stage 3 of the canonicalization pipeline.
|
||||
|
||||
Resolves team and stadium references in games, generates canonical game IDs.
|
||||
|
||||
Usage:
|
||||
python canonicalize_games.py --games data/games.json --teams data/teams_canonical.json \
|
||||
--aliases data/stadium_aliases.json --output data/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, asdict
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DATA CLASSES
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class CanonicalGame:
    """
    One game after canonicalization.

    Holds the generated canonical game ID together with the canonical IDs of
    the home team, away team and stadium that the raw record resolved to.
    """

    # Identity
    canonical_id: str
    sport: str
    season: str

    # Schedule
    date: str  # YYYY-MM-DD
    time: Optional[str]

    # Resolved references
    home_team_canonical_id: str
    away_team_canonical_id: str
    stadium_canonical_id: str

    # Optional metadata (defaults apply when the raw record omits them)
    is_playoff: bool = False
    broadcast: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class ResolutionWarning:
    """
    Record of a problem found while resolving a raw game.
    """

    game_key: str  # identifies the raw game the warning refers to
    issue: str     # short category label for the problem
    details: str   # human-readable explanation
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# TEAM ABBREVIATION ALIASES
|
||||
# Maps alternative abbreviations to canonical team IDs
|
||||
# =============================================================================
|
||||
|
||||
# Keyed by (SPORT, ALTERNATIVE_ABBREVIATION); the value is the canonical team
# ID that the alternative abbreviation stands for.
TEAM_ABBREV_ALIASES = {
    # --- NBA ---------------------------------------------------------------
    ('NBA', 'PHX'): 'team_nba_pho',   # Phoenix
    ('NBA', 'BKN'): 'team_nba_brk',   # Brooklyn
    ('NBA', 'CHA'): 'team_nba_cho',   # Charlotte (older abbrev)
    ('NBA', 'NOP'): 'team_nba_nop',   # New Orleans
    ('NBA', 'NO'): 'team_nba_nop',    # New Orleans alt
    ('NBA', 'NY'): 'team_nba_nyk',    # New York
    ('NBA', 'SA'): 'team_nba_sas',    # San Antonio
    ('NBA', 'GS'): 'team_nba_gsw',    # Golden State
    ('NBA', 'UTAH'): 'team_nba_uta',  # Utah

    # --- MLB ---------------------------------------------------------------
    ('MLB', 'AZ'): 'team_mlb_ari',    # Arizona
    ('MLB', 'CWS'): 'team_mlb_chw',   # Chicago White Sox
    ('MLB', 'KC'): 'team_mlb_kcr',    # Kansas City
    ('MLB', 'SD'): 'team_mlb_sdp',    # San Diego
    ('MLB', 'SF'): 'team_mlb_sfg',    # San Francisco
    ('MLB', 'TB'): 'team_mlb_tbr',    # Tampa Bay
    ('MLB', 'WSH'): 'team_mlb_wsn',   # Washington
    ('MLB', 'WAS'): 'team_mlb_wsn',   # Washington alt
    ('MLB', 'LA'): 'team_mlb_lad',    # Los Angeles Dodgers
    ('MLB', 'ATH'): 'team_mlb_oak',   # Oakland Athletics

    # --- NHL ---------------------------------------------------------------
    ('NHL', 'ARI'): 'team_nhl_ari',   # Arizona/Utah
    ('NHL', 'UTA'): 'team_nhl_ari',   # Utah Hockey Club (uses ARI code)
    ('NHL', 'VGS'): 'team_nhl_vgk',   # Vegas
    ('NHL', 'TB'): 'team_nhl_tbl',    # Tampa Bay Lightning
    ('NHL', 'NJ'): 'team_nhl_njd',    # New Jersey
    ('NHL', 'SJ'): 'team_nhl_sjs',    # San Jose
    ('NHL', 'LA'): 'team_nhl_lak',    # Los Angeles Kings
    ('NHL', 'MON'): 'team_nhl_mtl',   # Montreal
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# ID GENERATION
|
||||
# =============================================================================
|
||||
|
||||
def normalize_season(sport: str, season: str) -> str:
    """
    Return the season label with every hyphen removed, for use in IDs.

    Examples:
        "2025-26" -> "202526"
        "2026"    -> "2026"

    Args:
        sport: Sport code. Accepted for interface symmetry; not consulted.
        season: Season label as found in the raw data.

    Returns:
        The season label without '-' characters.
    """
    # Splitting on '-' and re-joining drops the hyphens and nothing else.
    return ''.join(season.split('-'))
|
||||
|
||||
|
||||
def generate_canonical_game_id(
    sport: str,
    season: str,
    date: str,  # YYYY-MM-DD
    away_abbrev: str,
    home_abbrev: str,
    sequence: int = 1
) -> str:
    """
    Build the deterministic canonical ID of a game.

    Layout: game_{sport}_{season}_{date}_{away}_{home}[_{sequence}]

    Examples:
        game_nba_202526_20251021_hou_okc
        game_mlb_2026_20260615_bos_nyy_2   (second game of a doubleheader)

    Args:
        sport: Sport code; lower-cased in the ID.
        season: Season label; hyphens are removed via normalize_season().
        date: Game date as YYYY-MM-DD; hyphens are removed.
        away_abbrev: Away team abbreviation; lower-cased in the ID.
        home_abbrev: Home team abbreviation; lower-cased in the ID.
        sequence: 1-based index among same-day games of the same matchup.
            The suffix is only appended when it is greater than 1.

    Returns:
        The canonical game ID string.
    """
    season_part = normalize_season(sport, season)
    date_part = date.replace('-', '')  # YYYYMMDD

    segments = [
        'game',
        sport.lower(),
        season_part,
        date_part,
        away_abbrev.lower(),
        home_abbrev.lower(),
    ]

    # The first game of a matchup keeps the bare ID; later ones get a suffix.
    if sequence > 1:
        segments.append(f"{sequence}")

    return '_'.join(segments)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# RESOLUTION
|
||||
# =============================================================================
|
||||
|
||||
def build_alias_lookup(stadium_aliases: list[dict]) -> dict[str, str]:
    """
    Index stadium aliases by their normalized name.

    Each entry contributes its 'alias_name' (lower-cased, then stripped of
    surrounding whitespace) mapped to its 'stadium_canonical_id'. Entries
    where either value is missing or empty are skipped. When two entries
    normalize to the same name, the later one wins.

    Args:
        stadium_aliases: Alias dicts carrying 'alias_name' and
            'stadium_canonical_id' keys.

    Returns:
        {alias_name_lower: canonical_stadium_id}
    """
    normalized_pairs = (
        (
            entry.get('alias_name', '').lower().strip(),
            entry.get('stadium_canonical_id', ''),
        )
        for entry in stadium_aliases
    )
    return {
        name: stadium_id
        for name, stadium_id in normalized_pairs
        if name and stadium_id
    }
|
||||
|
||||
|
||||
def resolve_team(
    abbrev: str,
    sport: str,
    teams_by_abbrev: dict[tuple[str, str], dict],
    teams_by_id: dict[str, dict]
) -> Optional[dict]:
    """
    Find the canonical team for an abbreviation within a sport.

    Lookup order:
      1. Exact (sport, ABBREV) hit in teams_by_abbrev.
      2. (sport, ABBREV) hit in TEAM_ABBREV_ALIASES whose canonical ID is
         present in teams_by_id.

    Args:
        abbrev: Team abbreviation; upper-cased before lookup.
        sport: Sport code, used as given.
        teams_by_abbrev: {(sport, abbrev): team dict}
        teams_by_id: {canonical_id: team dict}

    Returns:
        The matching team dict, or None when neither lookup succeeds.
    """
    lookup_key = (sport, abbrev.upper())

    # 1. Direct hit on the team's own abbreviation.
    if lookup_key in teams_by_abbrev:
        return teams_by_abbrev[lookup_key]

    # 2. Known alternative abbreviation.
    if lookup_key not in TEAM_ABBREV_ALIASES:
        return None

    aliased_id = TEAM_ABBREV_ALIASES[lookup_key]
    return teams_by_id[aliased_id] if aliased_id in teams_by_id else None
|
||||
|
||||
|
||||
def resolve_stadium_from_venue(
    venue: str,
    home_team: dict,
    sport: str,
    alias_lookup: dict[str, str],
    stadiums_by_id: dict[str, dict]
) -> str:
    """
    Resolve stadium canonical ID from venue name.

    Strategy:
    1. ALWAYS prefer home team's stadium (most reliable, sport-correct)
    2. Exact alias match on the normalized venue name, accepted only when the
       matched ID belongs to the same sport
    3. Partial alias match (alias contained in venue or venue contained in
       alias, aliases longer than 3 characters only), same sport only
    4. Fall back to an "unknown stadium" placeholder built from the venue name

    For multi-sport venues (MSG, Crypto.com Arena, etc.), home team's
    stadium_canonical_id is authoritative because it's already sport-scoped.

    Args:
        venue: Venue name from game data. None or blank is treated as "no
            venue" and skips alias matching entirely.
        home_team: Resolved home team dict (may be empty or None)
        sport: Sport code (NBA, MLB, NHL)
        alias_lookup: {alias_name_lower: canonical_stadium_id}
        stadiums_by_id: {canonical_id: stadium_dict}; accepted for interface
            compatibility, not consulted

    Returns:
        canonical_stadium_id, or "stadium_unknown_{slug}" when nothing matched
    """
    # Strategy 1: Home team's stadium is most reliable (sport-scoped)
    if home_team:
        team_stadium = home_team.get('stadium_canonical_id', '')
        if team_stadium:
            return team_stadium

    venue_lower = (venue or '').lower().strip()
    sport_prefix = f"stadium_{sport.lower()}_"

    # An empty venue is a substring of every alias, so alias matching must be
    # skipped for it; otherwise the partial match below would return whichever
    # same-sport alias happens to come first.
    if venue_lower:
        # Strategy 2: Exact alias match, only if it is for the correct sport
        exact_match = alias_lookup.get(venue_lower, '')
        if exact_match.startswith(sport_prefix):
            return exact_match

        # Strategy 3: Partial match with sport check
        for alias, canonical_id in alias_lookup.items():
            if len(alias) <= 3:
                continue
            if alias not in venue_lower and venue_lower not in alias:
                continue
            if canonical_id.startswith(sport_prefix):
                return canonical_id

    # Strategy 4: Unknown stadium placeholder
    slug = venue_lower[:30].replace(' ', '_').replace('.', '')
    return f"stadium_unknown_{slug}"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CANONICALIZATION
|
||||
# =============================================================================
|
||||
|
||||
def canonicalize_games(
    raw_games: list[dict],
    canonical_teams: list[dict],
    stadium_aliases: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalGame], list[ResolutionWarning]]:
    """
    Stage 3: Canonicalize games.

    1. Resolve team abbreviations to canonical IDs
    2. Resolve venues to stadium canonical IDs
    3. Generate canonical game IDs (handling doubleheaders)

    Games whose home or away team cannot be resolved are skipped and reported
    as warnings. Games whose stadium resolves to a "stadium_unknown..."
    placeholder are kept with that placeholder and also reported.

    Repeated (date, away team, home team) matchups are numbered in input
    order; the second and later occurrences get a sequence suffix in their ID.

    Args:
        raw_games: List of raw game dicts (keys read: sport, season, date,
            home_team_abbrev, away_team_abbrev, venue, time, is_playoff,
            broadcast)
        canonical_teams: List of canonical team dicts (keys read: sport,
            abbreviation, canonical_id, stadium_canonical_id)
        stadium_aliases: List of stadium alias dicts
        verbose: Print detailed progress

    Returns:
        (canonical_games, warnings)
    """
    games = []
    warnings = []

    # Build lookups
    teams_by_abbrev = {}  # (sport, abbrev) -> team dict
    teams_by_id = {}      # canonical_id -> team dict

    for team in canonical_teams:
        # Game sport codes are upper-cased before lookup (see the loop below),
        # so the team keys are normalized the same way.
        team_key = (team['sport'].upper(), team['abbreviation'].upper())
        teams_by_abbrev[team_key] = team
        teams_by_id[team['canonical_id']] = team

    alias_lookup = build_alias_lookup(stadium_aliases)
    stadiums_by_id = {}  # Would be populated from stadiums_canonical.json if needed

    # Track games for doubleheader detection
    game_counts = defaultdict(int)  # (date, away_id, home_id) -> count

    resolved_count = 0
    unresolved_teams = 0
    unresolved_stadiums = 0

    for raw in raw_games:
        # `or ''` also covers keys that are present but null in the JSON.
        sport = (raw.get('sport') or '').upper()
        season = raw.get('season') or ''
        date = raw.get('date') or ''
        home_abbrev = (raw.get('home_team_abbrev') or '').upper()
        away_abbrev = (raw.get('away_team_abbrev') or '').upper()
        venue = raw.get('venue') or ''

        game_key = f"{date}_{away_abbrev}_{home_abbrev}"

        # Resolve teams
        home_team = resolve_team(home_abbrev, sport, teams_by_abbrev, teams_by_id)
        away_team = resolve_team(away_abbrev, sport, teams_by_abbrev, teams_by_id)

        # Report the first unresolved side (home is checked before away) and
        # skip the game: without both teams no canonical ID can be built.
        unresolved_side = next(
            (
                (side, side_abbrev)
                for side, side_abbrev, side_team in (
                    ('home', home_abbrev, home_team),
                    ('away', away_abbrev, away_team),
                )
                if not side_team
            ),
            None,
        )
        if unresolved_side is not None:
            side, side_abbrev = unresolved_side
            warnings.append(ResolutionWarning(
                game_key=game_key,
                issue=f'Unknown {side} team',
                details=f"Could not resolve {side} team '{side_abbrev}' for sport {sport}"
            ))
            unresolved_teams += 1
            if verbose:
                print(f" WARNING: {game_key} - unknown {side} team {side_abbrev}")
            continue

        # Resolve stadium
        stadium_canonical_id = resolve_stadium_from_venue(
            venue, home_team, sport, alias_lookup, stadiums_by_id
        )

        if stadium_canonical_id.startswith('stadium_unknown'):
            # The resolver already prefers the home team's stadium, so a
            # placeholder here means there is nothing better to fall back to.
            # Keep the placeholder rather than overwriting it with the home
            # team's (empty) stadium value.
            warnings.append(ResolutionWarning(
                game_key=game_key,
                issue='Unknown stadium',
                details=f"Could not resolve venue '{venue}', using placeholder '{stadium_canonical_id}'"
            ))
            unresolved_stadiums += 1

        # Handle doubleheaders
        matchup_key = (date, away_team['canonical_id'], home_team['canonical_id'])
        game_counts[matchup_key] += 1
        sequence = game_counts[matchup_key]

        # Generate canonical ID from the resolved teams' own abbreviations,
        # so aliased raw abbreviations produce the same ID.
        canonical_id = generate_canonical_game_id(
            sport, season, date,
            away_team['abbreviation'], home_team['abbreviation'],
            sequence
        )

        games.append(CanonicalGame(
            canonical_id=canonical_id,
            sport=sport,
            season=season,
            date=date,
            time=raw.get('time'),
            home_team_canonical_id=home_team['canonical_id'],
            away_team_canonical_id=away_team['canonical_id'],
            stadium_canonical_id=stadium_canonical_id,
            is_playoff=raw.get('is_playoff', False),
            broadcast=raw.get('broadcast')
        ))
        resolved_count += 1

    if verbose:
        print(f"\n Resolved: {resolved_count} games")
        print(f" Unresolved teams: {unresolved_teams}")
        print(f" Unknown stadiums (placeholder ID kept): {unresolved_stadiums}")

    return games, warnings
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MAIN
|
||||
# =============================================================================
|
||||
|
||||
def main():
    """
    Command-line entry point.

    Reads the raw games, canonical teams and stadium aliases JSON files,
    canonicalizes the games, and writes 'games_canonical.json' (plus
    'game_resolution_warnings.json' when there are warnings) into the output
    directory, printing a progress summary along the way.
    """
    parser = argparse.ArgumentParser(
        description='Canonicalize game data'
    )
    parser.add_argument(
        '--games', type=str, default='./data/games.json',
        help='Input raw games JSON file'
    )
    parser.add_argument(
        '--teams', type=str, default='./data/teams_canonical.json',
        help='Input canonical teams JSON file'
    )
    parser.add_argument(
        '--aliases', type=str, default='./data/stadium_aliases.json',
        help='Input stadium aliases JSON file'
    )
    parser.add_argument(
        '--output', type=str, default='./data',
        help='Output directory for canonical files'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )

    args = parser.parse_args()
    games_path = Path(args.games)
    teams_path = Path(args.teams)
    aliases_path = Path(args.aliases)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load input files. JSON is read as UTF-8 explicitly so the result does
    # not depend on the platform's default encoding.
    print(f"Loading raw games from {games_path}...")
    with open(games_path, encoding='utf-8') as f:
        raw_games = json.load(f)
    print(f" Loaded {len(raw_games)} raw games")

    print(f"Loading canonical teams from {teams_path}...")
    with open(teams_path, encoding='utf-8') as f:
        canonical_teams = json.load(f)
    print(f" Loaded {len(canonical_teams)} canonical teams")

    print(f"Loading stadium aliases from {aliases_path}...")
    with open(aliases_path, encoding='utf-8') as f:
        stadium_aliases = json.load(f)
    print(f" Loaded {len(stadium_aliases)} stadium aliases")

    # Canonicalize games
    print("\nCanonicalizing games...")
    canonical_games, warnings = canonicalize_games(
        raw_games, canonical_teams, stadium_aliases, verbose=args.verbose
    )
    print(f" Created {len(canonical_games)} canonical games")

    if warnings:
        print(f"\n Warnings: {len(warnings)}")
        # Group by issue type
        by_issue = defaultdict(list)
        for w in warnings:
            by_issue[w.issue].append(w)
        for issue, issue_warnings in by_issue.items():
            print(f" - {issue}: {len(issue_warnings)}")

    # Export (distinct names so the input path above is not shadowed)
    output_games_path = output_dir / 'games_canonical.json'
    output_warnings_path = output_dir / 'game_resolution_warnings.json'

    with open(output_games_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(g) for g in canonical_games], f, indent=2)
    print(f"\nExported games to {output_games_path}")

    if warnings:
        with open(output_warnings_path, 'w', encoding='utf-8') as f:
            json.dump([asdict(w) for w in warnings], f, indent=2)
        print(f"Exported warnings to {output_warnings_path}")

    # Summary by sport
    print("\nSummary by sport:")
    by_sport = defaultdict(int)
    for g in canonical_games:
        by_sport[g.sport] += 1
    for sport, count in sorted(by_sport.items()):
        print(f" {sport}: {count} games")

    # Check for doubleheaders. Only the second and later games of a matchup
    # carry a numeric sequence as the LAST '_'-separated segment of their ID.
    # A plain substring test for '_2' would also hit the season and date
    # segments (e.g. '_202526_2025...') and flag every game.
    doubleheaders = sum(
        1 for g in canonical_games
        if g.canonical_id.rsplit('_', 1)[-1].isdigit()
    )
    if doubleheaders:
        print(f"\n Doubleheader games detected: {doubleheaders}")
|
||||
|
||||
|
||||
# Entry-point guard: run main() only when this file is executed as a script.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user