Add canonical ID pipeline and fix UUID consistency for CloudKit sync

- Add local canonicalization pipeline (stadiums, teams, games) that generates
  deterministic canonical IDs before CloudKit upload
- Fix CanonicalSyncService to use deterministic UUIDs from canonical IDs
  instead of random UUIDs from CloudKit records
- Add SyncStadium/SyncTeam/SyncGame types to CloudKitService that preserve
  canonical ID relationships during sync
- Add canonical ID field keys to CKModels for reading from CloudKit records
- Bundle canonical JSON files (stadiums_canonical, teams_canonical,
  games_canonical, stadium_aliases) for consistent bootstrap data
- Update BootstrapService to prefer canonical format files over legacy format

This ensures all entities use consistent deterministic UUIDs derived from
their canonical IDs, preventing duplicate records when syncing CloudKit
data with bootstrapped local data.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-09 10:30:09 -06:00
parent 1ee47df53e
commit 7efcea7bd4
31 changed files with 128868 additions and 282 deletions

View File

@@ -2,7 +2,15 @@
"""
CloudKit Import Script
======================
Imports JSON data into CloudKit. Run separately from pipeline.
Imports canonical JSON data into CloudKit. Run after canonicalization pipeline.
Expected input files (from canonicalization pipeline):
- stadiums_canonical.json
- teams_canonical.json
- games_canonical.json
- stadium_aliases.json
- league_structure.json
- team_aliases.json
Setup:
1. CloudKit Dashboard > Tokens & Keys > Server-to-Server Keys
@@ -309,12 +317,35 @@ def main():
print(f"Environment: {args.env}\n")
data_dir = Path(args.data_dir)
stadiums = json.load(open(data_dir / 'stadiums.json'))
games = json.load(open(data_dir / 'games.json')) if (data_dir / 'games.json').exists() else []
# Load canonical format files (from canonicalization pipeline)
# Fall back to legacy format for backward compatibility
if (data_dir / 'stadiums_canonical.json').exists():
stadiums = json.load(open(data_dir / 'stadiums_canonical.json'))
use_canonical = True
else:
stadiums = json.load(open(data_dir / 'stadiums.json'))
use_canonical = False
if (data_dir / 'teams_canonical.json').exists():
teams = json.load(open(data_dir / 'teams_canonical.json'))
else:
teams = [] # Legacy: extracted from stadiums
if (data_dir / 'games_canonical.json').exists():
games = json.load(open(data_dir / 'games_canonical.json'))
elif (data_dir / 'games.json').exists():
games = json.load(open(data_dir / 'games.json'))
else:
games = []
league_structure = json.load(open(data_dir / 'league_structure.json')) if (data_dir / 'league_structure.json').exists() else []
team_aliases = json.load(open(data_dir / 'team_aliases.json')) if (data_dir / 'team_aliases.json').exists() else []
stadium_aliases = json.load(open(data_dir / 'stadium_aliases.json')) if (data_dir / 'stadium_aliases.json').exists() else []
print(f"Loaded {len(stadiums)} stadiums, {len(games)} games, {len(league_structure)} league structures, {len(team_aliases)} team aliases, {len(stadium_aliases)} stadium aliases\n")
print(f"Using {'canonical' if use_canonical else 'legacy'} format")
print(f"Loaded {len(stadiums)} stadiums, {len(teams)} teams, {len(games)} games")
print(f"Loaded {len(league_structure)} league structures, {len(team_aliases)} team aliases, {len(stadium_aliases)} stadium aliases\n")
ck = None
if not args.dry_run:
@@ -353,72 +384,135 @@ def main():
import_team_aliases = args.team_aliases_only or args.canonical_only or (not args.stadiums_only and not args.games_only and not args.league_structure_only and not args.stadium_aliases_only)
import_stadium_aliases = args.stadium_aliases_only or args.canonical_only or (not args.stadiums_only and not args.games_only and not args.league_structure_only and not args.team_aliases_only)
# Build stadium UUID lookup (stadium string ID -> UUID)
stadium_uuid_map = {s['id']: deterministic_uuid(s['id']) for s in stadiums}
# Build stadium ID lookup
# Canonical format uses canonical_id, legacy uses id
def get_stadium_id(s):
    """Return the identifier string of a stadium record.

    Canonical-format records carry ``canonical_id``; legacy records carry
    ``id``. The canonical ID wins when it is usable; otherwise the legacy
    ID is returned, and ``''`` when neither is available.

    Args:
        s: A stadium record (mapping) loaded from the JSON input.

    Returns:
        The canonical ID, else the legacy ID, else an empty string.
    """
    # ``or`` rather than a ``.get`` default: a key that is present but
    # null/empty must fall back too, so the result is never ``None``.
    return s.get('canonical_id') or s.get('id') or ''
# Import stadiums & teams
def get_team_id(t):
    """Return the canonical ID string of a team record.

    Args:
        t: A team record (mapping) loaded from the JSON input.

    Returns:
        The record's ``canonical_id``, or ``''`` when the key is missing,
        null, or empty.
    """
    # Treat a null value like a missing key so callers always get a string.
    return t.get('canonical_id') or ''
stadium_id_map = {get_stadium_id(s): deterministic_uuid(get_stadium_id(s)) for s in stadiums}
# Import stadiums
if import_stadiums:
print("--- Stadiums ---")
recs = [{
'recordType': 'Stadium', 'recordName': stadium_uuid_map[s['id']],
'fields': {
'stadiumId': {'value': stadium_uuid_map[s['id']]}, 'name': {'value': s['name']},
'city': {'value': s['city']}, 'state': {'value': s.get('state', '')},
'sport': {'value': s['sport']}, 'source': {'value': s.get('source', '')},
'teamAbbrevs': {'value': s.get('team_abbrevs', [])},
**({'location': {'value': {'latitude': s['latitude'], 'longitude': s['longitude']}}}
if s.get('latitude') else {}),
**({'capacity': {'value': s['capacity']}} if s.get('capacity') else {}),
recs = []
for s in stadiums:
stadium_id = get_stadium_id(s)
record_name = deterministic_uuid(stadium_id)
# Canonical format uses primary_team_abbrevs, legacy uses team_abbrevs
team_abbrevs = s.get('primary_team_abbrevs', s.get('team_abbrevs', []))
fields = {
'stadiumId': {'value': record_name},
'canonicalId': {'value': stadium_id}, # Store canonical_id as string
'name': {'value': s['name']},
'city': {'value': s['city']},
'state': {'value': s.get('state', '')},
'sport': {'value': s['sport']},
'source': {'value': s.get('source', 'canonical')},
'teamAbbrevs': {'value': team_abbrevs},
}
} for s in stadiums]
if s.get('latitude'):
fields['location'] = {'value': {'latitude': s['latitude'], 'longitude': s['longitude']}}
if s.get('capacity'):
fields['capacity'] = {'value': s['capacity']}
recs.append({'recordType': 'Stadium', 'recordName': record_name, 'fields': fields})
stats['stadiums'] = import_data(ck, recs, 'stadiums', args.dry_run, args.verbose)
# Import teams (canonical format has dedicated teams file)
if import_teams:
print("--- Teams ---")
teams = {}
for s in stadiums:
for abbr in s.get('team_abbrevs', []):
team_key = f"{s['sport']}_{abbr}" # Match Swift: "{sport.rawValue}_{abbrev}"
if team_key not in teams:
teams[team_key] = {'abbr': abbr, 'city': s['city'], 'sport': s['sport']}
team_uuid = deterministic_uuid(team_key)
team_map[(s['sport'], abbr)] = team_uuid
if teams:
# Canonical format: use teams_canonical.json
recs = []
for t in teams:
team_id = get_team_id(t)
record_name = deterministic_uuid(team_id)
team_map[(t['sport'], t['abbreviation'])] = record_name
recs = [{
'recordType': 'Team', 'recordName': deterministic_uuid(team_key),
'fields': {
'teamId': {'value': deterministic_uuid(team_key)},
'abbreviation': {'value': info['abbr']},
'name': {'value': info['abbr']},
'city': {'value': info['city']},
'sport': {'value': info['sport']},
}
} for team_key, info in teams.items()]
stats['teams'] = import_data(ck, recs, 'teams', args.dry_run, args.verbose)
fields = {
'teamId': {'value': record_name},
'canonicalId': {'value': team_id}, # Store canonical_id as string
'abbreviation': {'value': t['abbreviation']},
'name': {'value': t['name']},
'city': {'value': t['city']},
'sport': {'value': t['sport']},
'stadiumCanonicalId': {'value': t.get('stadium_canonical_id', '')},
}
if t.get('conference_id'):
fields['conferenceId'] = {'value': t['conference_id']}
if t.get('division_id'):
fields['divisionId'] = {'value': t['division_id']}
recs.append({'recordType': 'Team', 'recordName': record_name, 'fields': fields})
stats['teams'] = import_data(ck, recs, 'teams', args.dry_run, args.verbose)
else:
# Legacy format: extract teams from stadiums
teams_dict = {}
for s in stadiums:
team_abbrevs = s.get('primary_team_abbrevs', s.get('team_abbrevs', []))
for abbr in team_abbrevs:
team_key = f"{s['sport']}_{abbr}"
if team_key not in teams_dict:
teams_dict[team_key] = {'abbr': abbr, 'city': s['city'], 'sport': s['sport']}
team_uuid = deterministic_uuid(team_key)
team_map[(s['sport'], abbr)] = team_uuid
recs = [{
'recordType': 'Team', 'recordName': deterministic_uuid(team_key),
'fields': {
'teamId': {'value': deterministic_uuid(team_key)},
'canonicalId': {'value': team_key},
'abbreviation': {'value': info['abbr']},
'name': {'value': info['abbr']},
'city': {'value': info['city']},
'sport': {'value': info['sport']},
}
} for team_key, info in teams_dict.items()]
stats['teams'] = import_data(ck, recs, 'teams', args.dry_run, args.verbose)
# Import games
if import_games and games:
# Detect canonical game format (has canonical_id field)
use_canonical_games = games and 'canonical_id' in games[0]
# Rebuild team_map if only importing games (--games-only flag)
if not team_map:
for s in stadiums:
for abbr in s.get('team_abbrevs', []):
team_key = f"{s['sport']}_{abbr}"
team_map[(s['sport'], abbr)] = deterministic_uuid(team_key)
if teams:
# Canonical format: use teams_canonical.json
for t in teams:
team_id = get_team_id(t)
team_map[(t['sport'], t['abbreviation'])] = deterministic_uuid(team_id)
else:
# Legacy format: extract from stadiums
for s in stadiums:
team_abbrevs = s.get('primary_team_abbrevs', s.get('team_abbrevs', []))
for abbr in team_abbrevs:
team_key = f"{s['sport']}_{abbr}"
team_map[(s['sport'], abbr)] = deterministic_uuid(team_key)
# Build team -> stadium map for stadiumRef
# Build team -> stadium map for stadiumRef (legacy format needs this)
team_stadium_map = {}
for s in stadiums:
stadium_uuid = stadium_uuid_map[s['id']]
for abbr in s.get('team_abbrevs', []):
stadium_id = get_stadium_id(s)
stadium_uuid = stadium_id_map[stadium_id]
team_abbrevs = s.get('primary_team_abbrevs', s.get('team_abbrevs', []))
for abbr in team_abbrevs:
team_stadium_map[(s['sport'], abbr)] = stadium_uuid
print("--- Games ---")
print(f" Using {'canonical' if use_canonical_games else 'legacy'} game format")
# Deduplicate games by ID
# Deduplicate games by ID (canonical_id or id)
seen_ids = set()
unique_games = []
for g in games:
if g['id'] not in seen_ids:
seen_ids.add(g['id'])
game_id = g.get('canonical_id', g.get('id', ''))
if game_id not in seen_ids:
seen_ids.add(game_id)
unique_games.append(g)
if len(unique_games) < len(games):
@@ -426,13 +520,20 @@ def main():
recs = []
for g in unique_games:
game_uuid = deterministic_uuid(g['id'])
# Get game ID (canonical or legacy)
game_id = g.get('canonical_id', g.get('id', ''))
game_uuid = deterministic_uuid(game_id)
sport = g['sport']
fields = {
'gameId': {'value': game_uuid}, 'sport': {'value': sport},
'season': {'value': g.get('season', '')}, 'source': {'value': g.get('source', '')},
'gameId': {'value': game_uuid},
'canonicalId': {'value': game_id}, # Store canonical_id as string
'sport': {'value': sport},
'season': {'value': g.get('season', '')},
'source': {'value': g.get('source', 'canonical' if use_canonical_games else '')},
}
# Parse date/time
if g.get('date'):
try:
# Parse time like "7:30p" or "10:00a"
@@ -455,20 +556,38 @@ def main():
fields['dateTime'] = {'value': int(dt.timestamp() * 1000), 'type': 'TIMESTAMP'}
except Exception as e:
if args.verbose:
print(f" Warning: Failed to parse date/time for {g['id']}: {e}")
print(f" Warning: Failed to parse date/time for {game_id}: {e}")
# Team references
if use_canonical_games:
# Canonical format: derive each team UUID directly from the game's team canonical ID (e.g. team_nba_atl)
home_team_canonical_id = g.get('home_team_canonical_id', '')
away_team_canonical_id = g.get('away_team_canonical_id', '')
home_team_uuid = deterministic_uuid(home_team_canonical_id)
away_team_uuid = deterministic_uuid(away_team_canonical_id)
else:
# Legacy format: use abbreviations
home_team_key = f"{sport}_{g.get('home_team_abbrev', '')}"
away_team_key = f"{sport}_{g.get('away_team_abbrev', '')}"
home_team_uuid = deterministic_uuid(home_team_key)
away_team_uuid = deterministic_uuid(away_team_key)
# Team references - use (sport, abbrev) tuple for lookup
home_team_key = f"{sport}_{g.get('home_team_abbrev', '')}"
away_team_key = f"{sport}_{g.get('away_team_abbrev', '')}"
home_team_uuid = deterministic_uuid(home_team_key)
away_team_uuid = deterministic_uuid(away_team_key)
fields['homeTeamRef'] = {'value': {'recordName': home_team_uuid, 'action': 'NONE'}}
fields['awayTeamRef'] = {'value': {'recordName': away_team_uuid, 'action': 'NONE'}}
# Stadium reference - look up by home team abbrev
stadium_uuid = team_stadium_map.get((sport, g.get('home_team_abbrev', '')))
if stadium_uuid:
fields['stadiumRef'] = {'value': {'recordName': stadium_uuid, 'action': 'NONE'}}
# Stadium reference
if use_canonical_games and g.get('stadium_canonical_id'):
# Canonical format: use stadium_canonical_id directly
stadium_canonical_id = g['stadium_canonical_id']
stadium_uuid = stadium_id_map.get(stadium_canonical_id)
if stadium_uuid:
fields['stadiumRef'] = {'value': {'recordName': stadium_uuid, 'action': 'NONE'}}
fields['stadiumCanonicalId'] = {'value': stadium_canonical_id}
else:
# Legacy format: look up by home team abbrev
stadium_uuid = team_stadium_map.get((sport, g.get('home_team_abbrev', '')))
if stadium_uuid:
fields['stadiumRef'] = {'value': {'recordName': stadium_uuid, 'action': 'NONE'}}
recs.append({'recordType': 'Game', 'recordName': game_uuid, 'fields': fields})
@@ -554,9 +673,14 @@ def main():
fields['validUntil'] = {'value': int(dt.timestamp() * 1000), 'type': 'TIMESTAMP'}
except:
pass
# Extract sport from stadium_canonical_id (e.g., "stadium_nba_td_garden" -> "nba")
# This makes record names unique for shared venues (TD Garden has NBA and NHL entries)
stadium_id = sa['stadium_canonical_id']
sport = stadium_id.split('_')[1] if '_' in stadium_id else 'unknown'
record_name = f"{sport}_{sa['alias_name'].lower()}"
recs.append({
'recordType': 'StadiumAlias',
'recordName': sa['alias_name'].lower(), # Use alias_name as recordName (unique key)
'recordName': record_name,
'fields': fields
})
stats['stadium_aliases'] = import_data(ck, recs, 'stadium aliases', args.dry_run, args.verbose)