Add canonical ID pipeline and fix UUID consistency for CloudKit sync

- Add local canonicalization pipeline (stadiums, teams, games) that generates
  deterministic canonical IDs before CloudKit upload
- Fix CanonicalSyncService to use deterministic UUIDs from canonical IDs
  instead of random UUIDs from CloudKit records
- Add SyncStadium/SyncTeam/SyncGame types to CloudKitService that preserve
  canonical ID relationships during sync
- Add canonical ID field keys to CKModels for reading from CloudKit records
- Bundle canonical JSON files (stadiums_canonical, teams_canonical,
  games_canonical, stadium_aliases) for consistent bootstrap data
- Update BootstrapService to prefer canonical format files over legacy format

This ensures all entities use consistent deterministic UUIDs derived from
their canonical IDs, preventing duplicate records when syncing CloudKit
data with bootstrapped local data.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-09 10:30:09 -06:00
parent 1ee47df53e
commit 7efcea7bd4
31 changed files with 128868 additions and 282 deletions

View File

@@ -0,0 +1,487 @@
#!/usr/bin/env python3
"""
Team Canonicalization for SportsTime
====================================
Stage 2 of the canonicalization pipeline.
Generates canonical team IDs and fuzzy matches teams to stadiums.
Usage:
python canonicalize_teams.py --stadiums data/stadiums_canonical.json --output data/
"""
import argparse
import json
from dataclasses import dataclass, asdict, field
from difflib import SequenceMatcher
from pathlib import Path
from typing import Optional
# Import team mappings from scraper
from scrape_schedules import NBA_TEAMS, MLB_TEAMS, NHL_TEAMS
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class CanonicalTeam:
    """Team record identified by a deterministic canonical ID.

    Field order matters: instances can be built positionally, and
    ``dataclasses.asdict`` emits keys in declaration order.
    """

    # --- Identity -----------------------------------------------------------
    canonical_id: str  # e.g. "team_nba_atl" (see generate_canonical_team_id)
    name: str
    abbreviation: str  # key of the source team mapping, original casing
    sport: str  # sport code as supplied by the caller, e.g. "NBA"
    city: str

    # --- Home venue ---------------------------------------------------------
    # Canonical ID of the matched stadium, or a
    # "stadium_unknown_{sport}_{abbrev}" placeholder when no match was found.
    stadium_canonical_id: str

    # --- League structure (None when the abbreviation has no table entry) ---
    conference_id: Optional[str] = None
    division_id: Optional[str] = None

    # --- Branding (never set by the canonicalization code in this script) ---
    primary_color: Optional[str] = None
    secondary_color: Optional[str] = None
@dataclass
class MatchWarning:
    """Review item recorded when a team's stadium match is missing or weak.

    All fields are required; ``dataclasses.asdict`` emits them in the order
    declared here.
    """

    # Which team the warning is about.
    team_canonical_id: str
    team_name: str

    # Venue name taken from the team mapping (the text that was matched).
    arena_name: str

    # Canonical ID of the stadium that was accepted, or None if nothing
    # reached the match threshold.
    matched_stadium: Optional[str]

    # Human-readable description of the problem.
    issue: str

    # Score of the best candidate, as returned by the fuzzy matcher.
    confidence: float
# =============================================================================
# LEAGUE STRUCTURE
# Maps team abbreviation -> (conference_id, division_id)
# =============================================================================
# NBA team abbreviation -> (conference_id, division_id).
# Written as one row per division and flattened into the lookup dict;
# iteration order is division by division, in the order listed.
NBA_DIVISIONS = {
    team: (conference, division)
    for conference, division, members in (
        # Eastern Conference
        ('nba_eastern', 'nba_atlantic', ('BOS', 'BRK', 'NYK', 'PHI', 'TOR')),
        ('nba_eastern', 'nba_central', ('CHI', 'CLE', 'DET', 'IND', 'MIL')),
        ('nba_eastern', 'nba_southeast', ('ATL', 'CHO', 'MIA', 'ORL', 'WAS')),
        # Western Conference
        ('nba_western', 'nba_northwest', ('DEN', 'MIN', 'OKC', 'POR', 'UTA')),
        ('nba_western', 'nba_pacific', ('GSW', 'LAC', 'LAL', 'PHO', 'SAC')),
        ('nba_western', 'nba_southwest', ('DAL', 'HOU', 'MEM', 'NOP', 'SAS')),
    )
    for team in members
}
# MLB team abbreviation -> (conference_id, division_id), where the
# "conference" slot holds the league (mlb_al / mlb_nl).
# Written as one row per division and flattened into the lookup dict;
# iteration order is division by division, in the order listed.
MLB_DIVISIONS = {
    team: (league, division)
    for league, division, members in (
        # American League
        ('mlb_al', 'mlb_al_east', ('NYY', 'BOS', 'TOR', 'BAL', 'TBR')),
        ('mlb_al', 'mlb_al_central', ('CLE', 'DET', 'MIN', 'CHW', 'KCR')),
        ('mlb_al', 'mlb_al_west', ('HOU', 'SEA', 'TEX', 'LAA', 'OAK')),
        # National League
        ('mlb_nl', 'mlb_nl_east', ('ATL', 'PHI', 'NYM', 'MIA', 'WSN')),
        ('mlb_nl', 'mlb_nl_central', ('MIL', 'CHC', 'STL', 'PIT', 'CIN')),
        ('mlb_nl', 'mlb_nl_west', ('LAD', 'ARI', 'SDP', 'SFG', 'COL')),
    )
    for team in members
}
# NHL team abbreviation -> (conference_id, division_id).
# Written as one row per division and flattened into the lookup dict;
# iteration order is division by division, in the order listed.
NHL_DIVISIONS = {
    team: (conference, division)
    for conference, division, members in (
        # Eastern Conference
        ('nhl_eastern', 'nhl_atlantic',
         ('BOS', 'BUF', 'DET', 'FLA', 'MTL', 'OTT', 'TBL', 'TOR')),
        ('nhl_eastern', 'nhl_metropolitan',
         ('CAR', 'CBJ', 'NJD', 'NYI', 'NYR', 'PHI', 'PIT', 'WSH')),
        # Western Conference ('ARI' is the entry used for Utah Hockey Club)
        ('nhl_western', 'nhl_central',
         ('ARI', 'CHI', 'COL', 'DAL', 'MIN', 'NSH', 'STL', 'WPG')),
        ('nhl_western', 'nhl_pacific',
         ('ANA', 'CGY', 'EDM', 'LAK', 'SEA', 'SJS', 'VAN', 'VGK')),
    )
    for team in members
}
# =============================================================================
# FUZZY MATCHING
# =============================================================================
def normalize_for_matching(text: str) -> str:
    """Reduce a venue name to a comparable lower-case token string.

    Steps, in order:
      1. Lower-case and trim.
      2. Drop generic venue words (arena, center, centre, stadium, field,
         park) when they appear as whole words.
      3. Remove every character that is not a-z, 0-9 or whitespace.
      4. Collapse runs of whitespace to a single space and trim.

    Args:
        text: Raw venue name.

    Returns:
        The normalized name; may be empty if nothing distinctive remains.
    """
    # Function-scope import so this helper does not rely on a module-level
    # `re` import.
    import re

    text = text.lower().strip()
    # Word boundaries keep the generic words from being cut out of longer
    # words: "fieldhouse" must not become "house", nor "parker" "er".
    text = re.sub(r'\b(?:arena|center|stadium|field|park|centre)\b', ' ', text)
    # Strip punctuation and any other non-alphanumeric characters.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Collapse the gaps left behind by the removals above.
    return re.sub(r'\s+', ' ', text).strip()
def fuzzy_match_stadium(
    team_arena_name: str,
    team_city: str,
    sport: str,
    stadiums: list[dict],
    confidence_threshold: float = 0.6
) -> tuple[Optional[str], float]:
    """
    Fuzzy match a team's arena to a canonical stadium of the same sport.

    Each candidate gets a combined score:
      - 70% weight: name similarity, the better of the SequenceMatcher
        ratios on the normalized names and on the lower-cased raw names
      - 30% weight: city match (exact=1.0, known nearby city=0.7,
        one city name contained in the other=0.5, otherwise 0.0)

    A missing (empty) city on either side contributes no city score; it is
    never treated as an exact or partial match.

    Args:
        team_arena_name: The arena name from team mapping
        team_city: The team's city
        sport: Sport code (NBA, MLB, NHL)
        stadiums: List of canonical stadium dicts; each must provide the
            'sport', 'name', 'city' and 'canonical_id' keys
        confidence_threshold: Minimum combined score for a match

    Returns:
        (canonical_stadium_id, confidence_score). The ID is None when the
        best candidate scores below the threshold; the best score is
        returned either way. On ties the earliest candidate wins.
    """
    # Teams named after a metro area whose venue sits in a neighbouring city
    # (e.g., "San Francisco" team but "Oakland" arena). Built once per call
    # rather than once per candidate stadium.
    nearby_cities = {
        'san francisco': ['oakland', 'san jose'],
        'new york': ['brooklyn', 'queens', 'elmont', 'newark'],
        'los angeles': ['inglewood', 'anaheim'],
        'miami': ['sunrise', 'fort lauderdale'],
        'dallas': ['arlington', 'fort worth'],
        # Spelling fixed from 'capital heights', which could never match.
        'washington': ['landover', 'capitol heights'],
        'minneapolis': ['st paul', 'st. paul'],
        'detroit': ['auburn hills', 'pontiac'],
    }

    best_match = None
    best_score = 0.0

    # Loop-invariant forms of the team's inputs.
    arena_lower = team_arena_name.lower()
    arena_normalized = normalize_for_matching(team_arena_name)
    city_lower = team_city.lower()

    # Only stadiums of the same sport are candidates.
    sport_stadiums = [s for s in stadiums if s['sport'] == sport]

    for stadium in sport_stadiums:
        # Score 1: name similarity, taking the better of normalized vs raw.
        normalized_score = SequenceMatcher(
            None,
            arena_normalized,
            normalize_for_matching(stadium['name'])
        ).ratio()
        full_name_score = SequenceMatcher(
            None,
            arena_lower,
            stadium['name'].lower()
        ).ratio()
        name_score = max(normalized_score, full_name_score)

        # Score 2: city match. Require both cities to be non-empty:
        # '' == '' and '' in 'anything' are both true and would otherwise
        # award city credit to records that simply lack a city.
        city_score = 0.0
        stadium_city_lower = stadium['city'].lower()
        if city_lower and stadium_city_lower:
            if city_lower == stadium_city_lower:
                city_score = 1.0
            elif city_lower in stadium_city_lower or stadium_city_lower in city_lower:
                city_score = 0.5

        # A known metro/neighbour pairing (in either direction) scores 0.7,
        # replacing whatever the substring check produced.
        for main_city, nearby in nearby_cities.items():
            if city_lower == main_city and stadium_city_lower in nearby:
                city_score = 0.7
            elif stadium_city_lower == main_city and city_lower in nearby:
                city_score = 0.7

        # Combined score (weighted)
        combined = (name_score * 0.7) + (city_score * 0.3)
        if combined > best_score:
            best_score = combined
            best_match = stadium['canonical_id']

    if best_score >= confidence_threshold:
        return best_match, best_score
    return None, best_score
# =============================================================================
# CANONICALIZATION
# =============================================================================
def generate_canonical_team_id(sport: str, abbrev: str) -> str:
    """
    Build the deterministic canonical ID for a team.

    The ID is ``team_{sport}_{abbrev}`` with both parts lower-cased, so the
    same inputs always produce the same ID regardless of input casing.

    Example: ("NBA", "ATL") -> "team_nba_atl"
    """
    parts = ("team", sport.lower(), abbrev.lower())
    return "_".join(parts)
def canonicalize_teams(
    team_mappings: dict[str, dict],
    sport: str,
    canonical_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalTeam], list[MatchWarning]]:
    """
    Stage 2: build canonical team records for one sport.

    For every entry in ``team_mappings`` this:
      1. derives the canonical team ID from the sport and abbreviation,
      2. fuzzy matches the team's venue against ``canonical_stadiums``,
      3. records a warning when no stadium matched or the match scored
         below 0.8, and
      4. attaches conference/division IDs from the sport's division table.

    Teams without a stadium match still get a record; their stadium ID is a
    ``stadium_unknown_{sport}_{abbrev}`` placeholder.

    Args:
        team_mappings: Team data dict keyed by abbreviation (e.g., NBA_TEAMS)
        sport: Sport code
        canonical_stadiums: List of canonical stadium dicts
        verbose: Print one progress line per team

    Returns:
        (canonical_teams, warnings), both in ``team_mappings`` order
    """
    # NBA and NHL mappings keep the venue under 'arena'; any other sport
    # keeps it under 'stadium'.
    arena_key = 'stadium'
    if sport in ('NBA', 'NHL'):
        arena_key = 'arena'

    # Division table for this sport; unknown sports get no league structure.
    if sport == 'NBA':
        division_map = NBA_DIVISIONS
    elif sport == 'MLB':
        division_map = MLB_DIVISIONS
    elif sport == 'NHL':
        division_map = NHL_DIVISIONS
    else:
        division_map = {}

    teams = []
    warnings = []

    for abbrev, info in team_mappings.items():
        canonical_id = generate_canonical_team_id(sport, abbrev)
        team_name = info.get('name', '')
        city = info.get('city', '')
        arena_name = info.get(arena_key, '')

        matched_id, confidence = fuzzy_match_stadium(
            arena_name, city, sport, canonical_stadiums
        )

        # Classify the match: missing, weak (< 0.8), or fine (no issue).
        issue = None
        if matched_id is None:
            issue = 'No stadium match found'
        elif confidence < 0.8:
            issue = 'Low confidence stadium match'

        if issue is not None:
            warnings.append(MatchWarning(
                team_canonical_id=canonical_id,
                team_name=team_name,
                arena_name=arena_name,
                matched_stadium=matched_id,
                issue=issue,
                confidence=confidence
            ))

        if matched_id is None:
            # Placeholder keeps the team record usable until a human
            # resolves the stadium.
            stadium_canonical_id = f"stadium_unknown_{sport.lower()}_{abbrev.lower()}"
            if verbose:
                print(f" WARNING: {canonical_id} - no stadium match for '{arena_name}'")
        else:
            stadium_canonical_id = matched_id
            if verbose and issue is not None:
                print(f" WARNING: {canonical_id} - low confidence ({confidence:.2f}) match to {stadium_canonical_id}")

        conf_id, div_id = division_map.get(abbrev, (None, None))
        teams.append(CanonicalTeam(
            canonical_id=canonical_id,
            name=team_name,
            abbreviation=abbrev,
            sport=sport,
            city=city,
            stadium_canonical_id=stadium_canonical_id,
            conference_id=conf_id,
            division_id=div_id
        ))

        # Confident matches are only reported in verbose mode.
        if verbose and confidence >= 0.8:
            print(f" {canonical_id}: {team_name} -> {stadium_canonical_id} ({confidence:.2f})")

    return teams, warnings
def canonicalize_all_teams(
    canonical_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalTeam], list[MatchWarning]]:
    """Canonicalize NBA, MLB and NHL teams (in that order) in one pass.

    Args:
        canonical_stadiums: List of canonical stadium dicts to match against
        verbose: Print a header per sport and pass verbosity through

    Returns:
        (all_teams, all_warnings), concatenated sport by sport
    """
    all_teams = []
    all_warnings = []

    for sport, team_map in (
        ('NBA', NBA_TEAMS),
        ('MLB', MLB_TEAMS),
        ('NHL', NHL_TEAMS),
    ):
        if verbose:
            print(f"\n{sport}:")

        sport_teams, sport_warnings = canonicalize_teams(
            team_map, sport, canonical_stadiums, verbose
        )
        all_teams += sport_teams
        all_warnings += sport_warnings

    return all_teams, all_warnings
# =============================================================================
# MAIN
# =============================================================================
def main():
    """Command-line entry point for team canonicalization.

    Reads the canonical stadiums JSON (``--stadiums``), canonicalizes all
    teams, and writes ``teams_canonical.json`` into the ``--output``
    directory, which is created if missing. When there are matching
    warnings they are printed and written to ``team_matching_warnings.json``
    in the same directory. Finishes with a per-sport team count.

    All files are read and written as UTF-8, independent of the locale's
    default encoding.
    """
    parser = argparse.ArgumentParser(
        description='Canonicalize team data'
    )
    parser.add_argument(
        '--stadiums', type=str, default='./data/stadiums_canonical.json',
        help='Input canonical stadiums JSON file'
    )
    parser.add_argument(
        '--output', type=str, default='./data',
        help='Output directory for canonical files'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )
    args = parser.parse_args()

    stadiums_path = Path(args.stadiums)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load canonical stadiums. JSON is UTF-8; without an explicit encoding
    # open() uses the locale default, which can fail on non-ASCII names.
    print(f"Loading canonical stadiums from {stadiums_path}...")
    with open(stadiums_path, encoding='utf-8') as f:
        canonical_stadiums = json.load(f)
    print(f" Loaded {len(canonical_stadiums)} canonical stadiums")

    # Canonicalize teams
    print("\nCanonicalizing teams...")
    canonical_teams, warnings = canonicalize_all_teams(
        canonical_stadiums, verbose=args.verbose
    )
    print(f" Created {len(canonical_teams)} canonical teams")

    if warnings:
        print(f"\n Warnings: {len(warnings)}")
        for w in warnings:
            print(f" - {w.team_canonical_id}: {w.issue} (confidence: {w.confidence:.2f})")

    # Export
    teams_path = output_dir / 'teams_canonical.json'
    warnings_path = output_dir / 'team_matching_warnings.json'

    with open(teams_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(t) for t in canonical_teams], f, indent=2)
    print(f"\nExported teams to {teams_path}")

    # The warnings file is only written when there is something to review.
    if warnings:
        with open(warnings_path, 'w', encoding='utf-8') as f:
            json.dump([asdict(w) for w in warnings], f, indent=2)
        print(f"Exported warnings to {warnings_path}")

    # Summary by sport
    print("\nSummary by sport:")
    by_sport = {}
    for t in canonical_teams:
        by_sport[t.sport] = by_sport.get(t.sport, 0) + 1
    for sport, count in sorted(by_sport.items()):
        print(f" {sport}: {count} teams")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()