Add canonical ID pipeline and fix UUID consistency for CloudKit sync
- Add local canonicalization pipeline (stadiums, teams, games) that generates deterministic canonical IDs before CloudKit upload
- Fix CanonicalSyncService to use deterministic UUIDs from canonical IDs instead of random UUIDs from CloudKit records
- Add SyncStadium/SyncTeam/SyncGame types to CloudKitService that preserve canonical ID relationships during sync
- Add canonical ID field keys to CKModels for reading from CloudKit records
- Bundle canonical JSON files (stadiums_canonical, teams_canonical, games_canonical, stadium_aliases) for consistent bootstrap data
- Update BootstrapService to prefer canonical format files over legacy format

This ensures all entities use consistent deterministic UUIDs derived from their canonical IDs, preventing duplicate records when syncing CloudKit data with bootstrapped local data.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
487
Scripts/canonicalize_teams.py
Normal file
487
Scripts/canonicalize_teams.py
Normal file
@@ -0,0 +1,487 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Team Canonicalization for SportsTime
|
||||
====================================
|
||||
Stage 2 of the canonicalization pipeline.
|
||||
|
||||
Generates canonical team IDs and fuzzy matches teams to stadiums.
|
||||
|
||||
Usage:
|
||||
python canonicalize_teams.py --stadiums data/stadiums_canonical.json --output data/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from dataclasses import dataclass, asdict, field
|
||||
from difflib import SequenceMatcher
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# Import team mappings from scraper
|
||||
from scrape_schedules import NBA_TEAMS, MLB_TEAMS, NHL_TEAMS
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DATA CLASSES
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class CanonicalTeam:
    """Team record identified by a stable canonical ID.

    Attributes:
        canonical_id: Stable team identifier (see
            ``generate_canonical_team_id`` for the format used here).
        name: Team name.
        abbreviation: Team abbreviation.
        sport: Sport code.
        city: Team city.
        stadium_canonical_id: Canonical ID of the stadium linked to the team.
        conference_id: Conference identifier, or ``None`` when not set.
        division_id: Division identifier, or ``None`` when not set.
        primary_color: Primary color, or ``None`` when not set.
        secondary_color: Secondary color, or ``None`` when not set.
    """

    # Required fields.
    canonical_id: str
    name: str
    abbreviation: str
    sport: str
    city: str
    stadium_canonical_id: str

    # Optional fields; all default to None.
    conference_id: Optional[str] = None
    division_id: Optional[str] = None
    primary_color: Optional[str] = None
    secondary_color: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class MatchWarning:
    """Record of a team-to-stadium match that needs manual review.

    Attributes:
        team_canonical_id: Canonical ID of the team the warning is about.
        team_name: Team name.
        arena_name: Arena name that was matched against the stadium list.
        matched_stadium: Canonical ID of the stadium that was matched, or
            ``None`` when no stadium was matched.
        issue: Short description of the problem.
        confidence: Score of the best candidate match.
    """

    team_canonical_id: str
    team_name: str
    arena_name: str
    matched_stadium: Optional[str]
    issue: str
    confidence: float
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# LEAGUE STRUCTURE
|
||||
# Maps team abbreviation -> (conference_id, division_id)
|
||||
# =============================================================================
|
||||
|
||||
# NBA team abbreviation -> (conference_id, division_id).
# Built from per-division groupings; the resulting key order is the order
# in which the divisions and their members are listed below.
NBA_DIVISIONS = {
    abbrev: (conference, division)
    for conference, division, members in (
        # Eastern Conference
        ('nba_eastern', 'nba_atlantic', ('BOS', 'BRK', 'NYK', 'PHI', 'TOR')),
        ('nba_eastern', 'nba_central', ('CHI', 'CLE', 'DET', 'IND', 'MIL')),
        ('nba_eastern', 'nba_southeast', ('ATL', 'CHO', 'MIA', 'ORL', 'WAS')),
        # Western Conference
        ('nba_western', 'nba_northwest', ('DEN', 'MIN', 'OKC', 'POR', 'UTA')),
        ('nba_western', 'nba_pacific', ('GSW', 'LAC', 'LAL', 'PHO', 'SAC')),
        ('nba_western', 'nba_southwest', ('DAL', 'HOU', 'MEM', 'NOP', 'SAS')),
    )
    for abbrev in members
}
|
||||
|
||||
# MLB team abbreviation -> (conference_id, division_id), where the
# "conference" slot holds the league (mlb_al / mlb_nl).
# Built from per-division groupings; the resulting key order is the order
# in which the divisions and their members are listed below.
MLB_DIVISIONS = {
    abbrev: (league, division)
    for league, division, members in (
        # American League
        ('mlb_al', 'mlb_al_east', ('NYY', 'BOS', 'TOR', 'BAL', 'TBR')),
        ('mlb_al', 'mlb_al_central', ('CLE', 'DET', 'MIN', 'CHW', 'KCR')),
        ('mlb_al', 'mlb_al_west', ('HOU', 'SEA', 'TEX', 'LAA', 'OAK')),
        # National League
        ('mlb_nl', 'mlb_nl_east', ('ATL', 'PHI', 'NYM', 'MIA', 'WSN')),
        ('mlb_nl', 'mlb_nl_central', ('MIL', 'CHC', 'STL', 'PIT', 'CIN')),
        ('mlb_nl', 'mlb_nl_west', ('LAD', 'ARI', 'SDP', 'SFG', 'COL')),
    )
    for abbrev in members
}
|
||||
|
||||
# NHL team abbreviation -> (conference_id, division_id).
# Built from per-division groupings; the resulting key order is the order
# in which the divisions and their members are listed below.
NHL_DIVISIONS = {
    abbrev: (conference, division)
    for conference, division, members in (
        # Eastern Conference
        ('nhl_eastern', 'nhl_atlantic',
         ('BOS', 'BUF', 'DET', 'FLA', 'MTL', 'OTT', 'TBL', 'TOR')),
        ('nhl_eastern', 'nhl_metropolitan',
         ('CAR', 'CBJ', 'NJD', 'NYI', 'NYR', 'PHI', 'PIT', 'WSH')),
        # Western Conference ('ARI' is the Utah Hockey Club)
        ('nhl_western', 'nhl_central',
         ('ARI', 'CHI', 'COL', 'DAL', 'MIN', 'NSH', 'STL', 'WPG')),
        ('nhl_western', 'nhl_pacific',
         ('ANA', 'CGY', 'EDM', 'LAK', 'SEA', 'SJS', 'VAN', 'VGK')),
    )
    for abbrev in members
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# FUZZY MATCHING
|
||||
# =============================================================================
|
||||
|
||||
def normalize_for_matching(text: str) -> str:
    """Normalize a venue name for fuzzy matching.

    The text is lowercased, generic venue words (arena, center, centre,
    stadium, field, park) are dropped, every character other than a-z, 0-9
    and whitespace is removed, and runs of whitespace are collapsed.

    Args:
        text: Raw venue name.

    Returns:
        The normalized name; may be empty.
    """
    import re

    text = text.lower().strip()
    # Drop generic venue words only when they stand alone as whole words.
    # Without the \b anchors the pattern also matched inside longer words
    # and cut them apart (e.g. "fieldhouse" -> "house", "ballpark" -> "ball").
    text = re.sub(r'\b(arena|center|stadium|field|park|centre)\b', ' ', text)
    # Remove special characters
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Collapse spaces
    return re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
|
||||
# Principal city (lowercase) -> nearby cities (lowercase) that count as a
# near match, e.g. a "San Francisco" team with an "Oakland" arena.
_NEARBY_CITIES = {
    'san francisco': ['oakland', 'san jose'],
    'new york': ['brooklyn', 'queens', 'elmont', 'newark'],
    'los angeles': ['inglewood', 'anaheim'],
    'miami': ['sunrise', 'fort lauderdale'],
    'dallas': ['arlington', 'fort worth'],
    'washington': ['landover', 'capital heights'],
    'minneapolis': ['st paul', 'st. paul'],
    'detroit': ['auburn hills', 'pontiac'],
}


def _score_city_match(team_city: str, stadium_city: str) -> float:
    """Score the agreement of two lowercase city names (0.0 to 1.0)."""
    # An empty string is a substring of every string, so a missing city
    # would otherwise earn a partial match against every stadium.
    if not team_city or not stadium_city:
        return 0.0

    if team_city == stadium_city:
        score = 1.0
    elif team_city in stadium_city or stadium_city in team_city:
        score = 0.5
    else:
        score = 0.0

    # A known nearby pairing (in either direction) scores 0.7.
    for main_city, nearby in _NEARBY_CITIES.items():
        if team_city == main_city and stadium_city in nearby:
            return 0.7
        if stadium_city == main_city and team_city in nearby:
            return 0.7
    return score


def fuzzy_match_stadium(
    team_arena_name: str,
    team_city: str,
    sport: str,
    stadiums: list[dict],
    confidence_threshold: float = 0.6
) -> tuple[Optional[str], float]:
    """
    Fuzzy match team's arena to a canonical stadium.

    Matching strategy:
    - 70% weight: Name similarity (SequenceMatcher; the better of the
      normalized-name ratio and the lowercased full-name ratio)
    - 30% weight: City match (exact=1.0, known nearby city=0.7,
      partial=0.5, missing city=0.0)

    Only stadiums whose 'sport' equals ``sport`` are considered. On equal
    scores the earliest stadium in ``stadiums`` wins.

    Args:
        team_arena_name: The arena name from team mapping
        team_city: The team's city
        sport: Sport code (NBA, MLB, NHL)
        stadiums: List of canonical stadium dicts; each must provide the
            'sport', 'name', 'city' and 'canonical_id' keys
        confidence_threshold: Minimum confidence for a match

    Returns:
        (canonical_stadium_id, confidence_score). The ID is None when the
        best score is below ``confidence_threshold``; the best score is
        returned either way.
    """
    best_match = None
    best_score = 0.0

    # Per-team values are computed once, outside the stadium loop.
    arena_normalized = normalize_for_matching(team_arena_name)
    arena_lower = team_arena_name.lower()
    city_lower = team_city.lower()

    # Filter to same sport
    sport_stadiums = [s for s in stadiums if s['sport'] == sport]

    for stadium in sport_stadiums:
        stadium_name = stadium['name']

        # Score 1: name similarity, taking the better of the normalized
        # and the unnormalized (lowercased) comparison.
        name_score = max(
            SequenceMatcher(
                None, arena_normalized, normalize_for_matching(stadium_name)
            ).ratio(),
            SequenceMatcher(None, arena_lower, stadium_name.lower()).ratio(),
        )

        # Score 2: city match
        city_score = _score_city_match(city_lower, stadium['city'].lower())

        # Combined score (weighted)
        combined = (name_score * 0.7) + (city_score * 0.3)

        if combined > best_score:
            best_score = combined
            best_match = stadium['canonical_id']

    if best_score >= confidence_threshold:
        return best_match, best_score

    return None, best_score
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CANONICALIZATION
|
||||
# =============================================================================
|
||||
|
||||
def generate_canonical_team_id(sport: str, abbrev: str) -> str:
    """
    Build the deterministic canonical ID for a team.

    The sport code and abbreviation are lowercased and joined as
    ``team_{sport}_{abbrev}``, e.g. ``team_nba_atl``.

    Args:
        sport: Sport code.
        abbrev: Team abbreviation.

    Returns:
        The canonical team ID.
    """
    sport_part = sport.lower()
    abbrev_part = abbrev.lower()
    return "_".join(("team", sport_part, abbrev_part))
|
||||
|
||||
|
||||
def canonicalize_teams(
    team_mappings: dict[str, dict],
    sport: str,
    canonical_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalTeam], list[MatchWarning]]:
    """
    Stage 2: build canonical teams for one sport.

    For every entry in ``team_mappings``:
    1. A canonical team ID is generated from the abbreviation.
    2. The team's venue is fuzzy matched against ``canonical_stadiums``.
    3. A warning is recorded when no stadium matched, or when the match
       confidence is below 0.8.

    A team without a stadium match is still emitted; it is linked to a
    placeholder ID of the form ``stadium_unknown_{sport}_{abbrev}``.

    Args:
        team_mappings: Team data keyed by abbreviation (e.g., NBA_TEAMS).
            The 'name' and 'city' keys are read from each entry, plus
            'arena' for NBA/NHL or 'stadium' for any other sport; missing
            keys default to ''.
        sport: Sport code
        canonical_stadiums: List of canonical stadium dicts
        verbose: Print a progress line per team

    Returns:
        (canonical_teams, warnings)
    """
    # The venue name is stored under a sport-dependent key.
    arena_key = 'stadium'
    if sport in ['NBA', 'NHL']:
        arena_key = 'arena'

    # Conference/division lookup for this sport (empty for unknown sports).
    divisions_by_sport = {
        'NBA': NBA_DIVISIONS,
        'MLB': MLB_DIVISIONS,
        'NHL': NHL_DIVISIONS,
    }
    division_map = divisions_by_sport.get(sport, {})

    built_teams = []
    match_warnings = []

    for abbrev, info in team_mappings.items():
        canonical_id = generate_canonical_team_id(sport, abbrev)
        arena_name = info.get(arena_key, '')
        city = info.get('city', '')
        team_name = info.get('name', '')

        stadium_canonical_id, confidence = fuzzy_match_stadium(
            arena_name, city, sport, canonical_stadiums
        )

        unmatched = stadium_canonical_id is None
        if unmatched or confidence < 0.8:
            # The warning records the match result as returned (None when
            # unmatched), before any placeholder ID is substituted.
            match_warnings.append(MatchWarning(
                team_canonical_id=canonical_id,
                team_name=team_name,
                arena_name=arena_name,
                matched_stadium=stadium_canonical_id,
                issue=(
                    'No stadium match found' if unmatched
                    else 'Low confidence stadium match'
                ),
                confidence=confidence
            ))
            if unmatched:
                # Create placeholder ID
                stadium_canonical_id = f"stadium_unknown_{sport.lower()}_{abbrev.lower()}"
                if verbose:
                    print(f" WARNING: {canonical_id} - no stadium match for '{arena_name}'")
            elif verbose:
                print(f" WARNING: {canonical_id} - low confidence ({confidence:.2f}) match to {stadium_canonical_id}")

        # Conference/division; both None when the abbreviation is unknown.
        conf_id, div_id = division_map.get(abbrev, (None, None))

        built_teams.append(CanonicalTeam(
            canonical_id=canonical_id,
            name=team_name,
            abbreviation=abbrev,
            sport=sport,
            city=city,
            stadium_canonical_id=stadium_canonical_id,
            conference_id=conf_id,
            division_id=div_id
        ))

        if verbose and confidence >= 0.8:
            print(f" {canonical_id}: {team_name} -> {stadium_canonical_id} ({confidence:.2f})")

    return built_teams, match_warnings
|
||||
|
||||
|
||||
def canonicalize_all_teams(
    canonical_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalTeam], list[MatchWarning]]:
    """
    Canonicalize the NBA, MLB and NHL team mappings, in that order.

    Args:
        canonical_stadiums: List of canonical stadium dicts
        verbose: Print a header per sport and pass verbosity through to
            ``canonicalize_teams``

    Returns:
        (canonical_teams, warnings) accumulated across all three sports
    """
    sport_mappings = (
        ('NBA', NBA_TEAMS),
        ('MLB', MLB_TEAMS),
        ('NHL', NHL_TEAMS),
    )

    combined_teams = []
    combined_warnings = []

    for sport, team_map in sport_mappings:
        if verbose:
            print(f"\n{sport}:")

        sport_teams, sport_warnings = canonicalize_teams(
            team_map, sport, canonical_stadiums, verbose
        )
        combined_teams += sport_teams
        combined_warnings += sport_warnings

    return combined_teams, combined_warnings
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MAIN
|
||||
# =============================================================================
|
||||
|
||||
def main():
    """
    Command-line entry point for team canonicalization.

    Loads the canonical stadiums JSON given by --stadiums, canonicalizes
    all teams, and writes into the --output directory (created if needed):
    - teams_canonical.json: always written
    - team_matching_warnings.json: written only when there are warnings

    Progress, warnings and a per-sport team count are printed to stdout.
    """
    parser = argparse.ArgumentParser(
        description='Canonicalize team data'
    )
    parser.add_argument(
        '--stadiums', type=str, default='./data/stadiums_canonical.json',
        help='Input canonical stadiums JSON file'
    )
    parser.add_argument(
        '--output', type=str, default='./data',
        help='Output directory for canonical files'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )

    args = parser.parse_args()
    stadiums_path = Path(args.stadiums)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load canonical stadiums. JSON is read as UTF-8 explicitly: the
    # platform default encoding may differ and would fail on, or corrupt,
    # non-ASCII names.
    print(f"Loading canonical stadiums from {stadiums_path}...")
    with open(stadiums_path, encoding='utf-8') as f:
        canonical_stadiums = json.load(f)
    print(f" Loaded {len(canonical_stadiums)} canonical stadiums")

    # Canonicalize teams
    print("\nCanonicalizing teams...")
    canonical_teams, warnings = canonicalize_all_teams(
        canonical_stadiums, verbose=args.verbose
    )
    print(f" Created {len(canonical_teams)} canonical teams")

    if warnings:
        print(f"\n Warnings: {len(warnings)}")
        for w in warnings:
            print(f" - {w.team_canonical_id}: {w.issue} (confidence: {w.confidence:.2f})")

    # Export
    teams_path = output_dir / 'teams_canonical.json'
    warnings_path = output_dir / 'team_matching_warnings.json'

    with open(teams_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(t) for t in canonical_teams], f, indent=2)
    print(f"\nExported teams to {teams_path}")

    if warnings:
        with open(warnings_path, 'w', encoding='utf-8') as f:
            json.dump([asdict(w) for w in warnings], f, indent=2)
        print(f"Exported warnings to {warnings_path}")

    # Summary by sport
    print("\nSummary by sport:")
    by_sport = {}
    for t in canonical_teams:
        by_sport[t.sport] = by_sport.get(t.sport, 0) + 1
    for sport, count in sorted(by_sport.items()):
        print(f" {sport}: {count} teams")


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user