Add canonical ID pipeline and fix UUID consistency for CloudKit sync
- Add local canonicalization pipeline (stadiums, teams, games) that generates deterministic canonical IDs before CloudKit upload - Fix CanonicalSyncService to use deterministic UUIDs from canonical IDs instead of random UUIDs from CloudKit records - Add SyncStadium/SyncTeam/SyncGame types to CloudKitService that preserve canonical ID relationships during sync - Add canonical ID field keys to CKModels for reading from CloudKit records - Bundle canonical JSON files (stadiums_canonical, teams_canonical, games_canonical, stadium_aliases) for consistent bootstrap data - Update BootstrapService to prefer canonical format files over legacy format This ensures all entities use consistent deterministic UUIDs derived from their canonical IDs, preventing duplicate records when syncing CloudKit data with bootstrapped local data. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
393
Scripts/canonicalize_stadiums.py
Normal file
393
Scripts/canonicalize_stadiums.py
Normal file
@@ -0,0 +1,393 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Stadium Canonicalization for SportsTime
|
||||
========================================
|
||||
Stage 1 of the canonicalization pipeline.
|
||||
|
||||
Normalizes stadium data and generates deterministic canonical IDs.
|
||||
Creates stadium name aliases for fuzzy matching during game resolution.
|
||||
|
||||
Usage:
|
||||
python canonicalize_stadiums.py --input data/stadiums.json --output data/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass, asdict, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DATA CLASSES
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class CanonicalStadium:
    """A canonicalized stadium with stable ID.

    The canonical_id is deterministic (derived from sport + name slug by
    generate_canonical_stadium_id), so the same stadium maps to the same
    record on every pipeline run.
    """
    canonical_id: str   # Deterministic ID, e.g. "stadium_nba_state_farm_arena"
    name: str           # Display name as received from the scraper
    city: str
    state: str
    latitude: float
    longitude: float
    capacity: int
    sport: str          # Upper-cased league code, e.g. "MLB", "NBA", "NHL"
    # Abbreviations of teams that play home games here; presumably strings —
    # TODO confirm against the scraper's 'team_abbrevs' field
    primary_team_abbrevs: list = field(default_factory=list)
    year_opened: Optional[int] = None  # None when the source omits it
|
||||
|
||||
|
||||
@dataclass
class StadiumAlias:
    """Maps an alias name to a canonical stadium ID.

    Used for fuzzy name matching during game resolution. valid_from /
    valid_until bound the period (ISO dates) in which the alias was in use;
    a missing valid_until means the alias is still current.
    """
    alias_name: str  # Normalized (lowercase) name variant
    stadium_canonical_id: str  # Target canonical stadium ID
    valid_from: Optional[str] = None   # ISO date the alias became valid, if known
    valid_until: Optional[str] = None  # ISO date the alias stopped being valid, if known
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# HISTORICAL STADIUM ALIASES
|
||||
# Known name changes for stadiums (sponsorship changes, renames)
|
||||
# =============================================================================
|
||||
|
||||
# Keys are canonical stadium IDs in the stadium_{sport}_{slug} format emitted
# by generate_canonical_stadium_id; an entry is applied only when its key is
# present in the generated canonical ID set (see add_historical_aliases).
# Each alias carries an optional ISO-date validity window so resolution can
# match the name that was in use on a given game date; a missing valid_until
# means the alias is still current.
HISTORICAL_STADIUM_ALIASES = {
    # MLB
    'stadium_mlb_minute_maid_park': [
        {'alias_name': 'daikin park', 'valid_from': '2025-01-01'},
        {'alias_name': 'enron field', 'valid_from': '2000-04-01', 'valid_until': '2002-02-28'},
        {'alias_name': 'astros field', 'valid_from': '2002-03-01', 'valid_until': '2002-06-04'},
    ],
    'stadium_mlb_guaranteed_rate_field': [
        {'alias_name': 'rate field', 'valid_from': '2024-01-01'},
        {'alias_name': 'us cellular field', 'valid_from': '2003-01-01', 'valid_until': '2016-08-24'},
        {'alias_name': 'comiskey park ii', 'valid_from': '1991-04-01', 'valid_until': '2002-12-31'},
        {'alias_name': 'new comiskey park', 'valid_from': '1991-04-01', 'valid_until': '2002-12-31'},
    ],
    'stadium_mlb_truist_park': [
        {'alias_name': 'suntrust park', 'valid_from': '2017-04-01', 'valid_until': '2020-01-13'},
    ],
    'stadium_mlb_progressive_field': [
        {'alias_name': 'jacobs field', 'valid_from': '1994-04-01', 'valid_until': '2008-01-10'},
        {'alias_name': 'the jake', 'valid_from': '1994-04-01', 'valid_until': '2008-01-10'},
    ],
    'stadium_mlb_american_family_field': [
        {'alias_name': 'miller park', 'valid_from': '2001-04-01', 'valid_until': '2020-12-31'},
    ],
    'stadium_mlb_rogers_centre': [
        {'alias_name': 'skydome', 'valid_from': '1989-06-01', 'valid_until': '2005-02-01'},
    ],
    'stadium_mlb_loandepot_park': [
        {'alias_name': 'marlins park', 'valid_from': '2012-04-01', 'valid_until': '2021-03-31'},
    ],
    'stadium_mlb_t_mobile_park': [
        {'alias_name': 'safeco field', 'valid_from': '1999-07-01', 'valid_until': '2018-12-31'},
    ],
    'stadium_mlb_oracle_park': [
        {'alias_name': 'att park', 'valid_from': '2006-01-01', 'valid_until': '2019-01-08'},
        {'alias_name': 'sbc park', 'valid_from': '2004-01-01', 'valid_until': '2005-12-31'},
        {'alias_name': 'pac bell park', 'valid_from': '2000-04-01', 'valid_until': '2003-12-31'},
    ],
    'stadium_mlb_globe_life_field': [
        # NOTE(review): "Choctaw Stadium" is the renamed former Globe Life
        # Park (the Rangers' previous ballpark next door), not a prior name
        # of Globe Life Field (opened 2020) — confirm this mapping is intended.
        {'alias_name': 'choctaw stadium', 'valid_from': '2020-01-01'},
    ],

    # NBA
    'stadium_nba_state_farm_arena': [
        {'alias_name': 'philips arena', 'valid_from': '1999-09-01', 'valid_until': '2018-06-25'},
    ],
    'stadium_nba_crypto_com_arena': [
        {'alias_name': 'staples center', 'valid_from': '1999-10-01', 'valid_until': '2021-12-24'},
    ],
    'stadium_nba_kaseya_center': [
        {'alias_name': 'ftx arena', 'valid_from': '2021-06-01', 'valid_until': '2023-03-31'},
        {'alias_name': 'american airlines arena', 'valid_from': '1999-12-01', 'valid_until': '2021-05-31'},
    ],
    'stadium_nba_gainbridge_fieldhouse': [
        {'alias_name': 'bankers life fieldhouse', 'valid_from': '2011-01-01', 'valid_until': '2021-12-31'},
        {'alias_name': 'conseco fieldhouse', 'valid_from': '1999-11-01', 'valid_until': '2010-12-31'},
    ],
    'stadium_nba_rocket_mortgage_fieldhouse': [
        {'alias_name': 'quicken loans arena', 'valid_from': '2005-08-01', 'valid_until': '2019-08-08'},
        {'alias_name': 'gund arena', 'valid_from': '1994-10-01', 'valid_until': '2005-07-31'},
    ],
    'stadium_nba_kia_center': [
        {'alias_name': 'amway center', 'valid_from': '2010-10-01', 'valid_until': '2023-07-12'},
    ],
    'stadium_nba_frost_bank_center': [
        {'alias_name': 'att center', 'valid_from': '2002-10-01', 'valid_until': '2023-10-01'},
    ],
    'stadium_nba_intuit_dome': [
        # New arena opened 2024, Clippers moved from Crypto.com Arena;
        # no historical names yet, entry kept as a placeholder.
    ],
    'stadium_nba_delta_center': [
        {'alias_name': 'vivint arena', 'valid_from': '2020-12-01', 'valid_until': '2023-07-01'},
        {'alias_name': 'vivint smart home arena', 'valid_from': '2015-11-01', 'valid_until': '2020-11-30'},
        {'alias_name': 'energysolutions arena', 'valid_from': '2006-11-01', 'valid_until': '2015-10-31'},
    ],

    # NHL
    'stadium_nhl_amerant_bank_arena': [
        {'alias_name': 'fla live arena', 'valid_from': '2021-10-01', 'valid_until': '2024-05-31'},
        {'alias_name': 'bb&t center', 'valid_from': '2012-06-01', 'valid_until': '2021-09-30'},
        {'alias_name': 'bankatlantic center', 'valid_from': '2005-10-01', 'valid_until': '2012-05-31'},
    ],
    'stadium_nhl_climate_pledge_arena': [
        {'alias_name': 'keyarena', 'valid_from': '1995-01-01', 'valid_until': '2018-10-01'},
        {'alias_name': 'seattle center coliseum', 'valid_from': '1962-01-01', 'valid_until': '1994-12-31'},
    ],
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# SLUG GENERATION
|
||||
# =============================================================================
|
||||
|
||||
def normalize_stadium_name(name: str) -> str:
    """Normalize a stadium name for slug generation and alias matching.

    Lowercases the name, strips parentheticals such as "(IV)", drops every
    character that is not a lowercase letter, digit, or whitespace, and
    collapses whitespace runs into single spaces (trimming the ends).
    """
    lowered = name.lower()
    # Parentheticals (and any space before them) are removed entirely
    without_parens = re.sub(r'\s*\([^)]*\)', '', lowered)
    # Keep only alphanumerics and spaces
    alnum_only = re.sub(r'[^a-z0-9\s]', '', without_parens)
    # split()/join collapses interior whitespace and trims the edges
    return ' '.join(alnum_only.split())
|
||||
|
||||
|
||||
def generate_stadium_slug(name: str) -> str:
    """
    Generate URL-safe slug from stadium name.

    Punctuation is treated as a word separator rather than deleted, so
    "Crypto.com" becomes "crypto_com" instead of "cryptocom". This keeps
    generated canonical IDs aligned with the keys in
    HISTORICAL_STADIUM_ALIASES (e.g. stadium_nba_crypto_com_arena,
    stadium_mlb_t_mobile_park), which the previous punctuation-stripping
    behavior could never produce.

    Examples:
        "State Farm Arena" -> "state_farm_arena"
        "TD Garden" -> "td_garden"
        "Crypto.com Arena" -> "crypto_com_arena"
        "T-Mobile Park" -> "t_mobile_park"
    """
    lowered = name.lower()
    # Drop parentheticals like "(IV)", same as normalize_stadium_name
    lowered = re.sub(r'\s*\([^)]*\)', '', lowered)
    # Any run of non-alphanumeric characters (spaces, '.', '-', '&', ...)
    # collapses to a single underscore; strip leading/trailing separators
    slug = re.sub(r'[^a-z0-9]+', '_', lowered).strip('_')
    # Truncate to 50 chars to keep IDs bounded
    return slug[:50]
|
||||
|
||||
|
||||
def generate_canonical_stadium_id(sport: str, name: str) -> str:
    """
    Build the deterministic canonical ID for a stadium.

    Format: stadium_{sport}_{slug}
    Example: stadium_nba_state_farm_arena
    """
    return "_".join(["stadium", sport.lower(), generate_stadium_slug(name)])
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CANONICALIZATION
|
||||
# =============================================================================
|
||||
|
||||
def canonicalize_stadiums(
    raw_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalStadium], list[StadiumAlias]]:
    """
    Stage 1: Canonicalize stadiums.

    Normalizes names, deduplicates by (sport, normalized_name, city),
    generates deterministic canonical IDs, and records lowercase name
    aliases for later fuzzy matching.

    Args:
        raw_stadiums: List of raw stadium dicts from scraper
        verbose: Print detailed progress

    Returns:
        (canonical_stadiums, aliases)
    """
    stadiums: list = []
    aliases: list = []
    # Maps (sport, normalized_name, lowercased_city) -> canonical_id
    id_by_dedup_key: dict = {}

    for raw in raw_stadiums:
        sport = raw.get('sport', '').upper()
        name = raw.get('name', '')
        city = raw.get('city', '')

        # A stadium record is unusable without both a sport and a name
        if not (sport and name):
            if verbose:
                print(f" Skipping invalid stadium: {raw}")
            continue

        canonical_id = generate_canonical_stadium_id(sport, name)
        normalized_name = normalize_stadium_name(name)
        lowered_name = name.lower().strip()
        dedup_key = (sport, normalized_name, city.lower())

        existing_id = id_by_dedup_key.get(dedup_key)
        if existing_id is not None:
            # Already seen this stadium: keep only an alias for the
            # variant display name (when it adds information)
            if lowered_name != normalized_name:
                aliases.append(StadiumAlias(
                    alias_name=lowered_name,
                    stadium_canonical_id=existing_id
                ))
            if verbose:
                print(f" Duplicate: {name} -> {existing_id}")
            continue

        id_by_dedup_key[dedup_key] = canonical_id

        stadiums.append(CanonicalStadium(
            canonical_id=canonical_id,
            name=name,
            city=city,
            state=raw.get('state', ''),
            latitude=raw.get('latitude', 0.0),
            longitude=raw.get('longitude', 0.0),
            capacity=raw.get('capacity', 0),
            sport=sport,
            primary_team_abbrevs=raw.get('team_abbrevs', []),
            year_opened=raw.get('year_opened')
        ))

        # The lowercased display name always resolves to this stadium
        aliases.append(StadiumAlias(
            alias_name=lowered_name,
            stadium_canonical_id=canonical_id
        ))
        # The fully normalized form too, when it differs
        if normalized_name != lowered_name:
            aliases.append(StadiumAlias(
                alias_name=normalized_name,
                stadium_canonical_id=canonical_id
            ))

        if verbose:
            print(f" {canonical_id}: {name} ({city})")

    return stadiums, aliases
|
||||
|
||||
|
||||
def add_historical_aliases(
    aliases: list[StadiumAlias],
    canonical_ids: set[str]
) -> list[StadiumAlias]:
    """
    Append historical stadium name aliases (renames, sponsorship changes).

    Entries are taken from HISTORICAL_STADIUM_ALIASES; an alias is added
    only when its stadium's canonical ID exists in canonical_ids, so stale
    dictionary keys are silently skipped. Mutates and returns `aliases`.
    """
    for stadium_id, renames in HISTORICAL_STADIUM_ALIASES.items():
        if stadium_id in canonical_ids:
            aliases.extend(
                StadiumAlias(
                    alias_name=entry['alias_name'],
                    stadium_canonical_id=stadium_id,
                    valid_from=entry.get('valid_from'),
                    valid_until=entry.get('valid_until')
                )
                for entry in renames
            )
    return aliases
|
||||
|
||||
|
||||
def deduplicate_aliases(aliases: list[StadiumAlias]) -> list[StadiumAlias]:
    """Drop repeated (alias_name, canonical_id) pairs, keeping first occurrences."""
    unique: list = []
    seen_pairs: set = set()

    for entry in aliases:
        pair = (entry.alias_name, entry.stadium_canonical_id)
        if pair in seen_pairs:
            continue
        seen_pairs.add(pair)
        unique.append(entry)

    return unique
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MAIN
|
||||
# =============================================================================
|
||||
|
||||
def main():
    """CLI entry point: load raw stadiums, canonicalize, and export JSON."""
    parser = argparse.ArgumentParser(description='Canonicalize stadium data')
    parser.add_argument('--input', type=str, default='./data/stadiums.json',
                        help='Input raw stadiums JSON file')
    parser.add_argument('--output', type=str, default='./data',
                        help='Output directory for canonical files')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')
    args = parser.parse_args()

    input_path = Path(args.input)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load raw scraper output
    print(f"Loading raw stadiums from {input_path}...")
    with open(input_path) as f:
        raw_stadiums = json.load(f)
    print(f" Loaded {len(raw_stadiums)} raw stadiums")

    # Canonicalize
    print("\nCanonicalizing stadiums...")
    canonical_stadiums, aliases = canonicalize_stadiums(
        raw_stadiums, verbose=args.verbose
    )
    print(f" Created {len(canonical_stadiums)} canonical stadiums")

    # Historical aliases (only for stadiums we actually produced), then dedupe
    canonical_ids = {stadium.canonical_id for stadium in canonical_stadiums}
    aliases = deduplicate_aliases(add_historical_aliases(aliases, canonical_ids))
    print(f" Created {len(aliases)} stadium aliases")

    # Export both files as pretty-printed JSON
    stadiums_path = output_dir / 'stadiums_canonical.json'
    aliases_path = output_dir / 'stadium_aliases.json'

    with open(stadiums_path, 'w') as f:
        json.dump([asdict(stadium) for stadium in canonical_stadiums], f, indent=2)
    print(f"\nExported stadiums to {stadiums_path}")

    with open(aliases_path, 'w') as f:
        json.dump([asdict(alias) for alias in aliases], f, indent=2)
    print(f"Exported aliases to {aliases_path}")

    # Per-sport counts
    print("\nSummary by sport:")
    counts: dict = {}
    for stadium in canonical_stadiums:
        counts[stadium.sport] = counts.get(stadium.sport, 0) + 1
    for sport, count in sorted(counts.items()):
        print(f" {sport}: {count} stadiums")


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user