Files
Sportstime/Scripts/canonicalize_stadiums.py
Trey t 7efcea7bd4 Add canonical ID pipeline and fix UUID consistency for CloudKit sync
- Add local canonicalization pipeline (stadiums, teams, games) that generates
  deterministic canonical IDs before CloudKit upload
- Fix CanonicalSyncService to use deterministic UUIDs from canonical IDs
  instead of random UUIDs from CloudKit records
- Add SyncStadium/SyncTeam/SyncGame types to CloudKitService that preserve
  canonical ID relationships during sync
- Add canonical ID field keys to CKModels for reading from CloudKit records
- Bundle canonical JSON files (stadiums_canonical, teams_canonical,
  games_canonical, stadium_aliases) for consistent bootstrap data
- Update BootstrapService to prefer canonical format files over legacy format

This ensures all entities use consistent deterministic UUIDs derived from
their canonical IDs, preventing duplicate records when syncing CloudKit
data with bootstrapped local data.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-09 10:30:09 -06:00

394 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Stadium Canonicalization for SportsTime
========================================
Stage 1 of the canonicalization pipeline.
Normalizes stadium data and generates deterministic canonical IDs.
Creates stadium name aliases for fuzzy matching during game resolution.
Usage:
python canonicalize_stadiums.py --input data/stadiums.json --output data/
"""
import argparse
import json
import re
from dataclasses import dataclass, asdict, field
from pathlib import Path
from typing import Optional
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class CanonicalStadium:
    """One stadium record identified by a stable, deterministic canonical ID."""

    # --- identity ---
    canonical_id: str
    name: str

    # --- location ---
    city: str
    state: str
    latitude: float
    longitude: float

    # --- venue details ---
    capacity: int
    sport: str

    # default_factory gives every instance its own list instead of one
    # shared mutable default.
    primary_team_abbrevs: list[str] = field(default_factory=list)
    year_opened: Optional[int] = None
@dataclass
class StadiumAlias:
    """Associates one alternate stadium name with a canonical stadium ID."""

    alias_name: str  # lowercase form of the alternate name
    stadium_canonical_id: str

    # Optional validity window. The historical table in this file supplies
    # 'YYYY-MM-DD' strings; None presumably means that end of the window is
    # open -- confirm against the consumer of the aliases file.
    valid_from: Optional[str] = None
    valid_until: Optional[str] = None
# =============================================================================
# HISTORICAL STADIUM ALIASES
# Known name changes for stadiums (sponsorship changes, renames)
# =============================================================================
HISTORICAL_STADIUM_ALIASES = {
    # Maps canonical stadium ID -> list of former/alternate names, each with
    # an optional 'valid_from' / 'valid_until' date string.
    #
    # Keys MUST equal the ID that generate_canonical_stadium_id() produces
    # for the stadium's current name, otherwise add_historical_aliases()
    # skips the entry. Slug generation deletes punctuation rather than turning
    # it into an underscore, so "Crypto.com Arena" is
    # 'stadium_nba_cryptocom_arena' and "T-Mobile Park" is
    # 'stadium_mlb_tmobile_park'.

    # MLB
    'stadium_mlb_minute_maid_park': [
        {'alias_name': 'daikin park', 'valid_from': '2025-01-01'},
        {'alias_name': 'enron field', 'valid_from': '2000-04-01', 'valid_until': '2002-02-28'},
        {'alias_name': 'astros field', 'valid_from': '2002-03-01', 'valid_until': '2002-06-04'},
    ],
    'stadium_mlb_guaranteed_rate_field': [
        {'alias_name': 'rate field', 'valid_from': '2024-01-01'},
        {'alias_name': 'us cellular field', 'valid_from': '2003-01-01', 'valid_until': '2016-08-24'},
        {'alias_name': 'comiskey park ii', 'valid_from': '1991-04-01', 'valid_until': '2002-12-31'},
        {'alias_name': 'new comiskey park', 'valid_from': '1991-04-01', 'valid_until': '2002-12-31'},
    ],
    'stadium_mlb_truist_park': [
        {'alias_name': 'suntrust park', 'valid_from': '2017-04-01', 'valid_until': '2020-01-13'},
    ],
    'stadium_mlb_progressive_field': [
        {'alias_name': 'jacobs field', 'valid_from': '1994-04-01', 'valid_until': '2008-01-10'},
        {'alias_name': 'the jake', 'valid_from': '1994-04-01', 'valid_until': '2008-01-10'},
    ],
    'stadium_mlb_american_family_field': [
        {'alias_name': 'miller park', 'valid_from': '2001-04-01', 'valid_until': '2020-12-31'},
    ],
    'stadium_mlb_rogers_centre': [
        {'alias_name': 'skydome', 'valid_from': '1989-06-01', 'valid_until': '2005-02-01'},
    ],
    'stadium_mlb_loandepot_park': [
        {'alias_name': 'marlins park', 'valid_from': '2012-04-01', 'valid_until': '2021-03-31'},
    ],
    # Key fixed: the generated slug for "T-Mobile Park" is "tmobile_park".
    'stadium_mlb_tmobile_park': [
        {'alias_name': 'safeco field', 'valid_from': '1999-07-01', 'valid_until': '2018-12-31'},
    ],
    'stadium_mlb_oracle_park': [
        {'alias_name': 'att park', 'valid_from': '2006-01-01', 'valid_until': '2019-01-08'},
        {'alias_name': 'sbc park', 'valid_from': '2004-01-01', 'valid_until': '2005-12-31'},
        {'alias_name': 'pac bell park', 'valid_from': '2000-04-01', 'valid_until': '2003-12-31'},
    ],
    'stadium_mlb_globe_life_field': [
        # NOTE(review): "Choctaw Stadium" appears to be the current name of
        # the Rangers' previous ballpark (Globe Life Park), not an earlier
        # name of Globe Life Field -- confirm this alias is intended.
        {'alias_name': 'choctaw stadium', 'valid_from': '2020-01-01'},  # Globe Life Field opened 2020
    ],

    # NBA
    'stadium_nba_state_farm_arena': [
        {'alias_name': 'philips arena', 'valid_from': '1999-09-01', 'valid_until': '2018-06-25'},
    ],
    # Key fixed: the generated slug for "Crypto.com Arena" is "cryptocom_arena".
    'stadium_nba_cryptocom_arena': [
        {'alias_name': 'staples center', 'valid_from': '1999-10-01', 'valid_until': '2021-12-24'},
    ],
    'stadium_nba_kaseya_center': [
        {'alias_name': 'ftx arena', 'valid_from': '2021-06-01', 'valid_until': '2023-03-31'},
        {'alias_name': 'american airlines arena', 'valid_from': '1999-12-01', 'valid_until': '2021-05-31'},
    ],
    'stadium_nba_gainbridge_fieldhouse': [
        {'alias_name': 'bankers life fieldhouse', 'valid_from': '2011-01-01', 'valid_until': '2021-12-31'},
        {'alias_name': 'conseco fieldhouse', 'valid_from': '1999-11-01', 'valid_until': '2010-12-31'},
    ],
    'stadium_nba_rocket_mortgage_fieldhouse': [
        {'alias_name': 'quicken loans arena', 'valid_from': '2005-08-01', 'valid_until': '2019-08-08'},
        {'alias_name': 'gund arena', 'valid_from': '1994-10-01', 'valid_until': '2005-07-31'},
    ],
    'stadium_nba_kia_center': [
        {'alias_name': 'amway center', 'valid_from': '2010-10-01', 'valid_until': '2023-07-12'},
    ],
    'stadium_nba_frost_bank_center': [
        {'alias_name': 'att center', 'valid_from': '2002-10-01', 'valid_until': '2023-10-01'},
    ],
    'stadium_nba_intuit_dome': [
        # New arena opened 2024, Clippers moved from Crypto.com Arena
    ],
    'stadium_nba_delta_center': [
        {'alias_name': 'vivint arena', 'valid_from': '2020-12-01', 'valid_until': '2023-07-01'},
        {'alias_name': 'vivint smart home arena', 'valid_from': '2015-11-01', 'valid_until': '2020-11-30'},
        {'alias_name': 'energysolutions arena', 'valid_from': '2006-11-01', 'valid_until': '2015-10-31'},
    ],

    # NHL
    'stadium_nhl_amerant_bank_arena': [
        {'alias_name': 'fla live arena', 'valid_from': '2021-10-01', 'valid_until': '2024-05-31'},
        # NOTE(review): this alias keeps its '&' while 'att park' and
        # 'att center' above are stored with the '&' stripped -- confirm
        # which form the alias lookup expects.
        {'alias_name': 'bb&t center', 'valid_from': '2012-06-01', 'valid_until': '2021-09-30'},
        {'alias_name': 'bankatlantic center', 'valid_from': '2005-10-01', 'valid_until': '2012-05-31'},
    ],
    'stadium_nhl_climate_pledge_arena': [
        {'alias_name': 'keyarena', 'valid_from': '1995-01-01', 'valid_until': '2018-10-01'},
        {'alias_name': 'seattle center coliseum', 'valid_from': '1962-01-01', 'valid_until': '1994-12-31'},
    ],
}
# =============================================================================
# SLUG GENERATION
# =============================================================================
def normalize_stadium_name(name: str) -> str:
    """
    Normalize stadium name for slug generation and alias matching.

    Steps, in order:
    - Fold accented letters to their ASCII base letter ("é" -> "e")
    - Lowercase
    - Remove parentheticals like "(IV)"
    - Remove every character that is not a-z, 0-9 or whitespace
      (punctuation is deleted, not replaced: "Crypto.com" -> "cryptocom")
    - Collapse whitespace runs to a single space and trim the ends

    Args:
        name: Display name of the stadium.

    Returns:
        The normalized name; may be empty if nothing survives the filter.
    """
    # Function-scope import keeps this block self-contained.
    import unicodedata

    # Decompose accented characters and drop the combining marks so the base
    # letter survives the ASCII-only filter below instead of vanishing
    # ("Vidéotron" used to become "vidotron"). ASCII input is unaffected.
    decomposed = unicodedata.normalize('NFKD', name)
    folded = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    normalized = folded.lower()
    # Remove parentheticals (and the whitespace directly before them)
    normalized = re.sub(r'\s*\([^)]*\)', '', normalized)
    # Remove special characters except spaces and alphanumeric
    normalized = re.sub(r'[^a-z0-9\s]', '', normalized)
    # Replace multiple spaces with single space
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    return normalized
def generate_stadium_slug(name: str) -> str:
    """
    Build the underscore-separated slug used inside canonical stadium IDs.

    The name is run through normalize_stadium_name(), the spaces in the
    result become underscores, and the slug is capped at 50 characters.

    Examples:
        "State Farm Arena" -> "state_farm_arena"
        "TD Garden" -> "td_garden"
        "Crypto.com Arena" -> "cryptocom_arena"
            (normalization deletes punctuation; it does not become "_")
    """
    max_slug_length = 50
    words = normalize_stadium_name(name).split(' ')
    return '_'.join(words)[:max_slug_length]
def generate_canonical_stadium_id(sport: str, name: str) -> str:
    """
    Compose the deterministic canonical ID for a stadium.

    Layout: stadium_{sport}_{slug}, where the sport is lowercased and the
    slug comes from generate_stadium_slug(name).

    Example: ("NBA", "State Farm Arena") -> "stadium_nba_state_farm_arena"
    """
    name_part = generate_stadium_slug(name)
    sport_part = sport.lower()
    return "stadium_{}_{}".format(sport_part, name_part)
# =============================================================================
# CANONICALIZATION
# =============================================================================
def canonicalize_stadiums(
    raw_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalStadium], list[StadiumAlias]]:
    """
    Stage 1: Canonicalize stadiums.

    For every raw record this:
      1. Reads sport/name/city, treating missing keys and explicit nulls
         the same way (records without a sport or name are skipped)
      2. Generates the canonical ID from (sport, name)
      3. Deduplicates on that canonical ID, so the result never contains
         two stadiums sharing an ID; the first record wins, even if a later
         record spells the city differently
      4. Creates name aliases (lowercased display name, plus the normalized
         name when it differs)

    Args:
        raw_stadiums: List of raw stadium dicts from scraper
        verbose: Print detailed progress

    Returns:
        (canonical_stadiums, aliases)
    """
    canonical_stadiums = []
    aliases = []
    # canonical_id -> lowercased city of the record that first produced it.
    seen_ids = {}

    for raw in raw_stadiums:
        # `or ''` (instead of a .get default) also covers keys that are
        # present but null; a default only applies when the key is missing,
        # and None.upper() would crash.
        sport = (raw.get('sport') or '').upper()
        name = raw.get('name') or ''
        city = raw.get('city') or ''

        if not sport or not name:
            if verbose:
                print(f" Skipping invalid stadium: {raw}")
            continue

        # Generate canonical ID
        canonical_id = generate_canonical_stadium_id(sport, name)
        normalized_name = normalize_stadium_name(name)
        display_alias = name.lower().strip()

        if canonical_id in seen_ids:
            # The canonical ID is the identity of a stadium, so a repeated
            # ID is the same entity. Deduplicating on (sport, name, city)
            # let differing city strings emit two records with one ID.
            # Add as alias if the display name differs
            if display_alias != normalized_name:
                aliases.append(StadiumAlias(
                    alias_name=display_alias,
                    stadium_canonical_id=canonical_id
                ))
            if verbose:
                print(f" Duplicate: {name} -> {canonical_id}")
                if city.lower() != seen_ids[canonical_id]:
                    print(f" City mismatch for {canonical_id}: {city!r}")
            continue

        seen_ids[canonical_id] = city.lower()

        # Create canonical stadium
        canonical = CanonicalStadium(
            canonical_id=canonical_id,
            name=name,
            city=city,
            state=raw.get('state', ''),
            latitude=raw.get('latitude', 0.0),
            longitude=raw.get('longitude', 0.0),
            capacity=raw.get('capacity', 0),
            sport=sport,
            primary_team_abbrevs=raw.get('team_abbrevs', []),
            year_opened=raw.get('year_opened')
        )
        canonical_stadiums.append(canonical)

        # Add primary name as alias (lowercased display name)
        aliases.append(StadiumAlias(
            alias_name=display_alias,
            stadium_canonical_id=canonical_id
        ))
        # Also add normalized version if different
        if normalized_name != display_alias:
            aliases.append(StadiumAlias(
                alias_name=normalized_name,
                stadium_canonical_id=canonical_id
            ))

        if verbose:
            print(f" {canonical_id}: {name} ({city})")

    return canonical_stadiums, aliases
def add_historical_aliases(
    aliases: list[StadiumAlias],
    canonical_ids: set[str]
) -> list[StadiumAlias]:
    """
    Append the known former names from HISTORICAL_STADIUM_ALIASES.

    Table entries whose key is not in canonical_ids are ignored. The
    aliases list is extended in place and the same list object is returned.
    """
    for stadium_id, former_names in HISTORICAL_STADIUM_ALIASES.items():
        if stadium_id in canonical_ids:
            aliases.extend(
                StadiumAlias(
                    alias_name=entry['alias_name'],
                    stadium_canonical_id=stadium_id,
                    valid_from=entry.get('valid_from'),
                    valid_until=entry.get('valid_until'),
                )
                for entry in former_names
            )
    return aliases
def deduplicate_aliases(aliases: list[StadiumAlias]) -> list[StadiumAlias]:
    """
    Return the aliases with repeated (alias_name, canonical ID) pairs removed.

    The first alias seen for each pair is kept and the original order is
    preserved. The validity dates are not part of the comparison, so a later
    entry that differs only in its dates is dropped.
    """
    first_by_pair = {}
    for entry in aliases:
        pair = (entry.alias_name, entry.stadium_canonical_id)
        # setdefault stores only the first entry seen for a pair.
        first_by_pair.setdefault(pair, entry)
    return list(first_by_pair.values())
# =============================================================================
# MAIN
# =============================================================================
def main():
    """
    Command-line entry point.

    Loads the raw stadium JSON given by --input, canonicalizes it, adds the
    historical aliases, deduplicates the aliases and writes
    stadiums_canonical.json and stadium_aliases.json into the --output
    directory (created if missing). Progress and a per-sport summary are
    printed to stdout.
    """
    parser = argparse.ArgumentParser(
        description='Canonicalize stadium data'
    )
    parser.add_argument(
        '--input', type=str, default='./data/stadiums.json',
        help='Input raw stadiums JSON file'
    )
    parser.add_argument(
        '--output', type=str, default='./data',
        help='Output directory for canonical files'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )
    args = parser.parse_args()

    input_path = Path(args.input)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load raw stadiums
    print(f"Loading raw stadiums from {input_path}...")
    # Explicit UTF-8: the default text encoding is platform dependent, so a
    # non-UTF-8 locale could mis-decode stadium names in the input file.
    with open(input_path, encoding='utf-8') as f:
        raw_stadiums = json.load(f)
    print(f" Loaded {len(raw_stadiums)} raw stadiums")

    # Canonicalize
    print("\nCanonicalizing stadiums...")
    canonical_stadiums, aliases = canonicalize_stadiums(
        raw_stadiums, verbose=args.verbose
    )
    print(f" Created {len(canonical_stadiums)} canonical stadiums")

    # Add historical aliases
    canonical_ids = {s.canonical_id for s in canonical_stadiums}
    aliases = add_historical_aliases(aliases, canonical_ids)

    # Deduplicate aliases
    aliases = deduplicate_aliases(aliases)
    print(f" Created {len(aliases)} stadium aliases")

    # Export
    stadiums_path = output_dir / 'stadiums_canonical.json'
    aliases_path = output_dir / 'stadium_aliases.json'

    with open(stadiums_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(s) for s in canonical_stadiums], f, indent=2)
    print(f"\nExported stadiums to {stadiums_path}")

    with open(aliases_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(a) for a in aliases], f, indent=2)
    print(f"Exported aliases to {aliases_path}")

    # Summary by sport
    print("\nSummary by sport:")
    by_sport = {}
    for s in canonical_stadiums:
        by_sport[s.sport] = by_sport.get(s.sport, 0) + 1
    for sport, count in sorted(by_sport.items()):
        print(f" {sport}: {count} stadiums")


if __name__ == '__main__':
    main()