#!/usr/bin/env python3
"""
Stadium Canonicalization for SportsTime
========================================
Stage 1 of the canonicalization pipeline.

Normalizes stadium data and generates deterministic canonical IDs.
Creates stadium name aliases for fuzzy matching during game resolution.

Usage:
    python canonicalize_stadiums.py --input data/stadiums.json --output data/
"""

import argparse
import json
import re
from dataclasses import dataclass, asdict, field
from pathlib import Path
from typing import Optional


# =============================================================================
# DATA CLASSES
# =============================================================================

@dataclass
class CanonicalStadium:
    """A canonicalized stadium with stable ID."""
    canonical_id: str  # "stadium_{sport}_{slug}", see generate_canonical_stadium_id
    name: str          # Display name exactly as it appeared in the raw record
    city: str
    state: str
    latitude: float
    longitude: float
    capacity: int
    sport: str         # Upper-cased by canonicalize_stadiums
    primary_team_abbrevs: list = field(default_factory=list)
    year_opened: Optional[int] = None


@dataclass
class StadiumAlias:
    """Maps an alias name to a canonical stadium ID."""
    alias_name: str  # Normalized (lowercase)
    stadium_canonical_id: str
    valid_from: Optional[str] = None   # 'YYYY-MM-DD' string, or None if unbounded
    valid_until: Optional[str] = None  # 'YYYY-MM-DD' string, or None if unbounded


# =============================================================================
# HISTORICAL STADIUM ALIASES
# Known name changes for stadiums (sponsorship changes, renames)
#
# Keys are canonical IDs as produced by generate_canonical_stadium_id(); an
# entry is only applied when a stadium with that exact ID exists in the input.
# =============================================================================

HISTORICAL_STADIUM_ALIASES = {
    # MLB
    'stadium_mlb_minute_maid_park': [
        {'alias_name': 'daikin park', 'valid_from': '2025-01-01'},
        {'alias_name': 'enron field', 'valid_from': '2000-04-01', 'valid_until': '2002-02-28'},
        {'alias_name': 'astros field', 'valid_from': '2002-03-01', 'valid_until': '2002-06-04'},
    ],
    'stadium_mlb_guaranteed_rate_field': [
        {'alias_name': 'rate field', 'valid_from': '2024-01-01'},
        {'alias_name': 'us cellular field', 'valid_from': '2003-01-01', 'valid_until': '2016-08-24'},
        {'alias_name': 'comiskey park ii', 'valid_from': '1991-04-01', 'valid_until': '2002-12-31'},
        {'alias_name': 'new comiskey park', 'valid_from': '1991-04-01', 'valid_until': '2002-12-31'},
    ],
    'stadium_mlb_truist_park': [
        {'alias_name': 'suntrust park', 'valid_from': '2017-04-01', 'valid_until': '2020-01-13'},
    ],
    'stadium_mlb_progressive_field': [
        {'alias_name': 'jacobs field', 'valid_from': '1994-04-01', 'valid_until': '2008-01-10'},
        {'alias_name': 'the jake', 'valid_from': '1994-04-01', 'valid_until': '2008-01-10'},
    ],
    'stadium_mlb_american_family_field': [
        {'alias_name': 'miller park', 'valid_from': '2001-04-01', 'valid_until': '2020-12-31'},
    ],
    'stadium_mlb_rogers_centre': [
        {'alias_name': 'skydome', 'valid_from': '1989-06-01', 'valid_until': '2005-02-01'},
    ],
    'stadium_mlb_loandepot_park': [
        {'alias_name': 'marlins park', 'valid_from': '2012-04-01', 'valid_until': '2021-03-31'},
    ],
    'stadium_mlb_t_mobile_park': [
        {'alias_name': 'safeco field', 'valid_from': '1999-07-01', 'valid_until': '2018-12-31'},
    ],
    'stadium_mlb_oracle_park': [
        {'alias_name': 'att park', 'valid_from': '2006-01-01', 'valid_until': '2019-01-08'},
        {'alias_name': 'sbc park', 'valid_from': '2004-01-01', 'valid_until': '2005-12-31'},
        {'alias_name': 'pac bell park', 'valid_from': '2000-04-01', 'valid_until': '2003-12-31'},
    ],
    'stadium_mlb_globe_life_field': [
        # NOTE(review): "Choctaw Stadium" appears to be the later name of the
        # older Globe Life Park, which is a different venue from Globe Life
        # Field -- confirm this alias is intended before relying on it.
        {'alias_name': 'choctaw stadium', 'valid_from': '2020-01-01'},  # Globe Life Field opened 2020
    ],

    # NBA
    'stadium_nba_state_farm_arena': [
        {'alias_name': 'philips arena', 'valid_from': '1999-09-01', 'valid_until': '2018-06-25'},
    ],
    'stadium_nba_crypto_com_arena': [
        {'alias_name': 'staples center', 'valid_from': '1999-10-01', 'valid_until': '2021-12-24'},
    ],
    'stadium_nba_kaseya_center': [
        {'alias_name': 'ftx arena', 'valid_from': '2021-06-01', 'valid_until': '2023-03-31'},
        {'alias_name': 'american airlines arena', 'valid_from': '1999-12-01', 'valid_until': '2021-05-31'},
    ],
    'stadium_nba_gainbridge_fieldhouse': [
        {'alias_name': 'bankers life fieldhouse', 'valid_from': '2011-01-01', 'valid_until': '2021-12-31'},
        {'alias_name': 'conseco fieldhouse', 'valid_from': '1999-11-01', 'valid_until': '2010-12-31'},
    ],
    'stadium_nba_rocket_mortgage_fieldhouse': [
        {'alias_name': 'quicken loans arena', 'valid_from': '2005-08-01', 'valid_until': '2019-08-08'},
        {'alias_name': 'gund arena', 'valid_from': '1994-10-01', 'valid_until': '2005-07-31'},
    ],
    'stadium_nba_kia_center': [
        {'alias_name': 'amway center', 'valid_from': '2010-10-01', 'valid_until': '2023-07-12'},
    ],
    'stadium_nba_frost_bank_center': [
        {'alias_name': 'att center', 'valid_from': '2002-10-01', 'valid_until': '2023-10-01'},
    ],
    'stadium_nba_intuit_dome': [
        # New arena opened 2024, Clippers moved from Crypto.com Arena
    ],
    'stadium_nba_delta_center': [
        {'alias_name': 'vivint arena', 'valid_from': '2020-12-01', 'valid_until': '2023-07-01'},
        {'alias_name': 'vivint smart home arena', 'valid_from': '2015-11-01', 'valid_until': '2020-11-30'},
        {'alias_name': 'energysolutions arena', 'valid_from': '2006-11-01', 'valid_until': '2015-10-31'},
    ],

    # NHL
    'stadium_nhl_amerant_bank_arena': [
        {'alias_name': 'fla live arena', 'valid_from': '2021-10-01', 'valid_until': '2024-05-31'},
        {'alias_name': 'bb&t center', 'valid_from': '2012-06-01', 'valid_until': '2021-09-30'},
        {'alias_name': 'bankatlantic center', 'valid_from': '2005-10-01', 'valid_until': '2012-05-31'},
    ],
    'stadium_nhl_climate_pledge_arena': [
        {'alias_name': 'keyarena', 'valid_from': '1995-01-01', 'valid_until': '2018-10-01'},
        {'alias_name': 'seattle center coliseum', 'valid_from': '1962-01-01', 'valid_until': '1994-12-31'},
    ],
}


# =============================================================================
# SLUG GENERATION
# =============================================================================

# Patterns used by normalize_stadium_name(); compiled once at import time.
_PARENTHETICAL_RE = re.compile(r'\s*\([^)]*\)')
_NON_ALNUM_SPACE_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')

# Punctuation that joins two words inside a stadium name and must become a
# word boundary in the slug:
#   * a hyphen between two alphanumerics          ("t-mobile"   -> "t mobile")
#   * a dot between two runs of >= 2 alphanumerics ("crypto.com" -> "crypto com")
# Requiring two characters on each side of a dot keeps initialisms such as
# "u.s." collapsing to "us" rather than "u s".
_SLUG_WORD_JOINER_RE = re.compile(
    r'(?<=[a-z0-9])-(?=[a-z0-9])'
    r'|(?<=[a-z0-9]{2})\.(?=[a-z0-9]{2})'
)


def normalize_stadium_name(name: str) -> str:
    """
    Normalize stadium name for alias matching and deduplication.

    - Lowercase
    - Remove parentheticals like "(IV)"
    - Remove special characters except spaces
    - Collapse multiple spaces

    Example:
        "AT&T Park (IV)" -> "att park"
    """
    normalized = name.lower()

    # Remove parentheticals
    normalized = _PARENTHETICAL_RE.sub('', normalized)

    # Remove special characters except spaces and alphanumeric
    normalized = _NON_ALNUM_SPACE_RE.sub('', normalized)

    # Replace multiple spaces with single space
    return _WHITESPACE_RE.sub(' ', normalized).strip()


def generate_stadium_slug(name: str) -> str:
    """
    Generate URL-safe slug from stadium name.

    Intra-word hyphens and dots are treated as word separators so the slug
    matches the keys used in HISTORICAL_STADIUM_ALIASES; all other
    punctuation is dropped by normalize_stadium_name().

    Examples:
        "State Farm Arena" -> "state_farm_arena"
        "TD Garden" -> "td_garden"
        "Crypto.com Arena" -> "crypto_com_arena"
        "T-Mobile Park" -> "t_mobile_park"
        "U.S. Bank Stadium" -> "us_bank_stadium"
    """
    # Turn word-joining punctuation into spaces *before* normalization strips
    # it, otherwise "crypto.com" would fuse into "cryptocom".
    separated = _SLUG_WORD_JOINER_RE.sub(' ', name.lower())
    normalized = normalize_stadium_name(separated)

    # Replace spaces with underscores
    slug = normalized.replace(' ', '_')

    # Truncate to 50 chars
    return slug[:50]


def generate_canonical_stadium_id(sport: str, name: str) -> str:
    """
    Generate deterministic canonical ID for stadium.

    Format: stadium_{sport}_{slug}
    Example: stadium_nba_state_farm_arena
    """
    slug = generate_stadium_slug(name)
    return f"stadium_{sport.lower()}_{slug}"


# =============================================================================
# CANONICALIZATION
# =============================================================================

def canonicalize_stadiums(
    raw_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalStadium], list[StadiumAlias]]:
    """
    Stage 1: Canonicalize stadiums.

    1. Normalize names and cities
    2. Deduplicate by (sport, normalized_name, city)
    3. Generate canonical IDs
    4. Create name aliases

    Records whose 'sport' or 'name' is missing, empty, or null are skipped.

    Args:
        raw_stadiums: List of raw stadium dicts from scraper
        verbose: Print detailed progress

    Returns:
        (canonical_stadiums, aliases)
    """
    canonical_stadiums = []
    aliases = []
    seen_keys = {}  # (sport, normalized_name, city) -> canonical_id

    for raw in raw_stadiums:
        # `or ''` guards against explicit nulls in the JSON: dict.get() only
        # applies its default when the key is absent, not when it maps to None.
        sport = (raw.get('sport') or '').upper()
        name = raw.get('name') or ''
        city = raw.get('city') or ''

        if not sport or not name:
            if verbose:
                print(f" Skipping invalid stadium: {raw}")
            continue

        # Generate canonical ID
        canonical_id = generate_canonical_stadium_id(sport, name)

        # Deduplication key (same stadium in same city for same sport)
        normalized_name = normalize_stadium_name(name)
        display_alias = name.lower().strip()
        dedup_key = (sport, normalized_name, city.lower())

        if dedup_key in seen_keys:
            existing_canonical_id = seen_keys[dedup_key]

            # Add as alias if the display name differs
            if display_alias != normalized_name:
                aliases.append(StadiumAlias(
                    alias_name=display_alias,
                    stadium_canonical_id=existing_canonical_id
                ))

            if verbose:
                print(f" Duplicate: {name} -> {existing_canonical_id}")
            continue

        seen_keys[dedup_key] = canonical_id

        # Create canonical stadium
        canonical_stadiums.append(CanonicalStadium(
            canonical_id=canonical_id,
            name=name,
            city=city,
            state=raw.get('state', ''),
            latitude=raw.get('latitude', 0.0),
            longitude=raw.get('longitude', 0.0),
            capacity=raw.get('capacity', 0),
            sport=sport,
            primary_team_abbrevs=raw.get('team_abbrevs', []),
            year_opened=raw.get('year_opened')
        ))

        # Add primary name as alias (lowercased, punctuation preserved)
        aliases.append(StadiumAlias(
            alias_name=display_alias,
            stadium_canonical_id=canonical_id
        ))

        # Also add normalized version if different
        if normalized_name != display_alias:
            aliases.append(StadiumAlias(
                alias_name=normalized_name,
                stadium_canonical_id=canonical_id
            ))

        if verbose:
            print(f" {canonical_id}: {name} ({city})")

    return canonical_stadiums, aliases


def add_historical_aliases(
    aliases: list[StadiumAlias],
    canonical_ids: set[str]
) -> list[StadiumAlias]:
    """
    Add historical stadium name aliases.

    Only adds aliases for stadiums that exist in canonical_ids.

    Note: the entries are appended to the ``aliases`` list passed in, and
    that same list object is returned.
    """
    for canonical_id, historical in HISTORICAL_STADIUM_ALIASES.items():
        if canonical_id not in canonical_ids:
            continue

        aliases.extend(
            StadiumAlias(
                alias_name=hist['alias_name'],
                stadium_canonical_id=canonical_id,
                valid_from=hist.get('valid_from'),
                valid_until=hist.get('valid_until')
            )
            for hist in historical
        )

    return aliases


def deduplicate_aliases(aliases: list[StadiumAlias]) -> list[StadiumAlias]:
    """
    Remove duplicate aliases (same alias_name -> same canonical_id).

    The first occurrence of each (alias_name, canonical_id) pair is kept and
    input order is preserved; validity dates are not part of the key.
    """
    seen = set()
    deduped = []

    for alias in aliases:
        key = (alias.alias_name, alias.stadium_canonical_id)
        if key not in seen:
            seen.add(key)
            deduped.append(alias)

    return deduped


# =============================================================================
# MAIN
# =============================================================================

def main():
    """Command-line entry point: load raw stadiums, canonicalize, and export JSON."""
    parser = argparse.ArgumentParser(
        description='Canonicalize stadium data'
    )
    parser.add_argument(
        '--input',
        type=str,
        default='./data/stadiums.json',
        help='Input raw stadiums JSON file'
    )
    parser.add_argument(
        '--output',
        type=str,
        default='./data',
        help='Output directory for canonical files'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Verbose output'
    )

    args = parser.parse_args()

    input_path = Path(args.input)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load raw stadiums. The encoding is explicit so non-ASCII stadium names
    # decode the same way regardless of the platform's default locale.
    print(f"Loading raw stadiums from {input_path}...")
    with open(input_path, encoding='utf-8') as f:
        raw_stadiums = json.load(f)
    print(f" Loaded {len(raw_stadiums)} raw stadiums")

    # Canonicalize
    print("\nCanonicalizing stadiums...")
    canonical_stadiums, aliases = canonicalize_stadiums(
        raw_stadiums, verbose=args.verbose
    )
    print(f" Created {len(canonical_stadiums)} canonical stadiums")

    # Add historical aliases
    canonical_ids = {s.canonical_id for s in canonical_stadiums}
    aliases = add_historical_aliases(aliases, canonical_ids)

    # Deduplicate aliases
    aliases = deduplicate_aliases(aliases)
    print(f" Created {len(aliases)} stadium aliases")

    # Export
    stadiums_path = output_dir / 'stadiums_canonical.json'
    aliases_path = output_dir / 'stadium_aliases.json'

    with open(stadiums_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(s) for s in canonical_stadiums], f, indent=2)
    print(f"\nExported stadiums to {stadiums_path}")

    with open(aliases_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(a) for a in aliases], f, indent=2)
    print(f"Exported aliases to {aliases_path}")

    # Summary by sport
    print("\nSummary by sport:")
    by_sport = {}
    for s in canonical_stadiums:
        by_sport[s.sport] = by_sport.get(s.sport, 0) + 1
    for sport, count in sorted(by_sport.items()):
        print(f" {sport}: {count} stadiums")


if __name__ == '__main__':
    main()