Files
Sportstime/Scripts/canonicalize_stadiums.py
Trey t b6a913df1d feat(03-02): add MLS to canonicalization pipeline
- Import MLS_TEAMS from mls module
- Add MLS_DIVISIONS dict (Eastern/Western conferences)
- Add MLS to sport_mappings for team canonicalization
- Add MLS team abbreviation aliases (LA, NYC, RBNY, etc.)
- Add MLS stadium historical aliases (BMO, PayPal Park, Shell Energy, etc.)

Total teams: 154 (30 MLS teams added)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 09:40:39 -06:00

490 lines
19 KiB
Python

#!/usr/bin/env python3
"""
Stadium Canonicalization for SportsTime
========================================
Stage 1 of the canonicalization pipeline.
Normalizes stadium data and generates deterministic canonical IDs.
Creates stadium name aliases for fuzzy matching during game resolution.
Usage:
python canonicalize_stadiums.py --input data/stadiums.json --output data/
"""
import argparse
import json
import re
from dataclasses import dataclass, asdict, field
from pathlib import Path
from typing import Optional
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class CanonicalStadium:
    """One deduplicated stadium record identified by a deterministic ID.

    Instances are built by canonicalize_stadiums() and serialized with
    dataclasses.asdict(). Field order is part of the positional
    constructor signature, so keep it stable.
    """
    # Identifier of the form "stadium_{sport}_{slug}".
    canonical_id: str
    # Display name, stored exactly as it appeared in the raw input.
    name: str
    city: str
    state: str
    latitude: float
    longitude: float
    capacity: int
    # League code; canonicalize_stadiums() stores it upper-cased.
    sport: str
    # default_factory gives every instance its own list object.
    primary_team_abbrevs: list = field(default_factory=list)
    year_opened: Optional[int] = None
@dataclass
class StadiumAlias:
    """Associates one alternate stadium name with a canonical stadium ID.

    The validity fields are optional; aliases generated from raw input
    leave both as None, while historical aliases carry date strings.
    """
    # Lower-cased name used for lookups.
    alias_name: str
    stadium_canonical_id: str
    # Date strings such as '2021-06-01' when the name applied; None means
    # unbounded on that side.
    # NOTE(review): whether the bounds are inclusive is not defined here.
    valid_from: Optional[str] = None
    valid_until: Optional[str] = None
# =============================================================================
# HISTORICAL STADIUM ALIASES
# Known name changes for stadiums (sponsorship changes, renames)
# =============================================================================
# Keys MUST equal the ID that generate_canonical_stadium_id() produces for the
# stadium's current name, otherwise add_historical_aliases() silently skips
# the entry. The slug rule deletes punctuation instead of replacing it with
# "_" ("T-Mobile Park" -> "tmobile_park", "Crypto.com Arena" ->
# "cryptocom_arena", "AT&T Stadium" -> "att_stadium").
#
# Each value is a list of dicts with a lowercase 'alias_name' and optional
# 'valid_from' / 'valid_until' date strings (YYYY-MM-DD).
HISTORICAL_STADIUM_ALIASES = {
    # MLB
    'stadium_mlb_minute_maid_park': [
        {'alias_name': 'daikin park', 'valid_from': '2025-01-01'},
        {'alias_name': 'enron field', 'valid_from': '2000-04-01', 'valid_until': '2002-02-28'},
        {'alias_name': 'astros field', 'valid_from': '2002-03-01', 'valid_until': '2002-06-04'},
    ],
    'stadium_mlb_guaranteed_rate_field': [
        {'alias_name': 'rate field', 'valid_from': '2024-01-01'},
        {'alias_name': 'us cellular field', 'valid_from': '2003-01-01', 'valid_until': '2016-08-24'},
        {'alias_name': 'comiskey park ii', 'valid_from': '1991-04-01', 'valid_until': '2002-12-31'},
        {'alias_name': 'new comiskey park', 'valid_from': '1991-04-01', 'valid_until': '2002-12-31'},
    ],
    'stadium_mlb_truist_park': [
        {'alias_name': 'suntrust park', 'valid_from': '2017-04-01', 'valid_until': '2020-01-13'},
    ],
    'stadium_mlb_progressive_field': [
        {'alias_name': 'jacobs field', 'valid_from': '1994-04-01', 'valid_until': '2008-01-10'},
        {'alias_name': 'the jake', 'valid_from': '1994-04-01', 'valid_until': '2008-01-10'},
    ],
    'stadium_mlb_american_family_field': [
        {'alias_name': 'miller park', 'valid_from': '2001-04-01', 'valid_until': '2020-12-31'},
    ],
    'stadium_mlb_rogers_centre': [
        {'alias_name': 'skydome', 'valid_from': '1989-06-01', 'valid_until': '2005-02-01'},
    ],
    'stadium_mlb_loandepot_park': [
        {'alias_name': 'marlins park', 'valid_from': '2012-04-01', 'valid_until': '2021-03-31'},
    ],
    # Was 'stadium_mlb_t_mobile_park', which the slug rule cannot produce
    # (the hyphen is deleted, not turned into an underscore).
    'stadium_mlb_tmobile_park': [
        {'alias_name': 'safeco field', 'valid_from': '1999-07-01', 'valid_until': '2018-12-31'},
    ],
    'stadium_mlb_oracle_park': [
        {'alias_name': 'att park', 'valid_from': '2006-01-01', 'valid_until': '2019-01-08'},
        {'alias_name': 'sbc park', 'valid_from': '2004-01-01', 'valid_until': '2005-12-31'},
        {'alias_name': 'pac bell park', 'valid_from': '2000-04-01', 'valid_until': '2003-12-31'},
    ],
    'stadium_mlb_globe_life_field': [
        # NOTE(review): "Choctaw Stadium" appears to be the current name of the
        # older Globe Life Park, a different venue - confirm this alias is
        # really meant to resolve to Globe Life Field.
        {'alias_name': 'choctaw stadium', 'valid_from': '2020-01-01'},  # Globe Life Field opened 2020
    ],
    # NBA
    'stadium_nba_state_farm_arena': [
        {'alias_name': 'philips arena', 'valid_from': '1999-09-01', 'valid_until': '2018-06-25'},
    ],
    # Was 'stadium_nba_crypto_com_arena', which the slug rule cannot produce
    # (the period is deleted, not turned into an underscore).
    'stadium_nba_cryptocom_arena': [
        {'alias_name': 'staples center', 'valid_from': '1999-10-01', 'valid_until': '2021-12-24'},
    ],
    'stadium_nba_kaseya_center': [
        {'alias_name': 'ftx arena', 'valid_from': '2021-06-01', 'valid_until': '2023-03-31'},
        {'alias_name': 'american airlines arena', 'valid_from': '1999-12-01', 'valid_until': '2021-05-31'},
    ],
    'stadium_nba_gainbridge_fieldhouse': [
        {'alias_name': 'bankers life fieldhouse', 'valid_from': '2011-01-01', 'valid_until': '2021-12-31'},
        {'alias_name': 'conseco fieldhouse', 'valid_from': '1999-11-01', 'valid_until': '2010-12-31'},
    ],
    'stadium_nba_rocket_mortgage_fieldhouse': [
        {'alias_name': 'quicken loans arena', 'valid_from': '2005-08-01', 'valid_until': '2019-08-08'},
        {'alias_name': 'gund arena', 'valid_from': '1994-10-01', 'valid_until': '2005-07-31'},
    ],
    'stadium_nba_kia_center': [
        {'alias_name': 'amway center', 'valid_from': '2010-10-01', 'valid_until': '2023-07-12'},
    ],
    'stadium_nba_frost_bank_center': [
        {'alias_name': 'att center', 'valid_from': '2002-10-01', 'valid_until': '2023-10-01'},
    ],
    'stadium_nba_intuit_dome': [
        # New arena opened 2024, Clippers moved from Crypto.com Arena
    ],
    'stadium_nba_delta_center': [
        {'alias_name': 'vivint arena', 'valid_from': '2020-12-01', 'valid_until': '2023-07-01'},
        {'alias_name': 'vivint smart home arena', 'valid_from': '2015-11-01', 'valid_until': '2020-11-30'},
        {'alias_name': 'energysolutions arena', 'valid_from': '2006-11-01', 'valid_until': '2015-10-31'},
    ],
    # NHL
    'stadium_nhl_amerant_bank_arena': [
        {'alias_name': 'fla live arena', 'valid_from': '2021-10-01', 'valid_until': '2024-05-31'},
        {'alias_name': 'bb&t center', 'valid_from': '2012-06-01', 'valid_until': '2021-09-30'},
        {'alias_name': 'bankatlantic center', 'valid_from': '2005-10-01', 'valid_until': '2012-05-31'},
    ],
    'stadium_nhl_climate_pledge_arena': [
        {'alias_name': 'keyarena', 'valid_from': '1995-01-01', 'valid_until': '2018-10-01'},
        {'alias_name': 'seattle center coliseum', 'valid_from': '1962-01-01', 'valid_until': '1994-12-31'},
    ],
    # NFL
    'stadium_nfl_sofi_stadium': [
        # SoFi Stadium opened 2020, no prior name
    ],
    'stadium_nfl_allegiant_stadium': [
        # Allegiant Stadium opened 2020, no prior name (Raiders moved from Oakland Coliseum)
    ],
    'stadium_nfl_caesars_superdome': [
        {'alias_name': 'mercedes-benz superdome', 'valid_from': '2011-10-01', 'valid_until': '2021-07-01'},
        {'alias_name': 'louisiana superdome', 'valid_from': '1975-08-01', 'valid_until': '2011-09-30'},
        {'alias_name': 'superdome', 'valid_from': '1975-08-01'},
    ],
    'stadium_nfl_paycor_stadium': [
        {'alias_name': 'paul brown stadium', 'valid_from': '2000-08-01', 'valid_until': '2022-09-05'},
    ],
    'stadium_nfl_empower_field_at_mile_high': [
        {'alias_name': 'broncos stadium at mile high', 'valid_from': '2018-09-01', 'valid_until': '2019-08-31'},
        {'alias_name': 'sports authority field at mile high', 'valid_from': '2011-08-01', 'valid_until': '2018-08-31'},
        {'alias_name': 'invesco field at mile high', 'valid_from': '2001-09-01', 'valid_until': '2011-07-31'},
        {'alias_name': 'mile high stadium', 'valid_from': '1960-01-01', 'valid_until': '2001-08-31'},
    ],
    'stadium_nfl_acrisure_stadium': [
        {'alias_name': 'heinz field', 'valid_from': '2001-08-01', 'valid_until': '2022-07-10'},
    ],
    'stadium_nfl_everbank_stadium': [
        {'alias_name': 'tiaa bank field', 'valid_from': '2018-01-01', 'valid_until': '2023-03-31'},
        {'alias_name': 'everbank field', 'valid_from': '2014-01-01', 'valid_until': '2017-12-31'},
        {'alias_name': 'alltel stadium', 'valid_from': '1997-06-01', 'valid_until': '2006-12-31'},
        {'alias_name': 'jacksonville municipal stadium', 'valid_from': '1995-08-01', 'valid_until': '1997-05-31'},
    ],
    'stadium_nfl_northwest_stadium': [
        {'alias_name': 'fedexfield', 'valid_from': '1999-11-01', 'valid_until': '2025-01-01'},
        {'alias_name': 'fedex field', 'valid_from': '1999-11-01', 'valid_until': '2025-01-01'},
        {'alias_name': 'jack kent cooke stadium', 'valid_from': '1997-09-01', 'valid_until': '1999-10-31'},
    ],
    'stadium_nfl_hard_rock_stadium': [
        {'alias_name': 'sun life stadium', 'valid_from': '2010-01-01', 'valid_until': '2016-07-31'},
        {'alias_name': 'land shark stadium', 'valid_from': '2009-01-01', 'valid_until': '2009-12-31'},
        {'alias_name': 'dolphin stadium', 'valid_from': '2005-01-01', 'valid_until': '2008-12-31'},
        {'alias_name': 'pro player stadium', 'valid_from': '1996-04-01', 'valid_until': '2004-12-31'},
        {'alias_name': 'joe robbie stadium', 'valid_from': '1987-08-01', 'valid_until': '1996-03-31'},
    ],
    'stadium_nfl_highmark_stadium': [
        {'alias_name': 'bills stadium', 'valid_from': '2020-03-01', 'valid_until': '2021-03-31'},
        {'alias_name': 'new era field', 'valid_from': '2016-08-01', 'valid_until': '2020-02-29'},
        {'alias_name': 'ralph wilson stadium', 'valid_from': '1998-08-01', 'valid_until': '2016-07-31'},
        {'alias_name': 'rich stadium', 'valid_from': '1973-08-01', 'valid_until': '1998-07-31'},
    ],
    'stadium_nfl_geha_field_at_arrowhead_stadium': [
        {'alias_name': 'arrowhead stadium', 'valid_from': '1972-08-01'},
    ],
    'stadium_nfl_att_stadium': [
        {'alias_name': 'cowboys stadium', 'valid_from': '2009-05-01', 'valid_until': '2013-07-24'},
    ],
    'stadium_nfl_us_bank_stadium': [
        # Opened 2016, no prior name (Vikings moved from Metrodome)
    ],
    'stadium_nfl_lumen_field': [
        {'alias_name': 'centurylink field', 'valid_from': '2011-06-01', 'valid_until': '2020-11-18'},
        {'alias_name': 'qwest field', 'valid_from': '2004-06-01', 'valid_until': '2011-05-31'},
        {'alias_name': 'seahawks stadium', 'valid_from': '2002-07-01', 'valid_until': '2004-05-31'},
    ],
    # MLS
    'stadium_mls_bmo_stadium': [
        {'alias_name': 'banc of california stadium', 'valid_from': '2018-04-01', 'valid_until': '2023-06-01'},
    ],
    'stadium_mls_paypal_park': [
        {'alias_name': 'earthquakes stadium', 'valid_from': '2015-03-01', 'valid_until': '2020-12-31'},
        {'alias_name': 'avaya stadium', 'valid_from': '2015-03-01', 'valid_until': '2020-12-31'},
    ],
    'stadium_mls_shell_energy_stadium': [
        {'alias_name': 'pnc stadium', 'valid_from': '2021-03-01', 'valid_until': '2023-03-01'},
        {'alias_name': 'bbva stadium', 'valid_from': '2019-01-01', 'valid_until': '2021-02-28'},
        {'alias_name': 'bbva compass stadium', 'valid_from': '2012-05-01', 'valid_until': '2018-12-31'},
    ],
    'stadium_mls_dignity_health_sports_park': [
        {'alias_name': 'stubhub center', 'valid_from': '2013-06-01', 'valid_until': '2019-01-31'},
        {'alias_name': 'home depot center', 'valid_from': '2003-06-01', 'valid_until': '2013-05-31'},
    ],
    # NOTE(review): this key only matches if the raw name spells out "and"
    # (e.g. "Inter and Co Stadium"); a raw name of "Inter&Co Stadium" would
    # generate 'stadium_mls_interco_stadium'. Verify against the MLS data.
    'stadium_mls_interandco_stadium': [
        {'alias_name': 'exploria stadium', 'valid_from': '2017-03-01', 'valid_until': '2023-07-01'},
        {'alias_name': 'orlando city stadium', 'valid_from': '2017-03-01', 'valid_until': '2019-01-01'},
    ],
    'stadium_mls_chase_stadium': [
        {'alias_name': 'drv pnk stadium', 'valid_from': '2020-07-01', 'valid_until': '2024-01-01'},
        {'alias_name': 'inter miami cf stadium', 'valid_from': '2020-07-01', 'valid_until': '2020-09-01'},
    ],
    'stadium_mls_america_first_field': [
        {'alias_name': 'rio tinto stadium', 'valid_from': '2008-10-01', 'valid_until': '2021-08-01'},
    ],
    'stadium_mls_lowercom_field': [
        {'alias_name': 'lower.com field', 'valid_from': '2021-07-01'},  # Current name with period
        # NOTE(review): valid_from equals valid_until, so this window is
        # empty or a single day - confirm the intended dates.
        {'alias_name': 'new crew stadium', 'valid_from': '2021-07-01', 'valid_until': '2021-07-01'},
    ],
}
# =============================================================================
# SLUG GENERATION
# =============================================================================
def normalize_stadium_name(name: str) -> str:
    """
    Reduce a stadium name to the plain form used for slugs and dedup keys.

    The name is lower-cased and then passed through three substitutions,
    in order:
      1. drop parenthesised suffixes such as "(IV)" with their leading space
      2. delete every character that is not a-z, 0-9 or whitespace
         (punctuation is removed, not replaced: "T-Mobile" -> "tmobile")
      3. collapse whitespace runs to a single space
    Leading and trailing whitespace is trimmed from the result.
    """
    substitutions = (
        (r'\s*\([^)]*\)', ''),
        (r'[^a-z0-9\s]', ''),
        (r'\s+', ' '),
    )
    cleaned = name.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def generate_stadium_slug(name: str) -> str:
    """
    Build an underscore-separated slug from a stadium name.

    The name is normalized with normalize_stadium_name(), words are joined
    with "_", and the result is capped at 50 characters.

    Examples:
        "State Farm Arena" -> "state_farm_arena"
        "TD Garden" -> "td_garden"
        "Crypto.com Arena" -> "cryptocom_arena"  (punctuation is deleted)
    """
    # The normalized name contains only single spaces, so splitting on ' '
    # and re-joining is the same as replacing each space with '_'.
    words = normalize_stadium_name(name).split(' ')
    return '_'.join(words)[:50]
def generate_canonical_stadium_id(sport: str, name: str) -> str:
    """
    Compose the deterministic stadium ID "stadium_{sport}_{slug}".

    The sport is lower-cased and the slug comes from generate_stadium_slug().

    Example: ("NBA", "State Farm Arena") -> "stadium_nba_state_farm_arena"
    """
    return '_'.join(('stadium', sport.lower(), generate_stadium_slug(name)))
# =============================================================================
# CANONICALIZATION
# =============================================================================
def canonicalize_stadiums(
    raw_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalStadium], list[StadiumAlias]]:
    """
    Stage 1: Canonicalize stadiums.

    1. Normalize names and cities
    2. Deduplicate by (sport, normalized_name, city)
    3. Generate canonical IDs
    4. Create name aliases

    Records without a sport or a name are skipped. The first record seen
    for a dedup key becomes the canonical stadium; later records with the
    same key only contribute a name alias (when their lower-cased name
    differs from the normalized name).

    Args:
        raw_stadiums: List of raw stadium dicts from scraper
        verbose: Print detailed progress

    Returns:
        (canonical_stadiums, aliases)
    """
    canonical_stadiums = []
    aliases = []
    seen_keys = {}  # (sport, normalized_name, city) -> canonical_id

    for raw in raw_stadiums:
        # `or ''` instead of a .get() default: the default only applies to a
        # missing key, so an explicit JSON null would reach .upper()/.lower()
        # as None and raise AttributeError.
        sport = (raw.get('sport') or '').upper()
        name = raw.get('name') or ''
        city = raw.get('city') or ''

        if not sport or not name:
            if verbose:
                print(f" Skipping invalid stadium: {raw}")
            continue

        # Generate canonical ID
        canonical_id = generate_canonical_stadium_id(sport, name)

        # Deduplication key (same stadium in same city for same sport)
        normalized_name = normalize_stadium_name(name)
        lowered_name = name.lower().strip()
        dedup_key = (sport, normalized_name, city.lower())

        if dedup_key in seen_keys:
            existing_canonical_id = seen_keys[dedup_key]
            # Add as alias if the display name differs
            if lowered_name != normalized_name:
                aliases.append(StadiumAlias(
                    alias_name=lowered_name,
                    stadium_canonical_id=existing_canonical_id
                ))
            if verbose:
                print(f" Duplicate: {name} -> {existing_canonical_id}")
            continue

        seen_keys[dedup_key] = canonical_id

        # Create canonical stadium
        canonical_stadiums.append(CanonicalStadium(
            canonical_id=canonical_id,
            name=name,
            city=city,
            state=raw.get('state', ''),
            latitude=raw.get('latitude', 0.0),
            longitude=raw.get('longitude', 0.0),
            capacity=raw.get('capacity', 0),
            sport=sport,
            # A null team list becomes an empty list, matching the field type.
            primary_team_abbrevs=raw.get('team_abbrevs') or [],
            year_opened=raw.get('year_opened')
        ))

        # Add primary name as alias (lower-cased display name)
        aliases.append(StadiumAlias(
            alias_name=lowered_name,
            stadium_canonical_id=canonical_id
        ))
        # Also add normalized version if different
        if normalized_name != lowered_name:
            aliases.append(StadiumAlias(
                alias_name=normalized_name,
                stadium_canonical_id=canonical_id
            ))

        if verbose:
            print(f" {canonical_id}: {name} ({city})")

    return canonical_stadiums, aliases
def add_historical_aliases(
    aliases: list[StadiumAlias],
    canonical_ids: set[str]
) -> list[StadiumAlias]:
    """
    Append the known former names from HISTORICAL_STADIUM_ALIASES.

    Entries whose key is not present in canonical_ids are skipped.
    The given list is extended in place and also returned.
    """
    for stadium_id, former_names in HISTORICAL_STADIUM_ALIASES.items():
        if stadium_id in canonical_ids:
            aliases.extend(
                StadiumAlias(
                    alias_name=entry['alias_name'],
                    stadium_canonical_id=stadium_id,
                    valid_from=entry.get('valid_from'),
                    valid_until=entry.get('valid_until')
                )
                for entry in former_names
            )
    return aliases
def deduplicate_aliases(aliases: list[StadiumAlias]) -> list[StadiumAlias]:
    """
    Return a new list with repeated (alias_name, canonical_id) pairs removed.

    The first occurrence of each pair is kept and input order is preserved.
    Validity dates are not part of the comparison.
    """
    first_by_pair = {}
    for entry in aliases:
        # setdefault leaves an existing entry untouched, so the first wins;
        # dicts keep insertion order, so the output order matches the input.
        first_by_pair.setdefault(
            (entry.alias_name, entry.stadium_canonical_id), entry
        )
    return list(first_by_pair.values())
# =============================================================================
# MAIN
# =============================================================================
def main():
    """
    Command-line entry point.

    Reads the raw stadium JSON given by --input, canonicalizes it, adds the
    historical aliases, removes duplicate aliases, and writes
    stadiums_canonical.json and stadium_aliases.json into the --output
    directory (created if missing). A per-sport count is printed at the end.
    """
    parser = argparse.ArgumentParser(
        description='Canonicalize stadium data'
    )
    parser.add_argument(
        '--input', type=str, default='./data/stadiums.json',
        help='Input raw stadiums JSON file'
    )
    parser.add_argument(
        '--output', type=str, default='./data',
        help='Output directory for canonical files'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )
    args = parser.parse_args()

    input_path = Path(args.input)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load raw stadiums
    print(f"Loading raw stadiums from {input_path}...")
    # JSON is UTF-8 by specification; open() would otherwise use the
    # locale's default encoding, which can mis-decode non-ASCII names.
    with open(input_path, encoding='utf-8') as f:
        raw_stadiums = json.load(f)
    print(f" Loaded {len(raw_stadiums)} raw stadiums")

    # Canonicalize
    print("\nCanonicalizing stadiums...")
    canonical_stadiums, aliases = canonicalize_stadiums(
        raw_stadiums, verbose=args.verbose
    )
    print(f" Created {len(canonical_stadiums)} canonical stadiums")

    # Add historical aliases
    canonical_ids = {s.canonical_id for s in canonical_stadiums}
    aliases = add_historical_aliases(aliases, canonical_ids)

    # Deduplicate aliases
    aliases = deduplicate_aliases(aliases)
    print(f" Created {len(aliases)} stadium aliases")

    # Export
    stadiums_path = output_dir / 'stadiums_canonical.json'
    aliases_path = output_dir / 'stadium_aliases.json'

    with open(stadiums_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(s) for s in canonical_stadiums], f, indent=2)
    print(f"\nExported stadiums to {stadiums_path}")

    with open(aliases_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(a) for a in aliases], f, indent=2)
    print(f"Exported aliases to {aliases_path}")

    # Summary by sport
    print("\nSummary by sport:")
    by_sport = {}
    for s in canonical_stadiums:
        by_sport[s.sport] = by_sport.get(s.sport, 0) + 1
    for sport, count in sorted(by_sport.items()):
        print(f" {sport}: {count} stadiums")


if __name__ == '__main__':
    main()