Add canonical ID pipeline and fix UUID consistency for CloudKit sync

- Add local canonicalization pipeline (stadiums, teams, games) that generates
  deterministic canonical IDs before CloudKit upload
- Fix CanonicalSyncService to use deterministic UUIDs from canonical IDs
  instead of random UUIDs from CloudKit records
- Add SyncStadium/SyncTeam/SyncGame types to CloudKitService that preserve
  canonical ID relationships during sync
- Add canonical ID field keys to CKModels for reading from CloudKit records
- Bundle canonical JSON files (stadiums_canonical, teams_canonical,
  games_canonical, stadium_aliases) for consistent bootstrap data
- Update BootstrapService to prefer canonical format files over legacy format

This ensures all entities use consistent deterministic UUIDs derived from
their canonical IDs, preventing duplicate records when syncing CloudKit
data with bootstrapped local data.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-09 10:30:09 -06:00
parent 1ee47df53e
commit 7efcea7bd4
31 changed files with 128868 additions and 282 deletions

View File

@@ -0,0 +1,462 @@
#!/usr/bin/env python3
"""
Game Canonicalization for SportsTime
====================================
Stage 3 of the canonicalization pipeline.
Resolves team and stadium references in games, generates canonical game IDs.
Usage:
python canonicalize_games.py --games data/games.json --teams data/teams_canonical.json \
--aliases data/stadium_aliases.json --output data/
"""
import argparse
import json
from collections import defaultdict
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class CanonicalGame:
    """A canonicalized game with stable ID and resolved references."""
    # Deterministic ID; canonicalize_games() fills it from
    # generate_canonical_game_id().
    canonical_id: str
    # Sport code, upper-cased by canonicalize_games() before it is stored.
    sport: str
    # Season label exactly as it appears in the raw game record.
    season: str
    date: str  # YYYY-MM-DD
    # Copied from the raw record's 'time' key; None when the key is absent.
    time: Optional[str]
    # Canonical IDs of the resolved team records.
    home_team_canonical_id: str
    away_team_canonical_id: str
    # Canonical stadium ID, or a 'stadium_unknown_*' placeholder when the
    # venue could not be resolved.
    stadium_canonical_id: str
    is_playoff: bool = False
    broadcast: Optional[str] = None
@dataclass
class ResolutionWarning:
    """Warning about a resolution issue."""
    # "{date}_{away_abbrev}_{home_abbrev}" built from the raw game record.
    game_key: str
    # Short category label; main() groups warnings by this value.
    issue: str
    # Human-readable explanation of what could not be resolved.
    details: str
# =============================================================================
# TEAM ABBREVIATION ALIASES
# Maps alternative abbreviations to canonical team IDs
# =============================================================================
# Keys are (sport, abbreviation) tuples in upper case, the same shape as the
# key resolve_team() builds; values are canonical team IDs.
# resolve_team() consults this table only after a direct abbreviation lookup
# has failed, and ignores an entry whose target ID is not a known team.
TEAM_ABBREV_ALIASES = {
    # NBA
    ('NBA', 'PHX'): 'team_nba_pho',  # Phoenix
    ('NBA', 'BKN'): 'team_nba_brk',  # Brooklyn
    ('NBA', 'CHA'): 'team_nba_cho',  # Charlotte (older abbrev)
    ('NBA', 'NOP'): 'team_nba_nop',  # New Orleans
    ('NBA', 'NO'): 'team_nba_nop',  # New Orleans alt
    ('NBA', 'NY'): 'team_nba_nyk',  # New York
    ('NBA', 'SA'): 'team_nba_sas',  # San Antonio
    ('NBA', 'GS'): 'team_nba_gsw',  # Golden State
    ('NBA', 'UTAH'): 'team_nba_uta',  # Utah
    # MLB
    ('MLB', 'AZ'): 'team_mlb_ari',  # Arizona
    ('MLB', 'CWS'): 'team_mlb_chw',  # Chicago White Sox
    ('MLB', 'KC'): 'team_mlb_kcr',  # Kansas City
    ('MLB', 'SD'): 'team_mlb_sdp',  # San Diego
    ('MLB', 'SF'): 'team_mlb_sfg',  # San Francisco
    ('MLB', 'TB'): 'team_mlb_tbr',  # Tampa Bay
    ('MLB', 'WSH'): 'team_mlb_wsn',  # Washington
    ('MLB', 'WAS'): 'team_mlb_wsn',  # Washington alt
    ('MLB', 'LA'): 'team_mlb_lad',  # Los Angeles Dodgers
    ('MLB', 'ATH'): 'team_mlb_oak',  # Oakland Athletics
    # NHL
    ('NHL', 'ARI'): 'team_nhl_ari',  # Arizona/Utah
    ('NHL', 'UTA'): 'team_nhl_ari',  # Utah Hockey Club (uses ARI code)
    ('NHL', 'VGS'): 'team_nhl_vgk',  # Vegas
    ('NHL', 'TB'): 'team_nhl_tbl',  # Tampa Bay Lightning
    ('NHL', 'NJ'): 'team_nhl_njd',  # New Jersey
    ('NHL', 'SJ'): 'team_nhl_sjs',  # San Jose
    ('NHL', 'LA'): 'team_nhl_lak',  # Los Angeles Kings
    ('NHL', 'MON'): 'team_nhl_mtl',  # Montreal
}
# =============================================================================
# ID GENERATION
# =============================================================================
def normalize_season(sport: str, season: str) -> str:
    """
    Collapse a season label into the compact form embedded in canonical IDs.

    Every hyphen is dropped and nothing else is touched:
        "2025-26" -> "202526"   (NBA/NHL style)
        "2026"    -> "2026"     (MLB style)

    `sport` is part of the signature but is not consulted.
    """
    return ''.join(season.split('-'))
def generate_canonical_game_id(
    sport: str,
    season: str,
    date: str,  # YYYY-MM-DD
    away_abbrev: str,
    home_abbrev: str,
    sequence: int = 1
) -> str:
    """
    Build the deterministic canonical ID for one game.

    Layout: game_{sport}_{season}_{date}_{away}_{home}[_{sequence}]
    Sport and both abbreviations are lower-cased, the season goes through
    normalize_season(), and hyphens are removed from the date (YYYYMMDD).
    The sequence suffix is appended only when `sequence` is greater than 1.

    Example: game_nba_202526_20251021_hou_okc
             game_mlb_2026_20260615_bos_nyy_2 (doubleheader game 2)
    """
    segments = [
        'game',
        sport.lower(),
        normalize_season(sport, season),
        date.replace('-', ''),
        away_abbrev.lower(),
        home_abbrev.lower(),
    ]
    # The first game of a matchup carries no suffix, so single games and
    # doubleheader openers share the same ID shape.
    if sequence > 1:
        segments.append(str(sequence))
    return '_'.join(segments)
# =============================================================================
# RESOLUTION
# =============================================================================
def build_alias_lookup(stadium_aliases: list[dict]) -> dict[str, str]:
    """
    Index stadium alias records by their lower-cased, stripped alias name.

    Records missing either an alias name or a canonical stadium ID are
    ignored. When the same alias name occurs more than once, the last record
    in the input wins.

    Returns: {alias_name_lower: canonical_stadium_id}
    """
    cleaned_pairs = (
        (
            record.get('alias_name', '').lower().strip(),
            record.get('stadium_canonical_id', ''),
        )
        for record in stadium_aliases
    )
    return {
        alias_key: stadium_id
        for alias_key, stadium_id in cleaned_pairs
        if alias_key and stadium_id
    }
def resolve_team(
    abbrev: str,
    sport: str,
    teams_by_abbrev: dict[tuple[str, str], dict],
    teams_by_id: dict[str, dict]
) -> Optional[dict]:
    """
    Look up the canonical team record for an abbreviation.

    The abbreviation is upper-cased and paired with `sport` to form the key.
    A direct hit in `teams_by_abbrev` wins; otherwise TEAM_ABBREV_ALIASES is
    consulted and its target ID is looked up in `teams_by_id`.

    Returns:
        The team dict, or None when neither lookup finds a team.
    """
    lookup_key = (sport, abbrev.upper())

    if lookup_key in teams_by_abbrev:
        return teams_by_abbrev[lookup_key]

    if lookup_key not in TEAM_ABBREV_ALIASES:
        return None

    # An alias that points at an ID missing from teams_by_id resolves to None.
    return teams_by_id.get(TEAM_ABBREV_ALIASES[lookup_key])
def resolve_stadium_from_venue(
    venue: str,
    home_team: dict,
    sport: str,
    alias_lookup: dict[str, str],
    stadiums_by_id: dict[str, dict]
) -> str:
    """
    Resolve stadium canonical ID from venue name.

    Strategy:
    1. ALWAYS prefer home team's stadium (most reliable, sport-correct)
    2. Exact alias match on the venue name, accepted only when the alias
       points at a stadium of the same sport
    3. Partial alias match (alias contained in venue or venue contained in
       alias), with the same sport check; aliases of 3 characters or fewer
       are ignored
    4. Fall back to an unknown-stadium placeholder built from the venue name

    Strategies 2 and 3 are skipped when the venue name is empty. An empty
    string is a substring of every alias, so without that guard a blank
    venue would resolve to an arbitrary stadium of the sport.

    For multi-sport venues (MSG, Crypto.com Arena, etc.), home team's
    stadium_canonical_id is authoritative because it's already sport-scoped.

    Args:
        venue: Venue name from game data (None is treated as empty)
        home_team: Resolved home team dict
        sport: Sport code (NBA, MLB, NHL)
        alias_lookup: {alias_name_lower: canonical_stadium_id}
        stadiums_by_id: {canonical_id: stadium_dict}; currently not read

    Returns:
        canonical_stadium_id, or "stadium_unknown_{slug}" when unresolved
    """
    # Strategy 1: Home team's stadium is most reliable (sport-scoped)
    if home_team:
        team_stadium = home_team.get('stadium_canonical_id', '')
        if team_stadium:
            return team_stadium

    venue_lower = (venue or '').lower().strip()
    sport_prefix = f"stadium_{sport.lower()}_"

    if venue_lower:
        # Strategy 2: exact alias match, only if it's for the correct sport
        matched_id = alias_lookup.get(venue_lower, '')
        if matched_id.startswith(sport_prefix):
            return matched_id

        # Strategy 3: partial match with sport check
        for alias, canonical_id in alias_lookup.items():
            if len(alias) > 3 and (alias in venue_lower or venue_lower in alias):
                if canonical_id.startswith(sport_prefix):
                    return canonical_id

    # Strategy 4: unknown stadium placeholder
    slug = venue_lower[:30].replace(' ', '_').replace('.', '')
    return f"stadium_unknown_{slug}"
# =============================================================================
# CANONICALIZATION
# =============================================================================
def canonicalize_games(
    raw_games: list[dict],
    canonical_teams: list[dict],
    stadium_aliases: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalGame], list[ResolutionWarning]]:
    """
    Stage 3: Canonicalize games.

    1. Resolve team abbreviations to canonical IDs
    2. Resolve venues to stadium canonical IDs
    3. Generate canonical game IDs (handling doubleheaders)

    A game whose home or away team cannot be resolved is skipped and
    recorded as a warning. A game whose stadium cannot be resolved is kept
    with a "stadium_unknown_*" placeholder ID and also recorded as a warning.

    Doubleheader numbering follows input order: the first game seen for a
    given (date, away team, home team) gets no suffix, the next gets "_2",
    and so on. Reordering `raw_games` therefore reorders those suffixes.

    Args:
        raw_games: List of raw game dicts
        canonical_teams: List of canonical team dicts
        stadium_aliases: List of stadium alias dicts
        verbose: Print detailed progress

    Returns:
        (canonical_games, warnings)
    """
    games = []
    warnings = []

    # Build lookups
    teams_by_abbrev = {}  # (sport, abbrev) -> team dict
    teams_by_id = {}  # canonical_id -> team dict
    for team in canonical_teams:
        teams_by_abbrev[(team['sport'], team['abbreviation'].upper())] = team
        teams_by_id[team['canonical_id']] = team

    alias_lookup = build_alias_lookup(stadium_aliases)
    stadiums_by_id = {}  # Would be populated from stadiums_canonical.json if needed

    # Track games for doubleheader detection
    game_counts = defaultdict(int)  # (date, away_id, home_id) -> count
    unresolved_teams = 0
    unresolved_stadiums = 0

    for raw in raw_games:
        # `or ''` keeps JSON nulls from crashing the string methods below.
        sport = (raw.get('sport') or '').upper()
        season = raw.get('season', '')
        date = raw.get('date', '')
        home_abbrev = (raw.get('home_team_abbrev') or '').upper()
        away_abbrev = (raw.get('away_team_abbrev') or '').upper()
        venue = raw.get('venue') or ''
        game_key = f"{date}_{away_abbrev}_{home_abbrev}"

        # Resolve teams
        home_team = resolve_team(home_abbrev, sport, teams_by_abbrev, teams_by_id)
        away_team = resolve_team(away_abbrev, sport, teams_by_abbrev, teams_by_id)

        # Only the first unresolved side is reported; the home side is
        # checked first.
        unresolved = None
        if not home_team:
            unresolved = ('home', home_abbrev)
        elif not away_team:
            unresolved = ('away', away_abbrev)
        if unresolved:
            side, bad_abbrev = unresolved
            warnings.append(ResolutionWarning(
                game_key=game_key,
                issue=f'Unknown {side} team',
                details=f"Could not resolve {side} team '{bad_abbrev}' for sport {sport}"
            ))
            unresolved_teams += 1
            if verbose:
                print(f" WARNING: {game_key} - unknown {side} team {bad_abbrev}")
            continue

        # Resolve stadium
        stadium_canonical_id = resolve_stadium_from_venue(
            venue, home_team, sport, alias_lookup, stadiums_by_id
        )
        if stadium_canonical_id.startswith('stadium_unknown'):
            # The resolver already prefers the home team's stadium, so
            # reaching this branch means the home team has none. Keep the
            # placeholder unless the team record carries a non-empty ID; an
            # empty or null value must never replace the placeholder.
            stadium_canonical_id = (
                home_team.get('stadium_canonical_id') or stadium_canonical_id
            )
            warnings.append(ResolutionWarning(
                game_key=game_key,
                issue='Unknown stadium',
                details=(
                    f"Could not resolve venue '{venue}' and home team has no "
                    f"stadium; using '{stadium_canonical_id}'"
                )
            ))
            unresolved_stadiums += 1

        # Handle doubleheaders
        matchup_key = (date, away_team['canonical_id'], home_team['canonical_id'])
        game_counts[matchup_key] += 1
        sequence = game_counts[matchup_key]

        # Generate canonical ID from the canonical (not raw) abbreviations so
        # aliased inputs such as PHX/PHO produce the same ID.
        canonical_id = generate_canonical_game_id(
            sport, season, date,
            away_team['abbreviation'], home_team['abbreviation'],
            sequence
        )

        games.append(CanonicalGame(
            canonical_id=canonical_id,
            sport=sport,
            season=season,
            date=date,
            time=raw.get('time'),
            home_team_canonical_id=home_team['canonical_id'],
            away_team_canonical_id=away_team['canonical_id'],
            stadium_canonical_id=stadium_canonical_id,
            is_playoff=raw.get('is_playoff', False),
            broadcast=raw.get('broadcast')
        ))

    if verbose:
        print(f"\n Resolved: {len(games)} games")
        print(f" Unresolved teams: {unresolved_teams}")
        print(f" Unknown stadiums (placeholder ID used): {unresolved_stadiums}")

    return games, warnings
# =============================================================================
# MAIN
# =============================================================================
def main():
    """
    Command-line entry point for stage 3 (game canonicalization).

    Loads the raw games, canonical teams and stadium aliases JSON files,
    runs canonicalize_games(), then writes into the output directory
    (created if missing):
      - games_canonical.json            always
      - game_resolution_warnings.json   only when there are warnings
    Progress and per-sport summaries are printed to stdout.
    """
    parser = argparse.ArgumentParser(
        description='Canonicalize game data'
    )
    parser.add_argument(
        '--games', type=str, default='./data/games.json',
        help='Input raw games JSON file'
    )
    parser.add_argument(
        '--teams', type=str, default='./data/teams_canonical.json',
        help='Input canonical teams JSON file'
    )
    parser.add_argument(
        '--aliases', type=str, default='./data/stadium_aliases.json',
        help='Input stadium aliases JSON file'
    )
    parser.add_argument(
        '--output', type=str, default='./data',
        help='Output directory for canonical files'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )
    args = parser.parse_args()

    games_path = Path(args.games)
    teams_path = Path(args.teams)
    aliases_path = Path(args.aliases)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load input files. The encoding is pinned to UTF-8 so the result does
    # not depend on the platform's locale default.
    print(f"Loading raw games from {games_path}...")
    with open(games_path, encoding='utf-8') as f:
        raw_games = json.load(f)
    print(f" Loaded {len(raw_games)} raw games")

    print(f"Loading canonical teams from {teams_path}...")
    with open(teams_path, encoding='utf-8') as f:
        canonical_teams = json.load(f)
    print(f" Loaded {len(canonical_teams)} canonical teams")

    print(f"Loading stadium aliases from {aliases_path}...")
    with open(aliases_path, encoding='utf-8') as f:
        stadium_aliases = json.load(f)
    print(f" Loaded {len(stadium_aliases)} stadium aliases")

    # Canonicalize games
    print("\nCanonicalizing games...")
    canonical_games, warnings = canonicalize_games(
        raw_games, canonical_teams, stadium_aliases, verbose=args.verbose
    )
    print(f" Created {len(canonical_games)} canonical games")

    if warnings:
        print(f"\n Warnings: {len(warnings)}")
        # Group by issue type
        by_issue = defaultdict(list)
        for w in warnings:
            by_issue[w.issue].append(w)
        for issue, issue_warnings in by_issue.items():
            print(f" - {issue}: {len(issue_warnings)}")

    # Export (distinct name so the input path is not shadowed)
    games_out_path = output_dir / 'games_canonical.json'
    warnings_path = output_dir / 'game_resolution_warnings.json'

    with open(games_out_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(g) for g in canonical_games], f, indent=2)
    print(f"\nExported games to {games_out_path}")

    if warnings:
        with open(warnings_path, 'w', encoding='utf-8') as f:
            json.dump([asdict(w) for w in warnings], f, indent=2)
        print(f"Exported warnings to {warnings_path}")

    # Summary by sport
    print("\nSummary by sport:")
    by_sport = defaultdict(int)
    for g in canonical_games:
        by_sport[g.sport] += 1
    for sport, count in sorted(by_sport.items()):
        print(f" {sport}: {count} games")

    # Check for doubleheaders. Only second-and-later games of a matchup get a
    # numeric final segment; otherwise the ID ends with the home team
    # abbreviation. Substring tests such as '_2' cannot be used here because
    # every ID contains "_20.." from its season and date segments.
    doubleheaders = sum(
        1 for g in canonical_games
        if g.canonical_id.rsplit('_', 1)[-1].isdigit()
    )
    if doubleheaders:
        print(f"\n Doubleheader games detected: {doubleheaders}")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,393 @@
#!/usr/bin/env python3
"""
Stadium Canonicalization for SportsTime
========================================
Stage 1 of the canonicalization pipeline.
Normalizes stadium data and generates deterministic canonical IDs.
Creates stadium name aliases for fuzzy matching during game resolution.
Usage:
python canonicalize_stadiums.py --input data/stadiums.json --output data/
"""
import argparse
import json
import re
from dataclasses import dataclass, asdict, field
from pathlib import Path
from typing import Optional
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class CanonicalStadium:
    """A canonicalized stadium with stable ID."""
    # "stadium_{sport}_{slug}" as produced by generate_canonical_stadium_id().
    canonical_id: str
    # Display name and city exactly as given in the raw record.
    name: str
    city: str
    state: str
    # canonicalize_stadiums() stores 0.0 / 0 when the raw record lacks these.
    latitude: float
    longitude: float
    capacity: int
    # Sport code, upper-cased by canonicalize_stadiums().
    sport: str
    # Taken from the raw record's 'team_abbrevs' key.
    primary_team_abbrevs: list = field(default_factory=list)
    year_opened: Optional[int] = None
@dataclass
class StadiumAlias:
    """Maps an alias name to a canonical stadium ID."""
    alias_name: str  # Normalized (lowercase)
    stadium_canonical_id: str
    # Validity window; only aliases added by add_historical_aliases() set
    # these, using the date strings from HISTORICAL_STADIUM_ALIASES.
    valid_from: Optional[str] = None
    valid_until: Optional[str] = None
# =============================================================================
# HISTORICAL STADIUM ALIASES
# Known name changes for stadiums (sponsorship changes, renames)
# =============================================================================
# Keyed by canonical stadium ID. Each value is a list of alias records with
# 'alias_name' (lowercase) and optional 'valid_from' / 'valid_until' date
# strings (YYYY-MM-DD). add_historical_aliases() applies an entry only when
# its key is present among the generated canonical IDs.
HISTORICAL_STADIUM_ALIASES = {
    # MLB
    'stadium_mlb_minute_maid_park': [
        {'alias_name': 'daikin park', 'valid_from': '2025-01-01'},
        {'alias_name': 'enron field', 'valid_from': '2000-04-01', 'valid_until': '2002-02-28'},
        {'alias_name': 'astros field', 'valid_from': '2002-03-01', 'valid_until': '2002-06-04'},
    ],
    'stadium_mlb_guaranteed_rate_field': [
        {'alias_name': 'rate field', 'valid_from': '2024-01-01'},
        {'alias_name': 'us cellular field', 'valid_from': '2003-01-01', 'valid_until': '2016-08-24'},
        {'alias_name': 'comiskey park ii', 'valid_from': '1991-04-01', 'valid_until': '2002-12-31'},
        {'alias_name': 'new comiskey park', 'valid_from': '1991-04-01', 'valid_until': '2002-12-31'},
    ],
    'stadium_mlb_truist_park': [
        {'alias_name': 'suntrust park', 'valid_from': '2017-04-01', 'valid_until': '2020-01-13'},
    ],
    'stadium_mlb_progressive_field': [
        {'alias_name': 'jacobs field', 'valid_from': '1994-04-01', 'valid_until': '2008-01-10'},
        {'alias_name': 'the jake', 'valid_from': '1994-04-01', 'valid_until': '2008-01-10'},
    ],
    'stadium_mlb_american_family_field': [
        {'alias_name': 'miller park', 'valid_from': '2001-04-01', 'valid_until': '2020-12-31'},
    ],
    'stadium_mlb_rogers_centre': [
        {'alias_name': 'skydome', 'valid_from': '1989-06-01', 'valid_until': '2005-02-01'},
    ],
    'stadium_mlb_loandepot_park': [
        {'alias_name': 'marlins park', 'valid_from': '2012-04-01', 'valid_until': '2021-03-31'},
    ],
    # NOTE(review): normalize_stadium_name() deletes punctuation instead of
    # replacing it, so a stadium named "T-Mobile Park" would get the ID
    # stadium_mlb_tmobile_park — confirm this key matches the generated ID.
    'stadium_mlb_t_mobile_park': [
        {'alias_name': 'safeco field', 'valid_from': '1999-07-01', 'valid_until': '2018-12-31'},
    ],
    'stadium_mlb_oracle_park': [
        {'alias_name': 'att park', 'valid_from': '2006-01-01', 'valid_until': '2019-01-08'},
        {'alias_name': 'sbc park', 'valid_from': '2004-01-01', 'valid_until': '2005-12-31'},
        {'alias_name': 'pac bell park', 'valid_from': '2000-04-01', 'valid_until': '2003-12-31'},
    ],
    # NOTE(review): Choctaw Stadium may be the current name of the older
    # Globe Life Park rather than a former name of Globe Life Field —
    # confirm this alias is intended.
    'stadium_mlb_globe_life_field': [
        {'alias_name': 'choctaw stadium', 'valid_from': '2020-01-01'},  # Globe Life Field opened 2020
    ],
    # NBA
    'stadium_nba_state_farm_arena': [
        {'alias_name': 'philips arena', 'valid_from': '1999-09-01', 'valid_until': '2018-06-25'},
    ],
    # NOTE(review): same punctuation caveat as T-Mobile Park — a stadium named
    # "Crypto.com Arena" would get stadium_nba_cryptocom_arena; confirm.
    'stadium_nba_crypto_com_arena': [
        {'alias_name': 'staples center', 'valid_from': '1999-10-01', 'valid_until': '2021-12-24'},
    ],
    'stadium_nba_kaseya_center': [
        {'alias_name': 'ftx arena', 'valid_from': '2021-06-01', 'valid_until': '2023-03-31'},
        {'alias_name': 'american airlines arena', 'valid_from': '1999-12-01', 'valid_until': '2021-05-31'},
    ],
    'stadium_nba_gainbridge_fieldhouse': [
        {'alias_name': 'bankers life fieldhouse', 'valid_from': '2011-01-01', 'valid_until': '2021-12-31'},
        {'alias_name': 'conseco fieldhouse', 'valid_from': '1999-11-01', 'valid_until': '2010-12-31'},
    ],
    'stadium_nba_rocket_mortgage_fieldhouse': [
        {'alias_name': 'quicken loans arena', 'valid_from': '2005-08-01', 'valid_until': '2019-08-08'},
        {'alias_name': 'gund arena', 'valid_from': '1994-10-01', 'valid_until': '2005-07-31'},
    ],
    'stadium_nba_kia_center': [
        {'alias_name': 'amway center', 'valid_from': '2010-10-01', 'valid_until': '2023-07-12'},
    ],
    'stadium_nba_frost_bank_center': [
        {'alias_name': 'att center', 'valid_from': '2002-10-01', 'valid_until': '2023-10-01'},
    ],
    # Intentionally empty: listed for completeness, contributes no aliases.
    'stadium_nba_intuit_dome': [
        # New arena opened 2024, Clippers moved from Crypto.com Arena
    ],
    'stadium_nba_delta_center': [
        {'alias_name': 'vivint arena', 'valid_from': '2020-12-01', 'valid_until': '2023-07-01'},
        {'alias_name': 'vivint smart home arena', 'valid_from': '2015-11-01', 'valid_until': '2020-11-30'},
        {'alias_name': 'energysolutions arena', 'valid_from': '2006-11-01', 'valid_until': '2015-10-31'},
    ],
    # NHL
    'stadium_nhl_amerant_bank_arena': [
        {'alias_name': 'fla live arena', 'valid_from': '2021-10-01', 'valid_until': '2024-05-31'},
        {'alias_name': 'bb&t center', 'valid_from': '2012-06-01', 'valid_until': '2021-09-30'},
        {'alias_name': 'bankatlantic center', 'valid_from': '2005-10-01', 'valid_until': '2012-05-31'},
    ],
    'stadium_nhl_climate_pledge_arena': [
        {'alias_name': 'keyarena', 'valid_from': '1995-01-01', 'valid_until': '2018-10-01'},
        {'alias_name': 'seattle center coliseum', 'valid_from': '1962-01-01', 'valid_until': '1994-12-31'},
    ],
}
# =============================================================================
# SLUG GENERATION
# =============================================================================
def normalize_stadium_name(name: str) -> str:
    """
    Reduce a stadium name to the plain form used for slugs and dedup keys.

    Steps, in order:
    - lowercase the name
    - drop parenthesised fragments such as "(IV)" together with the
      whitespace in front of them
    - delete every character that is not a-z, 0-9 or whitespace (punctuation
      is removed, not replaced, so "Crypto.com" becomes "cryptocom")
    - squeeze whitespace runs to a single space and trim both ends
    """
    without_parentheticals = re.sub(r'\s*\([^)]*\)', '', name.lower())
    alphanumeric_only = re.sub(r'[^a-z0-9\s]', '', without_parentheticals)
    return re.sub(r'\s+', ' ', alphanumeric_only).strip()
def generate_stadium_slug(name: str) -> str:
    """
    Turn a stadium name into an underscore-separated slug of at most 50 chars.

    The name is first passed through normalize_stadium_name(), which deletes
    punctuation outright, so dots and hyphens do not become underscores.

    Examples:
        "State Farm Arena" -> "state_farm_arena"
        "TD Garden" -> "td_garden"
        "Crypto.com Arena" -> "cryptocom_arena"
    """
    words = normalize_stadium_name(name).split(' ')
    # Truncation happens after joining, so a slug may end mid-word.
    return '_'.join(words)[:50]
def generate_canonical_stadium_id(sport: str, name: str) -> str:
    """
    Build the deterministic canonical ID for a stadium.

    Layout: stadium_{sport}_{slug}, with the sport lower-cased and the slug
    taken from generate_stadium_slug().
    Example: stadium_nba_state_farm_arena
    """
    return '_'.join(('stadium', sport.lower(), generate_stadium_slug(name)))
# =============================================================================
# CANONICALIZATION
# =============================================================================
def canonicalize_stadiums(
    raw_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalStadium], list[StadiumAlias]]:
    """
    Stage 1: Canonicalize stadiums.

    For every raw record that has both a sport and a name:
    - the first record seen for a (sport, normalized name, lower-cased city)
      combination becomes a CanonicalStadium and contributes its lower-cased
      display name (plus the normalized name, when different) as aliases;
    - later records with the same combination are treated as duplicates and
      contribute only an alias, and only when their lower-cased display name
      differs from the normalized name.
    Records missing sport or name are skipped.

    Args:
        raw_stadiums: List of raw stadium dicts from scraper
        verbose: Print detailed progress

    Returns:
        (canonical_stadiums, aliases)
    """
    stadiums_out = []
    aliases_out = []
    first_id_by_key = {}  # (sport, normalized_name, city) -> canonical_id

    for record in raw_stadiums:
        sport = record.get('sport', '').upper()
        name = record.get('name', '')
        city = record.get('city', '')

        if not (sport and name):
            if verbose:
                print(f" Skipping invalid stadium: {record}")
            continue

        display_alias = name.lower().strip()
        normalized_name = normalize_stadium_name(name)
        dedup_key = (sport, normalized_name, city.lower())

        if dedup_key in first_id_by_key:
            # Same stadium seen again: keep the first ID, maybe add an alias.
            known_id = first_id_by_key[dedup_key]
            if display_alias != normalized_name:
                aliases_out.append(StadiumAlias(
                    alias_name=display_alias,
                    stadium_canonical_id=known_id
                ))
            if verbose:
                print(f" Duplicate: {name} -> {known_id}")
            continue

        canonical_id = generate_canonical_stadium_id(sport, name)
        first_id_by_key[dedup_key] = canonical_id

        stadiums_out.append(CanonicalStadium(
            canonical_id=canonical_id,
            name=name,
            city=city,
            state=record.get('state', ''),
            latitude=record.get('latitude', 0.0),
            longitude=record.get('longitude', 0.0),
            capacity=record.get('capacity', 0),
            sport=sport,
            primary_team_abbrevs=record.get('team_abbrevs', []),
            year_opened=record.get('year_opened')
        ))

        # Display name first, then the normalized spelling when it differs.
        alias_names = [display_alias]
        if normalized_name != display_alias:
            alias_names.append(normalized_name)
        aliases_out.extend(
            StadiumAlias(alias_name=alias_name, stadium_canonical_id=canonical_id)
            for alias_name in alias_names
        )

        if verbose:
            print(f" {canonical_id}: {name} ({city})")

    return stadiums_out, aliases_out
def add_historical_aliases(
    aliases: list[StadiumAlias],
    canonical_ids: set[str]
) -> list[StadiumAlias]:
    """
    Add historical stadium name aliases.

    Only adds aliases for stadiums that exist in canonical_ids. A table key
    is matched exactly first. If that fails it is matched ignoring
    underscores, because the hand-written keys in HISTORICAL_STADIUM_ALIASES
    put an underscore where a name had punctuation (e.g. "t_mobile",
    "crypto_com") while the slug generator deletes punctuation ("tmobile",
    "cryptocom"). In that case the alias is attached to the generated ID, so
    generated IDs are never altered.

    Args:
        aliases: Alias list to extend; it is modified in place
        canonical_ids: Canonical stadium IDs produced by stage 1

    Returns:
        The same `aliases` list, with historical aliases appended
    """
    # Underscore-free form -> generated ID. Sorted so that, should two IDs
    # ever share a compact form, the choice does not depend on set ordering.
    ids_by_compact_form = {}
    for generated_id in sorted(canonical_ids):
        ids_by_compact_form.setdefault(generated_id.replace('_', ''), generated_id)

    for listed_id, historical in HISTORICAL_STADIUM_ALIASES.items():
        if listed_id in canonical_ids:
            target_id = listed_id
        else:
            target_id = ids_by_compact_form.get(listed_id.replace('_', ''))
            if target_id is None:
                continue

        for hist in historical:
            aliases.append(StadiumAlias(
                alias_name=hist['alias_name'],
                stadium_canonical_id=target_id,
                valid_from=hist.get('valid_from'),
                valid_until=hist.get('valid_until')
            ))

    return aliases
def deduplicate_aliases(aliases: list[StadiumAlias]) -> list[StadiumAlias]:
    """
    Drop repeated (alias_name, stadium_canonical_id) pairs.

    The first alias seen for each pair is kept and the original order of
    those first occurrences is preserved. Validity dates are not part of the
    comparison.
    """
    first_by_pair = {}
    for entry in aliases:
        first_by_pair.setdefault(
            (entry.alias_name, entry.stadium_canonical_id), entry
        )
    return list(first_by_pair.values())
# =============================================================================
# MAIN
# =============================================================================
def main():
    """
    Command-line entry point for stage 1 (stadium canonicalization).

    Loads the raw stadium JSON, canonicalizes it, adds historical aliases,
    de-duplicates the alias list, and writes stadiums_canonical.json and
    stadium_aliases.json into the output directory (created if missing).
    Progress and a per-sport count are printed to stdout.
    """
    cli = argparse.ArgumentParser(description='Canonicalize stadium data')
    cli.add_argument('--input', type=str, default='./data/stadiums.json',
                     help='Input raw stadiums JSON file')
    cli.add_argument('--output', type=str, default='./data',
                     help='Output directory for canonical files')
    cli.add_argument('--verbose', '-v', action='store_true',
                     help='Verbose output')
    options = cli.parse_args()

    source_file = Path(options.input)
    target_dir = Path(options.output)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Load raw stadiums
    print(f"Loading raw stadiums from {source_file}...")
    with open(source_file) as handle:
        raw_stadiums = json.load(handle)
    print(f" Loaded {len(raw_stadiums)} raw stadiums")

    # Canonicalize
    print("\nCanonicalizing stadiums...")
    stadiums, alias_records = canonicalize_stadiums(
        raw_stadiums, verbose=options.verbose
    )
    print(f" Created {len(stadiums)} canonical stadiums")

    # Historical names are attached only to stadiums that were just created,
    # then the combined alias list is de-duplicated.
    known_ids = {stadium.canonical_id for stadium in stadiums}
    alias_records = deduplicate_aliases(
        add_historical_aliases(alias_records, known_ids)
    )
    print(f" Created {len(alias_records)} stadium aliases")

    # Export
    stadiums_file = target_dir / 'stadiums_canonical.json'
    aliases_file = target_dir / 'stadium_aliases.json'

    with open(stadiums_file, 'w') as handle:
        json.dump([asdict(stadium) for stadium in stadiums], handle, indent=2)
    print(f"\nExported stadiums to {stadiums_file}")

    with open(aliases_file, 'w') as handle:
        json.dump([asdict(record) for record in alias_records], handle, indent=2)
    print(f"Exported aliases to {aliases_file}")

    # Summary by sport
    print("\nSummary by sport:")
    totals = {}
    for stadium in stadiums:
        totals.setdefault(stadium.sport, 0)
        totals[stadium.sport] += 1
    for sport in sorted(totals):
        print(f" {sport}: {totals[sport]} stadiums")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,487 @@
#!/usr/bin/env python3
"""
Team Canonicalization for SportsTime
====================================
Stage 2 of the canonicalization pipeline.
Generates canonical team IDs and fuzzy matches teams to stadiums.
Usage:
python canonicalize_teams.py --stadiums data/stadiums_canonical.json --output data/
"""
import argparse
import json
from dataclasses import dataclass, asdict, field
from difflib import SequenceMatcher
from pathlib import Path
from typing import Optional
# Import team mappings from scraper
from scrape_schedules import NBA_TEAMS, MLB_TEAMS, NHL_TEAMS
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class CanonicalTeam:
    """A canonicalized team with stable ID."""
    canonical_id: str
    name: str
    abbreviation: str
    sport: str
    city: str
    # Canonical ID of the team's home stadium.
    # NOTE(review): the value stored when no stadium matches is not visible
    # in this part of the file — confirm before relying on it being non-empty.
    stadium_canonical_id: str
    # presumably drawn from the *_DIVISIONS tables in this module, which map
    # abbreviation -> (conference_id, division_id) — verify against the
    # constructor call.
    conference_id: Optional[str] = None
    division_id: Optional[str] = None
    primary_color: Optional[str] = None
    secondary_color: Optional[str] = None
@dataclass
class MatchWarning:
    """Warning about a low-confidence match."""
    team_canonical_id: str
    team_name: str
    # Arena name the match was attempted for.
    arena_name: str
    # Matched stadium, or None when nothing matched.
    # NOTE(review): whether this holds a stadium name or a canonical ID is
    # not visible here — confirm at the call site.
    matched_stadium: Optional[str]
    issue: str
    # Match score; presumably the value returned by fuzzy_match_stadium() —
    # confirm at the call site.
    confidence: float
# =============================================================================
# LEAGUE STRUCTURE
# Maps team abbreviation -> (conference_id, division_id)
# =============================================================================
# NBA: team abbreviation -> (conference_id, division_id).
NBA_DIVISIONS = {
    # Eastern Conference - Atlantic
    'BOS': ('nba_eastern', 'nba_atlantic'),
    'BRK': ('nba_eastern', 'nba_atlantic'),
    'NYK': ('nba_eastern', 'nba_atlantic'),
    'PHI': ('nba_eastern', 'nba_atlantic'),
    'TOR': ('nba_eastern', 'nba_atlantic'),
    # Eastern Conference - Central
    'CHI': ('nba_eastern', 'nba_central'),
    'CLE': ('nba_eastern', 'nba_central'),
    'DET': ('nba_eastern', 'nba_central'),
    'IND': ('nba_eastern', 'nba_central'),
    'MIL': ('nba_eastern', 'nba_central'),
    # Eastern Conference - Southeast
    'ATL': ('nba_eastern', 'nba_southeast'),
    'CHO': ('nba_eastern', 'nba_southeast'),
    'MIA': ('nba_eastern', 'nba_southeast'),
    'ORL': ('nba_eastern', 'nba_southeast'),
    'WAS': ('nba_eastern', 'nba_southeast'),
    # Western Conference - Northwest
    'DEN': ('nba_western', 'nba_northwest'),
    'MIN': ('nba_western', 'nba_northwest'),
    'OKC': ('nba_western', 'nba_northwest'),
    'POR': ('nba_western', 'nba_northwest'),
    'UTA': ('nba_western', 'nba_northwest'),
    # Western Conference - Pacific
    'GSW': ('nba_western', 'nba_pacific'),
    'LAC': ('nba_western', 'nba_pacific'),
    'LAL': ('nba_western', 'nba_pacific'),
    'PHO': ('nba_western', 'nba_pacific'),
    'SAC': ('nba_western', 'nba_pacific'),
    # Western Conference - Southwest
    'DAL': ('nba_western', 'nba_southwest'),
    'HOU': ('nba_western', 'nba_southwest'),
    'MEM': ('nba_western', 'nba_southwest'),
    'NOP': ('nba_western', 'nba_southwest'),
    'SAS': ('nba_western', 'nba_southwest'),
}
# MLB: team abbreviation -> (conference_id, division_id); the league
# (mlb_al / mlb_nl) occupies the conference slot.
MLB_DIVISIONS = {
    # American League - East
    'NYY': ('mlb_al', 'mlb_al_east'),
    'BOS': ('mlb_al', 'mlb_al_east'),
    'TOR': ('mlb_al', 'mlb_al_east'),
    'BAL': ('mlb_al', 'mlb_al_east'),
    'TBR': ('mlb_al', 'mlb_al_east'),
    # American League - Central
    'CLE': ('mlb_al', 'mlb_al_central'),
    'DET': ('mlb_al', 'mlb_al_central'),
    'MIN': ('mlb_al', 'mlb_al_central'),
    'CHW': ('mlb_al', 'mlb_al_central'),
    'KCR': ('mlb_al', 'mlb_al_central'),
    # American League - West
    'HOU': ('mlb_al', 'mlb_al_west'),
    'SEA': ('mlb_al', 'mlb_al_west'),
    'TEX': ('mlb_al', 'mlb_al_west'),
    'LAA': ('mlb_al', 'mlb_al_west'),
    'OAK': ('mlb_al', 'mlb_al_west'),
    # National League - East
    'ATL': ('mlb_nl', 'mlb_nl_east'),
    'PHI': ('mlb_nl', 'mlb_nl_east'),
    'NYM': ('mlb_nl', 'mlb_nl_east'),
    'MIA': ('mlb_nl', 'mlb_nl_east'),
    'WSN': ('mlb_nl', 'mlb_nl_east'),
    # National League - Central
    'MIL': ('mlb_nl', 'mlb_nl_central'),
    'CHC': ('mlb_nl', 'mlb_nl_central'),
    'STL': ('mlb_nl', 'mlb_nl_central'),
    'PIT': ('mlb_nl', 'mlb_nl_central'),
    'CIN': ('mlb_nl', 'mlb_nl_central'),
    # National League - West
    'LAD': ('mlb_nl', 'mlb_nl_west'),
    'ARI': ('mlb_nl', 'mlb_nl_west'),
    'SDP': ('mlb_nl', 'mlb_nl_west'),
    'SFG': ('mlb_nl', 'mlb_nl_west'),
    'COL': ('mlb_nl', 'mlb_nl_west'),
}
# NHL: team abbreviation -> (conference_id, division_id).
NHL_DIVISIONS = {
    # Eastern Conference - Atlantic
    'BOS': ('nhl_eastern', 'nhl_atlantic'),
    'BUF': ('nhl_eastern', 'nhl_atlantic'),
    'DET': ('nhl_eastern', 'nhl_atlantic'),
    'FLA': ('nhl_eastern', 'nhl_atlantic'),
    'MTL': ('nhl_eastern', 'nhl_atlantic'),
    'OTT': ('nhl_eastern', 'nhl_atlantic'),
    'TBL': ('nhl_eastern', 'nhl_atlantic'),
    'TOR': ('nhl_eastern', 'nhl_atlantic'),
    # Eastern Conference - Metropolitan
    'CAR': ('nhl_eastern', 'nhl_metropolitan'),
    'CBJ': ('nhl_eastern', 'nhl_metropolitan'),
    'NJD': ('nhl_eastern', 'nhl_metropolitan'),
    'NYI': ('nhl_eastern', 'nhl_metropolitan'),
    'NYR': ('nhl_eastern', 'nhl_metropolitan'),
    'PHI': ('nhl_eastern', 'nhl_metropolitan'),
    'PIT': ('nhl_eastern', 'nhl_metropolitan'),
    'WSH': ('nhl_eastern', 'nhl_metropolitan'),
    # Western Conference - Central
    'ARI': ('nhl_western', 'nhl_central'),  # Utah Hockey Club
    'CHI': ('nhl_western', 'nhl_central'),
    'COL': ('nhl_western', 'nhl_central'),
    'DAL': ('nhl_western', 'nhl_central'),
    'MIN': ('nhl_western', 'nhl_central'),
    'NSH': ('nhl_western', 'nhl_central'),
    'STL': ('nhl_western', 'nhl_central'),
    'WPG': ('nhl_western', 'nhl_central'),
    # Western Conference - Pacific
    'ANA': ('nhl_western', 'nhl_pacific'),
    'CGY': ('nhl_western', 'nhl_pacific'),
    'EDM': ('nhl_western', 'nhl_pacific'),
    'LAK': ('nhl_western', 'nhl_pacific'),
    'SEA': ('nhl_western', 'nhl_pacific'),
    'SJS': ('nhl_western', 'nhl_pacific'),
    'VAN': ('nhl_western', 'nhl_pacific'),
    'VGK': ('nhl_western', 'nhl_pacific'),
}
# =============================================================================
# FUZZY MATCHING
# =============================================================================
def normalize_for_matching(text: str) -> str:
    """Normalize a venue name for fuzzy matching.

    The name is lowercased, generic venue words (arena, center, centre,
    stadium, field, park) are dropped, punctuation is removed and runs of
    whitespace are collapsed to a single space.

    Args:
        text: Raw venue name, e.g. "Crypto.com Arena".

    Returns:
        The normalized name, e.g. "cryptocom". May be empty if the name
        consisted only of generic venue words.
    """
    import re

    text = text.lower().strip()
    # Drop generic venue words, but only as whole words: without the \b
    # anchors "Fieldhouse" became "house" and "Parkway" became "way".
    text = re.sub(r'\b(?:arena|center|stadium|field|park|centre)\b', ' ', text)
    # Remove special characters
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Collapse spaces left behind by the removals above
    return re.sub(r'\s+', ' ', text).strip()
def fuzzy_match_stadium(
    team_arena_name: str,
    team_city: str,
    sport: str,
    stadiums: list[dict],
    confidence_threshold: float = 0.6
) -> tuple[Optional[str], float]:
    """
    Fuzzy match team's arena to a canonical stadium.

    Matching strategy:
    - 70% weight: Name similarity (SequenceMatcher), taking the better of
      the normalized-name ratio and the raw lowercased-name ratio
    - 30% weight: City match (exact=1.0, known nearby city=0.7, partial=0.5)

    A missing (empty) city on either side earns no city credit.

    Args:
        team_arena_name: The arena name from team mapping
        team_city: The team's city
        sport: Sport code (NBA, MLB, NHL)
        stadiums: List of canonical stadium dicts; each must provide the
            'sport', 'name', 'city' and 'canonical_id' keys
        confidence_threshold: Minimum confidence for a match

    Returns:
        (canonical_stadium_id, confidence_score). The ID is None when the
        best score is below confidence_threshold; the best score is still
        returned so callers can report it.
    """
    # Metro areas where the team's nominal city and the venue's actual city
    # differ (e.g., "San Francisco" team but "Oakland" arena). Built once per
    # call instead of once per stadium.
    nearby_cities = {
        'san francisco': ['oakland', 'san jose'],
        'new york': ['brooklyn', 'queens', 'elmont', 'newark'],
        'los angeles': ['inglewood', 'anaheim'],
        'miami': ['sunrise', 'fort lauderdale'],
        'dallas': ['arlington', 'fort worth'],
        # 'capital heights' is a misspelling kept for backward compatibility;
        # the town is Capitol Heights, MD.
        'washington': ['landover', 'capital heights', 'capitol heights'],
        'minneapolis': ['st paul', 'st. paul'],
        'detroit': ['auburn hills', 'pontiac'],
    }

    best_match = None
    best_score = 0.0

    arena_lower = team_arena_name.lower()
    arena_normalized = normalize_for_matching(team_arena_name)
    city_lower = team_city.lower()
    nearby_of_team = nearby_cities.get(city_lower, ())

    # Only stadiums of the same sport are candidates
    sport_stadiums = [s for s in stadiums if s['sport'] == sport]

    for stadium in sport_stadiums:
        # Score 1: name similarity, best of normalized and raw comparison
        name_score = max(
            SequenceMatcher(
                None,
                arena_normalized,
                normalize_for_matching(stadium['name'])
            ).ratio(),
            SequenceMatcher(
                None,
                arena_lower,
                stadium['name'].lower()
            ).ratio(),
        )

        # Score 2: city match
        city_score = 0.0
        stadium_city_lower = stadium['city'].lower()
        # Guard against empty cities: '' is a substring of every string, so
        # without this check a missing city earned partial credit everywhere.
        if city_lower and stadium_city_lower:
            if city_lower == stadium_city_lower:
                city_score = 1.0
            elif city_lower in stadium_city_lower or stadium_city_lower in city_lower:
                city_score = 0.5

        # Nearby-city relation, checked in both directions
        if (
            stadium_city_lower in nearby_of_team
            or city_lower in nearby_cities.get(stadium_city_lower, ())
        ):
            city_score = 0.7

        # Combined score (weighted)
        combined = (name_score * 0.7) + (city_score * 0.3)

        if combined > best_score:
            best_score = combined
            best_match = stadium['canonical_id']

    if best_score >= confidence_threshold:
        return best_match, best_score
    return None, best_score
# =============================================================================
# CANONICALIZATION
# =============================================================================
def generate_canonical_team_id(sport: str, abbrev: str) -> str:
    """
    Build the deterministic canonical ID for a team.

    The ID is the literal prefix "team" followed by the lowercased sport
    code and the lowercased team abbreviation, joined with underscores.

    Format: team_{sport}_{abbrev}
    Example: team_nba_atl
    """
    parts = ("team", sport.lower(), abbrev.lower())
    return "_".join(parts)
def canonicalize_teams(
    team_mappings: dict[str, dict],
    sport: str,
    canonical_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalTeam], list[MatchWarning]]:
    """
    Stage 2: Canonicalize the teams of one sport.

    For every team in the mapping this:
    1. Generates the canonical team ID from the abbreviation
    2. Fuzzy matches the team's venue to a canonical stadium
    3. Records a warning when no stadium matched (a placeholder stadium ID
       is used instead) or when the match confidence is below 0.8
    4. Attaches conference/division IDs when the sport's division table
       has an entry for the abbreviation

    Args:
        team_mappings: Team data dict keyed by abbreviation (e.g., NBA_TEAMS)
        sport: Sport code
        canonical_stadiums: List of canonical stadium dicts
        verbose: Print detailed progress

    Returns:
        (canonical_teams, warnings)
    """
    # NBA/NHL mappings keep the venue under 'arena'; other sports use 'stadium'
    if sport in ('NBA', 'NHL'):
        venue_field = 'arena'
    else:
        venue_field = 'stadium'

    # Division table for this sport; unknown sports get no alignment data
    if sport == 'NBA':
        divisions = NBA_DIVISIONS
    elif sport == 'MLB':
        divisions = MLB_DIVISIONS
    elif sport == 'NHL':
        divisions = NHL_DIVISIONS
    else:
        divisions = {}

    result_teams = []
    result_warnings = []

    for abbrev, info in team_mappings.items():
        team_id = generate_canonical_team_id(sport, abbrev)
        venue_name = info.get(venue_field, '')
        home_city = info.get('city', '')
        display_name = info.get('name', '')

        stadium_id, score = fuzzy_match_stadium(
            venue_name, home_city, sport, canonical_stadiums
        )

        if stadium_id is None:
            result_warnings.append(MatchWarning(
                team_canonical_id=team_id,
                team_name=display_name,
                arena_name=venue_name,
                matched_stadium=None,
                issue='No stadium match found',
                confidence=score
            ))
            # Placeholder keeps the team record complete and easy to spot
            stadium_id = f"stadium_unknown_{sport.lower()}_{abbrev.lower()}"
            if verbose:
                print(f" WARNING: {team_id} - no stadium match for '{venue_name}'")
        elif score < 0.8:
            result_warnings.append(MatchWarning(
                team_canonical_id=team_id,
                team_name=display_name,
                arena_name=venue_name,
                matched_stadium=stadium_id,
                issue='Low confidence stadium match',
                confidence=score
            ))
            if verbose:
                print(f" WARNING: {team_id} - low confidence ({score:.2f}) match to {stadium_id}")

        conference, division = divisions.get(abbrev, (None, None))

        result_teams.append(CanonicalTeam(
            canonical_id=team_id,
            name=display_name,
            abbreviation=abbrev,
            sport=sport,
            city=home_city,
            stadium_canonical_id=stadium_id,
            conference_id=conference,
            division_id=division
        ))

        # Confident matches are only echoed in verbose mode
        if verbose and score >= 0.8:
            print(f" {team_id}: {display_name} -> {stadium_id} ({score:.2f})")

    return result_teams, result_warnings
def canonicalize_all_teams(
    canonical_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalTeam], list[MatchWarning]]:
    """
    Canonicalize teams for all sports.

    Runs canonicalize_teams for NBA, MLB and NHL (in that order) and
    concatenates the results.

    Args:
        canonical_stadiums: List of canonical stadium dicts
        verbose: Print a per-sport header and detailed progress

    Returns:
        (all_canonical_teams, all_warnings)
    """
    combined_teams = []
    combined_warnings = []

    for sport, team_map in (('NBA', NBA_TEAMS), ('MLB', MLB_TEAMS), ('NHL', NHL_TEAMS)):
        if verbose:
            print(f"\n{sport}:")

        sport_teams, sport_warnings = canonicalize_teams(
            team_map, sport, canonical_stadiums, verbose
        )
        combined_teams += sport_teams
        combined_warnings += sport_warnings

    return combined_teams, combined_warnings
# =============================================================================
# MAIN
# =============================================================================
def main():
    """
    Command-line entry point for team canonicalization.

    Loads the canonical stadiums JSON, canonicalizes all teams, writes
    teams_canonical.json (and team_matching_warnings.json when there are
    warnings) into the output directory, then prints a per-sport summary.
    """
    cli = argparse.ArgumentParser(description='Canonicalize team data')
    cli.add_argument(
        '--stadiums',
        type=str,
        default='./data/stadiums_canonical.json',
        help='Input canonical stadiums JSON file',
    )
    cli.add_argument(
        '--output',
        type=str,
        default='./data',
        help='Output directory for canonical files',
    )
    cli.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Verbose output',
    )
    options = cli.parse_args()

    stadiums_path = Path(options.stadiums)
    output_dir = Path(options.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load canonical stadiums
    print(f"Loading canonical stadiums from {stadiums_path}...")
    with open(stadiums_path) as stadiums_file:
        canonical_stadiums = json.load(stadiums_file)
    print(f" Loaded {len(canonical_stadiums)} canonical stadiums")

    # Canonicalize teams
    print("\nCanonicalizing teams...")
    canonical_teams, warnings = canonicalize_all_teams(
        canonical_stadiums, verbose=options.verbose
    )
    print(f" Created {len(canonical_teams)} canonical teams")

    if warnings:
        print(f"\n Warnings: {len(warnings)}")
        for warning in warnings:
            print(f" - {warning.team_canonical_id}: {warning.issue} (confidence: {warning.confidence:.2f})")

    # Export
    teams_path = output_dir / 'teams_canonical.json'
    warnings_path = output_dir / 'team_matching_warnings.json'

    team_rows = [asdict(team) for team in canonical_teams]
    with open(teams_path, 'w') as teams_file:
        json.dump(team_rows, teams_file, indent=2)
    print(f"\nExported teams to {teams_path}")

    # The warnings file is only written when there is something to report
    if warnings:
        warning_rows = [asdict(warning) for warning in warnings]
        with open(warnings_path, 'w') as warnings_file:
            json.dump(warning_rows, warnings_file, indent=2)
        print(f"Exported warnings to {warnings_path}")

    # Summary by sport
    print("\nSummary by sport:")
    tally = {}
    for team in canonical_teams:
        if team.sport in tally:
            tally[team.sport] += 1
        else:
            tally[team.sport] = 1
    for sport in sorted(tally):
        print(f" {sport}: {tally[sport]} teams")


if __name__ == '__main__':
    main()

View File

@@ -2,7 +2,15 @@
"""
CloudKit Import Script
======================
Imports JSON data into CloudKit. Run separately from pipeline.
Imports canonical JSON data into CloudKit. Run after canonicalization pipeline.
Expected input files (from canonicalization pipeline):
- stadiums_canonical.json
- teams_canonical.json
- games_canonical.json
- stadium_aliases.json
- league_structure.json
- team_aliases.json
Setup:
1. CloudKit Dashboard > Tokens & Keys > Server-to-Server Keys
@@ -309,12 +317,35 @@ def main():
print(f"Environment: {args.env}\n")
data_dir = Path(args.data_dir)
stadiums = json.load(open(data_dir / 'stadiums.json'))
games = json.load(open(data_dir / 'games.json')) if (data_dir / 'games.json').exists() else []
# Load canonical format files (from canonicalization pipeline)
# Fall back to legacy format for backward compatibility
if (data_dir / 'stadiums_canonical.json').exists():
stadiums = json.load(open(data_dir / 'stadiums_canonical.json'))
use_canonical = True
else:
stadiums = json.load(open(data_dir / 'stadiums.json'))
use_canonical = False
if (data_dir / 'teams_canonical.json').exists():
teams = json.load(open(data_dir / 'teams_canonical.json'))
else:
teams = [] # Legacy: extracted from stadiums
if (data_dir / 'games_canonical.json').exists():
games = json.load(open(data_dir / 'games_canonical.json'))
elif (data_dir / 'games.json').exists():
games = json.load(open(data_dir / 'games.json'))
else:
games = []
league_structure = json.load(open(data_dir / 'league_structure.json')) if (data_dir / 'league_structure.json').exists() else []
team_aliases = json.load(open(data_dir / 'team_aliases.json')) if (data_dir / 'team_aliases.json').exists() else []
stadium_aliases = json.load(open(data_dir / 'stadium_aliases.json')) if (data_dir / 'stadium_aliases.json').exists() else []
print(f"Loaded {len(stadiums)} stadiums, {len(games)} games, {len(league_structure)} league structures, {len(team_aliases)} team aliases, {len(stadium_aliases)} stadium aliases\n")
print(f"Using {'canonical' if use_canonical else 'legacy'} format")
print(f"Loaded {len(stadiums)} stadiums, {len(teams)} teams, {len(games)} games")
print(f"Loaded {len(league_structure)} league structures, {len(team_aliases)} team aliases, {len(stadium_aliases)} stadium aliases\n")
ck = None
if not args.dry_run:
@@ -353,72 +384,135 @@ def main():
import_team_aliases = args.team_aliases_only or args.canonical_only or (not args.stadiums_only and not args.games_only and not args.league_structure_only and not args.stadium_aliases_only)
import_stadium_aliases = args.stadium_aliases_only or args.canonical_only or (not args.stadiums_only and not args.games_only and not args.league_structure_only and not args.team_aliases_only)
# Build stadium UUID lookup (stadium string ID -> UUID)
stadium_uuid_map = {s['id']: deterministic_uuid(s['id']) for s in stadiums}
# Build stadium ID lookup
# Canonical format uses canonical_id, legacy uses id
def get_stadium_id(s):
return s.get('canonical_id', s.get('id', ''))
# Import stadiums & teams
def get_team_id(t):
return t.get('canonical_id', '')
stadium_id_map = {get_stadium_id(s): deterministic_uuid(get_stadium_id(s)) for s in stadiums}
# Import stadiums
if import_stadiums:
print("--- Stadiums ---")
recs = [{
'recordType': 'Stadium', 'recordName': stadium_uuid_map[s['id']],
'fields': {
'stadiumId': {'value': stadium_uuid_map[s['id']]}, 'name': {'value': s['name']},
'city': {'value': s['city']}, 'state': {'value': s.get('state', '')},
'sport': {'value': s['sport']}, 'source': {'value': s.get('source', '')},
'teamAbbrevs': {'value': s.get('team_abbrevs', [])},
**({'location': {'value': {'latitude': s['latitude'], 'longitude': s['longitude']}}}
if s.get('latitude') else {}),
**({'capacity': {'value': s['capacity']}} if s.get('capacity') else {}),
recs = []
for s in stadiums:
stadium_id = get_stadium_id(s)
record_name = deterministic_uuid(stadium_id)
# Canonical format uses primary_team_abbrevs, legacy uses team_abbrevs
team_abbrevs = s.get('primary_team_abbrevs', s.get('team_abbrevs', []))
fields = {
'stadiumId': {'value': record_name},
'canonicalId': {'value': stadium_id}, # Store canonical_id as string
'name': {'value': s['name']},
'city': {'value': s['city']},
'state': {'value': s.get('state', '')},
'sport': {'value': s['sport']},
'source': {'value': s.get('source', 'canonical')},
'teamAbbrevs': {'value': team_abbrevs},
}
} for s in stadiums]
if s.get('latitude'):
fields['location'] = {'value': {'latitude': s['latitude'], 'longitude': s['longitude']}}
if s.get('capacity'):
fields['capacity'] = {'value': s['capacity']}
recs.append({'recordType': 'Stadium', 'recordName': record_name, 'fields': fields})
stats['stadiums'] = import_data(ck, recs, 'stadiums', args.dry_run, args.verbose)
# Import teams (canonical format has dedicated teams file)
if import_teams:
print("--- Teams ---")
teams = {}
for s in stadiums:
for abbr in s.get('team_abbrevs', []):
team_key = f"{s['sport']}_{abbr}" # Match Swift: "{sport.rawValue}_{abbrev}"
if team_key not in teams:
teams[team_key] = {'abbr': abbr, 'city': s['city'], 'sport': s['sport']}
team_uuid = deterministic_uuid(team_key)
team_map[(s['sport'], abbr)] = team_uuid
if teams:
# Canonical format: use teams_canonical.json
recs = []
for t in teams:
team_id = get_team_id(t)
record_name = deterministic_uuid(team_id)
team_map[(t['sport'], t['abbreviation'])] = record_name
recs = [{
'recordType': 'Team', 'recordName': deterministic_uuid(team_key),
'fields': {
'teamId': {'value': deterministic_uuid(team_key)},
'abbreviation': {'value': info['abbr']},
'name': {'value': info['abbr']},
'city': {'value': info['city']},
'sport': {'value': info['sport']},
}
} for team_key, info in teams.items()]
stats['teams'] = import_data(ck, recs, 'teams', args.dry_run, args.verbose)
fields = {
'teamId': {'value': record_name},
'canonicalId': {'value': team_id}, # Store canonical_id as string
'abbreviation': {'value': t['abbreviation']},
'name': {'value': t['name']},
'city': {'value': t['city']},
'sport': {'value': t['sport']},
'stadiumCanonicalId': {'value': t.get('stadium_canonical_id', '')},
}
if t.get('conference_id'):
fields['conferenceId'] = {'value': t['conference_id']}
if t.get('division_id'):
fields['divisionId'] = {'value': t['division_id']}
recs.append({'recordType': 'Team', 'recordName': record_name, 'fields': fields})
stats['teams'] = import_data(ck, recs, 'teams', args.dry_run, args.verbose)
else:
# Legacy format: extract teams from stadiums
teams_dict = {}
for s in stadiums:
team_abbrevs = s.get('primary_team_abbrevs', s.get('team_abbrevs', []))
for abbr in team_abbrevs:
team_key = f"{s['sport']}_{abbr}"
if team_key not in teams_dict:
teams_dict[team_key] = {'abbr': abbr, 'city': s['city'], 'sport': s['sport']}
team_uuid = deterministic_uuid(team_key)
team_map[(s['sport'], abbr)] = team_uuid
recs = [{
'recordType': 'Team', 'recordName': deterministic_uuid(team_key),
'fields': {
'teamId': {'value': deterministic_uuid(team_key)},
'canonicalId': {'value': team_key},
'abbreviation': {'value': info['abbr']},
'name': {'value': info['abbr']},
'city': {'value': info['city']},
'sport': {'value': info['sport']},
}
} for team_key, info in teams_dict.items()]
stats['teams'] = import_data(ck, recs, 'teams', args.dry_run, args.verbose)
# Import games
if import_games and games:
# Detect canonical game format (has canonical_id field)
use_canonical_games = games and 'canonical_id' in games[0]
# Rebuild team_map if only importing games (--games-only flag)
if not team_map:
for s in stadiums:
for abbr in s.get('team_abbrevs', []):
team_key = f"{s['sport']}_{abbr}"
team_map[(s['sport'], abbr)] = deterministic_uuid(team_key)
if teams:
# Canonical format: use teams_canonical.json
for t in teams:
team_id = get_team_id(t)
team_map[(t['sport'], t['abbreviation'])] = deterministic_uuid(team_id)
else:
# Legacy format: extract from stadiums
for s in stadiums:
team_abbrevs = s.get('primary_team_abbrevs', s.get('team_abbrevs', []))
for abbr in team_abbrevs:
team_key = f"{s['sport']}_{abbr}"
team_map[(s['sport'], abbr)] = deterministic_uuid(team_key)
# Build team -> stadium map for stadiumRef
# Build team -> stadium map for stadiumRef (legacy format needs this)
team_stadium_map = {}
for s in stadiums:
stadium_uuid = stadium_uuid_map[s['id']]
for abbr in s.get('team_abbrevs', []):
stadium_id = get_stadium_id(s)
stadium_uuid = stadium_id_map[stadium_id]
team_abbrevs = s.get('primary_team_abbrevs', s.get('team_abbrevs', []))
for abbr in team_abbrevs:
team_stadium_map[(s['sport'], abbr)] = stadium_uuid
print("--- Games ---")
print(f" Using {'canonical' if use_canonical_games else 'legacy'} game format")
# Deduplicate games by ID
# Deduplicate games by ID (canonical_id or id)
seen_ids = set()
unique_games = []
for g in games:
if g['id'] not in seen_ids:
seen_ids.add(g['id'])
game_id = g.get('canonical_id', g.get('id', ''))
if game_id not in seen_ids:
seen_ids.add(game_id)
unique_games.append(g)
if len(unique_games) < len(games):
@@ -426,13 +520,20 @@ def main():
recs = []
for g in unique_games:
game_uuid = deterministic_uuid(g['id'])
# Get game ID (canonical or legacy)
game_id = g.get('canonical_id', g.get('id', ''))
game_uuid = deterministic_uuid(game_id)
sport = g['sport']
fields = {
'gameId': {'value': game_uuid}, 'sport': {'value': sport},
'season': {'value': g.get('season', '')}, 'source': {'value': g.get('source', '')},
'gameId': {'value': game_uuid},
'canonicalId': {'value': game_id}, # Store canonical_id as string
'sport': {'value': sport},
'season': {'value': g.get('season', '')},
'source': {'value': g.get('source', 'canonical' if use_canonical_games else '')},
}
# Parse date/time
if g.get('date'):
try:
# Parse time like "7:30p" or "10:00a"
@@ -455,20 +556,38 @@ def main():
fields['dateTime'] = {'value': int(dt.timestamp() * 1000), 'type': 'TIMESTAMP'}
except Exception as e:
if args.verbose:
print(f" Warning: Failed to parse date/time for {g['id']}: {e}")
print(f" Warning: Failed to parse date/time for {game_id}: {e}")
# Team references
if use_canonical_games:
# Canonical format: derive team UUIDs directly from the canonical team IDs (e.g. team_nba_atl)
home_team_canonical_id = g.get('home_team_canonical_id', '')
away_team_canonical_id = g.get('away_team_canonical_id', '')
home_team_uuid = deterministic_uuid(home_team_canonical_id)
away_team_uuid = deterministic_uuid(away_team_canonical_id)
else:
# Legacy format: use abbreviations
home_team_key = f"{sport}_{g.get('home_team_abbrev', '')}"
away_team_key = f"{sport}_{g.get('away_team_abbrev', '')}"
home_team_uuid = deterministic_uuid(home_team_key)
away_team_uuid = deterministic_uuid(away_team_key)
# Team references - use (sport, abbrev) tuple for lookup
home_team_key = f"{sport}_{g.get('home_team_abbrev', '')}"
away_team_key = f"{sport}_{g.get('away_team_abbrev', '')}"
home_team_uuid = deterministic_uuid(home_team_key)
away_team_uuid = deterministic_uuid(away_team_key)
fields['homeTeamRef'] = {'value': {'recordName': home_team_uuid, 'action': 'NONE'}}
fields['awayTeamRef'] = {'value': {'recordName': away_team_uuid, 'action': 'NONE'}}
# Stadium reference - look up by home team abbrev
stadium_uuid = team_stadium_map.get((sport, g.get('home_team_abbrev', '')))
if stadium_uuid:
fields['stadiumRef'] = {'value': {'recordName': stadium_uuid, 'action': 'NONE'}}
# Stadium reference
if use_canonical_games and g.get('stadium_canonical_id'):
# Canonical format: use stadium_canonical_id directly
stadium_canonical_id = g['stadium_canonical_id']
stadium_uuid = stadium_id_map.get(stadium_canonical_id)
if stadium_uuid:
fields['stadiumRef'] = {'value': {'recordName': stadium_uuid, 'action': 'NONE'}}
fields['stadiumCanonicalId'] = {'value': stadium_canonical_id}
else:
# Legacy format: look up by home team abbrev
stadium_uuid = team_stadium_map.get((sport, g.get('home_team_abbrev', '')))
if stadium_uuid:
fields['stadiumRef'] = {'value': {'recordName': stadium_uuid, 'action': 'NONE'}}
recs.append({'recordType': 'Game', 'recordName': game_uuid, 'fields': fields})
@@ -554,9 +673,14 @@ def main():
fields['validUntil'] = {'value': int(dt.timestamp() * 1000), 'type': 'TIMESTAMP'}
except:
pass
# Extract sport from stadium_canonical_id (e.g., "stadium_nba_td_garden" -> "nba")
# This makes record names unique for shared venues (TD Garden has NBA and NHL entries)
stadium_id = sa['stadium_canonical_id']
sport = stadium_id.split('_')[1] if '_' in stadium_id else 'unknown'
record_name = f"{sport}_{sa['alias_name'].lower()}"
recs.append({
'recordType': 'StadiumAlias',
'recordName': sa['alias_name'].lower(), # Use alias_name as recordName (unique key)
'recordName': record_name,
'fields': fields
})
stats['stadium_aliases'] = import_data(ck, recs, 'stadium aliases', args.dry_run, args.verbose)

View File

@@ -0,0 +1,13 @@
{
"is_valid": true,
"error_count": 0,
"warning_count": 0,
"summary": {
"stadiums": 92,
"teams": 92,
"games": 4972,
"aliases": 130,
"by_category": {}
},
"errors": []
}

View File

@@ -554,7 +554,6 @@ nba_202526_lal_sas_0107,NBA,2025-26,2026-01-07,9:30p,San Antonio Spurs,Los Angel
nba_202526_mil_gsw_0107,NBA,2025-26,2026-01-07,10:00p,Golden State Warriors,Milwaukee Bucks,GSW,MIL,Chase Center,basketball-reference.com,False,
nba_202526_hou_por_0107,NBA,2025-26,2026-01-07,10:00p,Portland Trail Blazers,Houston Rockets,POR,HOU,Moda Center,basketball-reference.com,False,
nba_202526_ind_cho_0108,NBA,2025-26,2026-01-08,7:00p,Charlotte Hornets,Indiana Pacers,CHO,IND,Spectrum Center,basketball-reference.com,False,
nba_202526_mia_chi_0108,NBA,2025-26,2026-01-08,8:00p,Chicago Bulls,Miami Heat,CHI,MIA,United Center,basketball-reference.com,False,
nba_202526_cle_min_0108,NBA,2025-26,2026-01-08,8:00p,Minnesota Timberwolves,Cleveland Cavaliers,MIN,CLE,Target Center,basketball-reference.com,False,
nba_202526_dal_uta_0108,NBA,2025-26,2026-01-08,9:00p,Utah Jazz,Dallas Mavericks,UTA,DAL,Delta Center,basketball-reference.com,False,
nba_202526_tor_bos_0109,NBA,2025-26,2026-01-09,7:00p,Boston Celtics,Toronto Raptors,BOS,TOR,TD Garden,basketball-reference.com,False,
@@ -612,9 +611,9 @@ nba_202526_uta_dal_0115,NBA,2025-26,2026-01-15,8:30p,Dallas Mavericks,Utah Jazz,
nba_202526_nyk_gsw_0115,NBA,2025-26,2026-01-15,10:00p,Golden State Warriors,New York Knicks,GSW,NYK,Chase Center,basketball-reference.com,False,
nba_202526_atl_por_0115,NBA,2025-26,2026-01-15,10:00p,Portland Trail Blazers,Atlanta Hawks,POR,ATL,Moda Center,basketball-reference.com,False,
nba_202526_cho_lal_0115,NBA,2025-26,2026-01-15,10:30p,Los Angeles Lakers,Charlotte Hornets,LAL,CHO,Crypto.com Arena,basketball-reference.com,False,
nba_202526_chi_brk_0116,NBA,2025-26,2026-01-16,7:00p,Brooklyn Nets,Chicago Bulls,BRK,CHI,Barclays Center,basketball-reference.com,False,
nba_202526_nop_ind_0116,NBA,2025-26,2026-01-16,7:00p,Indiana Pacers,New Orleans Pelicans,IND,NOP,Gainbridge Fieldhouse,basketball-reference.com,False,
nba_202526_cle_phi_0116,NBA,2025-26,2026-01-16,7:00p,Philadelphia 76ers,Cleveland Cavaliers,PHI,CLE,Xfinity Mobile Arena,basketball-reference.com,False,
nba_202526_chi_brk_0116,NBA,2025-26,2026-01-16,7:30p,Brooklyn Nets,Chicago Bulls,BRK,CHI,Barclays Center,basketball-reference.com,False,
nba_202526_lac_tor_0116,NBA,2025-26,2026-01-16,7:30p,Toronto Raptors,Los Angeles Clippers,TOR,LAC,Scotiabank Arena,basketball-reference.com,False,
nba_202526_min_hou_0116,NBA,2025-26,2026-01-16,9:30p,Houston Rockets,Minnesota Timberwolves,HOU,MIN,Toyota Center,basketball-reference.com,False,
nba_202526_was_sac_0116,NBA,2025-26,2026-01-16,10:00p,Sacramento Kings,Washington Wizards,SAC,WAS,Golden 1 Center,basketball-reference.com,False,
@@ -722,9 +721,9 @@ nba_202526_sac_bos_0130,NBA,2025-26,2026-01-30,7:30p,Boston Celtics,Sacramento K
nba_202526_mem_nop_0130,NBA,2025-26,2026-01-30,7:30p,New Orleans Pelicans,Memphis Grizzlies,NOP,MEM,Smoothie King Center,basketball-reference.com,False,
nba_202526_por_nyk_0130,NBA,2025-26,2026-01-30,7:30p,New York Knicks,Portland Trail Blazers,NYK,POR,Madison Square Garden (IV),basketball-reference.com,False,
nba_202526_chi_mia_0130,NBA,2025-26,2026-01-30,8:00p,Miami Heat,Chicago Bulls,MIA,CHI,Kaseya Center,basketball-reference.com,False,
nba_202526_lac_den_0130,NBA,2025-26,2026-01-30,9:00p,Denver Nuggets,Los Angeles Clippers,DEN,LAC,Ball Arena,basketball-reference.com,False,
nba_202526_cle_pho_0130,NBA,2025-26,2026-01-30,9:00p,Phoenix Suns,Cleveland Cavaliers,PHO,CLE,Mortgage Matchup Center,basketball-reference.com,False,
nba_202526_brk_uta_0130,NBA,2025-26,2026-01-30,9:30p,Utah Jazz,Brooklyn Nets,UTA,BRK,Delta Center,basketball-reference.com,False,
nba_202526_lac_den_0130,NBA,2025-26,2026-01-30,10:00p,Denver Nuggets,Los Angeles Clippers,DEN,LAC,Ball Arena,basketball-reference.com,False,
nba_202526_det_gsw_0130,NBA,2025-26,2026-01-30,10:00p,Golden State Warriors,Detroit Pistons,GSW,DET,Chase Center,basketball-reference.com,False,
nba_202526_sas_cho_0131,NBA,2025-26,2026-01-31,3:00p,Charlotte Hornets,San Antonio Spurs,CHO,SAS,Spectrum Center,basketball-reference.com,False,
nba_202526_atl_ind_0131,NBA,2025-26,2026-01-31,7:00p,Indiana Pacers,Atlanta Hawks,IND,ATL,Gainbridge Fieldhouse,basketball-reference.com,False,
@@ -2707,7 +2706,7 @@ mlb_2026_tb_bos_0718,MLB,2026,2026-07-18,20:10,Boston Red Sox,Tampa Bay Rays,BOS
mlb_2026_tex_atl_0718,MLB,2026,2026-07-18,20:10,Atlanta Braves,Texas Rangers,ATL,TEX,Truist Park,statsapi.mlb.com,False,
mlb_2026_mia_mil_0718,MLB,2026,2026-07-18,20:10,Milwaukee Brewers,Miami Marlins,MIL,MIA,American Family Field,statsapi.mlb.com,False,
mlb_2026_sf_sea_0718,MLB,2026,2026-07-18,23:15,Seattle Mariners,San Francisco Giants,SEA,SF,T-Mobile Park,statsapi.mlb.com,False,
mlb_2026_lad_nyy_0718,MLB,2026,2026-07-18,23:15,New York Yankees,Los Angeles Dodgers,NYY,LAD,Yankee Stadium,statsapi.mlb.com,False,
mlb_2026_lad_nyy_0718,MLB,2026,2026-07-18,00:08,New York Yankees,Los Angeles Dodgers,NYY,LAD,Yankee Stadium,statsapi.mlb.com,False,
mlb_2026_wsh_ath_0718,MLB,2026,2026-07-18,02:05,Athletics,Washington Nationals,ATH,WSH,Sutter Health Park,statsapi.mlb.com,False,
mlb_2026_det_laa_0718,MLB,2026,2026-07-18,02:07,Los Angeles Angels,Detroit Tigers,LAA,DET,Angel Stadium,statsapi.mlb.com,False,
mlb_2026_pit_cle_0718,MLB,2026,2026-07-18,07:33,Cleveland Guardians,Pittsburgh Pirates,CLE,PIT,Progressive Field,statsapi.mlb.com,False,
@@ -3520,7 +3519,7 @@ mlb_2026_mil_pit_0917,MLB,2026,2026-09-17,16:35,Pittsburgh Pirates,Milwaukee Bre
mlb_2026_lad_cin_0917,MLB,2026,2026-09-17,16:40,Cincinnati Reds,Los Angeles Dodgers,CIN,LAD,Great American Ball Park,statsapi.mlb.com,False,
mlb_2026_ath_tb_0917,MLB,2026,2026-09-17,17:10,Tampa Bay Rays,Athletics,TB,ATH,Tropicana Field,statsapi.mlb.com,False,
mlb_2026_sd_col_0917,MLB,2026,2026-09-17,19:10,Colorado Rockies,San Diego Padres,COL,SD,Coors Field,statsapi.mlb.com,False,
mlb_2026_phi_nym_0917,MLB,2026,2026-09-17,23:10,New York Mets,Philadelphia Phillies,NYM,PHI,Citi Field,statsapi.mlb.com,False,
mlb_2026_phi_nym_0917,MLB,2026,2026-09-17,23:15,New York Mets,Philadelphia Phillies,NYM,PHI,Citi Field,statsapi.mlb.com,False,
mlb_2026_det_cws_0917,MLB,2026,2026-09-17,23:40,Chicago White Sox,Detroit Tigers,CWS,DET,Rate Field,statsapi.mlb.com,False,
mlb_2026_bos_tex_0917,MLB,2026,2026-09-17,00:05,Texas Rangers,Boston Red Sox,TEX,BOS,Globe Life Field,statsapi.mlb.com,False,
mlb_2026_min_laa_0917,MLB,2026,2026-09-17,01:38,Los Angeles Angels,Minnesota Twins,LAA,MIN,Angel Stadium,statsapi.mlb.com,False,
Can't render this file because it is too large.

View File

@@ -8324,21 +8324,6 @@
"is_playoff": false,
"broadcast": null
},
{
"id": "nba_202526_mia_chi_0108",
"sport": "NBA",
"season": "2025-26",
"date": "2026-01-08",
"time": "8:00p",
"home_team": "Chicago Bulls",
"away_team": "Miami Heat",
"home_team_abbrev": "CHI",
"away_team_abbrev": "MIA",
"venue": "United Center",
"source": "basketball-reference.com",
"is_playoff": false,
"broadcast": null
},
{
"id": "nba_202526_cle_min_0108",
"sport": "NBA",
@@ -9194,21 +9179,6 @@
"is_playoff": false,
"broadcast": null
},
{
"id": "nba_202526_chi_brk_0116",
"sport": "NBA",
"season": "2025-26",
"date": "2026-01-16",
"time": "7:00p",
"home_team": "Brooklyn Nets",
"away_team": "Chicago Bulls",
"home_team_abbrev": "BRK",
"away_team_abbrev": "CHI",
"venue": "Barclays Center",
"source": "basketball-reference.com",
"is_playoff": false,
"broadcast": null
},
{
"id": "nba_202526_nop_ind_0116",
"sport": "NBA",
@@ -9239,6 +9209,21 @@
"is_playoff": false,
"broadcast": null
},
{
"id": "nba_202526_chi_brk_0116",
"sport": "NBA",
"season": "2025-26",
"date": "2026-01-16",
"time": "7:30p",
"home_team": "Brooklyn Nets",
"away_team": "Chicago Bulls",
"home_team_abbrev": "BRK",
"away_team_abbrev": "CHI",
"venue": "Barclays Center",
"source": "basketball-reference.com",
"is_playoff": false,
"broadcast": null
},
{
"id": "nba_202526_lac_tor_0116",
"sport": "NBA",
@@ -10844,6 +10829,21 @@
"is_playoff": false,
"broadcast": null
},
{
"id": "nba_202526_lac_den_0130",
"sport": "NBA",
"season": "2025-26",
"date": "2026-01-30",
"time": "9:00p",
"home_team": "Denver Nuggets",
"away_team": "Los Angeles Clippers",
"home_team_abbrev": "DEN",
"away_team_abbrev": "LAC",
"venue": "Ball Arena",
"source": "basketball-reference.com",
"is_playoff": false,
"broadcast": null
},
{
"id": "nba_202526_cle_pho_0130",
"sport": "NBA",
@@ -10874,21 +10874,6 @@
"is_playoff": false,
"broadcast": null
},
{
"id": "nba_202526_lac_den_0130",
"sport": "NBA",
"season": "2025-26",
"date": "2026-01-30",
"time": "10:00p",
"home_team": "Denver Nuggets",
"away_team": "Los Angeles Clippers",
"home_team_abbrev": "DEN",
"away_team_abbrev": "LAC",
"venue": "Ball Arena",
"source": "basketball-reference.com",
"is_playoff": false,
"broadcast": null
},
{
"id": "nba_202526_det_gsw_0130",
"sport": "NBA",
@@ -40624,7 +40609,7 @@
"sport": "MLB",
"season": "2026",
"date": "2026-07-18",
"time": "23:15",
"time": "00:08",
"home_team": "New York Yankees",
"away_team": "Los Angeles Dodgers",
"home_team_abbrev": "NYY",
@@ -52819,7 +52804,7 @@
"sport": "MLB",
"season": "2026",
"date": "2026-09-17",
"time": "23:10",
"time": "23:15",
"home_team": "New York Mets",
"away_team": "Philadelphia Phillies",
"home_team_abbrev": "NYM",

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,782 @@
[
{
"alias_name": "state farm arena",
"stadium_canonical_id": "stadium_nba_state_farm_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "td garden",
"stadium_canonical_id": "stadium_nba_td_garden",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "barclays center",
"stadium_canonical_id": "stadium_nba_barclays_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "spectrum center",
"stadium_canonical_id": "stadium_nba_spectrum_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "united center",
"stadium_canonical_id": "stadium_nba_united_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "rocket mortgage fieldhouse",
"stadium_canonical_id": "stadium_nba_rocket_mortgage_fieldhouse",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "american airlines center",
"stadium_canonical_id": "stadium_nba_american_airlines_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "ball arena",
"stadium_canonical_id": "stadium_nba_ball_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "little caesars arena",
"stadium_canonical_id": "stadium_nba_little_caesars_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "chase center",
"stadium_canonical_id": "stadium_nba_chase_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "toyota center",
"stadium_canonical_id": "stadium_nba_toyota_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "gainbridge fieldhouse",
"stadium_canonical_id": "stadium_nba_gainbridge_fieldhouse",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "intuit dome",
"stadium_canonical_id": "stadium_nba_intuit_dome",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "crypto.com arena",
"stadium_canonical_id": "stadium_nba_cryptocom_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "cryptocom arena",
"stadium_canonical_id": "stadium_nba_cryptocom_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "fedexforum",
"stadium_canonical_id": "stadium_nba_fedexforum",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "kaseya center",
"stadium_canonical_id": "stadium_nba_kaseya_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "fiserv forum",
"stadium_canonical_id": "stadium_nba_fiserv_forum",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "target center",
"stadium_canonical_id": "stadium_nba_target_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "smoothie king center",
"stadium_canonical_id": "stadium_nba_smoothie_king_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "madison square garden",
"stadium_canonical_id": "stadium_nba_madison_square_garden",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "paycom center",
"stadium_canonical_id": "stadium_nba_paycom_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "kia center",
"stadium_canonical_id": "stadium_nba_kia_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "wells fargo center",
"stadium_canonical_id": "stadium_nba_wells_fargo_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "footprint center",
"stadium_canonical_id": "stadium_nba_footprint_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "moda center",
"stadium_canonical_id": "stadium_nba_moda_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "golden 1 center",
"stadium_canonical_id": "stadium_nba_golden_1_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "frost bank center",
"stadium_canonical_id": "stadium_nba_frost_bank_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "scotiabank arena",
"stadium_canonical_id": "stadium_nba_scotiabank_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "delta center",
"stadium_canonical_id": "stadium_nba_delta_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "capital one arena",
"stadium_canonical_id": "stadium_nba_capital_one_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "chase field",
"stadium_canonical_id": "stadium_mlb_chase_field",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "truist park",
"stadium_canonical_id": "stadium_mlb_truist_park",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "oriole park at camden yards",
"stadium_canonical_id": "stadium_mlb_oriole_park_at_camden_yards",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "fenway park",
"stadium_canonical_id": "stadium_mlb_fenway_park",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "wrigley field",
"stadium_canonical_id": "stadium_mlb_wrigley_field",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "guaranteed rate field",
"stadium_canonical_id": "stadium_mlb_guaranteed_rate_field",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "great american ball park",
"stadium_canonical_id": "stadium_mlb_great_american_ball_park",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "progressive field",
"stadium_canonical_id": "stadium_mlb_progressive_field",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "coors field",
"stadium_canonical_id": "stadium_mlb_coors_field",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "comerica park",
"stadium_canonical_id": "stadium_mlb_comerica_park",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "minute maid park",
"stadium_canonical_id": "stadium_mlb_minute_maid_park",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "kauffman stadium",
"stadium_canonical_id": "stadium_mlb_kauffman_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "angel stadium",
"stadium_canonical_id": "stadium_mlb_angel_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "dodger stadium",
"stadium_canonical_id": "stadium_mlb_dodger_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "loandepot park",
"stadium_canonical_id": "stadium_mlb_loandepot_park",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "american family field",
"stadium_canonical_id": "stadium_mlb_american_family_field",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "target field",
"stadium_canonical_id": "stadium_mlb_target_field",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "citi field",
"stadium_canonical_id": "stadium_mlb_citi_field",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "yankee stadium",
"stadium_canonical_id": "stadium_mlb_yankee_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "sutter health park",
"stadium_canonical_id": "stadium_mlb_sutter_health_park",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "citizens bank park",
"stadium_canonical_id": "stadium_mlb_citizens_bank_park",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "pnc park",
"stadium_canonical_id": "stadium_mlb_pnc_park",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "petco park",
"stadium_canonical_id": "stadium_mlb_petco_park",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "oracle park",
"stadium_canonical_id": "stadium_mlb_oracle_park",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "t-mobile park",
"stadium_canonical_id": "stadium_mlb_tmobile_park",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "tmobile park",
"stadium_canonical_id": "stadium_mlb_tmobile_park",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "busch stadium",
"stadium_canonical_id": "stadium_mlb_busch_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "tropicana field",
"stadium_canonical_id": "stadium_mlb_tropicana_field",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "globe life field",
"stadium_canonical_id": "stadium_mlb_globe_life_field",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "rogers centre",
"stadium_canonical_id": "stadium_mlb_rogers_centre",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "nationals park",
"stadium_canonical_id": "stadium_mlb_nationals_park",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "honda center",
"stadium_canonical_id": "stadium_nhl_honda_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "delta center",
"stadium_canonical_id": "stadium_nhl_delta_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "td garden",
"stadium_canonical_id": "stadium_nhl_td_garden",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "keybank center",
"stadium_canonical_id": "stadium_nhl_keybank_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "scotiabank saddledome",
"stadium_canonical_id": "stadium_nhl_scotiabank_saddledome",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "pnc arena",
"stadium_canonical_id": "stadium_nhl_pnc_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "united center",
"stadium_canonical_id": "stadium_nhl_united_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "ball arena",
"stadium_canonical_id": "stadium_nhl_ball_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "nationwide arena",
"stadium_canonical_id": "stadium_nhl_nationwide_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "american airlines center",
"stadium_canonical_id": "stadium_nhl_american_airlines_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "little caesars arena",
"stadium_canonical_id": "stadium_nhl_little_caesars_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "rogers place",
"stadium_canonical_id": "stadium_nhl_rogers_place",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "amerant bank arena",
"stadium_canonical_id": "stadium_nhl_amerant_bank_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "crypto.com arena",
"stadium_canonical_id": "stadium_nhl_cryptocom_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "cryptocom arena",
"stadium_canonical_id": "stadium_nhl_cryptocom_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "xcel energy center",
"stadium_canonical_id": "stadium_nhl_xcel_energy_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "bell centre",
"stadium_canonical_id": "stadium_nhl_bell_centre",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "bridgestone arena",
"stadium_canonical_id": "stadium_nhl_bridgestone_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "prudential center",
"stadium_canonical_id": "stadium_nhl_prudential_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "ubs arena",
"stadium_canonical_id": "stadium_nhl_ubs_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "madison square garden",
"stadium_canonical_id": "stadium_nhl_madison_square_garden",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "canadian tire centre",
"stadium_canonical_id": "stadium_nhl_canadian_tire_centre",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "wells fargo center",
"stadium_canonical_id": "stadium_nhl_wells_fargo_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "ppg paints arena",
"stadium_canonical_id": "stadium_nhl_ppg_paints_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "sap center",
"stadium_canonical_id": "stadium_nhl_sap_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "climate pledge arena",
"stadium_canonical_id": "stadium_nhl_climate_pledge_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "enterprise center",
"stadium_canonical_id": "stadium_nhl_enterprise_center",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "amalie arena",
"stadium_canonical_id": "stadium_nhl_amalie_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "scotiabank arena",
"stadium_canonical_id": "stadium_nhl_scotiabank_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "rogers arena",
"stadium_canonical_id": "stadium_nhl_rogers_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "t-mobile arena",
"stadium_canonical_id": "stadium_nhl_tmobile_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "tmobile arena",
"stadium_canonical_id": "stadium_nhl_tmobile_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "capital one arena",
"stadium_canonical_id": "stadium_nhl_capital_one_arena",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "canada life centre",
"stadium_canonical_id": "stadium_nhl_canada_life_centre",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "daikin park",
"stadium_canonical_id": "stadium_mlb_minute_maid_park",
"valid_from": "2025-01-01",
"valid_until": null
},
{
"alias_name": "enron field",
"stadium_canonical_id": "stadium_mlb_minute_maid_park",
"valid_from": "2000-04-01",
"valid_until": "2002-02-28"
},
{
"alias_name": "astros field",
"stadium_canonical_id": "stadium_mlb_minute_maid_park",
"valid_from": "2002-03-01",
"valid_until": "2002-06-04"
},
{
"alias_name": "rate field",
"stadium_canonical_id": "stadium_mlb_guaranteed_rate_field",
"valid_from": "2024-01-01",
"valid_until": null
},
{
"alias_name": "us cellular field",
"stadium_canonical_id": "stadium_mlb_guaranteed_rate_field",
"valid_from": "2003-01-01",
"valid_until": "2016-08-24"
},
{
"alias_name": "comiskey park ii",
"stadium_canonical_id": "stadium_mlb_guaranteed_rate_field",
"valid_from": "1991-04-01",
"valid_until": "2002-12-31"
},
{
"alias_name": "new comiskey park",
"stadium_canonical_id": "stadium_mlb_guaranteed_rate_field",
"valid_from": "1991-04-01",
"valid_until": "2002-12-31"
},
{
"alias_name": "suntrust park",
"stadium_canonical_id": "stadium_mlb_truist_park",
"valid_from": "2017-04-01",
"valid_until": "2020-01-13"
},
{
"alias_name": "jacobs field",
"stadium_canonical_id": "stadium_mlb_progressive_field",
"valid_from": "1994-04-01",
"valid_until": "2008-01-10"
},
{
"alias_name": "the jake",
"stadium_canonical_id": "stadium_mlb_progressive_field",
"valid_from": "1994-04-01",
"valid_until": "2008-01-10"
},
{
"alias_name": "miller park",
"stadium_canonical_id": "stadium_mlb_american_family_field",
"valid_from": "2001-04-01",
"valid_until": "2020-12-31"
},
{
"alias_name": "skydome",
"stadium_canonical_id": "stadium_mlb_rogers_centre",
"valid_from": "1989-06-01",
"valid_until": "2005-02-01"
},
{
"alias_name": "marlins park",
"stadium_canonical_id": "stadium_mlb_loandepot_park",
"valid_from": "2012-04-01",
"valid_until": "2021-03-31"
},
{
"alias_name": "att park",
"stadium_canonical_id": "stadium_mlb_oracle_park",
"valid_from": "2006-01-01",
"valid_until": "2019-01-08"
},
{
"alias_name": "sbc park",
"stadium_canonical_id": "stadium_mlb_oracle_park",
"valid_from": "2004-01-01",
"valid_until": "2005-12-31"
},
{
"alias_name": "pac bell park",
"stadium_canonical_id": "stadium_mlb_oracle_park",
"valid_from": "2000-04-01",
"valid_until": "2003-12-31"
},
{
"alias_name": "choctaw stadium",
"stadium_canonical_id": "stadium_mlb_globe_life_field",
"valid_from": "2020-01-01",
"valid_until": null
},
{
"alias_name": "philips arena",
"stadium_canonical_id": "stadium_nba_state_farm_arena",
"valid_from": "1999-09-01",
"valid_until": "2018-06-25"
},
{
"alias_name": "ftx arena",
"stadium_canonical_id": "stadium_nba_kaseya_center",
"valid_from": "2021-06-01",
"valid_until": "2023-03-31"
},
{
"alias_name": "american airlines arena",
"stadium_canonical_id": "stadium_nba_kaseya_center",
"valid_from": "1999-12-01",
"valid_until": "2021-05-31"
},
{
"alias_name": "bankers life fieldhouse",
"stadium_canonical_id": "stadium_nba_gainbridge_fieldhouse",
"valid_from": "2011-01-01",
"valid_until": "2021-12-31"
},
{
"alias_name": "conseco fieldhouse",
"stadium_canonical_id": "stadium_nba_gainbridge_fieldhouse",
"valid_from": "1999-11-01",
"valid_until": "2010-12-31"
},
{
"alias_name": "quicken loans arena",
"stadium_canonical_id": "stadium_nba_rocket_mortgage_fieldhouse",
"valid_from": "2005-08-01",
"valid_until": "2019-08-08"
},
{
"alias_name": "gund arena",
"stadium_canonical_id": "stadium_nba_rocket_mortgage_fieldhouse",
"valid_from": "1994-10-01",
"valid_until": "2005-07-31"
},
{
"alias_name": "amway center",
"stadium_canonical_id": "stadium_nba_kia_center",
"valid_from": "2010-10-01",
"valid_until": "2023-07-12"
},
{
"alias_name": "att center",
"stadium_canonical_id": "stadium_nba_frost_bank_center",
"valid_from": "2002-10-01",
"valid_until": "2023-10-01"
},
{
"alias_name": "vivint arena",
"stadium_canonical_id": "stadium_nba_delta_center",
"valid_from": "2020-12-01",
"valid_until": "2023-07-01"
},
{
"alias_name": "vivint smart home arena",
"stadium_canonical_id": "stadium_nba_delta_center",
"valid_from": "2015-11-01",
"valid_until": "2020-11-30"
},
{
"alias_name": "energysolutions arena",
"stadium_canonical_id": "stadium_nba_delta_center",
"valid_from": "2006-11-01",
"valid_until": "2015-10-31"
},
{
"alias_name": "fla live arena",
"stadium_canonical_id": "stadium_nhl_amerant_bank_arena",
"valid_from": "2021-10-01",
"valid_until": "2024-05-31"
},
{
"alias_name": "bb&t center",
"stadium_canonical_id": "stadium_nhl_amerant_bank_arena",
"valid_from": "2012-06-01",
"valid_until": "2021-09-30"
},
{
"alias_name": "bankatlantic center",
"stadium_canonical_id": "stadium_nhl_amerant_bank_arena",
"valid_from": "2005-10-01",
"valid_until": "2012-05-31"
},
{
"alias_name": "keyarena",
"stadium_canonical_id": "stadium_nhl_climate_pledge_arena",
"valid_from": "1995-01-01",
"valid_until": "2018-10-01"
},
{
"alias_name": "seattle center coliseum",
"stadium_canonical_id": "stadium_nhl_climate_pledge_arena",
"valid_from": "1962-01-01",
"valid_until": "1994-12-31"
}
]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,412 @@
#!/usr/bin/env python3
"""
SportsTime Canonicalization Pipeline
====================================
Master script that orchestrates all data canonicalization steps.
This is the NEW pipeline that performs local identity resolution
BEFORE any CloudKit upload.
Pipeline Stages:
1. SCRAPE: Fetch raw data from web sources
2. CANONICALIZE STADIUMS: Generate canonical stadium IDs and aliases
3. CANONICALIZE TEAMS: Match teams to stadiums, generate canonical IDs
4. CANONICALIZE GAMES: Resolve all references, generate canonical IDs
5. VALIDATE: Verify all data is internally consistent
6. (Optional) UPLOAD: CloudKit upload (separate script)
Usage:
python run_canonicalization_pipeline.py # Full pipeline
python run_canonicalization_pipeline.py --season 2026 # Specify season
python run_canonicalization_pipeline.py --skip-scrape # Use existing raw data
python run_canonicalization_pipeline.py --verbose # Detailed output
"""
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass, asdict
# Import pipeline components
from scrape_schedules import (
scrape_nba_basketball_reference,
scrape_mlb_statsapi,
scrape_nhl_hockey_reference,
generate_stadiums_from_teams,
assign_stable_ids,
export_to_json,
)
from canonicalize_stadiums import (
canonicalize_stadiums,
add_historical_aliases,
deduplicate_aliases,
)
from canonicalize_teams import canonicalize_all_teams
from canonicalize_games import canonicalize_games
from validate_canonical import validate_canonical_data
@dataclass
class PipelineResult:
    """Statistics returned by ``run_pipeline`` after a full pipeline run."""

    # False only when validation ran and reported the data as invalid.
    success: bool

    # Number of canonical records produced by each stage.
    stadiums_count: int
    teams_count: int
    games_count: int
    aliases_count: int

    # Validation totals; both are 0 when validation was skipped.
    validation_errors: int
    validation_warnings: int

    # Wall-clock run time and the directory the output files were written to.
    duration_seconds: float
    output_dir: str
def print_header(text: str):
    """Print *text* as a banner: a blank line, then the text between two 70-character rules."""
    rule = "=" * 70
    # One print call; sep="\n" yields the same four lines as separate prints.
    print("", rule, f" {text}", rule, sep="\n")
def print_section(text: str):
    """Print *text* as a sub-section heading preceded by a blank line."""
    print(f"\n--- {text} ---")
def _write_json(path, payload):
    """Write *payload* to *path* as 2-space-indented UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2)


def _read_json(path):
    """Load and return the JSON document stored at *path* (read as UTF-8)."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def _to_jsonable(item):
    """Return *item* as a dict when it is a dataclass instance, otherwise unchanged."""
    from dataclasses import is_dataclass

    # is_dataclass() is also true for dataclass *types*; only convert instances.
    if is_dataclass(item) and not isinstance(item, type):
        return asdict(item)
    return item


def _scrape_raw_data(season, output_dir):
    """Scrape stadiums and NBA/MLB/NHL games, export them, and return them as dicts.

    Returns:
        A ``(raw_games, raw_stadiums)`` tuple of lists of dicts.
    """
    print_header("STAGE 1: SCRAPING RAW DATA")
    all_games = []

    # Stadiums are generated from the team mappings rather than scraped.
    print_section("Stadiums")
    all_stadiums = generate_stadiums_from_teams()
    print(f" Generated {len(all_stadiums)} stadiums from team data")

    # NBA and NHL use a split-year season label built the same way.
    split_season = f"{season-1}-{str(season)[2:]}"

    print_section(f"NBA {season}")
    nba_games = scrape_nba_basketball_reference(season)
    nba_games = assign_stable_ids(nba_games, 'NBA', split_season)
    all_games.extend(nba_games)
    print(f" Scraped {len(nba_games)} NBA games")

    print_section(f"MLB {season}")
    mlb_games = scrape_mlb_statsapi(season)
    mlb_games = assign_stable_ids(mlb_games, 'MLB', str(season))
    all_games.extend(mlb_games)
    print(f" Scraped {len(mlb_games)} MLB games")

    print_section(f"NHL {season}")
    nhl_games = scrape_nhl_hockey_reference(season)
    nhl_games = assign_stable_ids(nhl_games, 'NHL', split_season)
    all_games.extend(nhl_games)
    print(f" Scraped {len(nhl_games)} NHL games")

    print_section("Exporting Raw Data")
    export_to_json(all_games, all_stadiums, output_dir)
    print(f" Exported to {output_dir}")

    raw_games = [g.__dict__ for g in all_games]
    raw_stadiums = [s.__dict__ for s in all_stadiums]
    return raw_games, raw_stadiums


def _load_raw_data(output_dir):
    """Load previously exported ``games.json`` and ``stadiums.json`` from *output_dir*.

    Returns:
        A ``(raw_games, raw_stadiums)`` tuple with the parsed JSON documents.
    """
    print_header("LOADING EXISTING RAW DATA")

    raw_games = _read_json(output_dir / 'games.json')
    print(f" Loaded {len(raw_games)} raw games")

    raw_stadiums = _read_json(output_dir / 'stadiums.json')
    print(f" Loaded {len(raw_stadiums)} raw stadiums")
    return raw_games, raw_stadiums


def run_pipeline(
    season: int = 2026,
    output_dir: Path = Path('./data'),
    skip_scrape: bool = False,
    validate: bool = True,
    verbose: bool = False,
) -> PipelineResult:
    """
    Run the complete canonicalization pipeline.

    Stages: scrape (or load) raw data, canonicalize stadiums, teams and games,
    then optionally validate. Each stage writes its JSON output into
    *output_dir* and prints progress to stdout.

    Args:
        season: Season year (e.g., 2026)
        output_dir: Directory for output files (created if missing)
        skip_scrape: Skip scraping, use existing raw data
        validate: Run validation step
        verbose: Print detailed output

    Returns:
        PipelineResult with statistics. ``success`` is False only when
        validation ran and reported the data as invalid.
    """
    from collections import Counter

    start_time = datetime.now()
    output_dir.mkdir(parents=True, exist_ok=True)

    # =========================================================================
    # STAGE 1: SCRAPE RAW DATA
    # =========================================================================
    if not skip_scrape:
        raw_games, raw_stadiums = _scrape_raw_data(season, output_dir)
    else:
        raw_games, raw_stadiums = _load_raw_data(output_dir)

    # =========================================================================
    # STAGE 2: CANONICALIZE STADIUMS
    # =========================================================================
    print_header("STAGE 2: CANONICALIZING STADIUMS")
    canonical_stadiums, stadium_aliases = canonicalize_stadiums(
        raw_stadiums, verbose=verbose
    )
    print(f" Created {len(canonical_stadiums)} canonical stadiums")

    # Add historical aliases
    canonical_ids = {s.canonical_id for s in canonical_stadiums}
    stadium_aliases = add_historical_aliases(stadium_aliases, canonical_ids)
    stadium_aliases = deduplicate_aliases(stadium_aliases)
    print(f" Created {len(stadium_aliases)} stadium aliases")

    # Dict forms are reused by the export, by later stages and by validation.
    stadiums_list = [asdict(s) for s in canonical_stadiums]
    aliases_list = [asdict(a) for a in stadium_aliases]

    stadiums_canonical_path = output_dir / 'stadiums_canonical.json'
    aliases_path = output_dir / 'stadium_aliases.json'
    _write_json(stadiums_canonical_path, stadiums_list)
    _write_json(aliases_path, aliases_list)
    print(f" Exported to {stadiums_canonical_path}")
    print(f" Exported to {aliases_path}")

    # =========================================================================
    # STAGE 3: CANONICALIZE TEAMS
    # =========================================================================
    print_header("STAGE 3: CANONICALIZING TEAMS")
    canonical_teams, team_warnings = canonicalize_all_teams(
        stadiums_list, verbose=verbose
    )
    print(f" Created {len(canonical_teams)} canonical teams")
    if team_warnings:
        print(f" Warnings: {len(team_warnings)}")
        if verbose:
            for w in team_warnings:
                print(f" - {w.team_canonical_id}: {w.issue}")

    teams_list = [asdict(t) for t in canonical_teams]
    teams_canonical_path = output_dir / 'teams_canonical.json'
    _write_json(teams_canonical_path, teams_list)
    print(f" Exported to {teams_canonical_path}")

    # =========================================================================
    # STAGE 4: CANONICALIZE GAMES
    # =========================================================================
    print_header("STAGE 4: CANONICALIZING GAMES")
    canonical_games_list, game_warnings = canonicalize_games(
        raw_games, teams_list, aliases_list, verbose=verbose
    )
    print(f" Created {len(canonical_games_list)} canonical games")
    if game_warnings:
        print(f" Warnings: {len(game_warnings)}")
        if verbose:
            # Counter keeps first-seen order, so issues print in encounter order.
            by_issue = Counter(w.issue for w in game_warnings)
            for issue, count in by_issue.items():
                print(f" - {issue}: {count}")

    games_list = [asdict(g) for g in canonical_games_list]
    games_canonical_path = output_dir / 'games_canonical.json'
    _write_json(games_canonical_path, games_list)
    print(f" Exported to {games_canonical_path}")

    # =========================================================================
    # STAGE 5: VALIDATE
    # =========================================================================
    validation_result = None
    validation_path = output_dir / 'canonicalization_validation.json'
    if validate:
        print_header("STAGE 5: VALIDATION")
        validation_result = validate_canonical_data(
            stadiums_list,
            teams_list,
            games_list,
            aliases_list,
            verbose=verbose
        )
        if validation_result.is_valid:
            print(" STATUS: PASSED")
        else:
            print(" STATUS: FAILED")
        print(f" Errors: {validation_result.error_count}")
        print(f" Warnings: {validation_result.warning_count}")

        # Export validation report.
        # NOTE(review): `errors` may hold dataclass instances rather than
        # dicts; they are converted so json.dump cannot fail on them.
        _write_json(validation_path, {
            'is_valid': validation_result.is_valid,
            'error_count': validation_result.error_count,
            'warning_count': validation_result.warning_count,
            'summary': validation_result.summary,
            # Limit to 100 for readability
            'errors': [_to_jsonable(e) for e in validation_result.errors[:100]],
        })
        print(f" Report exported to {validation_path}")

    # =========================================================================
    # SUMMARY
    # =========================================================================
    duration = (datetime.now() - start_time).total_seconds()
    print_header("PIPELINE COMPLETE")
    print()
    print(f" Duration: {duration:.1f} seconds")
    print(f" Stadiums: {len(canonical_stadiums)}")
    print(f" Teams: {len(canonical_teams)}")
    print(f" Games: {len(canonical_games_list)}")
    print(f" Aliases: {len(stadium_aliases)}")
    print()

    # Games by sport
    print(" Games by sport:")
    by_sport = Counter(g.sport for g in canonical_games_list)
    for sport, count in sorted(by_sport.items()):
        print(f" {sport}: {count:,} games")
    print()

    print(" Output files:")
    print(f" - {stadiums_canonical_path}")
    print(f" - {aliases_path}")
    print(f" - {teams_canonical_path}")
    print(f" - {games_canonical_path}")
    # The validation report only exists when the validation stage ran.
    if validation_result is not None:
        print(f" - {validation_path}")
    print()

    # Final status
    success = True
    if validation_result is not None and not validation_result.is_valid:
        success = False
        print(" PIPELINE FAILED - Validation errors detected")
        print(" CloudKit upload should NOT proceed until errors are fixed")
    else:
        print(" PIPELINE SUCCEEDED - Ready for CloudKit upload")
    print()

    ran_validation = validation_result is not None
    return PipelineResult(
        success=success,
        stadiums_count=len(canonical_stadiums),
        teams_count=len(canonical_teams),
        games_count=len(canonical_games_list),
        aliases_count=len(stadium_aliases),
        validation_errors=validation_result.error_count if ran_validation else 0,
        validation_warnings=validation_result.warning_count if ran_validation else 0,
        duration_seconds=duration,
        output_dir=str(output_dir),
    )
def main():
    """Parse command-line options, run the pipeline, and honour ``--strict``.

    With ``--strict`` the process exits with status 1 when the pipeline result
    is not successful; otherwise the function returns normally.
    """
    cli = argparse.ArgumentParser(
        description='SportsTime Canonicalization Pipeline',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Pipeline Stages:
1. SCRAPE: Fetch raw data from web sources
2. CANONICALIZE STADIUMS: Generate canonical IDs and aliases
3. CANONICALIZE TEAMS: Match teams to stadiums
4. CANONICALIZE GAMES: Resolve all references
5. VALIDATE: Verify internal consistency
Examples:
python run_canonicalization_pipeline.py # Full pipeline
python run_canonicalization_pipeline.py --season 2026 # Different season
python run_canonicalization_pipeline.py --skip-scrape # Use existing raw data
python run_canonicalization_pipeline.py --verbose # Show all details
"""
    )

    # (flags, keyword options) pairs, registered in the order they appear in --help.
    option_table = [
        (('--season',), {
            'type': int, 'default': 2026,
            'help': 'Season year (default: 2026)',
        }),
        (('--output',), {
            'type': str, 'default': './data',
            'help': 'Output directory (default: ./data)',
        }),
        (('--skip-scrape',), {
            'action': 'store_true',
            'help': 'Skip scraping, use existing raw data files',
        }),
        (('--no-validate',), {
            'action': 'store_true',
            'help': 'Skip validation step',
        }),
        (('--verbose', '-v'), {
            'action': 'store_true',
            'help': 'Verbose output',
        }),
        (('--strict',), {
            'action': 'store_true',
            'help': 'Exit with error code if validation fails',
        }),
    ]
    for flags, options in option_table:
        cli.add_argument(*flags, **options)

    opts = cli.parse_args()
    outcome = run_pipeline(
        season=opts.season,
        output_dir=Path(opts.output),
        skip_scrape=opts.skip_scrape,
        validate=not opts.no_validate,
        verbose=opts.verbose,
    )

    # Only --strict turns a failed run into a non-zero exit status.
    strict_failure = opts.strict and not outcome.success
    if strict_failure:
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,636 @@
#!/usr/bin/env python3
"""
Canonical Data Validation for SportsTime
=========================================
Stage 4 of the canonicalization pipeline (run as "STAGE 5" by run_canonicalization_pipeline.py, which counts scraping as Stage 1).
Validates all canonical data before CloudKit upload.
FAILS if any ERROR-level issues are found.
Usage:
python validate_canonical.py --data-dir data/
python validate_canonical.py --stadiums data/stadiums_canonical.json \
--teams data/teams_canonical.json --games data/games_canonical.json
"""
import argparse
import json
from collections import defaultdict
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class ValidationError:
    """One finding produced by a validation check (an error or a warning)."""

    # Either 'error' or 'warning'.
    severity: str
    # Short machine-readable grouping key, e.g. 'duplicate_id'.
    category: str
    # Human-readable description of the finding.
    message: str
    # Optional extra context for the finding; None when there is none.
    details: Optional[dict] = None
@dataclass
class ValidationResult:
    """Aggregate outcome of validating the canonical data set."""

    # Overall pass/fail flag.
    is_valid: bool
    # Totals of error-level and warning-level findings.
    error_count: int
    warning_count: int
    # The collected findings.
    errors: list
    # Summary information about the validation run.
    summary: dict
# =============================================================================
# EXPECTED GAME COUNTS
# =============================================================================
# Per-league game-count expectations, keyed by lowercase league code.
# Each entry gives the nominal count, an accepted [min, max] range, and a label.
EXPECTED_GAMES = {
    'nba': dict(
        expected=82, min=75, max=90,
        description='NBA regular season (82 games)',
    ),
    'nhl': dict(
        expected=82, min=75, max=90,
        description='NHL regular season (82 games)',
    ),
    'mlb': dict(
        expected=162, min=155, max=168,
        description='MLB regular season (162 games)',
    ),
}
# =============================================================================
# VALIDATION FUNCTIONS
# =============================================================================
def _duplicate_id_errors(records: list[dict], kind: str) -> list[ValidationError]:
    """Return one error for every record whose canonical_id was already seen.

    *kind* ('stadium', 'team' or 'game') is only used in the error message.
    A record without a 'canonical_id' key is tracked under the empty string.
    """
    already_seen = set()
    found = []
    for record in records:
        record_id = record.get('canonical_id', '')
        if record_id in already_seen:
            found.append(ValidationError(
                severity='error',
                category='duplicate_id',
                message=f'Duplicate {kind} canonical_id: {record_id}'
            ))
        already_seen.add(record_id)
    return found


def validate_no_duplicate_ids(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict]
) -> list[ValidationError]:
    """Report canonical IDs that occur more than once within an entity type.

    Stadiums, teams and games are checked independently (an ID shared between
    two entity types is not a duplicate). Every repeat occurrence after the
    first yields its own 'duplicate_id' error; results are ordered stadiums,
    then teams, then games.
    """
    return [
        *_duplicate_id_errors(stadiums, 'stadium'),
        *_duplicate_id_errors(teams, 'team'),
        *_duplicate_id_errors(games, 'game'),
    ]
def validate_team_stadium_references(
    teams: list[dict],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """Check each team's 'stadium_canonical_id' against the known stadium IDs.

    Per team, at most one issue is produced:
      * error 'missing_reference' when the reference is absent or empty,
      * warning 'unknown_stadium' when it starts with 'stadium_unknown',
      * error 'dangling_reference' when it is not in *stadium_ids*.
    """
    issues = []
    for entry in teams:
        team_id = entry.get('canonical_id', '')
        venue_ref = entry.get('stadium_canonical_id', '')

        if not venue_ref:
            severity, category = 'error', 'missing_reference'
            message = f'Team {team_id} has no stadium_canonical_id'
        elif venue_ref.startswith('stadium_unknown'):
            # Placeholder stadiums are tolerated, so this is only a warning.
            severity, category = 'warning', 'unknown_stadium'
            message = f'Team {team_id} has unknown stadium: {venue_ref}'
        elif venue_ref not in stadium_ids:
            severity, category = 'error', 'dangling_reference'
            message = f'Team {team_id} references unknown stadium: {venue_ref}'
        else:
            continue

        issues.append(ValidationError(
            severity=severity,
            category=category,
            message=message
        ))
    return issues
def validate_game_references(
    games: list[dict],
    team_ids: set[str],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """Check each game's home team, away team and stadium references.

    For every game the home team is checked first, then the away team, then
    the stadium. A missing/empty reference is a 'missing_reference' error, a
    reference not present in the given ID set is a 'dangling_reference' error,
    and a stadium reference starting with 'stadium_unknown' is only an
    'unknown_stadium' warning.
    """
    issues = []

    def report(severity: str, category: str, message: str) -> None:
        issues.append(ValidationError(
            severity=severity,
            category=category,
            message=message
        ))

    # (label used in messages, key holding the team reference)
    team_slots = (
        ('home', 'home_team_canonical_id'),
        ('away', 'away_team_canonical_id'),
    )

    for entry in games:
        game_id = entry.get('canonical_id', '')

        for side, key in team_slots:
            team_ref = entry.get(key, '')
            if not team_ref:
                report('error', 'missing_reference',
                       f'Game {game_id} has no {key}')
            elif team_ref not in team_ids:
                report('error', 'dangling_reference',
                       f'Game {game_id} references unknown {side} team: {team_ref}')

        venue_ref = entry.get('stadium_canonical_id', '')
        if not venue_ref:
            report('error', 'missing_reference',
                   f'Game {game_id} has no stadium_canonical_id')
        elif venue_ref.startswith('stadium_unknown'):
            report('warning', 'unknown_stadium',
                   f'Game {game_id} has unknown stadium: {venue_ref}')
        elif venue_ref not in stadium_ids:
            report('error', 'dangling_reference',
                   f'Game {game_id} references unknown stadium: {venue_ref}')

    return issues
def _sport_from_team_id(team_id) -> Optional[str]:
    """Return the sport segment of a team ID (format: team_{sport}_{abbrev}).

    Returns None when *team_id* is not a string or has fewer than two
    '_'-separated parts, so callers can simply skip such IDs.
    """
    if not isinstance(team_id, str):
        return None
    parts = team_id.split('_')
    if len(parts) >= 2:
        return parts[1]
    return None


def validate_no_cross_sport_references(games: list[dict]) -> list[ValidationError]:
    """Validate that games don't have cross-sport team references.

    The sport embedded in each team canonical ID is compared with the game's
    lower-cased 'sport' field; a mismatch yields a 'cross_sport' error (home
    team first, then away team).

    A 'sport' or team ID that is missing, JSON null, or not a string is
    treated as empty instead of raising AttributeError, so one malformed
    record cannot abort the whole validation run. Such records are reported
    by the required-field and reference checks in this module.

    Args:
        games: Canonical game dicts.

    Returns:
        List of ValidationError objects, one per mismatching team reference.
    """
    errors = []
    team_slots = (
        ('home', 'home_team_canonical_id'),
        ('away', 'away_team_canonical_id'),
    )
    for game in games:
        canonical_id = game.get('canonical_id', '')
        # dict.get() only applies its default when the key is absent; a key
        # present with a null value still yields None, hence the type check.
        raw_sport = game.get('sport')
        game_sport = raw_sport.lower() if isinstance(raw_sport, str) else ''

        for side, key in team_slots:
            team_sport = _sport_from_team_id(game.get(key))
            if team_sport and team_sport != game_sport:
                errors.append(ValidationError(
                    severity='error',
                    category='cross_sport',
                    message=f'Game {canonical_id} ({game_sport}) has cross-sport {side} team ({team_sport})'
                ))
    return errors
def validate_stadium_aliases(
    aliases: list[dict],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """Check that every stadium alias points at a known stadium.

    An alias with a missing/empty 'stadium_canonical_id' yields a
    'missing_reference' error; one whose target is not in *stadium_ids*
    yields a 'dangling_reference' error.
    """
    issues = []
    for entry in aliases:
        name = entry.get('alias_name', '')
        target = entry.get('stadium_canonical_id', '')

        if target and target in stadium_ids:
            continue  # alias resolves to a known stadium

        if not target:
            category = 'missing_reference'
            message = f'Stadium alias "{name}" has no stadium_canonical_id'
        else:
            category = 'dangling_reference'
            message = f'Stadium alias "{name}" references unknown stadium: {target}'

        issues.append(ValidationError(
            severity='error',
            category=category,
            message=message
        ))
    return issues
def validate_game_counts_per_team(games: list[dict]) -> list[ValidationError]:
    """Validate that each team has the expected number of games.

    Every appearance of a team as home or away counts as one game. The sport
    is taken from the second '_'-separated segment of the team ID and looked
    up in EXPECTED_GAMES; teams of sports without an entry are not checked.
    A count below 'min' or above 'max' yields a 'game_count' warning.

    Only teams that appear in at least one game are examined, since the
    function receives no team list.

    Team IDs that are missing, JSON null, empty, or not strings are ignored
    here rather than crashing on ``.split``; the reference and required-field
    checks in this module report those records.

    Args:
        games: Canonical game dicts.

    Returns:
        List of warning-level ValidationError objects.
    """
    errors = []

    # Count games per team
    team_game_counts = defaultdict(int)
    for game in games:
        for key in ('home_team_canonical_id', 'away_team_canonical_id'):
            team_id = game.get(key)
            # A null value comes back as None even with a default, so guard
            # on type; empty IDs could never match a sport anyway.
            if isinstance(team_id, str) and team_id:
                team_game_counts[team_id] += 1

    # Check against expected counts
    for team_id, count in team_game_counts.items():
        parts = team_id.split('_')
        if len(parts) < 2:
            continue
        expected = EXPECTED_GAMES.get(parts[1])
        if expected is None:
            continue

        if count < expected['min']:
            errors.append(ValidationError(
                severity='warning',
                category='game_count',
                message=f'Team {team_id} has only {count} games (expected ~{expected["expected"]})',
                details={'count': count, 'expected': expected['expected'], 'min': expected['min']}
            ))
        elif count > expected['max']:
            errors.append(ValidationError(
                severity='warning',
                category='game_count',
                message=f'Team {team_id} has {count} games (expected ~{expected["expected"]})',
                details={'count': count, 'expected': expected['expected'], 'max': expected['max']}
            ))
    return errors
def validate_required_fields(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict]
) -> list[ValidationError]:
    """Report records that lack a required field.

    A field counts as missing when its key is absent or its value is None.
    Each missing field produces one 'missing_field' error; results are
    ordered stadiums, teams, games, and within a record by the field order
    listed below.
    """
    # (label used in the message, records to scan, fields each must carry)
    specs = (
        ('Stadium', stadiums,
         ['canonical_id', 'name', 'sport', 'latitude', 'longitude']),
        ('Team', teams,
         ['canonical_id', 'name', 'abbreviation', 'sport', 'stadium_canonical_id']),
        ('Game', games,
         ['canonical_id', 'sport', 'date', 'home_team_canonical_id',
          'away_team_canonical_id', 'stadium_canonical_id']),
    )

    issues = []
    for label, records, required in specs:
        for record in records:
            for name in required:
                if name in record and record[name] is not None:
                    continue
                issues.append(ValidationError(
                    severity='error',
                    category='missing_field',
                    message=f'{label} {record.get("canonical_id", "unknown")} missing required field: {name}'
                ))
    return issues
# =============================================================================
# MAIN VALIDATION
# =============================================================================
def validate_canonical_data(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict],
    stadium_aliases: list[dict],
    verbose: bool = False
) -> ValidationResult:
    """
    Stage 4: run every validation check over the canonical data.

    The checks run in a fixed order (duplicate IDs, required fields,
    team -> stadium references, game references, cross-sport references,
    stadium aliases, game counts) and their issues are concatenated.

    Args:
        stadiums: List of canonical stadium dicts
        teams: List of canonical team dicts
        games: List of canonical game dicts
        stadium_aliases: List of stadium alias dicts
        verbose: Print a progress line per check and a count when it finds issues

    Returns:
        ValidationResult whose is_valid is True only when no issue has
        severity 'error'; warnings are counted but do not fail validation.
    """
    # ID sets used by the reference checks.
    known_stadiums = {s.get('canonical_id', '') for s in stadiums}
    known_teams = {t.get('canonical_id', '') for t in teams}

    # (progress line, noun for the "Found N ... issues" line, deferred check)
    checks = [
        (" Checking for duplicate IDs...", "duplicate ID",
         lambda: validate_no_duplicate_ids(stadiums, teams, games)),
        (" Checking required fields...", "missing field",
         lambda: validate_required_fields(stadiums, teams, games)),
        (" Checking team -> stadium references...", "team-stadium reference",
         lambda: validate_team_stadium_references(teams, known_stadiums)),
        (" Checking game -> team/stadium references...", "game reference",
         lambda: validate_game_references(games, known_teams, known_stadiums)),
        (" Checking for cross-sport references...", "cross-sport reference",
         lambda: validate_no_cross_sport_references(games)),
        (" Checking stadium alias references...", "stadium alias",
         lambda: validate_stadium_aliases(stadium_aliases, known_stadiums)),
        (" Checking game counts per team...", "game count",
         lambda: validate_game_counts_per_team(games)),
    ]

    print("Running validation checks...")
    collected = []
    for progress_line, noun, run_check in checks:
        if verbose:
            print(progress_line)
        found = run_check()
        collected.extend(found)
        if verbose and found:
            print(f" Found {len(found)} {noun} issues")

    # Tally by severity and by category.
    severities = [issue.severity for issue in collected]
    n_errors = severities.count('error')
    n_warnings = severities.count('warning')

    per_category = {}
    for issue in collected:
        per_category[issue.category] = per_category.get(issue.category, 0) + 1

    return ValidationResult(
        is_valid=(n_errors == 0),  # warnings alone never fail validation
        error_count=n_errors,
        warning_count=n_warnings,
        errors=[asdict(issue) for issue in collected],
        summary={
            'stadiums': len(stadiums),
            'teams': len(teams),
            'games': len(games),
            'aliases': len(stadium_aliases),
            'by_category': per_category
        }
    )
# =============================================================================
# MAIN
# =============================================================================
def _load_json(path: Path):
    """Parse and return the JSON document stored at *path*.

    The file is decoded as UTF-8 explicitly so that non-ASCII text in the
    data does not depend on the platform's locale encoding.
    """
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def _print_issue_list(header: str, issues: list, noun: str, limit: int = 20) -> None:
    """Print *header*, then at most *limit* issues and a count of the rest.

    Each issue is a dict with 'category' and 'message' keys; *noun*
    ('errors' / 'warnings') is used in the "... and N more" line.
    """
    print()
    print(header)
    for issue in issues[:limit]:
        print(f" [{issue['category']}] {issue['message']}")
    if len(issues) > limit:
        print(f" ... and {len(issues) - limit} more {noun}")


def main():
    """Command-line entry point: load canonical JSON, validate it, report.

    Input files are taken from --stadiums/--teams/--games/--aliases when
    given; any file not given explicitly is looked up under --data-dir
    (default ./data) using its standard canonical file name. The aliases
    file is optional and skipped when it does not exist.

    Prints a summary (warnings are listed only with --verbose), optionally
    writes the full result as JSON to --output, and raises SystemExit(1)
    when --strict is set and validation found errors.
    """
    parser = argparse.ArgumentParser(
        description='Validate canonical data'
    )
    parser.add_argument(
        '--data-dir', type=str, default=None,
        help='Directory containing all canonical JSON files'
    )
    parser.add_argument(
        '--stadiums', type=str, default=None,
        help='Input canonical stadiums JSON file'
    )
    parser.add_argument(
        '--teams', type=str, default=None,
        help='Input canonical teams JSON file'
    )
    parser.add_argument(
        '--games', type=str, default=None,
        help='Input canonical games JSON file'
    )
    parser.add_argument(
        '--aliases', type=str, default=None,
        help='Input stadium aliases JSON file'
    )
    parser.add_argument(
        '--output', type=str, default=None,
        help='Output file for validation report'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )
    parser.add_argument(
        '--strict', action='store_true',
        help='Exit with error code if validation fails'
    )
    args = parser.parse_args()

    # Determine file paths. An explicitly named file always wins; previously
    # such arguments were silently ignored whenever --data-dir was also given.
    base_dir = Path(args.data_dir) if args.data_dir else Path('./data')

    def resolve(explicit, default_name):
        return Path(explicit) if explicit else base_dir / default_name

    stadiums_path = resolve(args.stadiums, 'stadiums_canonical.json')
    teams_path = resolve(args.teams, 'teams_canonical.json')
    games_path = resolve(args.games, 'games_canonical.json')
    aliases_path = resolve(args.aliases, 'stadium_aliases.json')

    # Load input files
    print("Loading canonical data...")
    stadiums = _load_json(stadiums_path)
    print(f" Loaded {len(stadiums)} stadiums from {stadiums_path}")
    teams = _load_json(teams_path)
    print(f" Loaded {len(teams)} teams from {teams_path}")
    games = _load_json(games_path)
    print(f" Loaded {len(games)} games from {games_path}")

    # Aliases are optional: validate with an empty list when the file is absent.
    stadium_aliases = []
    if aliases_path.exists():
        stadium_aliases = _load_json(aliases_path)
        print(f" Loaded {len(stadium_aliases)} aliases from {aliases_path}")

    # Validate
    print()
    result = validate_canonical_data(
        stadiums, teams, games, stadium_aliases,
        verbose=args.verbose
    )

    # Print results
    print()
    print("=" * 60)
    print("VALIDATION RESULTS")
    print("=" * 60)
    print()
    if result.is_valid:
        print(" STATUS: PASSED")
    else:
        print(" STATUS: FAILED")
    print()
    print(f" Errors: {result.error_count}")
    print(f" Warnings: {result.warning_count}")
    print()
    print(" Data Summary:")
    print(f" Stadiums: {result.summary['stadiums']}")
    print(f" Teams: {result.summary['teams']}")
    print(f" Games: {result.summary['games']}")
    print(f" Aliases: {result.summary['aliases']}")
    if result.summary['by_category']:
        print()
        print(" Issues by Category:")
        for category, count in sorted(result.summary['by_category'].items()):
            print(f" {category}: {count}")

    # Print errors always, warnings only in verbose mode (up to 20 each)
    errors_only = [e for e in result.errors if e['severity'] == 'error']
    warnings_only = [e for e in result.errors if e['severity'] == 'warning']
    if errors_only:
        _print_issue_list(" ERRORS (must fix):", errors_only, 'errors')
    if warnings_only and args.verbose:
        _print_issue_list(" WARNINGS (informational):", warnings_only, 'warnings')

    # Export report
    if args.output:
        output_path = Path(args.output)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(asdict(result), f, indent=2)
        print()
        print(f"Report exported to {output_path}")

    # Exit code
    if args.strict and not result.is_valid:
        print()
        print("VALIDATION FAILED - Exiting with error code 1")
        # The bare exit() builtin is injected by the site module for
        # interactive use and may be absent (e.g. python -S); raising
        # SystemExit is the always-available equivalent of sys.exit(1).
        raise SystemExit(1)
# Run the validator CLI only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()