#!/usr/bin/env python3
"""
Team Canonicalization for SportsTime
====================================
Stage 2 of the canonicalization pipeline.

Generates canonical team IDs and fuzzy matches teams to stadiums.

Usage:
    python canonicalize_teams.py --stadiums data/stadiums_canonical.json --output data/
"""

import argparse
import json
import re
from collections import Counter
from dataclasses import dataclass, asdict, field
from difflib import SequenceMatcher
from pathlib import Path
from typing import Optional

# Import team mappings from scraper
from scrape_schedules import NBA_TEAMS, MLB_TEAMS, NHL_TEAMS, NFL_TEAMS
from mls import MLS_TEAMS
from wnba import WNBA_TEAMS
from nwsl import NWSL_TEAMS


# =============================================================================
# DATA CLASSES
# =============================================================================

@dataclass
class CanonicalTeam:
    """A canonicalized team with stable ID."""
    canonical_id: str          # built by generate_canonical_team_id()
    name: str
    abbreviation: str          # key of the team in its sport's mapping
    sport: str
    city: str
    stadium_canonical_id: str  # matched stadium ID, or a "stadium_unknown_*" placeholder
    conference_id: Optional[str] = None
    division_id: Optional[str] = None
    primary_color: Optional[str] = None
    secondary_color: Optional[str] = None


@dataclass
class MatchWarning:
    """Warning about a low-confidence match."""
    team_canonical_id: str
    team_name: str
    arena_name: str                  # arena/stadium name taken from the team mapping
    matched_stadium: Optional[str]   # None when no stadium cleared the match threshold
    issue: str
    confidence: float                # best combined score found for this team


# =============================================================================
# LEAGUE STRUCTURE
# Maps team abbreviation -> (conference_id, division_id)
# =============================================================================

NBA_DIVISIONS = {
    # Eastern Conference - Atlantic
    'BOS': ('nba_eastern', 'nba_atlantic'),
    'BRK': ('nba_eastern', 'nba_atlantic'),
    'NYK': ('nba_eastern', 'nba_atlantic'),
    'PHI': ('nba_eastern', 'nba_atlantic'),
    'TOR': ('nba_eastern', 'nba_atlantic'),
    # Eastern Conference - Central
    'CHI': ('nba_eastern', 'nba_central'),
    'CLE': ('nba_eastern', 'nba_central'),
    'DET': ('nba_eastern', 'nba_central'),
    'IND': ('nba_eastern', 'nba_central'),
    'MIL': ('nba_eastern', 'nba_central'),
    # Eastern Conference - Southeast
    'ATL': ('nba_eastern', 'nba_southeast'),
    'CHO': ('nba_eastern', 'nba_southeast'),
    'MIA': ('nba_eastern', 'nba_southeast'),
    'ORL': ('nba_eastern', 'nba_southeast'),
    'WAS': ('nba_eastern', 'nba_southeast'),
    # Western Conference - Northwest
    'DEN': ('nba_western', 'nba_northwest'),
    'MIN': ('nba_western', 'nba_northwest'),
    'OKC': ('nba_western', 'nba_northwest'),
    'POR': ('nba_western', 'nba_northwest'),
    'UTA': ('nba_western', 'nba_northwest'),
    # Western Conference - Pacific
    'GSW': ('nba_western', 'nba_pacific'),
    'LAC': ('nba_western', 'nba_pacific'),
    'LAL': ('nba_western', 'nba_pacific'),
    'PHO': ('nba_western', 'nba_pacific'),
    'SAC': ('nba_western', 'nba_pacific'),
    # Western Conference - Southwest
    'DAL': ('nba_western', 'nba_southwest'),
    'HOU': ('nba_western', 'nba_southwest'),
    'MEM': ('nba_western', 'nba_southwest'),
    'NOP': ('nba_western', 'nba_southwest'),
    'SAS': ('nba_western', 'nba_southwest'),
}

MLB_DIVISIONS = {
    # American League - East
    'NYY': ('mlb_al', 'mlb_al_east'),
    'BOS': ('mlb_al', 'mlb_al_east'),
    'TOR': ('mlb_al', 'mlb_al_east'),
    'BAL': ('mlb_al', 'mlb_al_east'),
    'TBR': ('mlb_al', 'mlb_al_east'),
    # American League - Central
    'CLE': ('mlb_al', 'mlb_al_central'),
    'DET': ('mlb_al', 'mlb_al_central'),
    'MIN': ('mlb_al', 'mlb_al_central'),
    'CHW': ('mlb_al', 'mlb_al_central'),
    'KCR': ('mlb_al', 'mlb_al_central'),
    # American League - West
    'HOU': ('mlb_al', 'mlb_al_west'),
    'SEA': ('mlb_al', 'mlb_al_west'),
    'TEX': ('mlb_al', 'mlb_al_west'),
    'LAA': ('mlb_al', 'mlb_al_west'),
    'OAK': ('mlb_al', 'mlb_al_west'),
    # National League - East
    'ATL': ('mlb_nl', 'mlb_nl_east'),
    'PHI': ('mlb_nl', 'mlb_nl_east'),
    'NYM': ('mlb_nl', 'mlb_nl_east'),
    'MIA': ('mlb_nl', 'mlb_nl_east'),
    'WSN': ('mlb_nl', 'mlb_nl_east'),
    # National League - Central
    'MIL': ('mlb_nl', 'mlb_nl_central'),
    'CHC': ('mlb_nl', 'mlb_nl_central'),
    'STL': ('mlb_nl', 'mlb_nl_central'),
    'PIT': ('mlb_nl', 'mlb_nl_central'),
    'CIN': ('mlb_nl', 'mlb_nl_central'),
    # National League - West
    'LAD': ('mlb_nl', 'mlb_nl_west'),
    'ARI': ('mlb_nl', 'mlb_nl_west'),
    'SDP': ('mlb_nl', 'mlb_nl_west'),
    'SFG': ('mlb_nl', 'mlb_nl_west'),
    'COL': ('mlb_nl', 'mlb_nl_west'),
}

NHL_DIVISIONS = {
    # Eastern Conference - Atlantic
    'BOS': ('nhl_eastern', 'nhl_atlantic'),
    'BUF': ('nhl_eastern', 'nhl_atlantic'),
    'DET': ('nhl_eastern', 'nhl_atlantic'),
    'FLA': ('nhl_eastern', 'nhl_atlantic'),
    'MTL': ('nhl_eastern', 'nhl_atlantic'),
    'OTT': ('nhl_eastern', 'nhl_atlantic'),
    'TBL': ('nhl_eastern', 'nhl_atlantic'),
    'TOR': ('nhl_eastern', 'nhl_atlantic'),
    # Eastern Conference - Metropolitan
    'CAR': ('nhl_eastern', 'nhl_metropolitan'),
    'CBJ': ('nhl_eastern', 'nhl_metropolitan'),
    'NJD': ('nhl_eastern', 'nhl_metropolitan'),
    'NYI': ('nhl_eastern', 'nhl_metropolitan'),
    'NYR': ('nhl_eastern', 'nhl_metropolitan'),
    'PHI': ('nhl_eastern', 'nhl_metropolitan'),
    'PIT': ('nhl_eastern', 'nhl_metropolitan'),
    'WSH': ('nhl_eastern', 'nhl_metropolitan'),
    # Western Conference - Central
    'ARI': ('nhl_western', 'nhl_central'),  # Utah Hockey Club
    'CHI': ('nhl_western', 'nhl_central'),
    'COL': ('nhl_western', 'nhl_central'),
    'DAL': ('nhl_western', 'nhl_central'),
    'MIN': ('nhl_western', 'nhl_central'),
    'NSH': ('nhl_western', 'nhl_central'),
    'STL': ('nhl_western', 'nhl_central'),
    'WPG': ('nhl_western', 'nhl_central'),
    # Western Conference - Pacific
    'ANA': ('nhl_western', 'nhl_pacific'),
    'CGY': ('nhl_western', 'nhl_pacific'),
    'EDM': ('nhl_western', 'nhl_pacific'),
    'LAK': ('nhl_western', 'nhl_pacific'),
    'SEA': ('nhl_western', 'nhl_pacific'),
    'SJS': ('nhl_western', 'nhl_pacific'),
    'VAN': ('nhl_western', 'nhl_pacific'),
    'VGK': ('nhl_western', 'nhl_pacific'),
}

NFL_DIVISIONS = {
    # AFC East
    'BUF': ('nfl_afc', 'nfl_afc_east'),
    'MIA': ('nfl_afc', 'nfl_afc_east'),
    'NE': ('nfl_afc', 'nfl_afc_east'),
    'NYJ': ('nfl_afc', 'nfl_afc_east'),
    # AFC North
    'BAL': ('nfl_afc', 'nfl_afc_north'),
    'CIN': ('nfl_afc', 'nfl_afc_north'),
    'CLE': ('nfl_afc', 'nfl_afc_north'),
    'PIT': ('nfl_afc', 'nfl_afc_north'),
    # AFC South
    'HOU': ('nfl_afc', 'nfl_afc_south'),
    'IND': ('nfl_afc', 'nfl_afc_south'),
    'JAX': ('nfl_afc', 'nfl_afc_south'),
    'TEN': ('nfl_afc', 'nfl_afc_south'),
    # AFC West
    'DEN': ('nfl_afc', 'nfl_afc_west'),
    'KC': ('nfl_afc', 'nfl_afc_west'),
    'LV': ('nfl_afc', 'nfl_afc_west'),
    'LAC': ('nfl_afc', 'nfl_afc_west'),
    # NFC East
    'DAL': ('nfl_nfc', 'nfl_nfc_east'),
    'NYG': ('nfl_nfc', 'nfl_nfc_east'),
    'PHI': ('nfl_nfc', 'nfl_nfc_east'),
    'WAS': ('nfl_nfc', 'nfl_nfc_east'),
    # NFC North
    'CHI': ('nfl_nfc', 'nfl_nfc_north'),
    'DET': ('nfl_nfc', 'nfl_nfc_north'),
    'GB': ('nfl_nfc', 'nfl_nfc_north'),
    'MIN': ('nfl_nfc', 'nfl_nfc_north'),
    # NFC South
    'ATL': ('nfl_nfc', 'nfl_nfc_south'),
    'CAR': ('nfl_nfc', 'nfl_nfc_south'),
    'NO': ('nfl_nfc', 'nfl_nfc_south'),
    'TB': ('nfl_nfc', 'nfl_nfc_south'),
    # NFC West
    'ARI': ('nfl_nfc', 'nfl_nfc_west'),
    'LAR': ('nfl_nfc', 'nfl_nfc_west'),
    'SF': ('nfl_nfc', 'nfl_nfc_west'),
    'SEA': ('nfl_nfc', 'nfl_nfc_west'),
}

MLS_DIVISIONS = {
    # Eastern Conference (MLS uses conferences, not divisions)
    'ATL': ('mls_eastern', None),
    'CHI': ('mls_eastern', None),
    'CIN': ('mls_eastern', None),
    'CLB': ('mls_eastern', None),
    'CLT': ('mls_eastern', None),
    'DC': ('mls_eastern', None),
    'MIA': ('mls_eastern', None),
    'MTL': ('mls_eastern', None),
    'NE': ('mls_eastern', None),
    'NYCFC': ('mls_eastern', None),
    'NYRB': ('mls_eastern', None),
    'ORL': ('mls_eastern', None),
    'PHI': ('mls_eastern', None),
    'TOR': ('mls_eastern', None),
    # Western Conference
    'AUS': ('mls_western', None),
    'COL': ('mls_western', None),
    'DAL': ('mls_western', None),
    'HOU': ('mls_western', None),
    'LAFC': ('mls_western', None),
    'LAG': ('mls_western', None),
    'MIN': ('mls_western', None),
    'NSH': ('mls_western', None),
    'POR': ('mls_western', None),
    'RSL': ('mls_western', None),
    'SD': ('mls_western', None),
    'SEA': ('mls_western', None),
    'SJ': ('mls_western', None),
    'SKC': ('mls_western', None),
    'STL': ('mls_western', None),
    'VAN': ('mls_western', None),
}

WNBA_DIVISIONS = {
    # WNBA has no divisions (single league structure)
    'ATL': ('wnba', None),
    'CHI': ('wnba', None),
    'CON': ('wnba', None),
    'DAL': ('wnba', None),
    'GSV': ('wnba', None),
    'IND': ('wnba', None),
    'LVA': ('wnba', None),
    'LA': ('wnba', None),
    'MIN': ('wnba', None),
    'NY': ('wnba', None),
    'PHO': ('wnba', None),
    'SEA': ('wnba', None),
    'WAS': ('wnba', None),
}

NWSL_DIVISIONS = {
    # NWSL has no divisions (single league structure)
    'LA': ('nwsl', None),   # Angel City FC
    'SJ': ('nwsl', None),   # Bay FC
    'CHI': ('nwsl', None),  # Chicago Red Stars
    'HOU': ('nwsl', None),  # Houston Dash
    'KC': ('nwsl', None),   # Kansas City Current
    'NJ': ('nwsl', None),   # NJ/NY Gotham FC
    'NC': ('nwsl', None),   # North Carolina Courage
    'ORL': ('nwsl', None),  # Orlando Pride
    'POR': ('nwsl', None),  # Portland Thorns FC
    'SEA': ('nwsl', None),  # Seattle Reign FC
    'SD': ('nwsl', None),   # San Diego Wave FC
    'UTA': ('nwsl', None),  # Utah Royals FC
    'WAS': ('nwsl', None),  # Washington Spirit
}

# Sport code -> division table. Built once rather than on every
# canonicalize_teams() call.
_DIVISIONS_BY_SPORT = {
    'NBA': NBA_DIVISIONS,
    'MLB': MLB_DIVISIONS,
    'NHL': NHL_DIVISIONS,
    'NFL': NFL_DIVISIONS,
    'MLS': MLS_DIVISIONS,
    'WNBA': WNBA_DIVISIONS,
    'NWSL': NWSL_DIVISIONS,
}

# Sports whose team mappings are read through the 'arena' key; every other
# sport is read through 'stadium'.
_ARENA_KEY_SPORTS = frozenset({'NBA', 'NHL', 'WNBA'})


# =============================================================================
# FUZZY MATCHING
# =============================================================================

# Generic venue words dropped before comparing names. The word boundaries stop
# the pattern from eating the middle of longer words ("field" in "Fieldhouse").
_VENUE_WORD_RE = re.compile(r'\b(arena|center|stadium|field|park|centre)\b')
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')

# Weights of the two components of the combined match score.
_NAME_WEIGHT = 0.7
_CITY_WEIGHT = 0.3

# Matches scoring below this are still accepted but reported as warnings.
_LOW_CONFIDENCE_THRESHOLD = 0.8

# City-match scores, from strongest to weakest evidence.
_CITY_EXACT_SCORE = 1.0
_CITY_NEARBY_SCORE = 0.7
_CITY_PARTIAL_SCORE = 0.5

# Main city -> nearby cities that count as a near match in either direction
# (e.g., "San Francisco" team but "Oakland" arena).
_NEARBY_CITIES = {
    'san francisco': frozenset({'oakland', 'san jose'}),
    'new york': frozenset({'brooklyn', 'queens', 'elmont', 'newark'}),
    'los angeles': frozenset({'inglewood', 'anaheim'}),
    'miami': frozenset({'sunrise', 'fort lauderdale'}),
    'dallas': frozenset({'arlington', 'fort worth'}),
    # 'capital heights' is the historical misspelling; it is kept so data that
    # already carries it keeps matching.
    'washington': frozenset({'landover', 'capitol heights', 'capital heights'}),
    'minneapolis': frozenset({'st paul', 'st. paul'}),
    'detroit': frozenset({'auburn hills', 'pontiac'}),
}


def normalize_for_matching(text: str) -> str:
    """
    Normalize text for fuzzy matching.

    Lower-cases the text, removes the generic venue words (arena, center,
    centre, stadium, field, park) when they appear as whole words, drops every
    character that is not a-z, 0-9 or whitespace, and collapses runs of
    whitespace to a single space.
    """
    text = text.lower().strip()
    # Remove common suffixes/prefixes (whole words only)
    text = _VENUE_WORD_RE.sub(' ', text)
    # Remove special characters
    text = _NON_ALNUM_RE.sub('', text)
    # Collapse spaces
    return _WHITESPACE_RE.sub(' ', text).strip()


def _city_match_score(team_city: str, stadium_city: str) -> float:
    """Score how well two lower-cased city names agree (0.0 to 1.0)."""
    # An empty string is a substring of everything, so without this guard a
    # team with no city would earn partial credit against every stadium.
    if not team_city or not stadium_city:
        return 0.0
    if team_city == stadium_city:
        return _CITY_EXACT_SCORE
    # Nearby cities outrank a plain substring match.
    if (stadium_city in _NEARBY_CITIES.get(team_city, ())
            or team_city in _NEARBY_CITIES.get(stadium_city, ())):
        return _CITY_NEARBY_SCORE
    if team_city in stadium_city or stadium_city in team_city:
        return _CITY_PARTIAL_SCORE
    return 0.0


def fuzzy_match_stadium(
    team_arena_name: str,
    team_city: str,
    sport: str,
    stadiums: list[dict],
    confidence_threshold: float = 0.6
) -> tuple[Optional[str], float]:
    """
    Fuzzy match team's arena to a canonical stadium.

    Matching strategy:
    - 70% weight: Name similarity (SequenceMatcher), taking the better of the
      normalized and the plain lower-cased comparison
    - 30% weight: City match (exact=1.0, nearby=0.7, partial=0.5); an empty
      city on either side scores 0.0

    Only stadiums whose 'sport' equals `sport` are considered. On equal
    scores the earliest stadium in the list wins.

    Args:
        team_arena_name: The arena name from team mapping
        team_city: The team's city
        sport: Sport code (e.g. NBA, MLB, NHL)
        stadiums: List of canonical stadium dicts; each is read through the
            keys 'sport', 'name', 'city' and 'canonical_id'
        confidence_threshold: Minimum confidence for a match

    Returns:
        (canonical_stadium_id, confidence_score); the ID is None when the
        best score is below `confidence_threshold`, and the score is still
        returned so callers can report it.
    """
    best_match = None
    best_score = 0.0

    # Team-side values do not change per stadium, so compute them once.
    arena_normalized = normalize_for_matching(team_arena_name)
    arena_lower = team_arena_name.lower()
    city_lower = team_city.lower()

    for stadium in stadiums:
        # Filter to same sport
        if stadium['sport'] != sport:
            continue

        stadium_name = stadium['name']

        # Score 1: Name similarity, normalized and unnormalized; keep the better
        normalized_score = SequenceMatcher(
            None, arena_normalized, normalize_for_matching(stadium_name)
        ).ratio()
        full_name_score = SequenceMatcher(
            None, arena_lower, stadium_name.lower()
        ).ratio()
        name_score = max(normalized_score, full_name_score)

        # Score 2: City match
        city_score = _city_match_score(city_lower, stadium['city'].lower())

        # Combined score (weighted)
        combined = (name_score * _NAME_WEIGHT) + (city_score * _CITY_WEIGHT)

        # Strict comparison keeps the first stadium on ties.
        if combined > best_score:
            best_score = combined
            best_match = stadium['canonical_id']

    if best_score >= confidence_threshold:
        return best_match, best_score
    return None, best_score


# =============================================================================
# CANONICALIZATION
# =============================================================================

def generate_canonical_team_id(sport: str, abbrev: str) -> str:
    """
    Generate deterministic canonical ID for team.

    Format: team_{sport}_{abbrev}
    Example: team_nba_atl
    """
    return f"team_{sport.lower()}_{abbrev.lower()}"


def canonicalize_teams(
    team_mappings: dict[str, dict],
    sport: str,
    canonical_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalTeam], list[MatchWarning]]:
    """
    Stage 2: Canonicalize teams.

    1. Generate canonical IDs from abbreviations
    2. Fuzzy match to stadiums
    3. Log low-confidence matches for review

    A team with no stadium match still produces a CanonicalTeam; its stadium
    ID is the placeholder "stadium_unknown_{sport}_{abbrev}" and a warning is
    recorded. Matches with confidence below 0.8 are kept and also recorded as
    warnings.

    Args:
        team_mappings: Team data dict (e.g., NBA_TEAMS), keyed by team
            abbreviation; each value is read through 'name', 'city' and
            'arena' (NBA/NHL/WNBA) or 'stadium' (all other sports), with
            missing keys treated as ''
        sport: Sport code
        canonical_stadiums: List of canonical stadium dicts
        verbose: Print detailed progress

    Returns:
        (canonical_teams, warnings)
    """
    teams: list[CanonicalTeam] = []
    warnings: list[MatchWarning] = []

    # Determine arena key based on sport
    arena_key = 'arena' if sport in _ARENA_KEY_SPORTS else 'stadium'

    # Get division structure; unknown sports get no conference/division
    division_map = _DIVISIONS_BY_SPORT.get(sport, {})

    for abbrev, info in team_mappings.items():
        canonical_id = generate_canonical_team_id(sport, abbrev)
        arena_name = info.get(arena_key, '')
        city = info.get('city', '')
        team_name = info.get('name', '')

        # Fuzzy match stadium
        stadium_canonical_id, confidence = fuzzy_match_stadium(
            arena_name, city, sport, canonical_stadiums
        )

        if stadium_canonical_id is None:
            warnings.append(MatchWarning(
                team_canonical_id=canonical_id,
                team_name=team_name,
                arena_name=arena_name,
                matched_stadium=None,
                issue='No stadium match found',
                confidence=confidence
            ))
            # Create placeholder ID
            stadium_canonical_id = f"stadium_unknown_{sport.lower()}_{abbrev.lower()}"
            if verbose:
                print(f" WARNING: {canonical_id} - no stadium match for '{arena_name}'")
        elif confidence < _LOW_CONFIDENCE_THRESHOLD:
            warnings.append(MatchWarning(
                team_canonical_id=canonical_id,
                team_name=team_name,
                arena_name=arena_name,
                matched_stadium=stadium_canonical_id,
                issue='Low confidence stadium match',
                confidence=confidence
            ))
            if verbose:
                print(f" WARNING: {canonical_id} - low confidence ({confidence:.2f}) match to {stadium_canonical_id}")

        # Get conference/division
        conf_id, div_id = division_map.get(abbrev, (None, None))

        teams.append(CanonicalTeam(
            canonical_id=canonical_id,
            name=team_name,
            abbreviation=abbrev,
            sport=sport,
            city=city,
            stadium_canonical_id=stadium_canonical_id,
            conference_id=conf_id,
            division_id=div_id
        ))

        if verbose and confidence >= _LOW_CONFIDENCE_THRESHOLD:
            print(f" {canonical_id}: {team_name} -> {stadium_canonical_id} ({confidence:.2f})")

    return teams, warnings


def canonicalize_all_teams(
    canonical_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalTeam], list[MatchWarning]]:
    """
    Canonicalize teams for all sports.

    Runs canonicalize_teams() for NBA, MLB, NHL, NFL, MLS, WNBA and NWSL, in
    that order, and concatenates the results.

    Args:
        canonical_stadiums: List of canonical stadium dicts
        verbose: Print detailed progress

    Returns:
        (canonical_teams, warnings) across all sports
    """
    all_teams: list[CanonicalTeam] = []
    all_warnings: list[MatchWarning] = []

    sport_mappings = [
        ('NBA', NBA_TEAMS),
        ('MLB', MLB_TEAMS),
        ('NHL', NHL_TEAMS),
        ('NFL', NFL_TEAMS),
        ('MLS', MLS_TEAMS),
        ('WNBA', WNBA_TEAMS),
        ('NWSL', NWSL_TEAMS),
    ]

    for sport, team_map in sport_mappings:
        if verbose:
            print(f"\n{sport}:")

        teams, warnings = canonicalize_teams(
            team_map, sport, canonical_stadiums, verbose
        )
        all_teams.extend(teams)
        all_warnings.extend(warnings)

    return all_teams, all_warnings


# =============================================================================
# MAIN
# =============================================================================

def main():
    """
    Command-line entry point.

    Loads the canonical stadiums JSON, canonicalizes every sport's teams,
    writes teams_canonical.json (and team_matching_warnings.json when there
    are warnings) into the output directory, and prints a per-sport summary.
    """
    parser = argparse.ArgumentParser(
        description='Canonicalize team data'
    )
    parser.add_argument(
        '--stadiums',
        type=str,
        default='./data/stadiums_canonical.json',
        help='Input canonical stadiums JSON file'
    )
    parser.add_argument(
        '--output',
        type=str,
        default='./data',
        help='Output directory for canonical files'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Verbose output'
    )

    args = parser.parse_args()

    stadiums_path = Path(args.stadiums)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load canonical stadiums. The encoding is explicit so the result does
    # not depend on the platform's locale default.
    print(f"Loading canonical stadiums from {stadiums_path}...")
    with open(stadiums_path, encoding='utf-8') as f:
        canonical_stadiums = json.load(f)
    print(f" Loaded {len(canonical_stadiums)} canonical stadiums")

    # Canonicalize teams
    print("\nCanonicalizing teams...")
    canonical_teams, warnings = canonicalize_all_teams(
        canonical_stadiums, verbose=args.verbose
    )
    print(f" Created {len(canonical_teams)} canonical teams")

    if warnings:
        print(f"\n Warnings: {len(warnings)}")
        for w in warnings:
            print(f" - {w.team_canonical_id}: {w.issue} (confidence: {w.confidence:.2f})")

    # Export
    teams_path = output_dir / 'teams_canonical.json'
    warnings_path = output_dir / 'team_matching_warnings.json'

    with open(teams_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(t) for t in canonical_teams], f, indent=2)
    print(f"\nExported teams to {teams_path}")

    # The warnings file is only written when there is something to report.
    if warnings:
        with open(warnings_path, 'w', encoding='utf-8') as f:
            json.dump([asdict(w) for w in warnings], f, indent=2)
        print(f"Exported warnings to {warnings_path}")

    # Summary by sport
    print("\nSummary by sport:")
    by_sport = Counter(t.sport for t in canonical_teams)
    for sport, count in sorted(by_sport.items()):
        print(f" {sport}: {count} teams")


if __name__ == '__main__':
    main()