Files
Sportstime/Scripts/canonicalize_teams.py
Trey t d4d0d95c54 feat(03-01): add NFL to team canonicalization
Add NFL support to canonicalize_teams.py:
- Import NFL_TEAMS from scrape_schedules
- Add NFL_DIVISIONS dict with all 32 teams mapped to conference/division
- Include NFL in sport_mappings for canonicalization
- Add NFL_DIVISIONS to division_map lookup

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 09:34:49 -06:00

533 lines
17 KiB
Python

#!/usr/bin/env python3
"""
Team Canonicalization for SportsTime
====================================
Stage 2 of the canonicalization pipeline.
Generates canonical team IDs and fuzzy matches teams to stadiums.
Usage:
python canonicalize_teams.py --stadiums data/stadiums_canonical.json --output data/
"""
import argparse
import json
from dataclasses import dataclass, asdict, field
from difflib import SequenceMatcher
from pathlib import Path
from typing import Optional
# Import team mappings from scraper
from scrape_schedules import NBA_TEAMS, MLB_TEAMS, NHL_TEAMS, NFL_TEAMS
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class CanonicalTeam:
    """One team record keyed by a stable canonical identifier.

    Attributes:
        canonical_id: Stable team identifier.
        name: Team name.
        abbreviation: Team abbreviation.
        sport: Sport code the team belongs to.
        city: Team city.
        stadium_canonical_id: Canonical ID of the stadium linked to the team.
        conference_id: Conference identifier, or None when not set.
        division_id: Division identifier, or None when not set.
        primary_color: Optional primary color (format not established here).
        secondary_color: Optional secondary color (format not established here).
    """

    # Required identity fields
    canonical_id: str
    name: str
    abbreviation: str
    sport: str
    city: str
    # Required link to a stadium record
    stadium_canonical_id: str
    # Optional league placement
    conference_id: Optional[str] = None
    division_id: Optional[str] = None
    # Optional colors
    primary_color: Optional[str] = None
    secondary_color: Optional[str] = None
@dataclass
class MatchWarning:
    """Record of a team whose stadium match needs manual review.

    Attributes:
        team_canonical_id: Canonical ID of the affected team.
        team_name: Team name.
        arena_name: Arena/stadium name that was being matched.
        matched_stadium: Canonical ID of the stadium that was matched, or
            None when no stadium was matched.
        issue: Short description of the problem.
        confidence: Match score associated with the warning.
    """

    # Which team the warning is about
    team_canonical_id: str
    team_name: str
    # What was being matched, and what (if anything) it matched
    arena_name: str
    matched_stadium: Optional[str]
    # Why the match was flagged
    issue: str
    confidence: float
# =============================================================================
# LEAGUE STRUCTURE
# Maps team abbreviation -> (conference_id, division_id)
# =============================================================================
# Each row is (conference_id, division_id, member abbreviations); the
# comprehension flattens the rows into abbreviation -> (conference_id,
# division_id), keeping the rows' order as the dict's insertion order.
NBA_DIVISIONS = {
    abbrev: (conference_id, division_id)
    for conference_id, division_id, members in (
        # Eastern Conference: Atlantic, Central, Southeast
        ('nba_eastern', 'nba_atlantic', ('BOS', 'BRK', 'NYK', 'PHI', 'TOR')),
        ('nba_eastern', 'nba_central', ('CHI', 'CLE', 'DET', 'IND', 'MIL')),
        ('nba_eastern', 'nba_southeast', ('ATL', 'CHO', 'MIA', 'ORL', 'WAS')),
        # Western Conference: Northwest, Pacific, Southwest
        ('nba_western', 'nba_northwest', ('DEN', 'MIN', 'OKC', 'POR', 'UTA')),
        ('nba_western', 'nba_pacific', ('GSW', 'LAC', 'LAL', 'PHO', 'SAC')),
        ('nba_western', 'nba_southwest', ('DAL', 'HOU', 'MEM', 'NOP', 'SAS')),
    )
    for abbrev in members
}
# Each row is (conference_id, division_id, member abbreviations); the
# comprehension flattens the rows into abbreviation -> (conference_id,
# division_id), keeping the rows' order as the dict's insertion order.
MLB_DIVISIONS = {
    abbrev: (league_id, division_id)
    for league_id, division_id, members in (
        # American League: East, Central, West
        ('mlb_al', 'mlb_al_east', ('NYY', 'BOS', 'TOR', 'BAL', 'TBR')),
        ('mlb_al', 'mlb_al_central', ('CLE', 'DET', 'MIN', 'CHW', 'KCR')),
        ('mlb_al', 'mlb_al_west', ('HOU', 'SEA', 'TEX', 'LAA', 'OAK')),
        # National League: East, Central, West
        ('mlb_nl', 'mlb_nl_east', ('ATL', 'PHI', 'NYM', 'MIA', 'WSN')),
        ('mlb_nl', 'mlb_nl_central', ('MIL', 'CHC', 'STL', 'PIT', 'CIN')),
        ('mlb_nl', 'mlb_nl_west', ('LAD', 'ARI', 'SDP', 'SFG', 'COL')),
    )
    for abbrev in members
}
# Each row is (conference_id, division_id, member abbreviations); the
# comprehension flattens the rows into abbreviation -> (conference_id,
# division_id), keeping the rows' order as the dict's insertion order.
NHL_DIVISIONS = {
    abbrev: (conference_id, division_id)
    for conference_id, division_id, members in (
        # Eastern Conference: Atlantic, Metropolitan
        ('nhl_eastern', 'nhl_atlantic',
         ('BOS', 'BUF', 'DET', 'FLA', 'MTL', 'OTT', 'TBL', 'TOR')),
        ('nhl_eastern', 'nhl_metropolitan',
         ('CAR', 'CBJ', 'NJD', 'NYI', 'NYR', 'PHI', 'PIT', 'WSH')),
        # Western Conference: Central ('ARI' is the Utah Hockey Club entry), Pacific
        ('nhl_western', 'nhl_central',
         ('ARI', 'CHI', 'COL', 'DAL', 'MIN', 'NSH', 'STL', 'WPG')),
        ('nhl_western', 'nhl_pacific',
         ('ANA', 'CGY', 'EDM', 'LAK', 'SEA', 'SJS', 'VAN', 'VGK')),
    )
    for abbrev in members
}
# Each row is (conference_id, division_id, member abbreviations); the
# comprehension flattens the rows into abbreviation -> (conference_id,
# division_id), keeping the rows' order as the dict's insertion order.
NFL_DIVISIONS = {
    abbrev: (conference_id, division_id)
    for conference_id, division_id, members in (
        # AFC: East, North, South, West
        ('nfl_afc', 'nfl_afc_east', ('BUF', 'MIA', 'NE', 'NYJ')),
        ('nfl_afc', 'nfl_afc_north', ('BAL', 'CIN', 'CLE', 'PIT')),
        ('nfl_afc', 'nfl_afc_south', ('HOU', 'IND', 'JAX', 'TEN')),
        ('nfl_afc', 'nfl_afc_west', ('DEN', 'KC', 'LV', 'LAC')),
        # NFC: East, North, South, West
        ('nfl_nfc', 'nfl_nfc_east', ('DAL', 'NYG', 'PHI', 'WAS')),
        ('nfl_nfc', 'nfl_nfc_north', ('CHI', 'DET', 'GB', 'MIN')),
        ('nfl_nfc', 'nfl_nfc_south', ('ATL', 'CAR', 'NO', 'TB')),
        ('nfl_nfc', 'nfl_nfc_west', ('ARI', 'LAR', 'SF', 'SEA')),
    )
    for abbrev in members
}
# =============================================================================
# FUZZY MATCHING
# =============================================================================
def normalize_for_matching(text: str) -> str:
    """Normalize a venue name for fuzzy matching.

    Steps, in order:
      1. Lowercase and strip surrounding whitespace.
      2. Drop generic venue words (arena, center, centre, stadium, field,
         park) when they appear as whole words.
      3. Remove every character that is not a-z, 0-9 or whitespace.
      4. Collapse runs of whitespace to a single space.

    Args:
        text: Raw arena/stadium name.

    Returns:
        The normalized name (may be empty).
    """
    import re

    text = text.lower().strip()
    # \b anchors restrict the removal to whole words. Without them the
    # pattern also cut the word out of longer names, e.g.
    # "Gainbridge Fieldhouse" -> "gainbridge house".
    text = re.sub(r'\b(?:arena|center|stadium|field|park|centre)\b', ' ', text)
    # Remove special characters
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Collapse the gaps left by the two removals above
    return re.sub(r'\s+', ' ', text).strip()
def fuzzy_match_stadium(
    team_arena_name: str,
    team_city: str,
    sport: str,
    stadiums: list[dict],
    confidence_threshold: float = 0.6
) -> tuple[Optional[str], float]:
    """
    Fuzzy match team's arena to a canonical stadium.

    Matching strategy:
    - 70% weight: Name similarity (SequenceMatcher), taking the better of
      the normalized-name ratio and the lowercased full-name ratio
    - 30% weight: City match (exact=1.0, nearby metro city=0.7,
      substring=0.5, otherwise 0.0)

    A missing (empty) city on either side earns no city credit.

    Args:
        team_arena_name: The arena name from team mapping
        team_city: The team's city
        sport: Sport code (NBA, MLB, NHL, NFL); only stadiums whose
            'sport' equals this value are considered
        stadiums: List of canonical stadium dicts; each is read through the
            keys 'sport', 'name', 'city' and 'canonical_id'
        confidence_threshold: Minimum confidence for a match

    Returns:
        (canonical_stadium_id, confidence_score). The ID is None when the
        best combined score is below confidence_threshold; the score is
        always the best score seen (0.0 when there were no candidates).
    """
    # Team city -> neighbouring cities that host its venues (e.g. a
    # "San Francisco" team with an "Oakland" arena). Built once per call
    # rather than once per candidate stadium.
    nearby_cities = {
        'san francisco': ['oakland', 'san jose'],
        'new york': ['brooklyn', 'queens', 'elmont', 'newark'],
        'los angeles': ['inglewood', 'anaheim'],
        'miami': ['sunrise', 'fort lauderdale'],
        'dallas': ['arlington', 'fort worth'],
        # 'capitol heights' is the Maryland town's spelling; the earlier
        # 'capital heights' spelling is kept so existing data still matches.
        'washington': ['landover', 'capital heights', 'capitol heights'],
        'minneapolis': ['st paul', 'st. paul'],
        'detroit': ['auburn hills', 'pontiac'],
    }

    best_match = None
    best_score = 0.0

    # Loop-invariant forms of the team's inputs
    arena_normalized = normalize_for_matching(team_arena_name)
    arena_lower = team_arena_name.lower()
    city_lower = team_city.lower()

    # Filter to same sport
    sport_stadiums = [s for s in stadiums if s['sport'] == sport]

    for stadium in sport_stadiums:
        stadium_name = stadium['name']

        # Score 1: name similarity, best of normalized vs. full-name ratio
        name_score = max(
            SequenceMatcher(
                None, arena_normalized, normalize_for_matching(stadium_name)
            ).ratio(),
            SequenceMatcher(None, arena_lower, stadium_name.lower()).ratio(),
        )

        # Score 2: city match
        city_score = 0.0
        stadium_city_lower = stadium['city'].lower()
        # Both cities must be non-empty: '' is a substring of every string
        # (and '' == ''), so a missing city would otherwise earn 0.5 or 1.0
        # and inflate the confidence of every candidate.
        if city_lower and stadium_city_lower:
            if city_lower == stadium_city_lower:
                city_score = 1.0
            elif city_lower in stadium_city_lower or stadium_city_lower in city_lower:
                city_score = 0.5

        # Nearby-city relation, checked in both directions; it takes
        # precedence over a substring match.
        if (stadium_city_lower in nearby_cities.get(city_lower, ())
                or city_lower in nearby_cities.get(stadium_city_lower, ())):
            city_score = 0.7

        # Combined score (weighted); strict '>' keeps the first of any tie
        combined = (name_score * 0.7) + (city_score * 0.3)
        if combined > best_score:
            best_score = combined
            best_match = stadium['canonical_id']

    if best_score >= confidence_threshold:
        return best_match, best_score
    return None, best_score
# =============================================================================
# CANONICALIZATION
# =============================================================================
def generate_canonical_team_id(sport: str, abbrev: str) -> str:
    """
    Build the deterministic canonical ID for a team.

    The ID is "team", the lowercased sport and the lowercased abbreviation
    joined by underscores, e.g. ("NBA", "ATL") -> "team_nba_atl".

    Args:
        sport: Sport code
        abbrev: Team abbreviation
    Returns:
        The canonical team ID string
    """
    id_parts = ("team", sport.lower(), abbrev.lower())
    return "_".join(id_parts)
def canonicalize_teams(
    team_mappings: dict[str, dict],
    sport: str,
    canonical_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalTeam], list[MatchWarning]]:
    """
    Stage 2: Canonicalize the teams of one sport.

    For every entry of team_mappings:
    1. Generate the canonical team ID from the abbreviation
    2. Fuzzy match the team's venue to a canonical stadium
    3. Record a warning when nothing matched (a placeholder
       "stadium_unknown_{sport}_{abbrev}" ID is used instead) or when the
       match confidence is below 0.8
    4. Attach conference/division IDs from the sport's division table
       (both None when the abbreviation is not listed)

    Args:
        team_mappings: Team data dict (e.g., NBA_TEAMS), abbreviation -> info;
            info is read through 'name', 'city' and 'arena'/'stadium'
        sport: Sport code
        canonical_stadiums: List of canonical stadium dicts
        verbose: Print detailed progress
    Returns:
        (canonical_teams, warnings)
    """
    # Matches scoring below this are accepted but flagged for review.
    review_cutoff = 0.8

    # NBA/NHL mappings name their venue 'arena'; all other sports 'stadium'.
    if sport in ('NBA', 'NHL'):
        arena_key = 'arena'
    else:
        arena_key = 'stadium'

    # Division table for this sport; unknown sports get an empty table.
    tables_by_sport = {
        'NBA': NBA_DIVISIONS,
        'MLB': MLB_DIVISIONS,
        'NHL': NHL_DIVISIONS,
        'NFL': NFL_DIVISIONS,
    }
    division_map = tables_by_sport.get(sport, {})

    teams = []
    warnings = []

    for abbrev, info in team_mappings.items():
        canonical_id = generate_canonical_team_id(sport, abbrev)
        arena_name = info.get(arena_key, '')
        city = info.get('city', '')
        team_name = info.get('name', '')

        matched_id, confidence = fuzzy_match_stadium(
            arena_name, city, sport, canonical_stadiums
        )
        stadium_canonical_id = matched_id

        if matched_id is None:
            # No acceptable candidate: warn and fall back to a placeholder ID.
            warnings.append(MatchWarning(
                team_canonical_id=canonical_id,
                team_name=team_name,
                arena_name=arena_name,
                matched_stadium=None,
                issue='No stadium match found',
                confidence=confidence
            ))
            stadium_canonical_id = f"stadium_unknown_{sport.lower()}_{abbrev.lower()}"
            if verbose:
                print(f" WARNING: {canonical_id} - no stadium match for '{arena_name}'")
        elif confidence < review_cutoff:
            # Matched, but weakly: keep the match and flag it.
            warnings.append(MatchWarning(
                team_canonical_id=canonical_id,
                team_name=team_name,
                arena_name=arena_name,
                matched_stadium=matched_id,
                issue='Low confidence stadium match',
                confidence=confidence
            ))
            if verbose:
                print(f" WARNING: {canonical_id} - low confidence ({confidence:.2f}) match to {matched_id}")

        conf_id, div_id = division_map.get(abbrev, (None, None))

        teams.append(CanonicalTeam(
            canonical_id=canonical_id,
            name=team_name,
            abbreviation=abbrev,
            sport=sport,
            city=city,
            stadium_canonical_id=stadium_canonical_id,
            conference_id=conf_id,
            division_id=div_id
        ))

        if verbose and confidence >= review_cutoff:
            print(f" {canonical_id}: {team_name} -> {stadium_canonical_id} ({confidence:.2f})")

    return teams, warnings
def canonicalize_all_teams(
    canonical_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalTeam], list[MatchWarning]]:
    """
    Canonicalize the teams of every supported sport.

    Runs canonicalize_teams for NBA, MLB, NHL and NFL, in that order, and
    concatenates the results.

    Args:
        canonical_stadiums: List of canonical stadium dicts
        verbose: Print a header per sport and detailed progress
    Returns:
        (all_teams, all_warnings) across all sports
    """
    leagues = (
        ('NBA', NBA_TEAMS),
        ('MLB', MLB_TEAMS),
        ('NHL', NHL_TEAMS),
        ('NFL', NFL_TEAMS),
    )

    combined_teams = []
    combined_warnings = []

    for sport, team_map in leagues:
        if verbose:
            print(f"\n{sport}:")
        league_teams, league_warnings = canonicalize_teams(
            team_map, sport, canonical_stadiums, verbose
        )
        combined_teams += league_teams
        combined_warnings += league_warnings

    return combined_teams, combined_warnings
# =============================================================================
# MAIN
# =============================================================================
def main():
    """Command-line entry point.

    Reads the canonical stadiums JSON (--stadiums), canonicalizes the teams
    of every sport, prints a report, and writes into the --output directory:
      - teams_canonical.json: every canonical team
      - team_matching_warnings.json: only when there is at least one warning

    All JSON files are read and written as UTF-8.
    """
    parser = argparse.ArgumentParser(
        description='Canonicalize team data'
    )
    parser.add_argument(
        '--stadiums', type=str, default='./data/stadiums_canonical.json',
        help='Input canonical stadiums JSON file'
    )
    parser.add_argument(
        '--output', type=str, default='./data',
        help='Output directory for canonical files'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )
    args = parser.parse_args()

    stadiums_path = Path(args.stadiums)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load canonical stadiums. The encoding is explicit because open()
    # otherwise uses the platform's locale encoding, which can mis-decode
    # non-ASCII stadium or city names in a UTF-8 JSON file.
    print(f"Loading canonical stadiums from {stadiums_path}...")
    with open(stadiums_path, encoding='utf-8') as f:
        canonical_stadiums = json.load(f)
    print(f" Loaded {len(canonical_stadiums)} canonical stadiums")

    # Canonicalize teams
    print("\nCanonicalizing teams...")
    canonical_teams, warnings = canonicalize_all_teams(
        canonical_stadiums, verbose=args.verbose
    )
    print(f" Created {len(canonical_teams)} canonical teams")

    if warnings:
        print(f"\n Warnings: {len(warnings)}")
        for w in warnings:
            print(f" - {w.team_canonical_id}: {w.issue} (confidence: {w.confidence:.2f})")

    # Export
    teams_path = output_dir / 'teams_canonical.json'
    warnings_path = output_dir / 'team_matching_warnings.json'

    with open(teams_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(t) for t in canonical_teams], f, indent=2)
    print(f"\nExported teams to {teams_path}")

    # NOTE(review): when this run has no warnings, a warnings file left by
    # an earlier run is not touched and may therefore be stale.
    if warnings:
        with open(warnings_path, 'w', encoding='utf-8') as f:
            json.dump([asdict(w) for w in warnings], f, indent=2)
        print(f"Exported warnings to {warnings_path}")

    # Summary by sport
    print("\nSummary by sport:")
    by_sport = {}
    for t in canonical_teams:
        by_sport[t.sport] = by_sport.get(t.sport, 0) + 1
    for sport, count in sorted(by_sport.items()):
        print(f" {sport}: {count} teams")


if __name__ == '__main__':
    main()