- Import WNBA_TEAMS from wnba module - Add WNBA_DIVISIONS dict (single league structure, no divisions) - Add WNBA to sport_mappings for team canonicalization - Update arena_key to use 'arena' for WNBA (like NBA/NHL) - Add WNBA team abbreviation aliases (LV, LAS, NYL, PHX, etc.) - Add WNBA stadium aliases (Michelob Ultra Arena, Gateway Center, etc.) Total teams: 167 (13 WNBA teams added) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
591 lines
19 KiB
Python
591 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Team Canonicalization for SportsTime
|
|
====================================
|
|
Stage 2 of the canonicalization pipeline.
|
|
|
|
Generates canonical team IDs and fuzzy matches teams to stadiums.
|
|
|
|
Usage:
|
|
python canonicalize_teams.py --stadiums data/stadiums_canonical.json --output data/
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
from dataclasses import dataclass, asdict, field
|
|
from difflib import SequenceMatcher
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
# Import team mappings from scraper
|
|
from scrape_schedules import NBA_TEAMS, MLB_TEAMS, NHL_TEAMS, NFL_TEAMS
|
|
from mls import MLS_TEAMS
|
|
from wnba import WNBA_TEAMS
|
|
|
|
|
|
# =============================================================================
|
|
# DATA CLASSES
|
|
# =============================================================================
|
|
|
|
@dataclass
class CanonicalTeam:
    """Team record emitted by the team canonicalization stage.

    The first six fields are required. The conference/division IDs and both
    color fields are optional and default to ``None``.
    """

    # --- identity -----------------------------------------------------------
    canonical_id: str  # stable ID, see generate_canonical_team_id()
    name: str
    abbreviation: str
    sport: str
    city: str

    # --- venue link ---------------------------------------------------------
    # Canonical ID of the matched stadium, or the "stadium_unknown_..."
    # placeholder that canonicalize_teams() substitutes when nothing matched.
    stadium_canonical_id: str

    # --- league structure (None when the sport's table has no entry) --------
    conference_id: Optional[str] = None
    division_id: Optional[str] = None

    # --- branding (not filled in by canonicalize_teams()) -------------------
    primary_color: Optional[str] = None
    secondary_color: Optional[str] = None
|
|
|
|
|
|
@dataclass
class MatchWarning:
    """Review note recorded when a team's stadium match is missing or weak.

    All fields are required.
    """

    # Which team the warning is about.
    team_canonical_id: str
    team_name: str

    # Venue name taken from the team mapping, i.e. the string that was matched.
    arena_name: str

    # Canonical stadium ID that was chosen, or None when no match was accepted.
    matched_stadium: Optional[str]

    # Human-readable description of the problem.
    issue: str

    # Score returned by the fuzzy matcher for the best candidate.
    confidence: float
|
|
|
|
|
|
# =============================================================================
|
|
# LEAGUE STRUCTURE
|
|
# Maps team abbreviation -> (conference_id, division_id)
|
|
# =============================================================================
|
|
|
|
# NBA: team abbreviation -> (conference_id, division_id).
# Each row below is (conference_id, division_id, member abbreviations); the
# comprehension flattens the rows while keeping the listed order.
NBA_DIVISIONS = {
    abbrev: (conference_id, division_id)
    for conference_id, division_id, members in (
        ('nba_eastern', 'nba_atlantic', ('BOS', 'BRK', 'NYK', 'PHI', 'TOR')),
        ('nba_eastern', 'nba_central', ('CHI', 'CLE', 'DET', 'IND', 'MIL')),
        ('nba_eastern', 'nba_southeast', ('ATL', 'CHO', 'MIA', 'ORL', 'WAS')),
        ('nba_western', 'nba_northwest', ('DEN', 'MIN', 'OKC', 'POR', 'UTA')),
        ('nba_western', 'nba_pacific', ('GSW', 'LAC', 'LAL', 'PHO', 'SAC')),
        ('nba_western', 'nba_southwest', ('DAL', 'HOU', 'MEM', 'NOP', 'SAS')),
    )
    for abbrev in members
}
|
|
|
|
# MLB: team abbreviation -> (conference_id, division_id), where the
# "conference" slot holds the league (AL / NL).
# Each row below is (league_id, division_id, member abbreviations).
MLB_DIVISIONS = {
    abbrev: (league_id, division_id)
    for league_id, division_id, members in (
        ('mlb_al', 'mlb_al_east', ('NYY', 'BOS', 'TOR', 'BAL', 'TBR')),
        ('mlb_al', 'mlb_al_central', ('CLE', 'DET', 'MIN', 'CHW', 'KCR')),
        ('mlb_al', 'mlb_al_west', ('HOU', 'SEA', 'TEX', 'LAA', 'OAK')),
        ('mlb_nl', 'mlb_nl_east', ('ATL', 'PHI', 'NYM', 'MIA', 'WSN')),
        ('mlb_nl', 'mlb_nl_central', ('MIL', 'CHC', 'STL', 'PIT', 'CIN')),
        ('mlb_nl', 'mlb_nl_west', ('LAD', 'ARI', 'SDP', 'SFG', 'COL')),
    )
    for abbrev in members
}
|
|
|
|
# NHL: team abbreviation -> (conference_id, division_id).
# Each row below is (conference_id, division_id, member abbreviations).
NHL_DIVISIONS = {
    abbrev: (conference_id, division_id)
    for conference_id, division_id, members in (
        ('nhl_eastern', 'nhl_atlantic',
         ('BOS', 'BUF', 'DET', 'FLA', 'MTL', 'OTT', 'TBL', 'TOR')),
        ('nhl_eastern', 'nhl_metropolitan',
         ('CAR', 'CBJ', 'NJD', 'NYI', 'NYR', 'PHI', 'PIT', 'WSH')),
        # 'ARI' is the key used for the Utah Hockey Club.
        ('nhl_western', 'nhl_central',
         ('ARI', 'CHI', 'COL', 'DAL', 'MIN', 'NSH', 'STL', 'WPG')),
        ('nhl_western', 'nhl_pacific',
         ('ANA', 'CGY', 'EDM', 'LAK', 'SEA', 'SJS', 'VAN', 'VGK')),
    )
    for abbrev in members
}
|
|
|
|
# NFL: team abbreviation -> (conference_id, division_id).
# Each row below is (conference_id, division_id, member abbreviations).
NFL_DIVISIONS = {
    abbrev: (conference_id, division_id)
    for conference_id, division_id, members in (
        ('nfl_afc', 'nfl_afc_east', ('BUF', 'MIA', 'NE', 'NYJ')),
        ('nfl_afc', 'nfl_afc_north', ('BAL', 'CIN', 'CLE', 'PIT')),
        ('nfl_afc', 'nfl_afc_south', ('HOU', 'IND', 'JAX', 'TEN')),
        ('nfl_afc', 'nfl_afc_west', ('DEN', 'KC', 'LV', 'LAC')),
        ('nfl_nfc', 'nfl_nfc_east', ('DAL', 'NYG', 'PHI', 'WAS')),
        ('nfl_nfc', 'nfl_nfc_north', ('CHI', 'DET', 'GB', 'MIN')),
        ('nfl_nfc', 'nfl_nfc_south', ('ATL', 'CAR', 'NO', 'TB')),
        ('nfl_nfc', 'nfl_nfc_west', ('ARI', 'LAR', 'SF', 'SEA')),
    )
    for abbrev in members
}
|
|
|
|
# MLS: team abbreviation -> (conference_id, None).
# MLS uses conferences only, so the division slot is always None.
# Each row below is (conference_id, member abbreviations).
MLS_DIVISIONS = {
    abbrev: (conference_id, None)
    for conference_id, members in (
        # NSH (Nashville SC) belongs to the Eastern Conference; with it listed
        # here both conferences have 15 clubs.
        ('mls_eastern', (
            'ATL', 'CHI', 'CIN', 'CLB', 'CLT', 'DC', 'MIA', 'MTL',
            'NE', 'NSH', 'NYCFC', 'NYRB', 'ORL', 'PHI', 'TOR',
        )),
        ('mls_western', (
            'AUS', 'COL', 'DAL', 'HOU', 'LAFC', 'LAG', 'MIN', 'POR',
            'RSL', 'SD', 'SEA', 'SJ', 'SKC', 'STL', 'VAN',
        )),
    )
    for abbrev in members
}
|
|
|
|
# WNBA: modelled as a single league with no divisions, so every team maps to
# the same ('wnba', None) pair.
WNBA_DIVISIONS = dict.fromkeys(
    (
        'ATL', 'CHI', 'CON', 'DAL', 'GSV', 'IND', 'LVA',
        'LA', 'MIN', 'NY', 'PHO', 'SEA', 'WAS',
    ),
    ('wnba', None),
)
|
|
|
|
|
|
# =============================================================================
|
|
# FUZZY MATCHING
|
|
# =============================================================================
|
|
|
|
def normalize_for_matching(text: str) -> str:
    """Normalize a venue name for fuzzy matching.

    Steps, in order:

    1. Lowercase and trim the text.
    2. Drop the generic venue words "arena", "center", "centre", "stadium",
       "field" and "park" when they appear as whole words.
    3. Remove every character that is not a lowercase ASCII letter, a digit,
       or whitespace.
    4. Collapse runs of whitespace and trim again.

    Args:
        text: Raw venue name.

    Returns:
        The normalized name. It is empty when nothing but generic words and
        punctuation was present.
    """
    import re

    lowered = text.lower().strip()

    # \b keeps the generic words from being cut out of the middle of longer
    # words: "fieldhouse" and "parkview" must survive intact.
    without_generic = re.sub(
        r'\b(?:arena|center|stadium|field|park|centre)\b', ' ', lowered
    )

    # Punctuation is deleted rather than replaced by a space, so "t-mobile"
    # becomes "tmobile".
    alnum_only = re.sub(r'[^a-z0-9\s]', '', without_generic)

    return re.sub(r'\s+', ' ', alnum_only).strip()
|
|
|
|
|
|
def fuzzy_match_stadium(
    team_arena_name: str,
    team_city: str,
    sport: str,
    stadiums: list[dict],
    confidence_threshold: float = 0.6
) -> tuple[Optional[str], float]:
    """
    Fuzzy match a team's venue to a canonical stadium of the same sport.

    Only stadiums whose 'sport' equals ``sport`` are considered. Each one is
    scored as ``0.7 * name_score + 0.3 * city_score`` and the highest score
    wins; on a tie the earliest stadium in the list is kept.

    - name_score: the better of two SequenceMatcher ratios, one computed on
      the names after normalize_for_matching() and one on the plain
      lowercased names.
    - city_score: 1.0 for identical cities, 0.7 for a known same-metro pair
      (e.g. "San Francisco" team, "Oakland" venue), 0.5 when one city string
      contains the other, otherwise 0.0. A blank city on either side always
      scores 0.0, so missing data never earns city credit.

    Args:
        team_arena_name: The venue name from the team mapping
        team_city: The team's city
        sport: Sport code, compared for equality with each stadium's 'sport'
        stadiums: List of canonical stadium dicts; the keys 'sport', 'name',
            'city' and 'canonical_id' are read
        confidence_threshold: Minimum combined score for a match to be accepted

    Returns:
        (canonical_stadium_id, confidence_score). The ID is None when the best
        score is below confidence_threshold; the best score is returned either
        way.
    """
    # Primary city -> nearby municipalities that count as the same metro area.
    # Built once per call instead of once per candidate stadium.
    nearby_cities = {
        'san francisco': ('oakland', 'san jose'),
        'new york': ('brooklyn', 'queens', 'elmont', 'newark'),
        'los angeles': ('inglewood', 'anaheim'),
        'miami': ('sunrise', 'fort lauderdale'),
        'dallas': ('arlington', 'fort worth'),
        # 'capital heights' is the historical (misspelled) entry; it is kept
        # alongside the correct spelling for backward compatibility.
        'washington': ('landover', 'capital heights', 'capitol heights'),
        'minneapolis': ('st paul', 'st. paul'),
        'detroit': ('auburn hills', 'pontiac'),
    }

    arena_lower = team_arena_name.lower()
    arena_normalized = normalize_for_matching(team_arena_name)
    city_lower = team_city.lower()

    best_match = None
    best_score = 0.0

    # Filter to same sport
    sport_stadiums = [s for s in stadiums if s['sport'] == sport]

    for stadium in sport_stadiums:
        stadium_name = stadium['name']

        # Score 1: name similarity, taking the better of the normalized and
        # the plain lowercased comparison.
        name_score = max(
            SequenceMatcher(
                None, arena_normalized, normalize_for_matching(stadium_name)
            ).ratio(),
            SequenceMatcher(None, arena_lower, stadium_name.lower()).ratio(),
        )

        # Score 2: city match.
        stadium_city_lower = stadium['city'].lower()
        if not city_lower or not stadium_city_lower:
            # '' is a substring of every string (and equals ''), so without
            # this guard a missing city would be treated as a match.
            city_score = 0.0
        elif city_lower == stadium_city_lower:
            city_score = 1.0
        elif (
            stadium_city_lower in nearby_cities.get(city_lower, ())
            or city_lower in nearby_cities.get(stadium_city_lower, ())
        ):
            # A same-metro pair outranks a mere substring overlap.
            city_score = 0.7
        elif city_lower in stadium_city_lower or stadium_city_lower in city_lower:
            city_score = 0.5
        else:
            city_score = 0.0

        # Combined score (weighted)
        combined = (name_score * 0.7) + (city_score * 0.3)

        # Strict '>' keeps the first stadium on ties.
        if combined > best_score:
            best_score = combined
            best_match = stadium['canonical_id']

    if best_score >= confidence_threshold:
        return best_match, best_score

    return None, best_score
|
|
|
|
|
|
# =============================================================================
|
|
# CANONICALIZATION
|
|
# =============================================================================
|
|
|
|
def generate_canonical_team_id(sport: str, abbrev: str) -> str:
    """
    Build the deterministic canonical ID for a team.

    The ID is the literal prefix "team" followed by the lowercased sport code
    and the lowercased abbreviation, joined with underscores.

    Example: ("NBA", "ATL") -> "team_nba_atl"
    """
    id_parts = ("team", sport.lower(), abbrev.lower())
    return "_".join(id_parts)
|
|
|
|
|
|
def canonicalize_teams(
    team_mappings: dict[str, dict],
    sport: str,
    canonical_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalTeam], list[MatchWarning]]:
    """
    Stage 2: build canonical team records for one sport.

    For every abbreviation in ``team_mappings`` this:

    1. derives the canonical team ID from the sport and abbreviation,
    2. fuzzy matches the team's venue against ``canonical_stadiums``,
    3. records a warning when no stadium was accepted or the accepted match
       scored below 0.8,
    4. looks up the conference/division pair for the abbreviation.

    A team without an accepted stadium is still emitted; its stadium ID is the
    placeholder "stadium_unknown_{sport}_{abbrev}" (both lowercased).

    Args:
        team_mappings: Team data dict (e.g., NBA_TEAMS), keyed by abbreviation
        sport: Sport code
        canonical_stadiums: List of canonical stadium dicts
        verbose: Print one progress line per team

    Returns:
        (canonical_teams, warnings)
    """
    # NBA, NHL and WNBA mappings store the venue under 'arena'; every other
    # sport stores it under 'stadium'.
    venue_key = 'stadium'
    if sport in ['NBA', 'NHL', 'WNBA']:
        venue_key = 'arena'

    league_tables = {
        'NBA': NBA_DIVISIONS,
        'MLB': MLB_DIVISIONS,
        'NHL': NHL_DIVISIONS,
        'NFL': NFL_DIVISIONS,
        'MLS': MLS_DIVISIONS,
        'WNBA': WNBA_DIVISIONS,
    }
    # An unknown sport gets an empty table, i.e. no conference/division data.
    league_structure = league_tables.get(sport, {})

    # Matches scoring below this are kept but flagged for review.
    review_threshold = 0.8

    canonical_teams = []
    match_warnings = []

    for abbrev, info in team_mappings.items():
        team_id = generate_canonical_team_id(sport, abbrev)
        venue_name = info.get(venue_key, '')
        team_city = info.get('city', '')
        display_name = info.get('name', '')

        stadium_id, score = fuzzy_match_stadium(
            venue_name, team_city, sport, canonical_stadiums
        )
        unmatched = stadium_id is None

        if unmatched or score < review_threshold:
            if unmatched:
                problem = 'No stadium match found'
            else:
                problem = 'Low confidence stadium match'

            # stadium_id is still None here for the unmatched case, which is
            # exactly what the warning should record.
            match_warnings.append(MatchWarning(
                team_canonical_id=team_id,
                team_name=display_name,
                arena_name=venue_name,
                matched_stadium=stadium_id,
                issue=problem,
                confidence=score
            ))

            if unmatched:
                # Create placeholder ID
                stadium_id = f"stadium_unknown_{sport.lower()}_{abbrev.lower()}"
                if verbose:
                    print(f" WARNING: {team_id} - no stadium match for '{venue_name}'")
            elif verbose:
                print(f" WARNING: {team_id} - low confidence ({score:.2f}) match to {stadium_id}")
        elif verbose:
            print(f" {team_id}: {display_name} -> {stadium_id} ({score:.2f})")

        conference, division = league_structure.get(abbrev, (None, None))

        canonical_teams.append(CanonicalTeam(
            canonical_id=team_id,
            name=display_name,
            abbreviation=abbrev,
            sport=sport,
            city=team_city,
            stadium_canonical_id=stadium_id,
            conference_id=conference,
            division_id=division
        ))

    return canonical_teams, match_warnings
|
|
|
|
|
|
def canonicalize_all_teams(
    canonical_stadiums: list[dict],
    verbose: bool = False
) -> tuple[list[CanonicalTeam], list[MatchWarning]]:
    """
    Canonicalize the teams of every supported sport.

    Sports are processed in the order NBA, MLB, NHL, NFL, MLS, WNBA and the
    per-sport results are concatenated in that order.

    Args:
        canonical_stadiums: List of canonical stadium dicts
        verbose: Print a header per sport and detailed per-team progress

    Returns:
        (all_teams, all_warnings) across all sports
    """
    # Insertion order of this dict is the processing order.
    teams_by_sport = {
        'NBA': NBA_TEAMS,
        'MLB': MLB_TEAMS,
        'NHL': NHL_TEAMS,
        'NFL': NFL_TEAMS,
        'MLS': MLS_TEAMS,
        'WNBA': WNBA_TEAMS,
    }

    combined_teams = []
    combined_warnings = []

    for sport, team_map in teams_by_sport.items():
        if verbose:
            print(f"\n{sport}:")

        sport_teams, sport_warnings = canonicalize_teams(
            team_map, sport, canonical_stadiums, verbose
        )
        combined_teams += sport_teams
        combined_warnings += sport_warnings

    return combined_teams, combined_warnings
|
|
|
|
|
|
# =============================================================================
|
|
# MAIN
|
|
# =============================================================================
|
|
|
|
def main():
    """
    Command-line entry point.

    Loads the canonical stadiums JSON, canonicalizes the teams of every sport,
    writes 'teams_canonical.json' to the output directory (plus
    'team_matching_warnings.json' when there is at least one warning) and
    prints a per-sport team count.

    All files are read and written as UTF-8 so the result does not depend on
    the platform's default encoding.
    """
    parser = argparse.ArgumentParser(
        description='Canonicalize team data'
    )
    parser.add_argument(
        '--stadiums', type=str, default='./data/stadiums_canonical.json',
        help='Input canonical stadiums JSON file'
    )
    parser.add_argument(
        '--output', type=str, default='./data',
        help='Output directory for canonical files'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )

    args = parser.parse_args()
    stadiums_path = Path(args.stadiums)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load canonical stadiums
    print(f"Loading canonical stadiums from {stadiums_path}...")
    # Explicit UTF-8: stadium names may contain non-ASCII characters, which
    # a non-UTF-8 locale default would fail on or mis-decode.
    with open(stadiums_path, encoding='utf-8') as f:
        canonical_stadiums = json.load(f)
    print(f" Loaded {len(canonical_stadiums)} canonical stadiums")

    # Canonicalize teams
    print("\nCanonicalizing teams...")
    canonical_teams, warnings = canonicalize_all_teams(
        canonical_stadiums, verbose=args.verbose
    )
    print(f" Created {len(canonical_teams)} canonical teams")

    if warnings:
        print(f"\n Warnings: {len(warnings)}")
        for w in warnings:
            print(f" - {w.team_canonical_id}: {w.issue} (confidence: {w.confidence:.2f})")

    # Export
    teams_path = output_dir / 'teams_canonical.json'
    warnings_path = output_dir / 'team_matching_warnings.json'

    with open(teams_path, 'w', encoding='utf-8') as f:
        json.dump([asdict(t) for t in canonical_teams], f, indent=2)
    print(f"\nExported teams to {teams_path}")

    # The warnings file is only written when there is something to report.
    if warnings:
        with open(warnings_path, 'w', encoding='utf-8') as f:
            json.dump([asdict(w) for w in warnings], f, indent=2)
        print(f"Exported warnings to {warnings_path}")

    # Summary by sport
    print("\nSummary by sport:")
    by_sport = {}
    for t in canonical_teams:
        by_sport[t.sport] = by_sport.get(t.sport, 0) + 1
    for sport, count in sorted(by_sport.items()):
        print(f" {sport}: {count} teams")
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|