Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit. Includes: - Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL - Canonical ID system for teams, stadiums, and games - Fuzzy matching with manual alias support - CloudKit uploader with batch operations and deduplication - Comprehensive test suite with fixtures - WNBA abbreviation aliases for improved team resolution - Alias validation script to detect orphan references All 5 phases of data remediation plan completed: - Phase 1: Alias fixes (team/stadium alias additions) - Phase 2: NHL stadium coordinate fixes - Phase 3: Re-scrape validation - Phase 4: iOS bundle update - Phase 5: Code quality improvements (WNBA aliases) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
285 lines
7.7 KiB
Python
285 lines
7.7 KiB
Python
"""Canonical ID generation for games, teams, and stadiums."""
|
|
|
|
import re
|
|
import unicodedata
|
|
from datetime import date, datetime
|
|
from typing import Optional
|
|
|
|
|
|
def normalize_string(s: str) -> str:
    """Turn arbitrary text into a lowercase, underscore-separated ID fragment.

    The input is lowercased, accented characters are reduced to their ASCII
    base letters (characters with no ASCII form are dropped), each run of
    whitespace and/or hyphens becomes one underscore, every remaining
    character outside ``a-z``, ``0-9`` and ``_`` is removed, repeated
    underscores are collapsed, and underscores at either end are trimmed.

    Args:
        s: Text to normalize.

    Returns:
        The normalized fragment; an empty string if nothing usable remains.

    Examples:
        >>> normalize_string('Crypto.com Arena')
        'cryptocom_arena'

        >>> normalize_string('St. Louis')
        'st_louis'
    """
    # NFKD splits an accented letter into base letter + combining mark, so
    # the ASCII round-trip below keeps the base letter and drops the mark.
    decomposed = unicodedata.normalize("NFKD", s.lower())
    text = decomposed.encode("ascii", "ignore").decode("ascii")

    # Order matters: separators first, then disallowed characters, then the
    # underscore runs that the first two steps can leave behind.
    substitutions = (
        (r"[\s\-]+", "_"),
        (r"[^a-z0-9_]", ""),
        (r"_+", "_"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip("_")
|
|
|
|
|
|
def generate_game_id(
|
|
sport: str,
|
|
season: int,
|
|
away_abbrev: str,
|
|
home_abbrev: str,
|
|
game_date: date | datetime,
|
|
game_number: Optional[int] = None,
|
|
) -> str:
|
|
"""Generate a canonical game ID.
|
|
|
|
Format: game_{sport}_{season}_{YYYYMMDD}_{away}_{home}[_{game_number}]
|
|
|
|
Args:
|
|
sport: Sport code (e.g., 'nba', 'mlb')
|
|
season: Season start year (e.g., 2025 for 2025-26)
|
|
away_abbrev: Away team abbreviation (e.g., 'HOU')
|
|
home_abbrev: Home team abbreviation (e.g., 'OKC')
|
|
game_date: Date of the game
|
|
game_number: Game number for doubleheaders (1 or 2), None for single games
|
|
|
|
Returns:
|
|
Canonical game ID (e.g., 'game_nba_2025_20251021_hou_okc')
|
|
|
|
Examples:
|
|
>>> generate_game_id('nba', 2025, 'HOU', 'OKC', date(2025, 10, 21))
|
|
'game_nba_2025_20251021_hou_okc'
|
|
|
|
>>> generate_game_id('mlb', 2026, 'NYY', 'BOS', date(2026, 4, 1), game_number=1)
|
|
'game_mlb_2026_20260401_nyy_bos_1'
|
|
"""
|
|
# Normalize sport and abbreviations
|
|
sport_norm = sport.lower()
|
|
away_norm = away_abbrev.lower()
|
|
home_norm = home_abbrev.lower()
|
|
|
|
# Format date as YYYYMMDD
|
|
if isinstance(game_date, datetime):
|
|
game_date = game_date.date()
|
|
date_str = game_date.strftime("%Y%m%d")
|
|
|
|
# Build ID with game_ prefix
|
|
parts = ["game", sport_norm, str(season), date_str, away_norm, home_norm]
|
|
|
|
# Add game number for doubleheaders
|
|
if game_number is not None:
|
|
parts.append(str(game_number))
|
|
|
|
return "_".join(parts)
|
|
|
|
|
|
def generate_team_id(sport: str, city: str, name: str) -> str:
    """Generate a fallback canonical team ID from city and team name.

    Format: team_{sport}_{normalized_city}_{normalized_name}

    Intended for teams without a known abbreviation. The city and name are
    each passed through ``normalize_string``; the sport code is only
    lowercased. A city or name that normalizes to an empty string is left
    out, so the result never contains a doubled or trailing underscore.

    Args:
        sport: Sport code (e.g., 'nba', 'mlb')
        city: Team city (e.g., 'Los Angeles')
        name: Team name (e.g., 'Lakers')

    Returns:
        Canonical team ID (e.g., 'team_nba_los_angeles_lakers')

    Examples:
        >>> generate_team_id('nba', 'Los Angeles', 'Lakers')
        'team_nba_los_angeles_lakers'

        >>> generate_team_id('mlb', 'New York', 'Yankees')
        'team_mlb_new_york_yankees'
    """
    sport_norm = sport.lower()

    # Skip components that normalize to nothing (empty or punctuation-only
    # input) instead of emitting "team_x__name" or "team_x_city_".
    name_parts = [
        part for part in (normalize_string(city), normalize_string(name)) if part
    ]

    return "_".join(["team", sport_norm, *name_parts])
|
|
|
|
|
|
def generate_team_id_from_abbrev(sport: str, abbreviation: str) -> str:
    """Build the canonical team ID for a team with a known abbreviation.

    Format: team_{sport}_{abbreviation}

    Both inputs are lowercased; nothing else is changed.

    Args:
        sport: Sport code (e.g., 'nba', 'mlb')
        abbreviation: Team abbreviation (e.g., 'LAL', 'NYY')

    Returns:
        Canonical team ID (e.g., 'team_nba_lal')

    Examples:
        >>> generate_team_id_from_abbrev('nba', 'LAL')
        'team_nba_lal'

        >>> generate_team_id_from_abbrev('mlb', 'NYY')
        'team_mlb_nyy'
    """
    return "_".join(("team", sport.lower(), abbreviation.lower()))
|
|
|
|
|
|
def generate_stadium_id(sport: str, name: str) -> str:
    """Build the canonical ID for a stadium.

    Format: stadium_{sport}_{normalized_name}

    The sport code is lowercased; the stadium name is passed through
    ``normalize_string``.

    Args:
        sport: Sport code (e.g., 'nba', 'mlb')
        name: Stadium name (e.g., 'Yankee Stadium')

    Returns:
        Canonical stadium ID (e.g., 'stadium_mlb_yankee_stadium')

    Examples:
        >>> generate_stadium_id('nba', 'Crypto.com Arena')
        'stadium_nba_cryptocom_arena'

        >>> generate_stadium_id('mlb', 'Yankee Stadium')
        'stadium_mlb_yankee_stadium'
    """
    return "_".join(("stadium", sport.lower(), normalize_string(name)))
|
|
|
|
|
|
def _is_ascii_digits(token: str) -> bool:
    """Return True if *token* is non-empty and made only of the digits 0-9."""
    # isdigit() alone also accepts non-ASCII digit characters; isascii()
    # restricts the check to plain 0-9.
    return token.isascii() and token.isdigit()


def parse_game_id(game_id: str) -> dict:
    """Parse a canonical game ID into its components.

    Expected format:
    game_{sport}_{season}_{YYYYMMDD}_{away}_{home}[_{game_number}]

    The ID is split on underscores, so it must have exactly 6 or 7
    components; a sport code or abbreviation that itself contains an
    underscore cannot be parsed. The season, date and game number must be
    plain ASCII digits (no sign, whitespace or other characters), and the
    date must be a real calendar date.

    Args:
        game_id: Canonical game ID (e.g., 'game_nba_2025_20251021_hou_okc')

    Returns:
        Dictionary with keys: sport, season, away_abbrev, home_abbrev,
        year, month, day, game_number (None when the ID has no game number)

    Raises:
        ValueError: If game_id format is invalid, a numeric component is
            not made of digits, or the date is not a valid calendar date

    Examples:
        >>> parse_game_id('game_nba_2025_20251021_hou_okc')  # doctest: +NORMALIZE_WHITESPACE
        {'sport': 'nba', 'season': 2025, 'away_abbrev': 'hou',
         'home_abbrev': 'okc', 'year': 2025, 'month': 10, 'day': 21, 'game_number': None}

        >>> parse_game_id('game_mlb_2026_20260401_nyy_bos_1')  # doctest: +NORMALIZE_WHITESPACE
        {'sport': 'mlb', 'season': 2026, 'away_abbrev': 'nyy',
         'home_abbrev': 'bos', 'year': 2026, 'month': 4, 'day': 1, 'game_number': 1}
    """
    parts = game_id.split("_")

    if len(parts) < 6 or len(parts) > 7:
        raise ValueError(f"Invalid game ID format: {game_id}")

    if parts[0] != "game":
        raise ValueError(f"Game ID must start with 'game_': {game_id}")

    sport, season_str, date_str, away_abbrev, home_abbrev = parts[1:6]

    if not _is_ascii_digits(season_str):
        raise ValueError(f"Invalid season in game ID: {game_id}")

    # Checking digits as well as length stops values such as '+2025102'
    # from being sliced into a bogus year/month/day.
    if len(date_str) != 8 or not _is_ascii_digits(date_str):
        raise ValueError(f"Invalid date format in game ID: {game_id}")

    year = int(date_str[:4])
    month = int(date_str[4:6])
    day = int(date_str[6:])

    # Constructing a date rejects impossible values (month 13, Feb 30, ...).
    try:
        date(year, month, day)
    except ValueError as exc:
        raise ValueError(f"Invalid date format in game ID: {game_id}") from exc

    game_number = None
    if len(parts) == 7:
        if not _is_ascii_digits(parts[6]):
            raise ValueError(f"Invalid game number in game ID: {game_id}")
        game_number = int(parts[6])

    return {
        "sport": sport,
        "season": int(season_str),
        "away_abbrev": away_abbrev,
        "home_abbrev": home_abbrev,
        "year": year,
        "month": month,
        "day": day,
        "game_number": game_number,
    }
|
|
|
|
|
|
def parse_team_id(team_id: str) -> dict:
    """Split a canonical team ID into its sport and identifier.

    Only the first two underscores act as separators, so an identifier
    that itself contains underscores (a city/name fallback such as
    'new_york_yankees') is returned whole.

    Args:
        team_id: Canonical team ID (e.g., 'team_nba_lal')

    Returns:
        Dictionary with keys: sport, identifier (abbreviation or city_name)

    Raises:
        ValueError: If team_id does not start with 'team_' or has no
            identifier component after the sport
    """
    pieces = team_id.split("_", 2)

    # With at most two splits, exactly three pieces whose first is 'team'
    # means the ID has the 'team_' prefix, a sport and an identifier.
    if len(pieces) == 3 and pieces[0] == "team":
        _, sport, identifier = pieces
        return {"sport": sport, "identifier": identifier}

    raise ValueError(f"Invalid team ID format: {team_id}")
|
|
|
|
|
|
def parse_stadium_id(stadium_id: str) -> dict:
    """Split a canonical stadium ID into its sport and stadium name.

    Everything after the sport component is returned as the name, including
    any further underscores (e.g. 'paycom_center').

    Args:
        stadium_id: Canonical stadium ID (e.g., 'stadium_nba_paycom_center')

    Returns:
        Dictionary with keys: sport, name

    Raises:
        ValueError: If stadium_id does not start with 'stadium_' or has no
            name component after the sport
    """
    prefix = "stadium_"

    if stadium_id.startswith(prefix):
        # The first underscore after the prefix separates sport from name;
        # an empty separator means the name component is missing entirely.
        sport, separator, name = stadium_id[len(prefix):].partition("_")
        if separator:
            return {"sport": sport, "name": name}

    raise ValueError(f"Invalid stadium ID format: {stadium_id}")
|