feat(scripts): rewrite parser as modular Python CLI
Replace monolithic scraping scripts with sportstime_parser package:

- Multi-source scrapers with automatic fallback for 7 sports
- Canonical ID generation for games, teams, and stadiums
- Fuzzy matching with configurable thresholds for name resolution
- CloudKit Web Services uploader with JWT auth, diff-based updates
- Resumable uploads with checkpoint state persistence
- Validation reports with manual review items and suggested matches
- Comprehensive test suite (249 tests)

CLI: sportstime-parser scrape|validate|upload|status|retry|clear

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
279
Scripts/sportstime_parser/normalizers/canonical_id.py
Normal file
279
Scripts/sportstime_parser/normalizers/canonical_id.py
Normal file
@@ -0,0 +1,279 @@
|
||||
"""Canonical ID generation for games, teams, and stadiums."""
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
from datetime import date, datetime
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def normalize_string(s: str) -> str:
    """Normalize a string for use in canonical IDs.

    Steps, applied in this order:

    - Convert to lowercase
    - Fold accented characters to ASCII (e.g., 'é' -> 'e'); characters with
      no ASCII decomposition are dropped
    - Replace runs of whitespace and hyphens with a single underscore
    - Remove every remaining character that is not a-z, 0-9, or underscore
    - Collapse multiple underscores
    - Strip leading/trailing underscores

    Args:
        s: String to normalize

    Returns:
        Normalized string suitable for IDs. The result is empty when ``s``
        contains no ASCII-representable letters or digits.

    Examples:
        >>> normalize_string('Crypto.com Arena')
        'cryptocom_arena'

        >>> normalize_string('Montréal  Canadiens')
        'montreal_canadiens'
    """
    result = s.lower()

    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII round-trip then discards the marks (and anything else non-ASCII).
    result = unicodedata.normalize("NFKD", result)
    result = result.encode("ascii", "ignore").decode("ascii")

    # Separators become underscores before punctuation is stripped, so
    # "St. Louis" keeps its word boundary ("st_louis").
    result = re.sub(r"[\s\-]+", "_", result)
    result = re.sub(r"[^a-z0-9_]", "", result)

    # Underscores already present in the input may now sit next to the ones
    # introduced above.
    result = re.sub(r"_+", "_", result)

    return result.strip("_")
|
||||
|
||||
|
||||
def generate_game_id(
|
||||
sport: str,
|
||||
season: int,
|
||||
away_abbrev: str,
|
||||
home_abbrev: str,
|
||||
game_date: date | datetime,
|
||||
game_number: Optional[int] = None,
|
||||
) -> str:
|
||||
"""Generate a canonical game ID.
|
||||
|
||||
Format: {sport}_{season}_{away}_{home}_{MMDD}[_{game_number}]
|
||||
|
||||
Args:
|
||||
sport: Sport code (e.g., 'nba', 'mlb')
|
||||
season: Season start year (e.g., 2025 for 2025-26)
|
||||
away_abbrev: Away team abbreviation (e.g., 'HOU')
|
||||
home_abbrev: Home team abbreviation (e.g., 'OKC')
|
||||
game_date: Date of the game
|
||||
game_number: Game number for doubleheaders (1 or 2), None for single games
|
||||
|
||||
Returns:
|
||||
Canonical game ID (e.g., 'nba_2025_hou_okc_1021')
|
||||
|
||||
Examples:
|
||||
>>> generate_game_id('nba', 2025, 'HOU', 'OKC', date(2025, 10, 21))
|
||||
'nba_2025_hou_okc_1021'
|
||||
|
||||
>>> generate_game_id('mlb', 2026, 'NYY', 'BOS', date(2026, 4, 1), game_number=1)
|
||||
'mlb_2026_nyy_bos_0401_1'
|
||||
"""
|
||||
# Normalize sport and abbreviations
|
||||
sport_norm = sport.lower()
|
||||
away_norm = away_abbrev.lower()
|
||||
home_norm = home_abbrev.lower()
|
||||
|
||||
# Format date as MMDD
|
||||
if isinstance(game_date, datetime):
|
||||
game_date = game_date.date()
|
||||
date_str = game_date.strftime("%m%d")
|
||||
|
||||
# Build ID
|
||||
parts = [sport_norm, str(season), away_norm, home_norm, date_str]
|
||||
|
||||
# Add game number for doubleheaders
|
||||
if game_number is not None:
|
||||
parts.append(str(game_number))
|
||||
|
||||
return "_".join(parts)
|
||||
|
||||
|
||||
def generate_team_id(sport: str, city: str, name: str) -> str:
    """Generate a canonical team ID from the team's city and name.

    Format: team_{sport}_{normalized_city}_{normalized_name}

    This is the fallback for teams without a known abbreviation; when an
    abbreviation is available, use generate_team_id_from_abbrev instead.

    City and name are passed through normalize_string. If either one
    normalizes to an empty string it is left out, so the ID never contains
    a doubled or trailing underscore from a missing component.

    Args:
        sport: Sport code (e.g., 'nba', 'mlb'); lowercased only
        city: Team city (e.g., 'Los Angeles')
        name: Team name (e.g., 'Lakers')

    Returns:
        Canonical team ID (e.g., 'team_nba_los_angeles_lakers')

    Examples:
        >>> generate_team_id('nba', 'Los Angeles', 'Lakers')
        'team_nba_los_angeles_lakers'

        >>> generate_team_id('mlb', 'New York', 'Yankees')
        'team_mlb_new_york_yankees'
    """
    sport_norm = sport.lower()
    components = (normalize_string(city), normalize_string(name))

    # Skip empty components instead of emitting "team_nba__lakers".
    identifier = "_".join(component for component in components if component)

    return f"team_{sport_norm}_{identifier}"
|
||||
|
||||
|
||||
def generate_team_id_from_abbrev(sport: str, abbreviation: str) -> str:
    """Generate a canonical team ID from abbreviation.

    Format: team_{sport}_{abbreviation}

    Both inputs are lowercased; no other normalization is applied.

    Args:
        sport: Sport code (e.g., 'nba', 'mlb')
        abbreviation: Team abbreviation (e.g., 'LAL', 'NYY')

    Returns:
        Canonical team ID (e.g., 'team_nba_lal')

    Examples:
        >>> generate_team_id_from_abbrev('nba', 'LAL')
        'team_nba_lal'

        >>> generate_team_id_from_abbrev('mlb', 'NYY')
        'team_mlb_nyy'
    """
    return f"team_{sport.lower()}_{abbreviation.lower()}"
|
||||
|
||||
|
||||
def generate_stadium_id(sport: str, name: str) -> str:
    """Generate a canonical stadium ID.

    Format: stadium_{sport}_{normalized_name}

    The sport code is lowercased; the stadium name is passed through
    normalize_string, which drops punctuation (so 'Crypto.com' becomes
    'cryptocom').

    Args:
        sport: Sport code (e.g., 'nba', 'mlb')
        name: Stadium name (e.g., 'Yankee Stadium')

    Returns:
        Canonical stadium ID (e.g., 'stadium_mlb_yankee_stadium')

    Examples:
        >>> generate_stadium_id('nba', 'Crypto.com Arena')
        'stadium_nba_cryptocom_arena'

        >>> generate_stadium_id('mlb', 'Yankee Stadium')
        'stadium_mlb_yankee_stadium'
    """
    return f"stadium_{sport.lower()}_{normalize_string(name)}"
|
||||
|
||||
|
||||
def parse_game_id(game_id: str) -> dict:
    """Parse a canonical game ID into its components.

    Expected format: {sport}_{season}_{away}_{home}_{MMDD}[_{game_number}]

    Args:
        game_id: Canonical game ID (e.g., 'nba_2025_hou_okc_1021')

    Returns:
        Dictionary with keys: sport, season, away_abbrev, home_abbrev,
        month, day, game_number (None when the ID has no game number)

    Raises:
        ValueError: If game_id does not have 5 or 6 underscore-separated
            fields, the season or game number is not an integer, or the
            MMDD field is not four digits forming a real month/day

    Examples:
        >>> parse_game_id('nba_2025_hou_okc_1021')
        {'sport': 'nba', 'season': 2025, 'away_abbrev': 'hou',
         'home_abbrev': 'okc', 'month': 10, 'day': 21, 'game_number': None}

        >>> parse_game_id('mlb_2026_nyy_bos_0401_1')
        {'sport': 'mlb', 'season': 2026, 'away_abbrev': 'nyy',
         'home_abbrev': 'bos', 'month': 4, 'day': 1, 'game_number': 1}
    """
    parts = game_id.split("_")

    if len(parts) not in (5, 6):
        raise ValueError(f"Invalid game ID format: {game_id}")

    sport, season_str, away_abbrev, home_abbrev, date_str = parts[:5]

    try:
        season = int(season_str)
    except ValueError:
        raise ValueError(f"Invalid season in game ID: {game_id}") from None

    # int() alone tolerates signs, padding and non-ASCII digits ('+1', ' 1'),
    # so insist on exactly four ASCII digits before converting.
    if len(date_str) != 4 or not (date_str.isascii() and date_str.isdigit()):
        raise ValueError(f"Invalid date format in game ID: {game_id}")

    month = int(date_str[:2])
    day = int(date_str[2:])

    # The ID carries no calendar year (a season can span two years), so
    # February is allowed 29 days.
    days_in_month = (31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
    if not 1 <= month <= 12 or not 1 <= day <= days_in_month[month - 1]:
        raise ValueError(f"Invalid date format in game ID: {game_id}")

    game_number = None
    if len(parts) == 6:
        try:
            game_number = int(parts[5])
        except ValueError:
            raise ValueError(
                f"Invalid game number in game ID: {game_id}"
            ) from None

    return {
        "sport": sport,
        "season": season,
        "away_abbrev": away_abbrev,
        "home_abbrev": home_abbrev,
        "month": month,
        "day": day,
        "game_number": game_number,
    }
|
||||
|
||||
|
||||
def parse_team_id(team_id: str) -> dict:
    """Parse a canonical team ID into its components.

    Only the first two underscores are treated as separators, so an
    identifier built from city and name keeps its internal underscores.

    Args:
        team_id: Canonical team ID (e.g., 'team_nba_lal')

    Returns:
        Dictionary with keys: sport, identifier (abbreviation or city_name)

    Raises:
        ValueError: If team_id does not start with 'team_' or has no
            identifier segment after the sport

    Examples:
        >>> parse_team_id('team_nba_lal')
        {'sport': 'nba', 'identifier': 'lal'}

        >>> parse_team_id('team_mlb_new_york_yankees')
        {'sport': 'mlb', 'identifier': 'new_york_yankees'}
    """
    if not team_id.startswith("team_"):
        raise ValueError(f"Invalid team ID format: {team_id}")

    # maxsplit=2 yields at most: prefix, sport, everything else.
    parts = team_id.split("_", 2)

    if len(parts) < 3:
        raise ValueError(f"Invalid team ID format: {team_id}")

    _, sport, identifier = parts

    return {
        "sport": sport,
        "identifier": identifier,
    }
|
||||
|
||||
|
||||
def parse_stadium_id(stadium_id: str) -> dict:
    """Parse a canonical stadium ID into its components.

    Only the first two underscores are treated as separators, so a
    multi-word stadium name keeps its internal underscores.

    Args:
        stadium_id: Canonical stadium ID (e.g., 'stadium_nba_paycom_center')

    Returns:
        Dictionary with keys: sport, name

    Raises:
        ValueError: If stadium_id does not start with 'stadium_' or has no
            name segment after the sport

    Examples:
        >>> parse_stadium_id('stadium_nba_paycom_center')
        {'sport': 'nba', 'name': 'paycom_center'}
    """
    if not stadium_id.startswith("stadium_"):
        raise ValueError(f"Invalid stadium ID format: {stadium_id}")

    # maxsplit=2 yields at most: prefix, sport, everything else.
    parts = stadium_id.split("_", 2)

    if len(parts) < 3:
        raise ValueError(f"Invalid stadium ID format: {stadium_id}")

    _, sport, name = parts

    return {
        "sport": sport,
        "name": name,
    }
|
||||
Reference in New Issue
Block a user