Files
Trey t 52d445bca4 feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:56:25 -06:00

184 lines
6.8 KiB
Python

"""Game data model for sportstime-parser."""
import json
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional
from zoneinfo import ZoneInfo
@dataclass
class Game:
    """Represents a game with all CloudKit fields.

    Two serialized forms are supported:

    * the *internal* format (``to_dict`` / ``from_dict`` / ``to_json`` /
      ``from_json``), which keeps every attribute, and
    * the *canonical* format (``to_canonical_dict`` / ``from_canonical_dict``),
      whose keys follow the iOS app schema and which carries no scores,
      status, game number, or raw debugging fields.

    Attributes:
        id: Canonical game ID (e.g., 'nba_2025_hou_okc_1021')
        sport: Sport code, lowercase (e.g., 'nba', 'mlb')
        season: Season start year (e.g., 2025 for 2025-26)
        home_team_id: Canonical home team ID
        away_team_id: Canonical away team ID
        stadium_id: Canonical stadium ID
        game_date: Game date/time. Timezone-aware values are used as-is; a
            naive value is interpreted as stadium-local time by
            ``to_canonical_dict``.
        game_number: Game number for doubleheaders (1 or 2), None for single games
        home_score: Final home team score (None if not played)
        away_score: Final away team score (None if not played)
        status: Game status ('scheduled', 'final', 'postponed', 'cancelled')
        source_url: URL of the source page for manual review
        raw_home_team: Original home team name from source (for debugging)
        raw_away_team: Original away team name from source (for debugging)
        raw_stadium: Original stadium name from source (for debugging)
    """

    id: str
    sport: str
    season: int
    home_team_id: str
    away_team_id: str
    stadium_id: str
    game_date: datetime
    game_number: Optional[int] = None
    home_score: Optional[int] = None
    away_score: Optional[int] = None
    status: str = "scheduled"
    source_url: Optional[str] = None
    raw_home_team: Optional[str] = None
    raw_away_team: Optional[str] = None
    raw_stadium: Optional[str] = None

    def to_dict(self) -> dict:
        """Convert to a dictionary for JSON serialization (internal format).

        Returns:
            A dict with one key per attribute; ``game_date`` is rendered as an
            ISO 8601 string so the result is directly JSON-serializable.
        """
        return {
            "id": self.id,
            "sport": self.sport,
            "season": self.season,
            "home_team_id": self.home_team_id,
            "away_team_id": self.away_team_id,
            "stadium_id": self.stadium_id,
            "game_date": self.game_date.isoformat(),
            "game_number": self.game_number,
            "home_score": self.home_score,
            "away_score": self.away_score,
            "status": self.status,
            "source_url": self.source_url,
            "raw_home_team": self.raw_home_team,
            "raw_away_team": self.raw_away_team,
            "raw_stadium": self.raw_stadium,
        }

    def to_canonical_dict(
        self,
        stadium_timezone: str,
        is_playoff: bool = False,
        broadcast: Optional[str] = None,
    ) -> dict:
        """Convert to canonical dictionary format matching iOS app schema.

        Args:
            stadium_timezone: IANA timezone of the stadium (e.g.,
                'America/Chicago'). Only consulted when ``game_date`` is naive.
            is_playoff: Whether this is a playoff game
            broadcast: Broadcast network info (e.g., 'ESPN')

        Returns:
            Dictionary with field names matching JSONCanonicalGame in
            BootstrapService.swift
        """
        if self.game_date.tzinfo is None:
            # A naive datetime is taken to be wall-clock time at the stadium.
            local_dt = self.game_date.replace(tzinfo=ZoneInfo(stadium_timezone))
        else:
            local_dt = self.game_date
        # timezone.utc is a fixed offset, so aware inputs never need the
        # IANA database just to be normalized to UTC.
        utc_dt = local_dt.astimezone(timezone.utc)

        # Format season as string (e.g., 2025 -> "2025-26" for NBA/NHL,
        # "2025" for every other sport).
        if self.sport.lower() in ("nba", "nhl"):
            season_str = f"{self.season}-{str(self.season + 1)[-2:]}"
        else:
            season_str = str(self.season)

        return {
            "canonical_id": self.id,
            # iOS Sport enum expects uppercase (e.g., "NFL")
            "sport": self.sport.upper(),
            "season": season_str,
            "game_datetime_utc": utc_dt.strftime("%Y-%m-%dT%H:%M:%SZ"),
            "home_team_canonical_id": self.home_team_id,
            "away_team_canonical_id": self.away_team_id,
            "stadium_canonical_id": self.stadium_id,
            "is_playoff": is_playoff,
            "broadcast_info": broadcast,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "Game":
        """Create a Game from a dictionary (internal format).

        Args:
            data: Mapping as produced by ``to_dict``. ``game_date`` may be an
                ISO 8601 string or an existing ``datetime``; the optional
                fields fall back to the dataclass defaults when absent.

        Raises:
            KeyError: If one of the required keys is missing.
        """
        game_date = data["game_date"]
        if isinstance(game_date, str):
            game_date = datetime.fromisoformat(game_date)
        return cls(
            id=data["id"],
            sport=data["sport"],
            season=data["season"],
            home_team_id=data["home_team_id"],
            away_team_id=data["away_team_id"],
            stadium_id=data["stadium_id"],
            game_date=game_date,
            game_number=data.get("game_number"),
            home_score=data.get("home_score"),
            away_score=data.get("away_score"),
            status=data.get("status", "scheduled"),
            source_url=data.get("source_url"),
            raw_home_team=data.get("raw_home_team"),
            raw_away_team=data.get("raw_away_team"),
            raw_stadium=data.get("raw_stadium"),
        )

    @classmethod
    def from_canonical_dict(cls, data: dict) -> "Game":
        """Create a Game from a canonical dictionary (iOS app format).

        The canonical format has no score/status fields, so the result always
        has ``status='scheduled'`` and default values for the optional fields.

        Args:
            data: Mapping as produced by ``to_canonical_dict``.

        Raises:
            KeyError: If one of the required canonical keys is missing.
        """
        # Handle 'Z' suffix (fromisoformat doesn't support it before Python 3.11)
        date_str = data["game_datetime_utc"].replace("Z", "+00:00")
        game_date = datetime.fromisoformat(date_str)

        # Season is normally a string ("2025-26" or "2025"); str() also lets a
        # bare integer year through instead of failing on .split().
        season = int(str(data["season"]).split("-")[0])

        return cls(
            id=data["canonical_id"],
            # Canonical files carry the upper-case iOS enum value; the
            # internal model uses lowercase sport codes.
            sport=data["sport"].lower(),
            season=season,
            home_team_id=data["home_team_canonical_id"],
            away_team_id=data["away_team_canonical_id"],
            stadium_id=data["stadium_canonical_id"],
            game_date=game_date,
            status="scheduled",
        )

    def to_json(self) -> str:
        """Serialize to a JSON string (internal format, 2-space indent)."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, json_str: str) -> "Game":
        """Deserialize from a JSON string produced by ``to_json``."""
        return cls.from_dict(json.loads(json_str))
def save_games(games: list[Game], filepath: str) -> None:
    """Write games to a JSON file in the internal format.

    Args:
        games: Games to serialize; each is converted with ``Game.to_dict``.
        filepath: Destination path. The file is opened in write mode as
            UTF-8, so any existing content is replaced.
    """
    with open(filepath, "w", encoding="utf-8") as out_file:
        records = [game.to_dict() for game in games]
        json.dump(records, out_file, indent=2)
def load_games(filepath: str) -> list[Game]:
    """Read a list of games from a JSON file, auto-detecting its format.

    The first record decides the format for the whole file: if it has both
    "canonical_id" and "game_datetime_utc" the file is parsed as canonical
    (iOS app) records, otherwise as internal records.

    Args:
        filepath: Path of a UTF-8 JSON file holding an array of game records.

    Returns:
        One ``Game`` per record, in file order.
    """
    with open(filepath, "r", encoding="utf-8") as in_file:
        records = json.load(in_file)

    # Only the first record is inspected; an empty file falls through to the
    # internal parser and yields an empty list.
    looks_canonical = bool(records) and all(
        key in records[0] for key in ("canonical_id", "game_datetime_utc")
    )
    build = Game.from_canonical_dict if looks_canonical else Game.from_dict
    return [build(record) for record in records]