feat(scripts): rewrite parser as modular Python CLI
Replace monolithic scraping scripts with sportstime_parser package: - Multi-source scrapers with automatic fallback for 7 sports - Canonical ID generation for games, teams, and stadiums - Fuzzy matching with configurable thresholds for name resolution - CloudKit Web Services uploader with JWT auth, diff-based updates - Resumable uploads with checkpoint state persistence - Validation reports with manual review items and suggested matches - Comprehensive test suite (249 tests) CLI: sportstime-parser scrape|validate|upload|status|retry|clear Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
112
Scripts/sportstime_parser/models/game.py
Normal file
112
Scripts/sportstime_parser/models/game.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""Game data model for sportstime-parser."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
import json
|
||||
|
||||
|
||||
@dataclass
class Game:
    """Represents a game with all CloudKit fields.

    Attributes:
        id: Canonical game ID (e.g., 'nba_2025_hou_okc_1021')
        sport: Sport code (e.g., 'nba', 'mlb')
        season: Season start year (e.g., 2025 for 2025-26)
        home_team_id: Canonical home team ID
        away_team_id: Canonical away team ID
        stadium_id: Canonical stadium ID
        game_date: Game date/time in UTC
        game_number: Game number for doubleheaders (1 or 2), None for single games
        home_score: Final home team score (None if not played)
        away_score: Final away team score (None if not played)
        status: Game status ('scheduled', 'final', 'postponed', 'cancelled')
        source_url: URL of the source page for manual review
        raw_home_team: Original home team name from source (for debugging)
        raw_away_team: Original away team name from source (for debugging)
        raw_stadium: Original stadium name from source (for debugging)
    """

    id: str
    sport: str
    season: int
    home_team_id: str
    away_team_id: str
    stadium_id: str
    game_date: datetime
    game_number: Optional[int] = None
    home_score: Optional[int] = None
    away_score: Optional[int] = None
    status: str = "scheduled"
    source_url: Optional[str] = None
    raw_home_team: Optional[str] = None
    raw_away_team: Optional[str] = None
    raw_stadium: Optional[str] = None

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization.

        Returns:
            A dict with one key per dataclass field. ``game_date`` is
            rendered as an ISO 8601 string via ``datetime.isoformat()``;
            every other value is passed through unchanged.
        """
        return {
            "id": self.id,
            "sport": self.sport,
            "season": self.season,
            "home_team_id": self.home_team_id,
            "away_team_id": self.away_team_id,
            "stadium_id": self.stadium_id,
            "game_date": self.game_date.isoformat(),
            "game_number": self.game_number,
            "home_score": self.home_score,
            "away_score": self.away_score,
            "status": self.status,
            "source_url": self.source_url,
            "raw_home_team": self.raw_home_team,
            "raw_away_team": self.raw_away_team,
            "raw_stadium": self.raw_stadium,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "Game":
        """Create a Game from a dictionary.

        Args:
            data: Mapping with the keys produced by ``to_dict``. The keys
                ``id``, ``sport``, ``season``, ``home_team_id``,
                ``away_team_id``, ``stadium_id`` and ``game_date`` are
                required; all others are optional. ``game_date`` may be a
                ``datetime`` or an ISO 8601 string.

        Returns:
            A new ``Game``. Missing optional keys become ``None``, except
            ``status`` which defaults to ``"scheduled"``.

        Raises:
            KeyError: If a required key is missing.
            ValueError: If ``game_date`` is a string that is not valid
                ISO 8601.
        """
        game_date = data["game_date"]
        if isinstance(game_date, str):
            # datetime.fromisoformat() only understands the "Z" (UTC)
            # designator from Python 3.11 onwards. Rewrite it as an explicit
            # offset so UTC timestamps parse on older interpreters as well.
            if game_date.endswith("Z"):
                game_date = game_date[:-1] + "+00:00"
            game_date = datetime.fromisoformat(game_date)

        return cls(
            id=data["id"],
            sport=data["sport"],
            season=data["season"],
            home_team_id=data["home_team_id"],
            away_team_id=data["away_team_id"],
            stadium_id=data["stadium_id"],
            game_date=game_date,
            game_number=data.get("game_number"),
            home_score=data.get("home_score"),
            away_score=data.get("away_score"),
            status=data.get("status", "scheduled"),
            source_url=data.get("source_url"),
            raw_home_team=data.get("raw_home_team"),
            raw_away_team=data.get("raw_away_team"),
            raw_stadium=data.get("raw_stadium"),
        )

    def to_json(self) -> str:
        """Serialize to a JSON string (2-space indented)."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, json_str: str) -> "Game":
        """Deserialize from a JSON string holding a single game object."""
        return cls.from_dict(json.loads(json_str))
|
||||
|
||||
|
||||
def save_games(games: list[Game], filepath: str) -> None:
    """Write ``games`` to ``filepath`` as a JSON array.

    Each game is converted with its ``to_dict`` method and the resulting
    list is dumped as UTF-8 JSON with 2-space indentation. The target file
    is opened in write mode, so existing content is replaced.
    """
    with open(filepath, "w", encoding="utf-8") as out_file:
        # Serialize after the file is open, one dictionary per game.
        payload = [game.to_dict() for game in games]
        json.dump(payload, out_file, indent=2)
|
||||
|
||||
|
||||
def load_games(filepath: str) -> list[Game]:
    """Read a JSON array from ``filepath`` and build a ``Game`` per entry.

    The file is read as UTF-8; every element of the decoded JSON value is
    passed to ``Game.from_dict`` and the results are returned in file order.
    """
    with open(filepath, "r", encoding="utf-8") as in_file:
        records = json.load(in_file)

    games = []
    for record in records:
        games.append(Game.from_dict(record))
    return games
|
||||
Reference in New Issue
Block a user