feat(scripts): rewrite parser as modular Python CLI

Replace monolithic scraping scripts with sportstime_parser package:

- Multi-source scrapers with automatic fallback for 7 sports
- Canonical ID generation for games, teams, and stadiums
- Fuzzy matching with configurable thresholds for name resolution
- CloudKit Web Services uploader with JWT auth, diff-based updates
- Resumable uploads with checkpoint state persistence
- Validation reports with manual review items and suggested matches
- Comprehensive test suite (249 tests)

CLI: sportstime-parser scrape|validate|upload|status|retry|clear

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-10 21:06:12 -06:00
parent 284a10d9e1
commit eeaf900e5a
109 changed files with 18415 additions and 266211 deletions

View File

@@ -0,0 +1,35 @@
"""Data models for sportstime-parser."""
from .game import Game, save_games, load_games
from .team import Team, save_teams, load_teams
from .stadium import Stadium, save_stadiums, load_stadiums
from .aliases import (
AliasType,
ReviewReason,
TeamAlias,
StadiumAlias,
FuzzyMatch,
ManualReviewItem,
)
# Names exported by a star-import of this package; each entry is one of the
# names imported from the submodules above, grouped by originating module.
__all__ = [
# Game
"Game",
"save_games",
"load_games",
# Team
"Team",
"save_teams",
"load_teams",
# Stadium
"Stadium",
"save_stadiums",
"load_stadiums",
# Aliases
"AliasType",
"ReviewReason",
"TeamAlias",
"StadiumAlias",
"FuzzyMatch",
"ManualReviewItem",
]

View File

@@ -0,0 +1,262 @@
"""Alias and manual review data models for sportstime-parser."""
from dataclasses import dataclass, field
from datetime import date, datetime
from enum import Enum
from typing import Optional
import json
class AliasType(Enum):
    """Kind of alternate identifier a team alias represents."""

    # Each member's value is the string stored in serialized alias records.
    NAME = "name"
    ABBREVIATION = "abbreviation"
    CITY = "city"
class ReviewReason(Enum):
    """Why an item was flagged for manual review."""

    # Name resolution problems
    UNRESOLVED_TEAM = "unresolved_team"
    UNRESOLVED_STADIUM = "unresolved_stadium"
    LOW_CONFIDENCE_MATCH = "low_confidence_match"

    # Data quality problems
    MISSING_DATA = "missing_data"
    DUPLICATE_GAME = "duplicate_game"
    TIMEZONE_UNKNOWN = "timezone_unknown"
    GEOGRAPHIC_FILTER = "geographic_filter"
@dataclass
class TeamAlias:
    """Alternate identifier for a team, optionally limited to a date window.

    Attributes:
        id: Unique alias ID
        team_canonical_id: Canonical team ID the alias resolves to
        alias_type: Kind of alias (name, abbreviation, city)
        alias_value: Text to match against
        valid_from: First date the alias applies (None = no lower bound)
        valid_until: Last date the alias applies (None = no upper bound)
    """

    id: str
    team_canonical_id: str
    alias_type: AliasType
    alias_value: str
    valid_from: Optional[date] = None
    valid_until: Optional[date] = None

    def is_valid_on(self, check_date: date) -> bool:
        """Return True when ``check_date`` falls inside the validity window.

        Both bounds are inclusive; a missing bound leaves that side open.
        """
        lower, upper = self.valid_from, self.valid_until
        if lower and check_date < lower:
            return False
        return not (upper and check_date > upper)

    def to_dict(self) -> dict:
        """Build a JSON-serializable dict; dates become ISO strings or None."""
        payload = dict(
            id=self.id,
            team_canonical_id=self.team_canonical_id,
            alias_type=self.alias_type.value,
            alias_value=self.alias_value,
        )
        for key in ("valid_from", "valid_until"):
            bound = getattr(self, key)
            payload[key] = bound.isoformat() if bound else None
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> "TeamAlias":
        """Build a TeamAlias from a dict shaped like the output of ``to_dict``.

        Missing or empty ``valid_from`` / ``valid_until`` entries become None.
        """
        bounds = {}
        for key in ("valid_from", "valid_until"):
            raw = data.get(key)
            bounds[key] = date.fromisoformat(raw) if raw else None
        return cls(
            id=data["id"],
            team_canonical_id=data["team_canonical_id"],
            alias_type=AliasType(data["alias_type"]),
            alias_value=data["alias_value"],
            **bounds,
        )
@dataclass
class StadiumAlias:
    """Alternate name for a stadium, optionally limited to a date window.

    Attributes:
        alias_name: Alias text to match against (lowercase)
        stadium_canonical_id: Canonical stadium ID the alias resolves to
        valid_from: First date the alias applies (None = no lower bound)
        valid_until: Last date the alias applies (None = no upper bound)
    """

    alias_name: str
    stadium_canonical_id: str
    valid_from: Optional[date] = None
    valid_until: Optional[date] = None

    def is_valid_on(self, check_date: date) -> bool:
        """Return True when ``check_date`` falls inside the validity window.

        Both bounds are inclusive; a missing bound leaves that side open.
        """
        lower, upper = self.valid_from, self.valid_until
        if lower and check_date < lower:
            return False
        return not (upper and check_date > upper)

    def to_dict(self) -> dict:
        """Build a JSON-serializable dict; dates become ISO strings or None."""
        payload = dict(
            alias_name=self.alias_name,
            stadium_canonical_id=self.stadium_canonical_id,
        )
        for key in ("valid_from", "valid_until"):
            bound = getattr(self, key)
            payload[key] = bound.isoformat() if bound else None
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> "StadiumAlias":
        """Build a StadiumAlias from a dict shaped like the output of ``to_dict``.

        Missing or empty ``valid_from`` / ``valid_until`` entries become None.
        """
        bounds = {}
        for key in ("valid_from", "valid_until"):
            raw = data.get(key)
            bounds[key] = date.fromisoformat(raw) if raw else None
        return cls(
            alias_name=data["alias_name"],
            stadium_canonical_id=data["stadium_canonical_id"],
            **bounds,
        )
@dataclass
class FuzzyMatch:
    """Represents a fuzzy match suggestion with confidence score.

    Attributes:
        canonical_id: Canonical ID of the suggested match
        canonical_name: Canonical name of the suggested match
        confidence: Match confidence, 0-100
    """

    canonical_id: str
    canonical_name: str
    confidence: int  # 0-100

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return {
            "canonical_id": self.canonical_id,
            "canonical_name": self.canonical_name,
            "confidence": self.confidence,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "FuzzyMatch":
        """Create a FuzzyMatch from a dictionary.

        Mirrors ``to_dict`` so the model round-trips like the other models in
        this module. Keys other than the three fields are ignored.

        Raises:
            KeyError: If a required key is missing from ``data``.
        """
        return cls(
            canonical_id=data["canonical_id"],
            canonical_name=data["canonical_name"],
            confidence=data["confidence"],
        )
@dataclass
class ManualReviewItem:
    """Record describing something that needs manual review.

    Attributes:
        id: Unique review item ID
        reason: Why the item needs review
        sport: Sport code
        raw_value: Original value that could not be resolved
        context: Extra key/value details about the issue
        source_url: URL of the page the value came from
        suggested_matches: Candidate matches with confidence scores
        game_date: Date of the related game, when there is one
        created_at: When the review item was created
    """

    id: str
    reason: ReviewReason
    sport: str
    raw_value: str
    context: dict = field(default_factory=dict)
    source_url: Optional[str] = None
    suggested_matches: list[FuzzyMatch] = field(default_factory=list)
    game_date: Optional[date] = None
    created_at: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> dict:
        """Build a JSON-serializable dict of this review item."""
        payload = {
            "id": self.id,
            "reason": self.reason.value,
            "sport": self.sport,
            "raw_value": self.raw_value,
            "context": self.context,
            "source_url": self.source_url,
        }
        payload["suggested_matches"] = [
            candidate.to_dict() for candidate in self.suggested_matches
        ]
        payload["game_date"] = self.game_date.isoformat() if self.game_date else None
        payload["created_at"] = self.created_at.isoformat()
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> "ManualReviewItem":
        """Build a ManualReviewItem from a dict shaped like ``to_dict`` output.

        A missing or empty ``created_at`` falls back to the current time, and a
        missing or empty ``game_date`` becomes None.
        """
        raw_game_date = data.get("game_date")
        game_date = date.fromisoformat(raw_game_date) if raw_game_date else None

        raw_created = data.get("created_at")
        created_at = (
            datetime.fromisoformat(raw_created) if raw_created else datetime.now()
        )

        suggested_matches = [
            FuzzyMatch(
                canonical_id=entry["canonical_id"],
                canonical_name=entry["canonical_name"],
                confidence=entry["confidence"],
            )
            for entry in data.get("suggested_matches", [])
        ]

        return cls(
            id=data["id"],
            reason=ReviewReason(data["reason"]),
            sport=data["sport"],
            raw_value=data["raw_value"],
            context=data.get("context", {}),
            source_url=data.get("source_url"),
            suggested_matches=suggested_matches,
            game_date=game_date,
            created_at=created_at,
        )

    def to_markdown(self) -> str:
        """Render this item as a markdown section for the validation report.

        Optional sections (game date, context, suggested matches, source) are
        emitted only when the corresponding attribute is non-empty. The section
        always ends with a horizontal rule.
        """
        heading = self.reason.value.replace("_", " ").title()
        out = [
            f"### {heading}: {self.raw_value}",
            "",
            f"**Sport**: {self.sport.upper()}",
        ]

        if self.game_date:
            out.append(f"**Game Date**: {self.game_date.isoformat()}")

        if self.context:
            out += ["", "**Context**:"]
            out += [f"- {key}: {value}" for key, value in self.context.items()]

        if self.suggested_matches:
            out += ["", "**Suggested Matches**:"]
            for rank, candidate in enumerate(self.suggested_matches, start=1):
                # Candidates at 90% or above are flagged for the reviewer.
                suffix = " <- likely correct" if candidate.confidence >= 90 else ""
                out.append(
                    f"{rank}. `{candidate.canonical_id}` ({candidate.confidence}%){suffix}"
                )

        if self.source_url:
            out += ["", f"**Source**: [{self.source_url}]({self.source_url})"]

        out += ["", "---", ""]
        return "\n".join(out)

View File

@@ -0,0 +1,112 @@
"""Game data model for sportstime-parser."""
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
import json
@dataclass
class Game:
    """Represents a game with all CloudKit fields.

    Attributes:
        id: Canonical game ID (e.g., 'nba_2025_hou_okc_1021')
        sport: Sport code (e.g., 'nba', 'mlb')
        season: Season start year (e.g., 2025 for 2025-26)
        home_team_id: Canonical home team ID
        away_team_id: Canonical away team ID
        stadium_id: Canonical stadium ID
        game_date: Game date/time in UTC
        game_number: Game number for doubleheaders (1 or 2), None for single games
        home_score: Final home team score (None if not played)
        away_score: Final away team score (None if not played)
        status: Game status ('scheduled', 'final', 'postponed', 'cancelled')
        source_url: URL of the source page for manual review
        raw_home_team: Original home team name from source (for debugging)
        raw_away_team: Original away team name from source (for debugging)
        raw_stadium: Original stadium name from source (for debugging)
    """

    id: str
    sport: str
    season: int
    home_team_id: str
    away_team_id: str
    stadium_id: str
    game_date: datetime
    game_number: Optional[int] = None
    home_score: Optional[int] = None
    away_score: Optional[int] = None
    status: str = "scheduled"
    source_url: Optional[str] = None
    raw_home_team: Optional[str] = None
    raw_away_team: Optional[str] = None
    raw_stadium: Optional[str] = None

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization.

        ``game_date`` is emitted as an ISO 8601 string; every other field is
        passed through unchanged.
        """
        return {
            "id": self.id,
            "sport": self.sport,
            "season": self.season,
            "home_team_id": self.home_team_id,
            "away_team_id": self.away_team_id,
            "stadium_id": self.stadium_id,
            "game_date": self.game_date.isoformat(),
            "game_number": self.game_number,
            "home_score": self.home_score,
            "away_score": self.away_score,
            "status": self.status,
            "source_url": self.source_url,
            "raw_home_team": self.raw_home_team,
            "raw_away_team": self.raw_away_team,
            "raw_stadium": self.raw_stadium,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "Game":
        """Create a Game from a dictionary.

        ``game_date`` may be a ``datetime`` or an ISO 8601 string. A trailing
        'Z' (UTC designator) is accepted on every supported Python version.

        Raises:
            KeyError: If a required key is missing from ``data``.
            ValueError: If ``game_date`` is a string that is not valid ISO 8601.
        """
        game_date = data["game_date"]
        if isinstance(game_date, str):
            # datetime.fromisoformat() rejects a trailing "Z" before Python
            # 3.11, so rewrite it as the equivalent explicit UTC offset.
            if game_date.endswith("Z"):
                game_date = game_date[:-1] + "+00:00"
            game_date = datetime.fromisoformat(game_date)

        return cls(
            id=data["id"],
            sport=data["sport"],
            season=data["season"],
            home_team_id=data["home_team_id"],
            away_team_id=data["away_team_id"],
            stadium_id=data["stadium_id"],
            game_date=game_date,
            game_number=data.get("game_number"),
            home_score=data.get("home_score"),
            away_score=data.get("away_score"),
            status=data.get("status", "scheduled"),
            source_url=data.get("source_url"),
            raw_home_team=data.get("raw_home_team"),
            raw_away_team=data.get("raw_away_team"),
            raw_stadium=data.get("raw_stadium"),
        )

    def to_json(self) -> str:
        """Serialize to JSON string."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, json_str: str) -> "Game":
        """Deserialize from JSON string."""
        return cls.from_dict(json.loads(json_str))
def save_games(games: list[Game], filepath: str) -> None:
    """Write ``games`` to ``filepath`` as a JSON array (UTF-8, 2-space indent).

    Any existing file at ``filepath`` is overwritten.
    """
    with open(filepath, mode="w", encoding="utf-8") as out:
        serialized = [entry.to_dict() for entry in games]
        json.dump(serialized, out, indent=2)
def load_games(filepath: str) -> list[Game]:
    """Read a JSON file at ``filepath`` and return one Game per entry."""
    with open(filepath, mode="r", encoding="utf-8") as src:
        records = json.load(src)
    return list(map(Game.from_dict, records))

View File

@@ -0,0 +1,108 @@
"""Stadium data model for sportstime-parser."""
from dataclasses import dataclass
from typing import Optional
import json
@dataclass
class Stadium:
    """Represents a stadium with all CloudKit fields.

    Attributes:
        id: Canonical stadium ID (e.g., 'stadium_nba_paycom_center')
        sport: Primary sport code (e.g., 'nba', 'mlb')
        name: Current stadium name (e.g., 'Paycom Center')
        city: City name (e.g., 'Oklahoma City')
        state: State/province code (e.g., 'OK', 'ON')
        country: Country code (e.g., 'USA', 'Canada')
        latitude: Latitude coordinate
        longitude: Longitude coordinate
        capacity: Seating capacity
        surface: Playing surface (e.g., 'grass', 'turf', 'hardwood')
        roof_type: Roof type (e.g., 'dome', 'retractable', 'open')
        opened_year: Year stadium opened
        image_url: URL to stadium image
        timezone: IANA timezone (e.g., 'America/Chicago')
    """

    id: str
    sport: str
    name: str
    city: str
    state: str
    country: str
    latitude: float
    longitude: float
    capacity: Optional[int] = None
    surface: Optional[str] = None
    roof_type: Optional[str] = None
    opened_year: Optional[int] = None
    image_url: Optional[str] = None
    timezone: Optional[str] = None

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return {
            "id": self.id,
            "sport": self.sport,
            "name": self.name,
            "city": self.city,
            "state": self.state,
            "country": self.country,
            "latitude": self.latitude,
            "longitude": self.longitude,
            "capacity": self.capacity,
            "surface": self.surface,
            "roof_type": self.roof_type,
            "opened_year": self.opened_year,
            "image_url": self.image_url,
            "timezone": self.timezone,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "Stadium":
        """Create a Stadium from a dictionary.

        Raises:
            KeyError: If a required key (id through longitude) is missing.
        """
        return cls(
            id=data["id"],
            sport=data["sport"],
            name=data["name"],
            city=data["city"],
            state=data["state"],
            country=data["country"],
            latitude=data["latitude"],
            longitude=data["longitude"],
            capacity=data.get("capacity"),
            surface=data.get("surface"),
            roof_type=data.get("roof_type"),
            opened_year=data.get("opened_year"),
            image_url=data.get("image_url"),
            timezone=data.get("timezone"),
        )

    def to_json(self) -> str:
        """Serialize to JSON string."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, json_str: str) -> "Stadium":
        """Deserialize from JSON string."""
        return cls.from_dict(json.loads(json_str))

    def is_in_allowed_region(self) -> bool:
        """Check if stadium is in USA, Canada, or Mexico.

        Accepted spellings are 'USA', 'US', 'United States', 'Canada', 'CA',
        'Mexico' and 'MX'. The comparison ignores letter case and surrounding
        whitespace, so values such as 'usa' or ' Canada ' are accepted too.

        Returns:
            True if ``country`` is one of the accepted spellings; False
            otherwise, including when ``country`` is not a string.
        """
        if not isinstance(self.country, str):
            return False
        allowed = {"usa", "us", "united states", "canada", "ca", "mexico", "mx"}
        return self.country.strip().casefold() in allowed
def save_stadiums(stadiums: list[Stadium], filepath: str) -> None:
    """Write ``stadiums`` to ``filepath`` as a JSON array (UTF-8, 2-space indent).

    Any existing file at ``filepath`` is overwritten.
    """
    with open(filepath, mode="w", encoding="utf-8") as out:
        serialized = [entry.to_dict() for entry in stadiums]
        json.dump(serialized, out, indent=2)
def load_stadiums(filepath: str) -> list[Stadium]:
    """Read a JSON file at ``filepath`` and return one Stadium per entry."""
    with open(filepath, mode="r", encoding="utf-8") as src:
        records = json.load(src)
    return list(map(Stadium.from_dict, records))

View File

@@ -0,0 +1,95 @@
"""Team data model for sportstime-parser."""
from dataclasses import dataclass
from typing import Optional
import json
@dataclass
class Team:
    """A team record carrying every CloudKit field.

    Attributes:
        id: Canonical team ID (e.g., 'team_nba_okc')
        sport: Sport code (e.g., 'nba', 'mlb')
        city: Team city (e.g., 'Oklahoma City')
        name: Team name (e.g., 'Thunder')
        full_name: Full team name (e.g., 'Oklahoma City Thunder')
        abbreviation: Official abbreviation (e.g., 'OKC')
        conference: Conference name (e.g., 'Western', 'American')
        division: Division name (e.g., 'Northwest', 'AL West')
        primary_color: Primary team color as hex (e.g., '#007AC1')
        secondary_color: Secondary team color as hex (e.g., '#EF3B24')
        logo_url: URL to team logo image
        stadium_id: Canonical ID of home stadium
    """

    id: str
    sport: str
    city: str
    name: str
    full_name: str
    abbreviation: str
    conference: Optional[str] = None
    division: Optional[str] = None
    primary_color: Optional[str] = None
    secondary_color: Optional[str] = None
    logo_url: Optional[str] = None
    stadium_id: Optional[str] = None

    def to_dict(self) -> dict:
        """Build a JSON-serializable dict holding every field, in declaration order."""
        names = (
            "id",
            "sport",
            "city",
            "name",
            "full_name",
            "abbreviation",
            "conference",
            "division",
            "primary_color",
            "secondary_color",
            "logo_url",
            "stadium_id",
        )
        return {attr: getattr(self, attr) for attr in names}

    @classmethod
    def from_dict(cls, data: dict) -> "Team":
        """Build a Team from a dict shaped like the output of ``to_dict``.

        The first six fields are required (KeyError when absent); the rest
        default to None when missing.
        """
        required = {
            key: data[key]
            for key in ("id", "sport", "city", "name", "full_name", "abbreviation")
        }
        optional = {
            key: data.get(key)
            for key in (
                "conference",
                "division",
                "primary_color",
                "secondary_color",
                "logo_url",
                "stadium_id",
            )
        }
        return cls(**required, **optional)

    def to_json(self) -> str:
        """Return this team as a JSON string with 2-space indentation."""
        as_dict = self.to_dict()
        return json.dumps(as_dict, indent=2)

    @classmethod
    def from_json(cls, json_str: str) -> "Team":
        """Parse ``json_str`` and build a Team from the decoded object."""
        decoded = json.loads(json_str)
        return cls.from_dict(decoded)
def save_teams(teams: list[Team], filepath: str) -> None:
    """Write ``teams`` to ``filepath`` as a JSON array (UTF-8, 2-space indent).

    Any existing file at ``filepath`` is overwritten.
    """
    with open(filepath, mode="w", encoding="utf-8") as out:
        serialized = [entry.to_dict() for entry in teams]
        json.dump(serialized, out, indent=2)
def load_teams(filepath: str) -> list[Team]:
    """Read a JSON file at ``filepath`` and return one Team per entry."""
    with open(filepath, mode="r", encoding="utf-8") as src:
        records = json.load(src)
    return list(map(Team.from_dict, records))