feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan are completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
177
sportstime_parser/models/team.py
Normal file
177
sportstime_parser/models/team.py
Normal file
@@ -0,0 +1,177 @@
|
||||
"""Team data model for sportstime-parser."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
import json
|
||||
|
||||
|
||||
@dataclass
class Team:
    """Represents a team with all CloudKit fields.

    Two serialized shapes are supported:

    * the *internal* format (``to_dict`` / ``from_dict``), which mirrors the
      dataclass fields one-to-one, and
    * the *canonical* format (``to_canonical_dict`` / ``from_canonical_dict``),
      which uses the iOS app's field names, an uppercase sport code, and
      sport-qualified conference/division IDs.

    Attributes:
        id: Canonical team ID (e.g., 'team_nba_okc')
        sport: Sport code (e.g., 'nba', 'mlb')
        city: Team city (e.g., 'Oklahoma City')
        name: Team name (e.g., 'Thunder')
        full_name: Full team name (e.g., 'Oklahoma City Thunder')
        abbreviation: Official abbreviation (e.g., 'OKC')
        conference: Conference name (e.g., 'Western', 'American')
        division: Division name (e.g., 'Northwest', 'AL West')
        primary_color: Primary team color as hex (e.g., '#007AC1')
        secondary_color: Secondary team color as hex (e.g., '#EF3B24')
        logo_url: URL to team logo image
        stadium_id: Canonical ID of home stadium
    """

    id: str
    sport: str
    city: str
    name: str
    full_name: str
    abbreviation: str
    conference: Optional[str] = None
    division: Optional[str] = None
    primary_color: Optional[str] = None
    secondary_color: Optional[str] = None
    logo_url: Optional[str] = None
    stadium_id: Optional[str] = None

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization (internal format)."""
        return {
            "id": self.id,
            "sport": self.sport,
            "city": self.city,
            "name": self.name,
            "full_name": self.full_name,
            "abbreviation": self.abbreviation,
            "conference": self.conference,
            "division": self.division,
            "primary_color": self.primary_color,
            "secondary_color": self.secondary_color,
            "logo_url": self.logo_url,
            "stadium_id": self.stadium_id,
        }

    def _make_qualified_id(self, name: Optional[str]) -> Optional[str]:
        """Convert a conference/division name to a qualified ID.

        The name is lowercased, spaces become underscores, and the lowercase
        sport code is prepended. ``None`` or an empty name yields ``None``.

        Examples:
            "Eastern" → "nba_eastern"
            "AL West" → "mlb_al_west"
            "Southeast" → "nba_southeast"
        """
        if not name:
            return None
        normalized = name.lower().replace(" ", "_")
        return f"{self.sport.lower()}_{normalized}"

    def to_canonical_dict(self) -> dict:
        """Convert to canonical dictionary format matching iOS app schema.

        ``full_name`` and ``logo_url`` are not part of this format.

        Returns:
            Dictionary with field names matching JSONCanonicalTeam in BootstrapService.swift
        """
        return {
            "canonical_id": self.id,
            "name": self.name,
            "abbreviation": self.abbreviation,
            "sport": self.sport.upper(),  # iOS Sport enum expects uppercase (e.g., "NFL")
            "city": self.city,
            # A missing stadium is written as an empty string, not null.
            "stadium_canonical_id": self.stadium_id or "",
            "conference_id": self._make_qualified_id(self.conference),
            "division_id": self._make_qualified_id(self.division),
            "primary_color": self.primary_color,
            "secondary_color": self.secondary_color,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "Team":
        """Create a Team from a dictionary (internal format).

        Raises:
            KeyError: If any of the required keys ('id', 'sport', 'city',
                'name', 'full_name', 'abbreviation') is missing.
        """
        return cls(
            id=data["id"],
            sport=data["sport"],
            city=data["city"],
            name=data["name"],
            full_name=data["full_name"],
            abbreviation=data["abbreviation"],
            conference=data.get("conference"),
            division=data.get("division"),
            primary_color=data.get("primary_color"),
            secondary_color=data.get("secondary_color"),
            logo_url=data.get("logo_url"),
            stadium_id=data.get("stadium_id"),
        )

    @staticmethod
    def _extract_name_from_qualified_id(qualified_id: Optional[str], sport: str) -> Optional[str]:
        """Extract the name portion from a qualified ID.

        The sport prefix is stripped when present, underscores become spaces,
        and each word is capitalized; the league tokens AL, NL, AFC and NFC
        are fully uppercased instead. ``None`` or an empty ID yields ``None``.

        Examples:
            "nba_eastern" → "Eastern"
            "mlb_al_west" → "AL West"
            "nba_southeast" → "Southeast"
        """
        if not qualified_id:
            return None
        # Remove sport prefix (e.g., "nba_" or "mlb_") when it is there.
        prefix = f"{sport.lower()}_"
        if qualified_id.startswith(prefix):
            name = qualified_id[len(prefix):]
        else:
            name = qualified_id
        # League abbreviations stay all-caps; every other word is capitalized.
        league_tokens = ("AL", "NL", "AFC", "NFC")
        words = [
            part.upper() if part.upper() in league_tokens else part.capitalize()
            for part in name.split("_")
        ]
        return " ".join(words)

    @classmethod
    def from_canonical_dict(cls, data: dict) -> "Team":
        """Create a Team from a canonical dictionary (iOS app format).

        This inverts ``to_canonical_dict``: the sport code is lowercased back
        to the internal form, an empty ``stadium_canonical_id`` becomes
        ``None``, and ``full_name`` is rebuilt as "<city> <name>" because the
        canonical format does not carry it. ``logo_url`` is left unset.

        Raises:
            KeyError: If 'canonical_id', 'sport', 'city', 'name' or
                'abbreviation' is missing.
        """
        # Canonical data stores the sport uppercased; normalize once and use
        # the same value for the field and for stripping the ID prefixes.
        sport = data["sport"].lower()
        return cls(
            id=data["canonical_id"],
            sport=sport,
            city=data["city"],
            name=data["name"],
            full_name=f"{data['city']} {data['name']}",  # Reconstruct full_name
            abbreviation=data["abbreviation"],
            conference=cls._extract_name_from_qualified_id(data.get("conference_id"), sport),
            division=cls._extract_name_from_qualified_id(data.get("division_id"), sport),
            primary_color=data.get("primary_color"),
            secondary_color=data.get("secondary_color"),
            # "" is how the canonical format spells "no stadium".
            stadium_id=data.get("stadium_canonical_id") or None,
        )

    def to_json(self) -> str:
        """Serialize to JSON string (internal format, 2-space indent)."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, json_str: str) -> "Team":
        """Deserialize from JSON string (internal format)."""
        return cls.from_dict(json.loads(json_str))
|
||||
|
||||
|
||||
def save_teams(teams: list[Team], filepath: str) -> None:
    """Write ``teams`` to ``filepath`` as a JSON array in the internal format.

    Each team is serialized with ``Team.to_dict``; the file is written as
    UTF-8 with a 2-space indent and replaces any existing content.
    """
    with open(filepath, "w", encoding="utf-8") as out_file:
        records = []
        for team in teams:
            records.append(team.to_dict())
        json.dump(records, out_file, indent=2)
|
||||
|
||||
|
||||
def load_teams(filepath: str) -> list[Team]:
    """Read a JSON array of teams from ``filepath``, auto-detecting the format.

    Only the first record is inspected: if it has a "canonical_id" key, every
    record is parsed with ``Team.from_canonical_dict``; otherwise (including
    an empty array) every record is parsed with ``Team.from_dict``.
    """
    with open(filepath, "r", encoding="utf-8") as in_file:
        records = json.load(in_file)
        # Canonical records carry "canonical_id"; internal ones carry "id".
        is_canonical = bool(records) and "canonical_id" in records[0]
        if not is_canonical:
            return [Team.from_dict(record) for record in records]
        return [Team.from_canonical_dict(record) for record in records]
|
||||
Reference in New Issue
Block a user