feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
154
sportstime_parser/models/stadium.py
Normal file
154
sportstime_parser/models/stadium.py
Normal file
@@ -0,0 +1,154 @@
|
||||
"""Stadium data model for sportstime-parser."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
import json
|
||||
|
||||
|
||||
@dataclass
|
||||
class Stadium:
|
||||
"""Represents a stadium with all CloudKit fields.
|
||||
|
||||
Attributes:
|
||||
id: Canonical stadium ID (e.g., 'stadium_nba_paycom_center')
|
||||
sport: Primary sport code (e.g., 'nba', 'mlb')
|
||||
name: Current stadium name (e.g., 'Paycom Center')
|
||||
city: City name (e.g., 'Oklahoma City')
|
||||
state: State/province code (e.g., 'OK', 'ON')
|
||||
country: Country code (e.g., 'USA', 'Canada')
|
||||
latitude: Latitude coordinate
|
||||
longitude: Longitude coordinate
|
||||
capacity: Seating capacity
|
||||
surface: Playing surface (e.g., 'grass', 'turf', 'hardwood')
|
||||
roof_type: Roof type (e.g., 'dome', 'retractable', 'open')
|
||||
opened_year: Year stadium opened
|
||||
image_url: URL to stadium image
|
||||
timezone: IANA timezone (e.g., 'America/Chicago')
|
||||
"""
|
||||
|
||||
id: str
|
||||
sport: str
|
||||
name: str
|
||||
city: str
|
||||
state: str
|
||||
country: str
|
||||
latitude: float
|
||||
longitude: float
|
||||
capacity: Optional[int] = None
|
||||
surface: Optional[str] = None
|
||||
roof_type: Optional[str] = None
|
||||
opened_year: Optional[int] = None
|
||||
image_url: Optional[str] = None
|
||||
timezone: Optional[str] = None
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert to dictionary for JSON serialization."""
|
||||
return {
|
||||
"id": self.id,
|
||||
"sport": self.sport,
|
||||
"name": self.name,
|
||||
"city": self.city,
|
||||
"state": self.state,
|
||||
"country": self.country,
|
||||
"latitude": self.latitude,
|
||||
"longitude": self.longitude,
|
||||
"capacity": self.capacity,
|
||||
"surface": self.surface,
|
||||
"roof_type": self.roof_type,
|
||||
"opened_year": self.opened_year,
|
||||
"image_url": self.image_url,
|
||||
"timezone": self.timezone,
|
||||
}
|
||||
|
||||
def to_canonical_dict(self, primary_team_abbrevs: list[str] | None = None) -> dict:
|
||||
"""Convert to canonical dictionary format matching iOS app schema.
|
||||
|
||||
Args:
|
||||
primary_team_abbrevs: List of team abbreviations that play at this stadium.
|
||||
If None, defaults to empty list.
|
||||
|
||||
Returns:
|
||||
Dictionary with field names matching JSONCanonicalStadium in BootstrapService.swift
|
||||
"""
|
||||
return {
|
||||
"canonical_id": self.id,
|
||||
"name": self.name,
|
||||
"city": self.city,
|
||||
"state": self.state,
|
||||
"latitude": self.latitude,
|
||||
"longitude": self.longitude,
|
||||
"capacity": self.capacity if self.capacity is not None else 0,
|
||||
"sport": self.sport.upper(), # iOS Sport enum expects uppercase (e.g., "NFL")
|
||||
"primary_team_abbrevs": primary_team_abbrevs or [],
|
||||
"year_opened": self.opened_year,
|
||||
"timezone_identifier": self.timezone,
|
||||
"image_url": self.image_url,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict) -> "Stadium":
|
||||
"""Create a Stadium from a dictionary (internal format)."""
|
||||
return cls(
|
||||
id=data["id"],
|
||||
sport=data["sport"],
|
||||
name=data["name"],
|
||||
city=data["city"],
|
||||
state=data["state"],
|
||||
country=data["country"],
|
||||
latitude=data["latitude"],
|
||||
longitude=data["longitude"],
|
||||
capacity=data.get("capacity"),
|
||||
surface=data.get("surface"),
|
||||
roof_type=data.get("roof_type"),
|
||||
opened_year=data.get("opened_year"),
|
||||
image_url=data.get("image_url"),
|
||||
timezone=data.get("timezone"),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_canonical_dict(cls, data: dict) -> "Stadium":
|
||||
"""Create a Stadium from a canonical dictionary (iOS app format)."""
|
||||
return cls(
|
||||
id=data["canonical_id"],
|
||||
sport=data["sport"],
|
||||
name=data["name"],
|
||||
city=data["city"],
|
||||
state=data["state"],
|
||||
country="USA", # Canonical format doesn't include country
|
||||
latitude=data["latitude"],
|
||||
longitude=data["longitude"],
|
||||
capacity=data.get("capacity"),
|
||||
opened_year=data.get("year_opened"),
|
||||
image_url=data.get("image_url"),
|
||||
timezone=data.get("timezone_identifier"),
|
||||
)
|
||||
|
||||
def to_json(self) -> str:
|
||||
"""Serialize to JSON string."""
|
||||
return json.dumps(self.to_dict(), indent=2)
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, json_str: str) -> "Stadium":
|
||||
"""Deserialize from JSON string."""
|
||||
return cls.from_dict(json.loads(json_str))
|
||||
|
||||
def is_in_allowed_region(self) -> bool:
|
||||
"""Check if stadium is in USA, Canada, or Mexico."""
|
||||
allowed = {"USA", "US", "United States", "Canada", "CA", "Mexico", "MX"}
|
||||
return self.country in allowed
|
||||
|
||||
|
||||
def save_stadiums(stadiums: list[Stadium], filepath: str) -> None:
    """Write stadiums to ``filepath`` as a JSON array.

    Each stadium is serialized through its ``to_dict()`` method and the
    resulting list is dumped as UTF-8 JSON with a 2-space indent. An
    existing file at ``filepath`` is overwritten.
    """
    with open(filepath, "w", encoding="utf-8") as handle:
        records = []
        for stadium in stadiums:
            records.append(stadium.to_dict())
        json.dump(records, handle, indent=2)
|
||||
|
||||
|
||||
def load_stadiums(filepath: str) -> list[Stadium]:
    """Read a JSON array of stadium records from ``filepath``.

    The layout is auto-detected from the first record only: if it has a
    "canonical_id" key, every record is parsed with
    ``Stadium.from_canonical_dict``; otherwise every record is parsed with
    ``Stadium.from_dict``. An empty array yields an empty list.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    # Canonical records carry "canonical_id"; internal records carry "id".
    looks_canonical = bool(records) and "canonical_id" in records[0]
    if looks_canonical:
        build = Stadium.from_canonical_dict
    else:
        build = Stadium.from_dict
    return [build(record) for record in records]
|
||||
Reference in New Issue
Block a user