feat(scripts): add sportstime-parser data pipeline

Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-20 18:56:25 -06:00
parent ac78042a7e
commit 52d445bca4
76 changed files with 25065 additions and 0 deletions

View File

@@ -0,0 +1,335 @@
"""Base scraper class for all sport scrapers."""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import date, datetime
from typing import Optional
from ..config import EXPECTED_GAME_COUNTS
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..utils.http import RateLimitedSession, get_session
from ..utils.logging import get_logger, log_error, log_warning
from ..utils.progress import ScrapeProgress
@dataclass
class RawGameData:
"""Raw game data before normalization.
This intermediate format holds data as scraped from sources,
before team/stadium resolution and canonical ID generation.
"""
game_date: datetime
home_team_raw: str
away_team_raw: str
stadium_raw: Optional[str] = None
home_score: Optional[int] = None
away_score: Optional[int] = None
status: str = "scheduled"
source_url: Optional[str] = None
game_number: Optional[int] = None # For doubleheaders
@dataclass
class ScrapeResult:
"""Result of a scraping operation.
Attributes:
games: List of normalized Game objects
teams: List of Team objects
stadiums: List of Stadium objects
review_items: Items requiring manual review
source: Name of the source used
success: Whether scraping succeeded
error_message: Error message if failed
"""
games: list[Game] = field(default_factory=list)
teams: list[Team] = field(default_factory=list)
stadiums: list[Stadium] = field(default_factory=list)
review_items: list[ManualReviewItem] = field(default_factory=list)
source: str = ""
success: bool = True
error_message: Optional[str] = None
@property
def game_count(self) -> int:
return len(self.games)
@property
def team_count(self) -> int:
return len(self.teams)
@property
def stadium_count(self) -> int:
return len(self.stadiums)
@property
def review_count(self) -> int:
return len(self.review_items)
class BaseScraper(ABC):
"""Abstract base class for sport scrapers.
Subclasses must implement:
- scrape_games(): Fetch and normalize game schedule
- scrape_teams(): Fetch team information
- scrape_stadiums(): Fetch stadium information
- _get_sources(): Return list of source names in priority order
Features:
- Multi-source fallback (try sources in order)
- Built-in rate limiting
- Error handling with partial data discard
- Progress tracking
- Source URL tracking for manual review
"""
def __init__(
self,
sport: str,
season: int,
session: Optional[RateLimitedSession] = None,
):
"""Initialize the scraper.
Args:
sport: Sport code (e.g., 'nba', 'mlb')
season: Season start year (e.g., 2025 for 2025-26)
session: Optional HTTP session (default: global session)
"""
self.sport = sport.lower()
self.season = season
self.session = session or get_session()
self._logger = get_logger()
self._progress: Optional[ScrapeProgress] = None
@property
def expected_game_count(self) -> int:
"""Get expected number of games for this sport."""
return EXPECTED_GAME_COUNTS.get(self.sport, 0)
@abstractmethod
def _get_sources(self) -> list[str]:
"""Return list of source names in priority order.
Returns:
List of source identifiers (e.g., ['basketball_reference', 'espn', 'cbs'])
"""
pass
@abstractmethod
def _scrape_games_from_source(
self,
source: str,
) -> list[RawGameData]:
"""Scrape games from a specific source.
Args:
source: Source identifier
Returns:
List of raw game data
Raises:
Exception: If scraping fails
"""
pass
@abstractmethod
def _normalize_games(
self,
raw_games: list[RawGameData],
) -> tuple[list[Game], list[ManualReviewItem]]:
"""Normalize raw game data to Game objects.
Args:
raw_games: Raw scraped data
Returns:
Tuple of (normalized games, review items)
"""
pass
@abstractmethod
def scrape_teams(self) -> list[Team]:
"""Fetch team information.
Returns:
List of Team objects
"""
pass
@abstractmethod
def scrape_stadiums(self) -> list[Stadium]:
"""Fetch stadium information.
Returns:
List of Stadium objects
"""
pass
def scrape_games(self) -> ScrapeResult:
"""Scrape games with multi-source fallback.
Tries each source in priority order. On failure, discards
partial data and tries the next source.
Returns:
ScrapeResult with games, review items, and status
"""
sources = self._get_sources()
last_error: Optional[str] = None
sources_tried = 0
# Allow 3 sources to be tried. This enables NHL to fall back to NHL API
# for venue data since Hockey Reference doesn't provide it.
max_sources_to_try = 3
for source in sources:
self._logger.info(f"Trying source: {source}")
sources_tried += 1
try:
# Scrape raw data
raw_games = self._scrape_games_from_source(source)
if not raw_games:
log_warning(f"No games found from {source}")
# If multiple sources return nothing, the schedule likely doesn't exist
if sources_tried >= max_sources_to_try:
return ScrapeResult(
success=False,
error_message=f"No schedule data available (tried {sources_tried} sources)",
)
continue
self._logger.info(f"Found {len(raw_games)} raw games from {source}")
# Normalize data
games, review_items = self._normalize_games(raw_games)
self._logger.info(
f"Normalized {len(games)} games, {len(review_items)} need review"
)
return ScrapeResult(
games=games,
review_items=review_items,
source=source,
success=True,
)
except Exception as e:
last_error = str(e)
log_error(f"Failed to scrape from {source}: {e}", exc_info=True)
# If we've tried enough sources, bail out
if sources_tried >= max_sources_to_try:
break
continue
# All sources failed
return ScrapeResult(
success=False,
error_message=f"All sources failed. Last error: {last_error}",
)
def scrape_all(self) -> ScrapeResult:
"""Scrape games, teams, and stadiums.
Returns:
Complete ScrapeResult with all data
"""
self._progress = ScrapeProgress(self.sport, self.season)
self._progress.start()
try:
# Scrape games
result = self.scrape_games()
if not result.success:
self._progress.log_error(result.error_message or "Unknown error")
self._progress.finish()
return result
# Scrape teams
teams = self.scrape_teams()
result.teams = teams
# Scrape stadiums
stadiums = self.scrape_stadiums()
result.stadiums = stadiums
# Update progress
self._progress.games_count = result.game_count
self._progress.teams_count = result.team_count
self._progress.stadiums_count = result.stadium_count
self._progress.errors_count = result.review_count
self._progress.finish()
return result
except Exception as e:
log_error(f"Scraping failed: {e}", exc_info=True)
self._progress.finish()
return ScrapeResult(
success=False,
error_message=str(e),
)
def _get_season_months(self) -> list[tuple[int, int]]:
"""Get the months to scrape for this sport's season.
Returns:
List of (year, month) tuples
"""
# Default implementation for sports with fall-spring seasons
# (NBA, NHL, etc.)
months = []
# Fall months of season start year
for month in range(10, 13): # Oct-Dec
months.append((self.season, month))
# Winter-spring months of following year
for month in range(1, 7): # Jan-Jun
months.append((self.season + 1, month))
return months
def _get_source_url(self, source: str, **kwargs) -> str:
"""Build a source URL with parameters.
Subclasses should override this to build URLs for their sources.
Args:
source: Source identifier
**kwargs: URL parameters
Returns:
Complete URL string
"""
raise NotImplementedError(f"URL builder not implemented for {source}")
class ScraperError(Exception):
"""Exception raised when scraping fails."""
def __init__(self, source: str, message: str):
self.source = source
self.message = message
super().__init__(f"[{source}] {message}")
class PartialDataError(ScraperError):
"""Exception raised when only partial data was retrieved."""
def __init__(self, source: str, message: str, partial_count: int):
self.partial_count = partial_count
super().__init__(source, f"{message} (got {partial_count} items)")