feat(scripts): add sportstime-parser data pipeline

Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:56:25 -06:00
parent ac78042a7e
commit 52d445bca4
76 changed files with 25065 additions and 0 deletions
--- a/sportstime_parser/scrapers/base.py
+++ b/sportstime_parser/scrapers/base.py
@@ -0,0 +1,335 @@
+"""Base scraper class for all sport scrapers."""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datetime import date, datetime
+from typing import Optional
+
+from ..config import EXPECTED_GAME_COUNTS
+from ..models.game import Game
+from ..models.team import Team
+from ..models.stadium import Stadium
+from ..models.aliases import ManualReviewItem
+from ..utils.http import RateLimitedSession, get_session
+from ..utils.logging import get_logger, log_error, log_warning
+from ..utils.progress import ScrapeProgress
+
+
+@dataclass
+class RawGameData:
+    """A single game exactly as a source reported it.
+
+    This is the intermediate record produced by scraping; team/stadium
+    resolution and canonical ID generation have not been applied yet.
+
+    Attributes:
+        game_date: Date and time of the game.
+        home_team_raw: Home team text as scraped.
+        away_team_raw: Away team text as scraped.
+        stadium_raw: Stadium text as scraped, when present.
+        home_score: Home team score, when present.
+        away_score: Away team score, when present.
+        status: Game status string; defaults to "scheduled".
+        source_url: Source URL for the record, when present.
+        game_number: Game number, used for doubleheaders.
+    """
+
+    # Required fields: these have no default and must always be supplied.
+    game_date: datetime
+    home_team_raw: str
+    away_team_raw: str
+
+    # Optional fields: left at their defaults when the source omits them.
+    stadium_raw: Optional[str] = field(default=None)
+    home_score: Optional[int] = field(default=None)
+    away_score: Optional[int] = field(default=None)
+    status: str = field(default="scheduled")
+    source_url: Optional[str] = field(default=None)
+    game_number: Optional[int] = field(default=None)  # For doubleheaders
+
+
+@dataclass
+class ScrapeResult:
+    """Outcome of one scraping run.
+
+    Every field has a default, so a failed run can be described with
+    only ``success`` and ``error_message``.
+
+    Attributes:
+        games: Normalized Game objects.
+        teams: Team objects.
+        stadiums: Stadium objects.
+        review_items: Items that require manual review.
+        source: Name of the source that was used.
+        success: Whether the scrape succeeded.
+        error_message: Error message when the scrape failed.
+    """
+
+    # default_factory gives each instance its own list.
+    games: list[Game] = field(default_factory=list)
+    teams: list[Team] = field(default_factory=list)
+    stadiums: list[Stadium] = field(default_factory=list)
+    review_items: list[ManualReviewItem] = field(default_factory=list)
+
+    # Status of the run.
+    source: str = field(default="")
+    success: bool = field(default=True)
+    error_message: Optional[str] = field(default=None)
+
+    @property
+    def game_count(self) -> int:
+        """Number of entries in ``games``."""
+        return len(self.games)
+
+    @property
+    def team_count(self) -> int:
+        """Number of entries in ``teams``."""
+        return len(self.teams)
+
+    @property
+    def stadium_count(self) -> int:
+        """Number of entries in ``stadiums``."""
+        return len(self.stadiums)
+
+    @property
+    def review_count(self) -> int:
+        """Number of entries in ``review_items``."""
+        return len(self.review_items)
+
+
+class BaseScraper(ABC):
+    """Abstract base class for sport scrapers.
+
+    Subclasses must implement:
+    - _get_sources(): Return list of source names in priority order
+    - _scrape_games_from_source(): Fetch raw games from one source
+    - _normalize_games(): Convert raw games into Game objects
+    - scrape_teams(): Fetch team information
+    - scrape_stadiums(): Fetch stadium information
+
+    Provided by this class:
+    - scrape_games(): Multi-source fallback (try sources in order)
+    - scrape_all(): Games, teams, and stadiums with progress tracking
+
+    Features:
+    - Multi-source fallback (try sources in order)
+    - Built-in rate limiting
+    - Error handling with partial data discard
+    - Progress tracking
+    - Source URL tracking for manual review
+
+    Attributes:
+        MAX_SOURCES_TO_TRY: Maximum number of sources scrape_games() will
+            attempt before giving up. Subclasses may override it.
+    """
+
+    # Allow 3 sources to be tried. This enables NHL to fall back to NHL API
+    # for venue data since Hockey Reference doesn't provide it.
+    MAX_SOURCES_TO_TRY: int = 3
+
+    def __init__(
+        self,
+        sport: str,
+        season: int,
+        session: Optional[RateLimitedSession] = None,
+    ):
+        """Initialize the scraper.
+
+        Args:
+            sport: Sport code (e.g., 'nba', 'mlb'); stored lowercased
+            season: Season start year (e.g., 2025 for 2025-26)
+            session: Optional HTTP session (default: global session)
+        """
+        self.sport = sport.lower()
+        self.season = season
+        # Compare against None so a caller-supplied session is never
+        # replaced just because it happens to evaluate as falsy.
+        self.session = session if session is not None else get_session()
+        self._logger = get_logger()
+        # Created by scrape_all(); stays None until then.
+        self._progress: Optional[ScrapeProgress] = None
+
+    @property
+    def expected_game_count(self) -> int:
+        """Get expected number of games for this sport.
+
+        Returns:
+            The EXPECTED_GAME_COUNTS entry for this sport, or 0 when the
+            sport has no entry.
+        """
+        return EXPECTED_GAME_COUNTS.get(self.sport, 0)
+
+    @abstractmethod
+    def _get_sources(self) -> list[str]:
+        """Return list of source names in priority order.
+
+        Returns:
+            List of source identifiers (e.g., ['basketball_reference', 'espn', 'cbs'])
+        """
+
+    @abstractmethod
+    def _scrape_games_from_source(
+        self,
+        source: str,
+    ) -> list[RawGameData]:
+        """Scrape games from a specific source.
+
+        Args:
+            source: Source identifier
+
+        Returns:
+            List of raw game data
+
+        Raises:
+            Exception: If scraping fails
+        """
+
+    @abstractmethod
+    def _normalize_games(
+        self,
+        raw_games: list[RawGameData],
+    ) -> tuple[list[Game], list[ManualReviewItem]]:
+        """Normalize raw game data to Game objects.
+
+        Args:
+            raw_games: Raw scraped data
+
+        Returns:
+            Tuple of (normalized games, review items)
+        """
+
+    @abstractmethod
+    def scrape_teams(self) -> list[Team]:
+        """Fetch team information.
+
+        Returns:
+            List of Team objects
+        """
+
+    @abstractmethod
+    def scrape_stadiums(self) -> list[Stadium]:
+        """Fetch stadium information.
+
+        Returns:
+            List of Stadium objects
+        """
+
+    def scrape_games(self) -> ScrapeResult:
+        """Scrape games with multi-source fallback.
+
+        Tries each source in priority order, stopping once
+        MAX_SOURCES_TO_TRY sources have been attempted. A source that
+        raises (while scraping or normalizing) or returns no games is
+        abandoned, its partial data is discarded, and the next source is
+        tried. The first source that yields games is normalized and
+        returned.
+
+        Returns:
+            ScrapeResult with games, review items, and status. On failure
+            ``success`` is False and ``error_message`` is either
+            "No schedule data available (tried N sources)" (no source
+            raised, or the attempt limit was reached on an empty source)
+            or "All sources failed. Last error: ..." otherwise.
+        """
+        max_attempts = self.MAX_SOURCES_TO_TRY
+        attempts = 0
+        last_error: Optional[str] = None
+        last_attempt_empty = False
+
+        for source in self._get_sources():
+            self._logger.info(f"Trying source: {source}")
+            attempts += 1
+
+            try:
+                # Scrape raw data
+                raw_games = self._scrape_games_from_source(source)
+
+                if raw_games:
+                    self._logger.info(
+                        f"Found {len(raw_games)} raw games from {source}"
+                    )
+
+                    # Normalize data
+                    games, review_items = self._normalize_games(raw_games)
+
+                    self._logger.info(
+                        f"Normalized {len(games)} games, {len(review_items)} need review"
+                    )
+
+                    return ScrapeResult(
+                        games=games,
+                        review_items=review_items,
+                        source=source,
+                        success=True,
+                    )
+
+                log_warning(f"No games found from {source}")
+                last_attempt_empty = True
+
+            except Exception as e:
+                last_error = str(e)
+                last_attempt_empty = False
+                log_error(f"Failed to scrape from {source}: {e}", exc_info=True)
+
+            # If we've tried enough sources, bail out
+            if attempts >= max_attempts:
+                break
+
+        # If sources return nothing, the schedule likely doesn't exist.
+        # Report that instead of "Last error: None" whenever no source
+        # actually raised, and when the attempt limit was hit on an
+        # empty source.
+        hit_limit_on_empty = last_attempt_empty and attempts >= max_attempts
+        if last_error is None or hit_limit_on_empty:
+            return ScrapeResult(
+                success=False,
+                error_message=f"No schedule data available (tried {attempts} sources)",
+            )
+
+        # All sources failed
+        return ScrapeResult(
+            success=False,
+            error_message=f"All sources failed. Last error: {last_error}",
+        )
+
+    def scrape_all(self) -> ScrapeResult:
+        """Scrape games, teams, and stadiums.
+
+        Starts a new ScrapeProgress for this sport and season, scrapes
+        games first, and only continues to teams and stadiums when the
+        game scrape succeeded. Any exception raised along the way is
+        logged and converted into a failed ScrapeResult.
+
+        Returns:
+            Complete ScrapeResult with all data, the failed game
+            ScrapeResult unchanged, or a failed ScrapeResult carrying the
+            exception text.
+        """
+        self._progress = ScrapeProgress(self.sport, self.season)
+        self._progress.start()
+
+        try:
+            # Scrape games
+            result = self.scrape_games()
+
+            if not result.success:
+                self._progress.log_error(result.error_message or "Unknown error")
+                return result
+
+            # Scrape teams and stadiums
+            result.teams = self.scrape_teams()
+            result.stadiums = self.scrape_stadiums()
+
+            # Update progress; review items are reported as the error count
+            self._progress.games_count = result.game_count
+            self._progress.teams_count = result.team_count
+            self._progress.stadiums_count = result.stadium_count
+            self._progress.errors_count = result.review_count
+
+            return result
+
+        except Exception as e:
+            log_error(f"Scraping failed: {e}", exc_info=True)
+
+            return ScrapeResult(
+                success=False,
+                error_message=str(e),
+            )
+
+        finally:
+            # Single exit hook: runs exactly once on every path above.
+            self._progress.finish()
+
+    def _get_season_months(self) -> list[tuple[int, int]]:
+        """Get the months to scrape for this sport's season.
+
+        Default implementation for sports with fall-spring seasons
+        (NBA, NHL, etc.): October-December of the season start year
+        followed by January-June of the next year.
+
+        Returns:
+            List of (year, month) tuples
+        """
+        # Fall months of season start year
+        fall = [(self.season, month) for month in range(10, 13)]  # Oct-Dec
+
+        # Winter-spring months of following year
+        spring = [(self.season + 1, month) for month in range(1, 7)]  # Jan-Jun
+
+        return fall + spring
+
+    def _get_source_url(self, source: str, **kwargs) -> str:
+        """Build a source URL with parameters.
+
+        Subclasses should override this to build URLs for their sources.
+
+        Args:
+            source: Source identifier
+            **kwargs: URL parameters
+
+        Returns:
+            Complete URL string
+
+        Raises:
+            NotImplementedError: Always, in this base implementation
+        """
+        raise NotImplementedError(f"URL builder not implemented for {source}")
+
+
+class ScraperError(Exception):
+    """Raised when scraping fails.
+
+    The exception text is the message prefixed with the source name in
+    square brackets.
+
+    Attributes:
+        source: Source identifier passed to the constructor.
+        message: Message passed to the constructor, without the prefix.
+    """
+
+    def __init__(self, source: str, message: str):
+        formatted = f"[{source}] {message}"
+        super().__init__(formatted)
+        self.source, self.message = source, message
+
+
+class PartialDataError(ScraperError):
+    """Raised when only partial data was retrieved.
+
+    The item count is appended to the message before it is handed to
+    ScraperError, so ``message`` and the exception text both include it.
+
+    Attributes:
+        partial_count: Number of items that were retrieved.
+    """
+
+    def __init__(self, source: str, message: str, partial_count: int):
+        self.partial_count = partial_count
+        detail = f"{message} (got {partial_count} items)"
+        super().__init__(source, detail)