# Change log (from commit message):
#   Scripts:
#     - Add WNBA abbreviation aliases to team_resolver.py
#     - Fix NHL stadium coordinates in stadium_resolver.py
#     - Add validate_aliases.py script for orphan detection
#     - Update scrapers with improved error handling
#     - Add DATA_AUDIT.md and REMEDIATION_PLAN.md documentation
#     - Update alias JSON files with new mappings
#   iOS bundle updates:
#     - Update games_canonical.json with latest scraped data
#     - Update teams_canonical.json and stadiums_canonical.json
#     - Sync alias files with Scripts versions
#   All 5 remediation phases complete.
#   Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
"""Base scraper class for all sport scrapers."""

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import date, datetime
from typing import Optional

from ..config import EXPECTED_GAME_COUNTS
from ..models.aliases import ManualReviewItem
from ..models.game import Game
from ..models.stadium import Stadium
from ..models.team import Team
from ..utils.http import RateLimitedSession, get_session
from ..utils.logging import get_logger, log_error, log_warning
from ..utils.progress import ScrapeProgress

@dataclass
class RawGameData:
    """Raw game data before normalization.

    This intermediate format holds data as scraped from sources,
    before team/stadium resolution and canonical ID generation.
    The ``*_raw`` fields keep names exactly as the source spelled
    them; resolution happens later in ``_normalize_games``.
    """

    # Date/time of the game as parsed from the source.
    game_date: datetime
    # Team names exactly as printed by the source (unresolved).
    home_team_raw: str
    away_team_raw: str
    # Venue name as printed by the source; None if not provided.
    stadium_raw: Optional[str] = None
    # Final scores; None for games not yet played (or not reported).
    home_score: Optional[int] = None
    away_score: Optional[int] = None
    # Source-reported game status; defaults to "scheduled".
    status: str = "scheduled"
    # URL the row was scraped from, kept for manual review/tracing.
    source_url: Optional[str] = None
    game_number: Optional[int] = None  # For doubleheaders
@dataclass
class ScrapeResult:
    """Result of a scraping operation.

    Attributes:
        games: List of normalized Game objects
        teams: List of Team objects
        stadiums: List of Stadium objects
        review_items: Items requiring manual review
        source: Name of the source used
        success: Whether scraping succeeded
        error_message: Error message if failed
    """

    games: list[Game] = field(default_factory=list)
    teams: list[Team] = field(default_factory=list)
    stadiums: list[Stadium] = field(default_factory=list)
    review_items: list[ManualReviewItem] = field(default_factory=list)
    source: str = ""
    success: bool = True
    error_message: Optional[str] = None

    @property
    def game_count(self) -> int:
        """Number of normalized games scraped."""
        return len(self.games)

    @property
    def team_count(self) -> int:
        """Number of teams scraped."""
        return len(self.teams)

    @property
    def stadium_count(self) -> int:
        """Number of stadiums scraped."""
        return len(self.stadiums)

    @property
    def review_count(self) -> int:
        """Number of items flagged for manual review."""
        return len(self.review_items)
class BaseScraper(ABC):
    """Abstract base class for sport scrapers.

    Subclasses must implement:
    - _get_sources(): Return list of source names in priority order
    - _scrape_games_from_source(): Fetch raw games from one source
    - _normalize_games(): Resolve raw games into Game objects
    - scrape_teams(): Fetch team information
    - scrape_stadiums(): Fetch stadium information

    Features:
    - Multi-source fallback (try sources in order)
    - Built-in rate limiting
    - Error handling with partial data discard
    - Progress tracking
    - Source URL tracking for manual review
    """

    # Maximum number of sources to attempt before giving up.  Three
    # attempts let NHL fall back to the NHL API for venue data, since
    # Hockey Reference doesn't provide it.  Subclasses may override.
    MAX_SOURCES_TO_TRY: int = 3

    def __init__(
        self,
        sport: str,
        season: int,
        session: Optional[RateLimitedSession] = None,
    ):
        """Initialize the scraper.

        Args:
            sport: Sport code (e.g., 'nba', 'mlb'); stored lowercased.
            season: Season start year (e.g., 2025 for 2025-26)
            session: Optional HTTP session (default: global session)
        """
        self.sport = sport.lower()
        self.season = season
        self.session = session or get_session()
        self._logger = get_logger()
        # Created by scrape_all(); None until then.
        self._progress: Optional[ScrapeProgress] = None

    @property
    def expected_game_count(self) -> int:
        """Expected number of games for this sport (0 if unknown)."""
        return EXPECTED_GAME_COUNTS.get(self.sport, 0)

    @abstractmethod
    def _get_sources(self) -> list[str]:
        """Return list of source names in priority order.

        Returns:
            List of source identifiers (e.g., ['basketball_reference', 'espn', 'cbs'])
        """

    @abstractmethod
    def _scrape_games_from_source(
        self,
        source: str,
    ) -> list[RawGameData]:
        """Scrape games from a specific source.

        Args:
            source: Source identifier

        Returns:
            List of raw game data

        Raises:
            Exception: If scraping fails
        """

    @abstractmethod
    def _normalize_games(
        self,
        raw_games: list[RawGameData],
    ) -> tuple[list[Game], list[ManualReviewItem]]:
        """Normalize raw game data to Game objects.

        Args:
            raw_games: Raw scraped data

        Returns:
            Tuple of (normalized games, review items)
        """

    @abstractmethod
    def scrape_teams(self) -> list[Team]:
        """Fetch team information.

        Returns:
            List of Team objects
        """

    @abstractmethod
    def scrape_stadiums(self) -> list[Stadium]:
        """Fetch stadium information.

        Returns:
            List of Stadium objects
        """

    def scrape_games(self) -> ScrapeResult:
        """Scrape games with multi-source fallback.

        Tries each source in priority order, up to MAX_SOURCES_TO_TRY
        attempts.  On failure, partial data from that source is
        discarded and the next source is tried.

        Returns:
            ScrapeResult with games, review items, and status
        """
        sources = self._get_sources()
        last_error: Optional[str] = None
        sources_tried = 0

        for source in sources:
            self._logger.info(f"Trying source: {source}")
            sources_tried += 1

            try:
                # Scrape raw data
                raw_games = self._scrape_games_from_source(source)

                if not raw_games:
                    log_warning(f"No games found from {source}")
                    # If multiple sources return nothing, the schedule
                    # likely doesn't exist yet.
                    if sources_tried >= self.MAX_SOURCES_TO_TRY:
                        return ScrapeResult(
                            success=False,
                            error_message=f"No schedule data available (tried {sources_tried} sources)",
                        )
                    continue

                self._logger.info(f"Found {len(raw_games)} raw games from {source}")

                # Normalize data
                games, review_items = self._normalize_games(raw_games)

                self._logger.info(
                    f"Normalized {len(games)} games, {len(review_items)} need review"
                )

                return ScrapeResult(
                    games=games,
                    review_items=review_items,
                    source=source,
                    success=True,
                )

            except Exception as e:
                last_error = str(e)
                log_error(f"Failed to scrape from {source}: {e}", exc_info=True)
                # Partial data is discarded; stop once the attempt
                # budget is spent.
                if sources_tried >= self.MAX_SOURCES_TO_TRY:
                    break
                continue

        # Every source either raised or returned no data.  Report a
        # meaningful message instead of "Last error: None" when no
        # source actually raised an exception.
        if last_error is None:
            message = f"No schedule data available (tried {sources_tried} sources)"
        else:
            message = f"All sources failed. Last error: {last_error}"
        return ScrapeResult(success=False, error_message=message)

    def scrape_all(self) -> ScrapeResult:
        """Scrape games, teams, and stadiums.

        Progress tracking is started before scraping and always
        finished, on every exit path.

        Returns:
            Complete ScrapeResult with all data
        """
        self._progress = ScrapeProgress(self.sport, self.season)
        self._progress.start()

        try:
            # Scrape games first; teams/stadiums are skipped on failure.
            result = self.scrape_games()

            if not result.success:
                self._progress.log_error(result.error_message or "Unknown error")
                return result

            result.teams = self.scrape_teams()
            result.stadiums = self.scrape_stadiums()

            # Mirror counts onto the progress tracker for reporting.
            self._progress.games_count = result.game_count
            self._progress.teams_count = result.team_count
            self._progress.stadiums_count = result.stadium_count
            self._progress.errors_count = result.review_count

            return result

        except Exception as e:
            log_error(f"Scraping failed: {e}", exc_info=True)
            return ScrapeResult(
                success=False,
                error_message=str(e),
            )
        finally:
            # Guarantee the progress tracker is closed on every path.
            self._progress.finish()

    def _get_season_months(self) -> list[tuple[int, int]]:
        """Get the months to scrape for this sport's season.

        Default implementation covers fall-spring seasons (NBA, NHL,
        etc.): Oct-Dec of the season start year, then Jan-Jun of the
        following year.  Subclasses with other calendars override this.

        Returns:
            List of (year, month) tuples
        """
        months = [(self.season, month) for month in range(10, 13)]  # Oct-Dec
        months += [(self.season + 1, month) for month in range(1, 7)]  # Jan-Jun
        return months

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build a source URL with parameters.

        Subclasses should override this to build URLs for their sources.

        Args:
            source: Source identifier
            **kwargs: URL parameters

        Returns:
            Complete URL string

        Raises:
            NotImplementedError: If no URL builder exists for *source*.
        """
        raise NotImplementedError(f"URL builder not implemented for {source}")
class ScraperError(Exception):
    """Raised when scraping from a source fails.

    The source identifier and bare message are kept as attributes so
    callers can inspect them; ``str(error)`` yields ``"[source] message"``.
    """

    def __init__(self, source: str, message: str):
        formatted = f"[{source}] {message}"
        super().__init__(formatted)
        self.source = source
        self.message = message
class PartialDataError(ScraperError):
    """Raised when only partial data was retrieved from a source.

    Attributes:
        partial_count: Number of items successfully retrieved before
            the failure.
    """

    def __init__(self, source: str, message: str, partial_count: int):
        self.partial_count = partial_count
        detail = f"{message} (got {partial_count} items)"
        super().__init__(source, detail)