"""Base scraper class for all sport scrapers.""" from abc import ABC, abstractmethod from dataclasses import dataclass, field from datetime import date, datetime from typing import Optional from ..config import EXPECTED_GAME_COUNTS from ..models.game import Game from ..models.team import Team from ..models.stadium import Stadium from ..models.aliases import ManualReviewItem from ..utils.http import RateLimitedSession, get_session from ..utils.logging import get_logger, log_error, log_warning from ..utils.progress import ScrapeProgress @dataclass class RawGameData: """Raw game data before normalization. This intermediate format holds data as scraped from sources, before team/stadium resolution and canonical ID generation. """ game_date: datetime home_team_raw: str away_team_raw: str stadium_raw: Optional[str] = None home_score: Optional[int] = None away_score: Optional[int] = None status: str = "scheduled" source_url: Optional[str] = None game_number: Optional[int] = None # For doubleheaders @dataclass class ScrapeResult: """Result of a scraping operation. Attributes: games: List of normalized Game objects teams: List of Team objects stadiums: List of Stadium objects review_items: Items requiring manual review source: Name of the source used success: Whether scraping succeeded error_message: Error message if failed """ games: list[Game] = field(default_factory=list) teams: list[Team] = field(default_factory=list) stadiums: list[Stadium] = field(default_factory=list) review_items: list[ManualReviewItem] = field(default_factory=list) source: str = "" success: bool = True error_message: Optional[str] = None @property def game_count(self) -> int: return len(self.games) @property def team_count(self) -> int: return len(self.teams) @property def stadium_count(self) -> int: return len(self.stadiums) @property def review_count(self) -> int: return len(self.review_items) class BaseScraper(ABC): """Abstract base class for sport scrapers. Subclasses must implement: - scrape_games(): Fetch and normalize game schedule - scrape_teams(): Fetch team information - scrape_stadiums(): Fetch stadium information - _get_sources(): Return list of source names in priority order Features: - Multi-source fallback (try sources in order) - Built-in rate limiting - Error handling with partial data discard - Progress tracking - Source URL tracking for manual review """ def __init__( self, sport: str, season: int, session: Optional[RateLimitedSession] = None, ): """Initialize the scraper. Args: sport: Sport code (e.g., 'nba', 'mlb') season: Season start year (e.g., 2025 for 2025-26) session: Optional HTTP session (default: global session) """ self.sport = sport.lower() self.season = season self.session = session or get_session() self._logger = get_logger() self._progress: Optional[ScrapeProgress] = None @property def expected_game_count(self) -> int: """Get expected number of games for this sport.""" return EXPECTED_GAME_COUNTS.get(self.sport, 0) @abstractmethod def _get_sources(self) -> list[str]: """Return list of source names in priority order. Returns: List of source identifiers (e.g., ['basketball_reference', 'espn', 'cbs']) """ pass @abstractmethod def _scrape_games_from_source( self, source: str, ) -> list[RawGameData]: """Scrape games from a specific source. Args: source: Source identifier Returns: List of raw game data Raises: Exception: If scraping fails """ pass @abstractmethod def _normalize_games( self, raw_games: list[RawGameData], ) -> tuple[list[Game], list[ManualReviewItem]]: """Normalize raw game data to Game objects. Args: raw_games: Raw scraped data Returns: Tuple of (normalized games, review items) """ pass @abstractmethod def scrape_teams(self) -> list[Team]: """Fetch team information. Returns: List of Team objects """ pass @abstractmethod def scrape_stadiums(self) -> list[Stadium]: """Fetch stadium information. Returns: List of Stadium objects """ pass def scrape_games(self) -> ScrapeResult: """Scrape games with multi-source fallback. Tries each source in priority order. On failure, discards partial data and tries the next source. Returns: ScrapeResult with games, review items, and status """ sources = self._get_sources() last_error: Optional[str] = None for source in sources: self._logger.info(f"Trying source: {source}") try: # Scrape raw data raw_games = self._scrape_games_from_source(source) if not raw_games: log_warning(f"No games found from {source}") continue self._logger.info(f"Found {len(raw_games)} raw games from {source}") # Normalize data games, review_items = self._normalize_games(raw_games) self._logger.info( f"Normalized {len(games)} games, {len(review_items)} need review" ) return ScrapeResult( games=games, review_items=review_items, source=source, success=True, ) except Exception as e: last_error = str(e) log_error(f"Failed to scrape from {source}: {e}", exc_info=True) # Discard partial data and try next source continue # All sources failed return ScrapeResult( success=False, error_message=f"All sources failed. Last error: {last_error}", ) def scrape_all(self) -> ScrapeResult: """Scrape games, teams, and stadiums. Returns: Complete ScrapeResult with all data """ self._progress = ScrapeProgress(self.sport, self.season) self._progress.start() try: # Scrape games result = self.scrape_games() if not result.success: self._progress.log_error(result.error_message or "Unknown error") self._progress.finish() return result # Scrape teams teams = self.scrape_teams() result.teams = teams # Scrape stadiums stadiums = self.scrape_stadiums() result.stadiums = stadiums # Update progress self._progress.games_count = result.game_count self._progress.teams_count = result.team_count self._progress.stadiums_count = result.stadium_count self._progress.errors_count = result.review_count self._progress.finish() return result except Exception as e: log_error(f"Scraping failed: {e}", exc_info=True) self._progress.finish() return ScrapeResult( success=False, error_message=str(e), ) def _get_season_months(self) -> list[tuple[int, int]]: """Get the months to scrape for this sport's season. Returns: List of (year, month) tuples """ # Default implementation for sports with fall-spring seasons # (NBA, NHL, etc.) months = [] # Fall months of season start year for month in range(10, 13): # Oct-Dec months.append((self.season, month)) # Winter-spring months of following year for month in range(1, 7): # Jan-Jun months.append((self.season + 1, month)) return months def _get_source_url(self, source: str, **kwargs) -> str: """Build a source URL with parameters. Subclasses should override this to build URLs for their sources. Args: source: Source identifier **kwargs: URL parameters Returns: Complete URL string """ raise NotImplementedError(f"URL builder not implemented for {source}") class ScraperError(Exception): """Exception raised when scraping fails.""" def __init__(self, source: str, message: str): self.source = source self.message = message super().__init__(f"[{source}] {message}") class PartialDataError(ScraperError): """Exception raised when only partial data was retrieved.""" def __init__(self, source: str, message: str, partial_count: int): self.partial_count = partial_count super().__init__(source, f"{message} (got {partial_count} items)")