Files
Sportstime/Scripts/sportstime_parser/scrapers/base.py
Trey t a8b0491571 wip
2026-01-19 22:12:53 -06:00

334 lines
9.7 KiB
Python

"""Base scraper class for all sport scrapers."""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import date, datetime
from typing import Optional
from ..config import EXPECTED_GAME_COUNTS
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..utils.http import RateLimitedSession, get_session
from ..utils.logging import get_logger, log_error, log_warning
from ..utils.progress import ScrapeProgress
@dataclass
class RawGameData:
    """Raw game data before normalization.

    This intermediate format holds data as scraped from sources,
    before team/stadium resolution and canonical ID generation.
    """

    game_date: datetime                  # Game date/time as parsed from the source
    home_team_raw: str                   # Home team name exactly as the source spells it
    away_team_raw: str                   # Away team name exactly as the source spells it
    stadium_raw: Optional[str] = None    # Venue string as scraped, when present
    home_score: Optional[int] = None     # None when no result has been scraped
    away_score: Optional[int] = None     # None when no result has been scraped
    status: str = "scheduled"            # Game status; values are source-dependent
    source_url: Optional[str] = None     # Page the data was scraped from (for review)
    game_number: Optional[int] = None    # For doubleheaders
@dataclass
class ScrapeResult:
    """Result of a scraping operation.

    Attributes:
        games: List of normalized Game objects
        teams: List of Team objects
        stadiums: List of Stadium objects
        review_items: Items requiring manual review
        source: Name of the source used
        success: Whether scraping succeeded
        error_message: Error message if failed
    """

    games: list[Game] = field(default_factory=list)
    teams: list[Team] = field(default_factory=list)
    stadiums: list[Stadium] = field(default_factory=list)
    review_items: list[ManualReviewItem] = field(default_factory=list)
    source: str = ""
    success: bool = True
    error_message: Optional[str] = None

    @property
    def game_count(self) -> int:
        """Number of normalized games."""
        return len(self.games)

    @property
    def team_count(self) -> int:
        """Number of teams scraped."""
        return len(self.teams)

    @property
    def stadium_count(self) -> int:
        """Number of stadiums scraped."""
        return len(self.stadiums)

    @property
    def review_count(self) -> int:
        """Number of items flagged for manual review."""
        return len(self.review_items)
class BaseScraper(ABC):
    """Abstract base class for sport scrapers.

    Subclasses must implement:
        - _get_sources(): Return list of source names in priority order
        - _scrape_games_from_source(): Fetch raw schedule data from one source
        - _normalize_games(): Normalize raw data to Game objects
        - scrape_teams(): Fetch team information
        - scrape_stadiums(): Fetch stadium information

    Features:
        - Multi-source fallback (try sources in order)
        - Built-in rate limiting (via RateLimitedSession)
        - Error handling with partial data discard
        - Progress tracking
        - Source URL tracking for manual review
    """

    # Stop probing after this many sources have been attempted without
    # producing any usable games.  Class attribute (was a hard-coded local)
    # so subclasses with many sources can raise or lower the cutoff.
    MAX_SOURCES_TO_TRY: int = 2

    def __init__(
        self,
        sport: str,
        season: int,
        session: Optional[RateLimitedSession] = None,
    ):
        """Initialize the scraper.

        Args:
            sport: Sport code (e.g., 'nba', 'mlb'); stored lower-cased.
            season: Season start year (e.g., 2025 for 2025-26).
            session: Optional HTTP session (default: global session).
        """
        self.sport = sport.lower()
        self.season = season
        self.session = session or get_session()
        self._logger = get_logger()
        self._progress: Optional[ScrapeProgress] = None

    @property
    def expected_game_count(self) -> int:
        """Expected number of games for this sport (0 when unknown)."""
        return EXPECTED_GAME_COUNTS.get(self.sport, 0)

    @abstractmethod
    def _get_sources(self) -> list[str]:
        """Return list of source names in priority order.

        Returns:
            List of source identifiers
            (e.g., ['basketball_reference', 'espn', 'cbs']).
        """

    @abstractmethod
    def _scrape_games_from_source(
        self,
        source: str,
    ) -> list[RawGameData]:
        """Scrape games from a specific source.

        Args:
            source: Source identifier.

        Returns:
            List of raw game data.

        Raises:
            Exception: If scraping fails.
        """

    @abstractmethod
    def _normalize_games(
        self,
        raw_games: list[RawGameData],
    ) -> tuple[list[Game], list[ManualReviewItem]]:
        """Normalize raw game data to Game objects.

        Args:
            raw_games: Raw scraped data.

        Returns:
            Tuple of (normalized games, review items).
        """

    @abstractmethod
    def scrape_teams(self) -> list[Team]:
        """Fetch team information.

        Returns:
            List of Team objects.
        """

    @abstractmethod
    def scrape_stadiums(self) -> list[Stadium]:
        """Fetch stadium information.

        Returns:
            List of Stadium objects.
        """

    def scrape_games(self) -> ScrapeResult:
        """Scrape games with multi-source fallback.

        Tries each source in priority order. On failure, discards partial
        data and tries the next source, giving up after MAX_SOURCES_TO_TRY
        fruitless attempts.

        Returns:
            ScrapeResult with games, review items, and status.
        """
        sources = self._get_sources()
        last_error: Optional[str] = None
        sources_tried = 0

        for source in sources:
            self._logger.info(f"Trying source: {source}")
            sources_tried += 1
            try:
                # Scrape raw data
                raw_games = self._scrape_games_from_source(source)
                if not raw_games:
                    log_warning(f"No games found from {source}")
                    # If multiple sources return nothing, the schedule
                    # likely doesn't exist yet -- stop probing.
                    if sources_tried >= self.MAX_SOURCES_TO_TRY:
                        return ScrapeResult(
                            success=False,
                            error_message=f"No schedule data available (tried {sources_tried} sources)",
                        )
                    continue

                self._logger.info(f"Found {len(raw_games)} raw games from {source}")

                # Normalize data
                games, review_items = self._normalize_games(raw_games)
                self._logger.info(
                    f"Normalized {len(games)} games, {len(review_items)} need review"
                )
                return ScrapeResult(
                    games=games,
                    review_items=review_items,
                    source=source,
                    success=True,
                )
            except Exception as e:
                last_error = str(e)
                log_error(f"Failed to scrape from {source}: {e}", exc_info=True)
                # If we've tried enough sources, bail out
                if sources_tried >= self.MAX_SOURCES_TO_TRY:
                    break

        # All sources exhausted.  Distinguish "a source actually errored"
        # from "every source simply returned nothing" so the message never
        # reads "Last error: None".
        if last_error is not None:
            error_message = f"All sources failed. Last error: {last_error}"
        else:
            error_message = f"No schedule data available (tried {sources_tried} sources)"
        return ScrapeResult(success=False, error_message=error_message)

    def scrape_all(self) -> ScrapeResult:
        """Scrape games, teams, and stadiums.

        Returns:
            Complete ScrapeResult with all data.
        """
        self._progress = ScrapeProgress(self.sport, self.season)
        self._progress.start()
        try:
            # Games first: without a schedule there is nothing to enrich.
            result = self.scrape_games()
            if not result.success:
                self._progress.log_error(result.error_message or "Unknown error")
                self._progress.finish()
                return result

            result.teams = self.scrape_teams()
            result.stadiums = self.scrape_stadiums()

            # Mirror the counts onto the progress tracker for reporting.
            self._progress.games_count = result.game_count
            self._progress.teams_count = result.team_count
            self._progress.stadiums_count = result.stadium_count
            self._progress.errors_count = result.review_count
            self._progress.finish()
            return result
        except Exception as e:
            log_error(f"Scraping failed: {e}", exc_info=True)
            self._progress.finish()
            return ScrapeResult(
                success=False,
                error_message=str(e),
            )

    def _get_season_months(self) -> list[tuple[int, int]]:
        """Get the months to scrape for this sport's season.

        Default implementation covers sports with fall-spring seasons
        (NBA, NHL, etc.): Oct-Dec of the start year, Jan-Jun of the next.

        Returns:
            List of (year, month) tuples.
        """
        months = []
        # Fall months of season start year
        for month in range(10, 13):  # Oct-Dec
            months.append((self.season, month))
        # Winter-spring months of following year
        for month in range(1, 7):  # Jan-Jun
            months.append((self.season + 1, month))
        return months

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build a source URL with parameters.

        Subclasses should override this to build URLs for their sources.

        Args:
            source: Source identifier.
            **kwargs: URL parameters.

        Returns:
            Complete URL string.

        Raises:
            NotImplementedError: If the subclass has no URL builder
                for the given source.
        """
        raise NotImplementedError(f"URL builder not implemented for {source}")
class ScraperError(Exception):
    """Raised when a scraper cannot retrieve data from a source.

    Attributes:
        source: Identifier of the failing source.
        message: Human-readable failure description.
    """

    def __init__(self, source: str, message: str):
        self.source = source
        self.message = message
        formatted = "[{}] {}".format(source, message)
        super().__init__(formatted)
class PartialDataError(ScraperError):
    """Raised when a source yielded only part of the expected data.

    Attributes:
        partial_count: How many items were actually retrieved.
    """

    def __init__(self, source: str, message: str, partial_count: int):
        self.partial_count = partial_count
        detail = "{} (got {} items)".format(message, partial_count)
        super().__init__(source, detail)