# Change log (from commit message):
#   Scripts:
#     - Add WNBA abbreviation aliases to team_resolver.py
#     - Fix NHL stadium coordinates in stadium_resolver.py
#     - Add validate_aliases.py script for orphan detection
#     - Update scrapers with improved error handling
#     - Add DATA_AUDIT.md and REMEDIATION_PLAN.md documentation
#     - Update alias JSON files with new mappings
#   iOS bundle updates:
#     - Update games_canonical.json with latest scraped data
#     - Update teams_canonical.json and stadiums_canonical.json
#     - Sync alias files with Scripts versions
#   All 5 remediation phases complete.
#   Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
"""Base scraper class for all sport scrapers."""

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import date, datetime
from typing import Optional

from ..config import EXPECTED_GAME_COUNTS
from ..models.aliases import ManualReviewItem
from ..models.game import Game
from ..models.stadium import Stadium
from ..models.team import Team
from ..utils.http import RateLimitedSession, get_session
from ..utils.logging import get_logger, log_error, log_warning
from ..utils.progress import ScrapeProgress

@dataclass
class RawGameData:
    """Raw game data before normalization.

    This intermediate format holds data as scraped from sources,
    before team/stadium resolution and canonical ID generation.
    The ``*_raw`` fields keep names exactly as the source spelled
    them; resolution happens later in ``_normalize_games``.
    """

    # Date/time of the game as parsed from the source.
    game_date: datetime
    # Team names exactly as printed by the source (unresolved).
    home_team_raw: str
    away_team_raw: str
    # Venue name as printed by the source; None if not provided.
    stadium_raw: Optional[str] = None
    # Final scores; None for games not yet played (or not reported).
    home_score: Optional[int] = None
    away_score: Optional[int] = None
    # Source-reported game status; defaults to "scheduled".
    status: str = "scheduled"
    # URL the row was scraped from, kept for manual review/tracing.
    source_url: Optional[str] = None
    game_number: Optional[int] = None  # For doubleheaders
@dataclass
class ScrapeResult:
    """Result of a scraping operation.

    Attributes:
        games: List of normalized Game objects
        teams: List of Team objects
        stadiums: List of Stadium objects
        review_items: Items requiring manual review
        source: Name of the source used
        success: Whether scraping succeeded
        error_message: Error message if failed
    """

    games: list[Game] = field(default_factory=list)
    teams: list[Team] = field(default_factory=list)
    stadiums: list[Stadium] = field(default_factory=list)
    review_items: list[ManualReviewItem] = field(default_factory=list)
    source: str = ""
    success: bool = True
    error_message: Optional[str] = None

    @property
    def game_count(self) -> int:
        """Number of normalized games scraped."""
        return len(self.games)

    @property
    def team_count(self) -> int:
        """Number of teams scraped."""
        return len(self.teams)

    @property
    def stadium_count(self) -> int:
        """Number of stadiums scraped."""
        return len(self.stadiums)

    @property
    def review_count(self) -> int:
        """Number of items flagged for manual review."""
        return len(self.review_items)
class BaseScraper(ABC):
    """Abstract base class for sport scrapers.

    Subclasses must implement:
    - _get_sources(): Return list of source names in priority order
    - _scrape_games_from_source(): Fetch raw games from one source
    - _normalize_games(): Resolve raw games into Game objects
    - scrape_teams(): Fetch team information
    - scrape_stadiums(): Fetch stadium information

    Features:
    - Multi-source fallback (try sources in order)
    - Built-in rate limiting
    - Error handling with partial data discard
    - Progress tracking
    - Source URL tracking for manual review
    """

    # Maximum number of sources to attempt before giving up.  Three
    # attempts let NHL fall back to the NHL API for venue data, since
    # Hockey Reference doesn't provide it.  Subclasses may override.
    MAX_SOURCES_TO_TRY: int = 3

    def __init__(
        self,
        sport: str,
        season: int,
        session: Optional[RateLimitedSession] = None,
    ):
        """Initialize the scraper.

        Args:
            sport: Sport code (e.g., 'nba', 'mlb'); stored lowercased.
            season: Season start year (e.g., 2025 for 2025-26)
            session: Optional HTTP session (default: global session)
        """
        self.sport = sport.lower()
        self.season = season
        self.session = session or get_session()
        self._logger = get_logger()
        # Created by scrape_all(); None until then.
        self._progress: Optional[ScrapeProgress] = None

    @property
    def expected_game_count(self) -> int:
        """Expected number of games for this sport (0 if unknown)."""
        return EXPECTED_GAME_COUNTS.get(self.sport, 0)

    @abstractmethod
    def _get_sources(self) -> list[str]:
        """Return list of source names in priority order.

        Returns:
            List of source identifiers (e.g., ['basketball_reference', 'espn', 'cbs'])
        """

    @abstractmethod
    def _scrape_games_from_source(
        self,
        source: str,
    ) -> list[RawGameData]:
        """Scrape games from a specific source.

        Args:
            source: Source identifier

        Returns:
            List of raw game data

        Raises:
            Exception: If scraping fails
        """

    @abstractmethod
    def _normalize_games(
        self,
        raw_games: list[RawGameData],
    ) -> tuple[list[Game], list[ManualReviewItem]]:
        """Normalize raw game data to Game objects.

        Args:
            raw_games: Raw scraped data

        Returns:
            Tuple of (normalized games, review items)
        """

    @abstractmethod
    def scrape_teams(self) -> list[Team]:
        """Fetch team information.

        Returns:
            List of Team objects
        """

    @abstractmethod
    def scrape_stadiums(self) -> list[Stadium]:
        """Fetch stadium information.

        Returns:
            List of Stadium objects
        """

    def scrape_games(self) -> ScrapeResult:
        """Scrape games with multi-source fallback.

        Tries each source in priority order, up to MAX_SOURCES_TO_TRY
        attempts.  On failure, partial data from that source is
        discarded and the next source is tried.

        Returns:
            ScrapeResult with games, review items, and status
        """
        sources = self._get_sources()
        last_error: Optional[str] = None
        sources_tried = 0

        for source in sources:
            self._logger.info(f"Trying source: {source}")
            sources_tried += 1

            try:
                # Scrape raw data
                raw_games = self._scrape_games_from_source(source)

                if not raw_games:
                    log_warning(f"No games found from {source}")
                    # If multiple sources return nothing, the schedule
                    # likely doesn't exist yet.
                    if sources_tried >= self.MAX_SOURCES_TO_TRY:
                        return ScrapeResult(
                            success=False,
                            error_message=f"No schedule data available (tried {sources_tried} sources)",
                        )
                    continue

                self._logger.info(f"Found {len(raw_games)} raw games from {source}")

                # Normalize data
                games, review_items = self._normalize_games(raw_games)

                self._logger.info(
                    f"Normalized {len(games)} games, {len(review_items)} need review"
                )

                return ScrapeResult(
                    games=games,
                    review_items=review_items,
                    source=source,
                    success=True,
                )

            except Exception as e:
                last_error = str(e)
                log_error(f"Failed to scrape from {source}: {e}", exc_info=True)
                # Partial data is discarded; stop once the attempt
                # budget is spent.
                if sources_tried >= self.MAX_SOURCES_TO_TRY:
                    break
                continue

        # Every source either raised or returned no data.  Report a
        # meaningful message instead of "Last error: None" when no
        # source actually raised an exception.
        if last_error is None:
            message = f"No schedule data available (tried {sources_tried} sources)"
        else:
            message = f"All sources failed. Last error: {last_error}"
        return ScrapeResult(success=False, error_message=message)

    def scrape_all(self) -> ScrapeResult:
        """Scrape games, teams, and stadiums.

        Progress tracking is started before scraping and always
        finished, on every exit path.

        Returns:
            Complete ScrapeResult with all data
        """
        self._progress = ScrapeProgress(self.sport, self.season)
        self._progress.start()

        try:
            # Scrape games first; teams/stadiums are skipped on failure.
            result = self.scrape_games()

            if not result.success:
                self._progress.log_error(result.error_message or "Unknown error")
                return result

            result.teams = self.scrape_teams()
            result.stadiums = self.scrape_stadiums()

            # Mirror counts onto the progress tracker for reporting.
            self._progress.games_count = result.game_count
            self._progress.teams_count = result.team_count
            self._progress.stadiums_count = result.stadium_count
            self._progress.errors_count = result.review_count

            return result

        except Exception as e:
            log_error(f"Scraping failed: {e}", exc_info=True)
            return ScrapeResult(
                success=False,
                error_message=str(e),
            )
        finally:
            # Guarantee the progress tracker is closed on every path.
            self._progress.finish()

    def _get_season_months(self) -> list[tuple[int, int]]:
        """Get the months to scrape for this sport's season.

        Default implementation covers fall-spring seasons (NBA, NHL,
        etc.): Oct-Dec of the season start year, then Jan-Jun of the
        following year.  Subclasses with other calendars override this.

        Returns:
            List of (year, month) tuples
        """
        months = [(self.season, month) for month in range(10, 13)]  # Oct-Dec
        months += [(self.season + 1, month) for month in range(1, 7)]  # Jan-Jun
        return months

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build a source URL with parameters.

        Subclasses should override this to build URLs for their sources.

        Args:
            source: Source identifier
            **kwargs: URL parameters

        Returns:
            Complete URL string

        Raises:
            NotImplementedError: If no URL builder exists for *source*.
        """
        raise NotImplementedError(f"URL builder not implemented for {source}")
class ScraperError(Exception):
    """Raised when scraping from a source fails.

    The source identifier and bare message are kept as attributes so
    callers can inspect them; ``str(error)`` yields ``"[source] message"``.
    """

    def __init__(self, source: str, message: str):
        formatted = f"[{source}] {message}"
        super().__init__(formatted)
        self.source = source
        self.message = message
class PartialDataError(ScraperError):
    """Raised when only partial data was retrieved from a source.

    Attributes:
        partial_count: Number of items successfully retrieved before
            the failure.
    """

    def __init__(self, source: str, message: str, partial_count: int):
        self.partial_count = partial_count
        detail = f"{message} (got {partial_count} items)"
        super().__init__(source, detail)