feat(scripts): rewrite parser as modular Python CLI
Replace monolithic scraping scripts with the sportstime_parser package:

- Multi-source scrapers with automatic fallback for 7 sports
- Canonical ID generation for games, teams, and stadiums
- Fuzzy matching with configurable thresholds for name resolution
- CloudKit Web Services uploader with JWT auth and diff-based updates
- Resumable uploads with checkpoint state persistence
- Validation reports with manual review items and suggested matches
- Comprehensive test suite (249 tests)

CLI: sportstime-parser scrape|validate|upload|status|retry|clear

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
322
Scripts/sportstime_parser/scrapers/base.py
Normal file
322
Scripts/sportstime_parser/scrapers/base.py
Normal file
@@ -0,0 +1,322 @@
|
||||
"""Base scraper class for all sport scrapers."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import date, datetime
|
||||
from typing import Optional
|
||||
|
||||
from ..config import EXPECTED_GAME_COUNTS
|
||||
from ..models.game import Game
|
||||
from ..models.team import Team
|
||||
from ..models.stadium import Stadium
|
||||
from ..models.aliases import ManualReviewItem
|
||||
from ..utils.http import RateLimitedSession, get_session
|
||||
from ..utils.logging import get_logger, log_error, log_warning
|
||||
from ..utils.progress import ScrapeProgress
|
||||
|
||||
|
||||
@dataclass
class RawGameData:
    """Intermediate representation of a game as scraped from a source.

    Holds team/stadium names and scores exactly as the source provided
    them, prior to team/stadium resolution and canonical ID generation.
    """

    game_date: datetime
    home_team_raw: str
    away_team_raw: str
    stadium_raw: Optional[str] = None
    home_score: Optional[int] = None
    away_score: Optional[int] = None
    status: str = "scheduled"
    source_url: Optional[str] = None
    # Distinguishes the games of a doubleheader played on the same date.
    game_number: Optional[int] = None
@dataclass
class ScrapeResult:
    """Outcome of a scraping operation.

    Attributes:
        games: List of normalized Game objects
        teams: List of Team objects
        stadiums: List of Stadium objects
        review_items: Items requiring manual review
        source: Name of the source used
        success: Whether scraping succeeded
        error_message: Error message if failed
    """

    games: list[Game] = field(default_factory=list)
    teams: list[Team] = field(default_factory=list)
    stadiums: list[Stadium] = field(default_factory=list)
    review_items: list[ManualReviewItem] = field(default_factory=list)
    source: str = ""
    success: bool = True
    error_message: Optional[str] = None

    # Convenience counters (used for progress/reporting).

    @property
    def game_count(self) -> int:
        """Number of normalized games."""
        return len(self.games)

    @property
    def team_count(self) -> int:
        """Number of teams."""
        return len(self.teams)

    @property
    def stadium_count(self) -> int:
        """Number of stadiums."""
        return len(self.stadiums)

    @property
    def review_count(self) -> int:
        """Number of items flagged for manual review."""
        return len(self.review_items)
class BaseScraper(ABC):
    """Abstract base class for sport scrapers.

    Subclasses must implement:
    - _get_sources(): Return list of source names in priority order
    - _scrape_games_from_source(): Fetch raw games from one source
    - _normalize_games(): Resolve raw data into Game objects
    - scrape_teams(): Fetch team information
    - scrape_stadiums(): Fetch stadium information

    Features:
    - Multi-source fallback (try sources in order)
    - Built-in rate limiting (via RateLimitedSession)
    - Error handling with partial data discard
    - Progress tracking
    - Source URL tracking for manual review
    """

    def __init__(
        self,
        sport: str,
        season: int,
        session: Optional[RateLimitedSession] = None,
    ):
        """Initialize the scraper.

        Args:
            sport: Sport code (e.g., 'nba', 'mlb')
            season: Season start year (e.g., 2025 for 2025-26)
            session: Optional HTTP session (default: global session)
        """
        self.sport = sport.lower()
        self.season = season
        self.session = session or get_session()
        self._logger = get_logger()
        self._progress: Optional[ScrapeProgress] = None

    @property
    def expected_game_count(self) -> int:
        """Expected number of games for this sport (0 if unknown)."""
        return EXPECTED_GAME_COUNTS.get(self.sport, 0)

    @abstractmethod
    def _get_sources(self) -> list[str]:
        """Return list of source names in priority order.

        Returns:
            List of source identifiers (e.g., ['basketball_reference', 'espn', 'cbs'])
        """

    @abstractmethod
    def _scrape_games_from_source(
        self,
        source: str,
    ) -> list[RawGameData]:
        """Scrape games from a specific source.

        Args:
            source: Source identifier

        Returns:
            List of raw game data

        Raises:
            Exception: If scraping fails
        """

    @abstractmethod
    def _normalize_games(
        self,
        raw_games: list[RawGameData],
    ) -> tuple[list[Game], list[ManualReviewItem]]:
        """Normalize raw game data to Game objects.

        Args:
            raw_games: Raw scraped data

        Returns:
            Tuple of (normalized games, review items)
        """

    @abstractmethod
    def scrape_teams(self) -> list[Team]:
        """Fetch team information.

        Returns:
            List of Team objects
        """

    @abstractmethod
    def scrape_stadiums(self) -> list[Stadium]:
        """Fetch stadium information.

        Returns:
            List of Stadium objects
        """

    def scrape_games(self) -> ScrapeResult:
        """Scrape games with multi-source fallback.

        Tries each source in priority order. On failure — or when a
        source returns no games — partial data is discarded and the
        next source is tried.

        Returns:
            ScrapeResult with games, review items, and status
        """
        sources = self._get_sources()
        last_error: Optional[str] = None

        for source in sources:
            self._logger.info(f"Trying source: {source}")

            try:
                raw_games = self._scrape_games_from_source(source)

                if not raw_games:
                    # Bug fix: record the empty result so that when every
                    # source comes back empty the final error message is
                    # meaningful instead of "Last error: None".
                    last_error = f"No games found from {source}"
                    log_warning(f"No games found from {source}")
                    continue

                self._logger.info(f"Found {len(raw_games)} raw games from {source}")

                games, review_items = self._normalize_games(raw_games)

                self._logger.info(
                    f"Normalized {len(games)} games, {len(review_items)} need review"
                )

                return ScrapeResult(
                    games=games,
                    review_items=review_items,
                    source=source,
                    success=True,
                )

            except Exception as e:
                last_error = str(e)
                log_error(f"Failed to scrape from {source}: {e}", exc_info=True)
                # Discard partial data and try next source
                continue

        # All sources failed
        return ScrapeResult(
            success=False,
            error_message=f"All sources failed. Last error: {last_error}",
        )

    def scrape_all(self) -> ScrapeResult:
        """Scrape games, teams, and stadiums.

        Returns:
            Complete ScrapeResult with all data
        """
        self._progress = ScrapeProgress(self.sport, self.season)
        self._progress.start()

        try:
            # Games first: if they fail there is nothing to attach
            # teams/stadiums to.
            result = self.scrape_games()

            if not result.success:
                self._progress.log_error(result.error_message or "Unknown error")
                self._progress.finish()
                return result

            result.teams = self.scrape_teams()
            result.stadiums = self.scrape_stadiums()

            # Mirror counts into the progress tracker for reporting.
            self._progress.games_count = result.game_count
            self._progress.teams_count = result.team_count
            self._progress.stadiums_count = result.stadium_count
            self._progress.errors_count = result.review_count

            self._progress.finish()

            return result

        except Exception as e:
            log_error(f"Scraping failed: {e}", exc_info=True)
            self._progress.finish()

            return ScrapeResult(
                success=False,
                error_message=str(e),
            )

    def _get_season_months(self) -> list[tuple[int, int]]:
        """Get the months to scrape for this sport's season.

        Default implementation covers fall-spring seasons (NBA, NHL,
        etc.): Oct-Dec of the season start year, then Jan-Jun of the
        following year.

        Returns:
            List of (year, month) tuples
        """
        months = [(self.season, month) for month in range(10, 13)]  # Oct-Dec
        months += [(self.season + 1, month) for month in range(1, 7)]  # Jan-Jun
        return months

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build a source URL with parameters.

        Subclasses should override this to build URLs for their sources.

        Args:
            source: Source identifier
            **kwargs: URL parameters

        Returns:
            Complete URL string

        Raises:
            NotImplementedError: If no URL builder exists for `source`.
        """
        raise NotImplementedError(f"URL builder not implemented for {source}")
|
||||
class ScraperError(Exception):
    """Raised when a scraping operation fails for a given source."""

    def __init__(self, source: str, message: str):
        # Keep the raw parts available for programmatic inspection,
        # and present a "[source] message" string to Exception.
        formatted = f"[{source}] {message}"
        self.source = source
        self.message = message
        super().__init__(formatted)
|
||||
|
||||
class PartialDataError(ScraperError):
    """Raised when a source returned only part of the expected data."""

    def __init__(self, source: str, message: str, partial_count: int):
        # Record how many items were actually retrieved before delegating
        # the "[source] message (got N items)" formatting to ScraperError.
        self.partial_count = partial_count
        detail = f"{message} (got {partial_count} items)"
        super().__init__(source, detail)
|
||||
Reference in New Issue
Block a user