- Rate limiting utilities (REQUEST_DELAY, rate_limit, fetch_page) - Data classes (Game, Stadium) - Multi-source fallback system (ScraperSource, scrape_with_fallback) - Stadium fallback system (StadiumScraperSource, scrape_stadiums_with_fallback) - ID generation (assign_stable_ids) - Export utilities (export_to_json, validate_games) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
385 lines
12 KiB
Python
385 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Core shared utilities for SportsTime data scrapers.
|
|
|
|
This module provides:
|
|
- Rate limiting utilities
|
|
- Data classes (Game, Stadium)
|
|
- Multi-source fallback system
|
|
- ID generation
|
|
- Export utilities
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass, asdict, field
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from typing import Optional, Callable
|
|
|
|
import pandas as pd
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
# Public API: the names exported by `from <this module> import *`.
__all__ = [
    # Constants
    'REQUEST_DELAY',
    # Rate limiting
    'rate_limit',
    'fetch_page',
    # Data classes
    'Game',
    'Stadium',
    'ScraperSource',
    'StadiumScraperSource',
    # Fallback system
    'scrape_with_fallback',
    'scrape_stadiums_with_fallback',
    # ID generation
    'assign_stable_ids',
    # Export utilities
    'export_to_json',
    'validate_games',
]
|
|
|
|
|
|
# =============================================================================
# RATE LIMITING
# =============================================================================

REQUEST_DELAY = 3.0  # seconds between requests to same domain
last_request_time: dict[str, float] = {}


def rate_limit(domain: str) -> None:
    """Block until at least REQUEST_DELAY seconds since the last request to *domain*.

    Reads and updates the module-level ``last_request_time`` table; the first
    call for a domain never sleeps.
    """
    previous = last_request_time.get(domain)
    if previous is not None:
        remaining = REQUEST_DELAY - (time.time() - previous)
        if remaining > 0:
            time.sleep(remaining)
    last_request_time[domain] = time.time()
|
|
|
|
|
|
def fetch_page(url: str, domain: str) -> Optional[BeautifulSoup]:
    """Fetch *url* and return it parsed with html.parser, or None on any error.

    Enforces the per-domain rate limit before issuing the request, and sends
    browser-like headers to reduce the chance of being blocked. Errors are
    printed and swallowed (best-effort scraping).
    """
    rate_limit(domain)
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0',
    }
    try:
        resp = requests.get(url, headers=browser_headers, timeout=30)
        resp.raise_for_status()
        return BeautifulSoup(resp.content, 'html.parser')
    except Exception as exc:
        print(f"Error fetching {url}: {exc}")
        return None
|
|
|
|
|
|
# =============================================================================
|
|
# DATA CLASSES
|
|
# =============================================================================
|
|
|
|
@dataclass
class Game:
    """Represents a single game.

    A flat, JSON-serializable record produced by the sport scrapers and
    written out by export_to_json().
    """
    id: str  # stable identifier; filled in by assign_stable_ids()
    sport: str  # sport code; lowercased when used in filenames and IDs
    season: str  # season label, e.g. "2025" or "2024-25"
    date: str  # YYYY-MM-DD
    time: Optional[str]  # HH:MM (24hr, ET)
    home_team: str
    away_team: str
    home_team_abbrev: str  # short team code; lowercased in generated IDs
    away_team_abbrev: str  # short team code; lowercased in generated IDs
    venue: str
    source: str  # name of the data source that produced this record
    is_playoff: bool = False
    broadcast: Optional[str] = None  # broadcaster, when the source provides one
|
|
|
|
|
|
@dataclass
class Stadium:
    """Represents a stadium/arena/ballpark.

    A flat, JSON-serializable record exported to canonical/stadiums.json
    by export_to_json().
    """
    id: str
    name: str
    city: str
    state: str
    latitude: float
    longitude: float
    capacity: int
    sport: str
    team_abbrevs: list[str]  # abbreviations of the teams that play here
    source: str  # name of the data source that produced this record
    year_opened: Optional[int] = None  # None when the source does not report it
|
|
|
|
|
|
# =============================================================================
|
|
# MULTI-SOURCE FALLBACK SYSTEM
|
|
# =============================================================================
|
|
|
|
@dataclass
class ScraperSource:
    """Represents a single data source for scraping games.

    Consumed by scrape_with_fallback(), which tries sources in ascending
    priority order until one yields at least ``min_games`` games.
    """
    name: str  # human-readable label, used in log output
    scraper_func: Callable[[int], list]  # Takes season, returns list[Game]
    priority: int = 1  # Lower = higher priority (1 is best)
    min_games: int = 10  # Minimum games to consider successful
|
|
|
|
|
|
def scrape_with_fallback(
    sport: str,
    season: int,
    sources: list[ScraperSource],
    verbose: bool = True
) -> list[Game]:
    """
    Try multiple sources in priority order until one succeeds.

    A source "succeeds" when it returns at least its own ``min_games``
    games; exceptions from a source are logged and treated as failure.

    Args:
        sport: Sport name for logging
        season: Season year
        sources: List of ScraperSource configs (sorted here by priority)
        verbose: Whether to print status messages

    Returns:
        List of Game objects from the first successful source, or [] when
        every source fails.
    """
    ordered = sorted(sources, key=lambda s: s.priority)
    total = len(ordered)

    for idx, src in enumerate(ordered, start=1):
        try:
            if verbose:
                print(f" [{idx}/{total}] Trying {src.name}...")

            games = src.scraper_func(season)
            count = len(games) if games else 0

            if games and count >= src.min_games:
                if verbose:
                    print(f" ✓ {src.name} returned {count} games")
                return games

            if verbose:
                print(f" ✗ {src.name} returned only {count} games (min: {src.min_games})")
        except Exception as exc:
            # A broken source must never abort the fallback chain.
            if verbose:
                print(f" ✗ {src.name} failed: {exc}")

    # All sources failed
    if verbose:
        print(f" ⚠ All {total} sources failed for {sport}")
    return []
|
|
|
|
|
|
@dataclass
class StadiumScraperSource:
    """Represents a single data source for stadium scraping.

    Consumed by scrape_stadiums_with_fallback(), which tries sources in
    ascending priority order until one yields at least ``min_venues`` venues.
    """
    name: str  # human-readable label, used in log output
    scraper_func: Callable[[], list]  # Returns list[Stadium]
    priority: int = 1  # Lower = higher priority (1 is best)
    min_venues: int = 5  # Minimum venues to consider successful
|
|
|
|
|
|
def scrape_stadiums_with_fallback(
    sport: str,
    sources: list[StadiumScraperSource],
    verbose: bool = True
) -> list[Stadium]:
    """
    Try multiple stadium sources in priority order until one succeeds.

    A source "succeeds" when it returns at least its own ``min_venues``
    venues; exceptions from a source are logged and treated as failure.

    Args:
        sport: Sport name for logging
        sources: List of StadiumScraperSource configs (sorted here by priority)
        verbose: Whether to print status messages

    Returns:
        List of Stadium objects from the first successful source, or []
        when every source fails.
    """
    ordered = sorted(sources, key=lambda s: s.priority)
    total = len(ordered)

    for idx, src in enumerate(ordered, start=1):
        try:
            if verbose:
                print(f" [{idx}/{total}] Trying {src.name}...")

            stadiums = src.scraper_func()
            count = len(stadiums) if stadiums else 0

            if stadiums and count >= src.min_venues:
                if verbose:
                    print(f" ✓ {src.name} returned {count} venues")
                return stadiums

            if verbose:
                print(f" ✗ {src.name} returned only {count} venues (min: {src.min_venues})")
        except Exception as exc:
            # A broken source must never abort the fallback chain.
            if verbose:
                print(f" ✗ {src.name} failed: {exc}")

    # All sources failed
    if verbose:
        print(f" ⚠ All {total} sources failed for {sport}")
    return []
|
|
|
|
|
|
# =============================================================================
|
|
# ID GENERATION
|
|
# =============================================================================
|
|
|
|
def assign_stable_ids(games: list[Game], sport: str, season: str) -> list[Game]:
    """
    Assign IDs based on matchup + date, mutating each game in place.

    Format: {sport}_{season}_{away}_{home}_{MMDD}, with a _2/_3/... suffix
    for the second and later games of a doubleheader (same matchup, same
    date). Hyphens are stripped from the season (e.g. "2024-25" -> "202425").
    A date that is not YYYY-MM-DD falls back to "0000" for the MMDD part.

    When games are rescheduled, the old ID becomes orphaned and a new one is created.
    Use --delete-all before import to clean up orphaned records.

    Returns the same list, for convenience.
    """
    compact_season = season.replace('-', '')

    # Occurrence count per base ID, so doubleheaders get numbered suffixes.
    seen: dict[str, int] = defaultdict(int)

    for g in games:
        pieces = g.date.split('-')
        mmdd = pieces[1] + pieces[2] if len(pieces) == 3 else "0000"

        base = "_".join([
            sport.lower(),
            compact_season,
            g.away_team_abbrev.lower(),
            g.home_team_abbrev.lower(),
            mmdd,
        ])

        seen[base] += 1
        occurrence = seen[base]
        g.id = base if occurrence == 1 else f"{base}_{occurrence}"

    return games
|
|
|
|
|
|
# =============================================================================
|
|
# EXPORT UTILITIES
|
|
# =============================================================================
|
|
|
|
def export_to_json(games: list[Game], stadiums: list[Stadium], output_dir: Path) -> None:
    """
    Export scraped data to organized JSON files (plus CSV copies).

    Args:
        games: All scraped games, across any mix of sports/seasons.
        stadiums: All scraped stadiums.
        output_dir: Root output directory; created if missing.

    Structure:
        data/
          games/
            mlb_2025.json
            nba_2025.json
            ...
          canonical/
            stadiums.json
          games.json     (combined, legacy, for backward compatibility)
          stadiums.json  (legacy, for backward compatibility)
          games.csv / stadiums.csv (for easy viewing)
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create subdirectories
    games_dir = output_dir / 'games'
    canonical_dir = output_dir / 'canonical'
    games_dir.mkdir(exist_ok=True)
    canonical_dir.mkdir(exist_ok=True)

    # Group games by sport and season (key: "<sport>_<season>")
    games_by_sport_season: dict[str, list[Game]] = defaultdict(list)
    for game in games:
        games_by_sport_season[f"{game.sport.lower()}_{game.season}"].append(game)

    # Export games by sport/season
    total_exported = 0
    for key, sport_games in games_by_sport_season.items():
        games_data = [asdict(g) for g in sport_games]
        filepath = games_dir / f"{key}.json"
        # Explicit encoding: venue/team names may contain non-ASCII characters.
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(games_data, f, indent=2)
        print(f" Exported {len(sport_games):,} games to games/{key}.json")
        total_exported += len(sport_games)

    # Export combined games.json for backward compatibility
    all_games_data = [asdict(g) for g in games]
    with open(output_dir / 'games.json', 'w', encoding='utf-8') as f:
        json.dump(all_games_data, f, indent=2)

    # Export stadiums to canonical/
    stadiums_data = [asdict(s) for s in stadiums]
    with open(canonical_dir / 'stadiums.json', 'w', encoding='utf-8') as f:
        json.dump(stadiums_data, f, indent=2)

    # Also export to root for backward compatibility
    with open(output_dir / 'stadiums.json', 'w', encoding='utf-8') as f:
        json.dump(stadiums_data, f, indent=2)

    # Export as CSV for easy viewing
    if games:
        df_games = pd.DataFrame(all_games_data)
        df_games.to_csv(output_dir / 'games.csv', index=False)

    if stadiums:
        df_stadiums = pd.DataFrame(stadiums_data)
        df_stadiums.to_csv(output_dir / 'stadiums.csv', index=False)

    print(f"\nExported {total_exported:,} games across {len(games_by_sport_season)} sport/season files")
    print(f"Exported {len(stadiums):,} stadiums to canonical/stadiums.json")
|
|
|
|
|
|
def validate_games(games_by_source: dict[str, list[Game]]) -> dict:
    """
    Cross-validate games from multiple sources.

    The first source (dict insertion order) is treated as primary; every
    other source is compared against it by game ID.

    Args:
        games_by_source: Mapping of source name -> list of Game objects.

    Returns:
        Dict of discrepancy lists:
          - 'missing_in_source': game IDs present in the primary source but
            absent from a secondary source
          - 'date_mismatch' / 'time_mismatch' / 'venue_mismatch': games found
            in both sources whose field values disagree

    Note: previously only 'missing_in_source' was ever populated; the
    field-level buckets were declared but never filled.
    """
    discrepancies = {
        'missing_in_source': [],
        'date_mismatch': [],
        'time_mismatch': [],
        'venue_mismatch': [],
    }

    sources = list(games_by_source.keys())
    if len(sources) < 2:
        # Nothing to cross-check against with fewer than two sources.
        return discrepancies

    primary = sources[0]
    primary_games = {g.id: g for g in games_by_source[primary]}

    for source in sources[1:]:
        secondary_games = {g.id: g for g in games_by_source[source]}

        for game_id, game in primary_games.items():
            other = secondary_games.get(game_id)
            if other is None:
                discrepancies['missing_in_source'].append({
                    'game_id': game_id,
                    'present_in': primary,
                    'missing_in': source
                })
                continue

            # Same game in both sources: compare the fields we track.
            for field_name, bucket in (
                ('date', 'date_mismatch'),
                ('time', 'time_mismatch'),
                ('venue', 'venue_mismatch'),
            ):
                primary_value = getattr(game, field_name)
                other_value = getattr(other, field_name)
                if primary_value != other_value:
                    discrepancies[bucket].append({
                        'game_id': game_id,
                        'field': field_name,
                        'primary_source': primary,
                        'primary_value': primary_value,
                        'other_source': source,
                        'other_value': other_value,
                    })

    return discrepancies
|