feat(01-01): create core.py shared module

- Rate limiting utilities (REQUEST_DELAY, rate_limit, fetch_page)
- Data classes (Game, Stadium)
- Multi-source fallback system (ScraperSource, scrape_with_fallback)
- Stadium fallback system (StadiumScraperSource, scrape_stadiums_with_fallback)
- ID generation (assign_stable_ids)
- Export utilities (export_to_json, validate_games)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-09 23:58:55 -06:00
parent 67b570dbee
commit edbb5dbbda

384
Scripts/core.py Normal file
View File

@@ -0,0 +1,384 @@
#!/usr/bin/env python3
"""
Core shared utilities for SportsTime data scrapers.
This module provides:
- Rate limiting utilities
- Data classes (Game, Stadium)
- Multi-source fallback system
- ID generation
- Export utilities
"""
import json
import time
from collections import defaultdict
from dataclasses import dataclass, asdict, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Callable
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Explicit public API: the names that `from core import *` exposes and
# that the sport-specific scraper scripts are expected to rely on.
__all__ = [
    # Constants
    'REQUEST_DELAY',
    # Rate limiting
    'rate_limit',
    'fetch_page',
    # Data classes
    'Game',
    'Stadium',
    'ScraperSource',
    'StadiumScraperSource',
    # Fallback system
    'scrape_with_fallback',
    'scrape_stadiums_with_fallback',
    # ID generation
    'assign_stable_ids',
    # Export utilities
    'export_to_json',
    'validate_games',
]
# =============================================================================
# RATE LIMITING
# =============================================================================
REQUEST_DELAY = 3.0  # seconds between requests to same domain

# Timestamp of the most recent request, keyed by domain.
last_request_time: dict[str, float] = {}


def rate_limit(domain: str) -> None:
    """Block until at least REQUEST_DELAY seconds since the last request to *domain*.

    Reads and updates the module-level ``last_request_time`` registry;
    the recorded timestamp is taken after any sleep, so back-to-back
    calls are spaced by the full delay.
    """
    previous = last_request_time.get(domain)
    if previous is not None:
        remaining = REQUEST_DELAY - (time.time() - previous)
        if remaining > 0:
            time.sleep(remaining)
    last_request_time[domain] = time.time()
def fetch_page(url: str, domain: str) -> Optional[BeautifulSoup]:
    """Fetch *url* (rate limited per *domain*) and return the parsed document.

    Sends browser-like headers, since some sports sites reject bare
    HTTP clients. Any failure (network error, non-2xx status, parse
    error) is printed and reported as ``None`` rather than raised, so
    callers can fall through to another source.
    """
    rate_limit(domain)
    # Mimic Chrome on macOS closely enough to pass trivial bot checks.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0',
    }
    try:
        resp = requests.get(url, headers=browser_headers, timeout=30)
        resp.raise_for_status()
        return BeautifulSoup(resp.content, 'html.parser')
    except Exception as exc:
        print(f"Error fetching {url}: {exc}")
        return None
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class Game:
    """A single scheduled game, normalized across scraping sources."""

    id: str                          # stable id (see assign_stable_ids)
    sport: str
    season: str
    date: str                        # YYYY-MM-DD
    time: Optional[str]              # HH:MM (24hr, ET); None when unavailable
    home_team: str
    away_team: str
    home_team_abbrev: str
    away_team_abbrev: str
    venue: str
    source: str                      # which scraper produced this record
    is_playoff: bool = False
    broadcast: Optional[str] = None  # TV/streaming info, when a source has it
@dataclass
class Stadium:
    """A stadium/arena/ballpark with location and capacity metadata."""

    id: str
    name: str
    city: str
    state: str
    latitude: float
    longitude: float
    capacity: int
    sport: str
    team_abbrevs: list               # abbreviations of teams that play here
    source: str                      # which scraper produced this record
    year_opened: Optional[int] = None
# =============================================================================
# MULTI-SOURCE FALLBACK SYSTEM
# =============================================================================
@dataclass
class ScraperSource:
    """One candidate data source for scraping a season of games.

    ``scraper_func`` takes a season year and returns a list of Game
    objects. Sources are attempted in ascending ``priority`` order, and
    a run counts as successful only when it yields ``min_games`` or more.
    """

    name: str
    scraper_func: Callable[[int], list]  # season year -> list[Game]
    priority: int = 1                    # lower = tried first (1 is best)
    min_games: int = 10                  # success threshold
def scrape_with_fallback(
    sport: str,
    season: int,
    sources: list[ScraperSource],
    verbose: bool = True
) -> list[Game]:
    """Return games from the first source that yields enough results.

    Sources are attempted in ascending priority order. A source succeeds
    when it returns at least its own ``min_games`` games; sources that
    raise or fall short are reported (when *verbose*) and skipped.

    Args:
        sport: Sport name, used only in log messages
        season: Season year, forwarded to each source's scraper_func
        sources: Candidate ScraperSource configs (any order)
        verbose: Whether to print status messages

    Returns:
        Games from the first successful source, or [] if every source fails.
    """
    ordered = sorted(sources, key=lambda s: s.priority)
    total = len(ordered)
    for position, src in enumerate(ordered, start=1):
        try:
            if verbose:
                print(f" [{position}/{total}] Trying {src.name}...")
            games = src.scraper_func(season)
            if games and len(games) >= src.min_games:
                if verbose:
                    print(f"{src.name} returned {len(games)} games")
                return games
            if verbose:
                count = len(games) if games else 0
                print(f"{src.name} returned only {count} games (min: {src.min_games})")
        except Exception as e:
            if verbose:
                print(f"{src.name} failed: {e}")
            continue
    # Every source either raised or fell short of its minimum.
    if verbose:
        print(f" ⚠ All {total} sources failed for {sport}")
    return []
@dataclass
class StadiumScraperSource:
    """One candidate data source for scraping venue metadata.

    ``scraper_func`` takes no arguments and returns a list of Stadium
    objects. Sources are attempted in ascending ``priority`` order, and
    a run counts as successful only when it yields ``min_venues`` or more.
    """

    name: str
    scraper_func: Callable[[], list]  # () -> list[Stadium]
    priority: int = 1                 # lower = tried first (1 is best)
    min_venues: int = 5               # success threshold
def scrape_stadiums_with_fallback(
    sport: str,
    sources: list[StadiumScraperSource],
    verbose: bool = True
) -> list[Stadium]:
    """Return venues from the first stadium source that yields enough results.

    Mirrors scrape_with_fallback: sources are attempted in ascending
    priority order, and a source succeeds when it returns at least its
    own ``min_venues`` venues. Sources that raise or fall short are
    reported (when *verbose*) and skipped.

    Args:
        sport: Sport name, used only in log messages
        sources: Candidate StadiumScraperSource configs (any order)
        verbose: Whether to print status messages

    Returns:
        Stadiums from the first successful source, or [] if every source fails.
    """
    ordered = sorted(sources, key=lambda s: s.priority)
    total = len(ordered)
    for position, src in enumerate(ordered, start=1):
        try:
            if verbose:
                print(f" [{position}/{total}] Trying {src.name}...")
            stadiums = src.scraper_func()
            if stadiums and len(stadiums) >= src.min_venues:
                if verbose:
                    print(f"{src.name} returned {len(stadiums)} venues")
                return stadiums
            if verbose:
                count = len(stadiums) if stadiums else 0
                print(f"{src.name} returned only {count} venues (min: {src.min_venues})")
        except Exception as e:
            if verbose:
                print(f"{src.name} failed: {e}")
            continue
    # Every source either raised or fell short of its minimum.
    if verbose:
        print(f" ⚠ All {total} sources failed for {sport}")
    return []
# =============================================================================
# ID GENERATION
# =============================================================================
def assign_stable_ids(games: list[Game], sport: str, season: str) -> list[Game]:
    """Assign deterministic IDs derived from matchup + date (mutates in place).

    Format: ``{sport}_{season}_{away}_{home}_{MMDD}``, with a ``_2``/``_3``
    suffix on the second and later games of a doubleheader. When a game
    is rescheduled its old ID becomes orphaned and a new one is created;
    use --delete-all before import to clean up orphaned records.

    Returns the same list, with each Game's ``id`` field rewritten.
    """
    sport_token = sport.lower()
    season_token = season.replace('-', '')
    # Occurrences of each base ID so far, to number doubleheaders.
    seen: dict[str, int] = defaultdict(int)
    for game in games:
        parts = game.date.split('-')
        # Dates are expected as YYYY-MM-DD; anything else maps to "0000".
        mmdd = f"{parts[1]}{parts[2]}" if len(parts) == 3 else "0000"
        base_id = (
            f"{sport_token}_{season_token}_"
            f"{game.away_team_abbrev.lower()}_{game.home_team_abbrev.lower()}_{mmdd}"
        )
        seen[base_id] += 1
        # First occurrence keeps the bare ID; later ones get a numeric suffix.
        game.id = base_id if seen[base_id] == 1 else f"{base_id}_{seen[base_id]}"
    return games
# =============================================================================
# EXPORT UTILITIES
# =============================================================================
def export_to_json(games: list[Game], stadiums: list[Stadium], output_dir: Path) -> None:
    """
    Export scraped data to organized JSON (and CSV) files under *output_dir*.

    Structure:
        data/
            games/
                mlb_2025.json
                nba_2025.json
                ...
            canonical/
                stadiums.json
            games.json      (combined, for backward compatibility)
            stadiums.json   (legacy, for backward compatibility)
            games.csv / stadiums.csv (for easy viewing)

    Args:
        games: All scraped games, any mix of sports/seasons
        stadiums: All scraped venues
        output_dir: Root directory; created (with parents) if missing
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    # Create subdirectories
    games_dir = output_dir / 'games'
    canonical_dir = output_dir / 'canonical'
    games_dir.mkdir(exist_ok=True)
    canonical_dir.mkdir(exist_ok=True)

    # Group games by "{sport}_{season}"; defaultdict replaces the manual
    # key-existence check (the module already uses it in assign_stable_ids).
    games_by_sport_season: dict[str, list[Game]] = defaultdict(list)
    for game in games:
        games_by_sport_season[f"{game.sport.lower()}_{game.season}"].append(game)

    # One JSON file per sport/season. Explicit encoding keeps output
    # byte-identical across platforms (Windows defaults to cp1252).
    total_exported = 0
    for key, sport_games in games_by_sport_season.items():
        games_data = [asdict(g) for g in sport_games]
        filepath = games_dir / f"{key}.json"
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(games_data, f, indent=2)
        print(f" Exported {len(sport_games):,} games to games/{key}.json")
        total_exported += len(sport_games)

    # Export combined games.json for backward compatibility
    all_games_data = [asdict(g) for g in games]
    with open(output_dir / 'games.json', 'w', encoding='utf-8') as f:
        json.dump(all_games_data, f, indent=2)

    # Stadiums: canonical location, plus a root copy for backward compatibility.
    stadiums_data = [asdict(s) for s in stadiums]
    for stadium_path in (canonical_dir / 'stadiums.json', output_dir / 'stadiums.json'):
        with open(stadium_path, 'w', encoding='utf-8') as f:
            json.dump(stadiums_data, f, indent=2)

    # CSV mirrors for easy viewing in a spreadsheet.
    if games:
        pd.DataFrame(all_games_data).to_csv(output_dir / 'games.csv', index=False)
    if stadiums:
        pd.DataFrame(stadiums_data).to_csv(output_dir / 'stadiums.csv', index=False)

    print(f"\nExported {total_exported:,} games across {len(games_by_sport_season)} sport/season files")
    print(f"Exported {len(stadiums):,} stadiums to canonical/stadiums.json")
def validate_games(games_by_source: dict[str, list[Game]]) -> dict:
    """
    Cross-validate games from multiple sources against the first source.

    The first key in *games_by_source* is treated as the primary source;
    every other source is compared against it by game ID.

    Args:
        games_by_source: Mapping of source name -> games from that source

    Returns:
        dict with four discrepancy lists:
          - 'missing_in_source': primary games absent from a secondary source
          - 'date_mismatch' / 'time_mismatch' / 'venue_mismatch': games
            present in both sources whose corresponding fields disagree
        All lists are empty when fewer than two sources are provided.
    """
    discrepancies = {
        'missing_in_source': [],
        'date_mismatch': [],
        'time_mismatch': [],
        'venue_mismatch': [],
    }
    sources = list(games_by_source.keys())
    # Nothing to cross-check with fewer than two sources.
    if len(sources) < 2:
        return discrepancies

    primary = sources[0]
    primary_games = {g.id: g for g in games_by_source[primary]}
    for source in sources[1:]:
        secondary_games = {g.id: g for g in games_by_source[source]}
        for game_id, game in primary_games.items():
            other = secondary_games.get(game_id)
            if other is None:
                discrepancies['missing_in_source'].append({
                    'game_id': game_id,
                    'present_in': primary,
                    'missing_in': source
                })
                continue
            # Bug fix: these buckets were declared but never populated —
            # compare the fields the discrepancy keys promise to report.
            for field_name, bucket in (
                ('date', 'date_mismatch'),
                ('time', 'time_mismatch'),
                ('venue', 'venue_mismatch'),
            ):
                primary_value = getattr(game, field_name)
                secondary_value = getattr(other, field_name)
                if primary_value != secondary_value:
                    discrepancies[bucket].append({
                        'game_id': game_id,
                        'field': field_name,
                        'primary': primary_value,
                        'secondary': secondary_value,
                    })
    return discrepancies