#!/usr/bin/env python3
"""
Core shared utilities for SportsTime data scrapers.

This module provides:
- Rate limiting utilities
- Data classes (Game, Stadium)
- Multi-source fallback system
- ID generation
- Export utilities
"""

import json
import time
from collections import defaultdict
from dataclasses import dataclass, asdict, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Callable

import pandas as pd
import requests
from bs4 import BeautifulSoup

__all__ = [
    # Constants
    'REQUEST_DELAY',
    # Rate limiting
    'rate_limit',
    'fetch_page',
    # Data classes
    'Game',
    'Stadium',
    'ScraperSource',
    'StadiumScraperSource',
    # Fallback system
    'scrape_with_fallback',
    'scrape_stadiums_with_fallback',
    # ID generation
    'assign_stable_ids',
    # Export utilities
    'export_to_json',
    'validate_games',
]


# =============================================================================
# RATE LIMITING
# =============================================================================

REQUEST_DELAY = 3.0  # seconds between requests to same domain

# Wall-clock timestamp (time.time()) of the most recent request, per domain.
last_request_time: dict[str, float] = {}

# Browser-like request headers sent by fetch_page().
# 'Accept-Encoding' is deliberately NOT set here: requests fills in a default
# that only lists encodings it can actually decode. Hard-coding 'br' makes
# servers answer with brotli even when no brotli decoder is installed, in
# which case the body reaches the HTML parser still compressed.
_BROWSER_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Cache-Control': 'max-age=0',
}


def rate_limit(domain: str) -> None:
    """
    Enforce rate limiting per domain.

    Blocks until at least REQUEST_DELAY seconds have passed since the previous
    call for the same domain, then records the current time for that domain.

    Args:
        domain: Key under which request times are tracked
    """
    previous = last_request_time.get(domain)
    if previous is not None:
        elapsed = time.time() - previous
        if elapsed < REQUEST_DELAY:
            # If the wall clock was stepped backwards, elapsed is negative;
            # never sleep longer than one full delay because of that.
            time.sleep(min(REQUEST_DELAY - elapsed, REQUEST_DELAY))
    last_request_time[domain] = time.time()


def fetch_page(url: str, domain: str, timeout: float = 30.0) -> Optional[BeautifulSoup]:
    """
    Fetch and parse a webpage with rate limiting.

    Args:
        url: Page to download
        domain: Rate-limit key passed to rate_limit()
        timeout: Seconds to wait for the server before giving up

    Returns:
        Parsed document, or None if the request or parsing failed
        (the error is printed, not raised)
    """
    rate_limit(domain)
    try:
        response = requests.get(url, headers=_BROWSER_HEADERS, timeout=timeout)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except Exception as e:
        # Best effort by design: callers treat None as "source unavailable".
        print(f"Error fetching {url}: {e}")
        return None


# =============================================================================
# DATA CLASSES
# =============================================================================

@dataclass
class Game:
    """Represents a single game."""
    id: str
    sport: str
    season: str
    date: str  # YYYY-MM-DD
    time: Optional[str]  # HH:MM (24hr, ET)
    home_team: str
    away_team: str
    home_team_abbrev: str
    away_team_abbrev: str
    venue: str
    source: str
    is_playoff: bool = False
    broadcast: Optional[str] = None


@dataclass
class Stadium:
    """Represents a stadium/arena/ballpark."""
    id: str
    name: str
    city: str
    state: str
    latitude: float
    longitude: float
    capacity: int
    sport: str
    team_abbrevs: list[str]
    source: str
    year_opened: Optional[int] = None


# =============================================================================
# MULTI-SOURCE FALLBACK SYSTEM
# =============================================================================

@dataclass
class ScraperSource:
    """Represents a single data source for scraping games."""
    name: str
    scraper_func: Callable[[int], list]  # Takes season, returns list[Game]
    priority: int = 1  # Lower = higher priority (1 is best)
    min_games: int = 10  # Minimum games to consider successful


def _first_successful_source(
    sources: list,
    call: Callable,
    minimum_attr: str,
    noun: str,
    sport: str,
    verbose: bool,
) -> list:
    """
    Shared fallback loop behind both scrape_*_with_fallback functions.

    Args:
        sources: Source configs exposing name, priority and `minimum_attr`
        call: Invoked with a source; returns that source's scraped items
        minimum_attr: Attribute holding the source's minimum item count
        noun: Plural item name used in status messages
        sport: Sport name for logging
        verbose: Whether to print status messages

    Returns:
        Items from the first source that met its minimum, else an empty list
    """
    ordered = sorted(sources, key=lambda s: s.priority)
    total = len(ordered)

    for position, source in enumerate(ordered, start=1):
        # Everything stays inside the try so that any failure of one source
        # (including a malformed return value) just moves on to the next.
        try:
            if verbose:
                print(f" [{position}/{total}] Trying {source.name}...")

            items = call(source)
            minimum = getattr(source, minimum_attr)

            if items and len(items) >= minimum:
                if verbose:
                    print(f" ✓ {source.name} returned {len(items)} {noun}")
                return items

            if verbose:
                count = len(items) if items else 0
                print(f" ✗ {source.name} returned only {count} {noun} (min: {minimum})")
        except Exception as e:
            if verbose:
                print(f" ✗ {source.name} failed: {e}")

    # All sources failed
    if verbose:
        print(f" ⚠ All {total} sources failed for {sport}")
    return []


def scrape_with_fallback(
    sport: str,
    season: int,
    sources: list[ScraperSource],
    verbose: bool = True
) -> list[Game]:
    """
    Try multiple sources in priority order until one succeeds.

    Args:
        sport: Sport name for logging
        season: Season year
        sources: List of ScraperSource configs (sorted here by priority)
        verbose: Whether to print status messages

    Returns:
        List of Game objects from the first successful source,
        or an empty list if every source failed or returned too few games
    """
    return _first_successful_source(
        sources,
        lambda source: source.scraper_func(season),
        'min_games',
        'games',
        sport,
        verbose,
    )


@dataclass
class StadiumScraperSource:
    """Represents a single data source for stadium scraping."""
    name: str
    scraper_func: Callable[[], list]  # Returns list[Stadium]
    priority: int = 1  # Lower = higher priority (1 is best)
    min_venues: int = 5  # Minimum venues to consider successful


def scrape_stadiums_with_fallback(
    sport: str,
    sources: list[StadiumScraperSource],
    verbose: bool = True
) -> list[Stadium]:
    """
    Try multiple stadium sources in priority order until one succeeds.

    Args:
        sport: Sport name for logging
        sources: List of StadiumScraperSource configs (sorted here by priority)
        verbose: Whether to print status messages

    Returns:
        List of Stadium objects from the first successful source,
        or an empty list if every source failed or returned too few venues
    """
    return _first_successful_source(
        sources,
        lambda source: source.scraper_func(),
        'min_venues',
        'venues',
        sport,
        verbose,
    )


# =============================================================================
# ID GENERATION
# =============================================================================

def assign_stable_ids(games: list[Game], sport: str, season: str) -> list[Game]:
    """
    Assign IDs based on matchup + date.

    Format: {sport}_{season}_{away}_{home}_{MMDD} (or {MMDD}_2 for doubleheaders)

    When games are rescheduled, the old ID becomes orphaned and a new one is
    created. Use --delete-all before import to clean up orphaned records.

    Args:
        games: Games to label; each game's `id` is overwritten in place
        sport: Sport name, lower-cased into the ID
        season: Season label; dashes are removed (e.g. "2024-25" -> "202425")

    Returns:
        The same list that was passed in
    """
    # str() so an integer season year (as used by the scrapers) also works.
    season_str = str(season).replace('-', '')
    sport_str = sport.lower()

    # Track how many times we've seen each base ID (for doubleheaders)
    id_counts: dict[str, int] = defaultdict(int)

    for game in games:
        away = game.away_team_abbrev.lower()
        home = game.home_team_abbrev.lower()

        # Extract MMDD from date (YYYY-MM-DD); "0000" marks an unparseable date
        date_parts = game.date.split('-')
        mmdd = f"{date_parts[1]}{date_parts[2]}" if len(date_parts) == 3 else "0000"

        base_id = f"{sport_str}_{season_str}_{away}_{home}_{mmdd}"
        id_counts[base_id] += 1
        occurrence = id_counts[base_id]

        # Add suffix for doubleheaders (game 2+)
        game.id = f"{base_id}_{occurrence}" if occurrence > 1 else base_id

    return games


# =============================================================================
# EXPORT UTILITIES
# =============================================================================

def _write_json(path: Path, payload: list) -> None:
    """Write `payload` to `path` as indented JSON, always encoded as UTF-8."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2)


def export_to_json(games: list[Game], stadiums: list[Stadium], output_dir: Path) -> None:
    """
    Export scraped data to organized JSON files.

    Structure:
        data/
            games/
                mlb_2025.json
                nba_2025.json
                ...
            canonical/
                stadiums.json
            games.json     (combined, for backward compatibility)
            stadiums.json  (legacy, for backward compatibility)
            games.csv      (only when there are games)
            stadiums.csv   (only when there are stadiums)

    Args:
        games: Games to export, grouped into files by lower-cased sport + season
        stadiums: Stadiums to export
        output_dir: Root directory for the export; created if missing
    """
    output_dir = Path(output_dir)  # tolerate plain string paths
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create subdirectories
    games_dir = output_dir / 'games'
    canonical_dir = output_dir / 'canonical'
    games_dir.mkdir(exist_ok=True)
    canonical_dir.mkdir(exist_ok=True)

    # Group games by sport and season
    games_by_sport_season: dict[str, list[Game]] = defaultdict(list)
    for game in games:
        games_by_sport_season[f"{game.sport.lower()}_{game.season}"].append(game)

    # Export games by sport/season
    total_exported = 0
    for key, sport_games in games_by_sport_season.items():
        _write_json(games_dir / f"{key}.json", [asdict(g) for g in sport_games])
        print(f" Exported {len(sport_games):,} games to games/{key}.json")
        total_exported += len(sport_games)

    # Export combined games.json for backward compatibility
    all_games_data = [asdict(g) for g in games]
    _write_json(output_dir / 'games.json', all_games_data)

    # Export stadiums to canonical/
    stadiums_data = [asdict(s) for s in stadiums]
    _write_json(canonical_dir / 'stadiums.json', stadiums_data)

    # Also export to root for backward compatibility
    _write_json(output_dir / 'stadiums.json', stadiums_data)

    # Export as CSV for easy viewing
    if games:
        pd.DataFrame(all_games_data).to_csv(output_dir / 'games.csv', index=False)

    if stadiums:
        pd.DataFrame(stadiums_data).to_csv(output_dir / 'stadiums.csv', index=False)

    print(f"\nExported {total_exported:,} games across {len(games_by_sport_season)} sport/season files")
    print(f"Exported {len(stadiums):,} stadiums to canonical/stadiums.json")


# (discrepancy key, Game attribute) pairs compared by validate_games().
_COMPARED_FIELDS = (
    ('date_mismatch', 'date'),
    ('time_mismatch', 'time'),
    ('venue_mismatch', 'venue'),
)


def _comparable(value: Optional[str]) -> str:
    """Normalize a field for comparison: trimmed and case-folded, '' if missing."""
    return str(value).strip().casefold() if value else ''


def validate_games(games_by_source: dict[str, list[Game]]) -> dict:
    """
    Cross-validate games from multiple sources.

    The first source in the mapping is the primary; every other source is
    compared against it, matching games by `id`.

    Args:
        games_by_source: Source name -> games scraped from that source

    Returns:
        Dict of discrepancy lists:
        - 'missing_in_source': primary games absent from another source, as
          {'game_id', 'present_in', 'missing_in'}
        - 'date_mismatch' / 'time_mismatch' / 'venue_mismatch': games present
          in both sources whose field differs, as {'game_id', 'primary',
          'primary_value', 'secondary', 'secondary_value'}. A field that is
          empty or None in either source is treated as unknown and skipped;
          comparison ignores case and surrounding whitespace.
        All lists are empty when fewer than two sources are given.
    """
    discrepancies = {
        'missing_in_source': [],
        'date_mismatch': [],
        'time_mismatch': [],
        'venue_mismatch': [],
    }

    sources = list(games_by_source)
    if len(sources) < 2:
        return discrepancies

    primary = sources[0]
    primary_games = {g.id: g for g in games_by_source[primary]}

    for source in sources[1:]:
        secondary_games = {g.id: g for g in games_by_source[source]}

        for game_id, game in primary_games.items():
            other = secondary_games.get(game_id)
            if other is None:
                discrepancies['missing_in_source'].append({
                    'game_id': game_id,
                    'present_in': primary,
                    'missing_in': source
                })
                continue

            for key, attr in _COMPARED_FIELDS:
                ours = getattr(game, attr)
                theirs = getattr(other, attr)
                left, right = _comparable(ours), _comparable(theirs)
                # Only report when both sides actually carry a value.
                if left and right and left != right:
                    discrepancies[key].append({
                        'game_id': game_id,
                        'primary': primary,
                        'primary_value': ours,
                        'secondary': source,
                        'secondary_value': theirs,
                    })

    return discrepancies