#!/usr/bin/env python3
"""
Core shared utilities for SportsTime data scrapers.

This module provides:
- Rate limiting utilities
- Data classes (Game, Stadium)
- Multi-source fallback system
- ID generation
- Export utilities
"""

import json
import time
from collections import defaultdict
from dataclasses import dataclass, asdict, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Callable

import pandas as pd
import requests
from bs4 import BeautifulSoup


__all__ = [
    # Constants
    'REQUEST_DELAY',
    # Rate limiting
    'rate_limit',
    'fetch_page',
    # Data classes
    'Game',
    'Stadium',
    'ScraperSource',
    'StadiumScraperSource',
    # Fallback system
    'scrape_with_fallback',
    'scrape_stadiums_with_fallback',
    # ID generation
    'assign_stable_ids',
    # Export utilities
    'export_to_json',
    'validate_games',
]


# =============================================================================
# RATE LIMITING
# =============================================================================

REQUEST_DELAY = 3.0  # seconds between requests to same domain

# Module-level map of domain -> wall-clock time of the last request sent to it.
last_request_time: dict[str, float] = {}


def rate_limit(domain: str) -> None:
    """Sleep just long enough that consecutive requests to *domain* are at
    least REQUEST_DELAY seconds apart, then record the new request time."""
    now = time.time()
    if domain in last_request_time:
        elapsed = now - last_request_time[domain]
        if elapsed < REQUEST_DELAY:
            time.sleep(REQUEST_DELAY - elapsed)
    last_request_time[domain] = time.time()


def fetch_page(url: str, domain: str) -> Optional[BeautifulSoup]:
    """Fetch *url* (rate-limited per *domain*) and return it parsed.

    Args:
        url: Full URL to fetch.
        domain: Key used for per-domain rate limiting.

    Returns:
        A BeautifulSoup document on success, or None on any error
        (the error is printed — scraping callers treat fetches as
        best-effort and fall through to other sources).
    """
    rate_limit(domain)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        # BUGFIX: do not advertise 'br' (brotli). requests/urllib3 only
        # transparently decode gzip/deflate; brotli decoding needs an optional
        # extra package, so advertising 'br' can yield undecodable bytes.
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0',
    }
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except Exception as e:  # deliberate best-effort: log and return None
        print(f"Error fetching {url}: {e}")
        return None


# =============================================================================
# DATA CLASSES
# =============================================================================

@dataclass
class Game:
    """Represents a single game."""
    id: str                    # stable ID, see assign_stable_ids()
    sport: str
    season: str
    date: str                  # YYYY-MM-DD
    time: Optional[str]        # HH:MM (24hr, ET); None if unknown
    home_team: str
    away_team: str
    home_team_abbrev: str
    away_team_abbrev: str
    venue: str
    source: str                # name of the scraper source that produced it
    is_playoff: bool = False
    broadcast: Optional[str] = None


@dataclass
class Stadium:
    """Represents a stadium/arena/ballpark."""
    id: str
    name: str
    city: str
    state: str
    latitude: float
    longitude: float
    capacity: int
    sport: str
    team_abbrevs: list[str]    # abbreviations of teams that play here
    source: str
    year_opened: Optional[int] = None


# =============================================================================
# MULTI-SOURCE FALLBACK SYSTEM
# =============================================================================

@dataclass
class ScraperSource:
    """Represents a single data source for scraping games."""
    name: str
    scraper_func: Callable[[int], list]  # Takes season, returns list[Game]
    priority: int = 1     # Lower = higher priority (1 is best)
    min_games: int = 10   # Minimum games to consider successful


@dataclass
class StadiumScraperSource:
    """Represents a single data source for stadium scraping."""
    name: str
    scraper_func: Callable[[], list]  # Returns list[Stadium]
    priority: int = 1    # Lower = higher priority (1 is best)
    min_venues: int = 5  # Minimum venues to consider successful


def _run_fallback(
    sport: str,
    sources: list,
    invoke: Callable,
    required: Callable,
    noun: str,
    verbose: bool,
) -> list:
    """Shared engine behind scrape_with_fallback / scrape_stadiums_with_fallback.

    Tries each source in ascending priority order; returns the first result
    list that meets the source's minimum count, else [].

    Args:
        sport: Sport name, used only for logging.
        sources: ScraperSource / StadiumScraperSource configs.
        invoke: source -> list of results (calls the source's scraper_func).
        required: source -> minimum acceptable result count.
        noun: 'games' or 'venues', for log messages.
        verbose: Whether to print status messages.
    """
    ordered = sorted(sources, key=lambda s: s.priority)

    for i, source in enumerate(ordered):
        try:
            if verbose:
                print(f"  [{i + 1}/{len(ordered)}] Trying {source.name}...")

            results = invoke(source)
            minimum = required(source)

            if results and len(results) >= minimum:
                if verbose:
                    print(f"  ✓ {source.name} returned {len(results)} {noun}")
                return results
            if verbose:
                count = len(results) if results else 0
                print(f"  ✗ {source.name} returned only {count} {noun} (min: {minimum})")

        except Exception as e:  # a failing source must not abort the chain
            if verbose:
                print(f"  ✗ {source.name} failed: {e}")

    # All sources failed
    if verbose:
        print(f"  ⚠ All {len(ordered)} sources failed for {sport}")
    return []


def scrape_with_fallback(
    sport: str,
    season: int,
    sources: list[ScraperSource],
    verbose: bool = True
) -> list[Game]:
    """
    Try multiple sources in priority order until one succeeds.

    Args:
        sport: Sport name for logging
        season: Season year
        sources: List of ScraperSource configs, sorted by priority
        verbose: Whether to print status messages

    Returns:
        List of Game objects from the first successful source
    """
    return _run_fallback(
        sport,
        sources,
        invoke=lambda s: s.scraper_func(season),
        required=lambda s: s.min_games,
        noun='games',
        verbose=verbose,
    )


def scrape_stadiums_with_fallback(
    sport: str,
    sources: list[StadiumScraperSource],
    verbose: bool = True
) -> list[Stadium]:
    """
    Try multiple stadium sources in priority order until one succeeds.

    Args:
        sport: Sport name for logging
        sources: List of StadiumScraperSource configs, sorted by priority
        verbose: Whether to print status messages

    Returns:
        List of Stadium objects from the first successful source
    """
    return _run_fallback(
        sport,
        sources,
        invoke=lambda s: s.scraper_func(),
        required=lambda s: s.min_venues,
        noun='venues',
        verbose=verbose,
    )


# =============================================================================
# ID GENERATION
# =============================================================================

def assign_stable_ids(games: list[Game], sport: str, season: str) -> list[Game]:
    """
    Assign IDs based on matchup + date.
    Format: {sport}_{season}_{away}_{home}_{MMDD} (or {MMDD}_2 for doubleheaders)

    When games are rescheduled, the old ID becomes orphaned and a new one is created.
    Use --delete-all before import to clean up orphaned records.

    Mutates each Game's ``id`` in place and returns the same list.
    """
    season_str = season.replace('-', '')

    # Track how many times we've seen each base ID (for doubleheaders)
    id_counts: dict[str, int] = defaultdict(int)

    for game in games:
        away = game.away_team_abbrev.lower()
        home = game.home_team_abbrev.lower()
        # Extract MMDD from date (YYYY-MM-DD); "0000" if malformed
        date_parts = game.date.split('-')
        mmdd = f"{date_parts[1]}{date_parts[2]}" if len(date_parts) == 3 else "0000"

        base_id = f"{sport.lower()}_{season_str}_{away}_{home}_{mmdd}"
        id_counts[base_id] += 1

        # Add suffix for doubleheaders (game 2+)
        if id_counts[base_id] > 1:
            game.id = f"{base_id}_{id_counts[base_id]}"
        else:
            game.id = base_id

    return games


# =============================================================================
# EXPORT UTILITIES
# =============================================================================

def export_to_json(games: list[Game], stadiums: list[Stadium], output_dir: Path) -> None:
    """
    Export scraped data to organized JSON files.

    Structure:
      data/
        games/
          mlb_2025.json
          nba_2025.json
          ...
        canonical/
          stadiums.json
        stadiums.json (legacy, for backward compatibility)

    Also writes combined games.json plus games.csv / stadiums.csv.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create subdirectories
    games_dir = output_dir / 'games'
    canonical_dir = output_dir / 'canonical'
    games_dir.mkdir(exist_ok=True)
    canonical_dir.mkdir(exist_ok=True)

    # Group games by sport and season (defaultdict avoids the manual
    # "if key not in dict" dance)
    games_by_sport_season: dict[str, list[Game]] = defaultdict(list)
    for game in games:
        games_by_sport_season[f"{game.sport.lower()}_{game.season}"].append(game)

    # Export games by sport/season
    total_exported = 0
    for key, sport_games in games_by_sport_season.items():
        games_data = [asdict(g) for g in sport_games]
        filepath = games_dir / f"{key}.json"
        with open(filepath, 'w') as f:
            json.dump(games_data, f, indent=2)
        print(f"  Exported {len(sport_games):,} games to games/{key}.json")
        total_exported += len(sport_games)

    # Export combined games.json for backward compatibility
    all_games_data = [asdict(g) for g in games]
    with open(output_dir / 'games.json', 'w') as f:
        json.dump(all_games_data, f, indent=2)

    # Export stadiums to canonical/
    stadiums_data = [asdict(s) for s in stadiums]
    with open(canonical_dir / 'stadiums.json', 'w') as f:
        json.dump(stadiums_data, f, indent=2)

    # Also export to root for backward compatibility
    with open(output_dir / 'stadiums.json', 'w') as f:
        json.dump(stadiums_data, f, indent=2)

    # Export as CSV for easy viewing
    if games:
        df_games = pd.DataFrame(all_games_data)
        df_games.to_csv(output_dir / 'games.csv', index=False)

    if stadiums:
        df_stadiums = pd.DataFrame(stadiums_data)
        df_stadiums.to_csv(output_dir / 'stadiums.csv', index=False)

    print(f"\nExported {total_exported:,} games across {len(games_by_sport_season)} sport/season files")
    print(f"Exported {len(stadiums):,} stadiums to canonical/stadiums.json")


def validate_games(games_by_source: dict[str, list[Game]]) -> dict:
    """
    Cross-validate games from multiple sources.

    The first source in *games_by_source* is treated as primary; every other
    source is compared against it by game ID.

    Returns:
        Dict of discrepancy lists:
          - 'missing_in_source': primary games absent from a secondary source
          - 'date_mismatch' / 'time_mismatch' / 'venue_mismatch': games whose
            fields disagree between primary and a secondary source; entries
            are keyed by 'game_id' plus the two source names.
    """
    discrepancies = {
        'missing_in_source': [],
        'date_mismatch': [],
        'time_mismatch': [],
        'venue_mismatch': [],
    }

    sources = list(games_by_source.keys())
    if len(sources) < 2:
        return discrepancies

    primary = sources[0]
    primary_games = {g.id: g for g in games_by_source[primary]}

    for source in sources[1:]:
        secondary_games = {g.id: g for g in games_by_source[source]}

        for game_id, game in primary_games.items():
            other = secondary_games.get(game_id)
            if other is None:
                discrepancies['missing_in_source'].append({
                    'game_id': game_id,
                    'present_in': primary,
                    'missing_in': source
                })
                continue

            # BUGFIX: these buckets were declared but never populated —
            # compare the documented fields between the two sources.
            for attr, bucket in (('date', 'date_mismatch'),
                                 ('time', 'time_mismatch'),
                                 ('venue', 'venue_mismatch')):
                mine = getattr(game, attr)
                theirs = getattr(other, attr)
                if mine != theirs:
                    discrepancies[bucket].append({
                        'game_id': game_id,
                        primary: mine,
                        source: theirs,
                    })

    return discrepancies