feat(01-01): create core.py shared module

- Rate limiting utilities (REQUEST_DELAY, rate_limit, fetch_page)
- Data classes (Game, Stadium)
- Multi-source fallback system (ScraperSource, scrape_with_fallback)
- Stadium fallback system (StadiumScraperSource, scrape_stadiums_with_fallback)
- ID generation (assign_stable_ids)
- Export utilities (export_to_json, validate_games)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-09 23:58:55 -06:00
parent 67b570dbee
commit edbb5dbbda

384
Scripts/core.py Normal file
View File

@@ -0,0 +1,384 @@
#!/usr/bin/env python3
"""
Core shared utilities for SportsTime data scrapers.
This module provides:
- Rate limiting utilities
- Data classes (Game, Stadium)
- Multi-source fallback system
- ID generation
- Export utilities
"""
import json
import time
from collections import defaultdict
from dataclasses import dataclass, asdict, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Callable
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Explicit public API: the names that `from core import *` exposes and
# that the sport-specific scraper scripts are expected to rely on.
__all__ = [
    # Constants
    'REQUEST_DELAY',
    # Rate limiting
    'rate_limit',
    'fetch_page',
    # Data classes
    'Game',
    'Stadium',
    'ScraperSource',
    'StadiumScraperSource',
    # Fallback system
    'scrape_with_fallback',
    'scrape_stadiums_with_fallback',
    # ID generation
    'assign_stable_ids',
    # Export utilities
    'export_to_json',
    'validate_games',
]
# =============================================================================
# RATE LIMITING
# =============================================================================
REQUEST_DELAY = 3.0  # seconds between requests to same domain

# Timestamp of the most recent request, keyed by domain.
last_request_time: dict[str, float] = {}


def rate_limit(domain: str) -> None:
    """Block until at least REQUEST_DELAY seconds since the last request to *domain*.

    Reads and updates the module-level ``last_request_time`` registry;
    the recorded timestamp is taken after any sleep, so back-to-back
    calls are spaced by the full delay.
    """
    previous = last_request_time.get(domain)
    if previous is not None:
        remaining = REQUEST_DELAY - (time.time() - previous)
        if remaining > 0:
            time.sleep(remaining)
    last_request_time[domain] = time.time()
def fetch_page(url: str, domain: str) -> Optional[BeautifulSoup]:
    """Fetch *url* (rate limited per *domain*) and return the parsed document.

    Sends browser-like headers, since some sports sites reject bare
    HTTP clients. Any failure (network error, non-2xx status, parse
    error) is printed and reported as ``None`` rather than raised, so
    callers can fall through to another source.
    """
    rate_limit(domain)
    # Mimic Chrome on macOS closely enough to pass trivial bot checks.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0',
    }
    try:
        resp = requests.get(url, headers=browser_headers, timeout=30)
        resp.raise_for_status()
        return BeautifulSoup(resp.content, 'html.parser')
    except Exception as exc:
        print(f"Error fetching {url}: {exc}")
        return None
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class Game:
    """A single scheduled game, normalized across scraping sources."""

    id: str                          # stable id (see assign_stable_ids)
    sport: str
    season: str
    date: str                        # YYYY-MM-DD
    time: Optional[str]              # HH:MM (24hr, ET); None when unavailable
    home_team: str
    away_team: str
    home_team_abbrev: str
    away_team_abbrev: str
    venue: str
    source: str                      # which scraper produced this record
    is_playoff: bool = False
    broadcast: Optional[str] = None  # TV/streaming info, when a source has it
@dataclass
class Stadium:
    """A stadium/arena/ballpark with location and capacity metadata."""

    id: str
    name: str
    city: str
    state: str
    latitude: float
    longitude: float
    capacity: int
    sport: str
    team_abbrevs: list               # abbreviations of teams that play here
    source: str                      # which scraper produced this record
    year_opened: Optional[int] = None
# =============================================================================
# MULTI-SOURCE FALLBACK SYSTEM
# =============================================================================
@dataclass
class ScraperSource:
    """One candidate data source for scraping a season of games.

    ``scraper_func`` takes a season year and returns a list of Game
    objects. Sources are attempted in ascending ``priority`` order, and
    a run counts as successful only when it yields ``min_games`` or more.
    """

    name: str
    scraper_func: Callable[[int], list]  # season year -> list[Game]
    priority: int = 1                    # lower = tried first (1 is best)
    min_games: int = 10                  # success threshold
def scrape_with_fallback(
    sport: str,
    season: int,
    sources: list[ScraperSource],
    verbose: bool = True
) -> list[Game]:
    """Return games from the first source that yields enough results.

    Sources are attempted in ascending priority order. A source succeeds
    when it returns at least its own ``min_games`` games; sources that
    raise or fall short are reported (when *verbose*) and skipped.

    Args:
        sport: Sport name, used only in log messages
        season: Season year, forwarded to each source's scraper_func
        sources: Candidate ScraperSource configs (any order)
        verbose: Whether to print status messages

    Returns:
        Games from the first successful source, or [] if every source fails.
    """
    ordered = sorted(sources, key=lambda s: s.priority)
    total = len(ordered)
    for position, src in enumerate(ordered, start=1):
        try:
            if verbose:
                print(f" [{position}/{total}] Trying {src.name}...")
            games = src.scraper_func(season)
            if games and len(games) >= src.min_games:
                if verbose:
                    print(f"{src.name} returned {len(games)} games")
                return games
            if verbose:
                count = len(games) if games else 0
                print(f"{src.name} returned only {count} games (min: {src.min_games})")
        except Exception as e:
            if verbose:
                print(f"{src.name} failed: {e}")
            continue
    # Every source either raised or fell short of its minimum.
    if verbose:
        print(f" ⚠ All {total} sources failed for {sport}")
    return []
@dataclass
class StadiumScraperSource:
    """One candidate data source for scraping venue metadata.

    ``scraper_func`` takes no arguments and returns a list of Stadium
    objects. Sources are attempted in ascending ``priority`` order, and
    a run counts as successful only when it yields ``min_venues`` or more.
    """

    name: str
    scraper_func: Callable[[], list]  # () -> list[Stadium]
    priority: int = 1                 # lower = tried first (1 is best)
    min_venues: int = 5               # success threshold
def scrape_stadiums_with_fallback(
    sport: str,
    sources: list[StadiumScraperSource],
    verbose: bool = True
) -> list[Stadium]:
    """Return venues from the first stadium source that yields enough results.

    Mirrors scrape_with_fallback: sources are attempted in ascending
    priority order, and a source succeeds when it returns at least its
    own ``min_venues`` venues. Sources that raise or fall short are
    reported (when *verbose*) and skipped.

    Args:
        sport: Sport name, used only in log messages
        sources: Candidate StadiumScraperSource configs (any order)
        verbose: Whether to print status messages

    Returns:
        Stadiums from the first successful source, or [] if every source fails.
    """
    ordered = sorted(sources, key=lambda s: s.priority)
    total = len(ordered)
    for position, src in enumerate(ordered, start=1):
        try:
            if verbose:
                print(f" [{position}/{total}] Trying {src.name}...")
            stadiums = src.scraper_func()
            if stadiums and len(stadiums) >= src.min_venues:
                if verbose:
                    print(f"{src.name} returned {len(stadiums)} venues")
                return stadiums
            if verbose:
                count = len(stadiums) if stadiums else 0
                print(f"{src.name} returned only {count} venues (min: {src.min_venues})")
        except Exception as e:
            if verbose:
                print(f"{src.name} failed: {e}")
            continue
    # Every source either raised or fell short of its minimum.
    if verbose:
        print(f" ⚠ All {total} sources failed for {sport}")
    return []
# =============================================================================
# ID GENERATION
# =============================================================================
def assign_stable_ids(games: list[Game], sport: str, season: str) -> list[Game]:
    """Assign deterministic IDs derived from matchup + date (mutates in place).

    Format: ``{sport}_{season}_{away}_{home}_{MMDD}``, with a ``_2``/``_3``
    suffix on the second and later games of a doubleheader. When a game
    is rescheduled its old ID becomes orphaned and a new one is created;
    use --delete-all before import to clean up orphaned records.

    Returns the same list, with each Game's ``id`` field rewritten.
    """
    sport_token = sport.lower()
    season_token = season.replace('-', '')
    # Occurrences of each base ID so far, to number doubleheaders.
    seen: dict[str, int] = defaultdict(int)
    for game in games:
        parts = game.date.split('-')
        # Dates are expected as YYYY-MM-DD; anything else maps to "0000".
        mmdd = f"{parts[1]}{parts[2]}" if len(parts) == 3 else "0000"
        base_id = (
            f"{sport_token}_{season_token}_"
            f"{game.away_team_abbrev.lower()}_{game.home_team_abbrev.lower()}_{mmdd}"
        )
        seen[base_id] += 1
        # First occurrence keeps the bare ID; later ones get a numeric suffix.
        game.id = base_id if seen[base_id] == 1 else f"{base_id}_{seen[base_id]}"
    return games
# =============================================================================
# EXPORT UTILITIES
# =============================================================================
def export_to_json(games: list[Game], stadiums: list[Stadium], output_dir: Path) -> None:
    """
    Export scraped data to organized JSON (and CSV) files under *output_dir*.

    Structure:
        data/
            games/
                mlb_2025.json
                nba_2025.json
                ...
            canonical/
                stadiums.json
            games.json      (combined, for backward compatibility)
            stadiums.json   (legacy, for backward compatibility)
            games.csv / stadiums.csv (for easy viewing)

    Args:
        games: All scraped games, any mix of sports/seasons
        stadiums: All scraped venues
        output_dir: Root directory; created (with parents) if missing
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    # Create subdirectories
    games_dir = output_dir / 'games'
    canonical_dir = output_dir / 'canonical'
    games_dir.mkdir(exist_ok=True)
    canonical_dir.mkdir(exist_ok=True)

    # Group games by "{sport}_{season}"; defaultdict replaces the manual
    # key-existence check (the module already uses it in assign_stable_ids).
    games_by_sport_season: dict[str, list[Game]] = defaultdict(list)
    for game in games:
        games_by_sport_season[f"{game.sport.lower()}_{game.season}"].append(game)

    # One JSON file per sport/season. Explicit encoding keeps output
    # byte-identical across platforms (Windows defaults to cp1252).
    total_exported = 0
    for key, sport_games in games_by_sport_season.items():
        games_data = [asdict(g) for g in sport_games]
        filepath = games_dir / f"{key}.json"
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(games_data, f, indent=2)
        print(f" Exported {len(sport_games):,} games to games/{key}.json")
        total_exported += len(sport_games)

    # Export combined games.json for backward compatibility
    all_games_data = [asdict(g) for g in games]
    with open(output_dir / 'games.json', 'w', encoding='utf-8') as f:
        json.dump(all_games_data, f, indent=2)

    # Stadiums: canonical location, plus a root copy for backward compatibility.
    stadiums_data = [asdict(s) for s in stadiums]
    for stadium_path in (canonical_dir / 'stadiums.json', output_dir / 'stadiums.json'):
        with open(stadium_path, 'w', encoding='utf-8') as f:
            json.dump(stadiums_data, f, indent=2)

    # CSV mirrors for easy viewing in a spreadsheet.
    if games:
        pd.DataFrame(all_games_data).to_csv(output_dir / 'games.csv', index=False)
    if stadiums:
        pd.DataFrame(stadiums_data).to_csv(output_dir / 'stadiums.csv', index=False)

    print(f"\nExported {total_exported:,} games across {len(games_by_sport_season)} sport/season files")
    print(f"Exported {len(stadiums):,} stadiums to canonical/stadiums.json")
def validate_games(games_by_source: dict[str, list[Game]]) -> dict:
    """
    Cross-validate games from multiple sources against the first source.

    The first key in *games_by_source* is treated as the primary source;
    every other source is compared against it by game ID.

    Args:
        games_by_source: Mapping of source name -> games from that source

    Returns:
        dict with four discrepancy lists:
          - 'missing_in_source': primary games absent from a secondary source
          - 'date_mismatch' / 'time_mismatch' / 'venue_mismatch': games
            present in both sources whose corresponding fields disagree
        All lists are empty when fewer than two sources are provided.
    """
    discrepancies = {
        'missing_in_source': [],
        'date_mismatch': [],
        'time_mismatch': [],
        'venue_mismatch': [],
    }
    sources = list(games_by_source.keys())
    # Nothing to cross-check with fewer than two sources.
    if len(sources) < 2:
        return discrepancies

    primary = sources[0]
    primary_games = {g.id: g for g in games_by_source[primary]}
    for source in sources[1:]:
        secondary_games = {g.id: g for g in games_by_source[source]}
        for game_id, game in primary_games.items():
            other = secondary_games.get(game_id)
            if other is None:
                discrepancies['missing_in_source'].append({
                    'game_id': game_id,
                    'present_in': primary,
                    'missing_in': source
                })
                continue
            # Bug fix: these buckets were declared but never populated —
            # compare the fields the discrepancy keys promise to report.
            for field_name, bucket in (
                ('date', 'date_mismatch'),
                ('time', 'time_mismatch'),
                ('venue', 'venue_mismatch'),
            ):
                primary_value = getattr(game, field_name)
                secondary_value = getattr(other, field_name)
                if primary_value != secondary_value:
                    discrepancies[bucket].append({
                        'game_id': game_id,
                        'field': field_name,
                        'primary': primary_value,
                        'secondary': secondary_value,
                    })
    return discrepancies