- Rate limiting utilities (REQUEST_DELAY, rate_limit, fetch_page) - Data classes (Game, Stadium) - Multi-source fallback system (ScraperSource, scrape_with_fallback) - Stadium fallback system (StadiumScraperSource, scrape_stadiums_with_fallback) - ID generation (assign_stable_ids) - Export utilities (export_to_json, validate_games) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
385 lines
12 KiB
Python
385 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Core shared utilities for SportsTime data scrapers.
|
|
|
|
This module provides:
|
|
- Rate limiting utilities
|
|
- Data classes (Game, Stadium)
|
|
- Multi-source fallback system
|
|
- ID generation
|
|
- Export utilities
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass, asdict, field
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from typing import Optional, Callable
|
|
|
|
import pandas as pd
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
# Public API: the names exported by `from <this module> import *`.
__all__ = [
    # Constants
    'REQUEST_DELAY',
    # Rate limiting
    'rate_limit',
    'fetch_page',
    # Data classes
    'Game',
    'Stadium',
    'ScraperSource',
    'StadiumScraperSource',
    # Fallback system
    'scrape_with_fallback',
    'scrape_stadiums_with_fallback',
    # ID generation
    'assign_stable_ids',
    # Export utilities
    'export_to_json',
    'validate_games',
]
|
|
|
|
|
|
# =============================================================================
# RATE LIMITING
# =============================================================================

REQUEST_DELAY = 3.0  # seconds between requests to same domain
last_request_time: dict[str, float] = {}


def rate_limit(domain: str) -> None:
    """Block until at least REQUEST_DELAY seconds since the last request to *domain*.

    Reads and updates the module-level ``last_request_time`` table; the first
    call for a domain never sleeps.
    """
    previous = last_request_time.get(domain)
    if previous is not None:
        remaining = REQUEST_DELAY - (time.time() - previous)
        if remaining > 0:
            time.sleep(remaining)
    last_request_time[domain] = time.time()
|
|
|
|
|
|
def fetch_page(url: str, domain: str) -> Optional[BeautifulSoup]:
    """Fetch *url* and return it parsed with html.parser, or None on any error.

    Enforces the per-domain rate limit before issuing the request, and sends
    browser-like headers to reduce the chance of being blocked. Errors are
    printed and swallowed (best-effort scraping).
    """
    rate_limit(domain)
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0',
    }
    try:
        resp = requests.get(url, headers=browser_headers, timeout=30)
        resp.raise_for_status()
        return BeautifulSoup(resp.content, 'html.parser')
    except Exception as exc:
        print(f"Error fetching {url}: {exc}")
        return None
|
|
|
|
|
|
# =============================================================================
|
|
# DATA CLASSES
|
|
# =============================================================================
|
|
|
|
@dataclass
class Game:
    """Represents a single game.

    A flat, JSON-serializable record produced by the sport scrapers and
    written out by export_to_json().
    """
    id: str  # stable identifier; filled in by assign_stable_ids()
    sport: str  # sport code; lowercased when used in filenames and IDs
    season: str  # season label, e.g. "2025" or "2024-25"
    date: str  # YYYY-MM-DD
    time: Optional[str]  # HH:MM (24hr, ET)
    home_team: str
    away_team: str
    home_team_abbrev: str  # short team code; lowercased in generated IDs
    away_team_abbrev: str  # short team code; lowercased in generated IDs
    venue: str
    source: str  # name of the data source that produced this record
    is_playoff: bool = False
    broadcast: Optional[str] = None  # broadcaster, when the source provides one
|
|
|
|
|
|
@dataclass
class Stadium:
    """Represents a stadium/arena/ballpark.

    A flat, JSON-serializable record exported to canonical/stadiums.json
    by export_to_json().
    """
    id: str
    name: str
    city: str
    state: str
    latitude: float
    longitude: float
    capacity: int
    sport: str
    team_abbrevs: list[str]  # abbreviations of the teams that play here
    source: str  # name of the data source that produced this record
    year_opened: Optional[int] = None  # None when the source does not report it
|
|
|
|
|
|
# =============================================================================
|
|
# MULTI-SOURCE FALLBACK SYSTEM
|
|
# =============================================================================
|
|
|
|
@dataclass
class ScraperSource:
    """Represents a single data source for scraping games.

    Consumed by scrape_with_fallback(), which tries sources in ascending
    priority order until one yields at least ``min_games`` games.
    """
    name: str  # human-readable label, used in log output
    scraper_func: Callable[[int], list]  # Takes season, returns list[Game]
    priority: int = 1  # Lower = higher priority (1 is best)
    min_games: int = 10  # Minimum games to consider successful
|
|
|
|
|
|
def scrape_with_fallback(
    sport: str,
    season: int,
    sources: list[ScraperSource],
    verbose: bool = True
) -> list[Game]:
    """
    Try multiple sources in priority order until one succeeds.

    A source "succeeds" when it returns at least its own ``min_games``
    games; exceptions from a source are logged and treated as failure.

    Args:
        sport: Sport name for logging
        season: Season year
        sources: List of ScraperSource configs (sorted here by priority)
        verbose: Whether to print status messages

    Returns:
        List of Game objects from the first successful source, or [] when
        every source fails.
    """
    ordered = sorted(sources, key=lambda s: s.priority)
    total = len(ordered)

    for idx, src in enumerate(ordered, start=1):
        try:
            if verbose:
                print(f" [{idx}/{total}] Trying {src.name}...")

            games = src.scraper_func(season)
            count = len(games) if games else 0

            if games and count >= src.min_games:
                if verbose:
                    print(f" ✓ {src.name} returned {count} games")
                return games

            if verbose:
                print(f" ✗ {src.name} returned only {count} games (min: {src.min_games})")
        except Exception as exc:
            # A broken source must never abort the fallback chain.
            if verbose:
                print(f" ✗ {src.name} failed: {exc}")

    # All sources failed
    if verbose:
        print(f" ⚠ All {total} sources failed for {sport}")
    return []
|
|
|
|
|
|
@dataclass
class StadiumScraperSource:
    """Represents a single data source for stadium scraping.

    Consumed by scrape_stadiums_with_fallback(), which tries sources in
    ascending priority order until one yields at least ``min_venues`` venues.
    """
    name: str  # human-readable label, used in log output
    scraper_func: Callable[[], list]  # Returns list[Stadium]
    priority: int = 1  # Lower = higher priority (1 is best)
    min_venues: int = 5  # Minimum venues to consider successful
|
|
|
|
|
|
def scrape_stadiums_with_fallback(
    sport: str,
    sources: list[StadiumScraperSource],
    verbose: bool = True
) -> list[Stadium]:
    """
    Try multiple stadium sources in priority order until one succeeds.

    A source "succeeds" when it returns at least its own ``min_venues``
    venues; exceptions from a source are logged and treated as failure.

    Args:
        sport: Sport name for logging
        sources: List of StadiumScraperSource configs (sorted here by priority)
        verbose: Whether to print status messages

    Returns:
        List of Stadium objects from the first successful source, or []
        when every source fails.
    """
    ordered = sorted(sources, key=lambda s: s.priority)
    total = len(ordered)

    for idx, src in enumerate(ordered, start=1):
        try:
            if verbose:
                print(f" [{idx}/{total}] Trying {src.name}...")

            stadiums = src.scraper_func()
            count = len(stadiums) if stadiums else 0

            if stadiums and count >= src.min_venues:
                if verbose:
                    print(f" ✓ {src.name} returned {count} venues")
                return stadiums

            if verbose:
                print(f" ✗ {src.name} returned only {count} venues (min: {src.min_venues})")
        except Exception as exc:
            # A broken source must never abort the fallback chain.
            if verbose:
                print(f" ✗ {src.name} failed: {exc}")

    # All sources failed
    if verbose:
        print(f" ⚠ All {total} sources failed for {sport}")
    return []
|
|
|
|
|
|
# =============================================================================
|
|
# ID GENERATION
|
|
# =============================================================================
|
|
|
|
def assign_stable_ids(games: list[Game], sport: str, season: str) -> list[Game]:
    """
    Assign IDs based on matchup + date, mutating each game in place.

    Format: {sport}_{season}_{away}_{home}_{MMDD}, with a _2/_3/... suffix
    for the second and later games of a doubleheader (same matchup, same
    date). Hyphens are stripped from the season (e.g. "2024-25" -> "202425").
    A date that is not YYYY-MM-DD falls back to "0000" for the MMDD part.

    When games are rescheduled, the old ID becomes orphaned and a new one is created.
    Use --delete-all before import to clean up orphaned records.

    Returns the same list, for convenience.
    """
    compact_season = season.replace('-', '')

    # Occurrence count per base ID, so doubleheaders get numbered suffixes.
    seen: dict[str, int] = defaultdict(int)

    for g in games:
        pieces = g.date.split('-')
        mmdd = pieces[1] + pieces[2] if len(pieces) == 3 else "0000"

        base = "_".join([
            sport.lower(),
            compact_season,
            g.away_team_abbrev.lower(),
            g.home_team_abbrev.lower(),
            mmdd,
        ])

        seen[base] += 1
        occurrence = seen[base]
        g.id = base if occurrence == 1 else f"{base}_{occurrence}"

    return games
|
|
|
|
|
|
# =============================================================================
|
|
# EXPORT UTILITIES
|
|
# =============================================================================
|
|
|
|
def export_to_json(games: list[Game], stadiums: list[Stadium], output_dir: Path) -> None:
    """
    Export scraped data to organized JSON files (plus CSV copies).

    Args:
        games: All scraped games, across any mix of sports/seasons.
        stadiums: All scraped stadiums.
        output_dir: Root output directory; created if missing.

    Structure:
        data/
          games/
            mlb_2025.json
            nba_2025.json
            ...
          canonical/
            stadiums.json
          games.json     (combined, legacy, for backward compatibility)
          stadiums.json  (legacy, for backward compatibility)
          games.csv / stadiums.csv (for easy viewing)
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create subdirectories
    games_dir = output_dir / 'games'
    canonical_dir = output_dir / 'canonical'
    games_dir.mkdir(exist_ok=True)
    canonical_dir.mkdir(exist_ok=True)

    # Group games by sport and season (key: "<sport>_<season>")
    games_by_sport_season: dict[str, list[Game]] = defaultdict(list)
    for game in games:
        games_by_sport_season[f"{game.sport.lower()}_{game.season}"].append(game)

    # Export games by sport/season
    total_exported = 0
    for key, sport_games in games_by_sport_season.items():
        games_data = [asdict(g) for g in sport_games]
        filepath = games_dir / f"{key}.json"
        # Explicit encoding: venue/team names may contain non-ASCII characters.
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(games_data, f, indent=2)
        print(f" Exported {len(sport_games):,} games to games/{key}.json")
        total_exported += len(sport_games)

    # Export combined games.json for backward compatibility
    all_games_data = [asdict(g) for g in games]
    with open(output_dir / 'games.json', 'w', encoding='utf-8') as f:
        json.dump(all_games_data, f, indent=2)

    # Export stadiums to canonical/
    stadiums_data = [asdict(s) for s in stadiums]
    with open(canonical_dir / 'stadiums.json', 'w', encoding='utf-8') as f:
        json.dump(stadiums_data, f, indent=2)

    # Also export to root for backward compatibility
    with open(output_dir / 'stadiums.json', 'w', encoding='utf-8') as f:
        json.dump(stadiums_data, f, indent=2)

    # Export as CSV for easy viewing
    if games:
        df_games = pd.DataFrame(all_games_data)
        df_games.to_csv(output_dir / 'games.csv', index=False)

    if stadiums:
        df_stadiums = pd.DataFrame(stadiums_data)
        df_stadiums.to_csv(output_dir / 'stadiums.csv', index=False)

    print(f"\nExported {total_exported:,} games across {len(games_by_sport_season)} sport/season files")
    print(f"Exported {len(stadiums):,} stadiums to canonical/stadiums.json")
|
|
|
|
|
|
def validate_games(games_by_source: dict[str, list[Game]]) -> dict:
    """
    Cross-validate games from multiple sources.

    The first source (dict insertion order) is treated as primary; every
    other source is compared against it by game ID.

    Args:
        games_by_source: Mapping of source name -> list of Game objects.

    Returns:
        Dict of discrepancy lists:
          - 'missing_in_source': game IDs present in the primary source but
            absent from a secondary source
          - 'date_mismatch' / 'time_mismatch' / 'venue_mismatch': games found
            in both sources whose field values disagree

    Note: previously only 'missing_in_source' was ever populated; the
    field-level buckets were declared but never filled.
    """
    discrepancies = {
        'missing_in_source': [],
        'date_mismatch': [],
        'time_mismatch': [],
        'venue_mismatch': [],
    }

    sources = list(games_by_source.keys())
    if len(sources) < 2:
        # Nothing to cross-check against with fewer than two sources.
        return discrepancies

    primary = sources[0]
    primary_games = {g.id: g for g in games_by_source[primary]}

    for source in sources[1:]:
        secondary_games = {g.id: g for g in games_by_source[source]}

        for game_id, game in primary_games.items():
            other = secondary_games.get(game_id)
            if other is None:
                discrepancies['missing_in_source'].append({
                    'game_id': game_id,
                    'present_in': primary,
                    'missing_in': source
                })
                continue

            # Same game in both sources: compare the fields we track.
            for field_name, bucket in (
                ('date', 'date_mismatch'),
                ('time', 'time_mismatch'),
                ('venue', 'venue_mismatch'),
            ):
                primary_value = getattr(game, field_name)
                other_value = getattr(other, field_name)
                if primary_value != other_value:
                    discrepancies[bucket].append({
                        'game_id': game_id,
                        'field': field_name,
                        'primary_source': primary,
                        'primary_value': primary_value,
                        'other_source': source,
                        'other_value': other_value,
                    })

    return discrepancies
|