feat(01-01): create core.py shared module
- Rate limiting utilities (REQUEST_DELAY, rate_limit, fetch_page) - Data classes (Game, Stadium) - Multi-source fallback system (ScraperSource, scrape_with_fallback) - Stadium fallback system (StadiumScraperSource, scrape_stadiums_with_fallback) - ID generation (assign_stable_ids) - Export utilities (export_to_json, validate_games) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
384
Scripts/core.py
Normal file
384
Scripts/core.py
Normal file
@@ -0,0 +1,384 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Core shared utilities for SportsTime data scrapers.
|
||||
|
||||
This module provides:
|
||||
- Rate limiting utilities
|
||||
- Data classes (Game, Stadium)
|
||||
- Multi-source fallback system
|
||||
- ID generation
|
||||
- Export utilities
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, asdict, field
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional, Callable
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
__all__ = [
|
||||
# Constants
|
||||
'REQUEST_DELAY',
|
||||
# Rate limiting
|
||||
'rate_limit',
|
||||
'fetch_page',
|
||||
# Data classes
|
||||
'Game',
|
||||
'Stadium',
|
||||
'ScraperSource',
|
||||
'StadiumScraperSource',
|
||||
# Fallback system
|
||||
'scrape_with_fallback',
|
||||
'scrape_stadiums_with_fallback',
|
||||
# ID generation
|
||||
'assign_stable_ids',
|
||||
# Export utilities
|
||||
'export_to_json',
|
||||
'validate_games',
|
||||
]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# RATE LIMITING
|
||||
# =============================================================================
|
||||
|
||||
# Minimum spacing between successive requests to the same domain.
REQUEST_DELAY = 3.0  # seconds between requests to same domain

# Timestamp of the most recent request, keyed by domain (module-level state).
last_request_time: dict[str, float] = {}


def rate_limit(domain: str) -> None:
    """Block until at least REQUEST_DELAY seconds have elapsed since the
    previous request to *domain*, then record the current time.

    The first call for a given domain returns immediately.
    """
    previous = last_request_time.get(domain)
    if previous is not None:
        remaining = REQUEST_DELAY - (time.time() - previous)
        if remaining > 0:
            time.sleep(remaining)
    last_request_time[domain] = time.time()
|
||||
|
||||
|
||||
def fetch_page(url: str, domain: str) -> Optional[BeautifulSoup]:
    """Fetch *url* (rate-limited per *domain*) and return the parsed document.

    Returns None on any request or parse failure; the error is printed
    rather than raised so callers can fall through to another source.
    """
    rate_limit(domain)
    # Browser-like headers: some schedule sites reject the default
    # python-requests User-Agent outright.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0',
    }
    try:
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
        return BeautifulSoup(resp.content, 'html.parser')
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DATA CLASSES
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class Game:
    """Represents a single game.

    Schedule fields are stored as plain strings in the formats noted below;
    IDs are generated afterwards by ``assign_stable_ids``.
    """
    id: str  # stable identifier: {sport}_{season}_{away}_{home}_{MMDD}
    sport: str  # sport code, e.g. "MLB" (lowercased when building IDs/filenames)
    season: str  # season label, e.g. "2025" or "2024-25"
    date: str  # YYYY-MM-DD
    time: Optional[str]  # HH:MM (24hr, ET)
    home_team: str  # full home-team name
    away_team: str  # full away-team name
    home_team_abbrev: str  # short code; used in generated IDs
    away_team_abbrev: str  # short code; used in generated IDs
    venue: str  # stadium/arena name
    source: str  # name of the scraper source that produced this record
    is_playoff: bool = False  # True for postseason games
    broadcast: Optional[str] = None  # broadcaster info when available
|
||||
|
||||
|
||||
@dataclass
class Stadium:
    """Represents a stadium/arena/ballpark."""
    id: str  # stable venue identifier
    name: str  # venue name
    city: str
    state: str
    latitude: float  # presumably decimal degrees — confirm against source data
    longitude: float  # presumably decimal degrees — confirm against source data
    capacity: int  # seating capacity
    sport: str  # sport hosted at this venue
    team_abbrevs: list  # abbreviations of teams that play here
    source: str  # name of the scraper source that produced this record
    year_opened: Optional[int] = None  # opening year, when known
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MULTI-SOURCE FALLBACK SYSTEM
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class ScraperSource:
    """Represents a single data source for scraping games.

    Consumed by ``scrape_with_fallback``, which tries sources in ascending
    priority order and accepts the first result meeting ``min_games``.
    """
    name: str  # human-readable source name, used in log output
    scraper_func: Callable[[int], list]  # Takes season, returns list[Game]
    priority: int = 1  # Lower = higher priority (1 is best)
    min_games: int = 10  # Minimum games to consider successful
|
||||
|
||||
|
||||
def scrape_with_fallback(
    sport: str,
    season: int,
    sources: list[ScraperSource],
    verbose: bool = True
) -> list[Game]:
    """
    Try each source in ascending priority order; return the first acceptable result.

    A source is accepted when it returns at least ``min_games`` games; otherwise
    (too few games, or an exception) the next source is tried.

    Args:
        sport: Sport name for logging
        season: Season year
        sources: List of ScraperSource configs, sorted by priority
        verbose: Whether to print status messages

    Returns:
        List of Game objects from the first successful source
    """
    ordered = sorted(sources, key=lambda src: src.priority)
    total = len(ordered)

    for position, src in enumerate(ordered, start=1):
        try:
            if verbose:
                print(f" [{position}/{total}] Trying {src.name}...")

            games = src.scraper_func(season)

            if games and len(games) >= src.min_games:
                if verbose:
                    print(f" ✓ {src.name} returned {len(games)} games")
                return games

            if verbose:
                found = len(games) if games else 0
                print(f" ✗ {src.name} returned only {found} games (min: {src.min_games})")

        except Exception as e:
            # A failing source is logged and skipped, never fatal.
            if verbose:
                print(f" ✗ {src.name} failed: {e}")
            continue

    # Every source either raised or came back short.
    if verbose:
        print(f" ⚠ All {total} sources failed for {sport}")
    return []
|
||||
|
||||
|
||||
@dataclass
class StadiumScraperSource:
    """Represents a single data source for stadium scraping.

    Consumed by ``scrape_stadiums_with_fallback``, which tries sources in
    ascending priority order and accepts the first result meeting ``min_venues``.
    """
    name: str  # human-readable source name, used in log output
    scraper_func: Callable[[], list]  # Returns list[Stadium]
    priority: int = 1  # Lower = higher priority (1 is best)
    min_venues: int = 5  # Minimum venues to consider successful
|
||||
|
||||
|
||||
def scrape_stadiums_with_fallback(
    sport: str,
    sources: list[StadiumScraperSource],
    verbose: bool = True
) -> list[Stadium]:
    """
    Try each stadium source in ascending priority order; return the first
    acceptable result.

    A source is accepted when it returns at least ``min_venues`` venues;
    otherwise (too few venues, or an exception) the next source is tried.

    Args:
        sport: Sport name for logging
        sources: List of StadiumScraperSource configs, sorted by priority
        verbose: Whether to print status messages

    Returns:
        List of Stadium objects from the first successful source
    """
    ordered = sorted(sources, key=lambda src: src.priority)
    total = len(ordered)

    for position, src in enumerate(ordered, start=1):
        try:
            if verbose:
                print(f" [{position}/{total}] Trying {src.name}...")

            venues = src.scraper_func()

            if venues and len(venues) >= src.min_venues:
                if verbose:
                    print(f" ✓ {src.name} returned {len(venues)} venues")
                return venues

            if verbose:
                found = len(venues) if venues else 0
                print(f" ✗ {src.name} returned only {found} venues (min: {src.min_venues})")

        except Exception as e:
            # A failing source is logged and skipped, never fatal.
            if verbose:
                print(f" ✗ {src.name} failed: {e}")
            continue

    # Every source either raised or came back short.
    if verbose:
        print(f" ⚠ All {total} sources failed for {sport}")
    return []
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# ID GENERATION
|
||||
# =============================================================================
|
||||
|
||||
def assign_stable_ids(games: list[Game], sport: str, season: str) -> list[Game]:
    """
    Assign IDs based on matchup + date.
    Format: {sport}_{season}_{away}_{home}_{MMDD} (or {MMDD}_2 for doubleheaders)

    When games are rescheduled, the old ID becomes orphaned and a new one is created.
    Use --delete-all before import to clean up orphaned records.
    """
    # "2024-25" style seasons collapse to "202425" inside the ID.
    compact_season = season.replace('-', '')

    # Occurrence count per base ID — a second occurrence is a doubleheader.
    seen: dict[str, int] = defaultdict(int)

    for g in games:
        # Extract MMDD from a YYYY-MM-DD date; anything else maps to "0000".
        parts = g.date.split('-')
        mmdd = parts[1] + parts[2] if len(parts) == 3 else "0000"

        base = "_".join([
            sport.lower(),
            compact_season,
            g.away_team_abbrev.lower(),
            g.home_team_abbrev.lower(),
            mmdd,
        ])
        seen[base] += 1

        # Game 2+ of a doubleheader gets a numeric suffix.
        g.id = base if seen[base] == 1 else f"{base}_{seen[base]}"

    return games
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# EXPORT UTILITIES
|
||||
# =============================================================================
|
||||
|
||||
def _write_json(path: Path, payload: list) -> None:
    """Serialize *payload* to *path* as pretty-printed JSON (UTF-8)."""
    # Explicit encoding: the platform default (e.g. cp1252 on Windows) is
    # not guaranteed, even though json.dump emits ASCII by default.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2)


def export_to_json(games: "list[Game]", stadiums: "list[Stadium]", output_dir: Path) -> None:
    """
    Export scraped data to organized JSON files.

    Structure:
        data/
            games/
                mlb_2025.json
                nba_2025.json
                ...
            canonical/
                stadiums.json
            games.json   (combined, for backward compatibility)
            stadiums.json (legacy, for backward compatibility)
            games.csv / stadiums.csv (flat exports for easy viewing)

    Args:
        games: All scraped games, any mix of sports/seasons.
        stadiums: All scraped venues.
        output_dir: Root output directory; created if missing.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create subdirectories
    games_dir = output_dir / 'games'
    canonical_dir = output_dir / 'canonical'
    games_dir.mkdir(exist_ok=True)
    canonical_dir.mkdir(exist_ok=True)

    # Group games by sport and season, e.g. key "mlb_2025".
    games_by_sport_season: dict[str, list] = defaultdict(list)
    for game in games:
        games_by_sport_season[f"{game.sport.lower()}_{game.season}"].append(game)

    # Export games by sport/season
    total_exported = 0
    for key, sport_games in games_by_sport_season.items():
        _write_json(games_dir / f"{key}.json", [asdict(g) for g in sport_games])
        print(f" Exported {len(sport_games):,} games to games/{key}.json")
        total_exported += len(sport_games)

    # Export combined games.json for backward compatibility
    all_games_data = [asdict(g) for g in games]
    _write_json(output_dir / 'games.json', all_games_data)

    # Export stadiums to canonical/ and also to the legacy root location.
    stadiums_data = [asdict(s) for s in stadiums]
    _write_json(canonical_dir / 'stadiums.json', stadiums_data)
    _write_json(output_dir / 'stadiums.json', stadiums_data)

    # Export as CSV for easy viewing
    if games:
        pd.DataFrame(all_games_data).to_csv(output_dir / 'games.csv', index=False)
    if stadiums:
        pd.DataFrame(stadiums_data).to_csv(output_dir / 'stadiums.csv', index=False)

    print(f"\nExported {total_exported:,} games across {len(games_by_sport_season)} sport/season files")
    print(f"Exported {len(stadiums):,} stadiums to canonical/stadiums.json")
|
||||
|
||||
|
||||
def validate_games(games_by_source: "dict[str, list[Game]]") -> dict:
    """
    Cross-validate games from multiple sources against the first (primary) source.

    Fix: the original declared date/time/venue mismatch buckets but never
    populated them; games present in both sources are now field-compared.

    Args:
        games_by_source: Mapping of source name -> its scraped games. The
            first key is treated as the primary (reference) source.

    Returns:
        Dict of discrepancy lists:
            'missing_in_source': games in the primary absent from another source
            'date_mismatch' / 'time_mismatch' / 'venue_mismatch': field
                disagreements, each entry keyed by game_id, field, and the
                two source names with their respective values.
    """
    discrepancies: dict = {
        'missing_in_source': [],
        'date_mismatch': [],
        'time_mismatch': [],
        'venue_mismatch': [],
    }

    sources = list(games_by_source.keys())
    if len(sources) < 2:
        # Nothing to cross-check with a single source.
        return discrepancies

    primary = sources[0]
    primary_games = {g.id: g for g in games_by_source[primary]}

    for source in sources[1:]:
        secondary_games = {g.id: g for g in games_by_source[source]}

        for game_id, game in primary_games.items():
            other = secondary_games.get(game_id)
            if other is None:
                discrepancies['missing_in_source'].append({
                    'game_id': game_id,
                    'present_in': primary,
                    'missing_in': source
                })
                continue

            # Fields both sources should agree on for the same game.
            for field_name, bucket in (
                ('date', 'date_mismatch'),
                ('time', 'time_mismatch'),
                ('venue', 'venue_mismatch'),
            ):
                ours = getattr(game, field_name)
                theirs = getattr(other, field_name)
                if ours != theirs:
                    discrepancies[bucket].append({
                        'game_id': game_id,
                        'field': field_name,
                        primary: ours,
                        source: theirs,
                    })

    return discrepancies
|
||||
Reference in New Issue
Block a user