#!/usr/bin/env python3
"""
Sports Schedule Scraper Orchestrator

This script coordinates scraping across sport-specific modules:
- core.py: Shared utilities, data classes, fallback system
- mlb.py: MLB scrapers
- nba.py: NBA scrapers
- nhl.py: NHL scrapers
- nfl.py: NFL scrapers
- mls.py: MLS stadiums
- wnba.py: WNBA stadiums
- nwsl.py: NWSL stadiums

Usage:
    python scrape_schedules.py --sport nba --season 2026
    python scrape_schedules.py --sport all --season 2026
    python scrape_schedules.py --stadiums-only
"""

import argparse
import csv
import json
import time
from collections import defaultdict
from dataclasses import asdict
from datetime import datetime
from io import StringIO
from pathlib import Path
from typing import Optional

import requests

# Import from core module
from core import (
    Game,
    Stadium,
    ScraperSource,
    StadiumScraperSource,
    fetch_page,
    scrape_with_fallback,
    scrape_stadiums_with_fallback,
    assign_stable_ids,
    export_to_json,
)

# Import from sport modules (core 4 sports)
from mlb import (
    scrape_mlb_games,
    scrape_mlb_stadiums,
    MLB_TEAMS,
)
from nba import (
    scrape_nba_games,
    scrape_nba_stadiums,
    get_nba_season_string,
    NBA_TEAMS,
)
from nhl import (
    scrape_nhl_games,
    scrape_nhl_stadiums,
    get_nhl_season_string,
    NHL_TEAMS,
)
from nfl import (
    scrape_nfl_games,
    scrape_nfl_stadiums,
    get_nfl_season_string,
    NFL_TEAMS,
)
from mls import (
    MLS_TEAMS,
    get_mls_team_abbrev,
    scrape_mls_stadiums,
    MLS_STADIUM_SOURCES,
)
from wnba import (
    WNBA_TEAMS,
    get_wnba_team_abbrev,
    scrape_wnba_stadiums,
    WNBA_STADIUM_SOURCES,
)
from nwsl import (
    NWSL_TEAMS,
    get_nwsl_team_abbrev,
    scrape_nwsl_stadiums,
    NWSL_STADIUM_SOURCES,
)


# =============================================================================
# NON-CORE SPORT SCRAPERS
# NOTE: MLS, WNBA, NWSL stadiums are now imported from their respective modules
# =============================================================================

def _scrape_espn_schedule(sport: str, league: str, season: int,
                          date_range: tuple[str, str]) -> list[Game]:
    """
    Fetch schedule from ESPN API.
    Shared helper for non-core sports that use ESPN API.

    Args:
        sport: ESPN sport path segment (e.g. 'basketball', 'soccer').
        league: ESPN league path segment (e.g. 'wnba', 'usa.1', 'usa.nwsl').
        season: Season year, stored on each Game as a string.
        date_range: (start, end) dates in YYYYMMDD form for the 'dates' param.

    Returns:
        List of Game objects; empty on any fetch/parse failure (best-effort).
    """
    games: list[Game] = []
    # Map ESPN league codes to our canonical sport labels.
    sport_upper = {
        'wnba': 'WNBA',
        'usa.1': 'MLS',
        'usa.nwsl': 'NWSL',
    }.get(league, league.upper())
    print(f"Fetching {sport_upper} {season} from ESPN API...")
    url = f"https://site.api.espn.com/apis/site/v2/sports/{sport}/{league}/scoreboard"
    params = {
        'dates': f"{date_range[0]}-{date_range[1]}",
        'limit': 1000
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        events = data.get('events', [])
        for event in events:
            try:
                # ESPN dates look like '2026-05-01T23:00Z'; slice out the
                # date and (if present) the HH:MM portion.
                date_str = event.get('date', '')[:10]
                time_str = (event.get('date', '')[11:16]
                            if len(event.get('date', '')) > 11 else None)
                competitions = event.get('competitions', [{}])
                if not competitions:
                    continue
                comp = competitions[0]
                competitors = comp.get('competitors', [])
                if len(competitors) < 2:
                    continue
                home_team = None
                away_team = None
                home_abbrev = None
                away_abbrev = None
                for team in competitors:
                    team_data = team.get('team', {})
                    team_name = team_data.get('displayName', team_data.get('name', ''))
                    team_abbrev = team_data.get('abbreviation', '')
                    if team.get('homeAway') == 'home':
                        home_team = team_name
                        home_abbrev = team_abbrev
                    else:
                        away_team = team_name
                        away_abbrev = team_abbrev
                if not home_team or not away_team:
                    continue
                venue = comp.get('venue', {}).get('fullName', '')
                # Resolve abbreviations BEFORE building the id.  The original
                # code used the raw (possibly empty) ESPN abbreviations in the
                # id and only fell back to get_team_abbrev for the Game fields,
                # which produced ids with empty segments like 'mls_2026-03-01__'.
                home_abbrev = home_abbrev or get_team_abbrev(home_team, sport_upper)
                away_abbrev = away_abbrev or get_team_abbrev(away_team, sport_upper)
                game_id = f"{sport_upper.lower()}_{date_str}_{away_abbrev}_{home_abbrev}".lower()
                game = Game(
                    id=game_id,
                    sport=sport_upper,
                    season=str(season),
                    date=date_str,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=home_abbrev,
                    away_team_abbrev=away_abbrev,
                    venue=venue,
                    source='espn.com'
                )
                games.append(game)
            except Exception:
                # Best-effort: skip malformed events rather than abort the feed.
                continue
        print(f" Found {len(games)} games from ESPN")
    except Exception as e:
        print(f"Error fetching ESPN {sport_upper}: {e}")
    return games


def scrape_wnba_espn(season: int) -> list[Game]:
    """Fetch WNBA schedule from ESPN API."""
    # WNBA regular season runs roughly May-October.
    start = f"{season}0501"
    end = f"{season}1031"
    return _scrape_espn_schedule('basketball', 'wnba', season, (start, end))


def scrape_mls_espn(season: int) -> list[Game]:
    """Fetch MLS schedule from ESPN API."""
    start = f"{season}0201"
    end = f"{season}1231"
    return _scrape_espn_schedule('soccer', 'usa.1', season, (start, end))


def scrape_nwsl_espn(season: int) -> list[Game]:
    """Fetch NWSL schedule from ESPN API."""
    start = f"{season}0301"
    end = f"{season}1130"
    return _scrape_espn_schedule('soccer', 'usa.nwsl', season, (start, end))


def scrape_wnba_basketball_reference(season: int) -> list[Game]:
    """Scrape WNBA schedule from Basketball-Reference.

    Returns an empty list when the page or schedule table is unavailable.
    """
    games: list[Game] = []
    url = f"https://www.basketball-reference.com/wnba/years/{season}_games.html"
    print(f"Scraping WNBA {season} from Basketball-Reference...")
    soup = fetch_page(url, 'basketball-reference.com')
    if not soup:
        return games
    table = soup.find('table', {'id': 'schedule'})
    if not table:
        return games
    tbody = table.find('tbody')
    if not tbody:
        return games
    for row in tbody.find_all('tr'):
        # Skip repeated header rows embedded in the body.
        if row.get('class') and 'thead' in row.get('class'):
            continue
        try:
            date_cell = row.find('th', {'data-stat': 'date_game'})
            if not date_cell:
                continue
            date_link = date_cell.find('a')
            date_str = date_link.text if date_link else date_cell.text
            visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
            home_cell = row.find('td', {'data-stat': 'home_team_name'})
            if not visitor_cell or not home_cell:
                continue
            visitor_link = visitor_cell.find('a')
            home_link = home_cell.find('a')
            away_team = visitor_link.text if visitor_link else visitor_cell.text
            home_team = home_link.text if home_link else home_cell.text
            try:
                parsed_date = datetime.strptime(date_str.strip(), '%a, %b %d, %Y')
                date_formatted = parsed_date.strftime('%Y-%m-%d')
            except ValueError:
                # Was a bare 'except:' — narrowed to the only error strptime
                # raises for a bad date string.
                continue
            away_abbrev = get_team_abbrev(away_team, 'WNBA')
            home_abbrev = get_team_abbrev(home_team, 'WNBA')
            game_id = f"wnba_{date_formatted}_{away_abbrev}_{home_abbrev}".lower().replace(' ', '')
            game = Game(
                id=game_id,
                sport='WNBA',
                season=str(season),
                date=date_formatted,
                time=None,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=home_abbrev,
                away_team_abbrev=away_abbrev,
                venue='',
                source='basketball-reference.com'
            )
            games.append(game)
        except Exception:
            # Best-effort: skip malformed rows rather than abort the table.
            continue
    print(f" Found {len(games)} games from Basketball-Reference")
    return games


def scrape_wnba_cbssports(season: int) -> list[Game]:
    """Fetch WNBA schedule from CBS Sports."""
    games: list[Game] = []
    print(f"Fetching WNBA {season} from CBS Sports...")
    # Placeholder - CBS Sports scraping would go here
    print(f" Found {len(games)} games from CBS Sports")
    return games


def scrape_mls_fbref(season: int) -> list[Game]:
    """Scrape MLS schedule from FBref."""
    games: list[Game] = []
    print(f"Scraping MLS {season} from FBref...")
    # Placeholder - FBref scraping would go here
    print(f" Found {len(games)} games from FBref")
    return games


def scrape_mls_mlssoccer(season: int) -> list[Game]:
    """Scrape MLS schedule from MLSSoccer.com."""
    games: list[Game] = []
    print(f"Scraping MLS {season} from MLSSoccer.com...")
    # Placeholder - MLSSoccer.com scraping would go here
    print(f" Found {len(games)} games from MLSSoccer.com")
    return games


def scrape_nwsl_fbref(season: int) -> list[Game]:
    """Scrape NWSL schedule from FBref."""
    games: list[Game] = []
    print(f"Scraping NWSL {season} from FBref...")
    # Placeholder - FBref scraping would go here
    print(f" Found {len(games)} games from FBref")
    return games


def scrape_nwsl_nwslsoccer(season: int) -> list[Game]:
    """Scrape NWSL schedule from NWSL.com."""
    games: list[Game] = []
    print(f"Scraping NWSL {season} from NWSL.com...")
    # Placeholder - NWSL.com scraping would go here
    print(f" Found {len(games)} games from NWSL.com")
    return games


# =============================================================================
# LEGACY STADIUM FUNCTIONS
# =============================================================================

def scrape_stadiums_hifld() -> list[Stadium]:
    """Legacy: Scrape from HIFLD open data."""
    # Placeholder for legacy HIFLD scraping
    return []


def generate_stadiums_from_teams() -> list[Stadium]:
    """Generate stadium entries from team data with hardcoded coordinates."""
    stadiums: list[Stadium] = []
    # This function would generate stadiums from all team dictionaries
    # Keeping as placeholder since sport modules have their own stadium scrapers
    return stadiums


def scrape_all_stadiums() -> list[Stadium]:
    """Comprehensive stadium scraping for all sports.

    Aggregates the per-sport stadium scrapers (4 core + 3 non-core leagues).
    """
    all_stadiums: list[Stadium] = []
    # Core sports (from modules)
    all_stadiums.extend(scrape_mlb_stadiums())
    all_stadiums.extend(scrape_nba_stadiums())
    all_stadiums.extend(scrape_nhl_stadiums())
    all_stadiums.extend(scrape_nfl_stadiums())
    # Non-core sports
    all_stadiums.extend(scrape_mls_stadiums())
    all_stadiums.extend(scrape_wnba_stadiums())
    all_stadiums.extend(scrape_nwsl_stadiums())
    return all_stadiums


# =============================================================================
# HELPERS
# =============================================================================

def get_team_abbrev(team_name: str, sport: str) -> str:
    """Get team abbreviation from full name.

    Tries an exact (case-insensitive) name match first, then a substring
    match, and finally falls back to the first three letters uppercased.
    """
    teams = {
        'NBA': NBA_TEAMS,
        'MLB': MLB_TEAMS,
        'NHL': NHL_TEAMS,
        'NFL': NFL_TEAMS,
        'WNBA': WNBA_TEAMS,
        'MLS': MLS_TEAMS,
        'NWSL': NWSL_TEAMS,
    }.get(sport, {})
    needle = team_name.lower()
    # Pass 1: exact match always wins.  (The original checked exact and
    # substring in the same loop iteration, so a substring hit on an earlier
    # dict entry could shadow an exact match on a later one.)
    for abbrev, info in teams.items():
        if info['name'].lower() == needle:
            return abbrev
    # Pass 2: substring fallback.
    for abbrev, info in teams.items():
        if needle in info['name'].lower():
            return abbrev
    # Return first 3 letters as fallback
    return team_name[:3].upper()


# =============================================================================
# MAIN ORCHESTRATOR
# =============================================================================

def _banner(title: str) -> None:
    """Print a 60-char section banner around *title*."""
    print("\n" + "=" * 60)
    print(title)
    print("=" * 60)


def _scrape_stadium_data(stadiums_update: bool) -> list[Stadium]:
    """Scrape stadium data.

    Uses the comprehensive per-sport scrapers when *stadiums_update* is set,
    otherwise the legacy HIFLD + generated-from-teams path.
    """
    _banner("SCRAPING STADIUMS")
    stadiums: list[Stadium] = []
    if stadiums_update:
        print("Using comprehensive stadium scrapers for all sports...")
        stadiums.extend(scrape_all_stadiums())
        print(f" Total stadiums scraped: {len(stadiums)}")
    else:
        stadiums.extend(scrape_stadiums_hifld())
        stadiums.extend(generate_stadiums_from_teams())
    return stadiums


def main():
    """CLI entry point: scrape stadiums and schedules, then export JSON."""
    parser = argparse.ArgumentParser(description='Scrape sports schedules')
    parser.add_argument('--sport',
                        choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'all'],
                        default='all')
    parser.add_argument('--season', type=int, default=2026,
                        help='Season year (ending year)')
    parser.add_argument('--stadiums-only', action='store_true',
                        help='Only scrape stadium data (legacy method)')
    parser.add_argument('--stadiums-update', action='store_true',
                        help='Scrape ALL stadium data for all 8 sports (comprehensive)')
    parser.add_argument('--output', type=str, default='./data',
                        help='Output directory')
    args = parser.parse_args()

    output_dir = Path(args.output)
    season = args.season
    all_games: list[Game] = []

    all_stadiums = _scrape_stadium_data(args.stadiums_update)

    # If stadiums-only mode, export and exit
    if args.stadiums_only:
        export_to_json([], all_stadiums, output_dir)
        return

    # --- Core sports: single scraper + league-specific season string -------
    if args.sport in ('nba', 'all'):
        _banner(f"SCRAPING NBA {season}")
        games = scrape_nba_games(season)
        all_games.extend(assign_stable_ids(games, 'NBA', get_nba_season_string(season)))

    if args.sport in ('mlb', 'all'):
        _banner(f"SCRAPING MLB {season}")
        games = scrape_mlb_games(season)
        all_games.extend(assign_stable_ids(games, 'MLB', str(season)))

    if args.sport in ('nhl', 'all'):
        _banner(f"SCRAPING NHL {season}")
        games = scrape_nhl_games(season)
        all_games.extend(assign_stable_ids(games, 'NHL', get_nhl_season_string(season)))

    if args.sport in ('nfl', 'all'):
        _banner(f"SCRAPING NFL {season}")
        games = scrape_nfl_games(season)
        all_games.extend(assign_stable_ids(games, 'NFL', get_nfl_season_string(season)))

    # --- Non-core sports: prioritized multi-source fallback chains ---------
    # (TODO: Extract to modules)
    if args.sport in ('wnba', 'all'):
        _banner(f"SCRAPING WNBA {season}")
        wnba_sources = [
            ScraperSource('ESPN', scrape_wnba_espn, priority=1, min_games=100),
            ScraperSource('Basketball-Reference', scrape_wnba_basketball_reference,
                          priority=2, min_games=100),
            ScraperSource('CBS Sports', scrape_wnba_cbssports, priority=3, min_games=50),
        ]
        games = scrape_with_fallback('WNBA', season, wnba_sources)
        all_games.extend(assign_stable_ids(games, 'WNBA', str(season)))

    if args.sport in ('mls', 'all'):
        _banner(f"SCRAPING MLS {season}")
        mls_sources = [
            ScraperSource('ESPN', scrape_mls_espn, priority=1, min_games=200),
            ScraperSource('FBref', scrape_mls_fbref, priority=2, min_games=100),
            ScraperSource('MLSSoccer.com', scrape_mls_mlssoccer, priority=3, min_games=100),
        ]
        games = scrape_with_fallback('MLS', season, mls_sources)
        all_games.extend(assign_stable_ids(games, 'MLS', str(season)))

    if args.sport in ('nwsl', 'all'):
        _banner(f"SCRAPING NWSL {season}")
        nwsl_sources = [
            ScraperSource('ESPN', scrape_nwsl_espn, priority=1, min_games=100),
            ScraperSource('FBref', scrape_nwsl_fbref, priority=2, min_games=50),
            ScraperSource('NWSL.com', scrape_nwsl_nwslsoccer, priority=3, min_games=50),
        ]
        games = scrape_with_fallback('NWSL', season, nwsl_sources)
        all_games.extend(assign_stable_ids(games, 'NWSL', str(season)))

    # --- Export -------------------------------------------------------------
    _banner("EXPORTING DATA")
    export_to_json(all_games, all_stadiums, output_dir)

    # --- Summary ------------------------------------------------------------
    _banner("SUMMARY")
    print(f"Total games scraped: {len(all_games)}")
    print(f"Total stadiums: {len(all_stadiums)}")
    by_sport: dict[str, int] = defaultdict(int)
    for g in all_games:
        by_sport[g.sport] += 1
    for sport, count in by_sport.items():
        print(f" {sport}: {count} games")


if __name__ == '__main__':
    main()