#!/usr/bin/env python3
"""
Sports Schedule Scraper Orchestrator

This script coordinates scraping across sport-specific modules:
- core.py: Shared utilities, data classes, fallback system
- mlb.py: MLB scrapers
- nba.py: NBA scrapers
- nhl.py: NHL scrapers
- nfl.py: NFL scrapers

Non-core sports (WNBA, MLS, NWSL, CBB) remain inline pending extraction.

Usage:
    python scrape_schedules.py --sport nba --season 2026
    python scrape_schedules.py --sport all --season 2026
    python scrape_schedules.py --stadiums-only
"""

import argparse
import csv
import json
import time
from collections import defaultdict
from dataclasses import asdict
from datetime import datetime
from io import StringIO
from pathlib import Path
from typing import Optional

import requests

# Import from core module
from core import (
    Game,
    Stadium,
    ScraperSource,
    StadiumScraperSource,
    fetch_page,
    scrape_with_fallback,
    scrape_stadiums_with_fallback,
    assign_stable_ids,
    export_to_json,
)

# Import from sport modules (core 4 sports)
from mlb import (
    scrape_mlb_games,
    scrape_mlb_stadiums,
    MLB_TEAMS,
)
from nba import (
    scrape_nba_games,
    scrape_nba_stadiums,
    get_nba_season_string,
    NBA_TEAMS,
)
from nhl import (
    scrape_nhl_games,
    scrape_nhl_stadiums,
    get_nhl_season_string,
    NHL_TEAMS,
)
from nfl import (
    scrape_nfl_games,
    scrape_nfl_stadiums,
    get_nfl_season_string,
    NFL_TEAMS,
)
from mls import (
    MLS_TEAMS,
    get_mls_team_abbrev,
    scrape_mls_stadiums,
    MLS_STADIUM_SOURCES,
)
from wnba import (
    WNBA_TEAMS,
    get_wnba_team_abbrev,
    scrape_wnba_stadiums,
    WNBA_STADIUM_SOURCES,
)

# =============================================================================
# NON-CORE SPORT TEAM MAPPINGS
# TODO: Extract to separate modules (nwsl.py, cbb.py)
# NOTE: MLS_TEAMS is now imported from mls.py
# NOTE: WNBA_TEAMS is now imported from wnba.py
# =============================================================================

# Abbreviation -> team metadata for the NWSL; used by get_team_abbrev() for
# reverse (name -> abbreviation) lookups.
NWSL_TEAMS = {
    'LA': {'name': 'Angel City FC', 'city': 'Los Angeles', 'stadium': 'BMO Stadium'},
    'SJ': {'name': 'Bay FC', 'city': 'San Jose', 'stadium': 'PayPal Park'},
    'CHI': {'name': 'Chicago Red Stars', 'city': 'Bridgeview', 'stadium': 'SeatGeek Stadium'},
    'HOU': {'name': 'Houston Dash', 'city': 'Houston', 'stadium': 'Shell Energy Stadium'},
    'KC': {'name': 'Kansas City Current', 'city': 'Kansas City', 'stadium': 'CPKC Stadium'},
    'NJ': {'name': 'NJ/NY Gotham FC', 'city': 'Harrison', 'stadium': 'Red Bull Arena'},
    'NC': {'name': 'North Carolina Courage', 'city': 'Cary', 'stadium': 'WakeMed Soccer Park'},
    'ORL': {'name': 'Orlando Pride', 'city': 'Orlando', 'stadium': 'Inter&Co Stadium'},
    'POR': {'name': 'Portland Thorns FC', 'city': 'Portland', 'stadium': 'Providence Park'},
    'SEA': {'name': 'Seattle Reign FC', 'city': 'Seattle', 'stadium': 'Lumen Field'},
    'SD': {'name': 'San Diego Wave FC', 'city': 'San Diego', 'stadium': 'Snapdragon Stadium'},
    'UTA': {'name': 'Utah Royals FC', 'city': 'Sandy', 'stadium': 'America First Field'},
    'WAS': {'name': 'Washington Spirit', 'city': 'Washington', 'stadium': 'Audi Field'},
}

# =============================================================================
# NON-CORE SPORT SCRAPERS
# TODO: Extract to separate modules (wnba.py, mls.py, nwsl.py, cbb.py)
# =============================================================================


def _scrape_espn_schedule(sport: str, league: str, season: int, date_range: tuple[str, str]) -> list[Game]:
    """
    Fetch schedule from ESPN API.
    Shared helper for non-core sports that use ESPN API.

    Args:
        sport: ESPN sport path segment (e.g. 'basketball', 'soccer').
        league: ESPN league path segment (e.g. 'wnba', 'usa.1').
        season: Season year used for labeling the returned games.
        date_range: (start, end) dates in YYYYMMDD form for the 'dates' param.

    Returns:
        List of Game objects; empty on any fetch error (best-effort source
        in a fallback chain, so errors are printed, not raised).
    """
    games = []
    # Map ESPN league slugs to the display/sport label used in our data.
    sport_upper = {
        'wnba': 'WNBA',
        'usa.1': 'MLS',
        'usa.nwsl': 'NWSL',
        'mens-college-basketball': 'CBB',
    }.get(league, league.upper())

    print(f"Fetching {sport_upper} {season} from ESPN API...")

    url = f"https://site.api.espn.com/apis/site/v2/sports/{sport}/{league}/scoreboard"
    params = {
        'dates': f"{date_range[0]}-{date_range[1]}",
        'limit': 1000,
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        events = data.get('events', [])

        for event in events:
            # Per-event parsing is best-effort: one malformed event must not
            # abort the whole scrape.
            try:
                # ESPN dates look like '2026-05-01T23:00Z' — split into
                # date (first 10 chars) and HH:MM time (chars 11-16).
                date_str = event.get('date', '')[:10]
                time_str = event.get('date', '')[11:16] if len(event.get('date', '')) > 11 else None

                competitions = event.get('competitions', [{}])
                if not competitions:
                    continue
                comp = competitions[0]

                competitors = comp.get('competitors', [])
                if len(competitors) < 2:
                    continue

                home_team = None
                away_team = None
                home_abbrev = None
                away_abbrev = None
                for team in competitors:
                    team_data = team.get('team', {})
                    team_name = team_data.get('displayName', team_data.get('name', ''))
                    team_abbrev = team_data.get('abbreviation', '')
                    if team.get('homeAway') == 'home':
                        home_team = team_name
                        home_abbrev = team_abbrev
                    else:
                        away_team = team_name
                        away_abbrev = team_abbrev

                if not home_team or not away_team:
                    continue

                venue = comp.get('venue', {}).get('fullName', '')
                game_id = f"{sport_upper.lower()}_{date_str}_{away_abbrev}_{home_abbrev}".lower()

                game = Game(
                    id=game_id,
                    sport=sport_upper,
                    season=str(season),
                    date=date_str,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    # Fall back to name-based lookup when ESPN omits the abbreviation.
                    home_team_abbrev=home_abbrev or get_team_abbrev(home_team, sport_upper),
                    away_team_abbrev=away_abbrev or get_team_abbrev(away_team, sport_upper),
                    venue=venue,
                    source='espn.com',
                )
                games.append(game)
            except Exception:
                continue

        print(f" Found {len(games)} games from ESPN")
    except Exception as e:
        # Network/HTTP/JSON failure: report and return what we have so the
        # fallback system can try the next source.
        print(f"Error fetching ESPN {sport_upper}: {e}")

    return games


def scrape_wnba_espn(season: int) -> list[Game]:
    """Fetch WNBA schedule from ESPN API."""
    # WNBA regular season + playoffs: May through October.
    start = f"{season}0501"
    end = f"{season}1031"
    return _scrape_espn_schedule('basketball', 'wnba', season, (start, end))


def scrape_mls_espn(season: int) -> list[Game]:
    """Fetch MLS schedule from ESPN API."""
    # MLS calendar-year season: February through December.
    start = f"{season}0201"
    end = f"{season}1231"
    return _scrape_espn_schedule('soccer', 'usa.1', season, (start, end))


def scrape_nwsl_espn(season: int) -> list[Game]:
    """Fetch NWSL schedule from ESPN API."""
    # NWSL calendar-year season: March through November.
    start = f"{season}0301"
    end = f"{season}1130"
    return _scrape_espn_schedule('soccer', 'usa.nwsl', season, (start, end))


def scrape_cbb_espn(season: int) -> list[Game]:
    """Fetch College Basketball schedule from ESPN API (D1 only)."""
    # CBB season straddles years: November of (season-1) through April.
    start = f"{season-1}1101"
    end = f"{season}0415"
    return _scrape_espn_schedule('basketball', 'mens-college-basketball', season, (start, end))


def scrape_wnba_basketball_reference(season: int) -> list[Game]:
    """Scrape WNBA schedule from Basketball-Reference.

    Returns an empty list when the page or schedule table is unavailable,
    so the fallback system can move on to the next source.
    """
    games = []
    url = f"https://www.basketball-reference.com/wnba/years/{season}_games.html"
    print(f"Scraping WNBA {season} from Basketball-Reference...")

    soup = fetch_page(url, 'basketball-reference.com')
    if not soup:
        return games
    table = soup.find('table', {'id': 'schedule'})
    if not table:
        return games
    tbody = table.find('tbody')
    if not tbody:
        return games

    for row in tbody.find_all('tr'):
        # Skip repeated in-table header rows.
        if row.get('class') and 'thead' in row.get('class'):
            continue
        try:
            date_cell = row.find('th', {'data-stat': 'date_game'})
            if not date_cell:
                continue
            date_link = date_cell.find('a')
            date_str = date_link.text if date_link else date_cell.text

            visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
            home_cell = row.find('td', {'data-stat': 'home_team_name'})
            if not visitor_cell or not home_cell:
                continue
            visitor_link = visitor_cell.find('a')
            home_link = home_cell.find('a')
            away_team = visitor_link.text if visitor_link else visitor_cell.text
            home_team = home_link.text if home_link else home_cell.text

            # Dates appear as e.g. 'Fri, May 16, 2025'; skip unparseable rows.
            # Narrowed from a bare except so we don't swallow KeyboardInterrupt etc.
            try:
                parsed_date = datetime.strptime(date_str.strip(), '%a, %b %d, %Y')
                date_formatted = parsed_date.strftime('%Y-%m-%d')
            except ValueError:
                continue

            away_abbrev = get_team_abbrev(away_team, 'WNBA')
            home_abbrev = get_team_abbrev(home_team, 'WNBA')
            game_id = f"wnba_{date_formatted}_{away_abbrev}_{home_abbrev}".lower().replace(' ', '')

            game = Game(
                id=game_id,
                sport='WNBA',
                season=str(season),
                date=date_formatted,
                time=None,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=home_abbrev,
                away_team_abbrev=away_abbrev,
                venue='',
                source='basketball-reference.com',
            )
            games.append(game)
        except Exception:
            # Best-effort row parsing: one bad row must not abort the scrape.
            continue

    print(f" Found {len(games)} games from Basketball-Reference")
    return games


def scrape_wnba_cbssports(season: int) -> list[Game]:
    """Fetch WNBA schedule from CBS Sports."""
    games = []
    print(f"Fetching WNBA {season} from CBS Sports...")
    # Placeholder - CBS Sports scraping would go here
    print(f" Found {len(games)} games from CBS Sports")
    return games


def scrape_mls_fbref(season: int) -> list[Game]:
    """Scrape MLS schedule from FBref."""
    games = []
    print(f"Scraping MLS {season} from FBref...")
    # Placeholder - FBref scraping would go here
    print(f" Found {len(games)} games from FBref")
    return games


def scrape_mls_mlssoccer(season: int) -> list[Game]:
    """Scrape MLS schedule from MLSSoccer.com."""
    games = []
    print(f"Scraping MLS {season} from MLSSoccer.com...")
    # Placeholder - MLSSoccer.com scraping would go here
    print(f" Found {len(games)} games from MLSSoccer.com")
    return games


def scrape_nwsl_fbref(season: int) -> list[Game]:
    """Scrape NWSL schedule from FBref."""
    games = []
    print(f"Scraping NWSL {season} from FBref...")
    # Placeholder - FBref scraping would go here
    print(f" Found {len(games)} games from FBref")
    return games


def scrape_nwsl_nwslsoccer(season: int) -> list[Game]:
    """Scrape NWSL schedule from NWSL.com."""
    games = []
    print(f"Scraping NWSL {season} from NWSL.com...")
    # Placeholder - NWSL.com scraping would go here
    print(f" Found {len(games)} games from NWSL.com")
    return games


def scrape_cbb_sports_reference(season: int) -> list[Game]:
    """Scrape College Basketball schedule from Sports-Reference."""
    games = []
    print(f"Scraping CBB {season} from Sports-Reference...")
    # Placeholder - Sports-Reference scraping would go here
    print(f" Found {len(games)} games from Sports-Reference")
    return games


def scrape_cbb_cbssports(season: int) -> list[Game]:
    """Fetch College Basketball schedule from CBS Sports."""
    games = []
    print(f"Fetching CBB {season} from CBS Sports...")
    # Placeholder - CBS Sports scraping would go here
    print(f" Found {len(games)} games from CBS Sports")
    return games


# =============================================================================
# NON-CORE STADIUM SCRAPERS
# TODO: Extract to separate modules (nwsl.py, cbb.py)
# NOTE: scrape_mls_stadiums() is now imported from mls.py
# NOTE: scrape_wnba_stadiums() is now imported from wnba.py
# =============================================================================


def scrape_nwsl_stadiums() -> list[Stadium]:
    """Fetch NWSL stadium data (hardcoded)."""
    print("\nNWSL STADIUMS")
    print("-" * 40)
    stadiums = []
    # Would include NWSL stadium data here
    print(f" Found {len(stadiums)} NWSL stadiums")
    return stadiums


def scrape_cbb_stadiums() -> list[Stadium]:
    """Fetch College Basketball arena data."""
    print("\nCBB STADIUMS")
    print("-" * 40)
    stadiums = []
    # Would include CBB arena data here
    print(f" Found {len(stadiums)} CBB arenas")
    return stadiums


# =============================================================================
# LEGACY STADIUM FUNCTIONS
# =============================================================================


def scrape_stadiums_hifld() -> list[Stadium]:
    """Legacy: Scrape from HIFLD open data."""
    # Placeholder for legacy HIFLD scraping
    return []


def generate_stadiums_from_teams() -> list[Stadium]:
    """Generate stadium entries from team data with hardcoded coordinates."""
    stadiums = []
    # This function would generate stadiums from all team dictionaries
    # Keeping as placeholder since sport modules have their own stadium scrapers
    return stadiums


def scrape_all_stadiums() -> list[Stadium]:
    """Comprehensive stadium scraping for all sports."""
    all_stadiums = []

    # Core sports (from modules)
    all_stadiums.extend(scrape_mlb_stadiums())
    all_stadiums.extend(scrape_nba_stadiums())
    all_stadiums.extend(scrape_nhl_stadiums())
    all_stadiums.extend(scrape_nfl_stadiums())

    # Non-core sports
    all_stadiums.extend(scrape_mls_stadiums())
    all_stadiums.extend(scrape_wnba_stadiums())
    all_stadiums.extend(scrape_nwsl_stadiums())
    all_stadiums.extend(scrape_cbb_stadiums())

    return all_stadiums


# =============================================================================
# HELPERS
# =============================================================================


def get_team_abbrev(team_name: str, sport: str) -> str:
    """Get team abbreviation from full name.

    Tries an exact (case-insensitive) name match first across ALL teams,
    then falls back to a substring match; only then does it degrade to the
    first three letters of the name. Exact matches are checked in a separate
    pass so a loose substring hit on an earlier team can never shadow an
    exact match on a later one.
    """
    teams = {
        'NBA': NBA_TEAMS,
        'MLB': MLB_TEAMS,
        'NHL': NHL_TEAMS,
        'NFL': NFL_TEAMS,
        'WNBA': WNBA_TEAMS,
        'MLS': MLS_TEAMS,
        'NWSL': NWSL_TEAMS,
    }.get(sport, {})

    # Pass 1: exact match wins outright.
    for abbrev, info in teams.items():
        if info['name'].lower() == team_name.lower():
            return abbrev
    # Pass 2: substring match as a fallback.
    for abbrev, info in teams.items():
        if team_name.lower() in info['name'].lower():
            return abbrev
    # Return first 3 letters as fallback
    return team_name[:3].upper()


# =============================================================================
# MAIN ORCHESTRATOR
# =============================================================================


def main():
    """Parse CLI args, scrape stadiums and schedules, export, and summarize."""
    parser = argparse.ArgumentParser(description='Scrape sports schedules')
    parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], default='all')
    parser.add_argument('--season', type=int, default=2026, help='Season year (ending year)')
    parser.add_argument('--stadiums-only', action='store_true', help='Only scrape stadium data (legacy method)')
    parser.add_argument('--stadiums-update', action='store_true', help='Scrape ALL stadium data for all 8 sports (comprehensive)')
    parser.add_argument('--output', type=str, default='./data', help='Output directory')
    args = parser.parse_args()

    output_dir = Path(args.output)
    all_games = []
    all_stadiums = []

    # Scrape stadiums
    print("\n" + "=" * 60)
    print("SCRAPING STADIUMS")
    print("=" * 60)
    if args.stadiums_update:
        print("Using comprehensive stadium scrapers for all sports...")
        all_stadiums.extend(scrape_all_stadiums())
        print(f" Total stadiums scraped: {len(all_stadiums)}")
    else:
        all_stadiums.extend(scrape_stadiums_hifld())
        all_stadiums.extend(generate_stadiums_from_teams())

    # If stadiums-only mode, export and exit
    if args.stadiums_only:
        export_to_json([], all_stadiums, output_dir)
        return

    # Scrape schedules using sport modules
    if args.sport in ['nba', 'all']:
        print("\n" + "=" * 60)
        print(f"SCRAPING NBA {args.season}")
        print("=" * 60)
        nba_games = scrape_nba_games(args.season)
        nba_season = get_nba_season_string(args.season)
        nba_games = assign_stable_ids(nba_games, 'NBA', nba_season)
        all_games.extend(nba_games)

    if args.sport in ['mlb', 'all']:
        print("\n" + "=" * 60)
        print(f"SCRAPING MLB {args.season}")
        print("=" * 60)
        mlb_games = scrape_mlb_games(args.season)
        # MLB seasons are a single calendar year — no season-string helper.
        mlb_games = assign_stable_ids(mlb_games, 'MLB', str(args.season))
        all_games.extend(mlb_games)

    if args.sport in ['nhl', 'all']:
        print("\n" + "=" * 60)
        print(f"SCRAPING NHL {args.season}")
        print("=" * 60)
        nhl_games = scrape_nhl_games(args.season)
        nhl_season = get_nhl_season_string(args.season)
        nhl_games = assign_stable_ids(nhl_games, 'NHL', nhl_season)
        all_games.extend(nhl_games)

    if args.sport in ['nfl', 'all']:
        print("\n" + "=" * 60)
        print(f"SCRAPING NFL {args.season}")
        print("=" * 60)
        nfl_games = scrape_nfl_games(args.season)
        nfl_season = get_nfl_season_string(args.season)
        nfl_games = assign_stable_ids(nfl_games, 'NFL', nfl_season)
        all_games.extend(nfl_games)

    # Non-core sports (TODO: Extract to modules)
    if args.sport in ['wnba', 'all']:
        print("\n" + "=" * 60)
        print(f"SCRAPING WNBA {args.season}")
        print("=" * 60)
        wnba_sources = [
            ScraperSource('ESPN', scrape_wnba_espn, priority=1, min_games=100),
            ScraperSource('Basketball-Reference', scrape_wnba_basketball_reference, priority=2, min_games=100),
            ScraperSource('CBS Sports', scrape_wnba_cbssports, priority=3, min_games=50),
        ]
        wnba_games = scrape_with_fallback('WNBA', args.season, wnba_sources)
        wnba_games = assign_stable_ids(wnba_games, 'WNBA', str(args.season))
        all_games.extend(wnba_games)

    if args.sport in ['mls', 'all']:
        print("\n" + "=" * 60)
        print(f"SCRAPING MLS {args.season}")
        print("=" * 60)
        mls_sources = [
            ScraperSource('ESPN', scrape_mls_espn, priority=1, min_games=200),
            ScraperSource('FBref', scrape_mls_fbref, priority=2, min_games=100),
            ScraperSource('MLSSoccer.com', scrape_mls_mlssoccer, priority=3, min_games=100),
        ]
        mls_games = scrape_with_fallback('MLS', args.season, mls_sources)
        mls_games = assign_stable_ids(mls_games, 'MLS', str(args.season))
        all_games.extend(mls_games)

    if args.sport in ['nwsl', 'all']:
        print("\n" + "=" * 60)
        print(f"SCRAPING NWSL {args.season}")
        print("=" * 60)
        nwsl_sources = [
            ScraperSource('ESPN', scrape_nwsl_espn, priority=1, min_games=100),
            ScraperSource('FBref', scrape_nwsl_fbref, priority=2, min_games=50),
            ScraperSource('NWSL.com', scrape_nwsl_nwslsoccer, priority=3, min_games=50),
        ]
        nwsl_games = scrape_with_fallback('NWSL', args.season, nwsl_sources)
        nwsl_games = assign_stable_ids(nwsl_games, 'NWSL', str(args.season))
        all_games.extend(nwsl_games)

    if args.sport in ['cbb', 'all']:
        print("\n" + "=" * 60)
        print(f"SCRAPING CBB {args.season}")
        print("=" * 60)
        cbb_sources = [
            ScraperSource('ESPN', scrape_cbb_espn, priority=1, min_games=1000),
            ScraperSource('Sports-Reference', scrape_cbb_sports_reference, priority=2, min_games=500),
            ScraperSource('CBS Sports', scrape_cbb_cbssports, priority=3, min_games=300),
        ]
        cbb_games = scrape_with_fallback('CBB', args.season, cbb_sources)
        # Cross-year label, e.g. 2026 -> "2025-26".
        cbb_season = f"{args.season-1}-{str(args.season)[2:]}"
        cbb_games = assign_stable_ids(cbb_games, 'CBB', cbb_season)
        all_games.extend(cbb_games)

    # Export
    print("\n" + "=" * 60)
    print("EXPORTING DATA")
    print("=" * 60)
    export_to_json(all_games, all_stadiums, output_dir)

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total games scraped: {len(all_games)}")
    print(f"Total stadiums: {len(all_stadiums)}")
    by_sport = {}
    for g in all_games:
        by_sport[g.sport] = by_sport.get(g.sport, 0) + 1
    for sport, count in by_sport.items():
        print(f" {sport}: {count} games")


if __name__ == '__main__':
    main()