Files
Sportstime/Scripts/scrape_schedules.py
Trey t 8f1803b10d feat(02.1-01): integrate MLS module with scrape_schedules.py
- Import MLS_TEAMS, get_mls_team_abbrev, scrape_mls_stadiums from mls.py
- Remove inline MLS_TEAMS dict (now imported from module)
- Remove inline MLS stadium scraper functions (now in mls.py)
- Update TODO comments to reflect MLS extraction complete

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 00:52:17 -06:00

631 lines
22 KiB
Python

#!/usr/bin/env python3
"""
Sports Schedule Scraper Orchestrator
This script coordinates scraping across sport-specific modules:
- core.py: Shared utilities, data classes, fallback system
- mlb.py: MLB scrapers
- nba.py: NBA scrapers
- nhl.py: NHL scrapers
- nfl.py: NFL scrapers
Non-core sports (WNBA, NWSL, CBB) remain inline pending extraction; MLS teams and stadium scrapers now live in mls.py (MLS game scrapers are still inline below).
Usage:
python scrape_schedules.py --sport nba --season 2026
python scrape_schedules.py --sport all --season 2026
python scrape_schedules.py --stadiums-only
"""
import argparse
import csv
import json
import time
from collections import defaultdict
from dataclasses import asdict
from datetime import datetime
from io import StringIO
from pathlib import Path
from typing import Optional
import requests
# Import from core module
from core import (
Game,
Stadium,
ScraperSource,
StadiumScraperSource,
fetch_page,
scrape_with_fallback,
scrape_stadiums_with_fallback,
assign_stable_ids,
export_to_json,
)
# Import from sport modules (core 4 sports)
from mlb import (
scrape_mlb_games,
scrape_mlb_stadiums,
MLB_TEAMS,
)
from nba import (
scrape_nba_games,
scrape_nba_stadiums,
get_nba_season_string,
NBA_TEAMS,
)
from nhl import (
scrape_nhl_games,
scrape_nhl_stadiums,
get_nhl_season_string,
NHL_TEAMS,
)
from nfl import (
scrape_nfl_games,
scrape_nfl_stadiums,
get_nfl_season_string,
NFL_TEAMS,
)
from mls import (
MLS_TEAMS,
get_mls_team_abbrev,
scrape_mls_stadiums,
MLS_STADIUM_SOURCES,
)
# =============================================================================
# NON-CORE SPORT TEAM MAPPINGS
# TODO: Extract to separate modules (wnba.py, nwsl.py, cbb.py)
# NOTE: MLS_TEAMS is now imported from mls.py
# =============================================================================
# WNBA team metadata keyed by team abbreviation.
# Each value holds the full franchise name, home city, and current home arena.
# Used by get_team_abbrev() to map full names back to abbreviations.
WNBA_TEAMS = {
    'ATL': {'name': 'Atlanta Dream', 'city': 'Atlanta', 'arena': 'Gateway Center Arena'},
    'CHI': {'name': 'Chicago Sky', 'city': 'Chicago', 'arena': 'Wintrust Arena'},
    'CON': {'name': 'Connecticut Sun', 'city': 'Uncasville', 'arena': 'Mohegan Sun Arena'},
    'DAL': {'name': 'Dallas Wings', 'city': 'Arlington', 'arena': 'College Park Center'},
    'GSV': {'name': 'Golden State Valkyries', 'city': 'San Francisco', 'arena': 'Chase Center'},
    'IND': {'name': 'Indiana Fever', 'city': 'Indianapolis', 'arena': 'Gainbridge Fieldhouse'},
    'LVA': {'name': 'Las Vegas Aces', 'city': 'Las Vegas', 'arena': 'Michelob Ultra Arena'},
    'LA': {'name': 'Los Angeles Sparks', 'city': 'Los Angeles', 'arena': 'Crypto.com Arena'},
    'MIN': {'name': 'Minnesota Lynx', 'city': 'Minneapolis', 'arena': 'Target Center'},
    'NY': {'name': 'New York Liberty', 'city': 'Brooklyn', 'arena': 'Barclays Center'},
    'PHO': {'name': 'Phoenix Mercury', 'city': 'Phoenix', 'arena': 'Footprint Center'},
    'SEA': {'name': 'Seattle Storm', 'city': 'Seattle', 'arena': 'Climate Pledge Arena'},
    'WAS': {'name': 'Washington Mystics', 'city': 'Washington', 'arena': 'Entertainment & Sports Arena'},
}
# NWSL team metadata keyed by team abbreviation.
# Each value holds the full club name, home city, and current home stadium.
# Used by get_team_abbrev() to map full names back to abbreviations.
NWSL_TEAMS = {
    'LA': {'name': 'Angel City FC', 'city': 'Los Angeles', 'stadium': 'BMO Stadium'},
    'SJ': {'name': 'Bay FC', 'city': 'San Jose', 'stadium': 'PayPal Park'},
    'CHI': {'name': 'Chicago Red Stars', 'city': 'Bridgeview', 'stadium': 'SeatGeek Stadium'},
    'HOU': {'name': 'Houston Dash', 'city': 'Houston', 'stadium': 'Shell Energy Stadium'},
    'KC': {'name': 'Kansas City Current', 'city': 'Kansas City', 'stadium': 'CPKC Stadium'},
    'NJ': {'name': 'NJ/NY Gotham FC', 'city': 'Harrison', 'stadium': 'Red Bull Arena'},
    'NC': {'name': 'North Carolina Courage', 'city': 'Cary', 'stadium': 'WakeMed Soccer Park'},
    'ORL': {'name': 'Orlando Pride', 'city': 'Orlando', 'stadium': 'Inter&Co Stadium'},
    'POR': {'name': 'Portland Thorns FC', 'city': 'Portland', 'stadium': 'Providence Park'},
    'SEA': {'name': 'Seattle Reign FC', 'city': 'Seattle', 'stadium': 'Lumen Field'},
    'SD': {'name': 'San Diego Wave FC', 'city': 'San Diego', 'stadium': 'Snapdragon Stadium'},
    'UTA': {'name': 'Utah Royals FC', 'city': 'Sandy', 'stadium': 'America First Field'},
    'WAS': {'name': 'Washington Spirit', 'city': 'Washington', 'stadium': 'Audi Field'},
}
# =============================================================================
# NON-CORE SPORT SCRAPERS
# TODO: Extract to separate modules (wnba.py, mls.py, nwsl.py, cbb.py)
# =============================================================================
def _scrape_espn_schedule(sport: str, league: str, season: int, date_range: tuple[str, str]) -> list[Game]:
    """
    Fetch a season schedule from the ESPN scoreboard API.

    Shared helper for non-core sports (WNBA, MLS, NWSL, CBB).

    Args:
        sport: ESPN sport path segment (e.g. 'basketball', 'soccer').
        league: ESPN league slug (e.g. 'wnba', 'usa.1').
        season: Season year used to label the games.
        date_range: (start, end) dates in YYYYMMDD form for the 'dates' param.

    Returns:
        List of Game records; empty on any request failure (best-effort).
    """
    games: list[Game] = []
    # Map ESPN league slugs to our canonical sport labels.
    sport_upper = {
        'wnba': 'WNBA',
        'usa.1': 'MLS',
        'usa.nwsl': 'NWSL',
        'mens-college-basketball': 'CBB'
    }.get(league, league.upper())
    print(f"Fetching {sport_upper} {season} from ESPN API...")
    url = f"https://site.api.espn.com/apis/site/v2/sports/{sport}/{league}/scoreboard"
    params = {
        'dates': f"{date_range[0]}-{date_range[1]}",
        'limit': 1000
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        for event in data.get('events', []):
            try:
                # ISO-ish timestamp like '2026-05-01T23:00Z': date is the
                # first 10 chars, HH:MM follows the 'T' when present.
                raw_date = event.get('date', '')
                date_str = raw_date[:10]
                time_str = raw_date[11:16] if len(raw_date) > 11 else None
                competitions = event.get('competitions', [{}])
                if not competitions:
                    continue
                comp = competitions[0]
                competitors = comp.get('competitors', [])
                if len(competitors) < 2:
                    continue
                home_team = None
                away_team = None
                home_abbrev = None
                away_abbrev = None
                for team in competitors:
                    team_data = team.get('team', {})
                    team_name = team_data.get('displayName', team_data.get('name', ''))
                    team_abbrev = team_data.get('abbreviation', '')
                    if team.get('homeAway') == 'home':
                        home_team = team_name
                        home_abbrev = team_abbrev
                    else:
                        away_team = team_name
                        away_abbrev = team_abbrev
                if not home_team or not away_team:
                    continue
                venue = comp.get('venue', {}).get('fullName', '')
                # Resolve abbreviations once so the game id and the Game record
                # agree. Previously the id was built before the fallback was
                # applied, so a missing ESPN abbreviation yielded ids like
                # 'mls_2026-01-10__' while the record carried the fallback.
                home_abbrev = home_abbrev or get_team_abbrev(home_team, sport_upper)
                away_abbrev = away_abbrev or get_team_abbrev(away_team, sport_upper)
                game_id = f"{sport_upper.lower()}_{date_str}_{away_abbrev}_{home_abbrev}".lower()
                game = Game(
                    id=game_id,
                    sport=sport_upper,
                    season=str(season),
                    date=date_str,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=home_abbrev,
                    away_team_abbrev=away_abbrev,
                    venue=venue,
                    source='espn.com'
                )
                games.append(game)
            except Exception:
                # Best-effort: skip malformed events rather than abort the run.
                continue
        print(f" Found {len(games)} games from ESPN")
    except Exception as e:
        print(f"Error fetching ESPN {sport_upper}: {e}")
    return games
def scrape_wnba_espn(season: int) -> list[Game]:
    """Fetch WNBA schedule from ESPN API."""
    # WNBA regular-season window: May 1 through Oct 31.
    window = (f"{season}0501", f"{season}1031")
    return _scrape_espn_schedule('basketball', 'wnba', season, window)
def scrape_mls_espn(season: int) -> list[Game]:
    """Fetch MLS schedule from ESPN API."""
    # MLS window: Feb 1 through Dec 31 (covers season plus playoffs).
    window = (f"{season}0201", f"{season}1231")
    return _scrape_espn_schedule('soccer', 'usa.1', season, window)
def scrape_nwsl_espn(season: int) -> list[Game]:
    """Fetch NWSL schedule from ESPN API."""
    # NWSL window: Mar 1 through Nov 30.
    window = (f"{season}0301", f"{season}1130")
    return _scrape_espn_schedule('soccer', 'usa.nwsl', season, window)
def scrape_cbb_espn(season: int) -> list[Game]:
    """Fetch College Basketball schedule from ESPN API (D1 only)."""
    # CBB spans two calendar years: Nov 1 of the prior year to Apr 15.
    window = (f"{season - 1}1101", f"{season}0415")
    return _scrape_espn_schedule('basketball', 'mens-college-basketball', season, window)
def scrape_wnba_basketball_reference(season: int) -> list[Game]:
    """Scrape the WNBA schedule for *season* from Basketball-Reference.

    Returns an empty list if the page, schedule table, or table body is
    unavailable. Malformed rows are skipped (best-effort).
    """
    games: list[Game] = []
    url = f"https://www.basketball-reference.com/wnba/years/{season}_games.html"
    print(f"Scraping WNBA {season} from Basketball-Reference...")
    soup = fetch_page(url, 'basketball-reference.com')
    if not soup:
        return games
    table = soup.find('table', {'id': 'schedule'})
    if not table:
        return games
    tbody = table.find('tbody')
    if not tbody:
        return games
    for row in tbody.find_all('tr'):
        # Skip repeated header rows embedded inside the body.
        row_classes = row.get('class')
        if row_classes and 'thead' in row_classes:
            continue
        try:
            date_cell = row.find('th', {'data-stat': 'date_game'})
            if not date_cell:
                continue
            date_link = date_cell.find('a')
            date_str = date_link.text if date_link else date_cell.text
            visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
            home_cell = row.find('td', {'data-stat': 'home_team_name'})
            if not visitor_cell or not home_cell:
                continue
            visitor_link = visitor_cell.find('a')
            home_link = home_cell.find('a')
            away_team = visitor_link.text if visitor_link else visitor_cell.text
            home_team = home_link.text if home_link else home_cell.text
            try:
                # Site format, e.g. 'Fri, May 16, 2025'.
                parsed_date = datetime.strptime(date_str.strip(), '%a, %b %d, %Y')
            except ValueError:
                # Was a bare `except:` which also swallowed SystemExit /
                # KeyboardInterrupt; strptime only raises ValueError here.
                continue
            date_formatted = parsed_date.strftime('%Y-%m-%d')
            away_abbrev = get_team_abbrev(away_team, 'WNBA')
            home_abbrev = get_team_abbrev(home_team, 'WNBA')
            game_id = f"wnba_{date_formatted}_{away_abbrev}_{home_abbrev}".lower().replace(' ', '')
            game = Game(
                id=game_id,
                sport='WNBA',
                season=str(season),
                date=date_formatted,
                time=None,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=home_abbrev,
                away_team_abbrev=away_abbrev,
                venue='',
                source='basketball-reference.com'
            )
            games.append(game)
        except Exception:
            # Best-effort: a single malformed row must not abort the scrape.
            continue
    print(f" Found {len(games)} games from Basketball-Reference")
    return games
def scrape_wnba_cbssports(season: int) -> list[Game]:
    """Fetch WNBA schedule from CBS Sports (stub; returns no games yet)."""
    print(f"Fetching WNBA {season} from CBS Sports...")
    # Placeholder - CBS Sports scraping would go here
    scraped: list[Game] = []
    print(f" Found {len(scraped)} games from CBS Sports")
    return scraped
def scrape_mls_fbref(season: int) -> list[Game]:
    """Scrape MLS schedule from FBref (stub; returns no games yet)."""
    print(f"Scraping MLS {season} from FBref...")
    # Placeholder - FBref scraping would go here
    scraped: list[Game] = []
    print(f" Found {len(scraped)} games from FBref")
    return scraped
def scrape_mls_mlssoccer(season: int) -> list[Game]:
    """Scrape MLS schedule from MLSSoccer.com (stub; returns no games yet)."""
    print(f"Scraping MLS {season} from MLSSoccer.com...")
    # Placeholder - MLSSoccer.com scraping would go here
    scraped: list[Game] = []
    print(f" Found {len(scraped)} games from MLSSoccer.com")
    return scraped
def scrape_nwsl_fbref(season: int) -> list[Game]:
    """Scrape NWSL schedule from FBref (stub; returns no games yet)."""
    print(f"Scraping NWSL {season} from FBref...")
    # Placeholder - FBref scraping would go here
    scraped: list[Game] = []
    print(f" Found {len(scraped)} games from FBref")
    return scraped
def scrape_nwsl_nwslsoccer(season: int) -> list[Game]:
    """Scrape NWSL schedule from NWSL.com (stub; returns no games yet)."""
    print(f"Scraping NWSL {season} from NWSL.com...")
    # Placeholder - NWSL.com scraping would go here
    scraped: list[Game] = []
    print(f" Found {len(scraped)} games from NWSL.com")
    return scraped
def scrape_cbb_sports_reference(season: int) -> list[Game]:
    """Scrape College Basketball schedule from Sports-Reference (stub)."""
    print(f"Scraping CBB {season} from Sports-Reference...")
    # Placeholder - Sports-Reference scraping would go here
    scraped: list[Game] = []
    print(f" Found {len(scraped)} games from Sports-Reference")
    return scraped
def scrape_cbb_cbssports(season: int) -> list[Game]:
    """Fetch College Basketball schedule from CBS Sports (stub)."""
    print(f"Fetching CBB {season} from CBS Sports...")
    # Placeholder - CBS Sports scraping would go here
    scraped: list[Game] = []
    print(f" Found {len(scraped)} games from CBS Sports")
    return scraped
# =============================================================================
# NON-CORE STADIUM SCRAPERS
# TODO: Extract to separate modules (wnba.py, nwsl.py, cbb.py)
# NOTE: scrape_mls_stadiums() is now imported from mls.py
# =============================================================================
def scrape_wnba_stadiums() -> list[Stadium]:
    """Fetch WNBA arena data (hardcoded; stub returns nothing yet)."""
    print("\nWNBA STADIUMS")
    print("-" * 40)
    # Would include WNBA arena data here
    arenas: list[Stadium] = []
    print(f" Found {len(arenas)} WNBA arenas")
    return arenas
def scrape_nwsl_stadiums() -> list[Stadium]:
    """Fetch NWSL stadium data (hardcoded; stub returns nothing yet)."""
    print("\nNWSL STADIUMS")
    print("-" * 40)
    # Would include NWSL stadium data here
    grounds: list[Stadium] = []
    print(f" Found {len(grounds)} NWSL stadiums")
    return grounds
def scrape_cbb_stadiums() -> list[Stadium]:
    """Fetch College Basketball arena data (stub returns nothing yet)."""
    print("\nCBB STADIUMS")
    print("-" * 40)
    # Would include CBB arena data here
    arenas: list[Stadium] = []
    print(f" Found {len(arenas)} CBB arenas")
    return arenas
# =============================================================================
# LEGACY STADIUM FUNCTIONS
# =============================================================================
def scrape_stadiums_hifld() -> list[Stadium]:
    """Legacy: Scrape stadium records from HIFLD open data (stub)."""
    # Placeholder for legacy HIFLD scraping
    legacy: list[Stadium] = []
    return legacy
def generate_stadiums_from_teams() -> list[Stadium]:
    """Generate stadium entries from team data with hardcoded coordinates.

    Kept as a placeholder since the sport modules provide their own
    stadium scrapers; currently returns no entries.
    """
    generated: list[Stadium] = []
    return generated
def scrape_all_stadiums() -> list[Stadium]:
    """Comprehensive stadium scraping across all eight supported sports."""
    # Core sports (module scrapers) first, then the inline non-core scrapers;
    # order matches the original hand-written sequence.
    scrapers = (
        scrape_mlb_stadiums,
        scrape_nba_stadiums,
        scrape_nhl_stadiums,
        scrape_nfl_stadiums,
        scrape_mls_stadiums,
        scrape_wnba_stadiums,
        scrape_nwsl_stadiums,
        scrape_cbb_stadiums,
    )
    collected: list[Stadium] = []
    for scraper in scrapers:
        collected.extend(scraper())
    return collected
# =============================================================================
# HELPERS
# =============================================================================
def get_team_abbrev(team_name: str, sport: str) -> str:
    """Resolve a team abbreviation from a (possibly partial) team name.

    Args:
        team_name: Full or partial team name, case-insensitive.
        sport: League key ('NBA', 'MLB', 'NHL', 'NFL', 'WNBA', 'MLS', 'NWSL').

    Returns:
        The matching abbreviation, or the first three letters of *team_name*
        uppercased when no team matches (also for unknown sports, e.g. CBB).
    """
    teams = {
        'NBA': NBA_TEAMS,
        'MLB': MLB_TEAMS,
        'NHL': NHL_TEAMS,
        'NFL': NFL_TEAMS,
        'WNBA': WNBA_TEAMS,
        'MLS': MLS_TEAMS,
        'NWSL': NWSL_TEAMS,
    }.get(sport, {})
    needle = team_name.lower()
    # Pass 1: exact full-name match. Previously exact and substring checks ran
    # in a single loop, so a substring hit on an earlier dict entry could
    # shadow an exact match on a later entry.
    for abbrev, info in teams.items():
        if info['name'].lower() == needle:
            return abbrev
    # Pass 2: partial match (query contained within the full name).
    for abbrev, info in teams.items():
        if needle in info['name'].lower():
            return abbrev
    # Fallback: first 3 letters of the supplied name.
    return team_name[:3].upper()
# =============================================================================
# MAIN ORCHESTRATOR
# =============================================================================
def main():
    """CLI entry point.

    Parses arguments, scrapes stadiums (legacy or comprehensive mode),
    then scrapes the requested sport schedules unless --stadiums-only,
    exports everything to JSON under --output, and prints a summary.
    """
    parser = argparse.ArgumentParser(description='Scrape sports schedules')
    parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], default='all')
    parser.add_argument('--season', type=int, default=2026, help='Season year (ending year)')
    parser.add_argument('--stadiums-only', action='store_true', help='Only scrape stadium data (legacy method)')
    parser.add_argument('--stadiums-update', action='store_true', help='Scrape ALL stadium data for all 8 sports (comprehensive)')
    parser.add_argument('--output', type=str, default='./data', help='Output directory')
    args = parser.parse_args()
    output_dir = Path(args.output)
    all_games = []
    all_stadiums = []

    def _banner(title: str) -> None:
        # Uniform section header; was copy-pasted at every section.
        print("\n" + "=" * 60)
        print(title)
        print("=" * 60)

    # --- Stadiums ---
    _banner("SCRAPING STADIUMS")
    if args.stadiums_update:
        print("Using comprehensive stadium scrapers for all sports...")
        all_stadiums.extend(scrape_all_stadiums())
        print(f" Total stadiums scraped: {len(all_stadiums)}")
    else:
        all_stadiums.extend(scrape_stadiums_hifld())
        all_stadiums.extend(generate_stadiums_from_teams())
    # If stadiums-only mode, export and exit early.
    if args.stadiums_only:
        export_to_json([], all_stadiums, output_dir)
        return

    # --- Core sports (dedicated modules) ---
    if args.sport in ('nba', 'all'):
        _banner(f"SCRAPING NBA {args.season}")
        nba_games = scrape_nba_games(args.season)
        nba_season = get_nba_season_string(args.season)
        all_games.extend(assign_stable_ids(nba_games, 'NBA', nba_season))
    if args.sport in ('mlb', 'all'):
        _banner(f"SCRAPING MLB {args.season}")
        mlb_games = scrape_mlb_games(args.season)
        all_games.extend(assign_stable_ids(mlb_games, 'MLB', str(args.season)))
    if args.sport in ('nhl', 'all'):
        _banner(f"SCRAPING NHL {args.season}")
        nhl_games = scrape_nhl_games(args.season)
        nhl_season = get_nhl_season_string(args.season)
        all_games.extend(assign_stable_ids(nhl_games, 'NHL', nhl_season))
    if args.sport in ('nfl', 'all'):
        _banner(f"SCRAPING NFL {args.season}")
        nfl_games = scrape_nfl_games(args.season)
        nfl_season = get_nfl_season_string(args.season)
        all_games.extend(assign_stable_ids(nfl_games, 'NFL', nfl_season))

    # --- Non-core sports: prioritized source chains with fallback ---
    # TODO: Extract to modules (wnba.py, nwsl.py, cbb.py; MLS game scrapers).
    if args.sport in ('wnba', 'all'):
        _banner(f"SCRAPING WNBA {args.season}")
        wnba_sources = [
            ScraperSource('ESPN', scrape_wnba_espn, priority=1, min_games=100),
            ScraperSource('Basketball-Reference', scrape_wnba_basketball_reference, priority=2, min_games=100),
            ScraperSource('CBS Sports', scrape_wnba_cbssports, priority=3, min_games=50),
        ]
        wnba_games = scrape_with_fallback('WNBA', args.season, wnba_sources)
        all_games.extend(assign_stable_ids(wnba_games, 'WNBA', str(args.season)))
    if args.sport in ('mls', 'all'):
        _banner(f"SCRAPING MLS {args.season}")
        mls_sources = [
            ScraperSource('ESPN', scrape_mls_espn, priority=1, min_games=200),
            ScraperSource('FBref', scrape_mls_fbref, priority=2, min_games=100),
            ScraperSource('MLSSoccer.com', scrape_mls_mlssoccer, priority=3, min_games=100),
        ]
        mls_games = scrape_with_fallback('MLS', args.season, mls_sources)
        all_games.extend(assign_stable_ids(mls_games, 'MLS', str(args.season)))
    if args.sport in ('nwsl', 'all'):
        _banner(f"SCRAPING NWSL {args.season}")
        nwsl_sources = [
            ScraperSource('ESPN', scrape_nwsl_espn, priority=1, min_games=100),
            ScraperSource('FBref', scrape_nwsl_fbref, priority=2, min_games=50),
            ScraperSource('NWSL.com', scrape_nwsl_nwslsoccer, priority=3, min_games=50),
        ]
        nwsl_games = scrape_with_fallback('NWSL', args.season, nwsl_sources)
        all_games.extend(assign_stable_ids(nwsl_games, 'NWSL', str(args.season)))
    if args.sport in ('cbb', 'all'):
        _banner(f"SCRAPING CBB {args.season}")
        cbb_sources = [
            ScraperSource('ESPN', scrape_cbb_espn, priority=1, min_games=1000),
            ScraperSource('Sports-Reference', scrape_cbb_sports_reference, priority=2, min_games=500),
            ScraperSource('CBS Sports', scrape_cbb_cbssports, priority=3, min_games=300),
        ]
        cbb_games = scrape_with_fallback('CBB', args.season, cbb_sources)
        # CBB spans two calendar years, e.g. season 2026 -> '2025-26'.
        cbb_season = f"{args.season - 1}-{str(args.season)[2:]}"
        all_games.extend(assign_stable_ids(cbb_games, 'CBB', cbb_season))

    # --- Export ---
    _banner("EXPORTING DATA")
    export_to_json(all_games, all_stadiums, output_dir)

    # --- Summary ---
    _banner("SUMMARY")
    print(f"Total games scraped: {len(all_games)}")
    print(f"Total stadiums: {len(all_stadiums)}")
    by_sport = {}
    for g in all_games:
        by_sport[g.sport] = by_sport.get(g.sport, 0) + 1
    for sport, count in by_sport.items():
        print(f" {sport}: {count} games")
# Script entry point: delegate to the CLI orchestrator.
if __name__ == '__main__':
    main()