After Phase 1 refactoring moved scraper functions to sport-specific modules (nba.py, mlb.py, etc.), these pipeline scripts still imported from scrape_schedules.py. - run_pipeline.py: import from core.py and sport modules - validate_data.py: import from core.py and sport modules - run_canonicalization_pipeline.py: import from core.py and sport modules Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
539 lines
20 KiB
Python
Executable File
539 lines
20 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
SportsTime Data Pipeline
|
|
========================
|
|
Master script that orchestrates all data fetching, validation, and reporting.
|
|
|
|
Usage:
|
|
python run_pipeline.py # Full pipeline with defaults
|
|
python run_pipeline.py --season 2026 # Specify season
|
|
python run_pipeline.py --sport nba # Single sport only
|
|
python run_pipeline.py --skip-scrape # Validate existing data only
|
|
python run_pipeline.py --verbose # Detailed output
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
from enum import Enum
|
|
|
|
# Import from core module
|
|
from core import (
|
|
Game, Stadium, ScraperSource, scrape_with_fallback,
|
|
assign_stable_ids, export_to_json,
|
|
)
|
|
|
|
# Import from sport modules
|
|
from nba import scrape_nba_basketball_reference, scrape_nba_espn, scrape_nba_cbssports
|
|
from mlb import scrape_mlb_statsapi, scrape_mlb_baseball_reference, scrape_mlb_espn
|
|
from nhl import scrape_nhl_hockey_reference, scrape_nhl_espn, scrape_nhl_api
|
|
from nfl import scrape_nfl_espn, scrape_nfl_pro_football_reference, scrape_nfl_cbssports
|
|
|
|
# Import secondary sports from scrape_schedules (stubs)
|
|
from scrape_schedules import (
|
|
# WNBA sources
|
|
scrape_wnba_espn, scrape_wnba_basketball_reference, scrape_wnba_cbssports,
|
|
# MLS sources
|
|
scrape_mls_espn, scrape_mls_fbref, scrape_mls_mlssoccer,
|
|
# NWSL sources
|
|
scrape_nwsl_espn, scrape_nwsl_fbref, scrape_nwsl_nwslsoccer,
|
|
# CBB sources
|
|
scrape_cbb_espn, scrape_cbb_sports_reference, scrape_cbb_cbssports,
|
|
# Utilities
|
|
generate_stadiums_from_teams,
|
|
)
|
|
from validate_data import (
|
|
validate_games,
|
|
validate_stadiums,
|
|
scrape_mlb_all_sources,
|
|
scrape_nba_all_sources,
|
|
scrape_nhl_all_sources,
|
|
ValidationReport,
|
|
)
|
|
|
|
|
|
class Severity(Enum):
    """Issue severity levels for pipeline reporting.

    NOTE(review): the reporting code below compares plain strings
    ('high'/'medium'/'low') rather than these members — this enum appears
    to be unreferenced in this file; confirm before removing.
    """
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
|
|
|
|
|
@dataclass
class PipelineResult:
    """Aggregate outcome of one run_pipeline() invocation."""
    success: bool              # True only when zero high-severity issues were found
    games_scraped: int         # total games across all processed sports
    stadiums_scraped: int      # total stadiums generated or loaded
    games_by_sport: dict       # sport name (e.g. 'NBA') -> game count
    validation_reports: list   # ValidationReport objects from cross-validation
    stadium_issues: list       # dicts with 'severity', 'stadium', 'issue' keys
    high_severity_count: int   # game discrepancies + stadium issues marked 'high'
    medium_severity_count: int
    low_severity_count: int    # includes any issue whose severity is not high/medium
    output_dir: Path           # where games.json / stadiums.json / pipeline_report.json live
    duration_seconds: float    # wall-clock time of the whole run
|
|
|
|
|
|
def print_header(text: str):
    """Print *text* as a banner framed by 70-character '=' rules."""
    rule = "=" * 70
    # One write, identical bytes to the original four print() calls:
    # blank line, rule, indented text, rule.
    print(f"\n{rule}\n {text}\n{rule}")
|
|
|
|
|
|
def print_section(text: str):
    """Print a dashed section header preceded by a blank line."""
    # Single call producing the same output as print() + print(f"--- ... ---").
    print(f"\n--- {text} ---")
|
|
|
|
|
|
def print_severity(severity: str, message: str):
    """Print *message* prefixed with an icon for the given severity level.

    Unknown severities fall back to a plain white-circle marker.
    """
    label = {
        'high': '🔴 HIGH',
        'medium': '🟡 MEDIUM',
        'low': '🟢 LOW',
    }.get(severity, '⚪')
    print(f" {label} {message}")
|
|
|
|
|
|
def run_pipeline(
    season: int = 2025,
    sport: str = 'all',
    output_dir: Path = Path('./data'),
    skip_scrape: bool = False,
    validate: bool = True,
    verbose: bool = False,
) -> PipelineResult:
    """
    Run the complete data pipeline.

    Three phases:
      1. Scrape stadiums and per-sport schedules using multi-source
         fallback (or, with ``skip_scrape``, reload previously exported
         JSON from ``output_dir``), then export games.json/stadiums.json.
      2. Cross-validate MLB between two sources, validate stadiums, and
         sanity-check per-team game counts for NBA/NHL/NFL.
      3. Tally discrepancies by severity, print a report, and write
         ``pipeline_report.json``.

    Args:
        season: Season year, e.g. 2025.
        sport: Lower-case sport key ('nba', 'mlb', 'nhl', 'nfl', 'wnba',
            'mls', 'nwsl', 'cbb') or 'all'.
        output_dir: Destination directory; created if missing.
        skip_scrape: Load existing games.json/stadiums.json instead of
            scraping.
        validate: Run the Phase 2 validation steps.
        verbose: Print medium-severity issues and raise the cap on
            high-severity issues shown.

    Returns:
        PipelineResult; ``success`` is True only when no high-severity
        issues were found.
    """
    start_time = datetime.now()

    all_games = []
    all_stadiums = []
    games_by_sport = {}
    validation_reports = []
    stadium_issues = []

    output_dir.mkdir(parents=True, exist_ok=True)

    # =========================================================================
    # PHASE 1: SCRAPE DATA
    # =========================================================================

    if not skip_scrape:
        print_header("PHASE 1: SCRAPING DATA")

        # Scrape stadiums
        print_section("Stadiums")
        all_stadiums = generate_stadiums_from_teams()
        print(f" Generated {len(all_stadiums)} stadiums from team data")

        # Scrape by sport with multi-source fallback; each sport declares
        # its sources in priority order with a minimum-games threshold.
        if sport in ['nba', 'all']:
            print_section(f"NBA {season}")
            nba_sources = [
                ScraperSource('Basketball-Reference', scrape_nba_basketball_reference, priority=1, min_games=500),
                ScraperSource('ESPN', scrape_nba_espn, priority=2, min_games=500),
                ScraperSource('CBS Sports', scrape_nba_cbssports, priority=3, min_games=100),
            ]
            nba_games = scrape_with_fallback('NBA', season, nba_sources)
            # NBA seasons span two calendar years, labelled e.g. "2024-25".
            nba_season = f"{season-1}-{str(season)[2:]}"
            nba_games = assign_stable_ids(nba_games, 'NBA', nba_season)
            all_games.extend(nba_games)
            games_by_sport['NBA'] = len(nba_games)

        if sport in ['mlb', 'all']:
            print_section(f"MLB {season}")
            mlb_sources = [
                ScraperSource('MLB Stats API', scrape_mlb_statsapi, priority=1, min_games=1000),
                ScraperSource('Baseball-Reference', scrape_mlb_baseball_reference, priority=2, min_games=500),
                ScraperSource('ESPN', scrape_mlb_espn, priority=3, min_games=500),
            ]
            mlb_games = scrape_with_fallback('MLB', season, mlb_sources)
            # MLB seasons fit one calendar year, so the label is just the year.
            mlb_games = assign_stable_ids(mlb_games, 'MLB', str(season))
            all_games.extend(mlb_games)
            games_by_sport['MLB'] = len(mlb_games)

        if sport in ['nhl', 'all']:
            print_section(f"NHL {season}")
            nhl_sources = [
                ScraperSource('Hockey-Reference', scrape_nhl_hockey_reference, priority=1, min_games=500),
                ScraperSource('ESPN', scrape_nhl_espn, priority=2, min_games=500),
                ScraperSource('NHL API', scrape_nhl_api, priority=3, min_games=100),
            ]
            nhl_games = scrape_with_fallback('NHL', season, nhl_sources)
            # Cross-year label, same convention as NBA.
            nhl_season = f"{season-1}-{str(season)[2:]}"
            nhl_games = assign_stable_ids(nhl_games, 'NHL', nhl_season)
            all_games.extend(nhl_games)
            games_by_sport['NHL'] = len(nhl_games)

        if sport in ['nfl', 'all']:
            print_section(f"NFL {season}")
            nfl_sources = [
                ScraperSource('ESPN', scrape_nfl_espn, priority=1, min_games=200),
                ScraperSource('Pro-Football-Reference', scrape_nfl_pro_football_reference, priority=2, min_games=200),
                ScraperSource('CBS Sports', scrape_nfl_cbssports, priority=3, min_games=100),
            ]
            nfl_games = scrape_with_fallback('NFL', season, nfl_sources)
            # NOTE(review): NFL uses the cross-year label like NBA/NHL here,
            # but NFL seasons are conventionally labelled by a single year —
            # confirm this is intentional for stable-ID generation.
            nfl_season = f"{season-1}-{str(season)[2:]}"
            nfl_games = assign_stable_ids(nfl_games, 'NFL', nfl_season)
            all_games.extend(nfl_games)
            games_by_sport['NFL'] = len(nfl_games)

        if sport in ['wnba', 'all']:
            print_section(f"WNBA {season}")
            wnba_sources = [
                ScraperSource('ESPN', scrape_wnba_espn, priority=1, min_games=100),
                ScraperSource('Basketball-Reference', scrape_wnba_basketball_reference, priority=2, min_games=100),
                ScraperSource('CBS Sports', scrape_wnba_cbssports, priority=3, min_games=50),
            ]
            wnba_games = scrape_with_fallback('WNBA', season, wnba_sources)
            wnba_games = assign_stable_ids(wnba_games, 'WNBA', str(season))
            all_games.extend(wnba_games)
            games_by_sport['WNBA'] = len(wnba_games)

        if sport in ['mls', 'all']:
            print_section(f"MLS {season}")
            mls_sources = [
                ScraperSource('ESPN', scrape_mls_espn, priority=1, min_games=200),
                ScraperSource('FBref', scrape_mls_fbref, priority=2, min_games=100),
                ScraperSource('MLSSoccer.com', scrape_mls_mlssoccer, priority=3, min_games=100),
            ]
            mls_games = scrape_with_fallback('MLS', season, mls_sources)
            mls_games = assign_stable_ids(mls_games, 'MLS', str(season))
            all_games.extend(mls_games)
            games_by_sport['MLS'] = len(mls_games)

        if sport in ['nwsl', 'all']:
            print_section(f"NWSL {season}")
            nwsl_sources = [
                ScraperSource('ESPN', scrape_nwsl_espn, priority=1, min_games=100),
                ScraperSource('FBref', scrape_nwsl_fbref, priority=2, min_games=50),
                ScraperSource('NWSL.com', scrape_nwsl_nwslsoccer, priority=3, min_games=50),
            ]
            nwsl_games = scrape_with_fallback('NWSL', season, nwsl_sources)
            nwsl_games = assign_stable_ids(nwsl_games, 'NWSL', str(season))
            all_games.extend(nwsl_games)
            games_by_sport['NWSL'] = len(nwsl_games)

        if sport in ['cbb', 'all']:
            print_section(f"CBB {season}")
            cbb_sources = [
                ScraperSource('ESPN', scrape_cbb_espn, priority=1, min_games=1000),
                ScraperSource('Sports-Reference', scrape_cbb_sports_reference, priority=2, min_games=500),
                ScraperSource('CBS Sports', scrape_cbb_cbssports, priority=3, min_games=300),
            ]
            cbb_games = scrape_with_fallback('CBB', season, cbb_sources)
            # College basketball also spans two calendar years.
            cbb_season = f"{season-1}-{str(season)[2:]}"
            cbb_games = assign_stable_ids(cbb_games, 'CBB', cbb_season)
            all_games.extend(cbb_games)
            games_by_sport['CBB'] = len(cbb_games)

        # Export data
        print_section("Exporting Data")
        export_to_json(all_games, all_stadiums, output_dir)
        print(f" Exported to {output_dir}")

    else:
        # Load existing data previously written by export_to_json; missing
        # files are silently skipped (the lists simply stay empty).
        print_header("LOADING EXISTING DATA")

        games_file = output_dir / 'games.json'
        stadiums_file = output_dir / 'stadiums.json'

        if games_file.exists():
            with open(games_file) as f:
                games_data = json.load(f)
            # Rehydrate dataclasses straight from the exported JSON dicts —
            # assumes the JSON keys match Game's field names exactly.
            all_games = [Game(**g) for g in games_data]
            for g in all_games:
                games_by_sport[g.sport] = games_by_sport.get(g.sport, 0) + 1
            print(f" Loaded {len(all_games)} games")

        if stadiums_file.exists():
            with open(stadiums_file) as f:
                stadiums_data = json.load(f)
            all_stadiums = [Stadium(**s) for s in stadiums_data]
            print(f" Loaded {len(all_stadiums)} stadiums")

    # =========================================================================
    # PHASE 2: VALIDATE DATA
    # =========================================================================

    if validate:
        print_header("PHASE 2: CROSS-VALIDATION")

        # MLB validation (has two good sources)
        if sport in ['mlb', 'all']:
            print_section("MLB Cross-Validation")
            try:
                # NOTE: re-binds `mlb_sources` — a list of ScraperSource in
                # Phase 1, now a dict of source name -> scraped games.
                mlb_sources = scrape_mlb_all_sources(season)
                source_names = list(mlb_sources.keys())

                # Compare only the first two sources that returned data.
                if len(source_names) >= 2:
                    games1 = mlb_sources[source_names[0]]
                    games2 = mlb_sources[source_names[1]]

                    if games1 and games2:
                        report = validate_games(
                            games1, games2,
                            source_names[0], source_names[1],
                            'MLB', str(season)
                        )
                        validation_reports.append(report)

                        print(f" Sources: {source_names[0]} vs {source_names[1]}")
                        print(f" Games compared: {report.total_games_source1} vs {report.total_games_source2}")
                        print(f" Matched: {report.games_matched}")
                        print(f" Discrepancies: {len(report.discrepancies)}")
            except Exception as e:
                # Best-effort: a validation failure should not kill the run.
                print(f" Error during MLB validation: {e}")

        # Stadium validation
        print_section("Stadium Validation")
        stadium_issues = validate_stadiums(all_stadiums)
        print(f" Issues found: {len(stadium_issues)}")

        # Data quality checks
        print_section("Data Quality Checks")

        # Check game counts per team. Each team appears once per game it
        # plays (home or away); regular seasons are ~82 games (NBA/NHL)
        # and ~17 games (NFL), so counts outside a loose band are flagged.
        if sport in ['nba', 'all']:
            nba_games = [g for g in all_games if g.sport == 'NBA']
            team_counts = {}
            for g in nba_games:
                team_counts[g.home_team_abbrev] = team_counts.get(g.home_team_abbrev, 0) + 1
                team_counts[g.away_team_abbrev] = team_counts.get(g.away_team_abbrev, 0) + 1

            for team, count in sorted(team_counts.items()):
                if count < 75 or count > 90:
                    print(f" NBA: {team} has {count} games (expected ~82)")

        if sport in ['nhl', 'all']:
            nhl_games = [g for g in all_games if g.sport == 'NHL']
            team_counts = {}
            for g in nhl_games:
                team_counts[g.home_team_abbrev] = team_counts.get(g.home_team_abbrev, 0) + 1
                team_counts[g.away_team_abbrev] = team_counts.get(g.away_team_abbrev, 0) + 1

            for team, count in sorted(team_counts.items()):
                if count < 75 or count > 90:
                    print(f" NHL: {team} has {count} games (expected ~82)")

        if sport in ['nfl', 'all']:
            nfl_games = [g for g in all_games if g.sport == 'NFL']
            team_counts = {}
            for g in nfl_games:
                team_counts[g.home_team_abbrev] = team_counts.get(g.home_team_abbrev, 0) + 1
                team_counts[g.away_team_abbrev] = team_counts.get(g.away_team_abbrev, 0) + 1

            for team, count in sorted(team_counts.items()):
                if count < 15 or count > 20:
                    print(f" NFL: {team} has {count} games (expected ~17)")

    # =========================================================================
    # PHASE 3: GENERATE REPORT
    # =========================================================================

    print_header("PHASE 3: DISCREPANCY REPORT")

    # Count by severity. Severities are plain strings here; anything that is
    # not 'high' or 'medium' is bucketed as low.
    high_count = 0
    medium_count = 0
    low_count = 0

    # Game discrepancies
    for report in validation_reports:
        for d in report.discrepancies:
            if d.severity == 'high':
                high_count += 1
            elif d.severity == 'medium':
                medium_count += 1
            else:
                low_count += 1

    # Stadium issues
    for issue in stadium_issues:
        if issue['severity'] == 'high':
            high_count += 1
        elif issue['severity'] == 'medium':
            medium_count += 1
        else:
            low_count += 1

    # Print summary
    print()
    print(f" 🔴 HIGH severity: {high_count}")
    print(f" 🟡 MEDIUM severity: {medium_count}")
    print(f" 🟢 LOW severity: {low_count}")
    print()

    # Print high severity issues (always)
    if high_count > 0:
        print_section("HIGH Severity Issues (Requires Attention)")

        shown = 0
        # NOTE(review): verbose raises the cap to 50 rather than removing
        # it, but the truncation hint says --verbose shows all — confirm.
        max_show = 10 if not verbose else 50

        for report in validation_reports:
            for d in report.discrepancies:
                if d.severity == 'high' and shown < max_show:
                    print_severity('high', f"[{report.sport}] {d.field}: {d.game_key}")
                    if verbose:
                        print(f" {d.source1}: {d.value1}")
                        print(f" {d.source2}: {d.value2}")
                    shown += 1

        for issue in stadium_issues:
            if issue['severity'] == 'high' and shown < max_show:
                print_severity('high', f"[Stadium] {issue['stadium']}: {issue['issue']}")
                shown += 1

        if high_count > max_show:
            print(f" ... and {high_count - max_show} more (use --verbose to see all)")

    # Print medium severity if verbose
    if medium_count > 0 and verbose:
        print_section("MEDIUM Severity Issues")

        for report in validation_reports:
            for d in report.discrepancies:
                if d.severity == 'medium':
                    print_severity('medium', f"[{report.sport}] {d.field}: {d.game_key}")

        for issue in stadium_issues:
            if issue['severity'] == 'medium':
                print_severity('medium', f"[Stadium] {issue['stadium']}: {issue['issue']}")

    # Save full report (written even when scraping/validation were skipped)
    report_path = output_dir / 'pipeline_report.json'
    full_report = {
        'generated_at': datetime.now().isoformat(),
        'season': season,
        'sport': sport,
        'summary': {
            'games_scraped': len(all_games),
            'stadiums_scraped': len(all_stadiums),
            'games_by_sport': games_by_sport,
            'high_severity': high_count,
            'medium_severity': medium_count,
            'low_severity': low_count,
        },
        'game_validations': [r.to_dict() for r in validation_reports],
        'stadium_issues': stadium_issues,
    }

    with open(report_path, 'w') as f:
        json.dump(full_report, f, indent=2)

    # =========================================================================
    # FINAL SUMMARY
    # =========================================================================

    duration = (datetime.now() - start_time).total_seconds()

    print_header("PIPELINE COMPLETE")
    print()
    print(f" Duration: {duration:.1f} seconds")
    print(f" Games: {len(all_games):,}")
    print(f" Stadiums: {len(all_stadiums)}")
    print(f" Output: {output_dir.absolute()}")
    print()

    for sport_name, count in sorted(games_by_sport.items()):
        print(f" {sport_name}: {count:,} games")

    print()
    print(f" Reports saved to:")
    print(f" - {output_dir / 'games.json'}")
    print(f" - {output_dir / 'stadiums.json'}")
    print(f" - {output_dir / 'pipeline_report.json'}")
    print()

    # Status indicator
    if high_count > 0:
        print(" ⚠️ STATUS: Review required - high severity issues found")
    elif medium_count > 0:
        print(" ✓ STATUS: Complete with warnings")
    else:
        print(" ✅ STATUS: All checks passed")

    print()

    return PipelineResult(
        success=high_count == 0,
        games_scraped=len(all_games),
        stadiums_scraped=len(all_stadiums),
        games_by_sport=games_by_sport,
        validation_reports=validation_reports,
        stadium_issues=stadium_issues,
        high_severity_count=high_count,
        medium_severity_count=medium_count,
        low_severity_count=low_count,
        output_dir=output_dir,
        duration_seconds=duration,
    )
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, run the pipeline, and exit with
    status 1 when any high-severity issues were found (0 otherwise)."""
    parser = argparse.ArgumentParser(
        description='SportsTime Data Pipeline - Fetch, validate, and report on sports data',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python run_pipeline.py                 # Full pipeline
  python run_pipeline.py --season 2026   # Different season
  python run_pipeline.py --sport mlb     # MLB only
  python run_pipeline.py --skip-scrape   # Validate existing data
  python run_pipeline.py --verbose       # Show all issues
"""
    )

    parser.add_argument(
        '--season', type=int, default=2025,
        help='Season year (default: 2025)'
    )
    parser.add_argument(
        '--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], default='all',
        help='Sport to process (default: all)'
    )
    parser.add_argument(
        '--output', type=str, default='./data',
        help='Output directory (default: ./data)'
    )
    parser.add_argument(
        '--skip-scrape', action='store_true',
        help='Skip scraping, validate existing data only'
    )
    parser.add_argument(
        '--no-validate', action='store_true',
        help='Skip validation step'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output with all issues'
    )

    args = parser.parse_args()

    # The CLI exposes --no-validate, but run_pipeline takes the positive
    # flag, hence the inversion here.
    result = run_pipeline(
        season=args.season,
        sport=args.sport,
        output_dir=Path(args.output),
        skip_scrape=args.skip_scrape,
        validate=not args.no_validate,
        verbose=args.verbose,
    )

    # Exit with error code if high severity issues
    sys.exit(0 if result.success else 1)


if __name__ == '__main__':
    main()
|