Sportstime/Scripts/validate_canonical.py

#!/usr/bin/env python3
"""
Canonical Data Validation for SportsTime
=========================================
Stage 4 of the canonicalization pipeline.

Validates all canonical data before CloudKit upload.
Reports STATUS: FAILED if any ERROR-level issues are found; pass --strict to
also exit with status 1 in that case.

Usage:
    python validate_canonical.py --data-dir data/
    python validate_canonical.py --stadiums data/stadiums_canonical.json \
        --teams data/teams_canonical.json --games data/games_canonical.json
"""

import argparse
import json
from collections import defaultdict
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional


# =============================================================================
# DATA CLASSES
# =============================================================================

@dataclass
class ValidationError:
    """
    A single finding (error or warning) produced by a validation check.

    Despite the name this is a plain data record, not an exception class.
    The checks in this file collect instances into lists, and
    validate_canonical_data converts them to dicts with ``asdict``.
    """
    # 'error' findings make the overall result invalid; 'warning' findings
    # are only counted.
    severity: str  # 'error', 'warning'
    # Short machine-readable bucket, e.g. 'duplicate_id' or 'game_count'.
    category: str
    # Human-readable description of the finding.
    message: str
    # Optional extra data about the finding; None when a check supplies none.
    details: Optional[dict] = None


@dataclass
class ValidationResult:
    """
    Aggregate outcome of a validation run.

    The field notes below describe how validate_canonical_data fills in each
    value.
    """
    # True when no finding has severity 'error'; warnings do not affect it.
    is_valid: bool
    # Number of findings with severity 'error'.
    error_count: int
    # Number of findings with severity 'warning'.
    warning_count: int
    # Every finding (errors and warnings) as a plain dict, in the order the
    # checks produced them.
    errors: list
    # Record counts under 'stadiums', 'teams', 'games' and 'aliases', plus
    # 'by_category' mapping each finding category to its number of findings.
    summary: dict


# =============================================================================
# EXPECTED GAME COUNTS
# =============================================================================

# Per-league bounds on how many games a single team should appear in.
# Each row is: sport key, nominal schedule length, lowest accepted count,
# highest accepted count, human-readable description. The game-count check
# warns about teams whose count falls outside [min, max].
EXPECTED_GAMES = {
    sport: {
        'expected': expected,
        'min': lowest,
        'max': highest,
        'description': description,
    }
    for sport, expected, lowest, highest, description in (
        ('nba', 82, 75, 90, 'NBA regular season (82 games)'),
        ('nhl', 82, 75, 90, 'NHL regular season (82 games)'),
        ('mlb', 162, 155, 168, 'MLB regular season (162 games)'),
        ('nfl', 17, 15, 20, 'NFL regular season (17 games)'),
        ('wnba', 40, 35, 45, 'WNBA regular season (40 games)'),
        ('mls', 34, 30, 40, 'MLS regular season (34 games)'),
        ('nwsl', 26, 22, 30, 'NWSL regular season (26 games)'),
    )
}


# =============================================================================
# VALIDATION FUNCTIONS
# =============================================================================

def validate_no_duplicate_ids(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict]
) -> list[ValidationError]:
    """
    Report canonical IDs that occur more than once within a collection.

    Stadiums, teams and games are scanned independently, in that order, so
    the same ID may appear in two different collections without being
    reported. Every occurrence after the first yields one finding. A record
    without a ``canonical_id`` key is treated as having the ID ``''``.

    Args:
        stadiums: List of canonical stadium dicts
        teams: List of canonical team dicts
        games: List of canonical game dicts

    Returns:
        'error' findings in category 'duplicate_id'; empty when every ID is
        unique within its collection
    """
    def repeated_ids(records):
        """Yield an ID each time it shows up again after its first occurrence."""
        known = set()
        for record in records:
            canonical_id = record.get('canonical_id', '')
            if canonical_id in known:
                yield canonical_id
            known.add(canonical_id)

    def duplicate(message):
        """Wrap *message* in an error-level 'duplicate_id' finding."""
        return ValidationError(
            severity='error',
            category='duplicate_id',
            message=message
        )

    # Stadiums first, then teams, then games: callers see findings in this order.
    findings = [
        duplicate(f'Duplicate stadium canonical_id: {canonical_id}')
        for canonical_id in repeated_ids(stadiums)
    ]
    findings.extend(
        duplicate(f'Duplicate team canonical_id: {canonical_id}')
        for canonical_id in repeated_ids(teams)
    )
    findings.extend(
        duplicate(f'Duplicate game canonical_id: {canonical_id}')
        for canonical_id in repeated_ids(games)
    )
    return findings


def validate_team_stadium_references(
    teams: list[dict],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """
    Check each team's ``stadium_canonical_id`` against the known stadiums.

    A team yields at most one finding:
      * empty or absent reference -> error, category 'missing_reference'
      * reference starting with 'stadium_unknown' -> warning, category
        'unknown_stadium'
      * any other reference not in ``stadium_ids`` -> error, category
        'dangling_reference'

    Args:
        teams: List of canonical team dicts
        stadium_ids: Canonical IDs of all known stadiums

    Returns:
        Findings in the same order as ``teams``
    """
    findings = []

    for team in teams:
        canonical_id = team.get('canonical_id', '')
        stadium_id = team.get('stadium_canonical_id', '')

        # The order of these tests matters: a placeholder ID is reported as a
        # warning and therefore never reaches the membership test.
        if not stadium_id:
            verdict = (
                'error',
                'missing_reference',
                f'Team {canonical_id} has no stadium_canonical_id',
            )
        elif stadium_id.startswith('stadium_unknown'):
            verdict = (
                'warning',
                'unknown_stadium',
                f'Team {canonical_id} has unknown stadium: {stadium_id}',
            )
        elif stadium_id not in stadium_ids:
            verdict = (
                'error',
                'dangling_reference',
                f'Team {canonical_id} references unknown stadium: {stadium_id}',
            )
        else:
            # Reference resolves to a known stadium: nothing to report.
            continue

        severity, category, message = verdict
        findings.append(ValidationError(
            severity=severity,
            category=category,
            message=message
        ))

    return findings


def validate_game_references(
    games: list[dict],
    team_ids: set[str],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """
    Check the home team, away team and stadium references of every game.

    For each game the three references are examined in that order. An empty
    or absent reference is an error ('missing_reference'); a reference that
    is not in the corresponding ID set is an error ('dangling_reference'). A
    stadium ID starting with 'stadium_unknown' is reported as a warning
    ('unknown_stadium') instead of being looked up.

    Args:
        games: List of canonical game dicts
        team_ids: Canonical IDs of all known teams
        stadium_ids: Canonical IDs of all known stadiums

    Returns:
        Findings grouped per game, in the same order as ``games``
    """
    findings = []

    def flag(severity, category, message):
        """Record one finding."""
        findings.append(ValidationError(
            severity=severity,
            category=category,
            message=message
        ))

    for game in games:
        canonical_id = game.get('canonical_id', '')
        home_team_id = game.get('home_team_canonical_id', '')
        away_team_id = game.get('away_team_canonical_id', '')
        stadium_id = game.get('stadium_canonical_id', '')

        # Home team
        if not home_team_id:
            flag('error', 'missing_reference',
                 f'Game {canonical_id} has no home_team_canonical_id')
        elif home_team_id not in team_ids:
            flag('error', 'dangling_reference',
                 f'Game {canonical_id} references unknown home team: {home_team_id}')

        # Away team
        if not away_team_id:
            flag('error', 'missing_reference',
                 f'Game {canonical_id} has no away_team_canonical_id')
        elif away_team_id not in team_ids:
            flag('error', 'dangling_reference',
                 f'Game {canonical_id} references unknown away team: {away_team_id}')

        # Stadium: the placeholder prefix is tested before membership, so a
        # placeholder ID is only ever a warning.
        if not stadium_id:
            flag('error', 'missing_reference',
                 f'Game {canonical_id} has no stadium_canonical_id')
        elif stadium_id.startswith('stadium_unknown'):
            flag('warning', 'unknown_stadium',
                 f'Game {canonical_id} has unknown stadium: {stadium_id}')
        elif stadium_id not in stadium_ids:
            flag('error', 'dangling_reference',
                 f'Game {canonical_id} references unknown stadium: {stadium_id}')

    return findings


def validate_no_cross_sport_references(games: list[dict]) -> list[ValidationError]:
    """
    Flag games whose team IDs embed a different sport than the game itself.

    Team IDs are expected to look like ``team_{sport}_{abbrev}``. The second
    underscore-separated segment of each team ID is compared, as-is, with the
    game's lower-cased ``sport`` value. The home team is checked before the
    away team.

    Malformed values do not crash the check. A ``sport`` that is absent,
    ``None`` (JSON ``null``) or not a string is treated as ``''``. A team ID
    that is absent, ``None``, not a string, or has no sport segment is
    skipped.

    Args:
        games: List of canonical game dicts

    Returns:
        'error' findings in category 'cross_sport', in the same order as
        ``games``
    """
    # Defined once, outside the loop, rather than rebuilt for every game.
    def get_sport_from_id(team_id) -> Optional[str]:
        """Return the sport segment of a team ID, or None if there is none."""
        if not isinstance(team_id, str):
            return None
        parts = team_id.split('_')
        return parts[1] if len(parts) >= 2 else None

    errors = []
    sides = (
        ('home', 'home_team_canonical_id'),
        ('away', 'away_team_canonical_id'),
    )

    for game in games:
        canonical_id = game.get('canonical_id', '')
        raw_sport = game.get('sport')
        # JSON null arrives as None; calling .lower() on it would raise.
        game_sport = raw_sport.lower() if isinstance(raw_sport, str) else ''

        for side, key in sides:
            team_sport = get_sport_from_id(game.get(key))
            if team_sport and team_sport != game_sport:
                errors.append(ValidationError(
                    severity='error',
                    category='cross_sport',
                    message=(
                        f'Game {canonical_id} ({game_sport}) has cross-sport '
                        f'{side} team ({team_sport})'
                    )
                ))

    return errors


def validate_stadium_aliases(
    aliases: list[dict],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """
    Check that every stadium alias points at a known stadium.

    An alias with an empty or absent ``stadium_canonical_id`` is reported as
    'missing_reference'; one whose ID is not in ``stadium_ids`` is reported
    as 'dangling_reference'. Both are errors.

    Args:
        aliases: List of stadium alias dicts
        stadium_ids: Canonical IDs of all known stadiums

    Returns:
        Findings in the same order as ``aliases``
    """
    findings = []

    for alias in aliases:
        alias_name = alias.get('alias_name', '')
        stadium_id = alias.get('stadium_canonical_id', '')

        # Resolvable reference: nothing to report for this alias.
        if stadium_id and stadium_id in stadium_ids:
            continue

        if stadium_id:
            category = 'dangling_reference'
            message = f'Stadium alias "{alias_name}" references unknown stadium: {stadium_id}'
        else:
            category = 'missing_reference'
            message = f'Stadium alias "{alias_name}" has no stadium_canonical_id'

        findings.append(ValidationError(
            severity='error',
            category=category,
            message=message
        ))

    return findings


def validate_game_counts_per_team(games: list[dict]) -> list[ValidationError]:
    """
    Warn about teams whose number of games is outside the expected range.

    Each game counts once for its home team and once for its away team. The
    sport is read from the second underscore-separated segment of the team
    ID and looked up in EXPECTED_GAMES. Teams whose sport is not listed there
    are ignored.

    Team IDs that are absent, empty, ``None`` (JSON ``null``) or not strings
    are left out of the tally, so they cannot crash the check.

    Only teams that appear in at least one game are examined; a team with no
    games at all produces no warning here.

    Args:
        games: List of canonical game dicts

    Returns:
        'warning' findings in category 'game_count', each with a ``details``
        dict holding the count, the expected value and the violated bound
    """
    errors = []

    # Count games per team
    team_game_counts = defaultdict(int)
    for game in games:
        for key in ('home_team_canonical_id', 'away_team_canonical_id'):
            team_id = game.get(key)
            # Skip IDs that cannot be parsed for a sport; a None key would
            # otherwise raise on .split() below.
            if isinstance(team_id, str) and team_id:
                team_game_counts[team_id] += 1

    # Check against expected counts
    for team_id, count in team_game_counts.items():
        parts = team_id.split('_')
        if len(parts) < 2:
            continue

        expected = EXPECTED_GAMES.get(parts[1])
        if expected is None:
            continue

        if count < expected['min']:
            errors.append(ValidationError(
                severity='warning',
                category='game_count',
                message=f'Team {team_id} has only {count} games (expected ~{expected["expected"]})',
                details={'count': count, 'expected': expected['expected'], 'min': expected['min']}
            ))
        elif count > expected['max']:
            errors.append(ValidationError(
                severity='warning',
                category='game_count',
                message=f'Team {team_id} has {count} games (expected ~{expected["expected"]})',
                details={'count': count, 'expected': expected['expected'], 'max': expected['max']}
            ))

    return errors


def validate_required_fields(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict]
) -> list[ValidationError]:
    """
    Report required fields that are absent or set to ``None``.

    A field counts as present for any value other than ``None``, including
    an empty string. One finding is produced per missing field per record.
    Stadiums are checked first, then teams, then games.

    Args:
        stadiums: List of canonical stadium dicts
        teams: List of canonical team dicts
        games: List of canonical game dicts

    Returns:
        'error' findings in category 'missing_field'
    """
    stadium_required = ('canonical_id', 'name', 'sport', 'latitude', 'longitude')
    team_required = ('canonical_id', 'name', 'abbreviation', 'sport', 'stadium_canonical_id')
    game_required = (
        'canonical_id', 'sport', 'date',
        'home_team_canonical_id', 'away_team_canonical_id', 'stadium_canonical_id',
    )

    def absent(record, required):
        """List the names in *required* that *record* lacks or holds as None."""
        return [name for name in required if record.get(name) is None]

    def missing(message):
        """Wrap *message* in an error-level 'missing_field' finding."""
        return ValidationError(
            severity='error',
            category='missing_field',
            message=message
        )

    findings = []

    for s in stadiums:
        findings.extend(
            missing(f'Stadium {s.get("canonical_id", "unknown")} missing required field: {field}')
            for field in absent(s, stadium_required)
        )

    for t in teams:
        findings.extend(
            missing(f'Team {t.get("canonical_id", "unknown")} missing required field: {field}')
            for field in absent(t, team_required)
        )

    for g in games:
        findings.extend(
            missing(f'Game {g.get("canonical_id", "unknown")} missing required field: {field}')
            for field in absent(g, game_required)
        )

    return findings


# =============================================================================
# MAIN VALIDATION
# =============================================================================

def validate_canonical_data(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict],
    stadium_aliases: list[dict],
    verbose: bool = False
) -> ValidationResult:
    """
    Stage 4: run every validation check and aggregate the findings.

    The checks run in a fixed order: duplicate IDs, required fields,
    team -> stadium references, game -> team/stadium references, cross-sport
    references, stadium aliases, game counts per team. Their findings are
    concatenated in that order. A status line is always printed; per-check
    progress is printed only when ``verbose`` is set.

    Args:
        stadiums: List of canonical stadium dicts
        teams: List of canonical team dicts
        games: List of canonical game dicts
        stadium_aliases: List of stadium alias dicts
        verbose: Print detailed progress

    Returns:
        ValidationResult whose ``is_valid`` is True when no finding has
        severity 'error' (warnings are allowed), with error/warning counts,
        all findings as dicts, and a summary of record and category counts
    """
    all_errors = []

    def progress(text):
        """Print *text* only in verbose mode."""
        if verbose:
            print(text)

    def collect(found, report_line):
        """Keep the findings of one check and, if any, announce them."""
        all_errors.extend(found)
        if found:
            progress(report_line)

    # Build ID sets for reference checking
    stadium_ids = {s.get('canonical_id', '') for s in stadiums}
    team_ids = {t.get('canonical_id', '') for t in teams}

    print("Running validation checks...")

    # 1. Duplicate IDs
    progress("  Checking for duplicate IDs...")
    errors = validate_no_duplicate_ids(stadiums, teams, games)
    collect(errors, f"    Found {len(errors)} duplicate ID issues")

    # 2. Required fields
    progress("  Checking required fields...")
    errors = validate_required_fields(stadiums, teams, games)
    collect(errors, f"    Found {len(errors)} missing field issues")

    # 3. Team -> Stadium references
    progress("  Checking team -> stadium references...")
    errors = validate_team_stadium_references(teams, stadium_ids)
    collect(errors, f"    Found {len(errors)} team-stadium reference issues")

    # 4. Game -> Team/Stadium references
    progress("  Checking game -> team/stadium references...")
    errors = validate_game_references(games, team_ids, stadium_ids)
    collect(errors, f"    Found {len(errors)} game reference issues")

    # 5. Cross-sport references
    progress("  Checking for cross-sport references...")
    errors = validate_no_cross_sport_references(games)
    collect(errors, f"    Found {len(errors)} cross-sport reference issues")

    # 6. Stadium aliases
    progress("  Checking stadium alias references...")
    errors = validate_stadium_aliases(stadium_aliases, stadium_ids)
    collect(errors, f"    Found {len(errors)} stadium alias issues")

    # 7. Game counts per team
    progress("  Checking game counts per team...")
    errors = validate_game_counts_per_team(games)
    collect(errors, f"    Found {len(errors)} game count issues")

    # Tally by severity and by category in a single pass
    by_severity = defaultdict(int)
    by_category = defaultdict(int)
    for finding in all_errors:
        by_severity[finding.severity] += 1
        by_category[finding.category] += 1

    error_count = by_severity['error']
    warning_count = by_severity['warning']

    summary = {
        'stadiums': len(stadiums),
        'teams': len(teams),
        'games': len(games),
        'aliases': len(stadium_aliases),
        'by_category': dict(by_category)
    }

    # Warnings are tolerated; only errors invalidate the data set.
    return ValidationResult(
        is_valid=(error_count == 0),
        error_count=error_count,
        warning_count=warning_count,
        errors=[asdict(finding) for finding in all_errors],
        summary=summary
    )


# =============================================================================
# MAIN
# =============================================================================

def _load_json(path):
    """Parse the JSON document stored at *path*, decoding it as UTF-8."""
    # An explicit encoding keeps decoding independent of the platform locale.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def _print_findings(heading, findings, noun, limit=20):
    """Print *heading*, up to *limit* findings, and how many were left out."""
    print()
    print(heading)
    for e in findings[:limit]:
        print(f"    [{e['category']}] {e['message']}")
    if len(findings) > limit:
        print(f"    ... and {len(findings) - limit} more {noun}")


def main():
    """
    Command-line entry point: load canonical JSON, validate it, print a report.

    Input files come from --data-dir (fixed file names inside that directory)
    or, when --data-dir is absent, from --stadiums/--teams/--games/--aliases,
    each falling back to a path under ./data/. The aliases file is optional;
    the other three must exist. With --output, the full result is also
    written to that file as JSON. Warnings are listed only with --verbose.

    Raises:
        SystemExit: With code 1 when --strict is given and at least one
            'error' finding exists. Without --strict the function returns
            normally even if validation failed.
    """
    parser = argparse.ArgumentParser(
        description='Validate canonical data'
    )
    parser.add_argument(
        '--data-dir', type=str, default=None,
        help='Directory containing all canonical JSON files'
    )
    parser.add_argument(
        '--stadiums', type=str, default=None,
        help='Input canonical stadiums JSON file'
    )
    parser.add_argument(
        '--teams', type=str, default=None,
        help='Input canonical teams JSON file'
    )
    parser.add_argument(
        '--games', type=str, default=None,
        help='Input canonical games JSON file'
    )
    parser.add_argument(
        '--aliases', type=str, default=None,
        help='Input stadium aliases JSON file'
    )
    parser.add_argument(
        '--output', type=str, default=None,
        help='Output file for validation report'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )
    parser.add_argument(
        '--strict', action='store_true',
        help='Exit with error code if validation fails'
    )

    args = parser.parse_args()

    # Determine file paths. When --data-dir is given it takes precedence and
    # the individual file options are not consulted.
    if args.data_dir:
        data_dir = Path(args.data_dir)
        stadiums_path = data_dir / 'stadiums_canonical.json'
        teams_path = data_dir / 'teams_canonical.json'
        games_path = data_dir / 'games_canonical.json'
        aliases_path = data_dir / 'stadium_aliases.json'
    else:
        stadiums_path = Path(args.stadiums or './data/stadiums_canonical.json')
        teams_path = Path(args.teams or './data/teams_canonical.json')
        games_path = Path(args.games or './data/games_canonical.json')
        aliases_path = Path(args.aliases or './data/stadium_aliases.json')

    # Load input files
    print("Loading canonical data...")

    stadiums = _load_json(stadiums_path)
    print(f"  Loaded {len(stadiums)} stadiums from {stadiums_path}")

    teams = _load_json(teams_path)
    print(f"  Loaded {len(teams)} teams from {teams_path}")

    games = _load_json(games_path)
    print(f"  Loaded {len(games)} games from {games_path}")

    # Aliases are optional: a missing file simply means no aliases to check.
    stadium_aliases = []
    if aliases_path.exists():
        stadium_aliases = _load_json(aliases_path)
        print(f"  Loaded {len(stadium_aliases)} aliases from {aliases_path}")

    # Validate
    print()
    result = validate_canonical_data(
        stadiums, teams, games, stadium_aliases,
        verbose=args.verbose
    )

    # Print results
    print()
    print("=" * 60)
    print("VALIDATION RESULTS")
    print("=" * 60)
    print()

    if result.is_valid:
        print("  STATUS: PASSED")
    else:
        print("  STATUS: FAILED")

    print()
    print(f"  Errors:   {result.error_count}")
    print(f"  Warnings: {result.warning_count}")
    print()
    print("  Data Summary:")
    print(f"    Stadiums: {result.summary['stadiums']}")
    print(f"    Teams:    {result.summary['teams']}")
    print(f"    Games:    {result.summary['games']}")
    print(f"    Aliases:  {result.summary['aliases']}")

    if result.summary['by_category']:
        print()
        print("  Issues by Category:")
        for category, count in sorted(result.summary['by_category'].items()):
            print(f"    {category}: {count}")

    # Print findings (up to 20 of each severity)
    if result.errors:
        errors_only = [e for e in result.errors if e['severity'] == 'error']
        warnings_only = [e for e in result.errors if e['severity'] == 'warning']

        if errors_only:
            _print_findings("  ERRORS (must fix):", errors_only, "errors")

        if warnings_only and args.verbose:
            _print_findings("  WARNINGS (informational):", warnings_only, "warnings")

    # Export report
    if args.output:
        output_path = Path(args.output)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(asdict(result), f, indent=2)
        print()
        print(f"Report exported to {output_path}")

    # Exit code
    if args.strict and not result.is_valid:
        print()
        print("VALIDATION FAILED - Exiting with error code 1")
        # The exit() builtin is injected by the site module for interactive
        # use; raising SystemExit works in every run mode.
        raise SystemExit(1)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()