#!/usr/bin/env python3
"""
Canonical Data Validation for SportsTime
=========================================
Stage 4 of the canonicalization pipeline.

Validates all canonical data before CloudKit upload.
Reports STATUS: FAILED if any ERROR-level issues are found; with --strict
the process additionally exits with code 1 in that case.

Usage:
    python validate_canonical.py --data-dir data/
    python validate_canonical.py --stadiums data/stadiums_canonical.json \\
        --teams data/teams_canonical.json --games data/games_canonical.json
"""

import argparse
import json
import sys
from collections import defaultdict
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional


# =============================================================================
# DATA CLASSES
# =============================================================================

@dataclass
class ValidationError:
    """A validation error or warning."""
    severity: str  # 'error', 'warning'
    category: str
    message: str
    details: Optional[dict] = None


@dataclass
class ValidationResult:
    """Overall validation result."""
    is_valid: bool
    error_count: int
    warning_count: int
    errors: list  # list of ValidationError converted to dicts via asdict()
    summary: dict


# =============================================================================
# EXPECTED GAME COUNTS
# =============================================================================

# Per-team game-count bounds, keyed by the sport segment of a team ID.
EXPECTED_GAMES = {
    'nba': {
        'expected': 82,
        'min': 75,
        'max': 90,
        'description': 'NBA regular season (82 games)'
    },
    'nhl': {
        'expected': 82,
        'min': 75,
        'max': 90,
        'description': 'NHL regular season (82 games)'
    },
    'mlb': {
        'expected': 162,
        'min': 155,
        'max': 168,
        'description': 'MLB regular season (162 games)'
    },
}


# =============================================================================
# PRIVATE HELPERS
# =============================================================================

def _sport_from_team_id(team_id: object) -> Optional[str]:
    """Return the second '_'-separated segment of a team ID, or None.

    Team IDs are expected to look like ``team_{sport}_{abbrev}``. Non-string
    values (e.g. a JSON null) and IDs without a second segment yield None
    instead of raising.
    """
    if not isinstance(team_id, str):
        return None
    parts = team_id.split('_')
    return parts[1] if len(parts) >= 2 else None


def _duplicate_id_errors(records: list[dict], label: str) -> list[ValidationError]:
    """Return one 'duplicate_id' error per repeated canonical_id in records."""
    errors = []
    seen = set()
    for record in records:
        canonical_id = record.get('canonical_id', '')
        if canonical_id in seen:
            errors.append(ValidationError(
                severity='error',
                category='duplicate_id',
                message=f'Duplicate {label} canonical_id: {canonical_id}'
            ))
        seen.add(canonical_id)
    return errors


def _stadium_reference_error(
    owner: str,
    stadium_id: object,
    stadium_ids: set[str]
) -> Optional[ValidationError]:
    """Classify a stadium reference held by ``owner`` (e.g. 'Team team_x').

    Returns an error for a missing or dangling reference, a warning for a
    ``stadium_unknown*`` placeholder, or None when the reference resolves.
    """
    if not stadium_id:
        return ValidationError(
            severity='error',
            category='missing_reference',
            message=f'{owner} has no stadium_canonical_id'
        )
    # isinstance guard: a truthy non-string ID cannot be a placeholder and is
    # reported as dangling below rather than crashing on .startswith().
    if isinstance(stadium_id, str) and stadium_id.startswith('stadium_unknown'):
        return ValidationError(
            severity='warning',
            category='unknown_stadium',
            message=f'{owner} has unknown stadium: {stadium_id}'
        )
    if stadium_id not in stadium_ids:
        return ValidationError(
            severity='error',
            category='dangling_reference',
            message=f'{owner} references unknown stadium: {stadium_id}'
        )
    return None


def _team_reference_error(
    game_id: str,
    role: str,
    team_id: object,
    team_ids: set[str]
) -> Optional[ValidationError]:
    """Classify a game's home/away team reference; ``role`` is 'home' or 'away'."""
    if not team_id:
        return ValidationError(
            severity='error',
            category='missing_reference',
            message=f'Game {game_id} has no {role}_team_canonical_id'
        )
    if team_id not in team_ids:
        return ValidationError(
            severity='error',
            category='dangling_reference',
            message=f'Game {game_id} references unknown {role} team: {team_id}'
        )
    return None


def _missing_field_errors(
    records: list[dict],
    required: list[str],
    label: str
) -> list[ValidationError]:
    """Return a 'missing_field' error for each absent or None required field."""
    return [
        ValidationError(
            severity='error',
            category='missing_field',
            message=f'{label} {record.get("canonical_id", "unknown")} missing required field: {field}'
        )
        for record in records
        for field in required
        if record.get(field) is None
    ]


def _load_json(path: Path):
    """Load a JSON document from ``path`` (JSON text is UTF-8)."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def _print_issue_list(title: str, issues: list[dict], noun: str, limit: int = 20) -> None:
    """Print up to ``limit`` issues under ``title`` plus a count of the rest."""
    print()
    print(title)
    for issue in issues[:limit]:
        print(f" [{issue['category']}] {issue['message']}")
    if len(issues) > limit:
        print(f" ... and {len(issues) - limit} more {noun}")


# =============================================================================
# VALIDATION FUNCTIONS
# =============================================================================

def validate_no_duplicate_ids(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict]
) -> list[ValidationError]:
    """Check for duplicate canonical IDs.

    Each collection is checked independently; a record without a
    canonical_id is treated as having the ID ''.
    """
    return (
        _duplicate_id_errors(stadiums, 'stadium')
        + _duplicate_id_errors(teams, 'team')
        + _duplicate_id_errors(games, 'game')
    )


def validate_team_stadium_references(
    teams: list[dict],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """Validate that all teams reference valid stadiums."""
    errors = []
    for team in teams:
        canonical_id = team.get('canonical_id', '')
        issue = _stadium_reference_error(
            f'Team {canonical_id}',
            team.get('stadium_canonical_id', ''),
            stadium_ids,
        )
        if issue is not None:
            errors.append(issue)
    return errors


def validate_game_references(
    games: list[dict],
    team_ids: set[str],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """Validate that all games reference valid teams and stadiums.

    For each game, issues are emitted in the order home team, away team,
    stadium.
    """
    errors = []
    for game in games:
        canonical_id = game.get('canonical_id', '')
        candidates = (
            _team_reference_error(
                canonical_id, 'home',
                game.get('home_team_canonical_id', ''), team_ids),
            _team_reference_error(
                canonical_id, 'away',
                game.get('away_team_canonical_id', ''), team_ids),
            _stadium_reference_error(
                f'Game {canonical_id}',
                game.get('stadium_canonical_id', ''), stadium_ids),
        )
        errors.extend(issue for issue in candidates if issue is not None)
    return errors


def validate_no_cross_sport_references(games: list[dict]) -> list[ValidationError]:
    """Validate that games don't have cross-sport team references.

    The sport is taken from each team ID (format: team_{sport}_{abbrev}) and
    compared with the game's lower-cased 'sport' value. A missing or
    non-string sport is treated as ''; team IDs from which no sport can be
    extracted are skipped.
    """
    errors = []
    for game in games:
        canonical_id = game.get('canonical_id', '')
        raw_sport = game.get('sport', '')
        # A JSON null (or other non-string) sport must not crash .lower().
        game_sport = raw_sport.lower() if isinstance(raw_sport, str) else ''

        for role in ('home', 'away'):
            team_sport = _sport_from_team_id(
                game.get(f'{role}_team_canonical_id', ''))
            if team_sport and team_sport != game_sport:
                errors.append(ValidationError(
                    severity='error',
                    category='cross_sport',
                    message=f'Game {canonical_id} ({game_sport}) has cross-sport {role} team ({team_sport})'
                ))
    return errors


def validate_stadium_aliases(
    aliases: list[dict],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """Validate that all stadium aliases reference valid stadiums."""
    errors = []
    for alias in aliases:
        alias_name = alias.get('alias_name', '')
        stadium_id = alias.get('stadium_canonical_id', '')

        if not stadium_id:
            errors.append(ValidationError(
                severity='error',
                category='missing_reference',
                message=f'Stadium alias "{alias_name}" has no stadium_canonical_id'
            ))
        elif stadium_id not in stadium_ids:
            errors.append(ValidationError(
                severity='error',
                category='dangling_reference',
                message=f'Stadium alias "{alias_name}" references unknown stadium: {stadium_id}'
            ))
    return errors


def validate_game_counts_per_team(games: list[dict]) -> list[ValidationError]:
    """Validate that each team has expected number of games.

    Only teams whose ID carries a sport listed in EXPECTED_GAMES are checked;
    out-of-range counts produce warnings, not errors.
    """
    # Count home and away appearances per team. Empty or non-string IDs are
    # skipped: they can never map to a sport, and a None ID would otherwise
    # crash the sport extraction below.
    team_game_counts = defaultdict(int)
    for game in games:
        for key in ('home_team_canonical_id', 'away_team_canonical_id'):
            team_id = game.get(key, '')
            if isinstance(team_id, str) and team_id:
                team_game_counts[team_id] += 1

    errors = []
    for team_id, count in team_game_counts.items():
        expected = EXPECTED_GAMES.get(_sport_from_team_id(team_id))
        if expected is None:
            continue

        if count < expected['min']:
            errors.append(ValidationError(
                severity='warning',
                category='game_count',
                message=f'Team {team_id} has only {count} games (expected ~{expected["expected"]})',
                details={'count': count, 'expected': expected['expected'], 'min': expected['min']}
            ))
        elif count > expected['max']:
            errors.append(ValidationError(
                severity='warning',
                category='game_count',
                message=f'Team {team_id} has {count} games (expected ~{expected["expected"]})',
                details={'count': count, 'expected': expected['expected'], 'max': expected['max']}
            ))
    return errors


def validate_required_fields(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict]
) -> list[ValidationError]:
    """Validate that required fields are present.

    A field counts as missing when the key is absent or its value is None.
    """
    stadium_required = ['canonical_id', 'name', 'sport', 'latitude', 'longitude']
    team_required = ['canonical_id', 'name', 'abbreviation', 'sport', 'stadium_canonical_id']
    game_required = ['canonical_id', 'sport', 'date', 'home_team_canonical_id',
                     'away_team_canonical_id', 'stadium_canonical_id']

    return (
        _missing_field_errors(stadiums, stadium_required, 'Stadium')
        + _missing_field_errors(teams, team_required, 'Team')
        + _missing_field_errors(games, game_required, 'Game')
    )


# =============================================================================
# MAIN VALIDATION
# =============================================================================

def validate_canonical_data(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict],
    stadium_aliases: list[dict],
    verbose: bool = False
) -> ValidationResult:
    """
    Stage 4: Validate all canonical data.

    Runs all validation checks and returns results.

    Args:
        stadiums: List of canonical stadium dicts
        teams: List of canonical team dicts
        games: List of canonical game dicts
        stadium_aliases: List of stadium alias dicts
        verbose: Print detailed progress

    Returns:
        ValidationResult with is_valid, error/warning counts, and error list
    """
    # Build ID sets for reference checking
    stadium_ids = {s.get('canonical_id', '') for s in stadiums}
    team_ids = {t.get('canonical_id', '') for t in teams}

    # (progress message, issue label, check) in execution order.
    checks = (
        (" Checking for duplicate IDs...", "duplicate ID",
         lambda: validate_no_duplicate_ids(stadiums, teams, games)),
        (" Checking required fields...", "missing field",
         lambda: validate_required_fields(stadiums, teams, games)),
        (" Checking team -> stadium references...", "team-stadium reference",
         lambda: validate_team_stadium_references(teams, stadium_ids)),
        (" Checking game -> team/stadium references...", "game reference",
         lambda: validate_game_references(games, team_ids, stadium_ids)),
        (" Checking for cross-sport references...", "cross-sport reference",
         lambda: validate_no_cross_sport_references(games)),
        (" Checking stadium alias references...", "stadium alias",
         lambda: validate_stadium_aliases(stadium_aliases, stadium_ids)),
        (" Checking game counts per team...", "game count",
         lambda: validate_game_counts_per_team(games)),
    )

    print("Running validation checks...")

    all_errors = []
    for progress, label, run_check in checks:
        if verbose:
            print(progress)
        found = run_check()
        all_errors.extend(found)
        if verbose and found:
            print(f" Found {len(found)} {label} issues")

    # Count by severity
    error_count = sum(1 for e in all_errors if e.severity == 'error')
    warning_count = sum(1 for e in all_errors if e.severity == 'warning')

    # Count by category
    by_category = defaultdict(int)
    for e in all_errors:
        by_category[e.category] += 1

    return ValidationResult(
        # No errors = valid; warnings alone do not fail validation.
        is_valid=error_count == 0,
        error_count=error_count,
        warning_count=warning_count,
        errors=[asdict(e) for e in all_errors],
        summary={
            'stadiums': len(stadiums),
            'teams': len(teams),
            'games': len(games),
            'aliases': len(stadium_aliases),
            'by_category': dict(by_category)
        }
    )


# =============================================================================
# MAIN
# =============================================================================

def main():
    """Command-line entry point: load the JSON inputs, validate, and report.

    With --data-dir the four standard file names inside that directory are
    used; otherwise each path comes from its own option or a ./data default.
    The aliases file is optional. Exits with status 1 only when --strict is
    given and validation found errors.
    """
    parser = argparse.ArgumentParser(
        description='Validate canonical data'
    )
    parser.add_argument(
        '--data-dir',
        type=str,
        default=None,
        help='Directory containing all canonical JSON files'
    )
    parser.add_argument(
        '--stadiums',
        type=str,
        default=None,
        help='Input canonical stadiums JSON file'
    )
    parser.add_argument(
        '--teams',
        type=str,
        default=None,
        help='Input canonical teams JSON file'
    )
    parser.add_argument(
        '--games',
        type=str,
        default=None,
        help='Input canonical games JSON file'
    )
    parser.add_argument(
        '--aliases',
        type=str,
        default=None,
        help='Input stadium aliases JSON file'
    )
    parser.add_argument(
        '--output',
        type=str,
        default=None,
        help='Output file for validation report'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Verbose output'
    )
    parser.add_argument(
        '--strict',
        action='store_true',
        help='Exit with error code if validation fails'
    )

    args = parser.parse_args()

    # Determine file paths
    if args.data_dir:
        data_dir = Path(args.data_dir)
        stadiums_path = data_dir / 'stadiums_canonical.json'
        teams_path = data_dir / 'teams_canonical.json'
        games_path = data_dir / 'games_canonical.json'
        aliases_path = data_dir / 'stadium_aliases.json'
    else:
        stadiums_path = Path(args.stadiums or './data/stadiums_canonical.json')
        teams_path = Path(args.teams or './data/teams_canonical.json')
        games_path = Path(args.games or './data/games_canonical.json')
        aliases_path = Path(args.aliases or './data/stadium_aliases.json')

    # Load input files
    print("Loading canonical data...")

    stadiums = _load_json(stadiums_path)
    print(f" Loaded {len(stadiums)} stadiums from {stadiums_path}")

    teams = _load_json(teams_path)
    print(f" Loaded {len(teams)} teams from {teams_path}")

    games = _load_json(games_path)
    print(f" Loaded {len(games)} games from {games_path}")

    # The aliases file is optional; validate with no aliases when absent.
    stadium_aliases = []
    if aliases_path.exists():
        stadium_aliases = _load_json(aliases_path)
        print(f" Loaded {len(stadium_aliases)} aliases from {aliases_path}")

    # Validate
    print()
    result = validate_canonical_data(
        stadiums, teams, games, stadium_aliases,
        verbose=args.verbose
    )

    # Print results
    print()
    print("=" * 60)
    print("VALIDATION RESULTS")
    print("=" * 60)
    print()

    if result.is_valid:
        print(" STATUS: PASSED")
    else:
        print(" STATUS: FAILED")

    print()
    print(f" Errors: {result.error_count}")
    print(f" Warnings: {result.warning_count}")

    print()
    print(" Data Summary:")
    print(f" Stadiums: {result.summary['stadiums']}")
    print(f" Teams: {result.summary['teams']}")
    print(f" Games: {result.summary['games']}")
    print(f" Aliases: {result.summary['aliases']}")

    if result.summary['by_category']:
        print()
        print(" Issues by Category:")
        for category, count in sorted(result.summary['by_category'].items()):
            print(f" {category}: {count}")

    # Print errors (up to 20); warnings are shown only in verbose mode.
    errors_only = [e for e in result.errors if e['severity'] == 'error']
    warnings_only = [e for e in result.errors if e['severity'] == 'warning']

    if errors_only:
        _print_issue_list(" ERRORS (must fix):", errors_only, 'errors')

    if warnings_only and args.verbose:
        _print_issue_list(" WARNINGS (informational):", warnings_only, 'warnings')

    # Export report
    if args.output:
        output_path = Path(args.output)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(asdict(result), f, indent=2)
        print()
        print(f"Report exported to {output_path}")

    # Exit code
    if args.strict and not result.is_valid:
        print()
        print("VALIDATION FAILED - Exiting with error code 1")
        # sys.exit, not the site-provided exit(), which is intended for the
        # interactive shell and is not guaranteed to exist in every runtime.
        sys.exit(1)


if __name__ == '__main__':
    main()