Files
Sportstime/Scripts/validate_canonical.py
2026-01-10 11:16:15 -06:00

661 lines
21 KiB
Python

#!/usr/bin/env python3
"""
Canonical Data Validation for SportsTime
=========================================
Stage 4 of the canonicalization pipeline.
Validates all canonical data before CloudKit upload.
Reports FAILED if any ERROR-level issues are found; with --strict the script also exits with status 1.
Usage:
python validate_canonical.py --data-dir data/
python validate_canonical.py --stadiums data/stadiums_canonical.json \
--teams data/teams_canonical.json --games data/games_canonical.json
"""
import argparse
import json
from collections import defaultdict
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class ValidationError:
    """A single issue reported by one of the validation checks.

    This is a plain data record, not an exception class; checks collect
    instances in lists rather than raising them.
    """

    # Either 'error' or 'warning'.
    severity: str
    # Short machine-friendly grouping key, e.g. 'duplicate_id'.
    category: str
    # Human-readable description of the issue.
    message: str
    # Optional structured data accompanying the message.
    details: Optional[dict] = None
@dataclass
class ValidationResult:
    """Aggregate outcome of a full validation run."""

    # True when no 'error'-severity issues were found.
    is_valid: bool
    # Number of issues with severity 'error'.
    error_count: int
    # Number of issues with severity 'warning'.
    warning_count: int
    # Every issue found, errors and warnings alike.
    errors: list
    # Record counts and per-category issue counts.
    summary: dict
# =============================================================================
# EXPECTED GAME COUNTS
# =============================================================================
# Per-team regular-season game expectations, keyed by lowercase sport code.
# 'expected' is the nominal schedule length; 'min' and 'max' bound the range
# of per-team game counts that is accepted without a warning.
EXPECTED_GAMES = {
    'nba': dict(
        expected=82, min=75, max=90,
        description='NBA regular season (82 games)',
    ),
    'nhl': dict(
        expected=82, min=75, max=90,
        description='NHL regular season (82 games)',
    ),
    'mlb': dict(
        expected=162, min=155, max=168,
        description='MLB regular season (162 games)',
    ),
    'nfl': dict(
        expected=17, min=15, max=20,
        description='NFL regular season (17 games)',
    ),
    'wnba': dict(
        expected=40, min=35, max=45,
        description='WNBA regular season (40 games)',
    ),
    'mls': dict(
        expected=34, min=30, max=40,
        description='MLS regular season (34 games)',
    ),
    'nwsl': dict(
        expected=26, min=22, max=30,
        description='NWSL regular season (26 games)',
    ),
}
# =============================================================================
# VALIDATION FUNCTIONS
# =============================================================================
def validate_no_duplicate_ids(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict]
) -> list[ValidationError]:
    """Report every repeated ``canonical_id`` within each record collection.

    Stadiums, teams and games are checked independently, in that order, so
    the same ID appearing in two different collections is not a duplicate.
    A record without ``canonical_id`` is treated as having the ID ``''``.
    Each repeat after the first occurrence yields one 'error' issue.
    """
    issues = []
    collections_to_check = (
        ('stadium', stadiums),
        ('team', teams),
        ('game', games),
    )
    for label, records in collections_to_check:
        already_seen = set()
        for record in records:
            record_id = record.get('canonical_id', '')
            if record_id not in already_seen:
                already_seen.add(record_id)
                continue
            issues.append(ValidationError(
                severity='error',
                category='duplicate_id',
                message=f'Duplicate {label} canonical_id: {record_id}'
            ))
    return issues
def validate_team_stadium_references(
    teams: list[dict],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """Check each team's ``stadium_canonical_id`` against the known stadiums.

    At most one issue is produced per team, chosen in this order: an empty
    or absent reference is an error; a reference starting with
    ``stadium_unknown`` is a warning; any other reference missing from
    ``stadium_ids`` is an error.
    """
    issues = []
    for team in teams:
        team_id = team.get('canonical_id', '')
        stadium_ref = team.get('stadium_canonical_id', '')

        if not stadium_ref:
            finding = (
                'error', 'missing_reference',
                f'Team {team_id} has no stadium_canonical_id',
            )
        elif stadium_ref.startswith('stadium_unknown'):
            finding = (
                'warning', 'unknown_stadium',
                f'Team {team_id} has unknown stadium: {stadium_ref}',
            )
        elif stadium_ref not in stadium_ids:
            finding = (
                'error', 'dangling_reference',
                f'Team {team_id} references unknown stadium: {stadium_ref}',
            )
        else:
            # Reference resolves to a known stadium.
            continue

        severity, category, message = finding
        issues.append(ValidationError(
            severity=severity,
            category=category,
            message=message
        ))
    return issues
def validate_game_references(
    games: list[dict],
    team_ids: set[str],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """Check each game's home team, away team and stadium references.

    For every game the home team is checked first, then the away team, then
    the stadium. An empty or absent reference is an error, and so is a
    reference missing from the corresponding ID set. A stadium reference
    starting with ``stadium_unknown`` is reported as a warning instead.
    """
    issues = []

    def report(severity, category, message):
        issues.append(ValidationError(
            severity=severity,
            category=category,
            message=message
        ))

    team_fields = (
        ('home', 'home_team_canonical_id'),
        ('away', 'away_team_canonical_id'),
    )
    for game in games:
        game_id = game.get('canonical_id', '')

        for side, key in team_fields:
            team_ref = game.get(key, '')
            if not team_ref:
                report('error', 'missing_reference',
                       f'Game {game_id} has no {key}')
            elif team_ref not in team_ids:
                report('error', 'dangling_reference',
                       f'Game {game_id} references unknown {side} team: {team_ref}')

        stadium_ref = game.get('stadium_canonical_id', '')
        if not stadium_ref:
            report('error', 'missing_reference',
                   f'Game {game_id} has no stadium_canonical_id')
        elif stadium_ref.startswith('stadium_unknown'):
            report('warning', 'unknown_stadium',
                   f'Game {game_id} has unknown stadium: {stadium_ref}')
        elif stadium_ref not in stadium_ids:
            report('error', 'dangling_reference',
                   f'Game {game_id} references unknown stadium: {stadium_ref}')
    return issues
def validate_no_cross_sport_references(games: list[dict]) -> list[ValidationError]:
    """Report games whose team IDs name a different sport than the game.

    The sport of a team is read from the second ``_``-separated segment of
    its canonical ID (format: ``team_{sport}_{abbrev}``) and compared with
    the game's ``sport`` value, lower-cased. A team ID that is absent, null,
    not a string, or has no second segment is skipped here.

    A null ``sport`` or team ID (JSON ``null``) is treated like an absent
    one, so this check reports issues instead of raising.

    Args:
        games: Canonical game dicts.

    Returns:
        One 'error' issue per mismatching home/away team, home before away.
    """
    def get_sport_from_id(team_id) -> Optional[str]:
        # Only string IDs can be split; anything else has no sport segment.
        if not isinstance(team_id, str):
            return None
        parts = team_id.split('_')
        return parts[1] if len(parts) >= 2 else None

    team_fields = (
        ('home', 'home_team_canonical_id'),
        ('away', 'away_team_canonical_id'),
    )
    errors = []
    for game in games:
        canonical_id = game.get('canonical_id', '')
        # `or ''` covers an explicit null as well as a missing key.
        game_sport = str(game.get('sport') or '').lower()
        for side, key in team_fields:
            team_sport = get_sport_from_id(game.get(key))
            if team_sport and team_sport != game_sport:
                errors.append(ValidationError(
                    severity='error',
                    category='cross_sport',
                    message=f'Game {canonical_id} ({game_sport}) has cross-sport {side} team ({team_sport})'
                ))
    return errors
def validate_stadium_aliases(
    aliases: list[dict],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """Check that every stadium alias points at a known stadium.

    An alias with an empty or absent ``stadium_canonical_id`` is an error,
    as is one whose reference is not in ``stadium_ids``.
    """
    issues = []
    for entry in aliases:
        name = entry.get('alias_name', '')
        target = entry.get('stadium_canonical_id', '')

        if target and target in stadium_ids:
            # Alias resolves; nothing to report.
            continue

        if target:
            category = 'dangling_reference'
            message = f'Stadium alias "{name}" references unknown stadium: {target}'
        else:
            category = 'missing_reference'
            message = f'Stadium alias "{name}" has no stadium_canonical_id'

        issues.append(ValidationError(
            severity='error',
            category=category,
            message=message
        ))
    return issues
def validate_game_counts_per_team(games: list[dict]) -> list[ValidationError]:
    """Warn about teams whose game count is outside the expected range.

    Each game counts once for its home team and once for its away team.
    The sport is taken from the second ``_``-separated segment of the team
    ID and looked up in ``EXPECTED_GAMES``; teams whose sport is not listed
    there are ignored. Teams that appear in no game are never counted, so
    they are not reported by this check.

    Team IDs that are absent, null, empty or not strings are left out of the
    tally, so malformed records cannot make this check raise.

    Args:
        games: Canonical game dicts.

    Returns:
        One 'warning' issue per team below ``min`` or above ``max``.
    """
    errors = []

    # Count games per team, home and away alike.
    team_game_counts = defaultdict(int)
    for game in games:
        for key in ('home_team_canonical_id', 'away_team_canonical_id'):
            team_id = game.get(key)
            # Only non-empty string IDs carry a sport segment to check.
            if isinstance(team_id, str) and team_id:
                team_game_counts[team_id] += 1

    # Compare each tally with the expected range for the team's sport.
    for team_id, count in team_game_counts.items():
        parts = team_id.split('_')
        if len(parts) < 2:
            continue
        expected = EXPECTED_GAMES.get(parts[1])
        if expected is None:
            continue
        if count < expected['min']:
            errors.append(ValidationError(
                severity='warning',
                category='game_count',
                message=f'Team {team_id} has only {count} games (expected ~{expected["expected"]})',
                details={'count': count, 'expected': expected['expected'], 'min': expected['min']}
            ))
        elif count > expected['max']:
            errors.append(ValidationError(
                severity='warning',
                category='game_count',
                message=f'Team {team_id} has {count} games (expected ~{expected["expected"]})',
                details={'count': count, 'expected': expected['expected'], 'max': expected['max']}
            ))
    return errors
def validate_required_fields(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict]
) -> list[ValidationError]:
    """Report records that lack a required field.

    A field counts as missing when its key is absent or its value is
    ``None``; other falsy values such as ``0`` or ``''`` are accepted.
    Stadiums are checked first, then teams, then games, and one 'error'
    issue is produced per missing field.
    """
    requirements = (
        ('Stadium', stadiums,
         ['canonical_id', 'name', 'sport', 'latitude', 'longitude']),
        ('Team', teams,
         ['canonical_id', 'name', 'abbreviation', 'sport', 'stadium_canonical_id']),
        ('Game', games,
         ['canonical_id', 'sport', 'date', 'home_team_canonical_id',
          'away_team_canonical_id', 'stadium_canonical_id']),
    )

    issues = []
    for label, records, required_keys in requirements:
        for record in records:
            for key in required_keys:
                # .get() returns None for an absent key and for a null value.
                if record.get(key) is not None:
                    continue
                issues.append(ValidationError(
                    severity='error',
                    category='missing_field',
                    message=f'{label} {record.get("canonical_id", "unknown")} missing required field: {key}'
                ))
    return issues
# =============================================================================
# MAIN VALIDATION
# =============================================================================
def validate_canonical_data(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict],
    stadium_aliases: list[dict],
    verbose: bool = False
) -> ValidationResult:
    """
    Stage 4: run every validation check over the canonical data.

    The checks run in a fixed order: duplicate IDs, required fields,
    team -> stadium references, game -> team/stadium references,
    cross-sport references, stadium aliases, and per-team game counts.
    Their issues are concatenated in that order.

    Args:
        stadiums: List of canonical stadium dicts
        teams: List of canonical team dicts
        games: List of canonical game dicts
        stadium_aliases: List of stadium alias dicts
        verbose: Print a progress line per check and a count when it finds issues

    Returns:
        ValidationResult whose ``is_valid`` is True when no issue has
        severity 'error' (warnings do not affect validity), with the issues
        converted to plain dicts and a summary of record and category counts.
    """
    # ID sets used by the reference checks.
    stadium_ids = {s.get('canonical_id', '') for s in stadiums}
    team_ids = {t.get('canonical_id', '') for t in teams}

    print("Running validation checks...")

    # (progress line, zero-argument runner, noun used in the "Found" line)
    checks = (
        (" Checking for duplicate IDs...",
         lambda: validate_no_duplicate_ids(stadiums, teams, games),
         "duplicate ID"),
        (" Checking required fields...",
         lambda: validate_required_fields(stadiums, teams, games),
         "missing field"),
        (" Checking team -> stadium references...",
         lambda: validate_team_stadium_references(teams, stadium_ids),
         "team-stadium reference"),
        (" Checking game -> team/stadium references...",
         lambda: validate_game_references(games, team_ids, stadium_ids),
         "game reference"),
        (" Checking for cross-sport references...",
         lambda: validate_no_cross_sport_references(games),
         "cross-sport reference"),
        (" Checking stadium alias references...",
         lambda: validate_stadium_aliases(stadium_aliases, stadium_ids),
         "stadium alias"),
        (" Checking game counts per team...",
         lambda: validate_game_counts_per_team(games),
         "game count"),
    )

    all_errors = []
    for progress_line, run_check, noun in checks:
        if verbose:
            print(progress_line)
        found = run_check()
        all_errors.extend(found)
        if verbose and found:
            print(f" Found {len(found)} {noun} issues")

    # Tally severities and categories in a single pass.
    error_count = 0
    warning_count = 0
    by_category = {}
    for issue in all_errors:
        if issue.severity == 'error':
            error_count += 1
        elif issue.severity == 'warning':
            warning_count += 1
        by_category[issue.category] = by_category.get(issue.category, 0) + 1

    summary = {
        'stadiums': len(stadiums),
        'teams': len(teams),
        'games': len(games),
        'aliases': len(stadium_aliases),
        'by_category': by_category
    }

    return ValidationResult(
        # Warnings alone never invalidate the data.
        is_valid=(error_count == 0),
        error_count=error_count,
        warning_count=warning_count,
        errors=[asdict(issue) for issue in all_errors],
        summary=summary
    )
# =============================================================================
# MAIN
# =============================================================================
def _load_json(path):
    """Parse and return the JSON document stored at ``path`` (read as UTF-8)."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def main():
    """Command-line entry point: load the canonical files, validate, report.

    Input paths come from ``--data-dir`` when given; otherwise from the
    individual ``--stadiums``/``--teams``/``--games``/``--aliases`` options,
    each falling back to a file under ``./data``. The aliases file is
    optional and skipped when it does not exist.

    A summary is printed, with up to 20 errors and (with ``--verbose``) up
    to 20 warnings. ``--output`` writes the full result as JSON. The process
    exits with status 1 only when ``--strict`` is set and validation failed.
    """
    # Local import: the block must not rely on a file-level import of sys.
    import sys

    parser = argparse.ArgumentParser(
        description='Validate canonical data'
    )
    parser.add_argument(
        '--data-dir', type=str, default=None,
        help='Directory containing all canonical JSON files'
    )
    parser.add_argument(
        '--stadiums', type=str, default=None,
        help='Input canonical stadiums JSON file'
    )
    parser.add_argument(
        '--teams', type=str, default=None,
        help='Input canonical teams JSON file'
    )
    parser.add_argument(
        '--games', type=str, default=None,
        help='Input canonical games JSON file'
    )
    parser.add_argument(
        '--aliases', type=str, default=None,
        help='Input stadium aliases JSON file'
    )
    parser.add_argument(
        '--output', type=str, default=None,
        help='Output file for validation report'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )
    parser.add_argument(
        '--strict', action='store_true',
        help='Exit with error code if validation fails'
    )
    args = parser.parse_args()

    # Determine file paths; --data-dir takes precedence over per-file options.
    if args.data_dir:
        data_dir = Path(args.data_dir)
        stadiums_path = data_dir / 'stadiums_canonical.json'
        teams_path = data_dir / 'teams_canonical.json'
        games_path = data_dir / 'games_canonical.json'
        aliases_path = data_dir / 'stadium_aliases.json'
    else:
        stadiums_path = Path(args.stadiums or './data/stadiums_canonical.json')
        teams_path = Path(args.teams or './data/teams_canonical.json')
        games_path = Path(args.games or './data/games_canonical.json')
        aliases_path = Path(args.aliases or './data/stadium_aliases.json')

    # Load input files
    print("Loading canonical data...")
    stadiums = _load_json(stadiums_path)
    print(f" Loaded {len(stadiums)} stadiums from {stadiums_path}")
    teams = _load_json(teams_path)
    print(f" Loaded {len(teams)} teams from {teams_path}")
    games = _load_json(games_path)
    print(f" Loaded {len(games)} games from {games_path}")

    # Aliases are optional: validate against an empty list when absent.
    stadium_aliases = []
    if aliases_path.exists():
        stadium_aliases = _load_json(aliases_path)
        print(f" Loaded {len(stadium_aliases)} aliases from {aliases_path}")

    # Validate
    print()
    result = validate_canonical_data(
        stadiums, teams, games, stadium_aliases,
        verbose=args.verbose
    )

    # Print results
    print()
    print("=" * 60)
    print("VALIDATION RESULTS")
    print("=" * 60)
    print()
    if result.is_valid:
        print(" STATUS: PASSED")
    else:
        print(" STATUS: FAILED")
    print()
    print(f" Errors: {result.error_count}")
    print(f" Warnings: {result.warning_count}")
    print()
    print(" Data Summary:")
    print(f" Stadiums: {result.summary['stadiums']}")
    print(f" Teams: {result.summary['teams']}")
    print(f" Games: {result.summary['games']}")
    print(f" Aliases: {result.summary['aliases']}")
    if result.summary['by_category']:
        print()
        print(" Issues by Category:")
        for category, count in sorted(result.summary['by_category'].items()):
            print(f" {category}: {count}")

    # Print errors (up to 20); warnings only in verbose mode.
    if result.errors:
        errors_only = [e for e in result.errors if e['severity'] == 'error']
        warnings_only = [e for e in result.errors if e['severity'] == 'warning']
        if errors_only:
            print()
            print(" ERRORS (must fix):")
            for e in errors_only[:20]:
                print(f" [{e['category']}] {e['message']}")
            if len(errors_only) > 20:
                print(f" ... and {len(errors_only) - 20} more errors")
        if warnings_only and args.verbose:
            print()
            print(" WARNINGS (informational):")
            for e in warnings_only[:20]:
                print(f" [{e['category']}] {e['message']}")
            if len(warnings_only) > 20:
                print(f" ... and {len(warnings_only) - 20} more warnings")

    # Export report
    if args.output:
        output_path = Path(args.output)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(asdict(result), f, indent=2)
        print()
        print(f"Report exported to {output_path}")

    # Exit code: non-zero only in strict mode.
    if args.strict and not result.is_valid:
        print()
        print("VALIDATION FAILED - Exiting with error code 1")
        # sys.exit is always available; the bare exit() helper is installed
        # by the site module and may be missing.
        sys.exit(1)


if __name__ == '__main__':
    main()