#!/usr/bin/env python3
"""
Canonical Data Validation for SportsTime
=========================================
Stage 4 of the canonicalization pipeline.

Validates all canonical data before CloudKit upload.
FAILS if any ERROR-level issues are found (warnings alone do not fail).
Pass --strict to also exit with status 1 when validation fails.

Usage:
    python validate_canonical.py --data-dir data/
    python validate_canonical.py --stadiums data/stadiums_canonical.json \
        --teams data/teams_canonical.json --games data/games_canonical.json
"""

import argparse
import json
import sys
from collections import defaultdict
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional


# =============================================================================
# DATA CLASSES
# =============================================================================

@dataclass
class ValidationError:
    """One finding reported by a validation check.

    This is a plain data record, not an exception class. Despite the name it
    is used for warnings as well as errors; ``severity`` tells them apart.
    """

    # Either 'error' or 'warning'.
    severity: str
    # Short grouping key for the finding, e.g. 'duplicate_id'.
    category: str
    # Human-readable description of the problem.
    message: str
    # Optional structured data that accompanies the message.
    details: Optional[dict] = None
@dataclass
class ValidationResult:
    """Aggregated outcome of a full validation run."""

    # True when no finding has severity 'error' (warnings do not count).
    is_valid: bool
    # Number of findings with severity 'error'.
    error_count: int
    # Number of findings with severity 'warning'.
    warning_count: int
    # Every finding from the run, errors and warnings together.
    errors: list
    # Record counts plus a per-category tally of findings.
    summary: dict
# =============================================================================
# EXPECTED GAME COUNTS
# =============================================================================

# Per-sport game counts for a single team, keyed by lowercase sport code.
# 'expected' is the nominal count; 'min' and 'max' bound the range that is
# accepted without a game-count warning.
EXPECTED_GAMES = {
    'nba': {'expected': 82, 'min': 75, 'max': 90,
            'description': 'NBA regular season (82 games)'},
    'nhl': {'expected': 82, 'min': 75, 'max': 90,
            'description': 'NHL regular season (82 games)'},
    'mlb': {'expected': 162, 'min': 155, 'max': 168,
            'description': 'MLB regular season (162 games)'},
    'nfl': {'expected': 17, 'min': 15, 'max': 20,
            'description': 'NFL regular season (17 games)'},
    'wnba': {'expected': 40, 'min': 35, 'max': 45,
             'description': 'WNBA regular season (40 games)'},
    'mls': {'expected': 34, 'min': 30, 'max': 40,
            'description': 'MLS regular season (34 games)'},
    'nwsl': {'expected': 26, 'min': 22, 'max': 30,
             'description': 'NWSL regular season (26 games)'},
}
# =============================================================================
# VALIDATION FUNCTIONS
# =============================================================================

def validate_no_duplicate_ids(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict]
) -> list[ValidationError]:
    """Report every canonical_id that occurs more than once in a collection.

    Stadiums, teams and games are checked independently, in that order, so
    the same ID may appear in two different collections without a finding.
    A record without a ``canonical_id`` key is treated as having the ID ``''``,
    so the second and later such records are reported as duplicates too.

    Args:
        stadiums: Canonical stadium dicts.
        teams: Canonical team dicts.
        games: Canonical game dicts.

    Returns:
        One 'error' finding (category 'duplicate_id') per repeated occurrence.
    """
    findings = []

    collections = (
        ('stadium', stadiums),
        ('team', teams),
        ('game', games),
    )
    for label, records in collections:
        seen_ids = set()
        for record in records:
            record_id = record.get('canonical_id', '')
            if record_id in seen_ids:
                findings.append(ValidationError(
                    severity='error',
                    category='duplicate_id',
                    message=f'Duplicate {label} canonical_id: {record_id}'
                ))
            else:
                seen_ids.add(record_id)

    return findings
def validate_team_stadium_references(
    teams: list[dict],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """Check each team's ``stadium_canonical_id`` against the known stadiums.

    Args:
        teams: Canonical team dicts.
        stadium_ids: Set of canonical stadium IDs that exist.

    Returns:
        At most one finding per team:
        - 'error' / 'missing_reference' when the reference is absent or empty,
        - 'warning' / 'unknown_stadium' when it starts with 'stadium_unknown',
        - 'error' / 'dangling_reference' when it is not in ``stadium_ids``.
    """
    findings = []

    for team in teams:
        team_id = team.get('canonical_id', '')
        stadium_ref = team.get('stadium_canonical_id', '')

        if not stadium_ref:
            severity, category = 'error', 'missing_reference'
            message = f'Team {team_id} has no stadium_canonical_id'
        elif stadium_ref.startswith('stadium_unknown'):
            # Placeholder stadiums are tolerated, but surfaced as a warning.
            severity, category = 'warning', 'unknown_stadium'
            message = f'Team {team_id} has unknown stadium: {stadium_ref}'
        elif stadium_ref not in stadium_ids:
            severity, category = 'error', 'dangling_reference'
            message = f'Team {team_id} references unknown stadium: {stadium_ref}'
        else:
            continue

        findings.append(ValidationError(
            severity=severity,
            category=category,
            message=message
        ))

    return findings
def validate_game_references(
    games: list[dict],
    team_ids: set[str],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """Check each game's home team, away team and stadium references.

    Args:
        games: Canonical game dicts.
        team_ids: Set of canonical team IDs that exist.
        stadium_ids: Set of canonical stadium IDs that exist.

    Returns:
        Findings in game order; within a game the order is home team,
        away team, stadium. An absent or empty reference is an 'error' /
        'missing_reference'; a reference not in the matching ID set is an
        'error' / 'dangling_reference'; a stadium reference that starts with
        'stadium_unknown' is a 'warning' / 'unknown_stadium'.
    """
    findings = []

    def report(severity: str, category: str, message: str) -> None:
        findings.append(ValidationError(
            severity=severity,
            category=category,
            message=message
        ))

    for game in games:
        game_id = game.get('canonical_id', '')
        team_refs = (
            ('home', game.get('home_team_canonical_id', '')),
            ('away', game.get('away_team_canonical_id', '')),
        )
        stadium_ref = game.get('stadium_canonical_id', '')

        # Home and away teams follow the same rules; only the wording differs.
        for role, team_ref in team_refs:
            if not team_ref:
                report('error', 'missing_reference',
                       f'Game {game_id} has no {role}_team_canonical_id')
            elif team_ref not in team_ids:
                report('error', 'dangling_reference',
                       f'Game {game_id} references unknown {role} team: {team_ref}')

        if not stadium_ref:
            report('error', 'missing_reference',
                   f'Game {game_id} has no stadium_canonical_id')
        elif stadium_ref.startswith('stadium_unknown'):
            # Placeholder stadiums are tolerated, but surfaced as a warning.
            report('warning', 'unknown_stadium',
                   f'Game {game_id} has unknown stadium: {stadium_ref}')
        elif stadium_ref not in stadium_ids:
            report('error', 'dangling_reference',
                   f'Game {game_id} references unknown stadium: {stadium_ref}')

    return findings
def validate_no_cross_sport_references(games: list[dict]) -> list[ValidationError]:
    """Validate that games don't have cross-sport team references.

    The sport of a team is read from its canonical ID, whose format is
    ``team_{sport}_{abbrev}``, and compared with the game's lowercased
    ``sport`` value.

    Null or non-string ``sport`` / team ID values are tolerated: a bad sport
    is treated as ``''`` and a bad team ID is skipped, so this check no longer
    raises AttributeError on records whose fields are present but null.

    Args:
        games: Canonical game dicts.

    Returns:
        One 'error' finding (category 'cross_sport') for each home or away
        team whose ID sport differs from the game's sport.
    """

    def sport_from_team_id(team_id) -> Optional[str]:
        # Only strings can carry the team_{sport}_{abbrev} structure.
        if not isinstance(team_id, str):
            return None
        parts = team_id.split('_')
        return parts[1] if len(parts) >= 2 else None

    errors = []

    for game in games:
        canonical_id = game.get('canonical_id', '')
        raw_sport = game.get('sport')
        game_sport = raw_sport.lower() if isinstance(raw_sport, str) else ''

        for role in ('home', 'away'):
            team_sport = sport_from_team_id(game.get(f'{role}_team_canonical_id'))
            # An ID with no extractable sport is not a cross-sport reference.
            if team_sport and team_sport != game_sport:
                errors.append(ValidationError(
                    severity='error',
                    category='cross_sport',
                    message=f'Game {canonical_id} ({game_sport}) has cross-sport {role} team ({team_sport})'
                ))

    return errors
def validate_stadium_aliases(
    aliases: list[dict],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """Check that every stadium alias points at an existing stadium.

    Args:
        aliases: Stadium alias dicts ('alias_name', 'stadium_canonical_id').
        stadium_ids: Set of canonical stadium IDs that exist.

    Returns:
        At most one 'error' finding per alias: 'missing_reference' when the
        stadium reference is absent or empty, 'dangling_reference' when it
        is not in ``stadium_ids``.
    """
    findings = []

    for entry in aliases:
        name = entry.get('alias_name', '')
        target = entry.get('stadium_canonical_id', '')

        if not target:
            category = 'missing_reference'
            message = f'Stadium alias "{name}" has no stadium_canonical_id'
        elif target not in stadium_ids:
            category = 'dangling_reference'
            message = f'Stadium alias "{name}" references unknown stadium: {target}'
        else:
            continue

        findings.append(ValidationError(
            severity='error',
            category=category,
            message=message
        ))

    return findings
def validate_game_counts_per_team(games: list[dict]) -> list[ValidationError]:
    """Validate that each team has expected number of games.

    Every appearance of a team as home or away counts as one game. The sport
    is taken from the team ID (second '_'-separated segment) and the count is
    compared with that sport's 'min' / 'max' in EXPECTED_GAMES. Teams whose
    ID has no sport segment, or whose sport is not in EXPECTED_GAMES, are
    skipped. Only teams that appear in at least one game are examined.

    Null or non-string team IDs are ignored here instead of crashing on
    ``.split()``; this function only counts games.

    Args:
        games: Canonical game dicts.

    Returns:
        One 'warning' finding (category 'game_count') per team whose count
        is below 'min' or above 'max'.
    """
    errors = []

    # Count games per team
    team_game_counts = defaultdict(int)
    for game in games:
        for key in ('home_team_canonical_id', 'away_team_canonical_id'):
            team_id = game.get(key)
            if isinstance(team_id, str):
                team_game_counts[team_id] += 1

    # Check against expected counts
    for team_id, count in team_game_counts.items():
        parts = team_id.split('_')
        if len(parts) < 2:
            continue

        expected = EXPECTED_GAMES.get(parts[1])
        if expected is None:
            continue

        if count < expected['min']:
            errors.append(ValidationError(
                severity='warning',
                category='game_count',
                message=f'Team {team_id} has only {count} games (expected ~{expected["expected"]})',
                details={'count': count, 'expected': expected['expected'], 'min': expected['min']}
            ))
        elif count > expected['max']:
            errors.append(ValidationError(
                severity='warning',
                category='game_count',
                message=f'Team {team_id} has {count} games (expected ~{expected["expected"]})',
                details={'count': count, 'expected': expected['expected'], 'max': expected['max']}
            ))

    return errors
def validate_required_fields(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict]
) -> list[ValidationError]:
    """Report records whose required fields are absent or None.

    A field counts as missing when its key is not in the record or its value
    is None. Other falsy values such as '' or 0 are accepted.

    Args:
        stadiums: Canonical stadium dicts.
        teams: Canonical team dicts.
        games: Canonical game dicts.

    Returns:
        One 'error' finding (category 'missing_field') per missing field,
        ordered stadiums, then teams, then games.
    """
    requirements = (
        ('Stadium', stadiums,
         ('canonical_id', 'name', 'sport', 'latitude', 'longitude')),
        ('Team', teams,
         ('canonical_id', 'name', 'abbreviation', 'sport', 'stadium_canonical_id')),
        ('Game', games,
         ('canonical_id', 'sport', 'date', 'home_team_canonical_id',
          'away_team_canonical_id', 'stadium_canonical_id')),
    )

    findings = []
    for label, records, required in requirements:
        for record in records:
            for field in required:
                # .get() yields None both for an absent key and a null value.
                if record.get(field) is None:
                    findings.append(ValidationError(
                        severity='error',
                        category='missing_field',
                        message=f'{label} {record.get("canonical_id", "unknown")} missing required field: {field}'
                    ))

    return findings
# =============================================================================
# MAIN VALIDATION
# =============================================================================

def validate_canonical_data(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict],
    stadium_aliases: list[dict],
    verbose: bool = False
) -> ValidationResult:
    """
    Stage 4: Validate all canonical data.

    Runs every validation check in a fixed order, collects the findings and
    tallies them by severity and by category. Progress is printed to stdout;
    per-check details are printed only when ``verbose`` is set.

    Args:
        stadiums: List of canonical stadium dicts
        teams: List of canonical team dicts
        games: List of canonical game dicts
        stadium_aliases: List of stadium alias dicts
        verbose: Print detailed progress

    Returns:
        ValidationResult whose ``is_valid`` is True when no finding has
        severity 'error' (warnings are allowed), with findings converted to
        plain dicts and a summary of record counts and per-category totals.
    """
    # ID sets used by the reference checks.
    stadium_ids = {s.get('canonical_id', '') for s in stadiums}
    team_ids = {t.get('canonical_id', '') for t in teams}

    # (progress line, label for the "Found N ..." line, check to run)
    checks = (
        (" Checking for duplicate IDs...", "duplicate ID issues",
         lambda: validate_no_duplicate_ids(stadiums, teams, games)),
        (" Checking required fields...", "missing field issues",
         lambda: validate_required_fields(stadiums, teams, games)),
        (" Checking team -> stadium references...", "team-stadium reference issues",
         lambda: validate_team_stadium_references(teams, stadium_ids)),
        (" Checking game -> team/stadium references...", "game reference issues",
         lambda: validate_game_references(games, team_ids, stadium_ids)),
        (" Checking for cross-sport references...", "cross-sport reference issues",
         lambda: validate_no_cross_sport_references(games)),
        (" Checking stadium alias references...", "stadium alias issues",
         lambda: validate_stadium_aliases(stadium_aliases, stadium_ids)),
        (" Checking game counts per team...", "game count issues",
         lambda: validate_game_counts_per_team(games)),
    )

    print("Running validation checks...")

    findings = []
    for progress_line, issue_label, run_check in checks:
        if verbose:
            print(progress_line)
        found = run_check()
        findings.extend(found)
        if verbose and found:
            print(f" Found {len(found)} {issue_label}")

    # Tally by severity and by category in one pass.
    error_count = 0
    warning_count = 0
    by_category = defaultdict(int)
    for finding in findings:
        if finding.severity == 'error':
            error_count += 1
        elif finding.severity == 'warning':
            warning_count += 1
        by_category[finding.category] += 1

    summary = {
        'stadiums': len(stadiums),
        'teams': len(teams),
        'games': len(games),
        'aliases': len(stadium_aliases),
        'by_category': dict(by_category)
    }

    return ValidationResult(
        is_valid=(error_count == 0),
        error_count=error_count,
        warning_count=warning_count,
        errors=[asdict(finding) for finding in findings],
        summary=summary
    )
# =============================================================================
# MAIN
# =============================================================================

def _load_json(path: Path):
    """Read and parse one JSON file, always decoding it as UTF-8."""
    # Explicit encoding: the platform default may not be UTF-8.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def main():
    """Command-line entry point.

    Resolves the input paths (from --data-dir, or from the individual file
    options with ./data/ defaults), loads the canonical JSON files, runs
    validate_canonical_data and prints a report. The aliases file is
    optional; the other three must exist. With --output the full result is
    written as JSON. With --strict the process exits with status 1 when
    validation fails; otherwise the exit status is not affected.
    """
    parser = argparse.ArgumentParser(
        description='Validate canonical data'
    )
    parser.add_argument(
        '--data-dir', type=str, default=None,
        help='Directory containing all canonical JSON files'
    )
    parser.add_argument(
        '--stadiums', type=str, default=None,
        help='Input canonical stadiums JSON file'
    )
    parser.add_argument(
        '--teams', type=str, default=None,
        help='Input canonical teams JSON file'
    )
    parser.add_argument(
        '--games', type=str, default=None,
        help='Input canonical games JSON file'
    )
    parser.add_argument(
        '--aliases', type=str, default=None,
        help='Input stadium aliases JSON file'
    )
    parser.add_argument(
        '--output', type=str, default=None,
        help='Output file for validation report'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )
    parser.add_argument(
        '--strict', action='store_true',
        help='Exit with error code if validation fails'
    )

    args = parser.parse_args()

    # Determine file paths (--data-dir takes precedence over per-file options)
    if args.data_dir:
        data_dir = Path(args.data_dir)
        stadiums_path = data_dir / 'stadiums_canonical.json'
        teams_path = data_dir / 'teams_canonical.json'
        games_path = data_dir / 'games_canonical.json'
        aliases_path = data_dir / 'stadium_aliases.json'
    else:
        stadiums_path = Path(args.stadiums or './data/stadiums_canonical.json')
        teams_path = Path(args.teams or './data/teams_canonical.json')
        games_path = Path(args.games or './data/games_canonical.json')
        aliases_path = Path(args.aliases or './data/stadium_aliases.json')

    # Load input files
    print("Loading canonical data...")

    stadiums = _load_json(stadiums_path)
    print(f" Loaded {len(stadiums)} stadiums from {stadiums_path}")

    teams = _load_json(teams_path)
    print(f" Loaded {len(teams)} teams from {teams_path}")

    games = _load_json(games_path)
    print(f" Loaded {len(games)} games from {games_path}")

    # Aliases are optional: a missing file means "no aliases".
    stadium_aliases = []
    if aliases_path.exists():
        stadium_aliases = _load_json(aliases_path)
        print(f" Loaded {len(stadium_aliases)} aliases from {aliases_path}")

    # Validate
    print()
    result = validate_canonical_data(
        stadiums, teams, games, stadium_aliases,
        verbose=args.verbose
    )

    # Print results
    print()
    print("=" * 60)
    print("VALIDATION RESULTS")
    print("=" * 60)
    print()

    if result.is_valid:
        print(" STATUS: PASSED")
    else:
        print(" STATUS: FAILED")

    print()
    print(f" Errors: {result.error_count}")
    print(f" Warnings: {result.warning_count}")
    print()
    print(" Data Summary:")
    print(f" Stadiums: {result.summary['stadiums']}")
    print(f" Teams: {result.summary['teams']}")
    print(f" Games: {result.summary['games']}")
    print(f" Aliases: {result.summary['aliases']}")

    if result.summary['by_category']:
        print()
        print(" Issues by Category:")
        for category, count in sorted(result.summary['by_category'].items()):
            print(f" {category}: {count}")

    # Print errors (up to 20); warnings are listed only in verbose mode
    if result.errors:
        errors_only = [e for e in result.errors if e['severity'] == 'error']
        warnings_only = [e for e in result.errors if e['severity'] == 'warning']

        if errors_only:
            print()
            print(" ERRORS (must fix):")
            for e in errors_only[:20]:
                print(f" [{e['category']}] {e['message']}")
            if len(errors_only) > 20:
                print(f" ... and {len(errors_only) - 20} more errors")

        if warnings_only and args.verbose:
            print()
            print(" WARNINGS (informational):")
            for e in warnings_only[:20]:
                print(f" [{e['category']}] {e['message']}")
            if len(warnings_only) > 20:
                print(f" ... and {len(warnings_only) - 20} more warnings")

    # Export report
    if args.output:
        output_path = Path(args.output)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(asdict(result), f, indent=2)
        print()
        print(f"Report exported to {output_path}")

    # Exit code
    if args.strict and not result.is_valid:
        print()
        print("VALIDATION FAILED - Exiting with error code 1")
        # sys.exit, not the site-provided exit(), which is meant for the
        # interactive shell and is absent under `python -S`.
        sys.exit(1)


if __name__ == '__main__':
    main()