Files
Sportstime/Scripts/validate_canonical.py
Trey t 7efcea7bd4 Add canonical ID pipeline and fix UUID consistency for CloudKit sync
- Add local canonicalization pipeline (stadiums, teams, games) that generates
  deterministic canonical IDs before CloudKit upload
- Fix CanonicalSyncService to use deterministic UUIDs from canonical IDs
  instead of random UUIDs from CloudKit records
- Add SyncStadium/SyncTeam/SyncGame types to CloudKitService that preserve
  canonical ID relationships during sync
- Add canonical ID field keys to CKModels for reading from CloudKit records
- Bundle canonical JSON files (stadiums_canonical, teams_canonical,
  games_canonical, stadium_aliases) for consistent bootstrap data
- Update BootstrapService to prefer canonical format files over legacy format

This ensures all entities use consistent deterministic UUIDs derived from
their canonical IDs, preventing duplicate records when syncing CloudKit
data with bootstrapped local data.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-09 10:30:09 -06:00

637 lines
21 KiB
Python

#!/usr/bin/env python3
"""
Canonical Data Validation for SportsTime
=========================================
Stage 4 of the canonicalization pipeline.
Validates all canonical data before CloudKit upload.
Reports FAILED if any ERROR-level issues are found (the exit code is non-zero only with --strict).
Usage:
python validate_canonical.py --data-dir data/
python validate_canonical.py --stadiums data/stadiums_canonical.json \
--teams data/teams_canonical.json --games data/games_canonical.json
"""
import argparse
import json
from collections import defaultdict
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class ValidationError:
    """A single finding (error or warning) produced by a validation check.

    This is a plain data record, not an exception class.

    Attributes:
        severity: Either 'error' or 'warning'.
        category: Short tag used to group findings (e.g. 'duplicate_id').
        message: Human-readable description of the finding.
        details: Optional dict of extra data; None when nothing was attached.
    """
    severity: str
    category: str
    message: str
    details: Optional[dict] = None
@dataclass
class ValidationResult:
    """Aggregate outcome of a full validation run.

    Attributes:
        is_valid: True when no finding has severity 'error'.
        error_count: Number of findings with severity 'error'.
        warning_count: Number of findings with severity 'warning'.
        errors: Every finding (errors and warnings alike), each as a dict.
        summary: Record counts plus a per-category tally of findings.
    """
    is_valid: bool
    error_count: int
    warning_count: int
    errors: list
    summary: dict
# =============================================================================
# EXPECTED GAME COUNTS
# =============================================================================
# Per-sport regular-season game counts for a single team, keyed by the sport
# token that appears in canonical team IDs. 'min' and 'max' bound the range
# outside of which a team's game count is flagged as a warning.
EXPECTED_GAMES = {
    sport: {
        'expected': expected,
        'min': lowest,
        'max': highest,
        'description': description,
    }
    for sport, expected, lowest, highest, description in (
        ('nba', 82, 75, 90, 'NBA regular season (82 games)'),
        ('nhl', 82, 75, 90, 'NHL regular season (82 games)'),
        ('mlb', 162, 155, 168, 'MLB regular season (162 games)'),
    )
}
# =============================================================================
# VALIDATION FUNCTIONS
# =============================================================================
def validate_no_duplicate_ids(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict]
) -> list[ValidationError]:
    """Report every canonical_id that occurs more than once within its kind.

    Stadiums, teams and games are checked independently, in that order, so
    the same ID appearing once in two different kinds is not a duplicate.
    A record without a 'canonical_id' key is treated as having the ID ''.

    Args:
        stadiums: Canonical stadium dicts.
        teams: Canonical team dicts.
        games: Canonical game dicts.

    Returns:
        One 'duplicate_id' error per repeated occurrence.
    """
    findings = []
    collections_by_kind = (
        ('stadium', stadiums),
        ('team', teams),
        ('game', games),
    )
    for kind, records in collections_by_kind:
        already_seen = set()
        for record in records:
            record_id = record.get('canonical_id', '')
            if record_id not in already_seen:
                already_seen.add(record_id)
                continue
            findings.append(ValidationError(
                severity='error',
                category='duplicate_id',
                message=f'Duplicate {kind} canonical_id: {record_id}'
            ))
    return findings
def validate_team_stadium_references(
    teams: list[dict],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """Check each team's 'stadium_canonical_id' against the known stadiums.

    Args:
        teams: Canonical team dicts.
        stadium_ids: Canonical IDs of all known stadiums.

    Returns:
        For each offending team, one finding: an error when the reference is
        empty/missing or not in ``stadium_ids``, or a warning when it starts
        with the 'stadium_unknown' placeholder prefix.
    """
    findings = []
    for team in teams:
        team_id = team.get('canonical_id', '')
        stadium_ref = team.get('stadium_canonical_id', '')

        # Classify the reference first; valid references fall through to
        # ``continue`` and produce no finding.
        if not stadium_ref:
            severity, category = 'error', 'missing_reference'
            message = f'Team {team_id} has no stadium_canonical_id'
        elif stadium_ref.startswith('stadium_unknown'):
            severity, category = 'warning', 'unknown_stadium'
            message = f'Team {team_id} has unknown stadium: {stadium_ref}'
        elif stadium_ref not in stadium_ids:
            severity, category = 'error', 'dangling_reference'
            message = f'Team {team_id} references unknown stadium: {stadium_ref}'
        else:
            continue

        findings.append(ValidationError(
            severity=severity,
            category=category,
            message=message
        ))
    return findings
def validate_game_references(
    games: list[dict],
    team_ids: set[str],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """Check each game's home team, away team and stadium references.

    Args:
        games: Canonical game dicts.
        team_ids: Canonical IDs of all known teams.
        stadium_ids: Canonical IDs of all known stadiums.

    Returns:
        Findings in game order; within a game the order is home team, away
        team, stadium. Empty/missing or unresolvable references are errors;
        a stadium reference starting with 'stadium_unknown' is a warning.
    """
    findings = []

    def report(severity: str, category: str, message: str) -> None:
        findings.append(ValidationError(
            severity=severity,
            category=category,
            message=message
        ))

    for game in games:
        game_id = game.get('canonical_id', '')

        # Home and away teams follow the same rules; only the label differs.
        for side in ('home', 'away'):
            team_ref = game.get(f'{side}_team_canonical_id', '')
            if not team_ref:
                report('error', 'missing_reference',
                       f'Game {game_id} has no {side}_team_canonical_id')
            elif team_ref not in team_ids:
                report('error', 'dangling_reference',
                       f'Game {game_id} references unknown {side} team: {team_ref}')

        venue_ref = game.get('stadium_canonical_id', '')
        if not venue_ref:
            report('error', 'missing_reference',
                   f'Game {game_id} has no stadium_canonical_id')
        elif venue_ref.startswith('stadium_unknown'):
            report('warning', 'unknown_stadium',
                   f'Game {game_id} has unknown stadium: {venue_ref}')
        elif venue_ref not in stadium_ids:
            report('error', 'dangling_reference',
                   f'Game {game_id} references unknown stadium: {venue_ref}')
    return findings
def validate_no_cross_sport_references(games: list[dict]) -> list[ValidationError]:
    """Validate that games don't have cross-sport team references.

    The sport of a team is taken from the second '_'-separated token of its
    canonical ID (format: team_{sport}_{abbrev}) and compared with the game's
    lower-cased 'sport' value.

    Fields that are absent or explicitly null are treated as empty strings so
    that incomplete records produce findings instead of an AttributeError;
    a team ID with fewer than two tokens is skipped.

    Args:
        games: Canonical game dicts.

    Returns:
        One 'cross_sport' error per home/away team whose ID sport differs
        from the game's sport, in game order (home before away).
    """
    def sport_from_team_id(team_id: str) -> Optional[str]:
        # Defined once, outside the loop, rather than once per game.
        parts = team_id.split('_')
        return parts[1] if len(parts) >= 2 else None

    errors = []
    for game in games:
        canonical_id = game.get('canonical_id', '')
        # ``or ''`` also covers JSON null, which .get()'s default does not.
        game_sport = (game.get('sport') or '').lower()

        for side in ('home', 'away'):
            team_id = game.get(f'{side}_team_canonical_id') or ''
            team_sport = sport_from_team_id(team_id)
            if team_sport and team_sport != game_sport:
                errors.append(ValidationError(
                    severity='error',
                    category='cross_sport',
                    message=f'Game {canonical_id} ({game_sport}) has cross-sport {side} team ({team_sport})'
                ))
    return errors
def validate_stadium_aliases(
    aliases: list[dict],
    stadium_ids: set[str]
) -> list[ValidationError]:
    """Check that every stadium alias points at a known stadium.

    Args:
        aliases: Stadium alias dicts ('alias_name', 'stadium_canonical_id').
        stadium_ids: Canonical IDs of all known stadiums.

    Returns:
        One error per alias whose 'stadium_canonical_id' is empty/missing
        ('missing_reference') or not in ``stadium_ids``
        ('dangling_reference').
    """
    findings = []
    for entry in aliases:
        name = entry.get('alias_name', '')
        target = entry.get('stadium_canonical_id', '')

        if not target:
            category = 'missing_reference'
            message = f'Stadium alias "{name}" has no stadium_canonical_id'
        elif target not in stadium_ids:
            category = 'dangling_reference'
            message = f'Stadium alias "{name}" references unknown stadium: {target}'
        else:
            continue

        findings.append(ValidationError(
            severity='error',
            category=category,
            message=message
        ))
    return findings
def validate_game_counts_per_team(games: list[dict]) -> list[ValidationError]:
    """Validate that each team has the expected number of games.

    Every game counts once for its home team and once for its away team.
    The sport is read from the second '_'-separated token of the team ID and
    looked up in EXPECTED_GAMES; teams whose sport is not listed there are
    not checked.

    Team IDs that are missing, null, empty or not strings are ignored here
    so that incomplete records cannot crash the count with an
    AttributeError.

    Args:
        games: Canonical game dicts.

    Returns:
        One 'game_count' warning per team whose total falls below the
        sport's 'min' or above its 'max'.
    """
    errors = []

    # Count games per team
    team_game_counts = defaultdict(int)
    for game in games:
        for key in ('home_team_canonical_id', 'away_team_canonical_id'):
            team_id = game.get(key)
            # JSON null (None) or a non-string ID cannot be split below.
            if isinstance(team_id, str) and team_id:
                team_game_counts[team_id] += 1

    # Check against expected counts
    for team_id, count in team_game_counts.items():
        parts = team_id.split('_')
        if len(parts) < 2:
            continue
        expected = EXPECTED_GAMES.get(parts[1])
        if expected is None:
            continue

        if count < expected['min']:
            errors.append(ValidationError(
                severity='warning',
                category='game_count',
                message=f'Team {team_id} has only {count} games (expected ~{expected["expected"]})',
                details={'count': count, 'expected': expected['expected'], 'min': expected['min']}
            ))
        elif count > expected['max']:
            errors.append(ValidationError(
                severity='warning',
                category='game_count',
                message=f'Team {team_id} has {count} games (expected ~{expected["expected"]})',
                details={'count': count, 'expected': expected['expected'], 'max': expected['max']}
            ))
    return errors
def validate_required_fields(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict]
) -> list[ValidationError]:
    """Report required fields that are absent or null on any record.

    Args:
        stadiums: Canonical stadium dicts.
        teams: Canonical team dicts.
        games: Canonical game dicts.

    Returns:
        One 'missing_field' error per (record, field) pair, ordered
        stadiums, then teams, then games.
    """
    requirements = (
        ('Stadium', stadiums,
         ['canonical_id', 'name', 'sport', 'latitude', 'longitude']),
        ('Team', teams,
         ['canonical_id', 'name', 'abbreviation', 'sport', 'stadium_canonical_id']),
        ('Game', games,
         ['canonical_id', 'sport', 'date', 'home_team_canonical_id',
          'away_team_canonical_id', 'stadium_canonical_id']),
    )

    findings = []
    for label, records, required in requirements:
        for record in records:
            for field in required:
                # .get() yields None both for an absent key and a null value.
                if record.get(field) is not None:
                    continue
                findings.append(ValidationError(
                    severity='error',
                    category='missing_field',
                    message=f'{label} {record.get("canonical_id", "unknown")} missing required field: {field}'
                ))
    return findings
# =============================================================================
# MAIN VALIDATION
# =============================================================================
def validate_canonical_data(
    stadiums: list[dict],
    teams: list[dict],
    games: list[dict],
    stadium_aliases: list[dict],
    verbose: bool = False
) -> ValidationResult:
    """
    Stage 4: run every validation check over the canonical data.

    Args:
        stadiums: List of canonical stadium dicts
        teams: List of canonical team dicts
        games: List of canonical game dicts
        stadium_aliases: List of stadium alias dicts
        verbose: Print a progress line per check and a count of its findings

    Returns:
        ValidationResult whose ``is_valid`` is True when no finding has
        severity 'error' (warnings do not affect validity), along with the
        error/warning counts, all findings as dicts, and a summary.
    """
    # ID sets used for reference checking.
    known_stadiums = {s.get('canonical_id', '') for s in stadiums}
    known_teams = {t.get('canonical_id', '') for t in teams}

    # (progress line, noun used in the "Found N ... issues" line, check).
    # The checks run in exactly this order.
    checks = (
        (" Checking for duplicate IDs...", "duplicate ID",
         lambda: validate_no_duplicate_ids(stadiums, teams, games)),
        (" Checking required fields...", "missing field",
         lambda: validate_required_fields(stadiums, teams, games)),
        (" Checking team -> stadium references...", "team-stadium reference",
         lambda: validate_team_stadium_references(teams, known_stadiums)),
        (" Checking game -> team/stadium references...", "game reference",
         lambda: validate_game_references(games, known_teams, known_stadiums)),
        (" Checking for cross-sport references...", "cross-sport reference",
         lambda: validate_no_cross_sport_references(games)),
        (" Checking stadium alias references...", "stadium alias",
         lambda: validate_stadium_aliases(stadium_aliases, known_stadiums)),
        (" Checking game counts per team...", "game count",
         lambda: validate_game_counts_per_team(games)),
    )

    print("Running validation checks...")
    findings = []
    for progress_line, noun, run_check in checks:
        if verbose:
            print(progress_line)
        found = run_check()
        findings.extend(found)
        if verbose and found:
            print(f" Found {len(found)} {noun} issues")

    # Tally by severity and by category in a single pass.
    error_total = 0
    warning_total = 0
    per_category = {}
    for finding in findings:
        if finding.severity == 'error':
            error_total += 1
        elif finding.severity == 'warning':
            warning_total += 1
        per_category[finding.category] = per_category.get(finding.category, 0) + 1

    return ValidationResult(
        is_valid=(error_total == 0),
        error_count=error_total,
        warning_count=warning_total,
        errors=[asdict(finding) for finding in findings],
        summary={
            'stadiums': len(stadiums),
            'teams': len(teams),
            'games': len(games),
            'aliases': len(stadium_aliases),
            'by_category': per_category
        }
    )
# =============================================================================
# MAIN
# =============================================================================
def _load_json(path):
    """Return the parsed JSON document stored at *path* (read as UTF-8)."""
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def _print_issues(heading, issues, noun, limit=20):
    """Print *heading*, up to *limit* issues, then how many were omitted."""
    print()
    print(heading)
    for issue in issues[:limit]:
        print(f" [{issue['category']}] {issue['message']}")
    if len(issues) > limit:
        print(f" ... and {len(issues) - limit} more {noun}")


def main():
    """Command-line entry point: load canonical JSON, validate, and report.

    Input paths come from --data-dir (which takes precedence and fixes all
    four file names) or from the individual --stadiums/--teams/--games/
    --aliases options, each defaulting to a file under ./data/. The aliases
    file is optional; the other three must exist.

    Prints a summary, up to 20 errors, and (with --verbose) up to 20
    warnings. With --output the full result is also written as JSON.

    Raises:
        SystemExit: With code 1 when --strict is given and validation found
            at least one error. Without --strict the process exits normally
            even if validation failed.
    """
    parser = argparse.ArgumentParser(
        description='Validate canonical data'
    )
    parser.add_argument(
        '--data-dir', type=str, default=None,
        help='Directory containing all canonical JSON files'
    )
    parser.add_argument(
        '--stadiums', type=str, default=None,
        help='Input canonical stadiums JSON file'
    )
    parser.add_argument(
        '--teams', type=str, default=None,
        help='Input canonical teams JSON file'
    )
    parser.add_argument(
        '--games', type=str, default=None,
        help='Input canonical games JSON file'
    )
    parser.add_argument(
        '--aliases', type=str, default=None,
        help='Input stadium aliases JSON file'
    )
    parser.add_argument(
        '--output', type=str, default=None,
        help='Output file for validation report'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output'
    )
    parser.add_argument(
        '--strict', action='store_true',
        help='Exit with error code if validation fails'
    )
    args = parser.parse_args()

    # Determine file paths
    if args.data_dir:
        data_dir = Path(args.data_dir)
        stadiums_path = data_dir / 'stadiums_canonical.json'
        teams_path = data_dir / 'teams_canonical.json'
        games_path = data_dir / 'games_canonical.json'
        aliases_path = data_dir / 'stadium_aliases.json'
    else:
        stadiums_path = Path(args.stadiums or './data/stadiums_canonical.json')
        teams_path = Path(args.teams or './data/teams_canonical.json')
        games_path = Path(args.games or './data/games_canonical.json')
        aliases_path = Path(args.aliases or './data/stadium_aliases.json')

    # Load input files
    print("Loading canonical data...")
    stadiums = _load_json(stadiums_path)
    print(f" Loaded {len(stadiums)} stadiums from {stadiums_path}")
    teams = _load_json(teams_path)
    print(f" Loaded {len(teams)} teams from {teams_path}")
    games = _load_json(games_path)
    print(f" Loaded {len(games)} games from {games_path}")

    # The aliases file is optional; validation proceeds with none if absent.
    stadium_aliases = []
    if aliases_path.exists():
        stadium_aliases = _load_json(aliases_path)
        print(f" Loaded {len(stadium_aliases)} aliases from {aliases_path}")

    # Validate
    print()
    result = validate_canonical_data(
        stadiums, teams, games, stadium_aliases,
        verbose=args.verbose
    )

    # Print results
    print()
    print("=" * 60)
    print("VALIDATION RESULTS")
    print("=" * 60)
    print()
    if result.is_valid:
        print(" STATUS: PASSED")
    else:
        print(" STATUS: FAILED")
    print()
    print(f" Errors: {result.error_count}")
    print(f" Warnings: {result.warning_count}")
    print()
    print(" Data Summary:")
    print(f" Stadiums: {result.summary['stadiums']}")
    print(f" Teams: {result.summary['teams']}")
    print(f" Games: {result.summary['games']}")
    print(f" Aliases: {result.summary['aliases']}")
    if result.summary['by_category']:
        print()
        print(" Issues by Category:")
        for category, count in sorted(result.summary['by_category'].items()):
            print(f" {category}: {count}")

    # Print errors always, warnings only when verbose (up to 20 of each)
    errors_only = [e for e in result.errors if e['severity'] == 'error']
    warnings_only = [e for e in result.errors if e['severity'] == 'warning']
    if errors_only:
        _print_issues(" ERRORS (must fix):", errors_only, "errors")
    if warnings_only and args.verbose:
        _print_issues(" WARNINGS (informational):", warnings_only, "warnings")

    # Export report
    if args.output:
        output_path = Path(args.output)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(asdict(result), f, indent=2)
        print()
        print(f"Report exported to {output_path}")

    # Exit code
    if args.strict and not result.is_valid:
        print()
        print("VALIDATION FAILED - Exiting with error code 1")
        # The bare exit() builtin is injected by the site module and is not
        # guaranteed to exist (e.g. under ``python -S``); raise directly.
        raise SystemExit(1)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()