feat(06-01): add comprehensive validation command

Add --validate flag with local validation, CloudKit relationship
checking, and sync status comparison. Includes JSON export via
--output flag and menu option 16 for interactive mode.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-10 10:31:52 -06:00
parent 4266940c8f
commit 9f0edc4228

View File

@@ -58,6 +58,24 @@ try:
except ImportError:
HAS_CRYPTO = False
# Import validation functions from validate_canonical
try:
from validate_canonical import (
validate_canonical_data,
validate_no_duplicate_ids,
validate_required_fields,
validate_team_stadium_references,
validate_game_references,
validate_no_cross_sport_references,
validate_stadium_aliases,
validate_game_counts_per_team,
ValidationError,
ValidationResult,
)
HAS_VALIDATION = True
except ImportError:
HAS_VALIDATION = False
CONTAINER = "iCloud.com.sportstime.app"
HOST = "https://api.apple-cloudkit.com"
BATCH_SIZE = 200
@@ -139,17 +157,18 @@ def show_menu():
print(" 13. Smart sync + delete orphans")
print(" 14. Verify sync (quick)")
print(" 15. Verify sync (deep)")
print(" 16. Validate data (local + CloudKit)")
print(" 0. Exit")
print()
while True:
try:
choice = input("Enter choice [1-15, 0 to exit]: ").strip()
choice = input("Enter choice [1-16, 0 to exit]: ").strip()
if choice == '0':
return None
if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15']:
if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16']:
return int(choice)
print("Invalid choice. Please enter 1-15 or 0.")
print("Invalid choice. Please enter 1-16 or 0.")
except (EOFError, KeyboardInterrupt):
print("\nExiting.")
return None
@@ -1315,6 +1334,333 @@ def verify_sync(ck, data_dir, verbose=False, deep=False):
return total_mismatches == 0
def _load_json_list(path):
    """Return the parsed JSON content of *path*, or an empty list if the file is absent."""
    if not path.exists():
        return []
    # Context manager closes the handle deterministically; JSON is UTF-8 by spec.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def _load_local_games(data_dir):
    """Load games from canonical/games/*.json if any exist, else from games_canonical.json."""
    games_dir = data_dir / 'canonical' / 'games'
    game_files = sorted(games_dir.glob('*.json')) if games_dir.exists() else []
    if not game_files:
        return _load_json_list(data_dir / 'games_canonical.json')
    games = []
    for games_file in game_files:
        games.extend(_load_json_list(games_file))
    return games


def _run_local_validation(stadiums, teams, games, stadium_aliases, verbose):
    """Run validate_canonical_data and print its findings.

    Returns a ``(report_section, is_valid)`` tuple; ``is_valid`` is None when
    the validate_canonical module could not be imported.
    """
    from dataclasses import asdict

    if not HAS_VALIDATION:
        print("\n ⚠ validate_canonical module not available")
        print(" Run from Scripts/ directory or install validate_canonical.py")
        return {'error': 'module not available'}, None

    result = validate_canonical_data(stadiums, teams, games, stadium_aliases, verbose=verbose)
    if result.is_valid:
        print("\n ✓ Local data VALID")
    else:
        print("\n ✗ Local data INVALID")
    print(f" Errors: {result.error_count}")
    print(f" Warnings: {result.warning_count}")
    if result.summary.get('by_category'):
        print("\n Issues by category:")
        for category, count in sorted(result.summary['by_category'].items()):
            print(f" {category}: {count}")
    # Show first 5 errors
    # NOTE(review): subscripting assumes result.errors holds dict-like entries - confirm
    # against validate_canonical.ValidationResult.
    errors_only = [e for e in result.errors if e['severity'] == 'error']
    if errors_only:
        print("\n Errors (first 5):")
        for e in errors_only[:5]:
            print(f" [{e['category']}] {e['message']}")
        if len(errors_only) > 5:
            print(f" ... and {len(errors_only) - 5} more errors")
    # Read validity from the result object itself so the summary does not depend on
    # 'is_valid' surviving the asdict() conversion.
    return asdict(result), bool(result.is_valid)


def _referenced_record_name(fields, field_name):
    """Return the recordName held by a CloudKit reference field, or '' if absent/malformed."""
    ref = fields.get(field_name, {}).get('value', {})
    return ref.get('recordName', '') if isinstance(ref, dict) else ''


def _find_missing_stadiums(records, stadium_uuids, error_type, key_name):
    """Return error dicts for records whose stadiumCanonicalId maps to no known stadium UUID."""
    errors = []
    for record_key, record in records.items():
        fields = record.get('fields', {})
        stadium_canonical_id = fields.get('stadiumCanonicalId', {}).get('value', '')
        if stadium_canonical_id and deterministic_uuid(stadium_canonical_id) not in stadium_uuids:
            errors.append({
                'type': error_type,
                key_name: record_key,
                'missing_stadium_id': stadium_canonical_id,
            })
    return errors


def _validate_cloudkit_relationships(ck, verbose):
    """Query CloudKit and check that game, team and alias references resolve.

    Returns ``(ck_records, report_section)`` where ``ck_records`` maps the
    record types 'Stadium', 'Team' and 'Game' to the queried record dicts.
    """
    print("\n Querying CloudKit data...")
    # Query all CloudKit records
    ck_stadiums = ck.query_all('Stadium', verbose=verbose)
    ck_teams = ck.query_all('Team', verbose=verbose)
    ck_games = ck.query_all('Game', verbose=verbose)
    ck_stadium_aliases = ck.query_all('StadiumAlias', verbose=verbose)
    print("\n CloudKit counts:")
    print(f" Stadiums: {len(ck_stadiums)}")
    print(f" Teams: {len(ck_teams)}")
    print(f" Games: {len(ck_games)}")
    print(f" Stadium aliases: {len(ck_stadium_aliases)}")

    ck_stadium_uuids = set(ck_stadiums.keys())
    ck_team_uuids = set(ck_teams.keys())
    cloudkit_errors = []

    # Check games reference valid teams and stadiums
    print("\n Checking game references...")
    # (reference field, set of valid record names, error type, error key for the missing id)
    game_checks = (
        ('homeTeamRef', ck_team_uuids, 'game_missing_home_team', 'missing_team_uuid'),
        ('awayTeamRef', ck_team_uuids, 'game_missing_away_team', 'missing_team_uuid'),
        ('stadiumRef', ck_stadium_uuids, 'game_missing_stadium', 'missing_stadium_uuid'),
    )
    missing = {error_type: 0 for _, _, error_type, _ in game_checks}
    for game_uuid, game_rec in ck_games.items():
        fields = game_rec.get('fields', {})
        for field_name, valid_names, error_type, missing_key in game_checks:
            target = _referenced_record_name(fields, field_name)
            # An empty reference is not reported; only dangling ones are.
            if target and target not in valid_names:
                missing[error_type] += 1
                cloudkit_errors.append({
                    'type': error_type,
                    'game_uuid': game_uuid,
                    missing_key: target,
                })
    if any(missing.values()):
        print(f" Games with missing home team ref: {missing['game_missing_home_team']}")
        print(f" Games with missing away team ref: {missing['game_missing_away_team']}")
        print(f" Games with missing stadium ref: {missing['game_missing_stadium']}")
    else:
        print(" ✓ All game references valid")

    # Check teams reference valid stadiums
    print("\n Checking team references...")
    team_errors = _find_missing_stadiums(
        ck_teams, ck_stadium_uuids, 'team_missing_stadium', 'team_uuid')
    cloudkit_errors.extend(team_errors)
    if team_errors:
        print(f" Teams with missing stadium: {len(team_errors)}")
    else:
        print(" ✓ All team references valid")

    # Check stadium aliases reference valid stadiums
    print("\n Checking alias references...")
    alias_errors = _find_missing_stadiums(
        ck_stadium_aliases, ck_stadium_uuids, 'alias_missing_stadium', 'alias_name')
    cloudkit_errors.extend(alias_errors)
    if alias_errors:
        print(f" Aliases with missing stadium: {len(alias_errors)}")
    else:
        print(" ✓ All alias references valid")

    section = {
        'counts': {
            'stadiums': len(ck_stadiums),
            'teams': len(ck_teams),
            'games': len(ck_games),
            'stadium_aliases': len(ck_stadium_aliases),
        },
        'errors': cloudkit_errors,
        'error_count': len(cloudkit_errors),
    }
    ck_records = {'Stadium': ck_stadiums, 'Team': ck_teams, 'Game': ck_games}
    return ck_records, section


def _compute_sync_status(stadiums, teams, games, ck_records):
    """Compare local record names against CloudKit record names, per record type."""
    # Local record names are derived the same way for every type: deterministic_uuid of
    # the canonical id. Stadiums and games fall back to 'id'; teams do not.
    local_names_by_type = {
        'Stadium': {deterministic_uuid(s.get('canonical_id', s.get('id', ''))) for s in stadiums},
        'Team': {deterministic_uuid(t.get('canonical_id', '')) for t in teams},
        'Game': {deterministic_uuid(g.get('canonical_id', g.get('id', ''))) for g in games},
    }
    sync_status = {}
    print("\n Comparing local vs CloudKit...")
    for record_type in ('Stadium', 'Team', 'Game'):
        local_names = local_names_by_type[record_type]
        cloud_names = set(ck_records[record_type].keys())
        only_local = local_names - cloud_names
        only_cloud = cloud_names - local_names
        sync_status[record_type] = {
            'local_count': len(local_names),
            'cloud_count': len(cloud_names),
            'only_local': len(only_local),
            'only_cloud': len(only_cloud),
            'in_both': len(local_names & cloud_names),
        }
        status = "out of sync" if only_local or only_cloud else "in sync"
        print(f"\n {record_type}:")
        print(f" Local: {len(local_names)}, CloudKit: {len(cloud_names)}")
        print(f" Only in local (not uploaded): {len(only_local)}")
        print(f" Only in CloudKit (orphans): {len(only_cloud)}")
        print(f" Status: {status}")
    return sync_status


def validate_all(ck, data_dir, output_file=None, verbose=False):
    """
    Print and return a comprehensive validation report including:
    1. Local data validation (from validate_canonical.py)
    2. CloudKit relationship validation
    3. Sync status comparison

    Args:
        ck: CloudKit client exposing ``query_all(record_type, verbose=...)``, or a
            falsy value to skip the CloudKit sections (local checks still run).
        data_dir: Directory containing the canonical JSON files; missing files
            are treated as empty lists.
        output_file: Optional path; when given, the report is also written there as JSON.
        verbose: Forwarded to the local validator and to the CloudKit queries.

    Returns:
        The report dict with keys 'timestamp', 'local_counts', 'local_validation',
        'cloudkit_validation' and 'sync_status'. A section that could not run
        holds ``{'error': <reason>}``.
    """
    data_dir = Path(data_dir)
    print("\n" + "="*60)
    print("Comprehensive Data Validation Report")
    print("="*60)

    # Load local data
    stadiums = _load_json_list(data_dir / 'stadiums_canonical.json')
    teams = _load_json_list(data_dir / 'teams_canonical.json')
    games = _load_local_games(data_dir)
    stadium_aliases = _load_json_list(data_dir / 'stadium_aliases.json')
    team_aliases = _load_json_list(data_dir / 'team_aliases.json')
    print("\nLocal data loaded:")
    print(f" Stadiums: {len(stadiums)}")
    print(f" Teams: {len(teams)}")
    print(f" Games: {len(games)}")
    print(f" Stadium aliases: {len(stadium_aliases)}")
    print(f" Team aliases: {len(team_aliases)}")
    report = {
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'local_counts': {
            'stadiums': len(stadiums),
            'teams': len(teams),
            'games': len(games),
            'stadium_aliases': len(stadium_aliases),
            'team_aliases': len(team_aliases),
        },
        'local_validation': None,
        'cloudkit_validation': None,
        'sync_status': None,
    }

    # =========================================================================
    # Section 1: Local Validation (using validate_canonical.py)
    # =========================================================================
    print("\n" + "-"*60)
    print("SECTION 1: Local Data Validation")
    print("-"*60)
    report['local_validation'], local_valid = _run_local_validation(
        stadiums, teams, games, stadium_aliases, verbose)

    # =========================================================================
    # Section 2: CloudKit Relationship Validation
    # =========================================================================
    print("\n" + "-"*60)
    print("SECTION 2: CloudKit Relationship Validation")
    print("-"*60)
    ck_records = None
    if ck:
        ck_records, report['cloudkit_validation'] = _validate_cloudkit_relationships(ck, verbose)
    else:
        print("\n ⚠ CloudKit not connected (dry-run mode or missing credentials)")
        print(" CloudKit validation skipped")
        report['cloudkit_validation'] = {'error': 'not connected'}

    # =========================================================================
    # Section 3: Sync Status (Local vs CloudKit)
    # =========================================================================
    print("\n" + "-"*60)
    print("SECTION 3: Sync Status")
    print("-"*60)
    if ck:
        # Reuses the records fetched in Section 2 rather than querying again.
        report['sync_status'] = _compute_sync_status(stadiums, teams, games, ck_records)
    else:
        print("\n ⚠ CloudKit not connected - sync status unavailable")
        report['sync_status'] = {'error': 'not connected'}

    # =========================================================================
    # Summary
    # =========================================================================
    print("\n" + "="*60)
    print("VALIDATION SUMMARY")
    print("="*60)
    # Distinguish "did not run" (N/A) from "ran and failed", for both sections.
    if local_valid is None:
        local_label = 'N/A'
    else:
        local_label = '✓ PASSED' if local_valid else '✗ FAILED'
    cloudkit_section = report['cloudkit_validation']
    if 'error' in cloudkit_section:
        cloudkit_label = 'N/A'
    elif cloudkit_section.get('error_count', 1) == 0:
        cloudkit_label = '✓ PASSED'
    else:
        cloudkit_label = '⚠ ISSUES'
    print(f"\n Local validation: {local_label}")
    print(f" CloudKit references: {cloudkit_label}")
    sync_section = report['sync_status']
    if sync_section and 'error' not in sync_section:
        total_orphans = sum(s.get('only_cloud', 0) for s in sync_section.values())
        total_not_uploaded = sum(s.get('only_local', 0) for s in sync_section.values())
        print(f" Sync status: {total_not_uploaded} not uploaded, {total_orphans} orphans")

    # Export to JSON if requested
    if output_file:
        with open(output_file, 'w') as f:
            # default=str stringifies any value json cannot serialize natively.
            json.dump(report, f, indent=2, default=str)
        print(f"\n Report exported to: {output_file}")
    return report
# Valid record types for individual record management
VALID_RECORD_TYPES = ['Stadium', 'Team', 'Game', 'LeagueStructure', 'TeamAlias', 'StadiumAlias']
@@ -1648,6 +1994,8 @@ def main():
p.add_argument('--delete-orphans', action='store_true', help='With --smart-sync, also delete records not in local data')
p.add_argument('--verify', action='store_true', help='Verify CloudKit matches local data (quick: counts + spot-check)')
p.add_argument('--verify-deep', action='store_true', help='Verify CloudKit matches local data (deep: full field comparison)')
p.add_argument('--validate', action='store_true', help='Run comprehensive validation (local + CloudKit relationships + sync status)')
p.add_argument('--output', type=str, metavar='FILE', help='Output file for validation report (JSON format)')
# Individual record management
p.add_argument('--get', nargs=2, metavar=('TYPE', 'ID'), help='Get a single record (e.g., --get Stadium stadium_nba_td_garden)')
p.add_argument('--list', metavar='TYPE', help='List all recordNames for a type (e.g., --list Stadium)')
@@ -1665,7 +2013,7 @@ def main():
args.stadiums_only, args.games_only, args.games_files, args.league_structure_only,
args.team_aliases_only, args.stadium_aliases_only, args.canonical_only,
args.delete_all, args.delete_only, args.dry_run, args.diff, args.smart_sync,
args.verify, args.verify_deep, args.get, args.list, args.update_record, args.delete_record
args.verify, args.verify_deep, args.validate, args.get, args.list, args.update_record, args.delete_record
])
# Track selected game files (for option 4 or --games-files)
@@ -1716,6 +2064,8 @@ def main():
args.verify = True
elif choice == 15: # Verify sync (deep)
args.verify_deep = True
elif choice == 16: # Validate data
args.validate = True
print(f"\n{'='*50}")
print(f"CloudKit Import {'(DRY RUN)' if args.dry_run else ''}")
@@ -1810,6 +2160,14 @@ def main():
verify_sync(ck, args.data_dir, verbose=args.verbose, deep=args.verify_deep)
return
# Handle validate mode
if args.validate:
# CloudKit connection is optional for validation (can still run local checks)
if not args.dry_run and HAS_CRYPTO and os.path.exists(args.key_file):
ck = CloudKit(args.key_id, open(args.key_file, 'rb').read(), args.container, args.env)
validate_all(ck, args.data_dir, output_file=args.output, verbose=args.verbose)
return
# Handle individual record operations
if args.get:
record_type, record_id = args.get