feat(06-01): add comprehensive validation command

Add --validate flag with local validation, CloudKit relationship checking, and sync status comparison. Includes JSON export via --output flag and menu option 16 for interactive mode.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 10:31:52 -06:00
parent 4266940c8f
commit 9f0edc4228
1 changed files with 362 additions and 4 deletions
--- a/Scripts/cloudkit_import.py
+++ b/Scripts/cloudkit_import.py
@@ -58,6 +58,24 @@ try:
 except ImportError:
    HAS_CRYPTO = False

+# Import validation functions from validate_canonical
+try:
+    from validate_canonical import (
+        validate_canonical_data,
+        validate_no_duplicate_ids,
+        validate_required_fields,
+        validate_team_stadium_references,
+        validate_game_references,
+        validate_no_cross_sport_references,
+        validate_stadium_aliases,
+        validate_game_counts_per_team,
+        ValidationError,
+        ValidationResult,
+    )
+    HAS_VALIDATION = True
+except ImportError:
+    HAS_VALIDATION = False
+
 CONTAINER = "iCloud.com.sportstime.app"
 HOST = "https://api.apple-cloudkit.com"
 BATCH_SIZE = 200
@@ -139,17 +157,18 @@ def show_menu():
    print("  13. Smart sync + delete orphans")
    print("  14. Verify sync (quick)")
    print("  15. Verify sync (deep)")
+    print("  16. Validate data (local + CloudKit)")
    print("  0. Exit")
    print()

    while True:
        try:
-            choice = input("Enter choice [1-15, 0 to exit]: ").strip()
+            choice = input("Enter choice [1-16, 0 to exit]: ").strip()
            if choice == '0':
                return None
-            if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15']:
+            if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16']:
                return int(choice)
-            print("Invalid choice. Please enter 1-15 or 0.")
+            print("Invalid choice. Please enter 1-16 or 0.")
        except (EOFError, KeyboardInterrupt):
            print("\nExiting.")
            return None
@@ -1315,6 +1334,333 @@ def verify_sync(ck, data_dir, verbose=False, deep=False):
    return total_mismatches == 0


+def validate_all(ck, data_dir, output_file=None, verbose=False):
+    """
+    Comprehensive validation report including:
+    1. Local data validation (from validate_canonical.py)
+    2. CloudKit relationship validation
+    3. Sync status comparison
+
+    Args:
+        ck: CloudKit client used for the remote checks; a falsy value skips
+            sections 2 and 3 and records them as 'not connected'.
+        data_dir: Directory holding the canonical JSON files.
+        output_file: Optional path; when given the report is written as JSON.
+        verbose: Passed through to the validator and to ck.query_all.
+
+    Returns:
+        dict with keys 'timestamp', 'local_counts', 'local_validation',
+        'cloudkit_validation' and 'sync_status'.
+    """
+    from dataclasses import asdict
+    data_dir = Path(data_dir)
+
+    print("\n" + "="*60)
+    print("Comprehensive Data Validation Report")
+    print("="*60)
+
+    def load_json_or_empty(path):
+        # Missing files count as empty datasets; 'with' closes every handle.
+        if not path.exists():
+            return []
+        with open(path, encoding='utf-8') as f:
+            return json.load(f)
+
+    # Load local data
+    stadiums = load_json_or_empty(data_dir / 'stadiums_canonical.json')
+    teams = load_json_or_empty(data_dir / 'teams_canonical.json')
+
+    # Load games from canonical/games/*.json OR games_canonical.json
+    canonical_games_dir = data_dir / 'canonical' / 'games'
+    games = []
+    game_files = sorted(canonical_games_dir.glob('*.json')) if canonical_games_dir.exists() else []
+    if game_files:
+        for games_file in game_files:
+            games.extend(load_json_or_empty(games_file))
+    else:
+        games = load_json_or_empty(data_dir / 'games_canonical.json')
+
+    stadium_aliases = load_json_or_empty(data_dir / 'stadium_aliases.json')
+    team_aliases = load_json_or_empty(data_dir / 'team_aliases.json')
+
+    print(f"\nLocal data loaded:")
+    print(f"  Stadiums: {len(stadiums)}")
+    print(f"  Teams: {len(teams)}")
+    print(f"  Games: {len(games)}")
+    print(f"  Stadium aliases: {len(stadium_aliases)}")
+    print(f"  Team aliases: {len(team_aliases)}")
+
+    report = {
+        'timestamp': datetime.now(timezone.utc).isoformat(),
+        'local_counts': {
+            'stadiums': len(stadiums),
+            'teams': len(teams),
+            'games': len(games),
+            'stadium_aliases': len(stadium_aliases),
+            'team_aliases': len(team_aliases),
+        },
+        'local_validation': None,
+        'cloudkit_validation': None,
+        'sync_status': None,
+    }
+
+    # =========================================================================
+    # Section 1: Local Validation (using validate_canonical.py)
+    # =========================================================================
+    print("\n" + "-"*60)
+    print("SECTION 1: Local Data Validation")
+    print("-"*60)
+
+    if HAS_VALIDATION:
+        result = validate_canonical_data(stadiums, teams, games, stadium_aliases, verbose=verbose)
+        report['local_validation'] = asdict(result)
+
+        if result.is_valid:
+            print("\n  ✓ Local data VALID")
+        else:
+            print("\n  ✗ Local data INVALID")
+        print(f"  Errors: {result.error_count}")
+        print(f"  Warnings: {result.warning_count}")
+
+        if result.summary.get('by_category'):
+            print("\n  Issues by category:")
+            for category, count in sorted(result.summary['by_category'].items()):
+                print(f"    {category}: {count}")
+
+        # Show first 5 errors
+        errors_only = [e for e in result.errors if e['severity'] == 'error']
+        if errors_only:
+            print("\n  Errors (first 5):")
+            for e in errors_only[:5]:
+                print(f"    [{e['category']}] {e['message']}")
+            if len(errors_only) > 5:
+                print(f"    ... and {len(errors_only) - 5} more errors")
+    else:
+        print("\n  ⚠ validate_canonical module not available")
+        print("    Run from Scripts/ directory or install validate_canonical.py")
+        report['local_validation'] = {'error': 'module not available'}
+
+    # =========================================================================
+    # Section 2: CloudKit Relationship Validation
+    # =========================================================================
+    print("\n" + "-"*60)
+    print("SECTION 2: CloudKit Relationship Validation")
+    print("-"*60)
+
+    cloudkit_errors = []
+
+    if ck:
+        print("\n  Querying CloudKit data...")
+
+        # Query all CloudKit records
+        ck_stadiums = ck.query_all('Stadium', verbose=verbose)
+        ck_teams = ck.query_all('Team', verbose=verbose)
+        ck_games = ck.query_all('Game', verbose=verbose)
+        ck_stadium_aliases = ck.query_all('StadiumAlias', verbose=verbose)
+
+        print(f"\n  CloudKit counts:")
+        print(f"    Stadiums: {len(ck_stadiums)}")
+        print(f"    Teams: {len(ck_teams)}")
+        print(f"    Games: {len(ck_games)}")
+        print(f"    Stadium aliases: {len(ck_stadium_aliases)}")
+
+        # Build ID sets for CloudKit records
+        ck_stadium_uuids = set(ck_stadiums.keys())
+        ck_team_uuids = set(ck_teams.keys())
+
+        # Check games reference valid teams and stadiums
+        print("\n  Checking game references...")
+        games_missing_home = 0
+        games_missing_away = 0
+        games_missing_stadium = 0
+
+        for game_uuid, game_rec in ck_games.items():
+            fields = game_rec.get('fields', {})
+
+            home_ref = fields.get('homeTeamRef', {}).get('value', {})
+            home_uuid = home_ref.get('recordName', '') if isinstance(home_ref, dict) else ''
+            if home_uuid and home_uuid not in ck_team_uuids:
+                games_missing_home += 1
+                cloudkit_errors.append({
+                    'type': 'game_missing_home_team',
+                    'game_uuid': game_uuid,
+                    'missing_team_uuid': home_uuid
+                })
+
+            away_ref = fields.get('awayTeamRef', {}).get('value', {})
+            away_uuid = away_ref.get('recordName', '') if isinstance(away_ref, dict) else ''
+            if away_uuid and away_uuid not in ck_team_uuids:
+                games_missing_away += 1
+                cloudkit_errors.append({
+                    'type': 'game_missing_away_team',
+                    'game_uuid': game_uuid,
+                    'missing_team_uuid': away_uuid
+                })
+
+            stadium_ref = fields.get('stadiumRef', {}).get('value', {})
+            stadium_uuid = stadium_ref.get('recordName', '') if isinstance(stadium_ref, dict) else ''
+            if stadium_uuid and stadium_uuid not in ck_stadium_uuids:
+                games_missing_stadium += 1
+                cloudkit_errors.append({
+                    'type': 'game_missing_stadium',
+                    'game_uuid': game_uuid,
+                    'missing_stadium_uuid': stadium_uuid
+                })
+
+        if games_missing_home or games_missing_away or games_missing_stadium:
+            print(f"    Games with missing home team ref: {games_missing_home}")
+            print(f"    Games with missing away team ref: {games_missing_away}")
+            print(f"    Games with missing stadium ref: {games_missing_stadium}")
+        else:
+            print("    ✓ All game references valid")
+
+        # Check teams reference valid stadiums
+        print("\n  Checking team references...")
+        teams_missing_stadium = 0
+
+        for team_uuid, team_rec in ck_teams.items():
+            fields = team_rec.get('fields', {})
+            stadium_canonical_id = fields.get('stadiumCanonicalId', {}).get('value', '')
+            if stadium_canonical_id:
+                expected_stadium_uuid = deterministic_uuid(stadium_canonical_id)
+                if expected_stadium_uuid not in ck_stadium_uuids:
+                    teams_missing_stadium += 1
+                    cloudkit_errors.append({
+                        'type': 'team_missing_stadium',
+                        'team_uuid': team_uuid,
+                        'missing_stadium_id': stadium_canonical_id
+                    })
+
+        if teams_missing_stadium:
+            print(f"    Teams with missing stadium: {teams_missing_stadium}")
+        else:
+            print("    ✓ All team references valid")
+
+        # Check stadium aliases reference valid stadiums
+        print("\n  Checking alias references...")
+        aliases_missing_stadium = 0
+
+        for alias_name, alias_rec in ck_stadium_aliases.items():
+            fields = alias_rec.get('fields', {})
+            stadium_canonical_id = fields.get('stadiumCanonicalId', {}).get('value', '')
+            if stadium_canonical_id:
+                expected_stadium_uuid = deterministic_uuid(stadium_canonical_id)
+                if expected_stadium_uuid not in ck_stadium_uuids:
+                    aliases_missing_stadium += 1
+                    cloudkit_errors.append({
+                        'type': 'alias_missing_stadium',
+                        'alias_name': alias_name,
+                        'missing_stadium_id': stadium_canonical_id
+                    })
+
+        if aliases_missing_stadium:
+            print(f"    Aliases with missing stadium: {aliases_missing_stadium}")
+        else:
+            print("    ✓ All alias references valid")
+
+        report['cloudkit_validation'] = {
+            'counts': {
+                'stadiums': len(ck_stadiums),
+                'teams': len(ck_teams),
+                'games': len(ck_games),
+                'stadium_aliases': len(ck_stadium_aliases),
+            },
+            'errors': cloudkit_errors,
+            'error_count': len(cloudkit_errors),
+        }
+    else:
+        print("\n  ⚠ CloudKit not connected (dry-run mode or missing credentials)")
+        print("    CloudKit validation skipped")
+        report['cloudkit_validation'] = {'error': 'not connected'}
+
+    # =========================================================================
+    # Section 3: Sync Status (Local vs CloudKit)
+    # =========================================================================
+    print("\n" + "-"*60)
+    print("SECTION 3: Sync Status")
+    print("-"*60)
+
+    if ck:
+        # Only the record names are compared, so plain sets of deterministic
+        # UUIDs are enough; duplicate local IDs collapse to one name.
+        local_stadium_names = {
+            deterministic_uuid(s.get('canonical_id', s.get('id', ''))) for s in stadiums
+        }
+        local_team_names = {deterministic_uuid(t.get('canonical_id', '')) for t in teams}
+        local_game_names = {
+            deterministic_uuid(g.get('canonical_id', g.get('id', ''))) for g in games
+        }
+
+        sync_status = {}
+
+        print("\n  Comparing local vs CloudKit...")
+
+        for record_type, local_names, ck_records in [
+            ('Stadium', local_stadium_names, ck_stadiums),
+            ('Team', local_team_names, ck_teams),
+            ('Game', local_game_names, ck_games),
+        ]:
+            cloud_names = set(ck_records.keys())
+
+            only_local = local_names - cloud_names
+            only_cloud = cloud_names - local_names
+            both = local_names & cloud_names
+
+            sync_status[record_type] = {
+                'local_count': len(local_names),
+                'cloud_count': len(cloud_names),
+                'only_local': len(only_local),
+                'only_cloud': len(only_cloud),
+                'in_both': len(both),
+            }
+
+            status = "in sync" if len(only_local) == 0 and len(only_cloud) == 0 else "out of sync"
+            print(f"\n    {record_type}:")
+            print(f"      Local: {len(local_names)}, CloudKit: {len(cloud_names)}")
+            print(f"      Only in local (not uploaded): {len(only_local)}")
+            print(f"      Only in CloudKit (orphans): {len(only_cloud)}")
+            print(f"      Status: {status}")
+
+        report['sync_status'] = sync_status
+    else:
+        print("\n  ⚠ CloudKit not connected - sync status unavailable")
+        report['sync_status'] = {'error': 'not connected'}
+
+    # =========================================================================
+    # Summary
+    # =========================================================================
+    print("\n" + "="*60)
+    print("VALIDATION SUMMARY")
+    print("="*60)
+
+    # Both report sections are dicts; an 'error' key marks a skipped section.
+    local_report = report['local_validation']
+    local_ran = 'error' not in local_report
+    local_valid = local_ran and local_report.get('is_valid', False)
+    cloud_report = report['cloudkit_validation']
+    cloud_ran = 'error' not in cloud_report
+    cloudkit_valid = cloud_ran and cloud_report.get('error_count', 1) == 0
+
+    # A skipped section is reported as N/A rather than as a failure.
+    local_label = ('✓ PASSED' if local_valid else '✗ FAILED') if local_ran else 'N/A'
+    cloud_label = ('✓ PASSED' if cloudkit_valid else '⚠ ISSUES') if cloud_ran else 'N/A'
+    print(f"\n  Local validation: {local_label}")
+    print(f"  CloudKit references: {cloud_label}")
+
+    if report.get('sync_status') and 'error' not in report['sync_status']:
+        total_orphans = sum(s.get('only_cloud', 0) for s in report['sync_status'].values())
+        total_not_uploaded = sum(s.get('only_local', 0) for s in report['sync_status'].values())
+        print(f"  Sync status: {total_not_uploaded} not uploaded, {total_orphans} orphans")
+
+    # Export to JSON if requested
+    if output_file:
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2, default=str)
+        print(f"\n  Report exported to: {output_file}")
+
+    return report
+
+
 # Valid record types for individual record management
 VALID_RECORD_TYPES = ['Stadium', 'Team', 'Game', 'LeagueStructure', 'TeamAlias', 'StadiumAlias']

@@ -1648,6 +1994,8 @@ def main():
    p.add_argument('--delete-orphans', action='store_true', help='With --smart-sync, also delete records not in local data')
    p.add_argument('--verify', action='store_true', help='Verify CloudKit matches local data (quick: counts + spot-check)')
    p.add_argument('--verify-deep', action='store_true', help='Verify CloudKit matches local data (deep: full field comparison)')
+    p.add_argument('--validate', action='store_true', help='Run comprehensive validation (local + CloudKit relationships + sync status)')
+    p.add_argument('--output', type=str, metavar='FILE', help='Output file for validation report (JSON format)')
    # Individual record management
    p.add_argument('--get', nargs=2, metavar=('TYPE', 'ID'), help='Get a single record (e.g., --get Stadium stadium_nba_td_garden)')
    p.add_argument('--list', metavar='TYPE', help='List all recordNames for a type (e.g., --list Stadium)')
@@ -1665,7 +2013,7 @@ def main():
        args.stadiums_only, args.games_only, args.games_files, args.league_structure_only,
        args.team_aliases_only, args.stadium_aliases_only, args.canonical_only,
        args.delete_all, args.delete_only, args.dry_run, args.diff, args.smart_sync,
-        args.verify, args.verify_deep, args.get, args.list, args.update_record, args.delete_record
+        args.verify, args.verify_deep, args.validate, args.get, args.list, args.update_record, args.delete_record
    ])

    # Track selected game files (for option 4 or --games-files)
@@ -1716,6 +2064,8 @@ def main():
            args.verify = True
        elif choice == 15:  # Verify sync (deep)
            args.verify_deep = True
+        elif choice == 16:  # Validate data
+            args.validate = True

    print(f"\n{'='*50}")
    print(f"CloudKit Import {'(DRY RUN)' if args.dry_run else ''}")
@@ -1810,6 +2160,14 @@ def main():
        verify_sync(ck, args.data_dir, verbose=args.verbose, deep=args.verify_deep)
        return

+    # Handle validate mode
+    if args.validate:
+        # CloudKit connection is optional for validation (can still run local checks)
+        if not args.dry_run and HAS_CRYPTO and os.path.exists(args.key_file):
+            ck = CloudKit(args.key_id, open(args.key_file, 'rb').read(), args.container, args.env)
+        validate_all(ck, args.data_dir, output_file=args.output, verbose=args.verbose)
+        return
+
    # Handle individual record operations
    if args.get:
        record_type, record_id = args.get