From 9f0edc42280c5e80b5e68aa3320512d1dfb186f2 Mon Sep 17 00:00:00 2001 From: Trey t Date: Sat, 10 Jan 2026 10:31:52 -0600 Subject: [PATCH] feat(06-01): add comprehensive validation command Add --validate flag with local validation, CloudKit relationship checking, and sync status comparison. Includes JSON export via --output flag and menu option 16 for interactive mode. Co-Authored-By: Claude Opus 4.5 --- Scripts/cloudkit_import.py | 366 ++++++++++++++++++++++++++++++++++++- 1 file changed, 362 insertions(+), 4 deletions(-) diff --git a/Scripts/cloudkit_import.py b/Scripts/cloudkit_import.py index 67cda2c..909e83b 100755 --- a/Scripts/cloudkit_import.py +++ b/Scripts/cloudkit_import.py @@ -58,6 +58,24 @@ try: except ImportError: HAS_CRYPTO = False +# Import validation functions from validate_canonical +try: + from validate_canonical import ( + validate_canonical_data, + validate_no_duplicate_ids, + validate_required_fields, + validate_team_stadium_references, + validate_game_references, + validate_no_cross_sport_references, + validate_stadium_aliases, + validate_game_counts_per_team, + ValidationError, + ValidationResult, + ) + HAS_VALIDATION = True +except ImportError: + HAS_VALIDATION = False + CONTAINER = "iCloud.com.sportstime.app" HOST = "https://api.apple-cloudkit.com" BATCH_SIZE = 200 @@ -139,17 +157,18 @@ def show_menu(): print(" 13. Smart sync + delete orphans") print(" 14. Verify sync (quick)") print(" 15. Verify sync (deep)") + print(" 16. Validate data (local + CloudKit)") print(" 0. Exit") print() while True: try: - choice = input("Enter choice [1-15, 0 to exit]: ").strip() + choice = input("Enter choice [1-16, 0 to exit]: ").strip() if choice == '0': return None - if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15']: + if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16']: return int(choice) - print("Invalid choice. 
def _load_json_or_empty(path):
    """Return the parsed JSON content of *path*, or ``[]`` when the file is absent."""
    if not path.exists():
        return []
    # Context manager closes the handle; JSON is read explicitly as UTF-8 so the
    # result does not depend on the platform's locale encoding.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def _load_local_games(data_dir):
    """Load games from canonical/games/*.json, falling back to games_canonical.json."""
    games_dir = data_dir / 'canonical' / 'games'
    game_files = sorted(games_dir.glob('*.json')) if games_dir.exists() else []
    if not game_files:
        return _load_json_or_empty(data_dir / 'games_canonical.json')
    games = []
    for games_file in game_files:
        games.extend(_load_json_or_empty(games_file))
    return games


def _check_game_references(ck_games, ck_team_uuids, ck_stadium_uuids):
    """Return (errors, counts_by_error_type) for Game reference fields that do not resolve."""
    checks = (
        ('homeTeamRef', 'game_missing_home_team', 'missing_team_uuid', ck_team_uuids),
        ('awayTeamRef', 'game_missing_away_team', 'missing_team_uuid', ck_team_uuids),
        ('stadiumRef', 'game_missing_stadium', 'missing_stadium_uuid', ck_stadium_uuids),
    )
    errors = []
    counts = {error_type: 0 for _, error_type, _, _ in checks}
    for game_uuid, game_rec in ck_games.items():
        fields = game_rec.get('fields', {})
        for field_name, error_type, missing_key, known_uuids in checks:
            ref = fields.get(field_name, {}).get('value', {})
            ref_uuid = ref.get('recordName', '') if isinstance(ref, dict) else ''
            # An empty/absent reference is not reported; only dangling ones are.
            if ref_uuid and ref_uuid not in known_uuids:
                counts[error_type] += 1
                errors.append({
                    'type': error_type,
                    'game_uuid': game_uuid,
                    missing_key: ref_uuid,
                })
    return errors, counts


def _check_stadium_canonical_ids(ck_records, ck_stadium_uuids, error_type, key_name):
    """Return errors for records whose stadiumCanonicalId maps to no CloudKit stadium."""
    errors = []
    for record_key, rec in ck_records.items():
        stadium_canonical_id = rec.get('fields', {}).get('stadiumCanonicalId', {}).get('value', '')
        if stadium_canonical_id and deterministic_uuid(stadium_canonical_id) not in ck_stadium_uuids:
            errors.append({
                'type': error_type,
                key_name: record_key,
                'missing_stadium_id': stadium_canonical_id,
            })
    return errors


def _compare_record_names(local_names, cloud_names):
    """Summarise how two sets of record names overlap."""
    return {
        'local_count': len(local_names),
        'cloud_count': len(cloud_names),
        'only_local': len(local_names - cloud_names),
        'only_cloud': len(cloud_names - local_names),
        'in_both': len(local_names & cloud_names),
    }


def validate_all(ck, data_dir, output_file=None, verbose=False):
    """
    Print and return a comprehensive validation report.

    The report has three sections:
    1. Local data validation (delegated to validate_canonical_data when the
       validate_canonical module was importable, i.e. HAS_VALIDATION is true)
    2. CloudKit relationship validation (games -> teams/stadiums,
       teams -> stadiums, stadium aliases -> stadiums)
    3. Sync status comparison of local vs CloudKit record names

    Args:
        ck: CloudKit client, or a falsy value to skip sections 2 and 3. Must
            provide query_all(record_type, verbose=...); the result is used
            here as a dict of records keyed by record name.
        data_dir: Directory holding stadiums_canonical.json,
            teams_canonical.json, stadium_aliases.json, team_aliases.json and
            either canonical/games/*.json or games_canonical.json. Missing
            files are treated as empty lists.
        output_file: Optional path; when given, the report is written there
            as JSON.
        verbose: Forwarded to validate_canonical_data and ck.query_all.

    Returns:
        dict with keys 'timestamp', 'local_counts', 'local_validation',
        'cloudkit_validation' and 'sync_status'. A skipped section holds
        {'error': <reason>}.
    """
    from dataclasses import asdict

    data_dir = Path(data_dir)

    print("\n" + "="*60)
    print("Comprehensive Data Validation Report")
    print("="*60)

    # Load local data
    stadiums = _load_json_or_empty(data_dir / 'stadiums_canonical.json')
    teams = _load_json_or_empty(data_dir / 'teams_canonical.json')
    games = _load_local_games(data_dir)
    stadium_aliases = _load_json_or_empty(data_dir / 'stadium_aliases.json')
    team_aliases = _load_json_or_empty(data_dir / 'team_aliases.json')

    print("\nLocal data loaded:")
    print(f" Stadiums: {len(stadiums)}")
    print(f" Teams: {len(teams)}")
    print(f" Games: {len(games)}")
    print(f" Stadium aliases: {len(stadium_aliases)}")
    print(f" Team aliases: {len(team_aliases)}")

    report = {
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'local_counts': {
            'stadiums': len(stadiums),
            'teams': len(teams),
            'games': len(games),
            'stadium_aliases': len(stadium_aliases),
            'team_aliases': len(team_aliases),
        },
        'local_validation': None,
        'cloudkit_validation': None,
        'sync_status': None,
    }

    # =========================================================================
    # Section 1: Local Validation (using validate_canonical.py)
    # =========================================================================
    print("\n" + "-"*60)
    print("SECTION 1: Local Data Validation")
    print("-"*60)

    # None means "not run"; the summary must not report a skipped check as failed.
    local_valid = None

    if HAS_VALIDATION:
        result = validate_canonical_data(stadiums, teams, games, stadium_aliases, verbose=verbose)
        report['local_validation'] = asdict(result)
        # Read the verdict from the result object rather than from the asdict()
        # copy, which only contains dataclass fields.
        local_valid = bool(result.is_valid)

        if result.is_valid:
            print("\n ✓ Local data VALID")
        else:
            print("\n ✗ Local data INVALID")
        print(f" Errors: {result.error_count}")
        print(f" Warnings: {result.warning_count}")

        if result.summary.get('by_category'):
            print("\n Issues by category:")
            for category, count in sorted(result.summary['by_category'].items()):
                print(f" {category}: {count}")

        # Show first 5 errors
        # NOTE(review): assumes result.errors holds dicts with 'severity',
        # 'category' and 'message' keys - confirm against validate_canonical.
        errors_only = [e for e in result.errors if e['severity'] == 'error']
        if errors_only:
            print("\n Errors (first 5):")
            for e in errors_only[:5]:
                print(f" [{e['category']}] {e['message']}")
            if len(errors_only) > 5:
                print(f" ... and {len(errors_only) - 5} more errors")
    else:
        print("\n ⚠ validate_canonical module not available")
        print(" Run from Scripts/ directory or install validate_canonical.py")
        report['local_validation'] = {'error': 'module not available'}

    # =========================================================================
    # Section 2: CloudKit Relationship Validation
    # =========================================================================
    print("\n" + "-"*60)
    print("SECTION 2: CloudKit Relationship Validation")
    print("-"*60)

    cloudkit_errors = []
    cloudkit_valid = None  # None means "not connected"

    if ck:
        print("\n Querying CloudKit data...")

        # Query all CloudKit records
        ck_stadiums = ck.query_all('Stadium', verbose=verbose)
        ck_teams = ck.query_all('Team', verbose=verbose)
        ck_games = ck.query_all('Game', verbose=verbose)
        ck_stadium_aliases = ck.query_all('StadiumAlias', verbose=verbose)

        print("\n CloudKit counts:")
        print(f" Stadiums: {len(ck_stadiums)}")
        print(f" Teams: {len(ck_teams)}")
        print(f" Games: {len(ck_games)}")
        print(f" Stadium aliases: {len(ck_stadium_aliases)}")

        # Build ID sets for CloudKit records
        ck_stadium_uuids = set(ck_stadiums.keys())
        ck_team_uuids = set(ck_teams.keys())

        # Check games reference valid teams and stadiums
        print("\n Checking game references...")
        game_errors, game_counts = _check_game_references(ck_games, ck_team_uuids, ck_stadium_uuids)
        cloudkit_errors.extend(game_errors)

        if game_errors:
            print(f" Games with missing home team ref: {game_counts['game_missing_home_team']}")
            print(f" Games with missing away team ref: {game_counts['game_missing_away_team']}")
            print(f" Games with missing stadium ref: {game_counts['game_missing_stadium']}")
        else:
            print(" ✓ All game references valid")

        # Check teams reference valid stadiums
        print("\n Checking team references...")
        team_errors = _check_stadium_canonical_ids(
            ck_teams, ck_stadium_uuids, 'team_missing_stadium', 'team_uuid')
        cloudkit_errors.extend(team_errors)

        if team_errors:
            print(f" Teams with missing stadium: {len(team_errors)}")
        else:
            print(" ✓ All team references valid")

        # Check stadium aliases reference valid stadiums
        print("\n Checking alias references...")
        alias_errors = _check_stadium_canonical_ids(
            ck_stadium_aliases, ck_stadium_uuids, 'alias_missing_stadium', 'alias_name')
        cloudkit_errors.extend(alias_errors)

        if alias_errors:
            print(f" Aliases with missing stadium: {len(alias_errors)}")
        else:
            print(" ✓ All alias references valid")

        cloudkit_valid = not cloudkit_errors
        report['cloudkit_validation'] = {
            'counts': {
                'stadiums': len(ck_stadiums),
                'teams': len(ck_teams),
                'games': len(ck_games),
                'stadium_aliases': len(ck_stadium_aliases),
            },
            'errors': cloudkit_errors,
            'error_count': len(cloudkit_errors),
        }
    else:
        print("\n ⚠ CloudKit not connected (dry-run mode or missing credentials)")
        print(" CloudKit validation skipped")
        report['cloudkit_validation'] = {'error': 'not connected'}

    # =========================================================================
    # Section 3: Sync Status (Local vs CloudKit)
    # =========================================================================
    print("\n" + "-"*60)
    print("SECTION 3: Sync Status")
    print("-"*60)

    sync_status = None

    if ck:
        # Only record names matter for this comparison, so build plain sets of
        # the deterministic record names instead of full record dicts.
        # NOTE(review): teams use 'canonical_id' only (no 'id' fallback),
        # unlike stadiums and games - kept as in the original.
        comparisons = [
            ('Stadium',
             {deterministic_uuid(s.get('canonical_id', s.get('id', ''))) for s in stadiums},
             ck_stadiums),
            ('Team',
             {deterministic_uuid(t.get('canonical_id', '')) for t in teams},
             ck_teams),
            ('Game',
             {deterministic_uuid(g.get('canonical_id', g.get('id', ''))) for g in games},
             ck_games),
        ]

        sync_status = {}

        print("\n Comparing local vs CloudKit...")

        for record_type, local_names, ck_records in comparisons:
            summary = _compare_record_names(local_names, set(ck_records.keys()))
            sync_status[record_type] = summary

            in_sync = summary['only_local'] == 0 and summary['only_cloud'] == 0
            status = "in sync" if in_sync else "out of sync"
            print(f"\n {record_type}:")
            print(f" Local: {summary['local_count']}, CloudKit: {summary['cloud_count']}")
            print(f" Only in local (not uploaded): {summary['only_local']}")
            print(f" Only in CloudKit (orphans): {summary['only_cloud']}")
            print(f" Status: {status}")

        report['sync_status'] = sync_status
    else:
        print("\n ⚠ CloudKit not connected - sync status unavailable")
        report['sync_status'] = {'error': 'not connected'}

    # =========================================================================
    # Summary
    # =========================================================================
    print("\n" + "="*60)
    print("VALIDATION SUMMARY")
    print("="*60)

    if local_valid is None:
        # Validation module unavailable: the check was skipped, not failed.
        local_label = 'N/A'
    elif local_valid:
        local_label = '✓ PASSED'
    else:
        local_label = '✗ FAILED'

    if cloudkit_valid is None:
        cloudkit_label = 'N/A'
    elif cloudkit_valid:
        cloudkit_label = '✓ PASSED'
    else:
        cloudkit_label = '⚠ ISSUES'

    print(f"\n Local validation: {local_label}")
    print(f" CloudKit references: {cloudkit_label}")

    if sync_status:
        total_orphans = sum(s.get('only_cloud', 0) for s in sync_status.values())
        total_not_uploaded = sum(s.get('only_local', 0) for s in sync_status.values())
        print(f" Sync status: {total_not_uploaded} not uploaded, {total_orphans} orphans")

    # Export to JSON if requested
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            # default=str is a fallback for any non-JSON-native value that
            # asdict(result) may carry.
            json.dump(report, f, indent=2, default=str)
        print(f"\n Report exported to: {output_file}")

    return report
args.team_aliases_only, args.stadium_aliases_only, args.canonical_only, args.delete_all, args.delete_only, args.dry_run, args.diff, args.smart_sync, - args.verify, args.verify_deep, args.get, args.list, args.update_record, args.delete_record + args.verify, args.verify_deep, args.validate, args.get, args.list, args.update_record, args.delete_record ]) # Track selected game files (for option 4 or --games-files) @@ -1716,6 +2064,8 @@ def main(): args.verify = True elif choice == 15: # Verify sync (deep) args.verify_deep = True + elif choice == 16: # Validate data + args.validate = True print(f"\n{'='*50}") print(f"CloudKit Import {'(DRY RUN)' if args.dry_run else ''}") @@ -1810,6 +2160,14 @@ def main(): verify_sync(ck, args.data_dir, verbose=args.verbose, deep=args.verify_deep) return + # Handle validate mode + if args.validate: + # CloudKit connection is optional for validation (can still run local checks) + if not args.dry_run and HAS_CRYPTO and os.path.exists(args.key_file): + ck = CloudKit(args.key_id, open(args.key_file, 'rb').read(), args.container, args.env) + validate_all(ck, args.data_dir, output_file=args.output, verbose=args.verbose) + return + # Handle individual record operations if args.get: record_type, record_id = args.get