From 5763db4a61bd9f33ec027bd03a8b08c654cc9412 Mon Sep 17 00:00:00 2001 From: Trey t Date: Sat, 10 Jan 2026 10:13:08 -0600 Subject: [PATCH] feat(05-02): add sync verification with --verify flag - Add --verify flag for quick verification (counts + 5-record spot-check) - Add --verify-deep flag for full field-by-field comparison - Add verify_sync() function to compare CloudKit vs local data - Add lookup() method to CloudKit class for record lookups - Add menu options 14-15 for verify sync quick/deep --- Scripts/cloudkit_import.py | 291 ++++++++++++++++++++++++++++++++++++- 1 file changed, 287 insertions(+), 4 deletions(-) diff --git a/Scripts/cloudkit_import.py b/Scripts/cloudkit_import.py index f99a1e0..78e93e0 100755 --- a/Scripts/cloudkit_import.py +++ b/Scripts/cloudkit_import.py @@ -137,17 +137,19 @@ def show_menu(): print(" 11. Dry run (preview only)") print(" 12. Smart sync (diff-based, only upload changes)") print(" 13. Smart sync + delete orphans") + print(" 14. Verify sync (quick)") + print(" 15. Verify sync (deep)") print(" 0. Exit") print() while True: try: - choice = input("Enter choice [1-13, 0 to exit]: ").strip() + choice = input("Enter choice [1-15, 0 to exit]: ").strip() if choice == '0': return None - if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13']: + if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15']: return int(choice) - print("Invalid choice. Please enter 1-13 or 0.") + print("Invalid choice. 
def lookup(self, record_type, record_names, verbose=False, batch_size=200):
    """Look up specific records by recordName.

    Args:
        record_type: Record type label. It is only used in the verbose
            progress message; the request itself is keyed by recordName.
        record_names: Iterable of recordName strings. ``None`` or an empty
            iterable yields ``[]`` without any network call. A single
            string is treated as one name rather than iterated per
            character.
        verbose: When true, print a one-line progress message.
        batch_size: Maximum number of names sent per request.

    Returns:
        A list with the entries of each response's ``records`` key on
        success, or a dict ``{'error': message}`` describing the first
        request that failed (non-200 status, timeout, or any other
        request/parse failure).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialize the input so generators/sets work and emptiness is reliable.
    if record_names is None:
        names = []
    elif isinstance(record_names, str):
        names = [record_names] if record_names else []
    else:
        names = list(record_names)
    if not names:
        return []

    path = f"{self.path_base}/records/lookup"

    if verbose:
        print(f" Looking up {len(names)} {record_type} records...")

    found = []
    # NOTE(review): CloudKit Web Services is believed to cap a single request
    # at 200 records; batching keeps large lookups under that cap - confirm
    # the exact limit against Apple's documentation.
    for start in range(0, len(names), batch_size):
        batch = names[start:start + batch_size]
        body = json.dumps({'records': [{'recordName': name} for name in batch]})
        # The signature covers the date and body, so sign each batch separately.
        date = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
        headers = {
            'Content-Type': 'application/json',
            'X-Apple-CloudKit-Request-KeyID': self.key_id,
            'X-Apple-CloudKit-Request-ISO8601Date': date,
            'X-Apple-CloudKit-Request-SignatureV1': self._sign(date, body, path),
        }

        try:
            r = requests.post(f"{HOST}{path}", headers=headers, data=body, timeout=30)
            if r.status_code != 200:
                return {'error': f"{r.status_code}: {r.text[:200]}"}
            found.extend(r.json().get('records', []))
        except requests.exceptions.Timeout:
            return {'error': 'Request timed out after 30s'}
        except Exception as e:
            # Boundary handler: failures are reported as an error dict
            # instead of propagating, matching the success/error return shape.
            return {'error': f"Request failed: {e}"}

    return found
def _load_json_or_empty(path):
    """Return the parsed JSON content of *path*, or [] if the file is absent."""
    if not path.exists():
        return []
    with path.open() as f:
        return json.load(f)


def _load_unique_games(games_dir):
    """Load every *.json file in *games_dir* (sorted) and drop duplicate games.

    Games are keyed by 'canonical_id', falling back to 'id'; the first
    occurrence of each key wins.
    """
    unique_games = []
    if not games_dir.exists():
        return unique_games
    seen_ids = set()
    for games_file in sorted(games_dir.glob('*.json')):
        for g in _load_json_or_empty(games_file):
            game_id = g.get('canonical_id', g.get('id', ''))
            if game_id not in seen_ids:
                seen_ids.add(game_id)
                unique_games.append(g)
    return unique_games


def _build_local_record_map(record_type, data):
    """Build a map of recordName -> expected field values for *record_type*."""
    records = {}
    if record_type == 'Stadium':
        for s in data:
            stadium_id = s.get('canonical_id', s.get('id', ''))
            records[deterministic_uuid(stadium_id)] = {
                'canonicalId': stadium_id,
                'name': s['name'],
                'city': s['city'],
                'sport': s['sport'],
            }
    elif record_type == 'Team':
        for t in data:
            team_id = t.get('canonical_id', '')
            records[deterministic_uuid(team_id)] = {
                'canonicalId': team_id,
                'abbreviation': t['abbreviation'],
                'name': t['name'],
                'city': t['city'],
                'sport': t['sport'],
            }
    elif record_type == 'Game':
        for g in data:
            game_id = g.get('canonical_id', g.get('id', ''))
            records[deterministic_uuid(game_id)] = {
                'canonicalId': game_id,
                'sport': g['sport'],
                'season': g.get('season', ''),
            }
    elif record_type == 'LeagueStructure':
        for ls in data:
            records[ls['id']] = {
                'structureId': ls['id'],
                'sport': ls['sport'],
                'type': ls['type'],
                'name': ls['name'],
            }
    elif record_type == 'TeamAlias':
        for ta in data:
            records[ta['id']] = {
                'aliasId': ta['id'],
                'teamCanonicalId': ta['team_canonical_id'],
                'aliasType': ta['alias_type'],
                'aliasValue': ta['alias_value'],
            }
    elif record_type == 'StadiumAlias':
        for sa in data:
            stadium_id = sa['stadium_canonical_id']
            # The sport is the second '_'-separated token of the stadium id.
            sport = stadium_id.split('_')[1] if '_' in stadium_id else 'unknown'
            alias = sa['alias_name'].lower()
            records[f"{sport}_{alias}"] = {
                'aliasName': alias,
                'stadiumCanonicalId': stadium_id,
            }
    return records


def _record_field_diffs(local_fields, cloud_rec):
    """Return (field, expected, actual) for each field that differs in CloudKit."""
    cloud_fields = cloud_rec.get('fields', {})
    diffs = []
    for field_name, expected_value in local_fields.items():
        cloud_value = cloud_fields.get(field_name, {}).get('value')
        if cloud_value != expected_value:
            diffs.append((field_name, expected_value, cloud_value))
    return diffs


def verify_sync(ck, data_dir, verbose=False, deep=False):
    """
    Verify that CloudKit data matches local canonical data.

    Quick mode: compares counts and spot-checks up to 5 random records per
    type (only when the counts agree).
    Deep mode: full field-by-field comparison of all local records.

    Args:
        ck: Object exposing ``query_all(record_type, verbose=...)``. The
            result is used with ``len()`` and ``.get(record_name)``, so it is
            assumed to be a mapping of recordName -> record dict - confirm
            against ``query_all``.
        data_dir: Directory holding the local JSON files.
        verbose: Forwarded to ``ck.query_all``.
        deep: Select deep instead of quick verification.

    Returns:
        True when no mismatch was found, False otherwise. Record types with
        no local data are skipped and never count as a mismatch.
    """
    import random
    from pathlib import Path
    data_dir = Path(data_dir)

    print("\n" + "="*50)
    print(f"CloudKit Sync Verification {'(DEEP)' if deep else '(Quick)'}")
    print("="*50)

    if deep:
        print("\n⚠️ Deep verification may take several minutes for large datasets\n")

    # Load local data (missing files count as empty); insertion order is the
    # order in which record types are verified.
    local_data = {
        'Stadium': _load_json_or_empty(data_dir / 'stadiums_canonical.json'),
        'Team': _load_json_or_empty(data_dir / 'teams_canonical.json'),
        'Game': _load_unique_games(data_dir / 'canonical' / 'games'),
        'LeagueStructure': _load_json_or_empty(data_dir / 'league_structure.json'),
        'TeamAlias': _load_json_or_empty(data_dir / 'team_aliases.json'),
        'StadiumAlias': _load_json_or_empty(data_dir / 'stadium_aliases.json'),
    }
    local_counts = {record_type: len(data) for record_type, data in local_data.items()}

    print(f"Local data: {local_counts['Stadium']} stadiums, {local_counts['Team']} teams, {local_counts['Game']} games")
    print(f" {local_counts['LeagueStructure']} league structures, {local_counts['TeamAlias']} team aliases, {local_counts['StadiumAlias']} stadium aliases\n")

    results = []
    total_mismatches = 0

    for record_type, data in local_data.items():
        local_count = local_counts[record_type]
        if local_count == 0:
            print(f"{record_type}: No local data, skipping")
            continue

        # Query CloudKit
        print(f"Checking {record_type}...")
        cloud_records = ck.query_all(record_type, verbose=verbose)
        cloud_count = len(cloud_records)

        # Count comparison
        counts_match = cloud_count == local_count
        if counts_match:
            status = "[OK]"
        elif cloud_count < local_count:
            status = f"[MISMATCH: {local_count - cloud_count} missing in CloudKit]"
        else:
            status = f"[MISMATCH: {cloud_count - local_count} extra in CloudKit]"
        if not counts_match:
            total_mismatches += 1

        print(f" {record_type}: CloudKit={cloud_count}, Local={local_count} {status}")

        local_records = _build_local_record_map(record_type, data)
        fields_ok = True

        if deep:
            # Full field-by-field comparison
            field_mismatches = []
            for record_name, local_fields in local_records.items():
                cloud_rec = cloud_records.get(record_name)
                if not cloud_rec:
                    field_mismatches.append(f" {record_name}: Missing in CloudKit")
                    continue
                for field_name, expected, actual in _record_field_diffs(local_fields, cloud_rec):
                    field_mismatches.append(f" {record_name}.{field_name}: expected '{expected}', got '{actual}'")

            if field_mismatches:
                fields_ok = False
                print(f" Field mismatches ({len(field_mismatches)}):")
                for m in field_mismatches[:10]:  # Show first 10
                    print(m)
                if len(field_mismatches) > 10:
                    print(f" ... and {len(field_mismatches) - 10} more")
                total_mismatches += len(field_mismatches)
            else:
                print(f" All fields verified [OK]")

        elif counts_match and local_records:
            # Spot-check up to 5 random records. Size the sample from the map:
            # duplicate local entries can collapse to fewer record names than
            # local_count, and random.sample raises if asked for more.
            sample_size = min(5, len(local_records))
            sample_names = random.sample(list(local_records), sample_size)

            for record_name in sample_names:
                cloud_rec = cloud_records.get(record_name)
                if not cloud_rec:
                    print(f" Spot-check failed: {record_name} missing in CloudKit")
                    fields_ok = False
                    continue
                for field_name, expected, actual in _record_field_diffs(local_records[record_name], cloud_rec):
                    print(f" Spot-check mismatch: {record_name}.{field_name}: expected '{expected}', got '{actual}'")
                    fields_ok = False

            if fields_ok:
                print(f" Spot-check ({sample_size} records): [OK]")
            else:
                total_mismatches += 1

        results.append({
            'type': record_type,
            'local': local_count,
            'cloud': cloud_count,
            # A type only matches when both the counts and the checked fields agree.
            'match': counts_match and fields_ok,
        })

    # Summary
    print("\n" + "="*50)
    print("Verification Summary")
    print("="*50)
    for r in results:
        status = "[OK]" if r['match'] else "[MISMATCH]"
        print(f" {r['type']}: Local={r['local']}, CloudKit={r['cloud']} {status}")

    if total_mismatches == 0:
        print("\n✓ All data verified - CloudKit matches local data")
    else:
        print(f"\n⚠ Found {total_mismatches} mismatch(es)")

    return total_mismatches == 0
new/changed records') p.add_argument('--delete-orphans', action='store_true', help='With --smart-sync, also delete records not in local data') + p.add_argument('--verify', action='store_true', help='Verify CloudKit matches local data (quick: counts + spot-check)') + p.add_argument('--verify-deep', action='store_true', help='Verify CloudKit matches local data (deep: full field comparison)') p.add_argument('--dry-run', action='store_true') p.add_argument('--verbose', '-v', action='store_true') p.add_argument('--interactive', '-i', action='store_true', help='Show interactive menu') @@ -1079,7 +1345,8 @@ def main(): has_action_flag = any([ args.stadiums_only, args.games_only, args.games_files, args.league_structure_only, args.team_aliases_only, args.stadium_aliases_only, args.canonical_only, - args.delete_all, args.delete_only, args.dry_run, args.diff, args.smart_sync + args.delete_all, args.delete_only, args.dry_run, args.diff, args.smart_sync, + args.verify, args.verify_deep ]) # Track selected game files (for option 4 or --games-files) @@ -1126,6 +1393,10 @@ def main(): elif choice == 13: # Smart sync + delete orphans args.smart_sync = True args.delete_orphans = True + elif choice == 14: # Verify sync (quick) + args.verify = True + elif choice == 15: # Verify sync (deep) + args.verify_deep = True print(f"\n{'='*50}") print(f"CloudKit Import {'(DRY RUN)' if args.dry_run else ''}") @@ -1208,6 +1479,18 @@ def main(): show_diff_report(ck, args.data_dir, verbose=args.verbose) return + # Handle verify mode + if args.verify or args.verify_deep: + if not ck: + # Need CloudKit connection for verification + if not HAS_CRYPTO: + sys.exit("Error: pip install cryptography") + if not os.path.exists(args.key_file): + sys.exit(f"Error: Key file not found: {args.key_file}") + ck = CloudKit(args.key_id, open(args.key_file, 'rb').read(), args.container, args.env) + verify_sync(ck, args.data_dir, verbose=args.verbose, deep=args.verify_deep) + return + # Handle smart sync mode 
(differential upload) if args.smart_sync: if not ck: