From 5763db4a61bd9f33ec027bd03a8b08c654cc9412 Mon Sep 17 00:00:00 2001
From: Trey t <treytartt@fastmail.com>
Date: Sat, 10 Jan 2026 10:13:08 -0600
Subject: [PATCH] feat(05-02): add sync verification with --verify flag

- Add --verify flag for quick verification (counts + 5-record spot-check)
- Add --verify-deep flag for full field-by-field comparison
- Add verify_sync() function to compare CloudKit vs local data
- Add lookup() method to CloudKit class for record lookups
- Add menu options 14-15 for verify sync quick/deep
---
 Scripts/cloudkit_import.py | 291 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 287 insertions(+), 4 deletions(-)

diff --git a/Scripts/cloudkit_import.py b/Scripts/cloudkit_import.py
index f99a1e0..78e93e0 100755
--- a/Scripts/cloudkit_import.py
+++ b/Scripts/cloudkit_import.py
@@ -137,17 +137,19 @@ def show_menu():
     print("  11. Dry run (preview only)")
     print("  12. Smart sync (diff-based, only upload changes)")
     print("  13. Smart sync + delete orphans")
+    print("  14. Verify sync (quick)")
+    print("  15. Verify sync (deep)")
     print("  0. Exit")
     print()
 
     while True:
         try:
-            choice = input("Enter choice [1-13, 0 to exit]: ").strip()
+            choice = input("Enter choice [1-15, 0 to exit]: ").strip()
             if choice == '0':
                 return None
-            if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13']:
+            if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15']:
                 return int(choice)
-            print("Invalid choice. Please enter 1-13 or 0.")
+            print("Invalid choice. Please enter 1-15 or 0.")
         except (EOFError, KeyboardInterrupt):
             print("\nExiting.")
             return None
@@ -293,6 +295,36 @@ class CloudKit:
 
         return all_records
 
+    def lookup(self, record_type, record_names, verbose=False):
+        """Fetch records by recordName with one signed POST to records/lookup.
+
+        Returns the response's 'records' list ([] for empty record_names), or
+        {'error': message} on a non-200 status, timeout or other failure.
+        """
+        if not record_names:
+            return []
+
+        endpoint = f"{self.path_base}/records/lookup"
+        payload = json.dumps({'records': [{'recordName': n} for n in record_names]})
+        timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
+        signed_headers = {
+            'Content-Type': 'application/json',
+            'X-Apple-CloudKit-Request-KeyID': self.key_id,
+            'X-Apple-CloudKit-Request-ISO8601Date': timestamp,
+            'X-Apple-CloudKit-Request-SignatureV1': self._sign(timestamp, payload, endpoint),
+        }
+        if verbose:
+            print(f"    Looking up {len(record_names)} {record_type} records...")
+        try:
+            resp = requests.post(f"{HOST}{endpoint}", headers=signed_headers, data=payload, timeout=30)
+            if resp.status_code != 200:
+                return {'error': f"{resp.status_code}: {resp.text[:200]}"}
+            return resp.json().get('records', [])
+        except requests.exceptions.Timeout:
+            return {'error': 'Request timed out after 30s'}
+        except Exception as exc:
+            return {'error': f"Request failed: {exc}"}
+
     def delete_all(self, record_type, verbose=False):
         """Delete all records of a given type."""
         total_deleted = 0
@@ -1051,6 +1083,238 @@ def run_smart_sync(ck, data_dir, dry_run=False, verbose=False, delete_orphans=Fa
     return total_stats
 
 
+def verify_sync(ck, data_dir, verbose=False, deep=False):
+    """
+    Verify that CloudKit data matches local canonical data.
+    Quick mode: compares counts; if equal, spot-checks up to 5 records per type.
+    Deep mode: compares the tracked fields of every local record.
+
+    ck must expose query_all(record_type, verbose=...); its result is used as
+    a mapping of recordName -> record (len() and .get() are called on it).
+    Returns True when no count or field mismatch was found, else False.
+    """
+    import random
+    from pathlib import Path
+    data_dir = Path(data_dir)
+
+    def load_json(path):
+        """Return the parsed JSON at path, or [] when the file is absent."""
+        if not path.exists():
+            return []
+        with open(path) as f:  # context manager closes the handle promptly
+            return json.load(f)
+
+    def diff_fields(record_name, local_fields, cloud_rec):
+        """Describe each tracked field whose CloudKit value differs from local."""
+        cloud_fields = cloud_rec.get('fields', {})
+        diffs = []
+        for field_name, expected_value in local_fields.items():
+            cloud_value = cloud_fields.get(field_name, {}).get('value')
+            if cloud_value != expected_value:
+                diffs.append(f"{record_name}.{field_name}: expected '{expected_value}', got '{cloud_value}'")
+        return diffs
+
+    print("\n" + "="*50)
+    print(f"CloudKit Sync Verification {'(DEEP)' if deep else '(Quick)'}")
+    print("="*50)
+
+    if deep:
+        print("\n⚠️  Deep verification may take several minutes for large datasets\n")
+
+    # Load local data
+    stadiums = load_json(data_dir / 'stadiums_canonical.json')
+    teams = load_json(data_dir / 'teams_canonical.json')
+
+    # Load games from canonical/games/*.json
+    canonical_games_dir = data_dir / 'canonical' / 'games'
+    games = []
+    if canonical_games_dir.exists():
+        for games_file in sorted(canonical_games_dir.glob('*.json')):
+            games.extend(load_json(games_file))
+
+    league_structure = load_json(data_dir / 'league_structure.json')
+    team_aliases = load_json(data_dir / 'team_aliases.json')
+    stadium_aliases = load_json(data_dir / 'stadium_aliases.json')
+
+    # Deduplicate games by canonical_id
+    seen_ids = set()
+    unique_games = []
+    for g in games:
+        game_id = g.get('canonical_id', g.get('id', ''))
+        if game_id not in seen_ids:
+            seen_ids.add(game_id)
+            unique_games.append(g)
+    games = unique_games
+
+    data_map = {
+        'Stadium': stadiums,
+        'Team': teams,
+        'Game': games,
+        'LeagueStructure': league_structure,
+        'TeamAlias': team_aliases,
+        'StadiumAlias': stadium_aliases,
+    }
+    local_counts = {record_type: len(data) for record_type, data in data_map.items()}
+
+    print(f"Local data: {local_counts['Stadium']} stadiums, {local_counts['Team']} teams, {local_counts['Game']} games")
+    print(f"            {local_counts['LeagueStructure']} league structures, {local_counts['TeamAlias']} team aliases, {local_counts['StadiumAlias']} stadium aliases\n")
+
+    def build_local_record_map(record_type, data):
+        """Build a map of recordName -> fields for comparison."""
+        records = {}
+        if record_type == 'Stadium':
+            for s in data:
+                stadium_id = s.get('canonical_id', s.get('id', ''))
+                record_name = deterministic_uuid(stadium_id)
+                records[record_name] = {
+                    'canonicalId': stadium_id,
+                    'name': s['name'],
+                    'city': s['city'],
+                    'sport': s['sport'],
+                }
+        elif record_type == 'Team':
+            for t in data:
+                team_id = t.get('canonical_id', '')
+                record_name = deterministic_uuid(team_id)
+                records[record_name] = {
+                    'canonicalId': team_id,
+                    'abbreviation': t['abbreviation'],
+                    'name': t['name'],
+                    'city': t['city'],
+                    'sport': t['sport'],
+                }
+        elif record_type == 'Game':
+            for g in data:
+                game_id = g.get('canonical_id', g.get('id', ''))
+                record_name = deterministic_uuid(game_id)
+                records[record_name] = {
+                    'canonicalId': game_id,
+                    'sport': g['sport'],
+                    'season': g.get('season', ''),
+                }
+        elif record_type == 'LeagueStructure':
+            for ls in data:
+                record_name = ls['id']
+                records[record_name] = {
+                    'structureId': ls['id'],
+                    'sport': ls['sport'],
+                    'type': ls['type'],
+                    'name': ls['name'],
+                }
+        elif record_type == 'TeamAlias':
+            for ta in data:
+                record_name = ta['id']
+                records[record_name] = {
+                    'aliasId': ta['id'],
+                    'teamCanonicalId': ta['team_canonical_id'],
+                    'aliasType': ta['alias_type'],
+                    'aliasValue': ta['alias_value'],
+                }
+        elif record_type == 'StadiumAlias':
+            for sa in data:
+                stadium_id = sa['stadium_canonical_id']
+                sport = stadium_id.split('_')[1] if '_' in stadium_id else 'unknown'
+                record_name = f"{sport}_{sa['alias_name'].lower()}"
+                records[record_name] = {
+                    'aliasName': sa['alias_name'].lower(),
+                    'stadiumCanonicalId': sa['stadium_canonical_id'],
+                }
+        return records
+
+    results = []
+    total_mismatches = 0
+
+    for record_type, local_data in data_map.items():
+        local_count = local_counts[record_type]
+        if local_count == 0:
+            print(f"{record_type}: No local data, skipping")
+            continue
+
+        # Query CloudKit count
+        print(f"Checking {record_type}...")
+        cloud_records = ck.query_all(record_type, verbose=verbose)
+        cloud_count = len(cloud_records)
+
+        # Count comparison
+        counts_match = cloud_count == local_count
+        if counts_match:
+            status = "[OK]"
+        elif cloud_count < local_count:
+            status = f"[MISMATCH: {local_count - cloud_count} missing in CloudKit]"
+        else:
+            status = f"[MISMATCH: {cloud_count - local_count} extra in CloudKit]"
+        total_mismatches += 0 if counts_match else 1
+        print(f"  {record_type}: CloudKit={cloud_count}, Local={local_count} {status}")
+
+        # Spot-check or deep verification
+        local_records = build_local_record_map(record_type, local_data)
+        fields_ok = True
+
+        if deep:
+            # Full field-by-field comparison
+            field_mismatches = []
+            for record_name, local_fields in local_records.items():
+                cloud_rec = cloud_records.get(record_name)
+                if not cloud_rec:
+                    field_mismatches.append(f"{record_name}: Missing in CloudKit")
+                else:
+                    field_mismatches.extend(diff_fields(record_name, local_fields, cloud_rec))
+
+            if field_mismatches:
+                fields_ok = False
+                print(f"  Field mismatches ({len(field_mismatches)}):")
+                for m in field_mismatches[:10]:  # Show first 10
+                    print(f"    {m}")
+                if len(field_mismatches) > 10:
+                    print(f"    ... and {len(field_mismatches) - 10} more")
+                total_mismatches += len(field_mismatches)
+            else:
+                print(f"  All fields verified [OK]")
+
+        elif counts_match:
+            # Spot-check up to 5 random records. The bound is the local map, not
+            # cloud_count: duplicate record names collapse in the map and
+            # random.sample() raises ValueError if asked for more than it holds.
+            sample_size = min(5, len(local_records))
+            sample_names = random.sample(list(local_records), sample_size)
+            for record_name in sample_names:
+                cloud_rec = cloud_records.get(record_name)
+                if not cloud_rec:
+                    print(f"    Spot-check failed: {record_name} missing in CloudKit")
+                    fields_ok = False
+                    continue
+                for diff in diff_fields(record_name, local_records[record_name], cloud_rec):
+                    print(f"    Spot-check mismatch: {diff}")
+                    fields_ok = False
+            if fields_ok:
+                print(f"  Spot-check ({sample_size} records): [OK]")
+            else:
+                total_mismatches += 1
+
+        results.append({
+            'type': record_type,
+            'local': local_count,
+            'cloud': cloud_count,
+            # Field failures count too, so the summary never reports [OK] for them
+            'match': counts_match and fields_ok,
+        })
+
+    # Summary
+    print("\n" + "="*50)
+    print("Verification Summary")
+    print("="*50)
+    for r in results:
+        status = "[OK]" if r['match'] else "[MISMATCH]"
+        print(f"  {r['type']}: Local={r['local']}, CloudKit={r['cloud']} {status}")
+
+    if total_mismatches == 0:
+        print("\n✓ All data verified - CloudKit matches local data")
+    else:
+        print(f"\n⚠ Found {total_mismatches} mismatch(es)")
+
+    return total_mismatches == 0
+
+
 def main():
     p = argparse.ArgumentParser(description='Import JSON to CloudKit')
     p.add_argument('--key-id', default=DEFAULT_KEY_ID)
@@ -1070,6 +1334,8 @@ def main():
     p.add_argument('--diff', action='store_true', help='Show diff between local and CloudKit without importing')
     p.add_argument('--smart-sync', action='store_true', help='Differential sync: only upload new/changed records')
     p.add_argument('--delete-orphans', action='store_true', help='With --smart-sync, also delete records not in local data')
+    p.add_argument('--verify', action='store_true', help='Verify CloudKit matches local data (quick: counts + spot-check)')
+    p.add_argument('--verify-deep', action='store_true', help='Verify CloudKit matches local data (deep: full field comparison)')
     p.add_argument('--dry-run', action='store_true')
     p.add_argument('--verbose', '-v', action='store_true')
     p.add_argument('--interactive', '-i', action='store_true', help='Show interactive menu')
@@ -1079,7 +1345,8 @@ def main():
     has_action_flag = any([
         args.stadiums_only, args.games_only, args.games_files, args.league_structure_only,
         args.team_aliases_only, args.stadium_aliases_only, args.canonical_only,
-        args.delete_all, args.delete_only, args.dry_run, args.diff, args.smart_sync
+        args.delete_all, args.delete_only, args.dry_run, args.diff, args.smart_sync,
+        args.verify, args.verify_deep
     ])
 
     # Track selected game files (for option 4 or --games-files)
@@ -1126,6 +1393,10 @@ def main():
         elif choice == 13:  # Smart sync + delete orphans
             args.smart_sync = True
             args.delete_orphans = True
+        elif choice == 14:  # Verify sync (quick)
+            args.verify = True
+        elif choice == 15:  # Verify sync (deep)
+            args.verify_deep = True
 
     print(f"\n{'='*50}")
     print(f"CloudKit Import {'(DRY RUN)' if args.dry_run else ''}")
@@ -1208,6 +1479,18 @@ def main():
         show_diff_report(ck, args.data_dir, verbose=args.verbose)
         return
 
+    # Handle verify mode
+    if args.verify or args.verify_deep:
+        if not ck:
+            # Need CloudKit connection for verification
+            if not HAS_CRYPTO:
+                sys.exit("Error: pip install cryptography")
+            if not os.path.exists(args.key_file):
+                sys.exit(f"Error: Key file not found: {args.key_file}")
+            ck = CloudKit(args.key_id, open(args.key_file, 'rb').read(), args.container, args.env)
+        verify_sync(ck, args.data_dir, verbose=args.verbose, deep=args.verify_deep)
+        return
+
     # Handle smart sync mode (differential upload)
     if args.smart_sync:
         if not ck: