feat(05-02): add sync verification with --verify flag

- Add --verify flag for quick verification (counts + 5-record spot-check)
- Add --verify-deep flag for full field-by-field comparison
- Add verify_sync() function to compare CloudKit vs local data
- Add lookup() method to CloudKit class for record lookups
- Add menu options 14-15 for verify sync quick/deep
This commit is contained in:
Trey t
2026-01-10 10:13:08 -06:00
parent b42a57fba2
commit 5763db4a61

View File

@@ -137,17 +137,19 @@ def show_menu():
print(" 11. Dry run (preview only)")
print(" 12. Smart sync (diff-based, only upload changes)")
print(" 13. Smart sync + delete orphans")
print(" 14. Verify sync (quick)")
print(" 15. Verify sync (deep)")
print(" 0. Exit")
print()
while True:
try:
choice = input("Enter choice [1-13, 0 to exit]: ").strip()
choice = input("Enter choice [1-15, 0 to exit]: ").strip()
if choice == '0':
return None
if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13']:
if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15']:
return int(choice)
print("Invalid choice. Please enter 1-13 or 0.")
print("Invalid choice. Please enter 1-15 or 0.")
except (EOFError, KeyboardInterrupt):
print("\nExiting.")
return None
@@ -293,6 +295,36 @@ class CloudKit:
return all_records
def lookup(self, record_type, record_names, verbose=False):
    """Lookup specific records by recordName.

    Posts to the CloudKit ``records/lookup`` endpoint with a signed request.

    Returns:
        A list of record dicts on success (empty list when *record_names*
        is empty), or a dict with a single ``'error'`` key describing the
        failure (non-200 status, timeout, or any other request error).
    """
    if not record_names:
        return []
    endpoint = f"{self.path_base}/records/lookup"
    payload = json.dumps({'records': [{'recordName': n} for n in record_names]})
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    # CloudKit server-to-server auth: the signature covers date + body + path.
    request_headers = {
        'Content-Type': 'application/json',
        'X-Apple-CloudKit-Request-KeyID': self.key_id,
        'X-Apple-CloudKit-Request-ISO8601Date': timestamp,
        'X-Apple-CloudKit-Request-SignatureV1': self._sign(timestamp, payload, endpoint),
    }
    if verbose:
        print(f" Looking up {len(record_names)} {record_type} records...")
    try:
        resp = requests.post(f"{HOST}{endpoint}", headers=request_headers, data=payload, timeout=30)
        if resp.status_code != 200:
            return {'error': f"{resp.status_code}: {resp.text[:200]}"}
        return resp.json().get('records', [])
    except requests.exceptions.Timeout:
        return {'error': 'Request timed out after 30s'}
    except Exception as e:
        return {'error': f"Request failed: {e}"}
def delete_all(self, record_type, verbose=False):
"""Delete all records of a given type."""
total_deleted = 0
@@ -1051,6 +1083,238 @@ def run_smart_sync(ck, data_dir, dry_run=False, verbose=False, delete_orphans=Fa
return total_stats
def verify_sync(ck, data_dir, verbose=False, deep=False):
    """
    Verify that CloudKit data matches local canonical data.

    Quick mode: compares counts and spot-checks 5 random records per type.
    Deep mode: full field-by-field comparison of all records.

    Args:
        ck: CloudKit client; ``ck.query_all(record_type, verbose=...)`` must
            return a mapping of recordName -> record dict, where each record
            carries a ``'fields'`` mapping of fieldName -> ``{'value': ...}``
            (assumed from usage below — confirm against query_all).
        data_dir: Directory holding the local canonical JSON files.
        verbose: Forwarded to CloudKit queries for extra output.
        deep: When True, compare every field of every record instead of a
            5-record spot-check per type.

    Returns:
        True when no mismatches were found, False otherwise.

    Fixes over the previous revision:
        - JSON files are now opened with context managers instead of
          ``json.load(open(...))``, which leaked file descriptors.
        - The quick-mode sample size is capped by the local record map size,
          so recordName collisions can no longer crash ``random.sample``.
    """
    import random
    from pathlib import Path
    data_dir = Path(data_dir)

    def _load_json(path, default):
        # Load a JSON file with a context manager so the handle is closed
        # promptly; return *default* when the file does not exist.
        if not path.exists():
            return default
        with open(path) as f:
            return json.load(f)

    print("\n" + "="*50)
    print(f"CloudKit Sync Verification {'(DEEP)' if deep else '(Quick)'}")
    print("="*50)
    if deep:
        print("\n⚠️ Deep verification may take several minutes for large datasets\n")
    # Load local data
    stadiums = _load_json(data_dir / 'stadiums_canonical.json', [])
    teams = _load_json(data_dir / 'teams_canonical.json', [])
    # Load games from canonical/games/*.json
    canonical_games_dir = data_dir / 'canonical' / 'games'
    games = []
    if canonical_games_dir.exists():
        for games_file in sorted(canonical_games_dir.glob('*.json')):
            with open(games_file) as f:
                games.extend(json.load(f))
    league_structure = _load_json(data_dir / 'league_structure.json', [])
    team_aliases = _load_json(data_dir / 'team_aliases.json', [])
    stadium_aliases = _load_json(data_dir / 'stadium_aliases.json', [])
    # Deduplicate games by canonical_id (per-season files may overlap)
    seen_ids = set()
    unique_games = []
    for g in games:
        game_id = g.get('canonical_id', g.get('id', ''))
        if game_id not in seen_ids:
            seen_ids.add(game_id)
            unique_games.append(g)
    games = unique_games
    local_counts = {
        'Stadium': len(stadiums),
        'Team': len(teams),
        'Game': len(games),
        'LeagueStructure': len(league_structure),
        'TeamAlias': len(team_aliases),
        'StadiumAlias': len(stadium_aliases),
    }
    print(f"Local data: {local_counts['Stadium']} stadiums, {local_counts['Team']} teams, {local_counts['Game']} games")
    print(f" {local_counts['LeagueStructure']} league structures, {local_counts['TeamAlias']} team aliases, {local_counts['StadiumAlias']} stadium aliases\n")

    # Build local record maps for spot-check comparison
    def build_local_record_map(record_type, data):
        """Build a map of recordName -> fields for comparison.

        The recordName derivation mirrors the upload path: deterministic
        UUIDs for Stadium/Team/Game, raw ids for LeagueStructure/TeamAlias,
        and '<sport>_<alias lowercased>' for StadiumAlias.
        """
        records = {}
        if record_type == 'Stadium':
            for s in data:
                stadium_id = s.get('canonical_id', s.get('id', ''))
                record_name = deterministic_uuid(stadium_id)
                records[record_name] = {
                    'canonicalId': stadium_id,
                    'name': s['name'],
                    'city': s['city'],
                    'sport': s['sport'],
                }
        elif record_type == 'Team':
            for t in data:
                team_id = t.get('canonical_id', '')
                record_name = deterministic_uuid(team_id)
                records[record_name] = {
                    'canonicalId': team_id,
                    'abbreviation': t['abbreviation'],
                    'name': t['name'],
                    'city': t['city'],
                    'sport': t['sport'],
                }
        elif record_type == 'Game':
            for g in data:
                game_id = g.get('canonical_id', g.get('id', ''))
                record_name = deterministic_uuid(game_id)
                records[record_name] = {
                    'canonicalId': game_id,
                    'sport': g['sport'],
                    'season': g.get('season', ''),
                }
        elif record_type == 'LeagueStructure':
            for ls in data:
                record_name = ls['id']
                records[record_name] = {
                    'structureId': ls['id'],
                    'sport': ls['sport'],
                    'type': ls['type'],
                    'name': ls['name'],
                }
        elif record_type == 'TeamAlias':
            for ta in data:
                record_name = ta['id']
                records[record_name] = {
                    'aliasId': ta['id'],
                    'teamCanonicalId': ta['team_canonical_id'],
                    'aliasType': ta['alias_type'],
                    'aliasValue': ta['alias_value'],
                }
        elif record_type == 'StadiumAlias':
            for sa in data:
                stadium_id = sa['stadium_canonical_id']
                # canonical ids look like '<prefix>_<sport>_...' — take the sport segment
                sport = stadium_id.split('_')[1] if '_' in stadium_id else 'unknown'
                record_name = f"{sport}_{sa['alias_name'].lower()}"
                records[record_name] = {
                    'aliasName': sa['alias_name'].lower(),
                    'stadiumCanonicalId': sa['stadium_canonical_id'],
                }
        return records

    data_map = {
        'Stadium': stadiums,
        'Team': teams,
        'Game': games,
        'LeagueStructure': league_structure,
        'TeamAlias': team_aliases,
        'StadiumAlias': stadium_aliases,
    }
    results = []
    total_mismatches = 0
    for record_type in ['Stadium', 'Team', 'Game', 'LeagueStructure', 'TeamAlias', 'StadiumAlias']:
        local_count = local_counts[record_type]
        if local_count == 0:
            print(f"{record_type}: No local data, skipping")
            continue
        # Query CloudKit count
        print(f"Checking {record_type}...")
        cloud_records = ck.query_all(record_type, verbose=verbose)
        cloud_count = len(cloud_records)
        # Count comparison
        if cloud_count == local_count:
            status = "[OK]"
        elif cloud_count < local_count:
            status = f"[MISMATCH: {local_count - cloud_count} missing in CloudKit]"
            total_mismatches += 1
        else:
            status = f"[MISMATCH: {cloud_count - local_count} extra in CloudKit]"
            total_mismatches += 1
        print(f" {record_type}: CloudKit={cloud_count}, Local={local_count} {status}")
        # Spot-check or deep verification
        local_records = build_local_record_map(record_type, data_map[record_type])
        if deep:
            # Full field-by-field comparison
            field_mismatches = []
            for record_name, local_fields in local_records.items():
                cloud_rec = cloud_records.get(record_name)
                if not cloud_rec:
                    field_mismatches.append(f" {record_name}: Missing in CloudKit")
                    continue
                cloud_fields = cloud_rec.get('fields', {})
                for field_name, expected_value in local_fields.items():
                    cloud_field = cloud_fields.get(field_name, {})
                    cloud_value = cloud_field.get('value')
                    if cloud_value != expected_value:
                        field_mismatches.append(f" {record_name}.{field_name}: expected '{expected_value}', got '{cloud_value}'")
            if field_mismatches:
                print(f" Field mismatches ({len(field_mismatches)}):")
                for m in field_mismatches[:10]:  # Show first 10
                    print(m)
                if len(field_mismatches) > 10:
                    print(f" ... and {len(field_mismatches) - 10} more")
                total_mismatches += len(field_mismatches)
            else:
                print(f" All fields verified [OK]")
        elif cloud_count == local_count and cloud_count > 0:
            # Spot-check up to 5 random records. Cap by the local map size:
            # recordName collisions can make it smaller than local_count, and
            # random.sample raises ValueError if asked for more than exists.
            sample_size = min(5, cloud_count, len(local_records))
            sample_names = random.sample(list(local_records.keys()), sample_size)
            spot_check_ok = True
            for record_name in sample_names:
                local_fields = local_records[record_name]
                cloud_rec = cloud_records.get(record_name)
                if not cloud_rec:
                    print(f" Spot-check failed: {record_name} missing in CloudKit")
                    spot_check_ok = False
                    continue
                cloud_fields = cloud_rec.get('fields', {})
                for field_name, expected_value in local_fields.items():
                    cloud_field = cloud_fields.get(field_name, {})
                    cloud_value = cloud_field.get('value')
                    if cloud_value != expected_value:
                        print(f" Spot-check mismatch: {record_name}.{field_name}: expected '{expected_value}', got '{cloud_value}'")
                        spot_check_ok = False
            if spot_check_ok:
                print(f" Spot-check ({sample_size} records): [OK]")
            else:
                total_mismatches += 1
        results.append({
            'type': record_type,
            'local': local_count,
            'cloud': cloud_count,
            'match': cloud_count == local_count,
        })
    # Summary
    print("\n" + "="*50)
    print("Verification Summary")
    print("="*50)
    for r in results:
        status = "[OK]" if r['match'] else "[MISMATCH]"
        print(f" {r['type']}: Local={r['local']}, CloudKit={r['cloud']} {status}")
    if total_mismatches == 0:
        print("\n✓ All data verified - CloudKit matches local data")
    else:
        print(f"\n⚠ Found {total_mismatches} mismatch(es)")
    return total_mismatches == 0
def main():
p = argparse.ArgumentParser(description='Import JSON to CloudKit')
p.add_argument('--key-id', default=DEFAULT_KEY_ID)
@@ -1070,6 +1334,8 @@ def main():
p.add_argument('--diff', action='store_true', help='Show diff between local and CloudKit without importing')
p.add_argument('--smart-sync', action='store_true', help='Differential sync: only upload new/changed records')
p.add_argument('--delete-orphans', action='store_true', help='With --smart-sync, also delete records not in local data')
p.add_argument('--verify', action='store_true', help='Verify CloudKit matches local data (quick: counts + spot-check)')
p.add_argument('--verify-deep', action='store_true', help='Verify CloudKit matches local data (deep: full field comparison)')
p.add_argument('--dry-run', action='store_true')
p.add_argument('--verbose', '-v', action='store_true')
p.add_argument('--interactive', '-i', action='store_true', help='Show interactive menu')
@@ -1079,7 +1345,8 @@ def main():
has_action_flag = any([
args.stadiums_only, args.games_only, args.games_files, args.league_structure_only,
args.team_aliases_only, args.stadium_aliases_only, args.canonical_only,
args.delete_all, args.delete_only, args.dry_run, args.diff, args.smart_sync
args.delete_all, args.delete_only, args.dry_run, args.diff, args.smart_sync,
args.verify, args.verify_deep
])
# Track selected game files (for option 4 or --games-files)
@@ -1126,6 +1393,10 @@ def main():
elif choice == 13: # Smart sync + delete orphans
args.smart_sync = True
args.delete_orphans = True
elif choice == 14: # Verify sync (quick)
args.verify = True
elif choice == 15: # Verify sync (deep)
args.verify_deep = True
print(f"\n{'='*50}")
print(f"CloudKit Import {'(DRY RUN)' if args.dry_run else ''}")
@@ -1208,6 +1479,18 @@ def main():
show_diff_report(ck, args.data_dir, verbose=args.verbose)
return
# Handle verify mode
if args.verify or args.verify_deep:
if not ck:
# Need CloudKit connection for verification
if not HAS_CRYPTO:
sys.exit("Error: pip install cryptography")
if not os.path.exists(args.key_file):
sys.exit(f"Error: Key file not found: {args.key_file}")
ck = CloudKit(args.key_id, open(args.key_file, 'rb').read(), args.container, args.env)
verify_sync(ck, args.data_dir, verbose=args.verbose, deep=args.verify_deep)
return
# Handle smart sync mode (differential upload)
if args.smart_sync:
if not ck: