def query_all(self, record_type, verbose=False):
    """Query all records of a given type, following CloudKit pagination.

    Issues signed POSTs to the records/query endpoint, requesting up to 200
    records per page and following ``continuationMarker`` until exhausted.

    Args:
        record_type: CloudKit record type name (e.g. 'Stadium').
        verbose: print per-page progress when True.

    Returns:
        dict mapping recordName -> record dict. On any HTTP error, timeout,
        or exception the partial result collected so far is returned
        (best-effort, matching delete_all's error style).
    """
    all_records = {}
    continuation_marker = None
    # Explicit page counter: deriving the page from len(all_records) // 200
    # reports the wrong page whenever the server returns a short page.
    page_num = 0

    while True:
        page_num += 1
        path = f"{self.path_base}/records/query"
        query_body = {
            'query': {'recordType': record_type},
            'resultsLimit': 200,
        }
        if continuation_marker:
            query_body['continuationMarker'] = continuation_marker

        body = json.dumps(query_body)
        date = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
        headers = {
            'Content-Type': 'application/json',
            'X-Apple-CloudKit-Request-KeyID': self.key_id,
            'X-Apple-CloudKit-Request-ISO8601Date': date,
            'X-Apple-CloudKit-Request-SignatureV1': self._sign(date, body, path),
        }

        if verbose:
            print(f"  Querying {record_type} (page {page_num})...")

        try:
            r = requests.post(f"{HOST}{path}", headers=headers, data=body, timeout=60)
            if r.status_code != 200:
                print(f"  Query error: {r.status_code}: {r.text[:200]}")
                break

            result = r.json()
            records = result.get('records', [])

            # Key by recordName; a duplicate name silently overwrites,
            # which is fine since CloudKit record names are unique per zone.
            for rec in records:
                record_name = rec.get('recordName', '')
                all_records[record_name] = rec

            if verbose:
                print(f"  Found {len(records)} records (total: {len(all_records)})")

            # An absent/empty continuationMarker means this was the last page.
            continuation_marker = result.get('continuationMarker')
            if not continuation_marker:
                break

            time.sleep(0.3)  # Rate limiting between pages

        except requests.exceptions.Timeout:
            print("  Query timeout after 60s")
            break
        except Exception as e:
            # Broad on purpose: a diff report should degrade to partial
            # results rather than abort the whole run.
            print(f"  Query failed: {e}")
            break

    return all_records
def fields_equal(local_value, cloud_value, field_type=None):
    """Compare a local field value with a cloud field value.

    Special cases:
      - Location dicts (have 'latitude'/'longitude'): equal within a
        0.0001-degree tolerance (~11 m) so float round-trips through
        CloudKit don't show up as spurious diffs.
      - Reference dicts (have 'recordName'): compared by recordName only;
        the 'action' key is deliberately ignored.
      - Lists: compared as unordered collections.
    Everything else (str, int, bool, ...) falls back to ``==``.

    Args:
        local_value: value from the locally built record field.
        cloud_value: value from the CloudKit record field (may be None).
        field_type: accepted for interface compatibility; currently unused.

    Returns:
        True when the two values are considered equivalent.
    """
    # Location fields: tolerance-based comparison.
    if isinstance(local_value, dict) and 'latitude' in local_value:
        if not isinstance(cloud_value, dict) or 'latitude' not in cloud_value:
            return False
        lat_diff = abs(float(local_value['latitude']) - float(cloud_value['latitude']))
        lng_diff = abs(float(local_value['longitude']) - float(cloud_value['longitude']))
        return lat_diff < 0.0001 and lng_diff < 0.0001

    # Reference fields: only the referenced recordName matters.
    if isinstance(local_value, dict) and 'recordName' in local_value:
        if not isinstance(cloud_value, dict) or 'recordName' not in cloud_value:
            return False
        return local_value['recordName'] == cloud_value['recordName']

    # Lists: order-insensitive comparison. sorted() raises TypeError on
    # unorderable elements (e.g. a list of dicts), which previously crashed
    # the diff — fall back to a direct, order-sensitive comparison then.
    if isinstance(local_value, list):
        if not isinstance(cloud_value, list):
            return False
        try:
            return sorted(local_value) == sorted(cloud_value)
        except TypeError:
            return local_value == cloud_value

    # Direct comparison for scalars.
    return local_value == cloud_value
+ """ + diff = { + 'new': [], + 'updated': [], + 'unchanged': [], + 'deleted': [] + } + + local_names = set(local_records.keys()) + cloud_names = set(cloud_records.keys()) + + # New records: in local but not in cloud + for name in local_names - cloud_names: + diff['new'].append({'record': local_records[name]}) + + # Deleted records: in cloud but not in local + for name in cloud_names - local_names: + diff['deleted'].append({ + 'record': cloud_records[name], + 'recordChangeTag': cloud_records[name].get('recordChangeTag', '') + }) + + # Check existing records for changes + for name in local_names & cloud_names: + local_rec = local_records[name] + cloud_rec = cloud_records[name] + local_fields = local_rec.get('fields', {}) + cloud_fields = cloud_rec.get('fields', {}) + + # Compare all fields (except metadata like recordChangeTag) + has_diff = False + diff_fields = [] + + for field_name, field_data in local_fields.items(): + local_value = field_data.get('value') + cloud_field = cloud_fields.get(field_name, {}) + cloud_value = cloud_field.get('value') + + if not fields_equal(local_value, cloud_value): + has_diff = True + diff_fields.append(field_name) + + if has_diff: + diff['updated'].append({ + 'record': local_rec, + 'recordChangeTag': cloud_rec.get('recordChangeTag', ''), + 'changed_fields': diff_fields + }) + if verbose: + print(f" Changed: {name} - fields: {', '.join(diff_fields)}") + else: + diff['unchanged'].append({'record': local_rec}) + + return diff + + +def show_diff_report(ck, data_dir, verbose=False): + """Query CloudKit and show diff report for all record types.""" + from pathlib import Path + data_dir = Path(data_dir) + + print("\n" + "="*50) + print("CloudKit Diff Report") + print("="*50 + "\n") + + # Load local data + stadiums = json.load(open(data_dir / 'stadiums_canonical.json')) if (data_dir / 'stadiums_canonical.json').exists() else [] + teams = json.load(open(data_dir / 'teams_canonical.json')) if (data_dir / 'teams_canonical.json').exists() else 
[] + + # Load games from canonical/games/*.json + canonical_games_dir = data_dir / 'canonical' / 'games' + games = [] + if canonical_games_dir.exists(): + for games_file in sorted(canonical_games_dir.glob('*.json')): + with open(games_file) as f: + games.extend(json.load(f)) + + league_structure = json.load(open(data_dir / 'league_structure.json')) if (data_dir / 'league_structure.json').exists() else [] + team_aliases = json.load(open(data_dir / 'team_aliases.json')) if (data_dir / 'team_aliases.json').exists() else [] + stadium_aliases = json.load(open(data_dir / 'stadium_aliases.json')) if (data_dir / 'stadium_aliases.json').exists() else [] + + print(f"Local data: {len(stadiums)} stadiums, {len(teams)} teams, {len(games)} games") + print(f" {len(league_structure)} league structures, {len(team_aliases)} team aliases, {len(stadium_aliases)} stadium aliases\n") + + # Build local record maps (same format as CloudKit records) + def build_stadium_records(stadiums): + records = {} + for s in stadiums: + stadium_id = s.get('canonical_id', s.get('id', '')) + record_name = deterministic_uuid(stadium_id) + team_abbrevs = s.get('primary_team_abbrevs', s.get('team_abbrevs', [])) + fields = { + 'stadiumId': {'value': record_name}, + 'canonicalId': {'value': stadium_id}, + 'name': {'value': s['name']}, + 'city': {'value': s['city']}, + 'state': {'value': s.get('state', '')}, + 'sport': {'value': s['sport']}, + 'source': {'value': s.get('source', 'canonical')}, + 'teamAbbrevs': {'value': team_abbrevs}, + } + if s.get('latitude'): + fields['location'] = {'value': {'latitude': s['latitude'], 'longitude': s['longitude']}} + if s.get('capacity'): + fields['capacity'] = {'value': s['capacity']} + records[record_name] = {'recordType': 'Stadium', 'recordName': record_name, 'fields': fields} + return records + + def build_team_records(teams): + records = {} + for t in teams: + team_id = t.get('canonical_id', '') + record_name = deterministic_uuid(team_id) + fields = { + 'teamId': 
{'value': record_name}, + 'canonicalId': {'value': team_id}, + 'abbreviation': {'value': t['abbreviation']}, + 'name': {'value': t['name']}, + 'city': {'value': t['city']}, + 'sport': {'value': t['sport']}, + 'stadiumCanonicalId': {'value': t.get('stadium_canonical_id', '')}, + } + if t.get('conference_id'): + fields['conferenceId'] = {'value': t['conference_id']} + if t.get('division_id'): + fields['divisionId'] = {'value': t['division_id']} + records[record_name] = {'recordType': 'Team', 'recordName': record_name, 'fields': fields} + return records + + def build_game_records(games, stadiums): + records = {} + stadium_id_map = {s.get('canonical_id', s.get('id', '')): deterministic_uuid(s.get('canonical_id', s.get('id', ''))) for s in stadiums} + seen_ids = set() + for g in games: + game_id = g.get('canonical_id', g.get('id', '')) + if game_id in seen_ids: + continue + seen_ids.add(game_id) + game_uuid = deterministic_uuid(game_id) + sport = g['sport'] + fields = { + 'gameId': {'value': game_uuid}, + 'canonicalId': {'value': game_id}, + 'sport': {'value': sport}, + 'season': {'value': g.get('season', '')}, + 'source': {'value': g.get('source', 'canonical')}, + } + # Parse date/time + if g.get('date'): + try: + time_str = g.get('time', '7:00p') + hour, minute = 19, 0 + if time_str: + clean_time = time_str.lower().replace(' ', '') + is_pm = 'p' in clean_time + time_parts = clean_time.replace('p', '').replace('a', '').split(':') + if time_parts: + hour = int(time_parts[0]) + if is_pm and hour != 12: + hour += 12 + elif not is_pm and hour == 12: + hour = 0 + if len(time_parts) > 1: + minute = int(time_parts[1]) + dt = datetime.strptime(f"{g['date']} {hour:02d}:{minute:02d}", '%Y-%m-%d %H:%M') + fields['dateTime'] = {'value': int(dt.timestamp() * 1000), 'type': 'TIMESTAMP'} + except: + pass + # Team references + home_team_canonical_id = g.get('home_team_canonical_id', '') + away_team_canonical_id = g.get('away_team_canonical_id', '') + home_team_uuid = 
deterministic_uuid(home_team_canonical_id) + away_team_uuid = deterministic_uuid(away_team_canonical_id) + fields['homeTeamRef'] = {'value': {'recordName': home_team_uuid, 'action': 'NONE'}} + fields['awayTeamRef'] = {'value': {'recordName': away_team_uuid, 'action': 'NONE'}} + # Stadium reference + if g.get('stadium_canonical_id'): + stadium_canonical_id = g['stadium_canonical_id'] + stadium_uuid = stadium_id_map.get(stadium_canonical_id) + if stadium_uuid: + fields['stadiumRef'] = {'value': {'recordName': stadium_uuid, 'action': 'NONE'}} + fields['stadiumCanonicalId'] = {'value': stadium_canonical_id} + records[game_uuid] = {'recordType': 'Game', 'recordName': game_uuid, 'fields': fields} + return records + + def build_league_structure_records(league_structure): + records = {} + for ls in league_structure: + record_name = ls['id'] + fields = { + 'structureId': {'value': ls['id']}, + 'sport': {'value': ls['sport']}, + 'type': {'value': ls['type']}, + 'name': {'value': ls['name']}, + 'displayOrder': {'value': ls['display_order']}, + 'schemaVersion': {'value': 1}, + } + if ls.get('abbreviation'): + fields['abbreviation'] = {'value': ls['abbreviation']} + if ls.get('parent_id'): + fields['parentId'] = {'value': ls['parent_id']} + records[record_name] = {'recordType': 'LeagueStructure', 'recordName': record_name, 'fields': fields} + return records + + def build_team_alias_records(team_aliases): + records = {} + for ta in team_aliases: + record_name = ta['id'] + fields = { + 'aliasId': {'value': ta['id']}, + 'teamCanonicalId': {'value': ta['team_canonical_id']}, + 'aliasType': {'value': ta['alias_type']}, + 'aliasValue': {'value': ta['alias_value']}, + 'schemaVersion': {'value': 1}, + } + records[record_name] = {'recordType': 'TeamAlias', 'recordName': record_name, 'fields': fields} + return records + + def build_stadium_alias_records(stadium_aliases): + records = {} + for sa in stadium_aliases: + stadium_id = sa['stadium_canonical_id'] + sport = 
stadium_id.split('_')[1] if '_' in stadium_id else 'unknown' + record_name = f"{sport}_{sa['alias_name'].lower()}" + fields = { + 'aliasName': {'value': sa['alias_name'].lower()}, + 'stadiumCanonicalId': {'value': sa['stadium_canonical_id']}, + 'schemaVersion': {'value': 1}, + } + records[record_name] = {'recordType': 'StadiumAlias', 'recordName': record_name, 'fields': fields} + return records + + # Compute and display diffs for each record type + record_types = [ + ('Stadium', stadiums, build_stadium_records), + ('Team', teams, build_team_records), + ('Game', games, lambda g: build_game_records(g, stadiums)), + ('LeagueStructure', league_structure, build_league_structure_records), + ('TeamAlias', team_aliases, build_team_alias_records), + ('StadiumAlias', stadium_aliases, build_stadium_alias_records), + ] + + all_diffs = {} + for record_type, data, builder in record_types: + if not data: + print(f"{record_type}: No local data") + continue + + print(f"Checking {record_type}...") + local_records = builder(data) + + # Query cloud records + cloud_records = ck.query_all(record_type, verbose=verbose) + + # Compute diff + diff = compute_diff(local_records, cloud_records, verbose=verbose) + all_diffs[record_type] = diff + + # Report + print(f" {record_type}: {len(diff['unchanged'])} unchanged, {len(diff['new'])} new, {len(diff['updated'])} updated, {len(diff['deleted'])} deleted") + + print("\n" + "="*50) + print("Diff Summary") + print("="*50) + for record_type, diff in all_diffs.items(): + total_changes = len(diff['new']) + len(diff['updated']) + len(diff['deleted']) + status = "in sync" if total_changes == 0 else f"{total_changes} changes needed" + print(f" {record_type}: {status}") + + return all_diffs + + def main(): p = argparse.ArgumentParser(description='Import JSON to CloudKit') p.add_argument('--key-id', default=DEFAULT_KEY_ID) @@ -344,6 +709,7 @@ def main(): p.add_argument('--canonical-only', action='store_true', help='Import only canonical data (league 
structure + team aliases + stadium aliases)') p.add_argument('--delete-all', action='store_true', help='Delete all records before importing') p.add_argument('--delete-only', action='store_true', help='Only delete records, do not import') + p.add_argument('--diff', action='store_true', help='Show diff between local and CloudKit without importing') p.add_argument('--dry-run', action='store_true') p.add_argument('--verbose', '-v', action='store_true') p.add_argument('--interactive', '-i', action='store_true', help='Show interactive menu') @@ -353,7 +719,7 @@ def main(): has_action_flag = any([ args.stadiums_only, args.games_only, args.games_files, args.league_structure_only, args.team_aliases_only, args.stadium_aliases_only, args.canonical_only, - args.delete_all, args.delete_only, args.dry_run + args.delete_all, args.delete_only, args.dry_run, args.diff ]) # Track selected game files (for option 4 or --games-files) @@ -465,6 +831,18 @@ def main(): sys.exit(f"Error: Key file not found: {args.key_file}") ck = CloudKit(args.key_id, open(args.key_file, 'rb').read(), args.container, args.env) + # Handle diff mode (show diff without importing) + if args.diff: + if not ck: + # Need CloudKit connection for diff + if not HAS_CRYPTO: + sys.exit("Error: pip install cryptography") + if not os.path.exists(args.key_file): + sys.exit(f"Error: Key file not found: {args.key_file}") + ck = CloudKit(args.key_id, open(args.key_file, 'rb').read(), args.container, args.env) + show_diff_report(ck, args.data_dir, verbose=args.verbose) + return + # Handle deletion if args.delete_all or args.delete_only: if not ck: