feat(05-01): add change detection with diff reporting

- query_all() method with pagination
- compute_diff() returns new/updated/unchanged/deleted
- --diff flag shows report without importing

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-10 10:05:29 -06:00
parent e5c6d0fec7
commit 0c74495ee5

View File

@@ -232,6 +232,65 @@ class CloudKit:
except Exception as e:
return {'error': f"Request failed: {e}"}
def query_all(self, record_type, verbose=False):
    """Fetch every record of *record_type* from CloudKit.

    Follows server pagination: keeps requesting pages of up to 200
    records, passing back the continuationMarker until the server stops
    returning one.  HTTP errors, timeouts and other request failures are
    printed and end the loop early.

    Args:
        record_type: CloudKit record type name to query.
        verbose: print per-page progress when True.

    Returns:
        dict: recordName -> raw CloudKit record, for everything fetched
        before the loop ended (possibly partial on error).
    """
    collected = {}
    marker = None
    # The endpoint is the same for every page; only the body changes.
    request_path = f"{self.path_base}/records/query"
    while True:
        payload = {
            'query': {'recordType': record_type},
            'resultsLimit': 200
        }
        if marker:
            payload['continuationMarker'] = marker
        body = json.dumps(payload)
        stamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
        # Each page must be re-signed: the signature covers date + body + path.
        headers = {
            'Content-Type': 'application/json',
            'X-Apple-CloudKit-Request-KeyID': self.key_id,
            'X-Apple-CloudKit-Request-ISO8601Date': stamp,
            'X-Apple-CloudKit-Request-SignatureV1': self._sign(stamp, body, request_path),
        }
        if verbose:
            # Page number estimated from how many records we already hold.
            page_num = len(collected) // 200 + 1
            print(f" Querying {record_type} (page {page_num})...")
        try:
            resp = requests.post(f"{HOST}{request_path}", headers=headers, data=body, timeout=60)
            if resp.status_code != 200:
                print(f" Query error: {resp.status_code}: {resp.text[:200]}")
                break
            parsed = resp.json()
            page = parsed.get('records', [])
            collected.update((item.get('recordName', ''), item) for item in page)
            if verbose:
                print(f" Found {len(page)} records (total: {len(collected)})")
            # No continuation marker means this was the last page.
            marker = parsed.get('continuationMarker')
            if not marker:
                break
            time.sleep(0.3)  # Rate limiting
        except requests.exceptions.Timeout:
            print(f" Query timeout after 60s")
            break
        except Exception as e:
            print(f" Query failed: {e}")
            break
    return collected
def delete_all(self, record_type, verbose=False):
"""Delete all records of a given type."""
total_deleted = 0
@@ -328,6 +387,312 @@ def import_data(ck, records, name, dry_run, verbose):
return total
def fields_equal(local_value, cloud_value, field_type=None):
    """
    Compare a local field value with a cloud field value.

    Special cases:
      - Location dicts (have 'latitude'): compared with a ~0.0001 degree
        tolerance (about 11 m) so float round-tripping through CloudKit
        does not register as a change.
      - Reference dicts (have 'recordName'): compared by recordName only,
        ignoring metadata such as 'action'.
      - Lists: compared order-insensitively.

    Args:
        local_value: value from the locally built record.
        cloud_value: value returned by CloudKit (may be None when absent).
        field_type: unused; reserved for schema-aware callers.

    Returns:
        bool: True when the two values are considered equivalent.
    """
    # Handle location fields (compare with tolerance)
    if isinstance(local_value, dict) and 'latitude' in local_value:
        if not isinstance(cloud_value, dict) or 'latitude' not in cloud_value:
            return False
        lat_diff = abs(float(local_value['latitude']) - float(cloud_value['latitude']))
        lng_diff = abs(float(local_value['longitude']) - float(cloud_value['longitude']))
        return lat_diff < 0.0001 and lng_diff < 0.0001
    # Handle reference fields (compare by recordName only)
    if isinstance(local_value, dict) and 'recordName' in local_value:
        if not isinstance(cloud_value, dict) or 'recordName' not in cloud_value:
            return False
        return local_value['recordName'] == cloud_value['recordName']
    # Handle lists (order-insensitive)
    if isinstance(local_value, list):
        if not isinstance(cloud_value, list):
            return False
        try:
            return sorted(local_value) == sorted(cloud_value)
        except TypeError:
            # BUG FIX: items may not be orderable (e.g. lists of reference
            # dicts) — fall back to exact comparison instead of crashing.
            return local_value == cloud_value
    # Direct comparison for strings, ints, etc.
    return local_value == cloud_value
def compute_diff(local_records, cloud_records, verbose=False):
    """
    Compare local records against cloud records.

    Returns a dict with 'new', 'updated', 'unchanged' and 'deleted' lists.
    Updated and deleted entries carry the cloud recordChangeTag (required
    by CloudKit modify operations); updated entries also list which fields
    differ.  Only fields present in the LOCAL record are compared — local
    data is the source of truth for imports.
    """
    report = {
        'new': [],
        'updated': [],
        'unchanged': [],
        'deleted': []
    }
    local_names = set(local_records.keys())
    cloud_names = set(cloud_records.keys())
    # Present locally only -> would be created on import.
    for name in local_names - cloud_names:
        report['new'].append({'record': local_records[name]})
    # Present in the cloud only -> candidates for deletion.
    for name in cloud_names - local_names:
        cloud_rec = cloud_records[name]
        report['deleted'].append({
            'record': cloud_rec,
            'recordChangeTag': cloud_rec.get('recordChangeTag', '')
        })
    # Present on both sides -> field-by-field comparison.
    for name in local_names & cloud_names:
        local_rec = local_records[name]
        cloud_rec = cloud_records[name]
        cloud_fields = cloud_rec.get('fields', {})
        # Collect every local field whose value differs from the cloud's
        # (metadata such as recordChangeTag is not a field and is skipped).
        changed = [
            field_name
            for field_name, field_data in local_rec.get('fields', {}).items()
            if not fields_equal(field_data.get('value'),
                                cloud_fields.get(field_name, {}).get('value'))
        ]
        if changed:
            report['updated'].append({
                'record': local_rec,
                'recordChangeTag': cloud_rec.get('recordChangeTag', ''),
                'changed_fields': changed
            })
            if verbose:
                print(f" Changed: {name} - fields: {', '.join(changed)}")
        else:
            report['unchanged'].append({'record': local_rec})
    return report
def show_diff_report(ck, data_dir, verbose=False):
    """Query CloudKit and show diff report for all record types.

    Loads the canonical local JSON data, rebuilds it in CloudKit record
    format (so field values line up with what query_all() returns), then
    prints a per-type diff plus an overall summary.

    Args:
        ck: authenticated CloudKit client (must provide query_all()).
        data_dir: directory containing the canonical JSON data files.
        verbose: forwarded to query_all() and compute_diff() for
            progress/per-record output.

    Returns:
        dict: record type name -> diff dict from compute_diff().
    """
    from pathlib import Path
    data_dir = Path(data_dir)

    def _load_json(path):
        # Read a JSON file if present, else return [].  Uses read_text()
        # so the handle is closed promptly — the previous
        # json.load(open(...)) pattern leaked file handles until GC.
        return json.loads(path.read_text()) if path.exists() else []

    print("\n" + "="*50)
    print("CloudKit Diff Report")
    print("="*50 + "\n")
    # Load local data
    stadiums = _load_json(data_dir / 'stadiums_canonical.json')
    teams = _load_json(data_dir / 'teams_canonical.json')
    # Load games from canonical/games/*.json
    canonical_games_dir = data_dir / 'canonical' / 'games'
    games = []
    if canonical_games_dir.exists():
        for games_file in sorted(canonical_games_dir.glob('*.json')):
            with open(games_file) as f:
                games.extend(json.load(f))
    league_structure = _load_json(data_dir / 'league_structure.json')
    team_aliases = _load_json(data_dir / 'team_aliases.json')
    stadium_aliases = _load_json(data_dir / 'stadium_aliases.json')
    print(f"Local data: {len(stadiums)} stadiums, {len(teams)} teams, {len(games)} games")
    print(f" {len(league_structure)} league structures, {len(team_aliases)} team aliases, {len(stadium_aliases)} stadium aliases\n")

    # --- Local record builders (same wire format as CloudKit records) ---

    def build_stadium_records(stadiums):
        # One Stadium record per entry, keyed by deterministic UUID.
        records = {}
        for s in stadiums:
            stadium_id = s.get('canonical_id', s.get('id', ''))
            record_name = deterministic_uuid(stadium_id)
            team_abbrevs = s.get('primary_team_abbrevs', s.get('team_abbrevs', []))
            fields = {
                'stadiumId': {'value': record_name},
                'canonicalId': {'value': stadium_id},
                'name': {'value': s['name']},
                'city': {'value': s['city']},
                'state': {'value': s.get('state', '')},
                'sport': {'value': s['sport']},
                'source': {'value': s.get('source', 'canonical')},
                'teamAbbrevs': {'value': team_abbrevs},
            }
            # BUG FIX: require BOTH coordinates — previously a stadium
            # with latitude but no longitude raised KeyError.
            if s.get('latitude') is not None and s.get('longitude') is not None:
                fields['location'] = {'value': {'latitude': s['latitude'], 'longitude': s['longitude']}}
            if s.get('capacity'):
                fields['capacity'] = {'value': s['capacity']}
            records[record_name] = {'recordType': 'Stadium', 'recordName': record_name, 'fields': fields}
        return records

    def build_team_records(teams):
        # One Team record per entry, keyed by deterministic UUID.
        records = {}
        for t in teams:
            team_id = t.get('canonical_id', '')
            record_name = deterministic_uuid(team_id)
            fields = {
                'teamId': {'value': record_name},
                'canonicalId': {'value': team_id},
                'abbreviation': {'value': t['abbreviation']},
                'name': {'value': t['name']},
                'city': {'value': t['city']},
                'sport': {'value': t['sport']},
                'stadiumCanonicalId': {'value': t.get('stadium_canonical_id', '')},
            }
            if t.get('conference_id'):
                fields['conferenceId'] = {'value': t['conference_id']}
            if t.get('division_id'):
                fields['divisionId'] = {'value': t['division_id']}
            records[record_name] = {'recordType': 'Team', 'recordName': record_name, 'fields': fields}
        return records

    def build_game_records(games, stadiums):
        # Game records with team/stadium references; de-dupes by canonical
        # id (first occurrence wins).
        records = {}
        stadium_id_map = {s.get('canonical_id', s.get('id', '')): deterministic_uuid(s.get('canonical_id', s.get('id', ''))) for s in stadiums}
        seen_ids = set()
        for g in games:
            game_id = g.get('canonical_id', g.get('id', ''))
            if game_id in seen_ids:
                continue
            seen_ids.add(game_id)
            game_uuid = deterministic_uuid(game_id)
            sport = g['sport']
            fields = {
                'gameId': {'value': game_uuid},
                'canonicalId': {'value': game_id},
                'sport': {'value': sport},
                'season': {'value': g.get('season', '')},
                'source': {'value': g.get('source', 'canonical')},
            }
            # Parse date/time; defaults to 7:00pm when no time is given.
            if g.get('date'):
                try:
                    time_str = g.get('time', '7:00p')
                    hour, minute = 19, 0
                    if time_str:
                        clean_time = time_str.lower().replace(' ', '')
                        is_pm = 'p' in clean_time
                        time_parts = clean_time.replace('p', '').replace('a', '').split(':')
                        if time_parts:
                            hour = int(time_parts[0])
                            if is_pm and hour != 12:
                                hour += 12
                            elif not is_pm and hour == 12:
                                hour = 0
                            if len(time_parts) > 1:
                                minute = int(time_parts[1])
                    dt = datetime.strptime(f"{g['date']} {hour:02d}:{minute:02d}", '%Y-%m-%d %H:%M')
                    fields['dateTime'] = {'value': int(dt.timestamp() * 1000), 'type': 'TIMESTAMP'}
                except (ValueError, KeyError, IndexError):
                    # BUG FIX: was a bare `except:` — narrowed so only
                    # malformed date/time data skips the field (still
                    # best-effort, as before).
                    pass
            # Team references
            home_team_canonical_id = g.get('home_team_canonical_id', '')
            away_team_canonical_id = g.get('away_team_canonical_id', '')
            home_team_uuid = deterministic_uuid(home_team_canonical_id)
            away_team_uuid = deterministic_uuid(away_team_canonical_id)
            fields['homeTeamRef'] = {'value': {'recordName': home_team_uuid, 'action': 'NONE'}}
            fields['awayTeamRef'] = {'value': {'recordName': away_team_uuid, 'action': 'NONE'}}
            # Stadium reference (only when the stadium is known locally)
            if g.get('stadium_canonical_id'):
                stadium_canonical_id = g['stadium_canonical_id']
                stadium_uuid = stadium_id_map.get(stadium_canonical_id)
                if stadium_uuid:
                    fields['stadiumRef'] = {'value': {'recordName': stadium_uuid, 'action': 'NONE'}}
                fields['stadiumCanonicalId'] = {'value': stadium_canonical_id}
            records[game_uuid] = {'recordType': 'Game', 'recordName': game_uuid, 'fields': fields}
        return records

    def build_league_structure_records(league_structure):
        # LeagueStructure records keyed by their own stable id.
        records = {}
        for ls in league_structure:
            record_name = ls['id']
            fields = {
                'structureId': {'value': ls['id']},
                'sport': {'value': ls['sport']},
                'type': {'value': ls['type']},
                'name': {'value': ls['name']},
                'displayOrder': {'value': ls['display_order']},
                'schemaVersion': {'value': 1},
            }
            if ls.get('abbreviation'):
                fields['abbreviation'] = {'value': ls['abbreviation']}
            if ls.get('parent_id'):
                fields['parentId'] = {'value': ls['parent_id']}
            records[record_name] = {'recordType': 'LeagueStructure', 'recordName': record_name, 'fields': fields}
        return records

    def build_team_alias_records(team_aliases):
        # TeamAlias records keyed by their own stable id.
        records = {}
        for ta in team_aliases:
            record_name = ta['id']
            fields = {
                'aliasId': {'value': ta['id']},
                'teamCanonicalId': {'value': ta['team_canonical_id']},
                'aliasType': {'value': ta['alias_type']},
                'aliasValue': {'value': ta['alias_value']},
                'schemaVersion': {'value': 1},
            }
            records[record_name] = {'recordType': 'TeamAlias', 'recordName': record_name, 'fields': fields}
        return records

    def build_stadium_alias_records(stadium_aliases):
        # StadiumAlias records keyed by "<sport>_<lowercased alias>".
        records = {}
        for sa in stadium_aliases:
            stadium_id = sa['stadium_canonical_id']
            # Sport is encoded as the second '_'-separated token of the id.
            sport = stadium_id.split('_')[1] if '_' in stadium_id else 'unknown'
            record_name = f"{sport}_{sa['alias_name'].lower()}"
            fields = {
                'aliasName': {'value': sa['alias_name'].lower()},
                'stadiumCanonicalId': {'value': sa['stadium_canonical_id']},
                'schemaVersion': {'value': 1},
            }
            records[record_name] = {'recordType': 'StadiumAlias', 'recordName': record_name, 'fields': fields}
        return records

    # Compute and display diffs for each record type
    record_types = [
        ('Stadium', stadiums, build_stadium_records),
        ('Team', teams, build_team_records),
        ('Game', games, lambda g: build_game_records(g, stadiums)),
        ('LeagueStructure', league_structure, build_league_structure_records),
        ('TeamAlias', team_aliases, build_team_alias_records),
        ('StadiumAlias', stadium_aliases, build_stadium_alias_records),
    ]
    all_diffs = {}
    for record_type, data, builder in record_types:
        if not data:
            print(f"{record_type}: No local data")
            continue
        print(f"Checking {record_type}...")
        local_records = builder(data)
        # Query cloud records
        cloud_records = ck.query_all(record_type, verbose=verbose)
        # Compute diff
        diff = compute_diff(local_records, cloud_records, verbose=verbose)
        all_diffs[record_type] = diff
        # Report
        print(f" {record_type}: {len(diff['unchanged'])} unchanged, {len(diff['new'])} new, {len(diff['updated'])} updated, {len(diff['deleted'])} deleted")
    print("\n" + "="*50)
    print("Diff Summary")
    print("="*50)
    for record_type, diff in all_diffs.items():
        total_changes = len(diff['new']) + len(diff['updated']) + len(diff['deleted'])
        status = "in sync" if total_changes == 0 else f"{total_changes} changes needed"
        print(f" {record_type}: {status}")
    return all_diffs
def main():
p = argparse.ArgumentParser(description='Import JSON to CloudKit')
p.add_argument('--key-id', default=DEFAULT_KEY_ID)
@@ -344,6 +709,7 @@ def main():
p.add_argument('--canonical-only', action='store_true', help='Import only canonical data (league structure + team aliases + stadium aliases)')
p.add_argument('--delete-all', action='store_true', help='Delete all records before importing')
p.add_argument('--delete-only', action='store_true', help='Only delete records, do not import')
p.add_argument('--diff', action='store_true', help='Show diff between local and CloudKit without importing')
p.add_argument('--dry-run', action='store_true')
p.add_argument('--verbose', '-v', action='store_true')
p.add_argument('--interactive', '-i', action='store_true', help='Show interactive menu')
@@ -353,7 +719,7 @@ def main():
has_action_flag = any([
args.stadiums_only, args.games_only, args.games_files, args.league_structure_only,
args.team_aliases_only, args.stadium_aliases_only, args.canonical_only,
args.delete_all, args.delete_only, args.dry_run
args.delete_all, args.delete_only, args.dry_run, args.diff
])
# Track selected game files (for option 4 or --games-files)
@@ -465,6 +831,18 @@ def main():
sys.exit(f"Error: Key file not found: {args.key_file}")
ck = CloudKit(args.key_id, open(args.key_file, 'rb').read(), args.container, args.env)
# Handle diff mode (show diff without importing)
if args.diff:
if not ck:
# Need CloudKit connection for diff
if not HAS_CRYPTO:
sys.exit("Error: pip install cryptography")
if not os.path.exists(args.key_file):
sys.exit(f"Error: Key file not found: {args.key_file}")
ck = CloudKit(args.key_id, open(args.key_file, 'rb').read(), args.container, args.env)
show_diff_report(ck, args.data_dir, verbose=args.verbose)
return
# Handle deletion
if args.delete_all or args.delete_only:
if not ck: