From 9d2dbf61ddbe4c615c3073c6301e3b6d1f73a913 Mon Sep 17 00:00:00 2001
From: Trey t
Date: Sat, 10 Jan 2026 10:34:39 -0600
Subject: [PATCH] feat(06-01): add orphan listing and completeness metrics

Add --list-orphans flag with orphan detection by record type, data
completeness metrics (coordinates, capacity, team/stadium refs), health
score calculation (0-100), and actionable recommendations. Includes JSON
export and menu option 17.

Co-Authored-By: Claude Opus 4.5
---
 Scripts/cloudkit_import.py | 320 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 316 insertions(+), 4 deletions(-)

diff --git a/Scripts/cloudkit_import.py b/Scripts/cloudkit_import.py
index 909e83b..f647b16 100755
--- a/Scripts/cloudkit_import.py
+++ b/Scripts/cloudkit_import.py
@@ -158,17 +158,18 @@ def show_menu():
     print(" 14. Verify sync (quick)")
     print(" 15. Verify sync (deep)")
     print(" 16. Validate data (local + CloudKit)")
+    print(" 17. List orphan records")
     print(" 0. Exit")
     print()
 
     while True:
         try:
-            choice = input("Enter choice [1-16, 0 to exit]: ").strip()
+            choice = input("Enter choice [1-17, 0 to exit]: ").strip()
             if choice == '0':
                 return None
-            if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16']:
+            if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17']:
                 return int(choice)
-            print("Invalid choice. Please enter 1-16 or 0.")
+            print("Invalid choice. Please enter 1-17 or 0.")
         except (EOFError, KeyboardInterrupt):
             print("\nExiting.")
             return None
@@ -1661,6 +1662,305 @@ def validate_all(ck, data_dir, output_file=None, verbose=False):
     return report
 
 
+def list_orphans(ck, data_dir, output_file=None, verbose=False):
+    """
+    List orphan records in CloudKit (records not in local data).
+    Also shows data completeness metrics and health score.
+
+    Args:
+        ck: CloudKit client; only ck.query_all(record_type, verbose=...) is called.
+        data_dir: Directory containing the canonical JSON data files.
+        output_file: Optional path; when given, the report is written there as JSON.
+        verbose: Forwarded to ck.query_all().
+
+    Returns:
+        dict with 'timestamp', 'orphans', 'completeness', 'health_score'
+        and 'recommendations' keys.
+    """
+    data_dir = Path(data_dir)
+
+    def load_json_list(path):
+        # Close the handle deterministically; a missing file means "no records".
+        if not path.exists():
+            return []
+        with open(path) as f:
+            return json.load(f)
+
+    print("\n" + "="*60)
+    print("Orphan Records & Data Quality Report")
+    print("="*60)
+
+    # Load local data
+    stadiums = load_json_list(data_dir / 'stadiums_canonical.json')
+    teams = load_json_list(data_dir / 'teams_canonical.json')
+
+    # Load games: per-file canonical directory first, single file as fallback
+    canonical_games_dir = data_dir / 'canonical' / 'games'
+    game_files = sorted(canonical_games_dir.glob('*.json')) if canonical_games_dir.exists() else []
+    games = []
+    if game_files:
+        for games_file in game_files:
+            games.extend(load_json_list(games_file))
+    else:
+        games = load_json_list(data_dir / 'games_canonical.json')
+
+    stadium_aliases = load_json_list(data_dir / 'stadium_aliases.json')
+    team_aliases = load_json_list(data_dir / 'team_aliases.json')
+
+    print(f"\nLocal data loaded:")
+    print(f" Stadiums: {len(stadiums)}")
+    print(f" Teams: {len(teams)}")
+    print(f" Games: {len(games)}")
+    print(f" Stadium aliases: {len(stadium_aliases)}")
+    print(f" Team aliases: {len(team_aliases)}")
+
+    # Build local record sets
+    local_stadium_ids = {deterministic_uuid(s.get('canonical_id', s.get('id', ''))) for s in stadiums}
+    local_team_ids = {deterministic_uuid(t.get('canonical_id', '')) for t in teams}
+    local_game_ids = {deterministic_uuid(g.get('canonical_id', g.get('id', ''))) for g in games}
+    local_stadium_alias_ids = {deterministic_uuid(a.get('alias', '')) for a in stadium_aliases}
+    local_team_alias_ids = {deterministic_uuid(a.get('alias', '')) for a in team_aliases}
+
+    # Fetch CloudKit records
+    print("\nFetching CloudKit records...")
+    ck_stadiums = {r['recordName']: r for r in ck.query_all('Stadium', verbose=verbose)}
+    ck_teams = {r['recordName']: r for r in ck.query_all('Team', verbose=verbose)}
+    ck_games = {r['recordName']: r for r in ck.query_all('Game', verbose=verbose)}
+    ck_stadium_aliases = {r['recordName']: r for r in ck.query_all('StadiumAlias', verbose=verbose)}
+    ck_team_aliases = {r['recordName']: r for r in ck.query_all('TeamAlias', verbose=verbose)}
+
+    print(f"\nCloudKit records:")
+    print(f" Stadiums: {len(ck_stadiums)}")
+    print(f" Teams: {len(ck_teams)}")
+    print(f" Games: {len(ck_games)}")
+    print(f" Stadium aliases: {len(ck_stadium_aliases)}")
+    print(f" Team aliases: {len(ck_team_aliases)}")
+
+    # =========================================================================
+    # Section 1: Orphan Listing
+    # =========================================================================
+    print("\n" + "-"*60)
+    print("SECTION 1: Orphan Records (in CloudKit but not in local data)")
+    print("-"*60)
+
+    report = {
+        'timestamp': datetime.now(timezone.utc).isoformat(),
+        'orphans': {},
+        'completeness': {},
+        'health_score': 0,
+        'recommendations': [],
+    }
+
+    orphan_types = [
+        ('Stadium', set(ck_stadiums.keys()) - local_stadium_ids, ck_stadiums),
+        ('Team', set(ck_teams.keys()) - local_team_ids, ck_teams),
+        ('Game', set(ck_games.keys()) - local_game_ids, ck_games),
+        ('StadiumAlias', set(ck_stadium_aliases.keys()) - local_stadium_alias_ids, ck_stadium_aliases),
+        ('TeamAlias', set(ck_team_aliases.keys()) - local_team_alias_ids, ck_team_aliases),
+    ]
+
+    total_orphans = 0
+    for record_type, orphan_ids, ck_records in orphan_types:
+        orphan_count = len(orphan_ids)
+        total_orphans += orphan_count
+
+        report['orphans'][record_type] = {
+            'count': orphan_count,
+            'sample': [],
+        }
+
+        print(f"\n {record_type}: {orphan_count} orphan(s)")
+
+        if orphan_count > 0:
+            # Show first 10
+            sample = sorted(orphan_ids)[:10]
+            for record_name in sample:
+                rec = ck_records.get(record_name, {})
+                fields = rec.get('fields', {})
+                # Try to get a human-readable identifier
+                canonical_id = fields.get('canonicalId', {}).get('value', '')
+                name = fields.get('name', {}).get('value', '')
+                display = canonical_id or name or record_name
+
+                print(f" - {display}")
+                report['orphans'][record_type]['sample'].append({
+                    'recordName': record_name,
+                    'canonicalId': canonical_id,
+                    'name': name,
+                })
+
+            if orphan_count > 10:
+                print(f" ... and {orphan_count - 10} more")
+
+    if total_orphans == 0:
+        print("\n ✓ No orphan records found")
+
+    # =========================================================================
+    # Section 2: Data Completeness Metrics
+    # =========================================================================
+    print("\n" + "-"*60)
+    print("SECTION 2: Data Completeness Metrics")
+    print("-"*60)
+
+    # Stadium completeness
+    # Compare against None so a legitimate 0.0 coordinate is not counted as missing
+    stadiums_with_coords = sum(
+        1 for s in stadiums
+        if s.get('latitude') is not None and s.get('longitude') is not None
+    )
+    stadiums_with_capacity = sum(1 for s in stadiums if s.get('capacity'))
+    stadiums_with_year = sum(1 for s in stadiums if s.get('year_opened'))
+    unknown_stadiums = sum(1 for s in stadiums if s.get('canonical_id', '').startswith('stadium_unknown_'))
+
+    stadium_completeness = {
+        'total': len(stadiums),
+        'with_coordinates': stadiums_with_coords,
+        'with_capacity': stadiums_with_capacity,
+        'with_year_opened': stadiums_with_year,
+        'unknown_stadiums': unknown_stadiums,
+        'pct_coordinates': round(100 * stadiums_with_coords / max(len(stadiums), 1), 1),
+        'pct_capacity': round(100 * stadiums_with_capacity / max(len(stadiums), 1), 1),
+        'pct_year': round(100 * stadiums_with_year / max(len(stadiums), 1), 1),
+    }
+
+    print(f"\n Stadiums ({len(stadiums)} total):")
+    print(f" With coordinates: {stadiums_with_coords} ({stadium_completeness['pct_coordinates']}%)")
+    print(f" With capacity: {stadiums_with_capacity} ({stadium_completeness['pct_capacity']}%)")
+    print(f" With year_opened: {stadiums_with_year} ({stadium_completeness['pct_year']}%)")
+    print(f" Unknown stadiums: {unknown_stadiums}")
+
+    # Team completeness
+    stadium_ids = {s.get('canonical_id', s.get('id', '')) for s in stadiums}
+    teams_with_stadium = sum(1 for t in teams if t.get('stadium_id') in stadium_ids)
+
+    team_completeness = {
+        'total': len(teams),
+        'with_valid_stadium': teams_with_stadium,
+        'pct_valid_stadium': round(100 * teams_with_stadium / max(len(teams), 1), 1),
+    }
+
+    print(f"\n Teams ({len(teams)} total):")
+    print(f" With valid stadium ref: {teams_with_stadium} ({team_completeness['pct_valid_stadium']}%)")
+
+    # Game completeness
+    team_ids = {t.get('canonical_id', '') for t in teams}
+    games_with_home = sum(1 for g in games if g.get('home_team_id') in team_ids)
+    games_with_away = sum(1 for g in games if g.get('away_team_id') in team_ids)
+    games_with_stadium = sum(1 for g in games if g.get('stadium_id') in stadium_ids)
+
+    game_completeness = {
+        'total': len(games),
+        'with_home_team': games_with_home,
+        'with_away_team': games_with_away,
+        'with_stadium': games_with_stadium,
+        'pct_home_team': round(100 * games_with_home / max(len(games), 1), 1),
+        'pct_away_team': round(100 * games_with_away / max(len(games), 1), 1),
+        'pct_stadium': round(100 * games_with_stadium / max(len(games), 1), 1),
+    }
+
+    print(f"\n Games ({len(games)} total):")
+    print(f" With resolved home team: {games_with_home} ({game_completeness['pct_home_team']}%)")
+    print(f" With resolved away team: {games_with_away} ({game_completeness['pct_away_team']}%)")
+    print(f" With resolved stadium: {games_with_stadium} ({game_completeness['pct_stadium']}%)")
+
+    report['completeness'] = {
+        'stadiums': stadium_completeness,
+        'teams': team_completeness,
+        'games': game_completeness,
+    }
+
+    # =========================================================================
+    # Section 3: Health Score
+    # =========================================================================
+    print("\n" + "-"*60)
+    print("SECTION 3: Health Score")
+    print("-"*60)
+
+    # Calculate health score (0-100)
+    # Factors:
+    # - Orphan penalty: -2 points per orphan (max -30)
+    # - Completeness: average of key metrics
+    # - Unknown stadiums penalty: -1 per unknown (max -10)
+
+    orphan_penalty = min(30, total_orphans * 2)
+    unknown_penalty = min(10, unknown_stadiums)
+
+    completeness_scores = [
+        stadium_completeness['pct_coordinates'],
+        stadium_completeness['pct_capacity'],
+        team_completeness['pct_valid_stadium'],
+        game_completeness['pct_home_team'],
+        game_completeness['pct_away_team'],
+        game_completeness['pct_stadium'],
+    ]
+    avg_completeness = sum(completeness_scores) / len(completeness_scores)
+
+    # Round once so the printed score, the status band and the report agree
+    health_score = round(max(0, min(100, avg_completeness - orphan_penalty - unknown_penalty)), 1)
+    report['health_score'] = health_score
+
+    # Determine status
+    if health_score >= 90:
+        status = "✓ EXCELLENT"
+    elif health_score >= 70:
+        status = "○ GOOD"
+    elif health_score >= 50:
+        status = "△ FAIR"
+    else:
+        status = "✗ NEEDS ATTENTION"
+
+    print(f"\n Health Score: {health_score:.1f}/100 {status}")
+    print(f"\n Score breakdown:")
+    print(f" Base completeness: {avg_completeness:.1f}")
+    print(f" Orphan penalty: -{orphan_penalty}")
+    print(f" Unknown stadium penalty: -{unknown_penalty}")
+
+    # Recommendations
+    recommendations = []
+    if total_orphans > 0:
+        recommendations.append(f"Delete {total_orphans} orphan records with --smart-sync --delete-orphans")
+    if unknown_stadiums > 0:
+        recommendations.append(f"Review {unknown_stadiums} unknown stadiums (stadium_unknown_*)")
+    if stadium_completeness['pct_coordinates'] < 100:
+        missing = len(stadiums) - stadiums_with_coords
+        recommendations.append(f"Add coordinates to {missing} stadium(s)")
+    if stadium_completeness['pct_capacity'] < 100:
+        missing = len(stadiums) - stadiums_with_capacity
+        recommendations.append(f"Add capacity to {missing} stadium(s)")
+    if game_completeness['pct_home_team'] < 100 or game_completeness['pct_away_team'] < 100:
+        recommendations.append("Review games with unresolved team references")
+    if game_completeness['pct_stadium'] < 100:
+        recommendations.append("Review games with unresolved stadium references")
+
+    report['recommendations'] = recommendations
+
+    if recommendations:
+        print(f"\n Recommendations:")
+        for rec in recommendations:
+            print(f" • {rec}")
+    else:
+        print("\n ✓ No recommendations - data is in great shape!")
+
+    # =========================================================================
+    # Summary
+    # =========================================================================
+    print("\n" + "="*60)
+    print("SUMMARY")
+    print("="*60)
+
+    print(f"\n Total orphans: {total_orphans}")
+    print(f" Health score: {health_score:.1f}/100")
+    print(f" Status: {status}")
+
+    # Export to JSON if requested
+    if output_file:
+        with open(output_file, 'w') as f:
+            json.dump(report, f, indent=2, default=str)
+        print(f"\n Report exported to: {output_file}")
+
+    return report
+
+
 # Valid record types for individual record management
 VALID_RECORD_TYPES = ['Stadium', 'Team', 'Game', 'LeagueStructure', 'TeamAlias', 'StadiumAlias']
 
@@ -1995,6 +2295,7 @@ def main():
     p.add_argument('--verify', action='store_true', help='Verify CloudKit matches local data (quick: counts + spot-check)')
     p.add_argument('--verify-deep', action='store_true', help='Verify CloudKit matches local data (deep: full field comparison)')
     p.add_argument('--validate', action='store_true', help='Run comprehensive validation (local + CloudKit relationships + sync status)')
+    p.add_argument('--list-orphans', action='store_true', help='List orphan records in CloudKit (not in local data)')
     p.add_argument('--output', type=str, metavar='FILE', help='Output file for validation report (JSON format)')
     # Individual record management
     p.add_argument('--get', nargs=2, metavar=('TYPE', 'ID'), help='Get a single record (e.g., --get Stadium stadium_nba_td_garden)')
@@ -2013,7 +2314,7 @@ def main():
         args.stadiums_only, args.games_only, args.games_files, args.league_structure_only,
         args.team_aliases_only, args.stadium_aliases_only, args.canonical_only,
         args.delete_all, args.delete_only, args.dry_run, args.diff, args.smart_sync,
-        args.verify, args.verify_deep, args.validate, args.get, args.list, args.update_record, args.delete_record
+        args.verify, args.verify_deep, args.validate, args.list_orphans, args.get, args.list, args.update_record, args.delete_record
     ])
 
     # Track selected game files (for option 4 or --games-files)
@@ -2066,6 +2367,8 @@ def main():
             args.verify_deep = True
         elif choice == 16: # Validate data
             args.validate = True
+        elif choice == 17: # List orphan records
+            args.list_orphans = True
 
     print(f"\n{'='*50}")
     print(f"CloudKit Import {'(DRY RUN)' if args.dry_run else ''}")
@@ -2168,6 +2471,15 @@ def main():
         validate_all(ck, args.data_dir, output_file=args.output, verbose=args.verbose)
         return
 
+    # Handle list-orphans mode
+    if args.list_orphans:
+        if args.dry_run or not HAS_CRYPTO or not os.path.exists(args.key_file):
+            print("\nError: --list-orphans requires CloudKit connection (cannot use with --dry-run)")
+            return
+        ck = CloudKit(args.key_id, open(args.key_file, 'rb').read(), args.container, args.env)
+        list_orphans(ck, args.data_dir, output_file=args.output, verbose=args.verbose)
+        return
+
     # Handle individual record operations
     if args.get:
         record_type, record_id = args.get