feat(06-01): add orphan listing and completeness metrics

Add a --list-orphans flag that lists orphan records (present in CloudKit but not in local data) grouped by record type. The report also includes data completeness metrics (coordinates, capacity, team/stadium references), a health score (0-100), and actionable recommendations. Includes JSON export and interactive menu option 17.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 10:34:39 -06:00
parent 9f0edc4228
commit 9d2dbf61dd
1 changed files with 295 additions and 4 deletions
@@ -158,17 +158,18 @@ def show_menu():
    print("  14. Verify sync (quick)")
    print("  15. Verify sync (deep)")
    print("  16. Validate data (local + CloudKit)")
+    print("  17. List orphan records")
    print("  0. Exit")
    print()

    while True:
        try:
-            choice = input("Enter choice [1-16, 0 to exit]: ").strip()
+            choice = input("Enter choice [1-17, 0 to exit]: ").strip()
            if choice == '0':
                return None
-            if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16']:
+            if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17']:
                return int(choice)
-            print("Invalid choice. Please enter 1-16 or 0.")
+            print("Invalid choice. Please enter 1-17 or 0.")
        except (EOFError, KeyboardInterrupt):
            print("\nExiting.")
            return None
@@ -1661,6 +1662,284 @@ def validate_all(ck, data_dir, output_file=None, verbose=False):
    return report


+def list_orphans(ck, data_dir, output_file=None, verbose=False):
+    """
+    List orphan records in CloudKit (records not in local data).
+    Also shows data completeness metrics and health score.
+    """
+    data_dir = Path(data_dir)
+
+    print("\n" + "="*60)
+    print("Orphan Records & Data Quality Report")
+    print("="*60)
+
+    # Load local data
+    stadiums = json.load(open(data_dir / 'stadiums_canonical.json')) if (data_dir / 'stadiums_canonical.json').exists() else []
+    teams = json.load(open(data_dir / 'teams_canonical.json')) if (data_dir / 'teams_canonical.json').exists() else []
+
+    # Load games
+    canonical_games_dir = data_dir / 'canonical' / 'games'
+    games = []
+    game_files = list(canonical_games_dir.glob('*.json')) if canonical_games_dir.exists() else []
+    if game_files:
+        for games_file in sorted(game_files):
+            with open(games_file) as f:
+                games.extend(json.load(f))
+    elif (data_dir / 'games_canonical.json').exists():
+        games = json.load(open(data_dir / 'games_canonical.json'))
+
+    stadium_aliases = json.load(open(data_dir / 'stadium_aliases.json')) if (data_dir / 'stadium_aliases.json').exists() else []
+    team_aliases = json.load(open(data_dir / 'team_aliases.json')) if (data_dir / 'team_aliases.json').exists() else []
+
+    print(f"\nLocal data loaded:")
+    print(f"  Stadiums: {len(stadiums)}")
+    print(f"  Teams: {len(teams)}")
+    print(f"  Games: {len(games)}")
+    print(f"  Stadium aliases: {len(stadium_aliases)}")
+    print(f"  Team aliases: {len(team_aliases)}")
+
+    # Build local record sets
+    local_stadium_ids = {deterministic_uuid(s.get('canonical_id', s.get('id', ''))) for s in stadiums}
+    local_team_ids = {deterministic_uuid(t.get('canonical_id', '')) for t in teams}
+    local_game_ids = {deterministic_uuid(g.get('canonical_id', g.get('id', ''))) for g in games}
+    local_stadium_alias_ids = {deterministic_uuid(a.get('alias', '')) for a in stadium_aliases}
+    local_team_alias_ids = {deterministic_uuid(a.get('alias', '')) for a in team_aliases}
+
+    # Fetch CloudKit records
+    print("\nFetching CloudKit records...")
+    ck_stadiums = {r['recordName']: r for r in ck.query_all('Stadium', verbose=verbose)}
+    ck_teams = {r['recordName']: r for r in ck.query_all('Team', verbose=verbose)}
+    ck_games = {r['recordName']: r for r in ck.query_all('Game', verbose=verbose)}
+    ck_stadium_aliases = {r['recordName']: r for r in ck.query_all('StadiumAlias', verbose=verbose)}
+    ck_team_aliases = {r['recordName']: r for r in ck.query_all('TeamAlias', verbose=verbose)}
+
+    print(f"\nCloudKit records:")
+    print(f"  Stadiums: {len(ck_stadiums)}")
+    print(f"  Teams: {len(ck_teams)}")
+    print(f"  Games: {len(ck_games)}")
+    print(f"  Stadium aliases: {len(ck_stadium_aliases)}")
+    print(f"  Team aliases: {len(ck_team_aliases)}")
+
+    # =========================================================================
+    # Section 1: Orphan Listing
+    # =========================================================================
+    print("\n" + "-"*60)
+    print("SECTION 1: Orphan Records (in CloudKit but not in local data)")
+    print("-"*60)
+
+    report = {
+        'timestamp': datetime.now(timezone.utc).isoformat(),
+        'orphans': {},
+        'completeness': {},
+        'health_score': 0,
+        'recommendations': [],
+    }
+
+    orphan_types = [
+        ('Stadium', set(ck_stadiums.keys()) - local_stadium_ids, ck_stadiums),
+        ('Team', set(ck_teams.keys()) - local_team_ids, ck_teams),
+        ('Game', set(ck_games.keys()) - local_game_ids, ck_games),
+        ('StadiumAlias', set(ck_stadium_aliases.keys()) - local_stadium_alias_ids, ck_stadium_aliases),
+        ('TeamAlias', set(ck_team_aliases.keys()) - local_team_alias_ids, ck_team_aliases),
+    ]
+
+    total_orphans = 0
+    for record_type, orphan_ids, ck_records in orphan_types:
+        orphan_count = len(orphan_ids)
+        total_orphans += orphan_count
+
+        report['orphans'][record_type] = {
+            'count': orphan_count,
+            'sample': [],
+        }
+
+        print(f"\n  {record_type}: {orphan_count} orphan(s)")
+
+        if orphan_count > 0:
+            # Show first 10
+            sample = sorted(list(orphan_ids))[:10]
+            for record_name in sample:
+                rec = ck_records.get(record_name, {})
+                fields = rec.get('fields', {})
+                # Try to get a human-readable identifier
+                canonical_id = fields.get('canonicalId', {}).get('value', '')
+                name = fields.get('name', {}).get('value', '')
+                display = canonical_id or name or record_name
+
+                print(f"    - {display}")
+                report['orphans'][record_type]['sample'].append({
+                    'recordName': record_name,
+                    'canonicalId': canonical_id,
+                    'name': name,
+                })
+
+            if orphan_count > 10:
+                print(f"    ... and {orphan_count - 10} more")
+
+    if total_orphans == 0:
+        print("\n  ✓ No orphan records found")
+
+    # =========================================================================
+    # Section 2: Data Completeness Metrics
+    # =========================================================================
+    print("\n" + "-"*60)
+    print("SECTION 2: Data Completeness Metrics")
+    print("-"*60)
+
+    # Stadium completeness
+    stadiums_with_coords = sum(1 for s in stadiums if s.get('latitude') and s.get('longitude'))
+    stadiums_with_capacity = sum(1 for s in stadiums if s.get('capacity'))
+    stadiums_with_year = sum(1 for s in stadiums if s.get('year_opened'))
+    unknown_stadiums = sum(1 for s in stadiums if s.get('canonical_id', '').startswith('stadium_unknown_'))
+
+    stadium_completeness = {
+        'total': len(stadiums),
+        'with_coordinates': stadiums_with_coords,
+        'with_capacity': stadiums_with_capacity,
+        'with_year_opened': stadiums_with_year,
+        'unknown_stadiums': unknown_stadiums,
+        'pct_coordinates': round(100 * stadiums_with_coords / max(len(stadiums), 1), 1),
+        'pct_capacity': round(100 * stadiums_with_capacity / max(len(stadiums), 1), 1),
+        'pct_year': round(100 * stadiums_with_year / max(len(stadiums), 1), 1),
+    }
+
+    print(f"\n  Stadiums ({len(stadiums)} total):")
+    print(f"    With coordinates: {stadiums_with_coords} ({stadium_completeness['pct_coordinates']}%)")
+    print(f"    With capacity: {stadiums_with_capacity} ({stadium_completeness['pct_capacity']}%)")
+    print(f"    With year_opened: {stadiums_with_year} ({stadium_completeness['pct_year']}%)")
+    print(f"    Unknown stadiums: {unknown_stadiums}")
+
+    # Team completeness
+    stadium_ids = {s.get('canonical_id', s.get('id', '')) for s in stadiums}
+    teams_with_stadium = sum(1 for t in teams if t.get('stadium_id') in stadium_ids)
+
+    team_completeness = {
+        'total': len(teams),
+        'with_valid_stadium': teams_with_stadium,
+        'pct_valid_stadium': round(100 * teams_with_stadium / max(len(teams), 1), 1),
+    }
+
+    print(f"\n  Teams ({len(teams)} total):")
+    print(f"    With valid stadium ref: {teams_with_stadium} ({team_completeness['pct_valid_stadium']}%)")
+
+    # Game completeness
+    team_ids = {t.get('canonical_id', '') for t in teams}
+    games_with_home = sum(1 for g in games if g.get('home_team_id') in team_ids)
+    games_with_away = sum(1 for g in games if g.get('away_team_id') in team_ids)
+    games_with_stadium = sum(1 for g in games if g.get('stadium_id') in stadium_ids)
+
+    game_completeness = {
+        'total': len(games),
+        'with_home_team': games_with_home,
+        'with_away_team': games_with_away,
+        'with_stadium': games_with_stadium,
+        'pct_home_team': round(100 * games_with_home / max(len(games), 1), 1),
+        'pct_away_team': round(100 * games_with_away / max(len(games), 1), 1),
+        'pct_stadium': round(100 * games_with_stadium / max(len(games), 1), 1),
+    }
+
+    print(f"\n  Games ({len(games)} total):")
+    print(f"    With resolved home team: {games_with_home} ({game_completeness['pct_home_team']}%)")
+    print(f"    With resolved away team: {games_with_away} ({game_completeness['pct_away_team']}%)")
+    print(f"    With resolved stadium: {games_with_stadium} ({game_completeness['pct_stadium']}%)")
+
+    report['completeness'] = {
+        'stadiums': stadium_completeness,
+        'teams': team_completeness,
+        'games': game_completeness,
+    }
+
+    # =========================================================================
+    # Section 3: Health Score
+    # =========================================================================
+    print("\n" + "-"*60)
+    print("SECTION 3: Health Score")
+    print("-"*60)
+
+    # Calculate health score (0-100)
+    # Factors:
+    # - Orphan penalty: -2 points per orphan (max -30)
+    # - Completeness: average of key metrics
+    # - Unknown stadiums penalty: -1 per unknown (max -10)
+
+    orphan_penalty = min(30, total_orphans * 2)
+    unknown_penalty = min(10, unknown_stadiums)
+
+    completeness_scores = [
+        stadium_completeness['pct_coordinates'],
+        stadium_completeness['pct_capacity'],
+        team_completeness['pct_valid_stadium'],
+        game_completeness['pct_home_team'],
+        game_completeness['pct_away_team'],
+        game_completeness['pct_stadium'],
+    ]
+    avg_completeness = sum(completeness_scores) / len(completeness_scores)
+
+    health_score = max(0, min(100, avg_completeness - orphan_penalty - unknown_penalty))
+    report['health_score'] = round(health_score, 1)
+
+    # Determine status
+    if health_score >= 90:
+        status = "✓ EXCELLENT"
+    elif health_score >= 70:
+        status = "○ GOOD"
+    elif health_score >= 50:
+        status = "△ FAIR"
+    else:
+        status = "✗ NEEDS ATTENTION"
+
+    print(f"\n  Health Score: {health_score:.1f}/100 {status}")
+    print(f"\n  Score breakdown:")
+    print(f"    Base completeness: {avg_completeness:.1f}")
+    print(f"    Orphan penalty: -{orphan_penalty}")
+    print(f"    Unknown stadium penalty: -{unknown_penalty}")
+
+    # Recommendations
+    recommendations = []
+    if total_orphans > 0:
+        recommendations.append(f"Delete {total_orphans} orphan records with --smart-sync --delete-orphans")
+    if unknown_stadiums > 0:
+        recommendations.append(f"Review {unknown_stadiums} unknown stadiums (stadium_unknown_*)")
+    if stadium_completeness['pct_coordinates'] < 100:
+        missing = len(stadiums) - stadiums_with_coords
+        recommendations.append(f"Add coordinates to {missing} stadium(s)")
+    if stadium_completeness['pct_capacity'] < 100:
+        missing = len(stadiums) - stadiums_with_capacity
+        recommendations.append(f"Add capacity to {missing} stadium(s)")
+    if game_completeness['pct_home_team'] < 100 or game_completeness['pct_away_team'] < 100:
+        recommendations.append("Review games with unresolved team references")
+    if game_completeness['pct_stadium'] < 100:
+        recommendations.append("Review games with unresolved stadium references")
+
+    report['recommendations'] = recommendations
+
+    if recommendations:
+        print(f"\n  Recommendations:")
+        for rec in recommendations:
+            print(f"    • {rec}")
+    else:
+        print("\n  ✓ No recommendations - data is in great shape!")
+
+    # =========================================================================
+    # Summary
+    # =========================================================================
+    print("\n" + "="*60)
+    print("SUMMARY")
+    print("="*60)
+
+    print(f"\n  Total orphans: {total_orphans}")
+    print(f"  Health score: {health_score:.1f}/100")
+    print(f"  Status: {status}")
+
+    # Export to JSON if requested
+    if output_file:
+        with open(output_file, 'w') as f:
+            json.dump(report, f, indent=2, default=str)
+        print(f"\n  Report exported to: {output_file}")
+
+    return report
+
+
 # Valid record types for individual record management
 VALID_RECORD_TYPES = ['Stadium', 'Team', 'Game', 'LeagueStructure', 'TeamAlias', 'StadiumAlias']

@@ -1995,6 +2274,7 @@ def main():
    p.add_argument('--verify', action='store_true', help='Verify CloudKit matches local data (quick: counts + spot-check)')
    p.add_argument('--verify-deep', action='store_true', help='Verify CloudKit matches local data (deep: full field comparison)')
    p.add_argument('--validate', action='store_true', help='Run comprehensive validation (local + CloudKit relationships + sync status)')
+    p.add_argument('--list-orphans', action='store_true', help='List orphan records in CloudKit (not in local data)')
    p.add_argument('--output', type=str, metavar='FILE', help='Output file for validation report (JSON format)')
    # Individual record management
    p.add_argument('--get', nargs=2, metavar=('TYPE', 'ID'), help='Get a single record (e.g., --get Stadium stadium_nba_td_garden)')
@@ -2013,7 +2293,7 @@ def main():
        args.stadiums_only, args.games_only, args.games_files, args.league_structure_only,
        args.team_aliases_only, args.stadium_aliases_only, args.canonical_only,
        args.delete_all, args.delete_only, args.dry_run, args.diff, args.smart_sync,
-        args.verify, args.verify_deep, args.validate, args.get, args.list, args.update_record, args.delete_record
+        args.verify, args.verify_deep, args.validate, args.list_orphans, args.get, args.list, args.update_record, args.delete_record
    ])

    # Track selected game files (for option 4 or --games-files)
@@ -2066,6 +2346,8 @@ def main():
            args.verify_deep = True
        elif choice == 16:  # Validate data
            args.validate = True
+        elif choice == 17:  # List orphan records
+            args.list_orphans = True

    print(f"\n{'='*50}")
    print(f"CloudKit Import {'(DRY RUN)' if args.dry_run else ''}")
@@ -2168,6 +2450,15 @@ def main():
        validate_all(ck, args.data_dir, output_file=args.output, verbose=args.verbose)
        return

+    # Handle list-orphans mode
+    if args.list_orphans:
+        if args.dry_run or not HAS_CRYPTO or not os.path.exists(args.key_file):
+            print("\nError: --list-orphans requires CloudKit connection (cannot use with --dry-run)")
+            return
+        ck = CloudKit(args.key_id, open(args.key_file, 'rb').read(), args.container, args.env)
+        list_orphans(ck, args.data_dir, output_file=args.output, verbose=args.verbose)
+        return
+
    # Handle individual record operations
    if args.get:
        record_type, record_id = args.get