feat(06-01): add orphan listing and completeness metrics

Add --list-orphans flag with orphan detection by record type,
data completeness metrics (coordinates, capacity, team/stadium refs),
health score calculation (0-100), and actionable recommendations.
Includes JSON export and menu option 17.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-10 10:34:39 -06:00
parent 9f0edc4228
commit 9d2dbf61dd

View File

@@ -158,17 +158,18 @@ def show_menu():
print(" 14. Verify sync (quick)")
print(" 15. Verify sync (deep)")
print(" 16. Validate data (local + CloudKit)")
print(" 17. List orphan records")
print(" 0. Exit")
print()
while True:
try:
choice = input("Enter choice [1-16, 0 to exit]: ").strip()
choice = input("Enter choice [1-17, 0 to exit]: ").strip()
if choice == '0':
return None
if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16']:
if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17']:
return int(choice)
print("Invalid choice. Please enter 1-16 or 0.")
print("Invalid choice. Please enter 1-17 or 0.")
except (EOFError, KeyboardInterrupt):
print("\nExiting.")
return None
@@ -1661,6 +1662,284 @@ def validate_all(ck, data_dir, output_file=None, verbose=False):
return report
def list_orphans(ck, data_dir, output_file=None, verbose=False):
    """
    List orphan records in CloudKit (records not in local data).

    Also prints data completeness metrics, a 0-100 health score and a list
    of recommendations, and optionally exports the whole report as JSON.

    Args:
        ck: CloudKit client. Only ``ck.query_all(record_type, verbose=...)``
            is used; every record it yields must have a ``recordName`` key
            and may have a ``fields`` mapping.
        data_dir: Directory containing the local canonical JSON files.
        output_file: Optional path. When truthy, the report is written
            there as indented JSON.
        verbose: Forwarded unchanged to ``ck.query_all``.

    Returns:
        dict: Report with the keys ``timestamp``, ``orphans``,
        ``completeness``, ``health_score`` and ``recommendations``.
    """
    data_dir = Path(data_dir)

    def _load_json(path, default):
        """Return the parsed JSON at *path*, or *default* if it is absent."""
        if not path.exists():
            return default
        # Context manager so the handle is closed deterministically.
        with open(path, encoding='utf-8') as f:
            return json.load(f)

    def _pct(part, total):
        """Return part/total as a percentage (1 decimal); 0.0 if total is 0."""
        return round(100 * part / max(total, 1), 1)

    def _fetch(record_type):
        """Return all CloudKit records of *record_type* keyed by recordName."""
        return {r['recordName']: r for r in ck.query_all(record_type, verbose=verbose)}

    print("\n" + "="*60)
    print("Orphan Records & Data Quality Report")
    print("="*60)

    # Load local data
    stadiums = _load_json(data_dir / 'stadiums_canonical.json', [])
    teams = _load_json(data_dir / 'teams_canonical.json', [])

    # Load games: per-file canonical directory wins; the single combined
    # file is only a fallback when that directory has no JSON files.
    canonical_games_dir = data_dir / 'canonical' / 'games'
    game_files = sorted(canonical_games_dir.glob('*.json')) if canonical_games_dir.exists() else []
    games = []
    if game_files:
        for games_file in game_files:
            with open(games_file, encoding='utf-8') as f:
                games.extend(json.load(f))
    else:
        games = _load_json(data_dir / 'games_canonical.json', [])

    stadium_aliases = _load_json(data_dir / 'stadium_aliases.json', [])
    team_aliases = _load_json(data_dir / 'team_aliases.json', [])

    print("\nLocal data loaded:")
    print(f" Stadiums: {len(stadiums)}")
    print(f" Teams: {len(teams)}")
    print(f" Games: {len(games)}")
    print(f" Stadium aliases: {len(stadium_aliases)}")
    print(f" Team aliases: {len(team_aliases)}")

    # Build local record sets. CloudKit record names are compared against
    # deterministic_uuid(<local id>), so the same derivation is applied here.
    local_stadium_ids = {deterministic_uuid(s.get('canonical_id', s.get('id', ''))) for s in stadiums}
    local_team_ids = {deterministic_uuid(t.get('canonical_id', '')) for t in teams}
    local_game_ids = {deterministic_uuid(g.get('canonical_id', g.get('id', ''))) for g in games}
    local_stadium_alias_ids = {deterministic_uuid(a.get('alias', '')) for a in stadium_aliases}
    local_team_alias_ids = {deterministic_uuid(a.get('alias', '')) for a in team_aliases}

    # Fetch CloudKit records
    print("\nFetching CloudKit records...")
    ck_stadiums = _fetch('Stadium')
    ck_teams = _fetch('Team')
    ck_games = _fetch('Game')
    ck_stadium_aliases = _fetch('StadiumAlias')
    ck_team_aliases = _fetch('TeamAlias')

    print("\nCloudKit records:")
    print(f" Stadiums: {len(ck_stadiums)}")
    print(f" Teams: {len(ck_teams)}")
    print(f" Games: {len(ck_games)}")
    print(f" Stadium aliases: {len(ck_stadium_aliases)}")
    print(f" Team aliases: {len(ck_team_aliases)}")

    # =========================================================================
    # Section 1: Orphan Listing
    # =========================================================================
    print("\n" + "-"*60)
    print("SECTION 1: Orphan Records (in CloudKit but not in local data)")
    print("-"*60)

    report = {
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'orphans': {},
        'completeness': {},
        'health_score': 0,
        'recommendations': [],
    }

    # (record type, local record names, CloudKit records by record name)
    orphan_sources = [
        ('Stadium', local_stadium_ids, ck_stadiums),
        ('Team', local_team_ids, ck_teams),
        ('Game', local_game_ids, ck_games),
        ('StadiumAlias', local_stadium_alias_ids, ck_stadium_aliases),
        ('TeamAlias', local_team_alias_ids, ck_team_aliases),
    ]

    total_orphans = 0
    for record_type, local_ids, ck_records in orphan_sources:
        orphan_ids = set(ck_records) - local_ids
        orphan_count = len(orphan_ids)
        total_orphans += orphan_count

        samples = []
        report['orphans'][record_type] = {
            'count': orphan_count,
            'sample': samples,
        }
        print(f"\n {record_type}: {orphan_count} orphan(s)")
        if orphan_count == 0:
            continue

        # Show first 10 (sorted so the sample is stable between runs)
        for record_name in sorted(orphan_ids)[:10]:
            fields = ck_records.get(record_name, {}).get('fields', {})
            # Try to get a human-readable identifier
            canonical_id = fields.get('canonicalId', {}).get('value', '')
            name = fields.get('name', {}).get('value', '')
            display = canonical_id or name or record_name
            print(f" - {display}")
            samples.append({
                'recordName': record_name,
                'canonicalId': canonical_id,
                'name': name,
            })
        if orphan_count > 10:
            print(f" ... and {orphan_count - 10} more")

    if total_orphans == 0:
        print("\n ✓ No orphan records found")

    # =========================================================================
    # Section 2: Data Completeness Metrics
    # =========================================================================
    print("\n" + "-"*60)
    print("SECTION 2: Data Completeness Metrics")
    print("-"*60)

    # Stadium completeness. Fields are tested for truthiness, so a value of
    # 0 / 0.0 / "" counts as missing.
    total_stadiums = len(stadiums)
    stadiums_with_coords = sum(1 for s in stadiums if s.get('latitude') and s.get('longitude'))
    stadiums_with_capacity = sum(1 for s in stadiums if s.get('capacity'))
    stadiums_with_year = sum(1 for s in stadiums if s.get('year_opened'))
    # `or ''` keeps an explicit null canonical_id from raising AttributeError.
    unknown_stadiums = sum(
        1 for s in stadiums
        if (s.get('canonical_id') or '').startswith('stadium_unknown_')
    )
    stadium_completeness = {
        'total': total_stadiums,
        'with_coordinates': stadiums_with_coords,
        'with_capacity': stadiums_with_capacity,
        'with_year_opened': stadiums_with_year,
        'unknown_stadiums': unknown_stadiums,
        'pct_coordinates': _pct(stadiums_with_coords, total_stadiums),
        'pct_capacity': _pct(stadiums_with_capacity, total_stadiums),
        'pct_year': _pct(stadiums_with_year, total_stadiums),
    }
    print(f"\n Stadiums ({total_stadiums} total):")
    print(f" With coordinates: {stadiums_with_coords} ({stadium_completeness['pct_coordinates']}%)")
    print(f" With capacity: {stadiums_with_capacity} ({stadium_completeness['pct_capacity']}%)")
    print(f" With year_opened: {stadiums_with_year} ({stadium_completeness['pct_year']}%)")
    print(f" Unknown stadiums: {unknown_stadiums}")

    # Team completeness
    stadium_ids = {s.get('canonical_id', s.get('id', '')) for s in stadiums}
    teams_with_stadium = sum(1 for t in teams if t.get('stadium_id') in stadium_ids)
    team_completeness = {
        'total': len(teams),
        'with_valid_stadium': teams_with_stadium,
        'pct_valid_stadium': _pct(teams_with_stadium, len(teams)),
    }
    print(f"\n Teams ({len(teams)} total):")
    print(f" With valid stadium ref: {teams_with_stadium} ({team_completeness['pct_valid_stadium']}%)")

    # Game completeness
    team_ids = {t.get('canonical_id', '') for t in teams}
    total_games = len(games)
    games_with_home = sum(1 for g in games if g.get('home_team_id') in team_ids)
    games_with_away = sum(1 for g in games if g.get('away_team_id') in team_ids)
    games_with_stadium = sum(1 for g in games if g.get('stadium_id') in stadium_ids)
    game_completeness = {
        'total': total_games,
        'with_home_team': games_with_home,
        'with_away_team': games_with_away,
        'with_stadium': games_with_stadium,
        'pct_home_team': _pct(games_with_home, total_games),
        'pct_away_team': _pct(games_with_away, total_games),
        'pct_stadium': _pct(games_with_stadium, total_games),
    }
    print(f"\n Games ({total_games} total):")
    print(f" With resolved home team: {games_with_home} ({game_completeness['pct_home_team']}%)")
    print(f" With resolved away team: {games_with_away} ({game_completeness['pct_away_team']}%)")
    print(f" With resolved stadium: {games_with_stadium} ({game_completeness['pct_stadium']}%)")

    report['completeness'] = {
        'stadiums': stadium_completeness,
        'teams': team_completeness,
        'games': game_completeness,
    }

    # =========================================================================
    # Section 3: Health Score
    # =========================================================================
    print("\n" + "-"*60)
    print("SECTION 3: Health Score")
    print("-"*60)

    # Calculate health score (0-100)
    # Factors:
    # - Orphan penalty: -2 points per orphan (max -30)
    # - Completeness: average of key metrics
    # - Unknown stadiums penalty: -1 per unknown (max -10)
    orphan_penalty = min(30, total_orphans * 2)
    unknown_penalty = min(10, unknown_stadiums)
    completeness_scores = [
        stadium_completeness['pct_coordinates'],
        stadium_completeness['pct_capacity'],
        team_completeness['pct_valid_stadium'],
        game_completeness['pct_home_team'],
        game_completeness['pct_away_team'],
        game_completeness['pct_stadium'],
    ]
    avg_completeness = sum(completeness_scores) / len(completeness_scores)
    # Round once, up front, so the status label, the printed score and the
    # exported report all agree (e.g. 89.97 must not print "90.0 ... GOOD").
    health_score = round(max(0, min(100, avg_completeness - orphan_penalty - unknown_penalty)), 1)
    report['health_score'] = health_score

    # Determine status
    if health_score >= 90:
        status = "✓ EXCELLENT"
    elif health_score >= 70:
        status = "○ GOOD"
    elif health_score >= 50:
        status = "△ FAIR"
    else:
        status = "✗ NEEDS ATTENTION"

    print(f"\n Health Score: {health_score:.1f}/100 {status}")
    print("\n Score breakdown:")
    print(f" Base completeness: {avg_completeness:.1f}")
    print(f" Orphan penalty: -{orphan_penalty}")
    print(f" Unknown stadium penalty: -{unknown_penalty}")

    # Recommendations
    recommendations = []
    if total_orphans > 0:
        recommendations.append(f"Delete {total_orphans} orphan records with --smart-sync --delete-orphans")
    if unknown_stadiums > 0:
        recommendations.append(f"Review {unknown_stadiums} unknown stadiums (stadium_unknown_*)")
    if stadium_completeness['pct_coordinates'] < 100:
        missing = total_stadiums - stadiums_with_coords
        recommendations.append(f"Add coordinates to {missing} stadium(s)")
    if stadium_completeness['pct_capacity'] < 100:
        missing = total_stadiums - stadiums_with_capacity
        recommendations.append(f"Add capacity to {missing} stadium(s)")
    if game_completeness['pct_home_team'] < 100 or game_completeness['pct_away_team'] < 100:
        recommendations.append("Review games with unresolved team references")
    if game_completeness['pct_stadium'] < 100:
        recommendations.append("Review games with unresolved stadium references")
    report['recommendations'] = recommendations

    if recommendations:
        print("\n Recommendations:")
        for rec in recommendations:
            print(f"{rec}")
    else:
        print("\n ✓ No recommendations - data is in great shape!")

    # =========================================================================
    # Summary
    # =========================================================================
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"\n Total orphans: {total_orphans}")
    print(f" Health score: {health_score:.1f}/100")
    print(f" Status: {status}")

    # Export to JSON if requested
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, default=str)
        print(f"\n Report exported to: {output_file}")

    return report
# Valid record types for individual record management.
# NOTE(review): presumably the TYPE argument of the single-record options
# (e.g. --get) is checked against this list — confirm at the call sites.
VALID_RECORD_TYPES = ['Stadium', 'Team', 'Game', 'LeagueStructure', 'TeamAlias', 'StadiumAlias']
@@ -1995,6 +2274,7 @@ def main():
p.add_argument('--verify', action='store_true', help='Verify CloudKit matches local data (quick: counts + spot-check)')
p.add_argument('--verify-deep', action='store_true', help='Verify CloudKit matches local data (deep: full field comparison)')
p.add_argument('--validate', action='store_true', help='Run comprehensive validation (local + CloudKit relationships + sync status)')
p.add_argument('--list-orphans', action='store_true', help='List orphan records in CloudKit (not in local data)')
p.add_argument('--output', type=str, metavar='FILE', help='Output file for validation report (JSON format)')
# Individual record management
p.add_argument('--get', nargs=2, metavar=('TYPE', 'ID'), help='Get a single record (e.g., --get Stadium stadium_nba_td_garden)')
@@ -2013,7 +2293,7 @@ def main():
args.stadiums_only, args.games_only, args.games_files, args.league_structure_only,
args.team_aliases_only, args.stadium_aliases_only, args.canonical_only,
args.delete_all, args.delete_only, args.dry_run, args.diff, args.smart_sync,
args.verify, args.verify_deep, args.validate, args.get, args.list, args.update_record, args.delete_record
args.verify, args.verify_deep, args.validate, args.list_orphans, args.get, args.list, args.update_record, args.delete_record
])
# Track selected game files (for option 4 or --games-files)
@@ -2066,6 +2346,8 @@ def main():
args.verify_deep = True
elif choice == 16: # Validate data
args.validate = True
elif choice == 17: # List orphan records
args.list_orphans = True
print(f"\n{'='*50}")
print(f"CloudKit Import {'(DRY RUN)' if args.dry_run else ''}")
@@ -2168,6 +2450,15 @@ def main():
validate_all(ck, args.data_dir, output_file=args.output, verbose=args.verbose)
return
# Handle list-orphans mode
if args.list_orphans:
if args.dry_run or not HAS_CRYPTO or not os.path.exists(args.key_file):
print("\nError: --list-orphans requires CloudKit connection (cannot use with --dry-run)")
return
ck = CloudKit(args.key_id, open(args.key_file, 'rb').read(), args.container, args.env)
list_orphans(ck, args.data_dir, output_file=args.output, verbose=args.verbose)
return
# Handle individual record operations
if args.get:
record_type, record_id = args.get