Files
SportstimeAPI/sportstime_parser/cli.py
Trey t 52d445bca4 feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:56:25 -06:00

1295 lines
46 KiB
Python

"""CLI subcommand definitions for sportstime-parser."""
import argparse
import sys
from typing import Optional
from .config import (
DEFAULT_SEASON,
CLOUDKIT_ENVIRONMENT,
SUPPORTED_SPORTS,
OUTPUT_DIR,
)
from .utils.logging import get_logger, set_verbose, log_success, log_failure
def _add_sport_argument(parser: argparse.ArgumentParser, verb: str) -> None:
    """Attach the shared positional ``sport`` argument (a sport code or 'all').

    Args:
        parser: Subcommand parser to extend.
        verb: Verb used in the help text (e.g. "scrape", "upload").
    """
    parser.add_argument(
        "sport",
        choices=SUPPORTED_SPORTS + ["all"],
        help=f"Sport to {verb} (or 'all' for all sports)",
    )


def _add_season_argument(parser: argparse.ArgumentParser) -> None:
    """Attach the shared ``--season`` option with the configured default."""
    parser.add_argument(
        "--season", "-s",
        type=int,
        default=DEFAULT_SEASON,
        help=f"Season start year (default: {DEFAULT_SEASON})",
    )


def _add_environment_argument(parser: argparse.ArgumentParser) -> None:
    """Attach the shared ``--environment`` option (development/production)."""
    parser.add_argument(
        "--environment", "-e",
        choices=["development", "production"],
        default=CLOUDKIT_ENVIRONMENT,
        help=f"CloudKit environment (default: {CLOUDKIT_ENVIRONMENT})",
    )


def create_parser() -> argparse.ArgumentParser:
    """Create the main argument parser with all subcommands.

    Shared arguments (sport, --season, --environment) are attached via the
    ``_add_*_argument`` helpers so each subcommand stays consistent.

    Returns:
        Configured ``argparse.ArgumentParser``; every subcommand sets a
        ``func`` default pointing at its ``cmd_*`` handler.
    """
    parser = argparse.ArgumentParser(
        prog="sportstime-parser",
        description="Sports data scraper and CloudKit uploader for SportsTime app",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  sportstime-parser scrape nba --season 2025
  sportstime-parser scrape all --season 2025
  sportstime-parser validate nba --season 2025
  sportstime-parser upload nba --season 2025
  sportstime-parser status
  sportstime-parser purge --environment development
  sportstime-parser count --environment development
  sportstime-parser upload-static --environment development
""",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose output",
    )
    subparsers = parser.add_subparsers(
        dest="command",
        title="commands",
        description="Available commands",
        metavar="COMMAND",
    )

    # Scrape subcommand
    scrape_parser = subparsers.add_parser(
        "scrape",
        help="Scrape game schedules, teams, and stadiums",
        description="Scrape sports data from multiple sources",
    )
    _add_sport_argument(scrape_parser, "scrape")
    _add_season_argument(scrape_parser)
    scrape_parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Parse and validate only, don't write output files",
    )
    scrape_parser.set_defaults(func=cmd_scrape)

    # Validate subcommand
    validate_parser = subparsers.add_parser(
        "validate",
        help="Run validation on existing scraped data",
        description="Validate scraped data and regenerate reports",
    )
    _add_sport_argument(validate_parser, "validate")
    _add_season_argument(validate_parser)
    validate_parser.set_defaults(func=cmd_validate)

    # Upload subcommand
    upload_parser = subparsers.add_parser(
        "upload",
        help="Upload scraped data to CloudKit",
        description="Upload data to CloudKit with resumable, diff-based updates",
    )
    _add_sport_argument(upload_parser, "upload")
    _add_season_argument(upload_parser)
    _add_environment_argument(upload_parser)
    upload_parser.add_argument(
        "--resume",
        action="store_true",
        help="Resume interrupted upload from last checkpoint",
    )
    upload_parser.set_defaults(func=cmd_upload)

    # Status subcommand
    status_parser = subparsers.add_parser(
        "status",
        help="Show current scrape and upload status",
        description="Display summary of scraped data and upload progress",
    )
    status_parser.set_defaults(func=cmd_status)

    # Retry subcommand
    retry_parser = subparsers.add_parser(
        "retry",
        help="Retry failed uploads",
        description="Retry records that failed during previous upload attempts",
    )
    _add_sport_argument(retry_parser, "retry")
    _add_season_argument(retry_parser)
    _add_environment_argument(retry_parser)
    retry_parser.add_argument(
        "--max-retries",
        type=int,
        default=3,
        help="Maximum retry attempts per record (default: 3)",
    )
    retry_parser.set_defaults(func=cmd_retry)

    # Clear subcommand
    clear_parser = subparsers.add_parser(
        "clear",
        help="Clear upload session state",
        description="Delete upload session state files to start fresh",
    )
    _add_sport_argument(clear_parser, "clear")
    _add_season_argument(clear_parser)
    _add_environment_argument(clear_parser)
    clear_parser.set_defaults(func=cmd_clear)

    # Purge subcommand (destructive)
    purge_parser = subparsers.add_parser(
        "purge",
        help="Delete all records from CloudKit (DESTRUCTIVE)",
        description="Delete ALL records from CloudKit. This is destructive and cannot be undone.",
    )
    _add_environment_argument(purge_parser)
    purge_parser.add_argument(
        "--yes", "-y",
        action="store_true",
        help="Skip confirmation prompt",
    )
    purge_parser.set_defaults(func=cmd_purge)

    # Count subcommand
    count_parser = subparsers.add_parser(
        "count",
        help="Count records in CloudKit by type",
        description="Display count of all record types in CloudKit",
    )
    _add_environment_argument(count_parser)
    count_parser.set_defaults(func=cmd_count)

    # Upload-static subcommand
    upload_static_parser = subparsers.add_parser(
        "upload-static",
        help="Upload static reference data to CloudKit",
        description="Upload league structure, team aliases, stadium aliases, and sports to CloudKit",
    )
    _add_environment_argument(upload_static_parser)
    upload_static_parser.set_defaults(func=cmd_upload_static)

    return parser
def get_scraper(sport: str, season: int):
"""Get the appropriate scraper for a sport.
Args:
sport: Sport code
season: Season start year
Returns:
Scraper instance
Raises:
NotImplementedError: If sport scraper is not yet implemented
"""
if sport == "nba":
from .scrapers.nba import create_nba_scraper
return create_nba_scraper(season)
elif sport == "mlb":
from .scrapers.mlb import create_mlb_scraper
return create_mlb_scraper(season)
elif sport == "nfl":
from .scrapers.nfl import create_nfl_scraper
return create_nfl_scraper(season)
elif sport == "nhl":
from .scrapers.nhl import create_nhl_scraper
return create_nhl_scraper(season)
elif sport == "mls":
from .scrapers.mls import create_mls_scraper
return create_mls_scraper(season)
elif sport == "wnba":
from .scrapers.wnba import create_wnba_scraper
return create_wnba_scraper(season)
elif sport == "nwsl":
from .scrapers.nwsl import create_nwsl_scraper
return create_nwsl_scraper(season)
else:
raise NotImplementedError(f"Scraper for {sport} not yet implemented")
def cmd_scrape(args: argparse.Namespace) -> int:
    """Execute the scrape command with canonical output format.

    For each requested sport: scrape, validate, convert to the canonical
    JSON shape, schema-check the result, and (unless --dry-run) write the
    output files plus a validation report.

    Args:
        args: Parsed CLI namespace (uses ``sport``, ``season``, ``dry_run``).

    Returns:
        Process exit code: 0 if every sport succeeded, 1 otherwise.
    """
    import json
    from .validators.report import generate_report, validate_games
    from .normalizers.timezone import get_stadium_timezone
    from .validators.schema import SchemaValidationError, validate_batch
    logger = get_logger()
    sports = SUPPORTED_SPORTS if args.sport == "all" else [args.sport]
    logger.info(f"Scraping {', '.join(sports)} for {args.season}-{args.season + 1} season")
    if args.dry_run:
        logger.info("Dry run mode - no files will be written")
    # Ensure output directory exists
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    success_count = 0
    failure_count = 0
    for sport in sports:
        logger.info(f"\n{'='*50}")
        logger.info(f"Scraping {sport.upper()}...")
        logger.info(f"{'='*50}")
        try:
            # Get scraper for this sport
            scraper = get_scraper(sport, args.season)
            # Scrape all data
            result = scraper.scrape_all()
            if not result.success:
                log_failure(f"{sport.upper()}: {result.error_message}")
                failure_count += 1
                continue
            # Validate games; merge validator findings with the scraper's
            # own review items so the report covers both.
            validation_issues = validate_games(result.games)
            all_review_items = result.review_items + validation_issues
            # Generate validation report
            report = generate_report(
                sport=sport,
                season=args.season,
                source=result.source,
                games=result.games,
                teams=result.teams,
                stadiums=result.stadiums,
                review_items=all_review_items,
            )
            # Log summary
            logger.info(f"Games: {report.summary.total_games}")
            logger.info(f"Teams: {len(result.teams)}")
            logger.info(f"Stadiums: {len(result.stadiums)}")
            logger.info(f"Coverage: {report.summary.game_coverage:.1f}%")
            logger.info(f"Review items: {report.summary.review_count}")
            if not args.dry_run:
                # Build mappings for canonical conversion:
                # stadium id -> timezone string used to localize game times
                stadium_timezone_map: dict[str, str] = {}
                for stadium in result.stadiums:
                    tz = get_stadium_timezone(stadium.state, stadium.timezone)
                    stadium_timezone_map[stadium.id] = tz
                # stadium id -> abbreviations of teams homed at that stadium
                stadium_team_abbrevs: dict[str, list[str]] = {}
                for team in result.teams:
                    if team.stadium_id:
                        if team.stadium_id not in stadium_team_abbrevs:
                            stadium_team_abbrevs[team.stadium_id] = []
                        stadium_team_abbrevs[team.stadium_id].append(team.abbreviation)
                # Convert to canonical format
                canonical_stadiums = [
                    s.to_canonical_dict(primary_team_abbrevs=stadium_team_abbrevs.get(s.id, []))
                    for s in result.stadiums
                ]
                canonical_teams = [t.to_canonical_dict() for t in result.teams]
                # Games with no known stadium fall back to Eastern time.
                canonical_games = [
                    g.to_canonical_dict(stadium_timezone=stadium_timezone_map.get(g.stadium_id, "America/New_York"))
                    for g in result.games
                ]
                # Validate canonical output (fail_fast=False collects every
                # error instead of stopping at the first one)
                stadium_errors = validate_batch(canonical_stadiums, "stadium", fail_fast=False)
                team_errors = validate_batch(canonical_teams, "team", fail_fast=False)
                game_errors = validate_batch(canonical_games, "game", fail_fast=False)
                if stadium_errors or team_errors or game_errors:
                    for idx, errors in stadium_errors:
                        for e in errors:
                            logger.error(f"Stadium {result.stadiums[idx].id}: {e}")
                    for idx, errors in team_errors:
                        for e in errors:
                            logger.error(f"Team {result.teams[idx].id}: {e}")
                    # Cap game-error output at 10 records to keep logs readable
                    for idx, errors in game_errors[:10]:
                        for e in errors:
                            logger.error(f"Game {result.games[idx].id}: {e}")
                    if len(game_errors) > 10:
                        logger.error(f"... and {len(game_errors) - 10} more game errors")
                    raise SchemaValidationError("canonical", ["Schema validation failed"])
                # Save canonical output files
                games_file = OUTPUT_DIR / f"games_{sport}_{args.season}.json"
                teams_file = OUTPUT_DIR / f"teams_{sport}.json"
                stadiums_file = OUTPUT_DIR / f"stadiums_{sport}.json"
                with open(games_file, "w", encoding="utf-8") as f:
                    json.dump(canonical_games, f, indent=2)
                with open(teams_file, "w", encoding="utf-8") as f:
                    json.dump(canonical_teams, f, indent=2)
                with open(stadiums_file, "w", encoding="utf-8") as f:
                    json.dump(canonical_stadiums, f, indent=2)
                # Save validation report
                report_path = report.save()
                logger.info(f"Saved games to: {games_file}")
                logger.info(f"Saved teams to: {teams_file}")
                logger.info(f"Saved stadiums to: {stadiums_file}")
                logger.info(f"Saved report to: {report_path}")
            log_success(f"{sport.upper()}: Scraped {result.game_count} games")
            success_count += 1
        except NotImplementedError as e:
            # Sport has no scraper yet - warn and keep going with the rest
            logger.warning(str(e))
            failure_count += 1
            continue
        except SchemaValidationError as e:
            log_failure(f"{sport.upper()}: {e}")
            failure_count += 1
            continue
        except Exception as e:
            log_failure(f"{sport.upper()}: {e}")
            logger.exception("Scraping failed")
            failure_count += 1
            continue
    # Final summary
    logger.info(f"\n{'='*50}")
    logger.info("SUMMARY")
    logger.info(f"{'='*50}")
    logger.info(f"Successful: {success_count}")
    logger.info(f"Failed: {failure_count}")
    return 0 if failure_count == 0 else 1
def cmd_validate(args: argparse.Namespace) -> int:
    """Execute the validate command.

    Re-runs game validation over previously scraped files and regenerates
    the per-sport validation report. Sports with no scraped games file are
    skipped with a warning.

    Args:
        args: Parsed CLI namespace (uses ``sport`` and ``season``).

    Returns:
        Always 0 (per-sport failures are logged, not fatal).
    """
    from .models.game import load_games
    from .models.team import load_teams
    from .models.stadium import load_stadiums
    from .validators.report import generate_report, validate_games

    logger = get_logger()
    selected = [args.sport] if args.sport != "all" else SUPPORTED_SPORTS
    logger.info(f"Validating {', '.join(selected)} for {args.season}-{args.season + 1} season")

    for sport in selected:
        logger.info(f"\nValidating {sport.upper()}...")

        # Locate previously scraped data for this sport.
        games_path = OUTPUT_DIR / f"games_{sport}_{args.season}.json"
        teams_path = OUTPUT_DIR / f"teams_{sport}.json"
        stadiums_path = OUTPUT_DIR / f"stadiums_{sport}.json"
        if not games_path.exists():
            logger.warning(f"No games file found: {games_path}")
            continue

        try:
            games = load_games(str(games_path))
            teams = load_teams(str(teams_path)) if teams_path.exists() else []
            stadiums = load_stadiums(str(stadiums_path)) if stadiums_path.exists() else []

            # Validate and regenerate the report from existing data.
            found_issues = validate_games(games)
            report = generate_report(
                sport=sport,
                season=args.season,
                source="existing",
                games=games,
                teams=teams,
                stadiums=stadiums,
                review_items=found_issues,
            )
            saved_to = report.save()

            for line in (
                f"Games: {report.summary.total_games}",
                f"Valid: {report.summary.valid_games}",
                f"Review items: {report.summary.review_count}",
                f"Saved report to: {saved_to}",
            ):
                logger.info(line)
            log_success(f"{sport.upper()}: Validation complete")
        except Exception as e:
            log_failure(f"{sport.upper()}: {e}")
            logger.exception("Validation failed")

    return 0
def cmd_upload(args: argparse.Namespace) -> int:
    """Execute the upload command.

    Diff-based CloudKit upload: fetches remote records, computes the
    create/update set against local scraped files, uploads only what
    changed, and tracks progress in a resumable session per sport.

    Args:
        args: Parsed CLI namespace (uses ``sport``, ``season``,
            ``environment``, ``resume``).

    Returns:
        Process exit code: 0 if every sport succeeded, 1 otherwise
        (auth/rate-limit errors abort immediately with 1).
    """
    from .models.game import load_games
    from .models.team import load_teams
    from .models.stadium import load_stadiums
    from .uploaders import (
        CloudKitClient,
        CloudKitError,
        CloudKitAuthError,
        CloudKitRateLimitError,
        RecordType,
        RecordDiffer,
        StateManager,
        game_to_cloudkit_record,
        team_to_cloudkit_record,
        stadium_to_cloudkit_record,
    )
    from .utils.progress import create_progress_bar
    logger = get_logger()
    sports = SUPPORTED_SPORTS if args.sport == "all" else [args.sport]
    logger.info(f"Uploading {', '.join(sports)} for {args.season}-{args.season + 1} season")
    logger.info(f"Environment: {args.environment}")
    # Initialize CloudKit client
    client = CloudKitClient(environment=args.environment)
    if not client.is_configured:
        log_failure("CloudKit not configured")
        logger.error(
            "Set CLOUDKIT_KEY_ID and CLOUDKIT_PRIVATE_KEY_PATH environment variables.\n"
            "Get credentials from Apple Developer Portal > Certificates, Identifiers & Profiles > Keys"
        )
        return 1
    # Initialize state manager
    state_manager = StateManager()
    differ = RecordDiffer()
    success_count = 0
    failure_count = 0
    for sport in sports:
        logger.info(f"\n{'='*50}")
        logger.info(f"Uploading {sport.upper()}...")
        logger.info(f"{'='*50}")
        try:
            # Load local data
            games_file = OUTPUT_DIR / f"games_{sport}_{args.season}.json"
            teams_file = OUTPUT_DIR / f"teams_{sport}.json"
            stadiums_file = OUTPUT_DIR / f"stadiums_{sport}.json"
            if not games_file.exists():
                logger.warning(f"No games file found: {games_file}")
                logger.warning("Run 'scrape' command first")
                failure_count += 1
                continue
            games = load_games(str(games_file))
            teams = load_teams(str(teams_file)) if teams_file.exists() else []
            stadiums = load_stadiums(str(stadiums_file)) if stadiums_file.exists() else []
            logger.info(f"Loaded {len(games)} games, {len(teams)} teams, {len(stadiums)} stadiums")
            # Fetch existing CloudKit records for diff
            logger.info("Fetching existing CloudKit records...")
            try:
                remote_games = client.fetch_all_records(RecordType.GAME)
                remote_teams = client.fetch_all_records(RecordType.TEAM)
                remote_stadiums = client.fetch_all_records(RecordType.STADIUM)
            except CloudKitAuthError as e:
                # Bad credentials affect every sport: abort the whole run
                log_failure(f"Authentication failed: {e}")
                return 1
            except CloudKitRateLimitError:
                log_failure("Rate limit exceeded - try again later")
                return 1
            except CloudKitError as e:
                # Other CloudKit errors: skip this sport, keep going
                log_failure(f"Failed to fetch records: {e}")
                failure_count += 1
                continue
            # Filter remote records to this sport/season (games are also
            # season-scoped; teams/stadiums are not)
            remote_games = [
                r for r in remote_games
                if r.get("fields", {}).get("sport", {}).get("value") == sport
                and r.get("fields", {}).get("season", {}).get("value") == args.season
            ]
            remote_teams = [
                r for r in remote_teams
                if r.get("fields", {}).get("sport", {}).get("value") == sport
            ]
            remote_stadiums = [
                r for r in remote_stadiums
                if r.get("fields", {}).get("sport", {}).get("value") == sport
            ]
            logger.info(f"Found {len(remote_games)} games, {len(remote_teams)} teams, {len(remote_stadiums)} stadiums in CloudKit")
            # Calculate diffs
            logger.info("Calculating changes...")
            game_diff = differ.diff_games(games, remote_games)
            team_diff = differ.diff_teams(teams, remote_teams)
            stadium_diff = differ.diff_stadiums(stadiums, remote_stadiums)
            total_creates = game_diff.create_count + team_diff.create_count + stadium_diff.create_count
            total_updates = game_diff.update_count + team_diff.update_count + stadium_diff.update_count
            total_unchanged = game_diff.unchanged_count + team_diff.unchanged_count + stadium_diff.unchanged_count
            logger.info(f"Creates: {total_creates}, Updates: {total_updates}, Unchanged: {total_unchanged}")
            if total_creates == 0 and total_updates == 0:
                log_success(f"{sport.upper()}: Already up to date")
                success_count += 1
                continue
            # Prepare records for upload
            all_records = []
            all_records.extend(game_diff.get_records_to_upload())
            all_records.extend(team_diff.get_records_to_upload())
            all_records.extend(stadium_diff.get_records_to_upload())
            # Create or resume upload session
            record_info = [(r.record_name, r.record_type.value) for r in all_records]
            session = state_manager.get_session_or_create(
                sport=sport,
                season=args.season,
                environment=args.environment,
                record_names=record_info,
                resume=args.resume,
            )
            if args.resume:
                pending = session.get_pending_records()
                logger.info(f"Resuming: {len(pending)} records pending")
                # Filter to only pending records
                pending_set = set(pending)
                all_records = [r for r in all_records if r.record_name in pending_set]
            # Upload records with progress
            logger.info(f"Uploading {len(all_records)} records...")
            with create_progress_bar(total=len(all_records), description="Uploading") as progress:
                batch_result = client.save_records(all_records)
                # Update session state per record so a later --resume
                # knows exactly what remains
                for op_result in batch_result.successful:
                    session.mark_uploaded(op_result.record_name, op_result.record_change_tag)
                    progress.advance()
                for op_result in batch_result.failed:
                    session.mark_failed(op_result.record_name, op_result.error_message or "Unknown error")
                    progress.advance()
            # Save session state
            state_manager.save_session(session)
            # Report results
            logger.info(f"Uploaded: {batch_result.success_count}")
            logger.info(f"Failed: {batch_result.failure_count}")
            if batch_result.failure_count > 0:
                log_failure(f"{sport.upper()}: {batch_result.failure_count} records failed")
                for op_result in batch_result.failed[:5]:  # Show first 5 failures
                    logger.error(f" {op_result.record_name}: {op_result.error_message}")
                if batch_result.failure_count > 5:
                    logger.error(f" ... and {batch_result.failure_count - 5} more")
                failure_count += 1
            else:
                log_success(f"{sport.upper()}: Uploaded {batch_result.success_count} records")
                # Clear session on complete success
                state_manager.delete_session(sport, args.season, args.environment)
                success_count += 1
        except Exception as e:
            log_failure(f"{sport.upper()}: {e}")
            logger.exception("Upload failed")
            failure_count += 1
            continue
    # Final summary
    logger.info(f"\n{'='*50}")
    logger.info("SUMMARY")
    logger.info(f"{'='*50}")
    logger.info(f"Successful: {success_count}")
    logger.info(f"Failed: {failure_count}")
    return 0 if failure_count == 0 else 1
def cmd_status(args: argparse.Namespace) -> int:
    """Execute the status command.

    Prints three read-only sections: scraped-data coverage per sport,
    pending upload sessions, and CloudKit credential configuration.

    Args:
        args: Parsed CLI namespace (no command-specific options used).

    Returns:
        Always 0.
    """
    from datetime import datetime
    from .config import STATE_DIR, EXPECTED_GAME_COUNTS
    from .uploaders import StateManager
    logger = get_logger()
    logger.info("SportsTime Parser Status")
    logger.info("=" * 50)
    logger.info("")
    # Check for scraped data
    logger.info("[bold]Scraped Data[/bold]")
    logger.info("-" * 40)
    total_games = 0
    scraped_sports = 0
    for sport in SUPPORTED_SPORTS:
        games_file = OUTPUT_DIR / f"games_{sport}_{DEFAULT_SEASON}.json"
        teams_file = OUTPUT_DIR / f"teams_{sport}.json"
        stadiums_file = OUTPUT_DIR / f"stadiums_{sport}.json"
        if games_file.exists():
            # Lazy imports: only needed when at least one sport is scraped
            from .models.game import load_games
            from .models.team import load_teams
            from .models.stadium import load_stadiums
            try:
                games = load_games(str(games_file))
                teams = load_teams(str(teams_file)) if teams_file.exists() else []
                stadiums = load_stadiums(str(stadiums_file)) if stadiums_file.exists() else []
                game_count = len(games)
                expected = EXPECTED_GAME_COUNTS.get(sport, 0)
                coverage = (game_count / expected * 100) if expected > 0 else 0
                # Format with coverage indicator (>=95% good, >=80% partial)
                if coverage >= 95:
                    status = "[green]✓[/green]"
                elif coverage >= 80:
                    status = "[yellow]~[/yellow]"
                else:
                    status = "[red]![/red]"
                logger.info(
                    f" {status} {sport.upper():6} {game_count:5} games, "
                    f"{len(teams):2} teams, {len(stadiums):2} stadiums "
                    f"({coverage:.0f}% coverage)"
                )
                total_games += game_count
                scraped_sports += 1
            except Exception as e:
                logger.info(f" [red]✗[/red] {sport.upper():6} Error loading: {e}")
        else:
            logger.info(f" [dim]-[/dim] {sport.upper():6} Not scraped")
    logger.info("-" * 40)
    logger.info(f" Total: {total_games} games across {scraped_sports} sports")
    logger.info("")
    # Check for upload sessions
    logger.info("[bold]Upload Sessions[/bold]")
    logger.info("-" * 40)
    state_manager = StateManager()
    sessions = state_manager.list_sessions()
    if sessions:
        for session in sessions:
            sport = session["sport"].upper()
            season = session["season"]
            env = session["environment"]
            progress = session["progress"]
            percent = session["progress_percent"]
            status = session["status"]
            failed = session["failed_count"]
            if status == "complete":
                status_icon = "[green]✓[/green]"
            elif failed > 0:
                status_icon = "[yellow]![/yellow]"
            else:
                status_icon = "[blue]→[/blue]"
            logger.info(
                f" {status_icon} {sport} {season} ({env}): "
                f"{progress} ({percent})"
            )
            if failed > 0:
                logger.info(f" [yellow]⚠ {failed} failed records[/yellow]")
            # Show last updated time as a human-friendly age
            try:
                last_updated = datetime.fromisoformat(session["last_updated"])
                # NOTE(review): utcnow() is naive and deprecated in 3.12+;
                # assumes last_updated was stored as naive UTC — confirm
                # against StateManager's timestamp format.
                age = datetime.utcnow() - last_updated
                if age.days > 0:
                    age_str = f"{age.days} days ago"
                elif age.seconds > 3600:
                    age_str = f"{age.seconds // 3600} hours ago"
                elif age.seconds > 60:
                    age_str = f"{age.seconds // 60} minutes ago"
                else:
                    age_str = "just now"
                logger.info(f" Last updated: {age_str}")
            except (ValueError, KeyError):
                # Missing/unparseable timestamp: silently omit the age line
                pass
    else:
        logger.info(" No upload sessions found")
    logger.info("")
    # CloudKit configuration status
    logger.info("[bold]CloudKit Configuration[/bold]")
    logger.info("-" * 40)
    import os
    key_id = os.environ.get("CLOUDKIT_KEY_ID")
    key_path = os.environ.get("CLOUDKIT_PRIVATE_KEY_PATH")
    key_content = os.environ.get("CLOUDKIT_PRIVATE_KEY")
    if key_id:
        # Only show a prefix of the key ID, not the full credential
        logger.info(f" [green]✓[/green] CLOUDKIT_KEY_ID: {key_id[:8]}...")
    else:
        logger.info(" [red]✗[/red] CLOUDKIT_KEY_ID: Not set")
    if key_path:
        from pathlib import Path
        if Path(key_path).exists():
            logger.info(f" [green]✓[/green] CLOUDKIT_PRIVATE_KEY_PATH: {key_path}")
        else:
            logger.info(f" [red]✗[/red] CLOUDKIT_PRIVATE_KEY_PATH: File not found: {key_path}")
    elif key_content:
        logger.info(" [green]✓[/green] CLOUDKIT_PRIVATE_KEY: Set (inline)")
    else:
        logger.info(" [red]✗[/red] CLOUDKIT_PRIVATE_KEY: Not set")
    logger.info("")
    return 0
def cmd_retry(args: argparse.Namespace) -> int:
    """Execute the retry command for failed uploads.

    Re-uploads records marked failed in a previous upload session, up to
    ``--max-retries`` attempts per record. Session state is saved after
    each sport so progress survives interruption.

    Args:
        args: Parsed CLI namespace (uses ``sport``, ``season``,
            ``environment``, ``max_retries``).

    Returns:
        Process exit code: 0 if no retried record failed, 1 otherwise
        (auth/rate-limit errors abort immediately with 1).
    """
    from .models.game import load_games
    from .models.team import load_teams
    from .models.stadium import load_stadiums
    from .uploaders import (
        CloudKitClient,
        CloudKitError,
        CloudKitAuthError,
        CloudKitRateLimitError,
        StateManager,
        game_to_cloudkit_record,
        team_to_cloudkit_record,
        stadium_to_cloudkit_record,
    )
    from .utils.progress import create_progress_bar
    logger = get_logger()
    sports = SUPPORTED_SPORTS if args.sport == "all" else [args.sport]
    logger.info(f"Retrying failed uploads for {', '.join(sports)}")
    logger.info(f"Environment: {args.environment}")
    logger.info(f"Max retries per record: {args.max_retries}")
    # Initialize CloudKit client
    client = CloudKitClient(environment=args.environment)
    if not client.is_configured:
        log_failure("CloudKit not configured")
        return 1
    # Initialize state manager
    state_manager = StateManager()
    total_retried = 0
    total_succeeded = 0
    total_failed = 0
    for sport in sports:
        # Load existing session; nothing to retry without one
        session = state_manager.load_session(sport, args.season, args.environment)
        if session is None:
            logger.info(f"{sport.upper()}: No upload session found")
            continue
        # Get records eligible for retry (still under the retry cap)
        retryable = session.get_retryable_records(max_retries=args.max_retries)
        if not retryable:
            failed_count = session.failed_count
            if failed_count > 0:
                logger.info(f"{sport.upper()}: {failed_count} failed records exceeded max retries")
            else:
                logger.info(f"{sport.upper()}: No failed records to retry")
            continue
        logger.info(f"{sport.upper()}: Retrying {len(retryable)} failed records...")
        # Load local data to get the records
        games_file = OUTPUT_DIR / f"games_{sport}_{args.season}.json"
        teams_file = OUTPUT_DIR / f"teams_{sport}.json"
        stadiums_file = OUTPUT_DIR / f"stadiums_{sport}.json"
        if not games_file.exists():
            logger.warning(f"No games file found: {games_file}")
            continue
        games = load_games(str(games_file))
        teams = load_teams(str(teams_file)) if teams_file.exists() else []
        stadiums = load_stadiums(str(stadiums_file)) if stadiums_file.exists() else []
        # Build record lookup: re-convert only the entities whose ids are
        # in the retryable set
        records_to_retry = []
        retryable_set = set(retryable)
        for game in games:
            if game.id in retryable_set:
                records_to_retry.append(game_to_cloudkit_record(game))
        for team in teams:
            if team.id in retryable_set:
                records_to_retry.append(team_to_cloudkit_record(team))
        for stadium in stadiums:
            if stadium.id in retryable_set:
                records_to_retry.append(stadium_to_cloudkit_record(stadium))
        if not records_to_retry:
            logger.warning(f"{sport.upper()}: Could not find records for retry")
            continue
        # Mark as pending for retry so session state reflects the attempt
        for record_name in retryable:
            session.mark_pending(record_name)
        # Retry upload
        try:
            with create_progress_bar(total=len(records_to_retry), description="Retrying") as progress:
                batch_result = client.save_records(records_to_retry)
                for op_result in batch_result.successful:
                    session.mark_uploaded(op_result.record_name, op_result.record_change_tag)
                    progress.advance()
                    total_succeeded += 1
                for op_result in batch_result.failed:
                    session.mark_failed(op_result.record_name, op_result.error_message or "Unknown error")
                    progress.advance()
                    total_failed += 1
            state_manager.save_session(session)
            total_retried += len(records_to_retry)
            if batch_result.failure_count > 0:
                log_failure(f"{sport.upper()}: {batch_result.failure_count} still failing")
            else:
                log_success(f"{sport.upper()}: All {batch_result.success_count} retries succeeded")
            # Clear session if all complete
            if session.is_complete:
                state_manager.delete_session(sport, args.season, args.environment)
        except CloudKitAuthError as e:
            # Credentials are broken for every sport: abort entirely
            log_failure(f"Authentication failed: {e}")
            return 1
        except CloudKitRateLimitError:
            # Persist progress before bailing so a later retry can resume
            log_failure("Rate limit exceeded - try again later")
            state_manager.save_session(session)
            return 1
        except CloudKitError as e:
            # Other CloudKit errors: save progress, move to the next sport
            log_failure(f"Upload error: {e}")
            state_manager.save_session(session)
            continue
    # Summary
    logger.info(f"\n{'='*50}")
    logger.info("RETRY SUMMARY")
    logger.info(f"{'='*50}")
    logger.info(f"Retried: {total_retried}")
    logger.info(f"Succeeded: {total_succeeded}")
    logger.info(f"Failed: {total_failed}")
    return 0 if total_failed == 0 else 1
def cmd_clear(args: argparse.Namespace) -> int:
    """Execute the clear command to delete upload state.

    Removes the persisted upload-session file for each selected sport so
    the next upload starts from scratch.

    Args:
        args: Parsed CLI namespace (uses ``sport``, ``season``,
            ``environment``).

    Returns:
        Always 0.
    """
    from .uploaders import StateManager

    logger = get_logger()
    targets = [args.sport] if args.sport != "all" else SUPPORTED_SPORTS
    logger.info(f"Clearing upload state for {', '.join(targets)}")

    manager = StateManager()
    removed = 0
    for sport in targets:
        label = f"{sport.upper()} {args.season} ({args.environment})"
        deleted = manager.delete_session(sport, args.season, args.environment)
        if deleted:
            removed += 1
            logger.info(f" [green]✓[/green] Cleared {label}")
        else:
            logger.info(f" [dim]-[/dim] No session for {label}")

    logger.info(f"\nCleared {removed} session(s)")
    return 0
def cmd_purge(args: argparse.Namespace) -> int:
    """Execute the purge command to delete all CloudKit records.

    Deletes every record of every known type from the selected CloudKit
    environment. Requires an interactive confirmation unless ``--yes`` is
    passed.

    Args:
        args: Parsed CLI namespace (uses ``environment``, ``yes``).

    Returns:
        0 when every deletion succeeded, 1 on any failure or abort.
    """
    from .uploaders.cloudkit import CloudKitClient, RecordType

    logger = get_logger()

    # Bail out early if credentials are missing.
    client = CloudKitClient(environment=args.environment)
    if not client.is_configured:
        logger.error("CloudKit not configured. Check CLOUDKIT_KEY_ID and private key.")
        return 1

    # Interactive confirmation guard (skipped with --yes).
    if not args.yes:
        logger.warning(f"[bold red]WARNING: This will delete ALL records from CloudKit ({args.environment})![/bold red]")
        logger.warning("This action cannot be undone.")
        logger.info("")
        expected = f"DELETE {args.environment.upper()}"
        if input(f"Type '{expected}' to confirm: ") != expected:
            logger.info("Aborted.")
            return 1

    logger.info(f"Purging all records from CloudKit ({args.environment})...")
    logger.info("")

    deleted_total = 0
    failed_total = 0
    for rtype in (
        RecordType.GAME,
        RecordType.TEAM,
        RecordType.STADIUM,
        RecordType.TEAM_ALIAS,
        RecordType.STADIUM_ALIAS,
        RecordType.SPORT,
        RecordType.LEAGUE_STRUCTURE,
    ):
        logger.info(f"Fetching {rtype.value} records...")
        try:
            records = client.fetch_all_records(rtype)
        except Exception as e:
            logger.error(f" Failed to fetch: {e}")
            continue
        if not records:
            logger.info(f" No {rtype.value} records found")
            continue

        logger.info(f" Deleting {len(records)} {rtype.value} records...")
        try:
            outcome = client.delete_records(rtype, records)
        except Exception as e:
            # Count the whole batch as failed if the delete call itself blew up.
            logger.error(f" Failed to delete: {e}")
            failed_total += len(records)
        else:
            deleted_total += outcome.success_count
            failed_total += outcome.failure_count
            logger.info(f" [green]✓[/green] Deleted: {outcome.success_count}, Failed: {outcome.failure_count}")

    logger.info("")
    logger.info(f"{'='*50}")
    logger.info(f"Total deleted: {deleted_total}")
    logger.info(f"Total failed: {failed_total}")
    return 0 if failed_total == 0 else 1
def cmd_upload_static(args: argparse.Namespace) -> int:
    """Execute the upload-static command to upload reference data to CloudKit.

    Syncs four categories of static reference data: Sports (hardcoded below,
    since there is no sports.json on disk), League Structures, Team Aliases,
    and Stadium Aliases (each loaded from a JSON file under SCRIPTS_DIR).
    For each category only records that differ from what CloudKit already
    holds are uploaded.

    Args:
        args: Parsed CLI arguments; reads ``args.environment``.

    Returns:
        1 if CloudKit is unconfigured or any record failed to upload,
        0 otherwise.
    """
    import json
    from .uploaders.cloudkit import CloudKitClient, RecordType
    from .uploaders.diff import RecordDiffer
    from .models.aliases import TeamAlias, StadiumAlias
    from .models.sport import Sport, LeagueStructure, LeagueStructureType
    from .config import SCRIPTS_DIR

    logger = get_logger()

    # Check CloudKit configuration before doing any work.
    client = CloudKitClient(environment=args.environment)
    if not client.is_configured:
        logger.error("CloudKit not configured. Check CLOUDKIT_KEY_ID and private key.")
        return 1

    logger.info(f"Uploading static reference data to CloudKit ({args.environment})")
    logger.info(f"{'='*50}")

    differ = RecordDiffer()
    total_uploaded = 0
    total_failed = 0

    def _sync(label: str, record_type, local_records, diff_fn,
              show_count: bool = True) -> tuple[int, int]:
        """Diff local records against CloudKit, upload changes; return (uploaded, failed).

        show_count controls whether the "No changes" line includes the
        unchanged-record count (the Sports category historically omits it).
        """
        try:
            remote = client.fetch_all_records(record_type)
        except Exception:
            # Best-effort: an unreadable remote is treated as empty so all
            # local records get (re-)uploaded rather than aborting the sync.
            remote = []
        records_to_upload = diff_fn(local_records, remote).get_records_to_upload()
        if not records_to_upload:
            suffix = f" ({len(local_records)} unchanged)" if show_count else ""
            logger.info(f"  [dim]-[/dim] {label}: No changes{suffix}")
            return 0, 0
        result = client.save_records(records_to_upload)
        logger.info(f"  [green]✓[/green] {label}: {result.success_count} uploaded, {result.failure_count} failed")
        return result.success_count, result.failure_count

    # Define sports (hardcoded since there's no sports.json)
    sports = [
        Sport(id="MLB", abbreviation="MLB", display_name="Major League Baseball",
              icon_name="baseball.fill", color_hex="#002D72", season_start_month=3, season_end_month=11),
        Sport(id="NBA", abbreviation="NBA", display_name="National Basketball Association",
              icon_name="basketball.fill", color_hex="#1D428A", season_start_month=10, season_end_month=6),
        Sport(id="NFL", abbreviation="NFL", display_name="National Football League",
              icon_name="football.fill", color_hex="#013369", season_start_month=9, season_end_month=2),
        Sport(id="NHL", abbreviation="NHL", display_name="National Hockey League",
              icon_name="hockey.puck.fill", color_hex="#000000", season_start_month=10, season_end_month=6),
        Sport(id="MLS", abbreviation="MLS", display_name="Major League Soccer",
              icon_name="soccerball", color_hex="#80A63A", season_start_month=2, season_end_month=11),
        Sport(id="WNBA", abbreviation="WNBA", display_name="Women's National Basketball Association",
              icon_name="basketball.fill", color_hex="#FF6600", season_start_month=5, season_end_month=10),
        Sport(id="NWSL", abbreviation="NWSL", display_name="National Women's Soccer League",
              icon_name="soccerball", color_hex="#003087", season_start_month=3, season_end_month=11),
    ]
    logger.info("Uploading Sports...")
    uploaded, failed = _sync("Sports", RecordType.SPORT, sports,
                             differ.diff_sports, show_count=False)
    total_uploaded += uploaded
    total_failed += failed

    # Load and upload League Structures (file is optional).
    logger.info("Uploading League Structures...")
    league_structure_file = SCRIPTS_DIR / "league_structure.json"
    if league_structure_file.exists():
        with open(league_structure_file, "r") as f:
            data = json.load(f)
        structures = []
        for d in data:
            # Handle "type" vs "structure_type" field name
            structure_type = d.get("structure_type") or d.get("type")
            structures.append(LeagueStructure(
                id=d["id"],
                sport=d["sport"],
                structure_type=LeagueStructureType(structure_type),
                name=d["name"],
                abbreviation=d.get("abbreviation"),
                parent_id=d.get("parent_id"),
                display_order=d.get("display_order", 0),
            ))
        uploaded, failed = _sync("League Structures", RecordType.LEAGUE_STRUCTURE,
                                 structures, differ.diff_league_structures)
        total_uploaded += uploaded
        total_failed += failed
    else:
        logger.warning("  [yellow]![/yellow] league_structure.json not found")

    # Load and upload Team Aliases (file is optional).
    logger.info("Uploading Team Aliases...")
    team_aliases_file = SCRIPTS_DIR / "team_aliases.json"
    if team_aliases_file.exists():
        with open(team_aliases_file, "r") as f:
            aliases = [TeamAlias.from_dict(d) for d in json.load(f)]
        uploaded, failed = _sync("Team Aliases", RecordType.TEAM_ALIAS,
                                 aliases, differ.diff_team_aliases)
        total_uploaded += uploaded
        total_failed += failed
    else:
        logger.warning("  [yellow]![/yellow] team_aliases.json not found")

    # Load and upload Stadium Aliases (file is optional).
    logger.info("Uploading Stadium Aliases...")
    stadium_aliases_file = SCRIPTS_DIR / "stadium_aliases.json"
    if stadium_aliases_file.exists():
        with open(stadium_aliases_file, "r") as f:
            aliases = [StadiumAlias.from_dict(d) for d in json.load(f)]
        uploaded, failed = _sync("Stadium Aliases", RecordType.STADIUM_ALIAS,
                                 aliases, differ.diff_stadium_aliases)
        total_uploaded += uploaded
        total_failed += failed
    else:
        logger.warning("  [yellow]![/yellow] stadium_aliases.json not found")

    logger.info(f"{'='*50}")
    logger.info(f"Total uploaded: {total_uploaded}")
    logger.info(f"Total failed: {total_failed}")
    return 0 if total_failed == 0 else 1
def cmd_count(args: argparse.Namespace) -> int:
    """Execute the count command to show CloudKit record counts.

    Fetches every known record type from CloudKit and prints a per-type
    count plus a grand total. Record types that cannot be fetched
    (typically because they lack a QUERYABLE index) are listed at the end
    with remediation advice rather than aborting the command.

    Args:
        args: Parsed CLI arguments; reads ``args.environment``.

    Returns:
        1 if CloudKit is unconfigured, 0 otherwise (unqueryable types are
        reported as warnings, not failures).
    """
    from .uploaders.cloudkit import CloudKitClient, RecordType

    logger = get_logger()

    # Check CloudKit configuration before doing any work.
    client = CloudKitClient(environment=args.environment)
    if not client.is_configured:
        logger.error("CloudKit not configured. Check CLOUDKIT_KEY_ID and private key.")
        return 1

    logger.info(f"CloudKit record counts ({args.environment})")
    logger.info(f"{'='*50}")

    record_types = [
        RecordType.GAME,
        RecordType.TEAM,
        RecordType.STADIUM,
        RecordType.TEAM_ALIAS,
        RecordType.STADIUM_ALIAS,
        RecordType.SPORT,
        RecordType.LEAGUE_STRUCTURE,
    ]
    total = 0
    not_queryable: list[str] = []
    for record_type in record_types:
        try:
            count = len(client.fetch_all_records(record_type))
        except Exception:
            # Most likely cause: missing QUERYABLE index in CloudKit Dashboard.
            logger.error(f"  {record_type.value:<20} [red]Not queryable[/red]")
            not_queryable.append(record_type.value)
        else:
            total += count
            logger.info(f"  {record_type.value:<20} {count:>6}")

    logger.info(f"{'='*50}")
    logger.info(f"  {'Total':<20} {total:>6}")
    if not_queryable:
        logger.info("")
        logger.warning(f"[yellow]Records not queryable: {', '.join(not_queryable)}[/yellow]")
        logger.warning("[yellow]Enable QUERYABLE index in CloudKit Dashboard[/yellow]")
    return 0
def run_cli(argv: Optional[list[str]] = None) -> int:
    """Parse command-line arguments and dispatch to the chosen subcommand.

    Args:
        argv: Argument vector to parse; ``None`` means use ``sys.argv``.

    Returns:
        The exit code of the selected subcommand, or 1 when no subcommand
        was supplied (after printing the help text).
    """
    parser = create_parser()
    opts = parser.parse_args(argv)

    if opts.verbose:
        set_verbose(True)

    if opts.command is not None:
        return opts.func(opts)

    # No subcommand given: show usage and signal failure.
    parser.print_help()
    return 1