feat(scripts): rewrite parser as modular Python CLI
Replace monolithic scraping scripts with sportstime_parser package: - Multi-source scrapers with automatic fallback for 7 sports - Canonical ID generation for games, teams, and stadiums - Fuzzy matching with configurable thresholds for name resolution - CloudKit Web Services uploader with JWT auth, diff-based updates - Resumable uploads with checkpoint state persistence - Validation reports with manual review items and suggested matches - Comprehensive test suite (249 tests) CLI: sportstime-parser scrape|validate|upload|status|retry|clear Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
914
Scripts/sportstime_parser/cli.py
Normal file
914
Scripts/sportstime_parser/cli.py
Normal file
@@ -0,0 +1,914 @@
|
||||
"""CLI subcommand definitions for sportstime-parser."""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from .config import (
|
||||
DEFAULT_SEASON,
|
||||
CLOUDKIT_ENVIRONMENT,
|
||||
SUPPORTED_SPORTS,
|
||||
OUTPUT_DIR,
|
||||
)
|
||||
from .utils.logging import get_logger, set_verbose, log_success, log_failure
|
||||
|
||||
|
||||
def _add_sport_argument(parser: argparse.ArgumentParser, verb: str) -> None:
    """Attach the positional ``sport`` argument shared by most subcommands.

    Args:
        parser: Subcommand parser to extend.
        verb: Verb interpolated into the help text ("scrape", "upload", ...).
    """
    parser.add_argument(
        "sport",
        choices=SUPPORTED_SPORTS + ["all"],
        help=f"Sport to {verb} (or 'all' for all sports)",
    )


def _add_season_option(parser: argparse.ArgumentParser) -> None:
    """Attach the shared ``--season`` / ``-s`` option (season start year)."""
    parser.add_argument(
        "--season", "-s",
        type=int,
        default=DEFAULT_SEASON,
        help=f"Season start year (default: {DEFAULT_SEASON})",
    )


def _add_environment_option(parser: argparse.ArgumentParser) -> None:
    """Attach the shared ``--environment`` / ``-e`` CloudKit target option."""
    parser.add_argument(
        "--environment", "-e",
        choices=["development", "production"],
        default=CLOUDKIT_ENVIRONMENT,
        help=f"CloudKit environment (default: {CLOUDKIT_ENVIRONMENT})",
    )


def create_parser() -> argparse.ArgumentParser:
    """Create the main argument parser with all subcommands.

    Each subcommand registers its handler via ``set_defaults(func=...)``,
    so ``run_cli`` can dispatch with ``args.func(args)``.

    Returns:
        Fully configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        prog="sportstime-parser",
        description="Sports data scraper and CloudKit uploader for SportsTime app",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  sportstime-parser scrape nba --season 2025
  sportstime-parser scrape all --season 2025
  sportstime-parser validate nba --season 2025
  sportstime-parser upload nba --season 2025
  sportstime-parser status
""",
    )

    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose output",
    )

    subparsers = parser.add_subparsers(
        dest="command",
        title="commands",
        description="Available commands",
        metavar="COMMAND",
    )

    # Scrape subcommand
    scrape_parser = subparsers.add_parser(
        "scrape",
        help="Scrape game schedules, teams, and stadiums",
        description="Scrape sports data from multiple sources",
    )
    _add_sport_argument(scrape_parser, "scrape")
    _add_season_option(scrape_parser)
    scrape_parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Parse and validate only, don't write output files",
    )
    scrape_parser.set_defaults(func=cmd_scrape)

    # Validate subcommand
    validate_parser = subparsers.add_parser(
        "validate",
        help="Run validation on existing scraped data",
        description="Validate scraped data and regenerate reports",
    )
    _add_sport_argument(validate_parser, "validate")
    _add_season_option(validate_parser)
    validate_parser.set_defaults(func=cmd_validate)

    # Upload subcommand
    upload_parser = subparsers.add_parser(
        "upload",
        help="Upload scraped data to CloudKit",
        description="Upload data to CloudKit with resumable, diff-based updates",
    )
    _add_sport_argument(upload_parser, "upload")
    _add_season_option(upload_parser)
    _add_environment_option(upload_parser)
    upload_parser.add_argument(
        "--resume",
        action="store_true",
        help="Resume interrupted upload from last checkpoint",
    )
    upload_parser.set_defaults(func=cmd_upload)

    # Status subcommand (takes no options)
    status_parser = subparsers.add_parser(
        "status",
        help="Show current scrape and upload status",
        description="Display summary of scraped data and upload progress",
    )
    status_parser.set_defaults(func=cmd_status)

    # Retry subcommand
    retry_parser = subparsers.add_parser(
        "retry",
        help="Retry failed uploads",
        description="Retry records that failed during previous upload attempts",
    )
    _add_sport_argument(retry_parser, "retry")
    _add_season_option(retry_parser)
    _add_environment_option(retry_parser)
    retry_parser.add_argument(
        "--max-retries",
        type=int,
        default=3,
        help="Maximum retry attempts per record (default: 3)",
    )
    retry_parser.set_defaults(func=cmd_retry)

    # Clear subcommand
    clear_parser = subparsers.add_parser(
        "clear",
        help="Clear upload session state",
        description="Delete upload session state files to start fresh",
    )
    _add_sport_argument(clear_parser, "clear")
    _add_season_option(clear_parser)
    _add_environment_option(clear_parser)
    clear_parser.set_defaults(func=cmd_clear)

    return parser
|
||||
|
||||
|
||||
#: Maps a sport code to (submodule under .scrapers, factory function name).
#: Adding a new sport only requires a new entry here.
_SCRAPER_FACTORIES = {
    "nba": ("nba", "create_nba_scraper"),
    "mlb": ("mlb", "create_mlb_scraper"),
    "nfl": ("nfl", "create_nfl_scraper"),
    "nhl": ("nhl", "create_nhl_scraper"),
    "mls": ("mls", "create_mls_scraper"),
    "wnba": ("wnba", "create_wnba_scraper"),
    "nwsl": ("nwsl", "create_nwsl_scraper"),
}


def get_scraper(sport: str, season: int):
    """Get the appropriate scraper for a sport.

    The scraper module is imported lazily (only when its sport is actually
    requested), preserving the original branch-per-sport import behavior.

    Args:
        sport: Sport code
        season: Season start year

    Returns:
        Scraper instance

    Raises:
        NotImplementedError: If sport scraper is not yet implemented
    """
    import importlib

    try:
        module_name, factory_name = _SCRAPER_FACTORIES[sport]
    except KeyError:
        # `from None` keeps the message clean (no chained KeyError traceback).
        raise NotImplementedError(f"Scraper for {sport} not yet implemented") from None

    module = importlib.import_module(f".scrapers.{module_name}", package=__package__)
    return getattr(module, factory_name)(season)
|
||||
|
||||
|
||||
def cmd_scrape(args: argparse.Namespace) -> int:
    """Execute the scrape command.

    For each requested sport: build a scraper, scrape all data, validate it,
    generate a report, and (unless --dry-run) persist games/teams/stadiums
    JSON plus the validation report under OUTPUT_DIR.

    Args:
        args: Parsed CLI arguments (reads ``sport``, ``season``, ``dry_run``).

    Returns:
        0 if every sport scraped successfully, 1 if any failed.
    """
    # Local imports keep CLI startup fast; heavy deps load only when needed.
    from .models.game import save_games
    from .models.team import save_teams
    from .models.stadium import save_stadiums
    from .validators.report import generate_report, validate_games

    logger = get_logger()

    sports = SUPPORTED_SPORTS if args.sport == "all" else [args.sport]

    logger.info(f"Scraping {', '.join(sports)} for {args.season}-{args.season + 1} season")

    if args.dry_run:
        logger.info("Dry run mode - no files will be written")

    # Ensure output directory exists (even in dry-run; cheap and idempotent)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0

    for sport in sports:
        logger.info(f"\n{'='*50}")
        logger.info(f"Scraping {sport.upper()}...")
        logger.info(f"{'='*50}")

        try:
            # Get scraper for this sport
            scraper = get_scraper(sport, args.season)

            # Scrape all data
            result = scraper.scrape_all()

            if not result.success:
                log_failure(f"{sport.upper()}: {result.error_message}")
                failure_count += 1
                continue

            # Merge scraper-produced review items with validation findings
            validation_issues = validate_games(result.games)
            all_review_items = result.review_items + validation_issues

            # Generate validation report
            report = generate_report(
                sport=sport,
                season=args.season,
                source=result.source,
                games=result.games,
                teams=result.teams,
                stadiums=result.stadiums,
                review_items=all_review_items,
            )

            # Log summary
            logger.info(f"Games: {report.summary.total_games}")
            logger.info(f"Teams: {len(result.teams)}")
            logger.info(f"Stadiums: {len(result.stadiums)}")
            logger.info(f"Coverage: {report.summary.game_coverage:.1f}%")
            logger.info(f"Review items: {report.summary.review_count}")

            if not args.dry_run:
                # Save output files; games are per-season, teams/stadiums are not
                games_file = OUTPUT_DIR / f"games_{sport}_{args.season}.json"
                teams_file = OUTPUT_DIR / f"teams_{sport}.json"
                stadiums_file = OUTPUT_DIR / f"stadiums_{sport}.json"

                save_games(result.games, str(games_file))
                save_teams(result.teams, str(teams_file))
                save_stadiums(result.stadiums, str(stadiums_file))

                # Save validation report (report chooses its own path)
                report_path = report.save()

                logger.info(f"Saved games to: {games_file}")
                logger.info(f"Saved teams to: {teams_file}")
                logger.info(f"Saved stadiums to: {stadiums_file}")
                logger.info(f"Saved report to: {report_path}")

            log_success(f"{sport.upper()}: Scraped {result.game_count} games")
            success_count += 1

        except NotImplementedError as e:
            # Sport has no scraper yet: warn but keep processing other sports
            logger.warning(str(e))
            failure_count += 1
            continue

        except Exception as e:
            # Broad catch is deliberate: one sport's failure must not abort "all"
            log_failure(f"{sport.upper()}: {e}")
            logger.exception("Scraping failed")
            failure_count += 1
            continue

    # Final summary
    logger.info(f"\n{'='*50}")
    logger.info("SUMMARY")
    logger.info(f"{'='*50}")
    logger.info(f"Successful: {success_count}")
    logger.info(f"Failed: {failure_count}")

    return 0 if failure_count == 0 else 1
|
||||
|
||||
|
||||
def cmd_validate(args: argparse.Namespace) -> int:
    """Execute the validate command.

    Re-runs validation against data already on disk (no scraping) and
    regenerates the per-sport validation report.

    Args:
        args: Parsed CLI arguments (reads ``sport`` and ``season``).

    Returns:
        0 always; per-sport failures are logged but do not fail the command.
    """
    from .models.game import load_games
    from .models.team import load_teams
    from .models.stadium import load_stadiums
    from .validators.report import generate_report, validate_games

    logger = get_logger()

    sports = SUPPORTED_SPORTS if args.sport == "all" else [args.sport]

    logger.info(f"Validating {', '.join(sports)} for {args.season}-{args.season + 1} season")

    for sport in sports:
        logger.info(f"\nValidating {sport.upper()}...")

        # Load existing data written by a prior `scrape` run
        games_file = OUTPUT_DIR / f"games_{sport}_{args.season}.json"
        teams_file = OUTPUT_DIR / f"teams_{sport}.json"
        stadiums_file = OUTPUT_DIR / f"stadiums_{sport}.json"

        if not games_file.exists():
            logger.warning(f"No games file found: {games_file}")
            continue

        try:
            games = load_games(str(games_file))
            # Teams/stadiums are optional inputs; fall back to empty lists
            teams = load_teams(str(teams_file)) if teams_file.exists() else []
            stadiums = load_stadiums(str(stadiums_file)) if stadiums_file.exists() else []

            # Run validation
            review_items = validate_games(games)

            # Generate report ("existing" marks it as a re-validation pass)
            report = generate_report(
                sport=sport,
                season=args.season,
                source="existing",
                games=games,
                teams=teams,
                stadiums=stadiums,
                review_items=review_items,
            )

            # Save report
            report_path = report.save()

            logger.info(f"Games: {report.summary.total_games}")
            logger.info(f"Valid: {report.summary.valid_games}")
            logger.info(f"Review items: {report.summary.review_count}")
            logger.info(f"Saved report to: {report_path}")

            log_success(f"{sport.upper()}: Validation complete")

        except Exception as e:
            # Keep iterating: a corrupt file for one sport shouldn't stop others
            log_failure(f"{sport.upper()}: {e}")
            logger.exception("Validation failed")
            continue

    return 0
|
||||
|
||||
|
||||
def cmd_upload(args: argparse.Namespace) -> int:
    """Execute the upload command.

    Loads local JSON, fetches current CloudKit records, computes a diff,
    uploads only created/changed records, and tracks progress in a
    checkpointed session so an interrupted run can be resumed with --resume.

    Args:
        args: Parsed CLI arguments (reads ``sport``, ``season``,
            ``environment``, ``resume``).

    Returns:
        0 if every sport uploaded cleanly, 1 on any failure (including
        missing CloudKit configuration or auth/rate-limit errors).
    """
    from .models.game import load_games
    from .models.team import load_teams
    from .models.stadium import load_stadiums
    # NOTE(review): the *_to_cloudkit_record imports below appear unused in
    # this function (RecordDiffer seems to produce upload-ready records) —
    # confirm before removing.
    from .uploaders import (
        CloudKitClient,
        CloudKitError,
        CloudKitAuthError,
        CloudKitRateLimitError,
        RecordType,
        RecordDiffer,
        StateManager,
        game_to_cloudkit_record,
        team_to_cloudkit_record,
        stadium_to_cloudkit_record,
    )
    from .utils.progress import create_progress_bar

    logger = get_logger()

    sports = SUPPORTED_SPORTS if args.sport == "all" else [args.sport]

    logger.info(f"Uploading {', '.join(sports)} for {args.season}-{args.season + 1} season")
    logger.info(f"Environment: {args.environment}")

    # Initialize CloudKit client
    client = CloudKitClient(environment=args.environment)

    if not client.is_configured:
        log_failure("CloudKit not configured")
        logger.error(
            "Set CLOUDKIT_KEY_ID and CLOUDKIT_PRIVATE_KEY_PATH environment variables.\n"
            "Get credentials from Apple Developer Portal > Certificates, Identifiers & Profiles > Keys"
        )
        return 1

    # Initialize state manager (checkpoints) and differ (create/update/skip)
    state_manager = StateManager()
    differ = RecordDiffer()

    success_count = 0
    failure_count = 0

    for sport in sports:
        logger.info(f"\n{'='*50}")
        logger.info(f"Uploading {sport.upper()}...")
        logger.info(f"{'='*50}")

        try:
            # Load local data
            games_file = OUTPUT_DIR / f"games_{sport}_{args.season}.json"
            teams_file = OUTPUT_DIR / f"teams_{sport}.json"
            stadiums_file = OUTPUT_DIR / f"stadiums_{sport}.json"

            if not games_file.exists():
                logger.warning(f"No games file found: {games_file}")
                logger.warning("Run 'scrape' command first")
                failure_count += 1
                continue

            games = load_games(str(games_file))
            teams = load_teams(str(teams_file)) if teams_file.exists() else []
            stadiums = load_stadiums(str(stadiums_file)) if stadiums_file.exists() else []

            logger.info(f"Loaded {len(games)} games, {len(teams)} teams, {len(stadiums)} stadiums")

            # Fetch existing CloudKit records for diff
            logger.info("Fetching existing CloudKit records...")

            try:
                remote_games = client.fetch_all_records(RecordType.GAME)
                remote_teams = client.fetch_all_records(RecordType.TEAM)
                remote_stadiums = client.fetch_all_records(RecordType.STADIUM)
            except CloudKitAuthError as e:
                # Auth failure will affect every sport: abort the whole command
                log_failure(f"Authentication failed: {e}")
                return 1
            except CloudKitRateLimitError:
                log_failure("Rate limit exceeded - try again later")
                return 1
            except CloudKitError as e:
                # Other CloudKit errors may be sport-specific: skip and continue
                log_failure(f"Failed to fetch records: {e}")
                failure_count += 1
                continue

            # Filter remote records to this sport/season (fetch is global).
            # CloudKit Web Services nests field values as {"field": {"value": ...}}.
            remote_games = [
                r for r in remote_games
                if r.get("fields", {}).get("sport", {}).get("value") == sport
                and r.get("fields", {}).get("season", {}).get("value") == args.season
            ]
            remote_teams = [
                r for r in remote_teams
                if r.get("fields", {}).get("sport", {}).get("value") == sport
            ]
            remote_stadiums = [
                r for r in remote_stadiums
                if r.get("fields", {}).get("sport", {}).get("value") == sport
            ]

            logger.info(f"Found {len(remote_games)} games, {len(remote_teams)} teams, {len(remote_stadiums)} stadiums in CloudKit")

            # Calculate diffs
            logger.info("Calculating changes...")

            game_diff = differ.diff_games(games, remote_games)
            team_diff = differ.diff_teams(teams, remote_teams)
            stadium_diff = differ.diff_stadiums(stadiums, remote_stadiums)

            total_creates = game_diff.create_count + team_diff.create_count + stadium_diff.create_count
            total_updates = game_diff.update_count + team_diff.update_count + stadium_diff.update_count
            total_unchanged = game_diff.unchanged_count + team_diff.unchanged_count + stadium_diff.unchanged_count

            logger.info(f"Creates: {total_creates}, Updates: {total_updates}, Unchanged: {total_unchanged}")

            if total_creates == 0 and total_updates == 0:
                log_success(f"{sport.upper()}: Already up to date")
                success_count += 1
                continue

            # Prepare records for upload (creates + updates only)
            all_records = []
            all_records.extend(game_diff.get_records_to_upload())
            all_records.extend(team_diff.get_records_to_upload())
            all_records.extend(stadium_diff.get_records_to_upload())

            # Create or resume upload session keyed by sport/season/environment
            record_info = [(r.record_name, r.record_type.value) for r in all_records]
            session = state_manager.get_session_or_create(
                sport=sport,
                season=args.season,
                environment=args.environment,
                record_names=record_info,
                resume=args.resume,
            )

            if args.resume:
                pending = session.get_pending_records()
                logger.info(f"Resuming: {len(pending)} records pending")
                # Filter to only pending records (already-uploaded ones skip)
                pending_set = set(pending)
                all_records = [r for r in all_records if r.record_name in pending_set]

            # Upload records with progress
            logger.info(f"Uploading {len(all_records)} records...")

            # NOTE(review): save_records() is a single blocking batch call, so
            # the bar only advances after the whole batch returns — confirm
            # whether per-record progress was intended.
            with create_progress_bar(total=len(all_records), description="Uploading") as progress:
                batch_result = client.save_records(all_records)

                # Update session state
                for op_result in batch_result.successful:
                    session.mark_uploaded(op_result.record_name, op_result.record_change_tag)
                    progress.advance()

                for op_result in batch_result.failed:
                    session.mark_failed(op_result.record_name, op_result.error_message or "Unknown error")
                    progress.advance()

            # Save session state (checkpoint for --resume / retry)
            state_manager.save_session(session)

            # Report results
            logger.info(f"Uploaded: {batch_result.success_count}")
            logger.info(f"Failed: {batch_result.failure_count}")

            if batch_result.failure_count > 0:
                log_failure(f"{sport.upper()}: {batch_result.failure_count} records failed")
                for op_result in batch_result.failed[:5]:  # Show first 5 failures
                    logger.error(f" {op_result.record_name}: {op_result.error_message}")
                if batch_result.failure_count > 5:
                    logger.error(f" ... and {batch_result.failure_count - 5} more")
                failure_count += 1
            else:
                log_success(f"{sport.upper()}: Uploaded {batch_result.success_count} records")
                # Clear session on complete success
                state_manager.delete_session(sport, args.season, args.environment)
                success_count += 1

        except Exception as e:
            # One sport failing must not abort the remaining sports
            log_failure(f"{sport.upper()}: {e}")
            logger.exception("Upload failed")
            failure_count += 1
            continue

    # Final summary
    logger.info(f"\n{'='*50}")
    logger.info("SUMMARY")
    logger.info(f"{'='*50}")
    logger.info(f"Successful: {success_count}")
    logger.info(f"Failed: {failure_count}")

    return 0 if failure_count == 0 else 1
|
||||
|
||||
|
||||
def cmd_status(args: argparse.Namespace) -> int:
    """Execute the status command.

    Prints three sections: per-sport scraped-data coverage, persisted upload
    sessions, and CloudKit credential configuration.

    Args:
        args: Parsed CLI arguments (no command-specific options are read).

    Returns:
        0 always; status reporting is informational.
    """
    import os
    from datetime import datetime
    from pathlib import Path

    from .config import EXPECTED_GAME_COUNTS
    # Hoisted out of the per-sport loop below (previously re-imported on
    # every iteration).
    from .models.game import load_games
    from .models.team import load_teams
    from .models.stadium import load_stadiums
    from .uploaders import StateManager

    logger = get_logger()

    logger.info("SportsTime Parser Status")
    logger.info("=" * 50)
    logger.info("")

    # Check for scraped data
    logger.info("[bold]Scraped Data[/bold]")
    logger.info("-" * 40)

    total_games = 0
    scraped_sports = 0

    for sport in SUPPORTED_SPORTS:
        games_file = OUTPUT_DIR / f"games_{sport}_{DEFAULT_SEASON}.json"
        teams_file = OUTPUT_DIR / f"teams_{sport}.json"
        stadiums_file = OUTPUT_DIR / f"stadiums_{sport}.json"

        if games_file.exists():
            try:
                games = load_games(str(games_file))
                teams = load_teams(str(teams_file)) if teams_file.exists() else []
                stadiums = load_stadiums(str(stadiums_file)) if stadiums_file.exists() else []

                game_count = len(games)
                expected = EXPECTED_GAME_COUNTS.get(sport, 0)
                coverage = (game_count / expected * 100) if expected > 0 else 0

                # Format with coverage indicator
                if coverage >= 95:
                    status = "[green]✓[/green]"
                elif coverage >= 80:
                    status = "[yellow]~[/yellow]"
                else:
                    status = "[red]![/red]"

                logger.info(
                    f" {status} {sport.upper():6} {game_count:5} games, "
                    f"{len(teams):2} teams, {len(stadiums):2} stadiums "
                    f"({coverage:.0f}% coverage)"
                )

                total_games += game_count
                scraped_sports += 1

            except Exception as e:
                # Corrupt/unreadable files are reported inline, not fatal
                logger.info(f" [red]✗[/red] {sport.upper():6} Error loading: {e}")
        else:
            logger.info(f" [dim]-[/dim] {sport.upper():6} Not scraped")

    logger.info("-" * 40)
    logger.info(f" Total: {total_games} games across {scraped_sports} sports")
    logger.info("")

    # Check for upload sessions
    logger.info("[bold]Upload Sessions[/bold]")
    logger.info("-" * 40)

    state_manager = StateManager()
    sessions = state_manager.list_sessions()

    if sessions:
        for session in sessions:
            sport = session["sport"].upper()
            season = session["season"]
            env = session["environment"]
            progress = session["progress"]
            percent = session["progress_percent"]
            status = session["status"]
            failed = session["failed_count"]

            if status == "complete":
                status_icon = "[green]✓[/green]"
            elif failed > 0:
                status_icon = "[yellow]![/yellow]"
            else:
                status_icon = "[blue]→[/blue]"

            logger.info(
                f" {status_icon} {sport} {season} ({env}): "
                f"{progress} ({percent})"
            )

            if failed > 0:
                logger.info(f" [yellow]⚠ {failed} failed records[/yellow]")

            # Show last updated time
            # NOTE: utcnow() is naive and deprecated in Python 3.12+;
            # fromisoformat() may yield an aware datetime if the stored
            # timestamp carries an offset, in which case the subtraction
            # raises TypeError — now caught below instead of crashing.
            try:
                last_updated = datetime.fromisoformat(session["last_updated"])
                age = datetime.utcnow() - last_updated
                if age.days > 0:
                    age_str = f"{age.days} days ago"
                elif age.seconds > 3600:
                    age_str = f"{age.seconds // 3600} hours ago"
                elif age.seconds > 60:
                    age_str = f"{age.seconds // 60} minutes ago"
                else:
                    age_str = "just now"
                logger.info(f" Last updated: {age_str}")
            except (ValueError, KeyError, TypeError):
                # Missing/malformed timestamp or naive-vs-aware mismatch:
                # the age line is cosmetic, so skip it silently.
                pass

    else:
        logger.info(" No upload sessions found")

    logger.info("")

    # CloudKit configuration status
    logger.info("[bold]CloudKit Configuration[/bold]")
    logger.info("-" * 40)

    key_id = os.environ.get("CLOUDKIT_KEY_ID")
    key_path = os.environ.get("CLOUDKIT_PRIVATE_KEY_PATH")
    key_content = os.environ.get("CLOUDKIT_PRIVATE_KEY")

    if key_id:
        logger.info(f" [green]✓[/green] CLOUDKIT_KEY_ID: {key_id[:8]}...")
    else:
        logger.info(" [red]✗[/red] CLOUDKIT_KEY_ID: Not set")

    if key_path:
        if Path(key_path).exists():
            logger.info(f" [green]✓[/green] CLOUDKIT_PRIVATE_KEY_PATH: {key_path}")
        else:
            logger.info(f" [red]✗[/red] CLOUDKIT_PRIVATE_KEY_PATH: File not found: {key_path}")
    elif key_content:
        logger.info(" [green]✓[/green] CLOUDKIT_PRIVATE_KEY: Set (inline)")
    else:
        logger.info(" [red]✗[/red] CLOUDKIT_PRIVATE_KEY: Not set")

    logger.info("")

    return 0
|
||||
|
||||
|
||||
def cmd_retry(args: argparse.Namespace) -> int:
    """Execute the retry command for failed uploads.

    Re-uploads records marked failed in a persisted upload session, up to
    ``--max-retries`` attempts per record.

    Args:
        args: Parsed CLI arguments (reads ``sport``, ``season``,
            ``environment``, ``max_retries``).

    Returns:
        0 if all retried records succeeded, 1 otherwise (or on fatal
        auth/rate-limit errors).
    """
    from .models.game import load_games
    from .models.team import load_teams
    from .models.stadium import load_stadiums
    from .uploaders import (
        CloudKitClient,
        CloudKitError,
        CloudKitAuthError,
        CloudKitRateLimitError,
        StateManager,
        game_to_cloudkit_record,
        team_to_cloudkit_record,
        stadium_to_cloudkit_record,
    )
    from .utils.progress import create_progress_bar

    logger = get_logger()

    sports = SUPPORTED_SPORTS if args.sport == "all" else [args.sport]

    logger.info(f"Retrying failed uploads for {', '.join(sports)}")
    logger.info(f"Environment: {args.environment}")
    logger.info(f"Max retries per record: {args.max_retries}")

    # Initialize CloudKit client
    client = CloudKitClient(environment=args.environment)

    if not client.is_configured:
        log_failure("CloudKit not configured")
        return 1

    # Initialize state manager
    state_manager = StateManager()

    total_retried = 0
    total_succeeded = 0
    total_failed = 0

    for sport in sports:
        # Load existing session (written by a previous `upload` run)
        session = state_manager.load_session(sport, args.season, args.environment)

        if session is None:
            logger.info(f"{sport.upper()}: No upload session found")
            continue

        # Get records eligible for retry (failed but under the retry cap)
        retryable = session.get_retryable_records(max_retries=args.max_retries)

        if not retryable:
            failed_count = session.failed_count
            if failed_count > 0:
                logger.info(f"{sport.upper()}: {failed_count} failed records exceeded max retries")
            else:
                logger.info(f"{sport.upper()}: No failed records to retry")
            continue

        logger.info(f"{sport.upper()}: Retrying {len(retryable)} failed records...")

        # Load local data to get the records
        games_file = OUTPUT_DIR / f"games_{sport}_{args.season}.json"
        teams_file = OUTPUT_DIR / f"teams_{sport}.json"
        stadiums_file = OUTPUT_DIR / f"stadiums_{sport}.json"

        if not games_file.exists():
            logger.warning(f"No games file found: {games_file}")
            continue

        games = load_games(str(games_file))
        teams = load_teams(str(teams_file)) if teams_file.exists() else []
        stadiums = load_stadiums(str(stadiums_file)) if stadiums_file.exists() else []

        # Build record lookup: rebuild CloudKit payloads for retryable IDs.
        # NOTE(review): if a retryable ID no longer exists in the local files,
        # it is marked pending below but never re-uploaded — confirm this is
        # the intended behavior for stale sessions.
        records_to_retry = []
        retryable_set = set(retryable)

        for game in games:
            if game.id in retryable_set:
                records_to_retry.append(game_to_cloudkit_record(game))

        for team in teams:
            if team.id in retryable_set:
                records_to_retry.append(team_to_cloudkit_record(team))

        for stadium in stadiums:
            if stadium.id in retryable_set:
                records_to_retry.append(stadium_to_cloudkit_record(stadium))

        if not records_to_retry:
            logger.warning(f"{sport.upper()}: Could not find records for retry")
            continue

        # Mark as pending for retry (resets failed status before the attempt)
        for record_name in retryable:
            session.mark_pending(record_name)

        # Retry upload
        try:
            with create_progress_bar(total=len(records_to_retry), description="Retrying") as progress:
                batch_result = client.save_records(records_to_retry)

                for op_result in batch_result.successful:
                    session.mark_uploaded(op_result.record_name, op_result.record_change_tag)
                    progress.advance()
                    total_succeeded += 1

                for op_result in batch_result.failed:
                    session.mark_failed(op_result.record_name, op_result.error_message or "Unknown error")
                    progress.advance()
                    total_failed += 1

            state_manager.save_session(session)

            total_retried += len(records_to_retry)

            if batch_result.failure_count > 0:
                log_failure(f"{sport.upper()}: {batch_result.failure_count} still failing")
            else:
                log_success(f"{sport.upper()}: All {batch_result.success_count} retries succeeded")

            # Clear session if all complete
            if session.is_complete:
                state_manager.delete_session(sport, args.season, args.environment)

        except CloudKitAuthError as e:
            # Auth failure affects every sport: abort the whole command
            log_failure(f"Authentication failed: {e}")
            return 1
        except CloudKitRateLimitError:
            # Persist what we know before bailing so progress isn't lost
            log_failure("Rate limit exceeded - try again later")
            state_manager.save_session(session)
            return 1
        except CloudKitError as e:
            # Other errors may be sport-specific: save state and continue
            log_failure(f"Upload error: {e}")
            state_manager.save_session(session)
            continue

    # Summary
    logger.info(f"\n{'='*50}")
    logger.info("RETRY SUMMARY")
    logger.info(f"{'='*50}")
    logger.info(f"Retried: {total_retried}")
    logger.info(f"Succeeded: {total_succeeded}")
    logger.info(f"Failed: {total_failed}")

    return 0 if total_failed == 0 else 1
|
||||
|
||||
|
||||
def cmd_clear(args: argparse.Namespace) -> int:
    """Execute the clear command to delete upload state.

    Removes the persisted upload-session file (if any) for each selected
    sport/season/environment combination and reports what was removed.
    """
    from .uploaders import StateManager

    logger = get_logger()

    targets = [args.sport] if args.sport != "all" else SUPPORTED_SPORTS

    logger.info(f"Clearing upload state for {', '.join(targets)}")

    manager = StateManager()
    removed = 0

    for code in targets:
        # delete_session returns True only when a session file existed
        was_deleted = manager.delete_session(code, args.season, args.environment)
        if was_deleted:
            logger.info(f" [green]✓[/green] Cleared {code.upper()} {args.season} ({args.environment})")
            removed += 1
        else:
            logger.info(f" [dim]-[/dim] No session for {code.upper()} {args.season} ({args.environment})")

    logger.info(f"\nCleared {removed} session(s)")

    return 0
|
||||
|
||||
|
||||
def run_cli(argv: Optional[list[str]] = None) -> int:
    """Parse *argv* (or ``sys.argv`` when None) and dispatch the subcommand.

    Returns the process exit code: the handler's return value, or 1 when no
    subcommand was given (help is printed in that case).
    """
    cli = create_parser()
    namespace = cli.parse_args(argv)

    if namespace.verbose:
        set_verbose(True)

    # No subcommand: show usage and signal misuse via a nonzero exit code
    if namespace.command is None:
        cli.print_help()
        return 1

    # Each subparser registered its handler via set_defaults(func=...)
    return namespace.func(namespace)
|
||||
Reference in New Issue
Block a user