Remove CFB/NASCAR/PGA and streamline to 8 supported sports

- Remove College Football, NASCAR, and PGA from scraper and app
- Clean all data files (stadiums, games, pipeline reports)
- Update Sport.swift enum and all UI components
- Add sportstime.py CLI tool for pipeline management
- Add DATA_SCRAPING.md documentation
- Add WNBA/MLS/NWSL implementation documentation
- Scraper now supports: NBA, MLB, NHL, NFL, WNBA, MLS, NWSL, CBB

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-09 23:22:13 -06:00
parent f5e509a9ae
commit 8790d2ad73
35 changed files with 117819 additions and 65871 deletions

View File

@@ -7,24 +7,43 @@ Imports canonical JSON data into CloudKit. Run after canonicalization pipeline.
Expected input files (from canonicalization pipeline):
- stadiums_canonical.json
- teams_canonical.json
- games_canonical.json
- games_canonical.json OR canonical/games/*.json (new structure)
- stadium_aliases.json
- league_structure.json
- team_aliases.json
File Structure (Option B - by sport/season):
data/
games/ # Raw scraped games
mlb_2025.json
nba_2025.json
...
canonical/ # Canonicalized data
games/
mlb_2025.json
nba_2025.json
...
stadiums.json
games_canonical.json # Combined (backward compatibility)
stadiums_canonical.json
teams_canonical.json
Setup:
1. CloudKit Dashboard > Tokens & Keys > Server-to-Server Keys
2. Create key with Read/Write access to public database
3. Download .p8 file and note Key ID
Usage:
python cloudkit_import.py --dry-run # Preview first
python cloudkit_import.py --key-id XX --key-file key.p8 # Import all
python cloudkit_import.py --stadiums-only ... # Stadiums first
python cloudkit_import.py --games-only ... # Games after
python cloudkit_import.py --stadium-aliases-only ... # Stadium aliases only
python cloudkit_import.py --delete-all ... # Delete then import
python cloudkit_import.py --delete-only ... # Delete only (no import)
python cloudkit_import.py # Interactive menu
python cloudkit_import.py --dry-run # Preview first
python cloudkit_import.py --key-id XX --key-file key.p8 # Import all
python cloudkit_import.py --stadiums-only # Stadiums first
python cloudkit_import.py --games-only # All games
python cloudkit_import.py --games-files mlb_2025.json # Specific game file
python cloudkit_import.py --games-files mlb_2025.json,nba_2025.json # Multiple files
python cloudkit_import.py --stadium-aliases-only # Stadium aliases only
python cloudkit_import.py --delete-all # Delete then import
python cloudkit_import.py --delete-only # Delete only (no import)
"""
import argparse, json, time, os, sys, hashlib, base64, requests
@@ -48,6 +67,58 @@ DEFAULT_KEY_ID = "152be0715e0276e31aaea5cbfe79dc872f298861a55c70fae14e5fe3e026cf
DEFAULT_KEY_FILE = "eckey.pem"
def show_game_files_menu(data_dir: Path) -> list[str]:
    """Show available game files and let the user select which to import.

    Lists every ``*.json`` file found in ``<data_dir>/canonical/games`` together
    with the number of entries it contains, then reads a selection from
    standard input.

    Args:
        data_dir: Root data directory that contains ``canonical/games/``.

    Returns:
        The selected file names (names only, not full paths) in the order the
        user entered them, without duplicates. An empty list is returned when
        the directory is missing, contains no JSON files, or the user cancels
        (``0``, EOF, or Ctrl-C).
    """
    canonical_games_dir = data_dir / 'canonical' / 'games'
    if not canonical_games_dir.exists():
        print("\n No canonical/games/ directory found.")
        return []

    game_files = sorted(canonical_games_dir.glob('*.json'))
    if not game_files:
        print("\n No game files found in canonical/games/")
        return []

    print("\n" + "="*50)
    print("Select Game Files to Import")
    print("="*50)
    print("\n Available files:")
    for i, path in enumerate(game_files, 1):
        # Count games in file
        with open(path, encoding='utf-8') as fp:
            games = json.load(fp)
        print(f" {i}. {path.name} ({len(games):,} games)")
    print("\n a. All files")
    print(" 0. Cancel")
    print()

    while True:
        # Only EOF / Ctrl-C cancel the menu; a malformed entry re-prompts below.
        try:
            choice = input("Enter file numbers (comma-separated), 'a' for all, or 0 to cancel: ").strip().lower()
        except (EOFError, KeyboardInterrupt):
            print("\nCancelled.")
            return []

        if choice == '0':
            return []
        if choice == 'a':
            return [path.name for path in game_files]

        # Parse comma-separated numbers, tolerating blanks such as a trailing comma.
        try:
            indices = [int(token) for token in choice.split(',') if token.strip()]
        except ValueError:
            print("Invalid input. Enter numbers separated by commas.")
            continue

        selected = []
        for idx in indices:
            if not 1 <= idx <= len(game_files):
                print(f"Invalid selection: {idx}")
                continue
            name = game_files[idx - 1].name
            # Skip repeats so the caller does not load the same file twice.
            if name not in selected:
                selected.append(name)

        if selected:
            return selected
        print("No valid selections. Try again.")
def show_menu():
"""Show interactive menu and return selected action."""
print("\n" + "="*50)
@@ -55,25 +126,26 @@ def show_menu():
print("="*50)
print("\n 1. Import all (stadiums, teams, games, league structure, team aliases, stadium aliases)")
print(" 2. Stadiums only")
print(" 3. Games only")
print(" 4. League structure only")
print(" 5. Team aliases only")
print(" 6. Stadium aliases only")
print(" 7. Canonical only (league structure + team aliases + stadium aliases)")
print(" 8. Delete all then import")
print(" 9. Delete only (no import)")
print(" 10. Dry run (preview only)")
print(" 3. Games only (all files)")
print(" 4. Games - select specific files")
print(" 5. League structure only")
print(" 6. Team aliases only")
print(" 7. Stadium aliases only")
print(" 8. Canonical only (league structure + team aliases + stadium aliases)")
print(" 9. Delete all then import")
print(" 10. Delete only (no import)")
print(" 11. Dry run (preview only)")
print(" 0. Exit")
print()
while True:
try:
choice = input("Enter choice [1-10, 0 to exit]: ").strip()
choice = input("Enter choice [1-11, 0 to exit]: ").strip()
if choice == '0':
return None
if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']:
if choice in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11']:
return int(choice)
print("Invalid choice. Please enter 1-10 or 0.")
print("Invalid choice. Please enter 1-11 or 0.")
except (EOFError, KeyboardInterrupt):
print("\nExiting.")
return None
@@ -265,6 +337,7 @@ def main():
p.add_argument('--data-dir', default='./data')
p.add_argument('--stadiums-only', action='store_true')
p.add_argument('--games-only', action='store_true')
p.add_argument('--games-files', type=str, help='Comma-separated list of game files to import (e.g., mlb_2025.json,nba_2025.json)')
p.add_argument('--league-structure-only', action='store_true', help='Import only league structure')
p.add_argument('--team-aliases-only', action='store_true', help='Import only team aliases')
p.add_argument('--stadium-aliases-only', action='store_true', help='Import only stadium aliases')
@@ -278,11 +351,18 @@ def main():
# Show interactive menu if no action flags provided or --interactive
has_action_flag = any([
args.stadiums_only, args.games_only, args.league_structure_only,
args.stadiums_only, args.games_only, args.games_files, args.league_structure_only,
args.team_aliases_only, args.stadium_aliases_only, args.canonical_only,
args.delete_all, args.delete_only, args.dry_run
])
# Track selected game files (for option 4 or --games-files)
selected_game_files = None
if args.games_files:
# Parse comma-separated list from command line
selected_game_files = [f.strip() for f in args.games_files.split(',')]
args.games_only = True # Imply --games-only
if args.interactive or not has_action_flag:
choice = show_menu()
if choice is None:
@@ -293,21 +373,27 @@ def main():
pass # Default behavior
elif choice == 2: # Stadiums only
args.stadiums_only = True
elif choice == 3: # Games only
elif choice == 3: # Games only (all files)
args.games_only = True
elif choice == 4: # League structure only
elif choice == 4: # Games - select specific files
args.games_only = True
selected_game_files = show_game_files_menu(Path(args.data_dir))
if not selected_game_files:
print("No files selected. Exiting.")
return
elif choice == 5: # League structure only
args.league_structure_only = True
elif choice == 5: # Team aliases only
elif choice == 6: # Team aliases only
args.team_aliases_only = True
elif choice == 6: # Stadium aliases only
elif choice == 7: # Stadium aliases only
args.stadium_aliases_only = True
elif choice == 7: # Canonical only
elif choice == 8: # Canonical only
args.canonical_only = True
elif choice == 8: # Delete all then import
elif choice == 9: # Delete all then import
args.delete_all = True
elif choice == 9: # Delete only
elif choice == 10: # Delete only
args.delete_only = True
elif choice == 10: # Dry run
elif choice == 11: # Dry run
args.dry_run = True
print(f"\n{'='*50}")
@@ -332,12 +418,34 @@ def main():
else:
teams = [] # Legacy: extracted from stadiums
if (data_dir / 'games_canonical.json').exists():
# Load games: try the new structure first (canonical/games/*.json), then fall back to the combined/legacy files
canonical_games_dir = data_dir / 'canonical' / 'games'
games = []
games_source = None
if selected_game_files:
# Load only the selected files
for filename in selected_game_files:
filepath = canonical_games_dir / filename
if filepath.exists():
with open(filepath) as f:
file_games = json.load(f)
games.extend(file_games)
print(f" Loading {filename}: {len(file_games):,} games")
games_source = f"selected files: {', '.join(selected_game_files)}"
elif canonical_games_dir.exists() and any(canonical_games_dir.glob('*.json')):
# New structure: load all sport/season files
for games_file in sorted(canonical_games_dir.glob('*.json')):
with open(games_file) as f:
file_games = json.load(f)
games.extend(file_games)
games_source = "canonical/games/*.json"
elif (data_dir / 'games_canonical.json').exists():
games = json.load(open(data_dir / 'games_canonical.json'))
games_source = "games_canonical.json"
elif (data_dir / 'games.json').exists():
games = json.load(open(data_dir / 'games.json'))
else:
games = []
games_source = "games.json (legacy)"
league_structure = json.load(open(data_dir / 'league_structure.json')) if (data_dir / 'league_structure.json').exists() else []
team_aliases = json.load(open(data_dir / 'team_aliases.json')) if (data_dir / 'team_aliases.json').exists() else []
@@ -345,6 +453,8 @@ def main():
print(f"Using {'canonical' if use_canonical else 'legacy'} format")
print(f"Loaded {len(stadiums)} stadiums, {len(teams)} teams, {len(games)} games")
if games_source:
print(f" Games loaded from: {games_source}")
print(f"Loaded {len(league_structure)} league structures, {len(team_aliases)} team aliases, {len(stadium_aliases)} stadium aliases\n")
ck = None