- Remove College Football, NASCAR, and PGA from scraper and app - Clean all data files (stadiums, games, pipeline reports) - Update Sport.swift enum and all UI components - Add sportstime.py CLI tool for pipeline management - Add DATA_SCRAPING.md documentation - Add WNBA/MLS/NWSL implementation documentation - Scraper now supports: NBA, MLB, NHL, NFL, WNBA, MLS, NWSL, CBB Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
807 lines
35 KiB
Python
Executable File
807 lines
35 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
CloudKit Import Script
|
|
======================
|
|
Imports canonical JSON data into CloudKit. Run after canonicalization pipeline.
|
|
|
|
Expected input files (from canonicalization pipeline):
|
|
- stadiums_canonical.json
|
|
- teams_canonical.json
|
|
- games_canonical.json OR canonical/games/*.json (new structure)
|
|
- stadium_aliases.json
|
|
- league_structure.json
|
|
- team_aliases.json
|
|
|
|
File Structure (Option B - by sport/season):
|
|
data/
|
|
games/ # Raw scraped games
|
|
mlb_2025.json
|
|
nba_2025.json
|
|
...
|
|
canonical/ # Canonicalized data
|
|
games/
|
|
mlb_2025.json
|
|
nba_2025.json
|
|
...
|
|
stadiums.json
|
|
games_canonical.json # Combined (backward compatibility)
|
|
stadiums_canonical.json
|
|
teams_canonical.json
|
|
|
|
Setup:
|
|
1. CloudKit Dashboard > Tokens & Keys > Server-to-Server Keys
|
|
2. Create key with Read/Write access to public database
|
|
3. Download .p8 file and note Key ID
|
|
|
|
Usage:
|
|
python cloudkit_import.py # Interactive menu
|
|
python cloudkit_import.py --dry-run # Preview first
|
|
python cloudkit_import.py --key-id XX --key-file key.p8 # Import all
|
|
python cloudkit_import.py --stadiums-only # Stadiums first
|
|
python cloudkit_import.py --games-only # All games
|
|
python cloudkit_import.py --games-files mlb_2025.json # Specific game file
|
|
python cloudkit_import.py --games-files mlb_2025.json,nba_2025.json # Multiple files
|
|
python cloudkit_import.py --stadium-aliases-only # Stadium aliases only
|
|
python cloudkit_import.py --delete-all # Delete then import
|
|
python cloudkit_import.py --delete-only # Delete only (no import)
|
|
"""
|
|
|
|
import argparse, json, time, os, sys, hashlib, base64, requests
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
try:
|
|
from cryptography.hazmat.primitives import hashes, serialization
|
|
from cryptography.hazmat.primitives.asymmetric import ec
|
|
from cryptography.hazmat.backends import default_backend
|
|
HAS_CRYPTO = True
|
|
except ImportError:
|
|
HAS_CRYPTO = False
|
|
|
|
# Default CloudKit container identifier (can be overridden with --container).
CONTAINER = "iCloud.com.sportstime.app"
# Base URL that every request path (records/modify, records/query) is appended to.
HOST = "https://api.apple-cloudkit.com"
# Maximum number of record operations sent in one records/modify request.
BATCH_SIZE = 200

# Hardcoded credentials
# Defaults for --key-id / --key-file so the script can run without flags.
# NOTE(review): the key ID is committed in source and the private key is expected
# in the working directory — confirm this is intended and that the key file
# itself is kept out of version control.
DEFAULT_KEY_ID = "152be0715e0276e31aaea5cbfe79dc872f298861a55c70fae14e5fe3e026cff9"
DEFAULT_KEY_FILE = "eckey.pem"
|
|
|
|
|
|
def show_game_files_menu(data_dir: Path) -> list[str]:
    """Show available game files and let the user select which to import.

    Lists every ``*.json`` file in ``<data_dir>/canonical/games`` together with
    the number of entries it contains, then prompts on stdin.

    Args:
        data_dir: Root data directory that contains ``canonical/games``.

    Returns:
        The selected file names (not full paths) in the order the user typed
        them, without duplicates. An empty list means there is nothing to
        select or the user cancelled (``0``, EOF or Ctrl-C).
    """
    canonical_games_dir = data_dir / 'canonical' / 'games'

    if not canonical_games_dir.exists():
        print("\n No canonical/games/ directory found.")
        return []

    game_files = sorted(canonical_games_dir.glob('*.json'))
    if not game_files:
        print("\n No game files found in canonical/games/")
        return []

    print("\n" + "="*50)
    print("Select Game Files to Import")
    print("="*50)
    print("\n Available files:")
    for i, f in enumerate(game_files, 1):
        # Count games in file
        with open(f, encoding='utf-8') as fp:
            games = json.load(fp)
        print(f" {i}. {f.name} ({len(games):,} games)")

    print(f"\n a. All files")
    print(f" 0. Cancel")
    print()

    while True:
        # Only a closed stdin or Ctrl-C cancels; a typo must not abort the menu.
        try:
            choice = input("Enter file numbers (comma-separated), 'a' for all, or 0 to cancel: ").strip().lower()
        except (EOFError, KeyboardInterrupt):
            print("\nCancelled.")
            return []

        if choice == '0':
            return []
        if choice == 'a':
            return [f.name for f in game_files]

        # Parse comma-separated numbers; re-prompt instead of cancelling on bad input.
        try:
            indices = [int(x.strip()) for x in choice.split(',')]
        except ValueError:
            print("Invalid input. Enter numbers separated by commas, 'a', or 0.")
            continue

        selected = []
        for idx in indices:
            if 1 <= idx <= len(game_files):
                name = game_files[idx-1].name
                # Selecting the same file twice would load its games twice.
                if name not in selected:
                    selected.append(name)
            else:
                print(f"Invalid selection: {idx}")

        if selected:
            return selected
        print("No valid selections. Try again.")
|
|
|
|
|
|
def show_menu():
    """Print the action menu and read the user's choice from stdin.

    Returns:
        The chosen action as an int in the range 1-11, or None when the user
        enters 0, stdin is closed, or the prompt is interrupted with Ctrl-C.
        Any other input prints a hint and prompts again.
    """
    rule = "="*50
    menu_lines = (
        "\n" + rule,
        "CloudKit Import - Select Action",
        rule,
        "\n 1. Import all (stadiums, teams, games, league structure, team aliases, stadium aliases)",
        " 2. Stadiums only",
        " 3. Games only (all files)",
        " 4. Games - select specific files",
        " 5. League structure only",
        " 6. Team aliases only",
        " 7. Stadium aliases only",
        " 8. Canonical only (league structure + team aliases + stadium aliases)",
        " 9. Delete all then import",
        " 10. Delete only (no import)",
        " 11. Dry run (preview only)",
        " 0. Exit",
    )
    for line in menu_lines:
        print(line)
    print()

    # Accepted answers are the literal strings '1' .. '11'.
    accepted = {str(number) for number in range(1, 12)}

    while True:
        try:
            answer = input("Enter choice [1-11, 0 to exit]: ").strip()
            if answer == '0':
                return None
            if answer in accepted:
                return int(answer)
            print("Invalid choice. Please enter 1-11 or 0.")
        except (EOFError, KeyboardInterrupt):
            print("\nExiting.")
            return None
|
|
|
|
|
|
def deterministic_uuid(string: str) -> str:
    """
    Derive a stable, UUID-formatted identifier from a string via SHA256.

    The first 16 bytes of the SHA256 digest of the UTF-8 encoded input are
    used, with the version nibble forced to 4 and the variant bits forced to
    ``10``. The result is returned as an upper-case 8-4-4-4-12 hex string.
    Matches the StubDataProvider.deterministicUUID() implementation in Swift.
    """
    digest = hashlib.sha256(string.encode('utf-8')).digest()
    raw = bytearray(digest[:16])

    # Stamp the version (high nibble of byte 6) and variant (top bits of byte 8).
    raw[6] = 0x40 | (raw[6] & 0x0F)
    raw[8] = 0x80 | (raw[8] & 0x3F)

    # 32 hex digits, cut into the canonical 8-4-4-4-12 grouping.
    hex_digits = raw.hex().upper()
    groups = (
        hex_digits[0:8],
        hex_digits[8:12],
        hex_digits[12:16],
        hex_digits[16:20],
        hex_digits[20:32],
    )
    return "-".join(groups)
|
|
|
|
|
|
class CloudKit:
    """Small client for the public database of a CloudKit container.

    Requests are signed with an EC private key (server-to-server key) and sent
    to ``HOST``. All methods report failures by returning ``{'error': ...}``
    dictionaries (or printing, in ``delete_all``) rather than raising.
    """

    def __init__(self, key_id, private_key, container, env):
        """Store credentials and build the request path prefix.

        Args:
            key_id: Value sent in the ``X-Apple-CloudKit-Request-KeyID`` header.
            private_key: PEM-encoded private key as bytes.
            container: CloudKit container identifier.
            env: Environment path segment (e.g. ``development``).
        """
        self.key_id = key_id
        self.private_key = private_key
        self.path_base = f"/database/1/{container}/{env}/public"
        # Parsed key object, loaded on first use so the PEM is parsed only once.
        self._signing_key = None

    def _sign(self, date, body, path):
        """Return the base64 ECDSA/SHA256 signature of ``date:body_hash:path``."""
        key = getattr(self, '_signing_key', None)
        if key is None:
            key = serialization.load_pem_private_key(self.private_key, None, default_backend())
            self._signing_key = key
        body_hash = base64.b64encode(hashlib.sha256(body.encode()).digest()).decode()
        sig = key.sign(f"{date}:{body_hash}:{path}".encode(), ec.ECDSA(hashes.SHA256()))
        return base64.b64encode(sig).decode()

    def _headers(self, body, path):
        """Build the signed request headers for *body* posted to *path*."""
        date = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
        return {
            'Content-Type': 'application/json',
            'X-Apple-CloudKit-Request-KeyID': self.key_id,
            'X-Apple-CloudKit-Request-ISO8601Date': date,
            'X-Apple-CloudKit-Request-SignatureV1': self._sign(date, body, path),
        }

    def modify(self, operations):
        """Send a list of record operations to ``records/modify``.

        Returns:
            The decoded JSON response on HTTP 200, otherwise a dict with a
            single ``'error'`` key describing the failure (including network
            failures and timeouts).
        """
        path = f"{self.path_base}/records/modify"
        body = json.dumps({'operations': operations})
        headers = self._headers(body, path)
        # Report network problems the same way query() does, so one dropped
        # connection does not abort a long import with a traceback.
        try:
            r = requests.post(f"{HOST}{path}", headers=headers, data=body, timeout=60)
        except requests.exceptions.Timeout:
            return {'error': 'Request timed out after 60s'}
        except requests.exceptions.RequestException as e:
            return {'error': f"Request failed: {e}"}

        if r.status_code == 200:
            return r.json()

        try:
            err = r.json()
            reason = err.get('reason', 'Unknown')
            code = err.get('serverErrorCode', r.status_code)
            return {'error': f"{code}: {reason}"}
        except (ValueError, AttributeError):
            # Body was not JSON, or not a JSON object: fall back to raw text.
            return {'error': f"{r.status_code}: {r.text[:200]}"}

    def query(self, record_type, limit=200, verbose=False):
        """Query records of a given type.

        Args:
            record_type: CloudKit record type name.
            limit: Value sent as ``resultsLimit``.
            verbose: Print progress information when true.

        Returns:
            The decoded JSON response on HTTP 200, otherwise ``{'error': ...}``.
        """
        path = f"{self.path_base}/records/query"
        body = json.dumps({
            'query': {'recordType': record_type},
            'resultsLimit': limit
        })
        headers = self._headers(body, path)
        if verbose:
            print(f" Querying {record_type}...")
        try:
            r = requests.post(f"{HOST}{path}", headers=headers, data=body, timeout=30)
            if verbose:
                print(f" Response: {r.status_code}")
            if r.status_code == 200:
                result = r.json()
                if verbose:
                    print(f" Found {len(result.get('records', []))} records")
                return result
            return {'error': f"{r.status_code}: {r.text[:200]}"}
        except requests.exceptions.Timeout:
            return {'error': 'Request timed out after 30s'}
        except Exception as e:
            return {'error': f"Request failed: {e}"}

    def delete_all(self, record_type, verbose=False):
        """Delete all records of a given type.

        Repeatedly queries a page of records and deletes it until the query
        comes back empty. Stops early on a query/modify error, or when a pass
        deletes nothing (otherwise the same undeletable records would be
        fetched and retried forever).

        Returns:
            The number of records successfully deleted.
        """
        total_deleted = 0
        while True:
            result = self.query(record_type, verbose=verbose)
            if 'error' in result:
                print(f" Query error: {result['error']}")
                break

            records = result.get('records', [])
            if not records:
                break

            # Build delete operations (recordChangeTag required for delete)
            ops = [{
                'operationType': 'delete',
                'record': {
                    'recordName': r['recordName'],
                    'recordType': record_type,
                    'recordChangeTag': r.get('recordChangeTag', '')
                }
            } for r in records]

            if verbose:
                print(f" Sending delete for {len(ops)} records...")

            delete_result = self.modify(ops)

            if verbose:
                print(f" Delete response: {json.dumps(delete_result)[:500]}")

            if 'error' in delete_result:
                print(f" Delete error: {delete_result['error']}")
                break

            # Check for individual record errors
            result_records = delete_result.get('records', [])
            successful = [r for r in result_records if 'serverErrorCode' not in r]
            failed = [r for r in result_records if 'serverErrorCode' in r]

            if failed and verbose:
                print(f" Failed: {failed[0]}")

            total_deleted += len(successful)
            print(f" Deleted {len(successful)} {record_type} records" + (f" ({len(failed)} failed)" if failed else ""))

            if not successful:
                # Nothing was removed, so the next query would return the same
                # records again; give up instead of looping endlessly.
                print(f" No {record_type} records could be deleted; stopping")
                break

            time.sleep(0.5)

        return total_deleted
|
|
|
|
|
|
def import_data(ck, records, name, dry_run, verbose):
    """Upload *records* in batches of BATCH_SIZE using 'forceReplace' operations.

    Args:
        ck: Client object whose ``modify(ops)`` method sends one batch; unused
            when *dry_run* is true.
        records: List of record dictionaries to upload.
        name: Plural label used in progress messages (e.g. ``'games'``).
        dry_run: When true, only print what would be sent.
        verbose: When true, print per-batch details and failure samples.

    Returns:
        Number of records reported as created (or, in a dry run, the number
        that would have been sent).
    """
    imported = 0
    failed_batches = 0

    for batch_no, start in enumerate(range(0, len(records), BATCH_SIZE), 1):
        chunk = records[start:start + BATCH_SIZE]
        operations = [{'operationType': 'forceReplace', 'record': rec} for rec in chunk]

        if verbose:
            print(f" Batch {batch_no}: {len(chunk)} records, {len(operations)} ops")

        if not operations:
            print(f" Warning: Empty batch at index {start}, skipping")
            continue

        if dry_run:
            print(f" [DRY RUN] Would create {len(chunk)} {name}")
            imported += len(chunk)
            continue

        response = ck.modify(operations)
        if 'error' not in response:
            outcome = response.get('records', [])
            # A record entry carrying serverErrorCode was rejected individually.
            rejected = [entry for entry in outcome if 'serverErrorCode' in entry]
            created = len(outcome) - len(rejected)
            imported += created
            print(f" Created {created} {name}")
            if rejected:
                first = rejected[0]
                print(f" Failed {len(rejected)} records: {first.get('serverErrorCode')}: {first.get('reason')}")
                if verbose:
                    print(f" Response: {json.dumps(response, indent=2)[:1000]}")
        else:
            failed_batches += 1
            if failed_batches <= 3:  # Only show first 3 errors
                print(f" Error: {response['error']}")
                if verbose and chunk:
                    print(f" Sample record: {json.dumps(chunk[0], indent=2)[:500]}")
                if failed_batches == 3:
                    print(" (suppressing further errors...)")

        # Pause between real requests.
        time.sleep(0.5)

    if failed_batches > 0:
        print(f" Total errors: {failed_batches}")
    return imported
|
|
|
|
|
|
def _load_json(path):
    """Parse the UTF-8 JSON file at *path*, closing the file handle afterwards."""
    with open(path, encoding='utf-8') as fp:
        return json.load(fp)


def _date_to_ms(value):
    """Convert a 'YYYY-MM-DD' string to epoch milliseconds; None if it cannot be converted."""
    try:
        return int(datetime.strptime(value, '%Y-%m-%d').timestamp() * 1000)
    except (ValueError, TypeError, OverflowError, OSError):
        # Best effort: an unparseable or unrepresentable date is simply omitted.
        return None


def _add_validity_fields(fields, alias):
    """Copy the optional valid_from / valid_until dates of *alias* into *fields* as TIMESTAMPs."""
    for source_key, field_name in (('valid_from', 'validFrom'), ('valid_until', 'validUntil')):
        if alias.get(source_key):
            ms = _date_to_ms(alias[source_key])
            if ms is not None:
                fields[field_name] = {'value': ms, 'type': 'TIMESTAMP'}


def _game_datetime_ms(date_str, time_str):
    """Combine a 'YYYY-MM-DD' date and a time like '7:30p' / '10:00a' into epoch milliseconds.

    A missing or empty time defaults to 19:00. A trailing 'am'/'pm' is accepted
    as well as the single-letter form. Raises on unparseable input.
    """
    hour, minute = 19, 0
    if time_str:
        clean_time = time_str.lower().replace(' ', '')
        is_pm = 'p' in clean_time
        time_parts = clean_time.rstrip('apm').split(':')
        hour = int(time_parts[0])
        if is_pm and hour != 12:
            hour += 12
        elif not is_pm and hour == 12:
            hour = 0
        if len(time_parts) > 1:
            minute = int(time_parts[1])
    # Naive datetime: timestamp() interprets it in the machine's local timezone.
    dt = datetime.strptime(f"{date_str} {hour:02d}:{minute:02d}", '%Y-%m-%d %H:%M')
    return int(dt.timestamp() * 1000)


def main():
    """Command-line entry point: load the data files and import them into CloudKit.

    Steps:
      1. Parse flags; show the interactive menu when no action flag was given
         (or ``--interactive`` was passed) and map the choice back onto flags.
      2. Load stadiums, teams, games, league structure and alias JSON files
         from ``--data-dir`` (canonical names first, legacy names as fallback).
      3. Unless ``--dry-run``, create the CloudKit client from the key file.
      4. Optionally delete existing records (``--delete-all`` / ``--delete-only``).
      5. Build and upload records for each selected category and print totals.

    Exits via ``sys.exit`` with a message when required inputs are missing.
    """
    p = argparse.ArgumentParser(description='Import JSON to CloudKit')
    p.add_argument('--key-id', default=DEFAULT_KEY_ID)
    p.add_argument('--key-file', default=DEFAULT_KEY_FILE)
    p.add_argument('--container', default=CONTAINER)
    p.add_argument('--env', choices=['development', 'production'], default='development')
    p.add_argument('--data-dir', default='./data')
    p.add_argument('--stadiums-only', action='store_true')
    p.add_argument('--games-only', action='store_true')
    p.add_argument('--games-files', type=str, help='Comma-separated list of game files to import (e.g., mlb_2025.json,nba_2025.json)')
    p.add_argument('--league-structure-only', action='store_true', help='Import only league structure')
    p.add_argument('--team-aliases-only', action='store_true', help='Import only team aliases')
    p.add_argument('--stadium-aliases-only', action='store_true', help='Import only stadium aliases')
    p.add_argument('--canonical-only', action='store_true', help='Import only canonical data (league structure + team aliases + stadium aliases)')
    p.add_argument('--delete-all', action='store_true', help='Delete all records before importing')
    p.add_argument('--delete-only', action='store_true', help='Only delete records, do not import')
    p.add_argument('--dry-run', action='store_true')
    p.add_argument('--verbose', '-v', action='store_true')
    p.add_argument('--interactive', '-i', action='store_true', help='Show interactive menu')
    args = p.parse_args()

    # Show interactive menu if no action flags provided or --interactive
    has_action_flag = any([
        args.stadiums_only, args.games_only, args.games_files, args.league_structure_only,
        args.team_aliases_only, args.stadium_aliases_only, args.canonical_only,
        args.delete_all, args.delete_only, args.dry_run
    ])

    # Track selected game files (for option 4 or --games-files)
    selected_game_files = None
    if args.games_files:
        # Parse comma-separated list from command line. Empty entries (e.g. from
        # a trailing comma) are dropped: an empty name would resolve to the
        # games directory itself.
        selected_game_files = [f.strip() for f in args.games_files.split(',') if f.strip()]
        args.games_only = True  # Imply --games-only

    if args.interactive or not has_action_flag:
        choice = show_menu()
        if choice is None:
            return

        # Map menu choice to flags
        if choice == 1:  # Import all
            pass  # Default behavior
        elif choice == 2:  # Stadiums only
            args.stadiums_only = True
        elif choice == 3:  # Games only (all files)
            args.games_only = True
        elif choice == 4:  # Games - select specific files
            args.games_only = True
            selected_game_files = show_game_files_menu(Path(args.data_dir))
            if not selected_game_files:
                print("No files selected. Exiting.")
                return
        elif choice == 5:  # League structure only
            args.league_structure_only = True
        elif choice == 6:  # Team aliases only
            args.team_aliases_only = True
        elif choice == 7:  # Stadium aliases only
            args.stadium_aliases_only = True
        elif choice == 8:  # Canonical only
            args.canonical_only = True
        elif choice == 9:  # Delete all then import
            args.delete_all = True
        elif choice == 10:  # Delete only
            args.delete_only = True
        elif choice == 11:  # Dry run
            args.dry_run = True

    print(f"\n{'='*50}")
    print(f"CloudKit Import {'(DRY RUN)' if args.dry_run else ''}")
    print(f"{'='*50}")
    print(f"Container: {args.container}")
    print(f"Environment: {args.env}\n")

    data_dir = Path(args.data_dir)

    # Load canonical format files (from canonicalization pipeline)
    # Fall back to legacy format for backward compatibility
    if (data_dir / 'stadiums_canonical.json').exists():
        stadiums = _load_json(data_dir / 'stadiums_canonical.json')
        use_canonical = True
    elif (data_dir / 'stadiums.json').exists():
        stadiums = _load_json(data_dir / 'stadiums.json')
        use_canonical = False
    else:
        sys.exit(f"Error: No stadiums_canonical.json or stadiums.json found in {data_dir}")

    if (data_dir / 'teams_canonical.json').exists():
        teams = _load_json(data_dir / 'teams_canonical.json')
    else:
        teams = []  # Legacy: extracted from stadiums

    # Load games: try new structure first (canonical/games/*.json), then fallback
    canonical_games_dir = data_dir / 'canonical' / 'games'
    games = []
    games_source = None

    if selected_game_files:
        # Load only the selected files
        for filename in selected_game_files:
            filepath = canonical_games_dir / filename
            if filepath.is_file():
                file_games = _load_json(filepath)
                games.extend(file_games)
                print(f" Loading {filename}: {len(file_games):,} games")
            else:
                print(f" Warning: {filename} not found in {canonical_games_dir}, skipping")
        games_source = f"selected files: {', '.join(selected_game_files)}"
    elif canonical_games_dir.exists() and any(canonical_games_dir.glob('*.json')):
        # New structure: load all sport/season files
        for games_file in sorted(canonical_games_dir.glob('*.json')):
            games.extend(_load_json(games_file))
        games_source = "canonical/games/*.json"
    elif (data_dir / 'games_canonical.json').exists():
        games = _load_json(data_dir / 'games_canonical.json')
        games_source = "games_canonical.json"
    elif (data_dir / 'games.json').exists():
        games = _load_json(data_dir / 'games.json')
        games_source = "games.json (legacy)"

    league_structure = _load_json(data_dir / 'league_structure.json') if (data_dir / 'league_structure.json').exists() else []
    team_aliases = _load_json(data_dir / 'team_aliases.json') if (data_dir / 'team_aliases.json').exists() else []
    stadium_aliases = _load_json(data_dir / 'stadium_aliases.json') if (data_dir / 'stadium_aliases.json').exists() else []

    print(f"Using {'canonical' if use_canonical else 'legacy'} format")
    print(f"Loaded {len(stadiums)} stadiums, {len(teams)} teams, {len(games)} games")
    if games_source:
        print(f" Games loaded from: {games_source}")
    print(f"Loaded {len(league_structure)} league structures, {len(team_aliases)} team aliases, {len(stadium_aliases)} stadium aliases\n")

    ck = None
    if not args.dry_run:
        if not HAS_CRYPTO:
            sys.exit("Error: pip install cryptography")
        if not os.path.exists(args.key_file):
            sys.exit(f"Error: Key file not found: {args.key_file}")
        with open(args.key_file, 'rb') as key_fp:
            private_key = key_fp.read()
        ck = CloudKit(args.key_id, private_key, args.container, args.env)

    # Handle deletion
    if args.delete_all or args.delete_only:
        # ck is None only in dry-run mode; deletion has no dry-run preview.
        if not ck:
            sys.exit("Error: --key-id and --key-file required for deletion")

        print("--- Deleting Existing Records ---")
        # Delete in order: dependent records first, then base records
        for record_type in ['Game', 'TeamAlias', 'StadiumAlias', 'Team', 'LeagueStructure', 'Stadium']:
            print(f" Deleting {record_type} records...")
            deleted = ck.delete_all(record_type, verbose=args.verbose)
            print(f" Deleted {deleted} {record_type} records")

        if args.delete_only:
            print(f"\n{'='*50}")
            print("DELETE COMPLETE")
            print()
            return

    stats = {'stadiums': 0, 'teams': 0, 'games': 0, 'league_structures': 0, 'team_aliases': 0, 'stadium_aliases': 0}

    # Determine what to import based on flags.
    # NOTE(review): --stadiums-only does not suppress teams (import_teams never
    # checks it) — kept as-is; confirm whether that is intended.
    import_stadiums = not (args.games_only or args.league_structure_only or args.team_aliases_only or args.stadium_aliases_only or args.canonical_only)
    import_teams = not (args.games_only or args.league_structure_only or args.team_aliases_only or args.stadium_aliases_only or args.canonical_only)
    import_games = not (args.stadiums_only or args.league_structure_only or args.team_aliases_only or args.stadium_aliases_only or args.canonical_only)
    import_league_structure = args.league_structure_only or args.canonical_only or not (args.stadiums_only or args.games_only or args.team_aliases_only or args.stadium_aliases_only)
    import_team_aliases = args.team_aliases_only or args.canonical_only or not (args.stadiums_only or args.games_only or args.league_structure_only or args.stadium_aliases_only)
    import_stadium_aliases = args.stadium_aliases_only or args.canonical_only or not (args.stadiums_only or args.games_only or args.league_structure_only or args.team_aliases_only)

    # Build stadium ID lookup
    # Canonical format uses canonical_id, legacy uses id
    def get_stadium_id(s):
        return s.get('canonical_id', s.get('id', ''))

    def get_team_id(t):
        return t.get('canonical_id', '')

    stadium_id_map = {get_stadium_id(s): deterministic_uuid(get_stadium_id(s)) for s in stadiums}

    # Import stadiums
    if import_stadiums:
        print("--- Stadiums ---")
        recs = []
        for s in stadiums:
            stadium_id = get_stadium_id(s)
            record_name = deterministic_uuid(stadium_id)
            # Canonical format uses primary_team_abbrevs, legacy uses team_abbrevs
            team_abbrevs = s.get('primary_team_abbrevs', s.get('team_abbrevs', []))

            fields = {
                'stadiumId': {'value': record_name},
                'canonicalId': {'value': stadium_id},  # Store canonical_id as string
                'name': {'value': s['name']},
                'city': {'value': s['city']},
                'state': {'value': s.get('state', '')},
                'sport': {'value': s['sport']},
                'source': {'value': s.get('source', 'canonical')},
                'teamAbbrevs': {'value': team_abbrevs},
            }
            if s.get('latitude'):
                fields['location'] = {'value': {'latitude': s['latitude'], 'longitude': s['longitude']}}
            if s.get('capacity'):
                fields['capacity'] = {'value': s['capacity']}

            recs.append({'recordType': 'Stadium', 'recordName': record_name, 'fields': fields})
        stats['stadiums'] = import_data(ck, recs, 'stadiums', args.dry_run, args.verbose)

    # Import teams (canonical format has dedicated teams file)
    if import_teams:
        print("--- Teams ---")
        if teams:
            # Canonical format: use teams_canonical.json
            recs = []
            for t in teams:
                team_id = get_team_id(t)
                record_name = deterministic_uuid(team_id)

                fields = {
                    'teamId': {'value': record_name},
                    'canonicalId': {'value': team_id},  # Store canonical_id as string
                    'abbreviation': {'value': t['abbreviation']},
                    'name': {'value': t['name']},
                    'city': {'value': t['city']},
                    'sport': {'value': t['sport']},
                    'stadiumCanonicalId': {'value': t.get('stadium_canonical_id', '')},
                }
                if t.get('conference_id'):
                    fields['conferenceId'] = {'value': t['conference_id']}
                if t.get('division_id'):
                    fields['divisionId'] = {'value': t['division_id']}

                recs.append({'recordType': 'Team', 'recordName': record_name, 'fields': fields})
            stats['teams'] = import_data(ck, recs, 'teams', args.dry_run, args.verbose)
        else:
            # Legacy format: extract teams from stadiums (first stadium listing
            # a sport/abbreviation pair supplies the city).
            teams_dict = {}
            for s in stadiums:
                team_abbrevs = s.get('primary_team_abbrevs', s.get('team_abbrevs', []))
                for abbr in team_abbrevs:
                    team_key = f"{s['sport']}_{abbr}"
                    if team_key not in teams_dict:
                        teams_dict[team_key] = {'abbr': abbr, 'city': s['city'], 'sport': s['sport']}

            recs = [{
                'recordType': 'Team', 'recordName': deterministic_uuid(team_key),
                'fields': {
                    'teamId': {'value': deterministic_uuid(team_key)},
                    'canonicalId': {'value': team_key},
                    'abbreviation': {'value': info['abbr']},
                    'name': {'value': info['abbr']},
                    'city': {'value': info['city']},
                    'sport': {'value': info['sport']},
                }
            } for team_key, info in teams_dict.items()]
            stats['teams'] = import_data(ck, recs, 'teams', args.dry_run, args.verbose)

    # Import games
    if import_games and games:
        # Detect canonical game format (has canonical_id field)
        use_canonical_games = 'canonical_id' in games[0]

        # Build team -> stadium map for stadiumRef (legacy format needs this)
        team_stadium_map = {}
        for s in stadiums:
            stadium_uuid = stadium_id_map[get_stadium_id(s)]
            team_abbrevs = s.get('primary_team_abbrevs', s.get('team_abbrevs', []))
            for abbr in team_abbrevs:
                team_stadium_map[(s['sport'], abbr)] = stadium_uuid

        print("--- Games ---")
        print(f" Using {'canonical' if use_canonical_games else 'legacy'} game format")

        # Deduplicate games by ID (canonical_id or id), keeping the first occurrence
        seen_ids = set()
        unique_games = []
        for g in games:
            game_id = g.get('canonical_id', g.get('id', ''))
            if game_id not in seen_ids:
                seen_ids.add(game_id)
                unique_games.append(g)

        if len(unique_games) < len(games):
            print(f" Removed {len(games) - len(unique_games)} duplicate games")

        recs = []
        for g in unique_games:
            # Get game ID (canonical or legacy)
            game_id = g.get('canonical_id', g.get('id', ''))
            game_uuid = deterministic_uuid(game_id)
            sport = g['sport']

            fields = {
                'gameId': {'value': game_uuid},
                'canonicalId': {'value': game_id},  # Store canonical_id as string
                'sport': {'value': sport},
                'season': {'value': g.get('season', '')},
                'source': {'value': g.get('source', 'canonical' if use_canonical_games else '')},
            }

            # Parse date/time; a game with an unparseable date is still imported,
            # just without a dateTime field.
            if g.get('date'):
                try:
                    # CloudKit expects TIMESTAMP type with milliseconds since epoch
                    fields['dateTime'] = {
                        'value': _game_datetime_ms(g['date'], g.get('time', '7:00p')),
                        'type': 'TIMESTAMP',
                    }
                except (ValueError, TypeError, AttributeError, OverflowError, OSError) as e:
                    if args.verbose:
                        print(f" Warning: Failed to parse date/time for {game_id}: {e}")

            # Team references
            if use_canonical_games:
                # Canonical format: the team record name is the deterministic
                # UUID of the team's canonical ID.
                home_team_uuid = deterministic_uuid(g.get('home_team_canonical_id', ''))
                away_team_uuid = deterministic_uuid(g.get('away_team_canonical_id', ''))
            else:
                # Legacy format: use "<sport>_<abbrev>" keys
                home_team_uuid = deterministic_uuid(f"{sport}_{g.get('home_team_abbrev', '')}")
                away_team_uuid = deterministic_uuid(f"{sport}_{g.get('away_team_abbrev', '')}")

            fields['homeTeamRef'] = {'value': {'recordName': home_team_uuid, 'action': 'NONE'}}
            fields['awayTeamRef'] = {'value': {'recordName': away_team_uuid, 'action': 'NONE'}}

            # Stadium reference
            if use_canonical_games and g.get('stadium_canonical_id'):
                # Canonical format: use stadium_canonical_id directly
                stadium_canonical_id = g['stadium_canonical_id']
                stadium_uuid = stadium_id_map.get(stadium_canonical_id)
                if stadium_uuid:
                    fields['stadiumRef'] = {'value': {'recordName': stadium_uuid, 'action': 'NONE'}}
                    fields['stadiumCanonicalId'] = {'value': stadium_canonical_id}
            else:
                # Legacy format: look up by home team abbrev
                stadium_uuid = team_stadium_map.get((sport, g.get('home_team_abbrev', '')))
                if stadium_uuid:
                    fields['stadiumRef'] = {'value': {'recordName': stadium_uuid, 'action': 'NONE'}}

            recs.append({'recordType': 'Game', 'recordName': game_uuid, 'fields': fields})

        stats['games'] = import_data(ck, recs, 'games', args.dry_run, args.verbose)

    # Import league structure
    if import_league_structure and league_structure:
        print("--- League Structure ---")
        now_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
        recs = [{
            'recordType': 'LeagueStructure',
            'recordName': ls['id'],  # Use the id as recordName
            'fields': {
                'structureId': {'value': ls['id']},
                'sport': {'value': ls['sport']},
                'type': {'value': ls['type']},
                'name': {'value': ls['name']},
                'displayOrder': {'value': ls['display_order']},
                'schemaVersion': {'value': 1},
                'lastModified': {'value': now_ms, 'type': 'TIMESTAMP'},
                **({'abbreviation': {'value': ls['abbreviation']}} if ls.get('abbreviation') else {}),
                **({'parentId': {'value': ls['parent_id']}} if ls.get('parent_id') else {}),
            }
        } for ls in league_structure]
        stats['league_structures'] = import_data(ck, recs, 'league structures', args.dry_run, args.verbose)

    # Import team aliases
    if import_team_aliases and team_aliases:
        print("--- Team Aliases ---")
        now_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
        recs = []
        for ta in team_aliases:
            fields = {
                'aliasId': {'value': ta['id']},
                'teamCanonicalId': {'value': ta['team_canonical_id']},
                'aliasType': {'value': ta['alias_type']},
                'aliasValue': {'value': ta['alias_value']},
                'schemaVersion': {'value': 1},
                'lastModified': {'value': now_ms, 'type': 'TIMESTAMP'},
            }
            # Add optional date fields
            _add_validity_fields(fields, ta)
            recs.append({
                'recordType': 'TeamAlias',
                'recordName': ta['id'],  # Use the id as recordName
                'fields': fields
            })
        stats['team_aliases'] = import_data(ck, recs, 'team aliases', args.dry_run, args.verbose)

    # Import stadium aliases
    if import_stadium_aliases and stadium_aliases:
        print("--- Stadium Aliases ---")
        now_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
        recs = []
        for sa in stadium_aliases:
            fields = {
                'aliasName': {'value': sa['alias_name'].lower()},  # Normalize to lowercase
                'stadiumCanonicalId': {'value': sa['stadium_canonical_id']},
                'schemaVersion': {'value': 1},
                'lastModified': {'value': now_ms, 'type': 'TIMESTAMP'},
            }
            # Add optional date fields
            _add_validity_fields(fields, sa)
            # Extract sport from stadium_canonical_id (e.g., "stadium_nba_td_garden" -> "nba")
            # This makes record names unique for shared venues (TD Garden has NBA and NHL entries)
            stadium_id = sa['stadium_canonical_id']
            sport = stadium_id.split('_')[1] if '_' in stadium_id else 'unknown'
            record_name = f"{sport}_{sa['alias_name'].lower()}"
            recs.append({
                'recordType': 'StadiumAlias',
                'recordName': record_name,
                'fields': fields
            })
        stats['stadium_aliases'] = import_data(ck, recs, 'stadium aliases', args.dry_run, args.verbose)

    print(f"\n{'='*50}")
    print(f"COMPLETE: {stats['stadiums']} stadiums, {stats['teams']} teams, {stats['games']} games, {stats['league_structures']} league structures, {stats['team_aliases']} team aliases, {stats['stadium_aliases']} stadium aliases")
    if args.dry_run:
        print("[DRY RUN - nothing imported]")
    print()
|
|
|
|
|
# Run the importer only when this file is executed as a script (not on import).
if __name__ == '__main__':
    main()
|