feat(scripts): add sportstime-parser data pipeline

Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-20 18:56:25 -06:00
parent ac78042a7e
commit 52d445bca4
76 changed files with 25065 additions and 0 deletions

View File

@@ -0,0 +1,52 @@
"""CloudKit uploaders for sportstime-parser."""
from .cloudkit import (
CloudKitClient,
CloudKitRecord,
CloudKitError,
CloudKitAuthError,
CloudKitRateLimitError,
CloudKitServerError,
RecordType,
OperationResult,
BatchResult,
)
from .state import (
RecordState,
UploadSession,
StateManager,
)
from .diff import (
DiffAction,
RecordDiff,
DiffResult,
RecordDiffer,
game_to_cloudkit_record,
team_to_cloudkit_record,
stadium_to_cloudkit_record,
)
__all__ = [
# CloudKit client
"CloudKitClient",
"CloudKitRecord",
"CloudKitError",
"CloudKitAuthError",
"CloudKitRateLimitError",
"CloudKitServerError",
"RecordType",
"OperationResult",
"BatchResult",
# State manager
"RecordState",
"UploadSession",
"StateManager",
# Differ
"DiffAction",
"RecordDiff",
"DiffResult",
"RecordDiffer",
"game_to_cloudkit_record",
"team_to_cloudkit_record",
"stadium_to_cloudkit_record",
]

View File

@@ -0,0 +1,578 @@
"""CloudKit Web Services client for sportstime-parser.
This module provides a client for uploading data to CloudKit using the
CloudKit Web Services API. It handles JWT authentication, request signing,
and batch operations.
Reference: https://developer.apple.com/documentation/cloudkitwebservices
"""
import base64
import hashlib
import json
import os
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Optional
from enum import Enum
import jwt
import requests
from cryptography.hazmat.primitives import hashes, serialization
from cryptography.hazmat.primitives.asymmetric import ec
from cryptography.hazmat.backends import default_backend
from ..config import (
CLOUDKIT_CONTAINER_ID,
CLOUDKIT_ENVIRONMENT,
CLOUDKIT_BATCH_SIZE,
CLOUDKIT_KEY_ID,
CLOUDKIT_PRIVATE_KEY_PATH,
)
from ..utils.logging import get_logger
class RecordType(str, Enum):
"""CloudKit record types for SportsTime.
Must match CKRecordType constants in CKModels.swift.
"""
GAME = "Game"
TEAM = "Team"
STADIUM = "Stadium"
TEAM_ALIAS = "TeamAlias"
STADIUM_ALIAS = "StadiumAlias"
SPORT = "Sport"
LEAGUE_STRUCTURE = "LeagueStructure"
TRIP_POLL = "TripPoll"
POLL_VOTE = "PollVote"
ITINERARY_ITEM = "ItineraryItem"
@dataclass
class CloudKitRecord:
"""Represents a CloudKit record for upload.
Attributes:
record_name: Unique record identifier (canonical ID)
record_type: CloudKit record type
fields: Dictionary of field name -> field value
record_change_tag: Version tag for conflict detection (None for new records)
"""
record_name: str
record_type: RecordType
fields: dict[str, Any]
record_change_tag: Optional[str] = None
def to_cloudkit_dict(self) -> dict:
"""Convert to CloudKit API format."""
record = {
"recordName": self.record_name,
"recordType": self.record_type.value,
"fields": self._format_fields(),
}
if self.record_change_tag:
record["recordChangeTag"] = self.record_change_tag
return record
def _format_fields(self) -> dict:
"""Format fields for CloudKit API."""
formatted = {}
for key, value in self.fields.items():
if value is None:
continue
formatted[key] = self._format_field_value(value)
return formatted
def _format_field_value(self, value: Any) -> dict:
"""Format a single field value for CloudKit API."""
# Check bool BEFORE int (bool is a subclass of int in Python)
if isinstance(value, bool):
return {"value": 1 if value else 0, "type": "INT64"}
elif isinstance(value, str):
return {"value": value, "type": "STRING"}
elif isinstance(value, int):
return {"value": value, "type": "INT64"}
elif isinstance(value, float):
return {"value": value, "type": "DOUBLE"}
elif isinstance(value, datetime):
# CloudKit expects milliseconds since epoch
timestamp_ms = int(value.timestamp() * 1000)
return {"value": timestamp_ms, "type": "TIMESTAMP"}
elif isinstance(value, list):
return {"value": value, "type": "STRING_LIST"}
elif isinstance(value, dict) and "latitude" in value and "longitude" in value:
return {
"value": {
"latitude": value["latitude"],
"longitude": value["longitude"],
},
"type": "LOCATION",
}
else:
# Default to string
return {"value": str(value), "type": "STRING"}
@dataclass
class OperationResult:
"""Result of a CloudKit operation."""
record_name: str
success: bool
record_change_tag: Optional[str] = None
error_code: Optional[str] = None
error_message: Optional[str] = None
@dataclass
class BatchResult:
"""Result of a batch CloudKit operation."""
successful: list[OperationResult] = field(default_factory=list)
failed: list[OperationResult] = field(default_factory=list)
@property
def all_succeeded(self) -> bool:
return len(self.failed) == 0
@property
def success_count(self) -> int:
return len(self.successful)
@property
def failure_count(self) -> int:
return len(self.failed)
class CloudKitClient:
"""Client for CloudKit Web Services API.
Handles authentication via server-to-server JWT tokens and provides
methods for CRUD operations on CloudKit records.
Authentication requires:
- Key ID: CloudKit key identifier from Apple Developer Portal
- Private Key: EC private key in PEM format
Environment variables:
- CLOUDKIT_KEY_ID: The key identifier
- CLOUDKIT_PRIVATE_KEY_PATH: Path to the private key file
- CLOUDKIT_PRIVATE_KEY: The private key contents (alternative to path)
"""
BASE_URL = "https://api.apple-cloudkit.com"
TOKEN_EXPIRY_SECONDS = 3600 # 1 hour
def __init__(
self,
container_id: str = CLOUDKIT_CONTAINER_ID,
environment: str = CLOUDKIT_ENVIRONMENT,
key_id: Optional[str] = None,
private_key: Optional[str] = None,
private_key_path: Optional[str] = None,
):
"""Initialize the CloudKit client.
Args:
container_id: CloudKit container identifier
environment: 'development' or 'production'
key_id: CloudKit server-to-server key ID
private_key: PEM-encoded EC private key contents
private_key_path: Path to PEM-encoded EC private key file
"""
self.container_id = container_id
self.environment = environment
self.logger = get_logger()
# Load authentication credentials (config defaults > env vars > None)
self.key_id = key_id or os.environ.get("CLOUDKIT_KEY_ID") or CLOUDKIT_KEY_ID
if private_key:
self._private_key_pem = private_key
elif private_key_path:
self._private_key_pem = Path(private_key_path).read_text()
elif os.environ.get("CLOUDKIT_PRIVATE_KEY"):
self._private_key_pem = os.environ["CLOUDKIT_PRIVATE_KEY"]
elif os.environ.get("CLOUDKIT_PRIVATE_KEY_PATH"):
self._private_key_pem = Path(os.environ["CLOUDKIT_PRIVATE_KEY_PATH"]).read_text()
elif CLOUDKIT_PRIVATE_KEY_PATH.exists():
self._private_key_pem = CLOUDKIT_PRIVATE_KEY_PATH.read_text()
else:
self._private_key_pem = None
# Parse the private key if available
self._private_key = None
if self._private_key_pem:
self._private_key = serialization.load_pem_private_key(
self._private_key_pem.encode(),
password=None,
backend=default_backend(),
)
# Token cache
self._token: Optional[str] = None
self._token_expiry: float = 0
# Session for connection pooling
self._session = requests.Session()
@property
def is_configured(self) -> bool:
"""Check if the client has valid authentication credentials."""
return bool(self.key_id and self._private_key)
def _get_api_path(self, operation: str) -> str:
"""Build the full API path for an operation."""
return f"/database/1/{self.container_id}/{self.environment}/public/{operation}"
def _get_token(self) -> str:
"""Get a valid JWT token, generating a new one if needed."""
if not self.is_configured:
raise ValueError(
"CloudKit client not configured. Set CLOUDKIT_KEY_ID and "
"CLOUDKIT_PRIVATE_KEY_PATH environment variables."
)
now = time.time()
# Return cached token if still valid (with 5 min buffer)
if self._token and (self._token_expiry - now) > 300:
return self._token
# Generate new token
expiry = now + self.TOKEN_EXPIRY_SECONDS
payload = {
"iss": self.key_id,
"iat": int(now),
"exp": int(expiry),
"sub": self.container_id,
}
self._token = jwt.encode(
payload,
self._private_key,
algorithm="ES256",
)
self._token_expiry = expiry
return self._token
def _sign_request(self, method: str, path: str, body: Optional[bytes] = None) -> dict:
"""Generate request headers with authentication.
Args:
method: HTTP method
path: API path
body: Request body bytes
Returns:
Dictionary of headers to include in the request
"""
token = self._get_token()
# CloudKit uses date in ISO format
date_str = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
# Calculate body hash
if body:
body_hash = base64.b64encode(
hashlib.sha256(body).digest()
).decode()
else:
body_hash = base64.b64encode(
hashlib.sha256(b"").digest()
).decode()
# Build the message to sign
# Format: date:body_hash:path
message = f"{date_str}:{body_hash}:{path}"
# Sign the message
signature = self._private_key.sign(
message.encode(),
ec.ECDSA(hashes.SHA256()),
)
signature_b64 = base64.b64encode(signature).decode()
return {
"Authorization": f"Bearer {token}",
"X-Apple-CloudKit-Request-KeyID": self.key_id,
"X-Apple-CloudKit-Request-ISO8601Date": date_str,
"X-Apple-CloudKit-Request-SignatureV1": signature_b64,
"Content-Type": "application/json",
}
def _request(
self,
method: str,
operation: str,
body: Optional[dict] = None,
) -> dict:
"""Make a request to the CloudKit API.
Args:
method: HTTP method
operation: API operation path
body: Request body as dictionary
Returns:
Response data as dictionary
Raises:
CloudKitError: If the request fails
"""
path = self._get_api_path(operation)
url = f"{self.BASE_URL}{path}"
body_bytes = json.dumps(body).encode() if body else None
headers = self._sign_request(method, path, body_bytes)
response = self._session.request(
method=method,
url=url,
headers=headers,
data=body_bytes,
)
if response.status_code == 200:
return response.json()
elif response.status_code == 421:
# Authentication required - token may be expired
self._token = None
raise CloudKitAuthError("Authentication failed - check credentials")
elif response.status_code == 429:
raise CloudKitRateLimitError("Rate limit exceeded")
elif response.status_code >= 500:
raise CloudKitServerError(f"Server error: {response.status_code}")
else:
try:
error_data = response.json()
error_msg = error_data.get("serverErrorCode", str(response.status_code))
except (json.JSONDecodeError, KeyError):
error_msg = response.text
raise CloudKitError(f"Request failed: {error_msg}")
def fetch_records(
self,
record_type: RecordType,
record_names: Optional[list[str]] = None,
limit: int = 200,
) -> list[dict]:
"""Fetch records from CloudKit.
Args:
record_type: Type of records to fetch
record_names: Specific record names to fetch (optional)
limit: Maximum records to return (default 200)
Returns:
List of record dictionaries
"""
if record_names:
# Fetch specific records by name
body = {
"records": [{"recordName": name} for name in record_names],
}
response = self._request("POST", "records/lookup", body)
else:
# Query all records of type
body = {
"query": {
"recordType": record_type.value,
},
"resultsLimit": limit,
}
response = self._request("POST", "records/query", body)
records = response.get("records", [])
return [r for r in records if "recordName" in r]
def fetch_all_records(self, record_type: RecordType) -> list[dict]:
"""Fetch all records of a type using pagination.
Args:
record_type: Type of records to fetch
Returns:
List of all record dictionaries
"""
all_records = []
continuation_marker = None
while True:
body = {
"query": {
"recordType": record_type.value,
},
"resultsLimit": 200,
}
if continuation_marker:
body["continuationMarker"] = continuation_marker
response = self._request("POST", "records/query", body)
records = response.get("records", [])
all_records.extend([r for r in records if "recordName" in r])
continuation_marker = response.get("continuationMarker")
if not continuation_marker:
break
return all_records
def save_records(self, records: list[CloudKitRecord]) -> BatchResult:
"""Save records to CloudKit (create or update).
Args:
records: List of records to save
Returns:
BatchResult with success/failure details
"""
result = BatchResult()
# Process in batches
for i in range(0, len(records), CLOUDKIT_BATCH_SIZE):
batch = records[i:i + CLOUDKIT_BATCH_SIZE]
batch_result = self._save_batch(batch)
result.successful.extend(batch_result.successful)
result.failed.extend(batch_result.failed)
return result
def _save_batch(self, records: list[CloudKitRecord]) -> BatchResult:
"""Save a single batch of records.
Args:
records: List of records (max CLOUDKIT_BATCH_SIZE)
Returns:
BatchResult with success/failure details
"""
result = BatchResult()
operations = []
for record in records:
op = {
"operationType": "forceReplace",
"record": record.to_cloudkit_dict(),
}
operations.append(op)
body = {"operations": operations}
try:
response = self._request("POST", "records/modify", body)
except CloudKitError as e:
# Entire batch failed
for record in records:
result.failed.append(OperationResult(
record_name=record.record_name,
success=False,
error_message=str(e),
))
return result
# Process individual results
for record_data in response.get("records", []):
record_name = record_data.get("recordName", "unknown")
if "serverErrorCode" in record_data:
result.failed.append(OperationResult(
record_name=record_name,
success=False,
error_code=record_data.get("serverErrorCode"),
error_message=record_data.get("reason"),
))
else:
result.successful.append(OperationResult(
record_name=record_name,
success=True,
record_change_tag=record_data.get("recordChangeTag"),
))
return result
def delete_records(
self,
record_type: RecordType,
records: list[dict],
) -> BatchResult:
"""Delete records from CloudKit.
Args:
record_type: Type of records to delete
records: List of record dicts (must have recordName and recordChangeTag)
Returns:
BatchResult with success/failure details
"""
result = BatchResult()
# Process in batches
for i in range(0, len(records), CLOUDKIT_BATCH_SIZE):
batch = records[i:i + CLOUDKIT_BATCH_SIZE]
operations = []
for record in batch:
operations.append({
"operationType": "delete",
"record": {
"recordName": record["recordName"],
"recordChangeTag": record.get("recordChangeTag"),
},
})
body = {"operations": operations}
try:
response = self._request("POST", "records/modify", body)
except CloudKitError as e:
for record in batch:
result.failed.append(OperationResult(
record_name=record["recordName"],
success=False,
error_message=str(e),
))
continue
for record_data in response.get("records", []):
record_name = record_data.get("recordName", "unknown")
if "serverErrorCode" in record_data:
result.failed.append(OperationResult(
record_name=record_name,
success=False,
error_code=record_data.get("serverErrorCode"),
error_message=record_data.get("reason"),
))
else:
result.successful.append(OperationResult(
record_name=record_name,
success=True,
))
return result
class CloudKitError(Exception):
"""Base exception for CloudKit errors."""
pass
class CloudKitAuthError(CloudKitError):
"""Authentication error."""
pass
class CloudKitRateLimitError(CloudKitError):
"""Rate limit exceeded."""
pass
class CloudKitServerError(CloudKitError):
"""Server-side error."""
pass

View File

@@ -0,0 +1,741 @@
"""Record differ for CloudKit uploads.
This module compares local records with CloudKit records to determine
what needs to be created, updated, or deleted.
Field names must match CKModels.swift exactly:
- Stadium: stadiumId, canonicalId, name, city, state, location (CLLocation),
capacity, yearOpened, imageURL, sport
- Team: teamId, canonicalId, name, abbreviation, sport, city, stadiumCanonicalId,
logoURL, primaryColor, secondaryColor
- Game: gameId, canonicalId, homeTeamCanonicalId, awayTeamCanonicalId,
stadiumCanonicalId, dateTime, sport, season, isPlayoff, broadcastInfo
- TeamAlias: aliasId, teamCanonicalId, aliasType, aliasValue, validFrom, validUntil
- StadiumAlias: aliasName, stadiumCanonicalId, validFrom, validUntil
- Sport: sportId, abbreviation, displayName, iconName, colorHex,
seasonStartMonth, seasonEndMonth, isActive
- LeagueStructure: structureId, sport, type, name, abbreviation, parentId, displayOrder
"""
from dataclasses import dataclass, field
from datetime import datetime, date
from enum import Enum
from typing import Any, Optional
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import TeamAlias, StadiumAlias, AliasType
from ..models.sport import Sport, LeagueStructure
from .cloudkit import CloudKitRecord, RecordType
def _date_to_datetime(d: Optional[date]) -> Optional[datetime]:
"""Convert a date to a datetime at midnight UTC.
CloudKit TIMESTAMP fields require datetime, not date.
"""
if d is None:
return None
return datetime(d.year, d.month, d.day, 0, 0, 0)
class DiffAction(str, Enum):
"""Action to take for a record."""
CREATE = "create"
UPDATE = "update"
DELETE = "delete"
UNCHANGED = "unchanged"
@dataclass
class RecordDiff:
"""Represents the difference between local and remote records.
Attributes:
record_name: Canonical record ID
record_type: CloudKit record type
action: Action to take (create, update, delete, unchanged)
local_record: Local CloudKitRecord (None if delete)
remote_record: Remote record dict (None if create)
changed_fields: List of field names that changed (for update)
record_change_tag: Remote record's change tag (for update)
"""
record_name: str
record_type: RecordType
action: DiffAction
local_record: Optional[CloudKitRecord] = None
remote_record: Optional[dict] = None
changed_fields: list[str] = field(default_factory=list)
record_change_tag: Optional[str] = None
@dataclass
class DiffResult:
"""Result of diffing local and remote records.
Attributes:
creates: Records to create
updates: Records to update
deletes: Records to delete (record names)
unchanged: Records with no changes
"""
creates: list[RecordDiff] = field(default_factory=list)
updates: list[RecordDiff] = field(default_factory=list)
deletes: list[RecordDiff] = field(default_factory=list)
unchanged: list[RecordDiff] = field(default_factory=list)
@property
def create_count(self) -> int:
return len(self.creates)
@property
def update_count(self) -> int:
return len(self.updates)
@property
def delete_count(self) -> int:
return len(self.deletes)
@property
def unchanged_count(self) -> int:
return len(self.unchanged)
@property
def total_changes(self) -> int:
return self.create_count + self.update_count + self.delete_count
def get_records_to_upload(self) -> list[CloudKitRecord]:
"""Get all records that need to be uploaded (creates + updates)."""
records = []
for diff in self.creates:
if diff.local_record:
records.append(diff.local_record)
for diff in self.updates:
if diff.local_record:
# Add change tag for update
diff.local_record.record_change_tag = diff.record_change_tag
records.append(diff.local_record)
return records
class RecordDiffer:
"""Compares local records with CloudKit records.
Field names must match CKModels.swift field keys exactly (camelCase).
"""
# Fields to compare for each record type (matching CKModels.swift keys)
GAME_FIELDS = [
"gameId", "canonicalId", "sport", "season", "dateTime",
"homeTeamCanonicalId", "awayTeamCanonicalId", "stadiumCanonicalId",
"isPlayoff", "broadcastInfo",
]
TEAM_FIELDS = [
"teamId", "canonicalId", "sport", "city", "name", "abbreviation",
"stadiumCanonicalId", "logoURL", "primaryColor", "secondaryColor",
]
STADIUM_FIELDS = [
"stadiumId", "canonicalId", "sport", "name", "city", "state",
"location", "capacity", "yearOpened", "imageURL",
]
TEAM_ALIAS_FIELDS = [
"aliasId", "teamCanonicalId", "aliasType", "aliasValue",
"validFrom", "validUntil",
]
STADIUM_ALIAS_FIELDS = [
"aliasName", "stadiumCanonicalId", "validFrom", "validUntil",
]
SPORT_FIELDS = [
"sportId", "abbreviation", "displayName", "iconName",
"colorHex", "seasonStartMonth", "seasonEndMonth", "isActive",
]
LEAGUE_STRUCTURE_FIELDS = [
"structureId", "sport", "type", "name", "abbreviation",
"parentId", "displayOrder",
]
def diff_games(
self,
local_games: list[Game],
remote_records: list[dict],
) -> DiffResult:
"""Diff local games against remote CloudKit records.
Args:
local_games: List of local Game objects
remote_records: List of remote record dictionaries
Returns:
DiffResult with creates, updates, deletes
"""
local_records = [self._game_to_record(g) for g in local_games]
return self._diff_records(
local_records,
remote_records,
RecordType.GAME,
self.GAME_FIELDS,
)
def diff_teams(
self,
local_teams: list[Team],
remote_records: list[dict],
) -> DiffResult:
"""Diff local teams against remote CloudKit records.
Args:
local_teams: List of local Team objects
remote_records: List of remote record dictionaries
Returns:
DiffResult with creates, updates, deletes
"""
local_records = [self._team_to_record(t) for t in local_teams]
return self._diff_records(
local_records,
remote_records,
RecordType.TEAM,
self.TEAM_FIELDS,
)
def diff_stadiums(
self,
local_stadiums: list[Stadium],
remote_records: list[dict],
) -> DiffResult:
"""Diff local stadiums against remote CloudKit records.
Args:
local_stadiums: List of local Stadium objects
remote_records: List of remote record dictionaries
Returns:
DiffResult with creates, updates, deletes
"""
local_records = [self._stadium_to_record(s) for s in local_stadiums]
return self._diff_records(
local_records,
remote_records,
RecordType.STADIUM,
self.STADIUM_FIELDS,
)
def diff_team_aliases(
self,
local_aliases: list[TeamAlias],
remote_records: list[dict],
) -> DiffResult:
"""Diff local team aliases against remote CloudKit records.
Args:
local_aliases: List of local TeamAlias objects
remote_records: List of remote record dictionaries
Returns:
DiffResult with creates, updates, deletes
"""
local_records = [self._team_alias_to_record(a) for a in local_aliases]
return self._diff_records(
local_records,
remote_records,
RecordType.TEAM_ALIAS,
self.TEAM_ALIAS_FIELDS,
)
def diff_stadium_aliases(
self,
local_aliases: list[StadiumAlias],
remote_records: list[dict],
) -> DiffResult:
"""Diff local stadium aliases against remote CloudKit records.
Args:
local_aliases: List of local StadiumAlias objects
remote_records: List of remote record dictionaries
Returns:
DiffResult with creates, updates, deletes
"""
local_records = [self._stadium_alias_to_record(a) for a in local_aliases]
return self._diff_records(
local_records,
remote_records,
RecordType.STADIUM_ALIAS,
self.STADIUM_ALIAS_FIELDS,
)
def diff_sports(
self,
local_sports: list[Sport],
remote_records: list[dict],
) -> DiffResult:
"""Diff local sports against remote CloudKit records.
Args:
local_sports: List of local Sport objects
remote_records: List of remote record dictionaries
Returns:
DiffResult with creates, updates, deletes
"""
local_records = [self._sport_to_record(s) for s in local_sports]
return self._diff_records(
local_records,
remote_records,
RecordType.SPORT,
self.SPORT_FIELDS,
)
def diff_league_structures(
self,
local_structures: list[LeagueStructure],
remote_records: list[dict],
) -> DiffResult:
"""Diff local league structures against remote CloudKit records.
Args:
local_structures: List of local LeagueStructure objects
remote_records: List of remote record dictionaries
Returns:
DiffResult with creates, updates, deletes
"""
local_records = [self._league_structure_to_record(s) for s in local_structures]
return self._diff_records(
local_records,
remote_records,
RecordType.LEAGUE_STRUCTURE,
self.LEAGUE_STRUCTURE_FIELDS,
)
def _diff_records(
self,
local_records: list[CloudKitRecord],
remote_records: list[dict],
record_type: RecordType,
compare_fields: list[str],
) -> DiffResult:
"""Compare local and remote records.
Args:
local_records: List of local CloudKitRecord objects
remote_records: List of remote record dictionaries
record_type: Type of records being compared
compare_fields: List of field names to compare
Returns:
DiffResult with categorized differences
"""
result = DiffResult()
# Index remote records by name
remote_by_name: dict[str, dict] = {}
for record in remote_records:
name = record.get("recordName")
if name:
remote_by_name[name] = record
# Index local records by name
local_by_name: dict[str, CloudKitRecord] = {}
for record in local_records:
local_by_name[record.record_name] = record
# Find creates and updates
for local_record in local_records:
remote = remote_by_name.get(local_record.record_name)
if remote is None:
# New record
result.creates.append(RecordDiff(
record_name=local_record.record_name,
record_type=record_type,
action=DiffAction.CREATE,
local_record=local_record,
))
else:
# Check for changes
changed_fields = self._compare_fields(
local_record.fields,
remote.get("fields", {}),
compare_fields,
)
if changed_fields:
result.updates.append(RecordDiff(
record_name=local_record.record_name,
record_type=record_type,
action=DiffAction.UPDATE,
local_record=local_record,
remote_record=remote,
changed_fields=changed_fields,
record_change_tag=remote.get("recordChangeTag"),
))
else:
result.unchanged.append(RecordDiff(
record_name=local_record.record_name,
record_type=record_type,
action=DiffAction.UNCHANGED,
local_record=local_record,
remote_record=remote,
record_change_tag=remote.get("recordChangeTag"),
))
# Find deletes (remote records not in local)
local_names = set(local_by_name.keys())
for remote_name, remote in remote_by_name.items():
if remote_name not in local_names:
result.deletes.append(RecordDiff(
record_name=remote_name,
record_type=record_type,
action=DiffAction.DELETE,
remote_record=remote,
record_change_tag=remote.get("recordChangeTag"),
))
return result
def _compare_fields(
self,
local_fields: dict[str, Any],
remote_fields: dict[str, dict],
compare_fields: list[str],
) -> list[str]:
"""Compare field values between local and remote.
Args:
local_fields: Local field values
remote_fields: Remote field values (CloudKit format)
compare_fields: Fields to compare
Returns:
List of field names that differ
"""
changed = []
for field_name in compare_fields:
local_value = local_fields.get(field_name)
remote_field = remote_fields.get(field_name, {})
remote_value = remote_field.get("value") if remote_field else None
# Normalize values for comparison
local_normalized = self._normalize_value(local_value)
remote_normalized = self._normalize_remote_value(remote_value, remote_field)
if local_normalized != remote_normalized:
changed.append(field_name)
return changed
def _normalize_value(self, value: Any) -> Any:
"""Normalize a local value for comparison."""
if value is None:
return None
if isinstance(value, datetime):
# Convert to milliseconds since epoch
return int(value.timestamp() * 1000)
if isinstance(value, float):
# Round to 6 decimal places for coordinate comparison
return round(value, 6)
return value
def _normalize_remote_value(self, value: Any, field_data: dict) -> Any:
"""Normalize a remote CloudKit value for comparison."""
if value is None:
return None
field_type = field_data.get("type", "")
if field_type == "TIMESTAMP":
# Already in milliseconds
return value
if field_type == "DOUBLE":
return round(value, 6)
if field_type == "LOCATION":
# Return as tuple for comparison
if isinstance(value, dict):
return (
round(value.get("latitude", 0), 6),
round(value.get("longitude", 0), 6),
)
return value
def _game_to_record(self, game: Game) -> CloudKitRecord:
"""Convert a Game to a CloudKitRecord.
Field names match CKGame keys in CKModels.swift:
- gameId, canonicalId: Unique identifiers
- homeTeamCanonicalId, awayTeamCanonicalId, stadiumCanonicalId: References as strings
- dateTime: Game time as datetime (will be converted to TIMESTAMP)
- sport: Sport code uppercase (e.g., "MLB")
- season: Season string (e.g., "2025-26" or "2026")
- isPlayoff: Boolean as int (1 or 0)
- broadcastInfo: Optional broadcast network string
"""
# Format season as string
sport_lower = game.sport.lower()
if sport_lower in ("nba", "nhl"):
season_str = f"{game.season}-{str(game.season + 1)[-2:]}"
else:
season_str = str(game.season)
return CloudKitRecord(
record_name=game.id,
record_type=RecordType.GAME,
fields={
"gameId": game.id,
"canonicalId": game.id,
"sport": game.sport.upper(),
"season": season_str,
"dateTime": game.game_date,
"homeTeamCanonicalId": game.home_team_id,
"awayTeamCanonicalId": game.away_team_id,
"stadiumCanonicalId": game.stadium_id,
"isPlayoff": False, # Default, can be overridden
"broadcastInfo": None, # Default, can be overridden
},
)
def _team_to_record(self, team: Team) -> CloudKitRecord:
"""Convert a Team to a CloudKitRecord.
Field names match CKTeam keys in CKModels.swift:
- teamId, canonicalId: Unique identifiers
- name, abbreviation, city: Team info
- sport: Sport code uppercase (e.g., "NBA")
- stadiumCanonicalId: Home stadium canonical ID string
- logoURL: URL string for team logo
- primaryColor, secondaryColor: Hex color strings
"""
return CloudKitRecord(
record_name=team.id,
record_type=RecordType.TEAM,
fields={
"teamId": team.id,
"canonicalId": team.id,
"sport": team.sport.upper(),
"city": team.city,
"name": team.name,
"abbreviation": team.abbreviation,
"stadiumCanonicalId": team.stadium_id,
"logoURL": team.logo_url,
"primaryColor": team.primary_color,
"secondaryColor": team.secondary_color,
},
)
def _stadium_to_record(self, stadium: Stadium) -> CloudKitRecord:
"""Convert a Stadium to a CloudKitRecord.
Field names match CKStadium keys in CKModels.swift:
- stadiumId, canonicalId: Unique identifiers
- name, city, state: Location info
- location: CloudKit LOCATION type with latitude/longitude
- capacity: Seating capacity as int
- yearOpened: Year opened as int
- imageURL: URL string for stadium image
- sport: Sport code uppercase (e.g., "MLB")
"""
return CloudKitRecord(
record_name=stadium.id,
record_type=RecordType.STADIUM,
fields={
"stadiumId": stadium.id,
"canonicalId": stadium.id,
"sport": stadium.sport.upper(),
"name": stadium.name,
"city": stadium.city,
"state": stadium.state,
# CloudKit LOCATION type expects dict with latitude/longitude
"location": {
"latitude": stadium.latitude,
"longitude": stadium.longitude,
},
"capacity": stadium.capacity,
"yearOpened": stadium.opened_year,
"imageURL": stadium.image_url,
},
)
def _team_alias_to_record(self, alias: TeamAlias) -> CloudKitRecord:
"""Convert a TeamAlias to a CloudKitRecord.
Field names match CKTeamAlias keys in CKModels.swift:
- aliasId: Unique identifier
- teamCanonicalId: The canonical team this alias resolves to
- aliasType: Type of alias ("abbreviation", "name", "city")
- aliasValue: The alias value to match
- validFrom, validUntil: Optional date bounds
- schemaVersion, lastModified: Versioning fields
"""
return CloudKitRecord(
record_name=alias.id,
record_type=RecordType.TEAM_ALIAS,
fields={
"aliasId": alias.id,
"teamCanonicalId": alias.team_canonical_id,
"aliasType": alias.alias_type.value,
"aliasValue": alias.alias_value,
"validFrom": _date_to_datetime(alias.valid_from),
"validUntil": _date_to_datetime(alias.valid_until),
"schemaVersion": 1,
"lastModified": datetime.utcnow(),
},
)
def _stadium_alias_to_record(self, alias: StadiumAlias) -> CloudKitRecord:
"""Convert a StadiumAlias to a CloudKitRecord.
Field names match CKStadiumAlias keys in CKModels.swift:
- aliasName: The alias name (used as record name/primary key)
- stadiumCanonicalId: The canonical stadium this alias resolves to
- validFrom, validUntil: Optional date bounds
- schemaVersion, lastModified: Versioning fields
"""
# Record name must be unique - combine alias name with stadium ID
# to handle cases like "yankee stadium" mapping to both MLB and MLS stadiums
record_name = f"{alias.alias_name.lower()}|{alias.stadium_canonical_id}"
return CloudKitRecord(
record_name=record_name,
record_type=RecordType.STADIUM_ALIAS,
fields={
"aliasName": alias.alias_name.lower(),
"stadiumCanonicalId": alias.stadium_canonical_id,
"validFrom": _date_to_datetime(alias.valid_from),
"validUntil": _date_to_datetime(alias.valid_until),
"schemaVersion": 1,
"lastModified": datetime.utcnow(),
},
)
def _sport_to_record(self, sport: Sport) -> CloudKitRecord:
"""Convert a Sport to a CloudKitRecord.
Field names match CKSport keys in CKModels.swift:
- sportId: Unique identifier (e.g., 'MLB', 'NBA')
- abbreviation: Sport abbreviation
- displayName: Full display name
- iconName: SF Symbol name
- colorHex: Primary color as hex string
- seasonStartMonth, seasonEndMonth: Season boundary months (1-12)
- isActive: Whether sport is currently supported
- schemaVersion, lastModified: Versioning fields
"""
return CloudKitRecord(
record_name=sport.id,
record_type=RecordType.SPORT,
fields={
"sportId": sport.id,
"abbreviation": sport.abbreviation,
"displayName": sport.display_name,
"iconName": sport.icon_name,
"colorHex": sport.color_hex,
"seasonStartMonth": sport.season_start_month,
"seasonEndMonth": sport.season_end_month,
"isActive": sport.is_active,
"schemaVersion": 1,
"lastModified": datetime.utcnow(),
},
)
def _league_structure_to_record(self, structure: LeagueStructure) -> CloudKitRecord:
"""Convert a LeagueStructure to a CloudKitRecord.
Field names match CKLeagueStructure keys in CKModels.swift:
- structureId: Unique identifier (e.g., 'nba_eastern', 'mlb_al_east')
- sport: Sport code (e.g., 'NBA', 'MLB')
- type: Structure type ('conference', 'division', 'league')
- name: Full name
- abbreviation: Optional abbreviation
- parentId: Parent structure ID (e.g., division's parent is conference)
- displayOrder: Order for display (0-indexed)
- schemaVersion, lastModified: Versioning fields
"""
return CloudKitRecord(
record_name=structure.id,
record_type=RecordType.LEAGUE_STRUCTURE,
fields={
"structureId": structure.id,
"sport": structure.sport.upper(),
"type": structure.structure_type.value,
"name": structure.name,
"abbreviation": structure.abbreviation,
"parentId": structure.parent_id,
"displayOrder": structure.display_order,
"schemaVersion": 1,
"lastModified": datetime.utcnow(),
},
)
def game_to_cloudkit_record(game: Game) -> CloudKitRecord:
"""Convert a Game to a CloudKitRecord.
Convenience function for external use.
"""
differ = RecordDiffer()
return differ._game_to_record(game)
def team_to_cloudkit_record(team: Team) -> CloudKitRecord:
"""Convert a Team to a CloudKitRecord.
Convenience function for external use.
"""
differ = RecordDiffer()
return differ._team_to_record(team)
def stadium_to_cloudkit_record(stadium: Stadium) -> CloudKitRecord:
"""Convert a Stadium to a CloudKitRecord.
Convenience function for external use.
"""
differ = RecordDiffer()
return differ._stadium_to_record(stadium)
def team_alias_to_cloudkit_record(alias: TeamAlias) -> CloudKitRecord:
"""Convert a TeamAlias to a CloudKitRecord.
Convenience function for external use.
"""
differ = RecordDiffer()
return differ._team_alias_to_record(alias)
def stadium_alias_to_cloudkit_record(alias: StadiumAlias) -> CloudKitRecord:
"""Convert a StadiumAlias to a CloudKitRecord.
Convenience function for external use.
"""
differ = RecordDiffer()
return differ._stadium_alias_to_record(alias)
def sport_to_cloudkit_record(sport: Sport) -> CloudKitRecord:
"""Convert a Sport to a CloudKitRecord.
Convenience function for external use.
"""
differ = RecordDiffer()
return differ._sport_to_record(sport)
def league_structure_to_cloudkit_record(structure: LeagueStructure) -> CloudKitRecord:
"""Convert a LeagueStructure to a CloudKitRecord.
Convenience function for external use.
"""
differ = RecordDiffer()
return differ._league_structure_to_record(structure)

View File

@@ -0,0 +1,384 @@
"""Upload state manager for resumable uploads.
This module tracks upload progress to enable resuming interrupted uploads.
State is persisted to JSON files in the .parser_state directory.
"""
import json
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Optional
from ..config import STATE_DIR
@dataclass
class RecordState:
"""State of an individual record upload.
Attributes:
record_name: Canonical record ID
record_type: CloudKit record type
uploaded_at: Timestamp when successfully uploaded
record_change_tag: CloudKit version tag
status: 'pending', 'uploaded', 'failed'
error_message: Error message if failed
retry_count: Number of retry attempts
"""
record_name: str
record_type: str
uploaded_at: Optional[datetime] = None
record_change_tag: Optional[str] = None
status: str = "pending"
error_message: Optional[str] = None
retry_count: int = 0
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization."""
return {
"record_name": self.record_name,
"record_type": self.record_type,
"uploaded_at": self.uploaded_at.isoformat() if self.uploaded_at else None,
"record_change_tag": self.record_change_tag,
"status": self.status,
"error_message": self.error_message,
"retry_count": self.retry_count,
}
@classmethod
def from_dict(cls, data: dict) -> "RecordState":
"""Create RecordState from dictionary."""
uploaded_at = data.get("uploaded_at")
if uploaded_at:
uploaded_at = datetime.fromisoformat(uploaded_at)
return cls(
record_name=data["record_name"],
record_type=data["record_type"],
uploaded_at=uploaded_at,
record_change_tag=data.get("record_change_tag"),
status=data.get("status", "pending"),
error_message=data.get("error_message"),
retry_count=data.get("retry_count", 0),
)
@dataclass
class UploadSession:
"""Tracks the state of an upload session.
Attributes:
sport: Sport code
season: Season start year
environment: CloudKit environment
started_at: When the upload session started
last_updated: When the state was last updated
records: Dictionary of record_name -> RecordState
total_count: Total number of records to upload
"""
sport: str
season: int
environment: str
started_at: datetime = field(default_factory=datetime.utcnow)
last_updated: datetime = field(default_factory=datetime.utcnow)
records: dict[str, RecordState] = field(default_factory=dict)
total_count: int = 0
@property
def uploaded_count(self) -> int:
"""Count of successfully uploaded records."""
return sum(1 for r in self.records.values() if r.status == "uploaded")
@property
def pending_count(self) -> int:
"""Count of pending records."""
return sum(1 for r in self.records.values() if r.status == "pending")
@property
def failed_count(self) -> int:
"""Count of failed records."""
return sum(1 for r in self.records.values() if r.status == "failed")
@property
def is_complete(self) -> bool:
"""Check if all records have been processed."""
return self.pending_count == 0
@property
def progress_percent(self) -> float:
"""Calculate upload progress as percentage."""
if self.total_count == 0:
return 100.0
return (self.uploaded_count / self.total_count) * 100
def get_pending_records(self) -> list[str]:
"""Get list of record names that still need to be uploaded."""
return [
name for name, state in self.records.items()
if state.status == "pending"
]
def get_failed_records(self) -> list[str]:
"""Get list of record names that failed to upload."""
return [
name for name, state in self.records.items()
if state.status == "failed"
]
def get_retryable_records(self, max_retries: int = 3) -> list[str]:
"""Get failed records that can be retried."""
return [
name for name, state in self.records.items()
if state.status == "failed" and state.retry_count < max_retries
]
def mark_uploaded(
self,
record_name: str,
record_change_tag: Optional[str] = None,
) -> None:
"""Mark a record as successfully uploaded."""
if record_name in self.records:
state = self.records[record_name]
state.status = "uploaded"
state.uploaded_at = datetime.utcnow()
state.record_change_tag = record_change_tag
state.error_message = None
self.last_updated = datetime.utcnow()
def mark_failed(self, record_name: str, error_message: str) -> None:
"""Mark a record as failed."""
if record_name in self.records:
state = self.records[record_name]
state.status = "failed"
state.error_message = error_message
state.retry_count += 1
self.last_updated = datetime.utcnow()
def mark_pending(self, record_name: str) -> None:
"""Mark a record as pending (for retry)."""
if record_name in self.records:
state = self.records[record_name]
state.status = "pending"
state.error_message = None
self.last_updated = datetime.utcnow()
def add_record(self, record_name: str, record_type: str) -> None:
"""Add a new record to track."""
if record_name not in self.records:
self.records[record_name] = RecordState(
record_name=record_name,
record_type=record_type,
)
self.total_count = len(self.records)
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization."""
return {
"sport": self.sport,
"season": self.season,
"environment": self.environment,
"started_at": self.started_at.isoformat(),
"last_updated": self.last_updated.isoformat(),
"total_count": self.total_count,
"records": {
name: state.to_dict()
for name, state in self.records.items()
},
}
@classmethod
def from_dict(cls, data: dict) -> "UploadSession":
"""Create UploadSession from dictionary."""
session = cls(
sport=data["sport"],
season=data["season"],
environment=data["environment"],
started_at=datetime.fromisoformat(data["started_at"]),
last_updated=datetime.fromisoformat(data["last_updated"]),
total_count=data.get("total_count", 0),
)
for name, record_data in data.get("records", {}).items():
session.records[name] = RecordState.from_dict(record_data)
return session
class StateManager:
"""Manages upload state persistence.
State files are stored in .parser_state/ with naming convention:
upload_state_{sport}_{season}_{environment}.json
"""
def __init__(self, state_dir: Optional[Path] = None):
"""Initialize the state manager.
Args:
state_dir: Directory for state files (default: .parser_state/)
"""
self.state_dir = state_dir or STATE_DIR
self.state_dir.mkdir(parents=True, exist_ok=True)
def _get_state_file(self, sport: str, season: int, environment: str) -> Path:
"""Get the path to a state file."""
return self.state_dir / f"upload_state_{sport}_{season}_{environment}.json"
def load_session(
self,
sport: str,
season: int,
environment: str,
) -> Optional[UploadSession]:
"""Load an existing upload session.
Args:
sport: Sport code
season: Season start year
environment: CloudKit environment
Returns:
UploadSession if exists, None otherwise
"""
state_file = self._get_state_file(sport, season, environment)
if not state_file.exists():
return None
try:
with open(state_file, "r", encoding="utf-8") as f:
data = json.load(f)
return UploadSession.from_dict(data)
except (json.JSONDecodeError, KeyError) as e:
# Corrupted state file
return None
def save_session(self, session: UploadSession) -> None:
"""Save an upload session to disk.
Args:
session: The session to save
"""
state_file = self._get_state_file(
session.sport,
session.season,
session.environment,
)
session.last_updated = datetime.utcnow()
with open(state_file, "w", encoding="utf-8") as f:
json.dump(session.to_dict(), f, indent=2)
def create_session(
self,
sport: str,
season: int,
environment: str,
record_names: list[tuple[str, str]], # (record_name, record_type)
) -> UploadSession:
"""Create a new upload session.
Args:
sport: Sport code
season: Season start year
environment: CloudKit environment
record_names: List of (record_name, record_type) tuples
Returns:
New UploadSession
"""
session = UploadSession(
sport=sport,
season=season,
environment=environment,
)
for record_name, record_type in record_names:
session.add_record(record_name, record_type)
self.save_session(session)
return session
def delete_session(self, sport: str, season: int, environment: str) -> bool:
"""Delete an upload session state file.
Args:
sport: Sport code
season: Season start year
environment: CloudKit environment
Returns:
True if deleted, False if not found
"""
state_file = self._get_state_file(sport, season, environment)
if state_file.exists():
state_file.unlink()
return True
return False
def list_sessions(self) -> list[dict]:
"""List all upload sessions.
Returns:
List of session summaries
"""
sessions = []
for state_file in self.state_dir.glob("upload_state_*.json"):
try:
with open(state_file, "r", encoding="utf-8") as f:
data = json.load(f)
session = UploadSession.from_dict(data)
sessions.append({
"sport": session.sport,
"season": session.season,
"environment": session.environment,
"started_at": session.started_at.isoformat(),
"last_updated": session.last_updated.isoformat(),
"progress": f"{session.uploaded_count}/{session.total_count}",
"progress_percent": f"{session.progress_percent:.1f}%",
"status": "complete" if session.is_complete else "in_progress",
"failed_count": session.failed_count,
})
except (json.JSONDecodeError, KeyError):
continue
return sessions
def get_session_or_create(
self,
sport: str,
season: int,
environment: str,
record_names: list[tuple[str, str]],
resume: bool = False,
) -> UploadSession:
"""Get existing session or create new one.
Args:
sport: Sport code
season: Season start year
environment: CloudKit environment
record_names: List of (record_name, record_type) tuples
resume: Whether to resume existing session
Returns:
UploadSession (existing or new)
"""
if resume:
existing = self.load_session(sport, season, environment)
if existing:
# Add any new records not in existing session
existing_names = set(existing.records.keys())
for record_name, record_type in record_names:
if record_name not in existing_names:
existing.add_record(record_name, record_type)
return existing
# Create new session (overwrites existing)
return self.create_session(sport, season, environment, record_names)