feat(scripts): rewrite parser as modular Python CLI

Replace monolithic scraping scripts with sportstime_parser package:

- Multi-source scrapers with automatic fallback for 7 sports
- Canonical ID generation for games, teams, and stadiums
- Fuzzy matching with configurable thresholds for name resolution
- CloudKit Web Services uploader with JWT auth, diff-based updates
- Resumable uploads with checkpoint state persistence
- Validation reports with manual review items and suggested matches
- Comprehensive test suite (249 tests)

CLI: sportstime-parser scrape|validate|upload|status|retry|clear

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-10 21:06:12 -06:00
parent 284a10d9e1
commit eeaf900e5a
109 changed files with 18415 additions and 266211 deletions

View File

@@ -0,0 +1,52 @@
"""CloudKit uploaders for sportstime-parser."""
from .cloudkit import (
CloudKitClient,
CloudKitRecord,
CloudKitError,
CloudKitAuthError,
CloudKitRateLimitError,
CloudKitServerError,
RecordType,
OperationResult,
BatchResult,
)
from .state import (
RecordState,
UploadSession,
StateManager,
)
from .diff import (
DiffAction,
RecordDiff,
DiffResult,
RecordDiffer,
game_to_cloudkit_record,
team_to_cloudkit_record,
stadium_to_cloudkit_record,
)
__all__ = [
# CloudKit client
"CloudKitClient",
"CloudKitRecord",
"CloudKitError",
"CloudKitAuthError",
"CloudKitRateLimitError",
"CloudKitServerError",
"RecordType",
"OperationResult",
"BatchResult",
# State manager
"RecordState",
"UploadSession",
"StateManager",
# Differ
"DiffAction",
"RecordDiff",
"DiffResult",
"RecordDiffer",
"game_to_cloudkit_record",
"team_to_cloudkit_record",
"stadium_to_cloudkit_record",
]

View File

@@ -0,0 +1,565 @@
"""CloudKit Web Services client for sportstime-parser.
This module provides a client for uploading data to CloudKit using the
CloudKit Web Services API. It handles JWT authentication, request signing,
and batch operations.
Reference: https://developer.apple.com/documentation/cloudkitwebservices
"""
import base64
import hashlib
import json
import os
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Optional
from enum import Enum
import jwt
import requests
from cryptography.hazmat.primitives import hashes, serialization
from cryptography.hazmat.primitives.asymmetric import ec
from cryptography.hazmat.backends import default_backend
from ..config import (
CLOUDKIT_CONTAINER_ID,
CLOUDKIT_ENVIRONMENT,
CLOUDKIT_BATCH_SIZE,
)
from ..utils.logging import get_logger
class RecordType(str, Enum):
"""CloudKit record types for SportsTime."""
GAME = "Game"
TEAM = "Team"
STADIUM = "Stadium"
TEAM_ALIAS = "TeamAlias"
STADIUM_ALIAS = "StadiumAlias"
@dataclass
class CloudKitRecord:
"""Represents a CloudKit record for upload.
Attributes:
record_name: Unique record identifier (canonical ID)
record_type: CloudKit record type
fields: Dictionary of field name -> field value
record_change_tag: Version tag for conflict detection (None for new records)
"""
record_name: str
record_type: RecordType
fields: dict[str, Any]
record_change_tag: Optional[str] = None
def to_cloudkit_dict(self) -> dict:
"""Convert to CloudKit API format."""
record = {
"recordName": self.record_name,
"recordType": self.record_type.value,
"fields": self._format_fields(),
}
if self.record_change_tag:
record["recordChangeTag"] = self.record_change_tag
return record
def _format_fields(self) -> dict:
"""Format fields for CloudKit API."""
formatted = {}
for key, value in self.fields.items():
if value is None:
continue
formatted[key] = self._format_field_value(value)
return formatted
def _format_field_value(self, value: Any) -> dict:
"""Format a single field value for CloudKit API."""
if isinstance(value, str):
return {"value": value, "type": "STRING"}
elif isinstance(value, int):
return {"value": value, "type": "INT64"}
elif isinstance(value, float):
return {"value": value, "type": "DOUBLE"}
elif isinstance(value, bool):
return {"value": 1 if value else 0, "type": "INT64"}
elif isinstance(value, datetime):
# CloudKit expects milliseconds since epoch
timestamp_ms = int(value.timestamp() * 1000)
return {"value": timestamp_ms, "type": "TIMESTAMP"}
elif isinstance(value, list):
return {"value": value, "type": "STRING_LIST"}
elif isinstance(value, dict) and "latitude" in value and "longitude" in value:
return {
"value": {
"latitude": value["latitude"],
"longitude": value["longitude"],
},
"type": "LOCATION",
}
else:
# Default to string
return {"value": str(value), "type": "STRING"}
@dataclass
class OperationResult:
"""Result of a CloudKit operation."""
record_name: str
success: bool
record_change_tag: Optional[str] = None
error_code: Optional[str] = None
error_message: Optional[str] = None
@dataclass
class BatchResult:
"""Result of a batch CloudKit operation."""
successful: list[OperationResult] = field(default_factory=list)
failed: list[OperationResult] = field(default_factory=list)
@property
def all_succeeded(self) -> bool:
return len(self.failed) == 0
@property
def success_count(self) -> int:
return len(self.successful)
@property
def failure_count(self) -> int:
return len(self.failed)
class CloudKitClient:
"""Client for CloudKit Web Services API.
Handles authentication via server-to-server JWT tokens and provides
methods for CRUD operations on CloudKit records.
Authentication requires:
- Key ID: CloudKit key identifier from Apple Developer Portal
- Private Key: EC private key in PEM format
Environment variables:
- CLOUDKIT_KEY_ID: The key identifier
- CLOUDKIT_PRIVATE_KEY_PATH: Path to the private key file
- CLOUDKIT_PRIVATE_KEY: The private key contents (alternative to path)
"""
BASE_URL = "https://api.apple-cloudkit.com"
TOKEN_EXPIRY_SECONDS = 3600 # 1 hour
def __init__(
self,
container_id: str = CLOUDKIT_CONTAINER_ID,
environment: str = CLOUDKIT_ENVIRONMENT,
key_id: Optional[str] = None,
private_key: Optional[str] = None,
private_key_path: Optional[str] = None,
):
"""Initialize the CloudKit client.
Args:
container_id: CloudKit container identifier
environment: 'development' or 'production'
key_id: CloudKit server-to-server key ID
private_key: PEM-encoded EC private key contents
private_key_path: Path to PEM-encoded EC private key file
"""
self.container_id = container_id
self.environment = environment
self.logger = get_logger()
# Load authentication credentials
self.key_id = key_id or os.environ.get("CLOUDKIT_KEY_ID")
if private_key:
self._private_key_pem = private_key
elif private_key_path:
self._private_key_pem = Path(private_key_path).read_text()
elif os.environ.get("CLOUDKIT_PRIVATE_KEY"):
self._private_key_pem = os.environ["CLOUDKIT_PRIVATE_KEY"]
elif os.environ.get("CLOUDKIT_PRIVATE_KEY_PATH"):
self._private_key_pem = Path(os.environ["CLOUDKIT_PRIVATE_KEY_PATH"]).read_text()
else:
self._private_key_pem = None
# Parse the private key if available
self._private_key = None
if self._private_key_pem:
self._private_key = serialization.load_pem_private_key(
self._private_key_pem.encode(),
password=None,
backend=default_backend(),
)
# Token cache
self._token: Optional[str] = None
self._token_expiry: float = 0
# Session for connection pooling
self._session = requests.Session()
@property
def is_configured(self) -> bool:
"""Check if the client has valid authentication credentials."""
return bool(self.key_id and self._private_key)
def _get_api_path(self, operation: str) -> str:
"""Build the full API path for an operation."""
return f"/database/1/{self.container_id}/{self.environment}/public/{operation}"
def _get_token(self) -> str:
"""Get a valid JWT token, generating a new one if needed."""
if not self.is_configured:
raise ValueError(
"CloudKit client not configured. Set CLOUDKIT_KEY_ID and "
"CLOUDKIT_PRIVATE_KEY_PATH environment variables."
)
now = time.time()
# Return cached token if still valid (with 5 min buffer)
if self._token and (self._token_expiry - now) > 300:
return self._token
# Generate new token
expiry = now + self.TOKEN_EXPIRY_SECONDS
payload = {
"iss": self.key_id,
"iat": int(now),
"exp": int(expiry),
"sub": self.container_id,
}
self._token = jwt.encode(
payload,
self._private_key,
algorithm="ES256",
)
self._token_expiry = expiry
return self._token
def _sign_request(self, method: str, path: str, body: Optional[bytes] = None) -> dict:
"""Generate request headers with authentication.
Args:
method: HTTP method
path: API path
body: Request body bytes
Returns:
Dictionary of headers to include in the request
"""
token = self._get_token()
# CloudKit uses date in ISO format
date_str = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
# Calculate body hash
if body:
body_hash = base64.b64encode(
hashlib.sha256(body).digest()
).decode()
else:
body_hash = base64.b64encode(
hashlib.sha256(b"").digest()
).decode()
# Build the message to sign
# Format: date:body_hash:path
message = f"{date_str}:{body_hash}:{path}"
# Sign the message
signature = self._private_key.sign(
message.encode(),
ec.ECDSA(hashes.SHA256()),
)
signature_b64 = base64.b64encode(signature).decode()
return {
"Authorization": f"Bearer {token}",
"X-Apple-CloudKit-Request-KeyID": self.key_id,
"X-Apple-CloudKit-Request-ISO8601Date": date_str,
"X-Apple-CloudKit-Request-SignatureV1": signature_b64,
"Content-Type": "application/json",
}
def _request(
self,
method: str,
operation: str,
body: Optional[dict] = None,
) -> dict:
"""Make a request to the CloudKit API.
Args:
method: HTTP method
operation: API operation path
body: Request body as dictionary
Returns:
Response data as dictionary
Raises:
CloudKitError: If the request fails
"""
path = self._get_api_path(operation)
url = f"{self.BASE_URL}{path}"
body_bytes = json.dumps(body).encode() if body else None
headers = self._sign_request(method, path, body_bytes)
response = self._session.request(
method=method,
url=url,
headers=headers,
data=body_bytes,
)
if response.status_code == 200:
return response.json()
elif response.status_code == 421:
# Authentication required - token may be expired
self._token = None
raise CloudKitAuthError("Authentication failed - check credentials")
elif response.status_code == 429:
raise CloudKitRateLimitError("Rate limit exceeded")
elif response.status_code >= 500:
raise CloudKitServerError(f"Server error: {response.status_code}")
else:
try:
error_data = response.json()
error_msg = error_data.get("serverErrorCode", str(response.status_code))
except (json.JSONDecodeError, KeyError):
error_msg = response.text
raise CloudKitError(f"Request failed: {error_msg}")
def fetch_records(
self,
record_type: RecordType,
record_names: Optional[list[str]] = None,
limit: int = 200,
) -> list[dict]:
"""Fetch records from CloudKit.
Args:
record_type: Type of records to fetch
record_names: Specific record names to fetch (optional)
limit: Maximum records to return (default 200)
Returns:
List of record dictionaries
"""
if record_names:
# Fetch specific records by name
body = {
"records": [{"recordName": name} for name in record_names],
}
response = self._request("POST", "records/lookup", body)
else:
# Query all records of type
body = {
"query": {
"recordType": record_type.value,
},
"resultsLimit": limit,
}
response = self._request("POST", "records/query", body)
records = response.get("records", [])
return [r for r in records if "recordName" in r]
def fetch_all_records(self, record_type: RecordType) -> list[dict]:
"""Fetch all records of a type using pagination.
Args:
record_type: Type of records to fetch
Returns:
List of all record dictionaries
"""
all_records = []
continuation_marker = None
while True:
body = {
"query": {
"recordType": record_type.value,
},
"resultsLimit": 200,
}
if continuation_marker:
body["continuationMarker"] = continuation_marker
response = self._request("POST", "records/query", body)
records = response.get("records", [])
all_records.extend([r for r in records if "recordName" in r])
continuation_marker = response.get("continuationMarker")
if not continuation_marker:
break
return all_records
def save_records(self, records: list[CloudKitRecord]) -> BatchResult:
"""Save records to CloudKit (create or update).
Args:
records: List of records to save
Returns:
BatchResult with success/failure details
"""
result = BatchResult()
# Process in batches
for i in range(0, len(records), CLOUDKIT_BATCH_SIZE):
batch = records[i:i + CLOUDKIT_BATCH_SIZE]
batch_result = self._save_batch(batch)
result.successful.extend(batch_result.successful)
result.failed.extend(batch_result.failed)
return result
def _save_batch(self, records: list[CloudKitRecord]) -> BatchResult:
"""Save a single batch of records.
Args:
records: List of records (max CLOUDKIT_BATCH_SIZE)
Returns:
BatchResult with success/failure details
"""
result = BatchResult()
operations = []
for record in records:
op = {
"operationType": "forceReplace",
"record": record.to_cloudkit_dict(),
}
operations.append(op)
body = {"operations": operations}
try:
response = self._request("POST", "records/modify", body)
except CloudKitError as e:
# Entire batch failed
for record in records:
result.failed.append(OperationResult(
record_name=record.record_name,
success=False,
error_message=str(e),
))
return result
# Process individual results
for record_data in response.get("records", []):
record_name = record_data.get("recordName", "unknown")
if "serverErrorCode" in record_data:
result.failed.append(OperationResult(
record_name=record_name,
success=False,
error_code=record_data.get("serverErrorCode"),
error_message=record_data.get("reason"),
))
else:
result.successful.append(OperationResult(
record_name=record_name,
success=True,
record_change_tag=record_data.get("recordChangeTag"),
))
return result
def delete_records(
self,
record_type: RecordType,
record_names: list[str],
) -> BatchResult:
"""Delete records from CloudKit.
Args:
record_type: Type of records to delete
record_names: List of record names to delete
Returns:
BatchResult with success/failure details
"""
result = BatchResult()
# Process in batches
for i in range(0, len(record_names), CLOUDKIT_BATCH_SIZE):
batch = record_names[i:i + CLOUDKIT_BATCH_SIZE]
operations = []
for name in batch:
operations.append({
"operationType": "delete",
"record": {
"recordName": name,
"recordType": record_type.value,
},
})
body = {"operations": operations}
try:
response = self._request("POST", "records/modify", body)
except CloudKitError as e:
for name in batch:
result.failed.append(OperationResult(
record_name=name,
success=False,
error_message=str(e),
))
continue
for record_data in response.get("records", []):
record_name = record_data.get("recordName", "unknown")
if "serverErrorCode" in record_data:
result.failed.append(OperationResult(
record_name=record_name,
success=False,
error_code=record_data.get("serverErrorCode"),
error_message=record_data.get("reason"),
))
else:
result.successful.append(OperationResult(
record_name=record_name,
success=True,
))
return result
class CloudKitError(Exception):
"""Base exception for CloudKit errors."""
pass
class CloudKitAuthError(CloudKitError):
"""Authentication error."""
pass
class CloudKitRateLimitError(CloudKitError):
"""Rate limit exceeded."""
pass
class CloudKitServerError(CloudKitError):
"""Server-side error."""
pass

View File

@@ -0,0 +1,425 @@
"""Record differ for CloudKit uploads.
This module compares local records with CloudKit records to determine
what needs to be created, updated, or deleted.
"""
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Optional
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from .cloudkit import CloudKitRecord, RecordType
class DiffAction(str, Enum):
"""Action to take for a record."""
CREATE = "create"
UPDATE = "update"
DELETE = "delete"
UNCHANGED = "unchanged"
@dataclass
class RecordDiff:
"""Represents the difference between local and remote records.
Attributes:
record_name: Canonical record ID
record_type: CloudKit record type
action: Action to take (create, update, delete, unchanged)
local_record: Local CloudKitRecord (None if delete)
remote_record: Remote record dict (None if create)
changed_fields: List of field names that changed (for update)
record_change_tag: Remote record's change tag (for update)
"""
record_name: str
record_type: RecordType
action: DiffAction
local_record: Optional[CloudKitRecord] = None
remote_record: Optional[dict] = None
changed_fields: list[str] = field(default_factory=list)
record_change_tag: Optional[str] = None
@dataclass
class DiffResult:
"""Result of diffing local and remote records.
Attributes:
creates: Records to create
updates: Records to update
deletes: Records to delete (record names)
unchanged: Records with no changes
"""
creates: list[RecordDiff] = field(default_factory=list)
updates: list[RecordDiff] = field(default_factory=list)
deletes: list[RecordDiff] = field(default_factory=list)
unchanged: list[RecordDiff] = field(default_factory=list)
@property
def create_count(self) -> int:
return len(self.creates)
@property
def update_count(self) -> int:
return len(self.updates)
@property
def delete_count(self) -> int:
return len(self.deletes)
@property
def unchanged_count(self) -> int:
return len(self.unchanged)
@property
def total_changes(self) -> int:
return self.create_count + self.update_count + self.delete_count
def get_records_to_upload(self) -> list[CloudKitRecord]:
"""Get all records that need to be uploaded (creates + updates)."""
records = []
for diff in self.creates:
if diff.local_record:
records.append(diff.local_record)
for diff in self.updates:
if diff.local_record:
# Add change tag for update
diff.local_record.record_change_tag = diff.record_change_tag
records.append(diff.local_record)
return records
class RecordDiffer:
"""Compares local records with CloudKit records."""
# Fields to compare for each record type
GAME_FIELDS = [
"sport", "season", "home_team_id", "away_team_id", "stadium_id",
"game_date", "game_number", "home_score", "away_score", "status",
]
TEAM_FIELDS = [
"sport", "city", "name", "full_name", "abbreviation",
"conference", "division", "primary_color", "secondary_color",
"logo_url", "stadium_id",
]
STADIUM_FIELDS = [
"sport", "name", "city", "state", "country",
"latitude", "longitude", "capacity", "surface",
"roof_type", "opened_year", "image_url", "timezone",
]
def diff_games(
self,
local_games: list[Game],
remote_records: list[dict],
) -> DiffResult:
"""Diff local games against remote CloudKit records.
Args:
local_games: List of local Game objects
remote_records: List of remote record dictionaries
Returns:
DiffResult with creates, updates, deletes
"""
local_records = [self._game_to_record(g) for g in local_games]
return self._diff_records(
local_records,
remote_records,
RecordType.GAME,
self.GAME_FIELDS,
)
def diff_teams(
self,
local_teams: list[Team],
remote_records: list[dict],
) -> DiffResult:
"""Diff local teams against remote CloudKit records.
Args:
local_teams: List of local Team objects
remote_records: List of remote record dictionaries
Returns:
DiffResult with creates, updates, deletes
"""
local_records = [self._team_to_record(t) for t in local_teams]
return self._diff_records(
local_records,
remote_records,
RecordType.TEAM,
self.TEAM_FIELDS,
)
def diff_stadiums(
self,
local_stadiums: list[Stadium],
remote_records: list[dict],
) -> DiffResult:
"""Diff local stadiums against remote CloudKit records.
Args:
local_stadiums: List of local Stadium objects
remote_records: List of remote record dictionaries
Returns:
DiffResult with creates, updates, deletes
"""
local_records = [self._stadium_to_record(s) for s in local_stadiums]
return self._diff_records(
local_records,
remote_records,
RecordType.STADIUM,
self.STADIUM_FIELDS,
)
def _diff_records(
self,
local_records: list[CloudKitRecord],
remote_records: list[dict],
record_type: RecordType,
compare_fields: list[str],
) -> DiffResult:
"""Compare local and remote records.
Args:
local_records: List of local CloudKitRecord objects
remote_records: List of remote record dictionaries
record_type: Type of records being compared
compare_fields: List of field names to compare
Returns:
DiffResult with categorized differences
"""
result = DiffResult()
# Index remote records by name
remote_by_name: dict[str, dict] = {}
for record in remote_records:
name = record.get("recordName")
if name:
remote_by_name[name] = record
# Index local records by name
local_by_name: dict[str, CloudKitRecord] = {}
for record in local_records:
local_by_name[record.record_name] = record
# Find creates and updates
for local_record in local_records:
remote = remote_by_name.get(local_record.record_name)
if remote is None:
# New record
result.creates.append(RecordDiff(
record_name=local_record.record_name,
record_type=record_type,
action=DiffAction.CREATE,
local_record=local_record,
))
else:
# Check for changes
changed_fields = self._compare_fields(
local_record.fields,
remote.get("fields", {}),
compare_fields,
)
if changed_fields:
result.updates.append(RecordDiff(
record_name=local_record.record_name,
record_type=record_type,
action=DiffAction.UPDATE,
local_record=local_record,
remote_record=remote,
changed_fields=changed_fields,
record_change_tag=remote.get("recordChangeTag"),
))
else:
result.unchanged.append(RecordDiff(
record_name=local_record.record_name,
record_type=record_type,
action=DiffAction.UNCHANGED,
local_record=local_record,
remote_record=remote,
record_change_tag=remote.get("recordChangeTag"),
))
# Find deletes (remote records not in local)
local_names = set(local_by_name.keys())
for remote_name, remote in remote_by_name.items():
if remote_name not in local_names:
result.deletes.append(RecordDiff(
record_name=remote_name,
record_type=record_type,
action=DiffAction.DELETE,
remote_record=remote,
record_change_tag=remote.get("recordChangeTag"),
))
return result
def _compare_fields(
self,
local_fields: dict[str, Any],
remote_fields: dict[str, dict],
compare_fields: list[str],
) -> list[str]:
"""Compare field values between local and remote.
Args:
local_fields: Local field values
remote_fields: Remote field values (CloudKit format)
compare_fields: Fields to compare
Returns:
List of field names that differ
"""
changed = []
for field_name in compare_fields:
local_value = local_fields.get(field_name)
remote_field = remote_fields.get(field_name, {})
remote_value = remote_field.get("value") if remote_field else None
# Normalize values for comparison
local_normalized = self._normalize_value(local_value)
remote_normalized = self._normalize_remote_value(remote_value, remote_field)
if local_normalized != remote_normalized:
changed.append(field_name)
return changed
def _normalize_value(self, value: Any) -> Any:
"""Normalize a local value for comparison."""
if value is None:
return None
if isinstance(value, datetime):
# Convert to milliseconds since epoch
return int(value.timestamp() * 1000)
if isinstance(value, float):
# Round to 6 decimal places for coordinate comparison
return round(value, 6)
return value
def _normalize_remote_value(self, value: Any, field_data: dict) -> Any:
"""Normalize a remote CloudKit value for comparison."""
if value is None:
return None
field_type = field_data.get("type", "")
if field_type == "TIMESTAMP":
# Already in milliseconds
return value
if field_type == "DOUBLE":
return round(value, 6)
if field_type == "LOCATION":
# Return as tuple for comparison
if isinstance(value, dict):
return (
round(value.get("latitude", 0), 6),
round(value.get("longitude", 0), 6),
)
return value
def _game_to_record(self, game: Game) -> CloudKitRecord:
"""Convert a Game to a CloudKitRecord."""
return CloudKitRecord(
record_name=game.id,
record_type=RecordType.GAME,
fields={
"sport": game.sport,
"season": game.season,
"home_team_id": game.home_team_id,
"away_team_id": game.away_team_id,
"stadium_id": game.stadium_id,
"game_date": game.game_date,
"game_number": game.game_number,
"home_score": game.home_score,
"away_score": game.away_score,
"status": game.status,
},
)
def _team_to_record(self, team: Team) -> CloudKitRecord:
"""Convert a Team to a CloudKitRecord."""
return CloudKitRecord(
record_name=team.id,
record_type=RecordType.TEAM,
fields={
"sport": team.sport,
"city": team.city,
"name": team.name,
"full_name": team.full_name,
"abbreviation": team.abbreviation,
"conference": team.conference,
"division": team.division,
"primary_color": team.primary_color,
"secondary_color": team.secondary_color,
"logo_url": team.logo_url,
"stadium_id": team.stadium_id,
},
)
def _stadium_to_record(self, stadium: Stadium) -> CloudKitRecord:
"""Convert a Stadium to a CloudKitRecord."""
return CloudKitRecord(
record_name=stadium.id,
record_type=RecordType.STADIUM,
fields={
"sport": stadium.sport,
"name": stadium.name,
"city": stadium.city,
"state": stadium.state,
"country": stadium.country,
"latitude": stadium.latitude,
"longitude": stadium.longitude,
"capacity": stadium.capacity,
"surface": stadium.surface,
"roof_type": stadium.roof_type,
"opened_year": stadium.opened_year,
"image_url": stadium.image_url,
"timezone": stadium.timezone,
},
)
def game_to_cloudkit_record(game: Game) -> CloudKitRecord:
"""Convert a Game to a CloudKitRecord.
Convenience function for external use.
"""
differ = RecordDiffer()
return differ._game_to_record(game)
def team_to_cloudkit_record(team: Team) -> CloudKitRecord:
"""Convert a Team to a CloudKitRecord.
Convenience function for external use.
"""
differ = RecordDiffer()
return differ._team_to_record(team)
def stadium_to_cloudkit_record(stadium: Stadium) -> CloudKitRecord:
"""Convert a Stadium to a CloudKitRecord.
Convenience function for external use.
"""
differ = RecordDiffer()
return differ._stadium_to_record(stadium)

View File

@@ -0,0 +1,384 @@
"""Upload state manager for resumable uploads.
This module tracks upload progress to enable resuming interrupted uploads.
State is persisted to JSON files in the .parser_state directory.
"""
import json
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Optional
from ..config import STATE_DIR
@dataclass
class RecordState:
"""State of an individual record upload.
Attributes:
record_name: Canonical record ID
record_type: CloudKit record type
uploaded_at: Timestamp when successfully uploaded
record_change_tag: CloudKit version tag
status: 'pending', 'uploaded', 'failed'
error_message: Error message if failed
retry_count: Number of retry attempts
"""
record_name: str
record_type: str
uploaded_at: Optional[datetime] = None
record_change_tag: Optional[str] = None
status: str = "pending"
error_message: Optional[str] = None
retry_count: int = 0
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization."""
return {
"record_name": self.record_name,
"record_type": self.record_type,
"uploaded_at": self.uploaded_at.isoformat() if self.uploaded_at else None,
"record_change_tag": self.record_change_tag,
"status": self.status,
"error_message": self.error_message,
"retry_count": self.retry_count,
}
@classmethod
def from_dict(cls, data: dict) -> "RecordState":
"""Create RecordState from dictionary."""
uploaded_at = data.get("uploaded_at")
if uploaded_at:
uploaded_at = datetime.fromisoformat(uploaded_at)
return cls(
record_name=data["record_name"],
record_type=data["record_type"],
uploaded_at=uploaded_at,
record_change_tag=data.get("record_change_tag"),
status=data.get("status", "pending"),
error_message=data.get("error_message"),
retry_count=data.get("retry_count", 0),
)
@dataclass
class UploadSession:
"""Tracks the state of an upload session.
Attributes:
sport: Sport code
season: Season start year
environment: CloudKit environment
started_at: When the upload session started
last_updated: When the state was last updated
records: Dictionary of record_name -> RecordState
total_count: Total number of records to upload
"""
sport: str
season: int
environment: str
started_at: datetime = field(default_factory=datetime.utcnow)
last_updated: datetime = field(default_factory=datetime.utcnow)
records: dict[str, RecordState] = field(default_factory=dict)
total_count: int = 0
@property
def uploaded_count(self) -> int:
"""Count of successfully uploaded records."""
return sum(1 for r in self.records.values() if r.status == "uploaded")
@property
def pending_count(self) -> int:
"""Count of pending records."""
return sum(1 for r in self.records.values() if r.status == "pending")
@property
def failed_count(self) -> int:
"""Count of failed records."""
return sum(1 for r in self.records.values() if r.status == "failed")
@property
def is_complete(self) -> bool:
"""Check if all records have been processed."""
return self.pending_count == 0
@property
def progress_percent(self) -> float:
"""Calculate upload progress as percentage."""
if self.total_count == 0:
return 100.0
return (self.uploaded_count / self.total_count) * 100
def get_pending_records(self) -> list[str]:
"""Get list of record names that still need to be uploaded."""
return [
name for name, state in self.records.items()
if state.status == "pending"
]
def get_failed_records(self) -> list[str]:
"""Get list of record names that failed to upload."""
return [
name for name, state in self.records.items()
if state.status == "failed"
]
def get_retryable_records(self, max_retries: int = 3) -> list[str]:
"""Get failed records that can be retried."""
return [
name for name, state in self.records.items()
if state.status == "failed" and state.retry_count < max_retries
]
def mark_uploaded(
self,
record_name: str,
record_change_tag: Optional[str] = None,
) -> None:
"""Mark a record as successfully uploaded."""
if record_name in self.records:
state = self.records[record_name]
state.status = "uploaded"
state.uploaded_at = datetime.utcnow()
state.record_change_tag = record_change_tag
state.error_message = None
self.last_updated = datetime.utcnow()
def mark_failed(self, record_name: str, error_message: str) -> None:
"""Mark a record as failed."""
if record_name in self.records:
state = self.records[record_name]
state.status = "failed"
state.error_message = error_message
state.retry_count += 1
self.last_updated = datetime.utcnow()
def mark_pending(self, record_name: str) -> None:
"""Mark a record as pending (for retry)."""
if record_name in self.records:
state = self.records[record_name]
state.status = "pending"
state.error_message = None
self.last_updated = datetime.utcnow()
def add_record(self, record_name: str, record_type: str) -> None:
"""Add a new record to track."""
if record_name not in self.records:
self.records[record_name] = RecordState(
record_name=record_name,
record_type=record_type,
)
self.total_count = len(self.records)
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization."""
return {
"sport": self.sport,
"season": self.season,
"environment": self.environment,
"started_at": self.started_at.isoformat(),
"last_updated": self.last_updated.isoformat(),
"total_count": self.total_count,
"records": {
name: state.to_dict()
for name, state in self.records.items()
},
}
@classmethod
def from_dict(cls, data: dict) -> "UploadSession":
"""Create UploadSession from dictionary."""
session = cls(
sport=data["sport"],
season=data["season"],
environment=data["environment"],
started_at=datetime.fromisoformat(data["started_at"]),
last_updated=datetime.fromisoformat(data["last_updated"]),
total_count=data.get("total_count", 0),
)
for name, record_data in data.get("records", {}).items():
session.records[name] = RecordState.from_dict(record_data)
return session
class StateManager:
"""Manages upload state persistence.
State files are stored in .parser_state/ with naming convention:
upload_state_{sport}_{season}_{environment}.json
"""
def __init__(self, state_dir: Optional[Path] = None):
"""Initialize the state manager.
Args:
state_dir: Directory for state files (default: .parser_state/)
"""
self.state_dir = state_dir or STATE_DIR
self.state_dir.mkdir(parents=True, exist_ok=True)
def _get_state_file(self, sport: str, season: int, environment: str) -> Path:
"""Get the path to a state file."""
return self.state_dir / f"upload_state_{sport}_{season}_{environment}.json"
def load_session(
self,
sport: str,
season: int,
environment: str,
) -> Optional[UploadSession]:
"""Load an existing upload session.
Args:
sport: Sport code
season: Season start year
environment: CloudKit environment
Returns:
UploadSession if exists, None otherwise
"""
state_file = self._get_state_file(sport, season, environment)
if not state_file.exists():
return None
try:
with open(state_file, "r", encoding="utf-8") as f:
data = json.load(f)
return UploadSession.from_dict(data)
except (json.JSONDecodeError, KeyError) as e:
# Corrupted state file
return None
def save_session(self, session: UploadSession) -> None:
"""Save an upload session to disk.
Args:
session: The session to save
"""
state_file = self._get_state_file(
session.sport,
session.season,
session.environment,
)
session.last_updated = datetime.utcnow()
with open(state_file, "w", encoding="utf-8") as f:
json.dump(session.to_dict(), f, indent=2)
def create_session(
self,
sport: str,
season: int,
environment: str,
record_names: list[tuple[str, str]], # (record_name, record_type)
) -> UploadSession:
"""Create a new upload session.
Args:
sport: Sport code
season: Season start year
environment: CloudKit environment
record_names: List of (record_name, record_type) tuples
Returns:
New UploadSession
"""
session = UploadSession(
sport=sport,
season=season,
environment=environment,
)
for record_name, record_type in record_names:
session.add_record(record_name, record_type)
self.save_session(session)
return session
def delete_session(self, sport: str, season: int, environment: str) -> bool:
"""Delete an upload session state file.
Args:
sport: Sport code
season: Season start year
environment: CloudKit environment
Returns:
True if deleted, False if not found
"""
state_file = self._get_state_file(sport, season, environment)
if state_file.exists():
state_file.unlink()
return True
return False
def list_sessions(self) -> list[dict]:
"""List all upload sessions.
Returns:
List of session summaries
"""
sessions = []
for state_file in self.state_dir.glob("upload_state_*.json"):
try:
with open(state_file, "r", encoding="utf-8") as f:
data = json.load(f)
session = UploadSession.from_dict(data)
sessions.append({
"sport": session.sport,
"season": session.season,
"environment": session.environment,
"started_at": session.started_at.isoformat(),
"last_updated": session.last_updated.isoformat(),
"progress": f"{session.uploaded_count}/{session.total_count}",
"progress_percent": f"{session.progress_percent:.1f}%",
"status": "complete" if session.is_complete else "in_progress",
"failed_count": session.failed_count,
})
except (json.JSONDecodeError, KeyError):
continue
return sessions
def get_session_or_create(
self,
sport: str,
season: int,
environment: str,
record_names: list[tuple[str, str]],
resume: bool = False,
) -> UploadSession:
"""Get existing session or create new one.
Args:
sport: Sport code
season: Season start year
environment: CloudKit environment
record_names: List of (record_name, record_type) tuples
resume: Whether to resume existing session
Returns:
UploadSession (existing or new)
"""
if resume:
existing = self.load_session(sport, season, environment)
if existing:
# Add any new records not in existing session
existing_names = set(existing.records.keys())
for record_name, record_type in record_names:
if record_name not in existing_names:
existing.add_record(record_name, record_type)
return existing
# Create new session (overwrites existing)
return self.create_session(sport, season, environment, record_names)