Files
Trey t 52d445bca4 feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:56:25 -06:00

247 lines
7.8 KiB
Python

"""JSON Schema validation for canonical output matching iOS app expectations.
This module defines schemas that match the Swift structs in BootstrapService.swift:
- JSONCanonicalStadium
- JSONCanonicalTeam
- JSONCanonicalGame
Validation is performed at runtime before outputting JSON to ensure
Python output matches what the iOS app expects.
"""
import re
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union
class SchemaValidationError(Exception):
    """Signals that a canonical record did not conform to its schema.

    Attributes:
        model_type: The model name the caller supplied (used in the message).
        errors: The individual validation messages, one per problem found.
    """

    def __init__(self, model_type: str, errors: list[str]):
        self.model_type = model_type
        self.errors = errors
        # Message layout: a header line followed by one bullet per error.
        header = f"{model_type} schema validation failed:\n"
        bullets = [f" - {problem}" for problem in errors]
        super().__init__(header + "\n".join(bullets))
# NOTE: all three patterns are anchored with \Z rather than $, because $ also
# matches just before a trailing newline (so "2025\n" would otherwise pass).
# re.ASCII restricts \d to 0-9; without it \d accepts any Unicode decimal digit.

# ISO8601 UTC datetime pattern: YYYY-MM-DDTHH:MM:SSZ
ISO8601_UTC_PATTERN = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z\Z", re.ASCII)

# Season format patterns
SEASON_SPLIT_PATTERN = re.compile(r"^\d{4}-\d{2}\Z", re.ASCII)  # e.g., "2025-26"
SEASON_SINGLE_PATTERN = re.compile(r"^\d{4}\Z", re.ASCII)  # e.g., "2025"
@dataclass
class FieldSpec:
    """Specification for a field in the canonical schema.

    Instances are consumed by validate_field(), which checks presence first,
    then the value's type, then the optional custom validator.
    """

    # Key looked up in the dictionary being validated.
    name: str
    # When True, a missing key is reported as an error; when False, a missing
    # key is accepted silently.
    required: bool
    # A type, or a tuple of types, that the value must be an instance of.
    # Nullable fields in this file include type(None) in the tuple.
    field_type: Union[type, tuple]
    # Optional extra check, called with the value once the type check has
    # passed and only when the value is not None; a falsy result is an error.
    validator: Optional[Callable] = None
# Schema definitions matching Swift structs in BootstrapService.swift
#
# Stadium fields (JSONCanonicalStadium). Each row of the table below is
# (field name, required?, accepted type or tuple of accepted types).
STADIUM_SCHEMA: list[FieldSpec] = [
    FieldSpec(field_name, required=is_required, field_type=accepted)
    for field_name, is_required, accepted in (
        ("canonical_id", True, str),
        ("name", True, str),
        ("city", True, str),
        ("state", True, str),
        ("latitude", True, (int, float)),
        ("longitude", True, (int, float)),
        ("capacity", True, int),
        ("sport", True, str),
        ("primary_team_abbrevs", True, list),
        ("year_opened", False, (int, type(None))),
    )
]
# Team fields (JSONCanonicalTeam): six required strings followed by four
# optional, nullable strings. Order is preserved: required first.
TEAM_SCHEMA: list[FieldSpec] = [
    *(
        FieldSpec(required_name, required=True, field_type=str)
        for required_name in (
            "canonical_id",
            "name",
            "abbreviation",
            "sport",
            "city",
            "stadium_canonical_id",
        )
    ),
    *(
        FieldSpec(optional_name, required=False, field_type=(str, type(None)))
        for optional_name in (
            "conference_id",
            "division_id",
            "primary_color",
            "secondary_color",
        )
    ),
]
def _season_format_ok(value):
    """Return a regex match if *value* is a split or single-year season, else None."""
    return SEASON_SPLIT_PATTERN.match(value) or SEASON_SINGLE_PATTERN.match(value)


def _utc_timestamp_ok(value):
    """Return a regex match if *value* fits ISO8601_UTC_PATTERN, else None."""
    return ISO8601_UTC_PATTERN.match(value)


# Game fields (JSONCanonicalGame). "season" and "game_datetime_utc" carry a
# format validator in addition to the plain type check.
GAME_SCHEMA: list[FieldSpec] = [
    FieldSpec(name="canonical_id", required=True, field_type=str),
    FieldSpec(name="sport", required=True, field_type=str),
    FieldSpec(name="season", required=True, field_type=str, validator=_season_format_ok),
    FieldSpec(
        name="game_datetime_utc",
        required=True,
        field_type=str,
        validator=_utc_timestamp_ok,
    ),
    FieldSpec(name="home_team_canonical_id", required=True, field_type=str),
    FieldSpec(name="away_team_canonical_id", required=True, field_type=str),
    FieldSpec(name="stadium_canonical_id", required=True, field_type=str),
    FieldSpec(name="is_playoff", required=True, field_type=bool),
    FieldSpec(name="broadcast", required=False, field_type=(str, type(None))),
]
def validate_field(data: dict[str, Any], spec: "FieldSpec") -> list[str]:
    """Validate a single field against its specification.

    Checks run in order and stop at the first failure: presence, then type,
    then the spec's custom validator (skipped when the value is None).

    Booleans are never accepted in place of integers. Python's ``bool`` is a
    subclass of ``int``, so a plain ``isinstance`` check would let
    ``True``/``False`` through for numeric fields; they are only accepted when
    the spec lists ``bool`` (or another non-``int`` base of it) explicitly.

    Args:
        data: The dictionary to validate
        spec: The field specification

    Returns:
        List of error messages (empty if valid)
    """
    if spec.name not in data:
        # A missing optional field is fine; a missing required one is not.
        if spec.required:
            return [f"Missing required field: {spec.name}"]
        return []

    value = data[spec.name]

    # Normalise to a tuple so the bool filtering below handles both forms.
    if isinstance(spec.field_type, tuple):
        accepted = spec.field_type
    else:
        accepted = (spec.field_type,)
    if isinstance(value, bool):
        # Stop True/False from satisfying an ``int`` entry via subclassing.
        # NOTE(review): presumably the Swift decoder would not read JSON
        # true/false as a number either - confirm against BootstrapService.swift.
        accepted = tuple(t for t in accepted if t is not int)

    # isinstance() with an empty tuple is simply False, which is what we want
    # when ``int`` was the only accepted type and the value is a bool.
    if not isinstance(value, accepted):
        expected = spec.field_type.__name__ if isinstance(spec.field_type, type) else str(spec.field_type)
        actual = type(value).__name__
        return [f"Field '{spec.name}' has wrong type: expected {expected}, got {actual} (value: {value!r})"]

    # Custom validators only ever see non-None values of an accepted type.
    if spec.validator and value is not None and not spec.validator(value):
        return [f"Field '{spec.name}' failed validation: {value!r}"]

    return []
def validate_canonical_stadium(data: dict[str, Any]) -> list[str]:
    """Check a canonical stadium dictionary against STADIUM_SCHEMA.

    Beyond the per-field checks, every entry of ``primary_team_abbrevs`` must
    be a string when that field is present and is a list.

    Args:
        data: Stadium dictionary from to_canonical_dict()

    Returns:
        List of error messages (empty if valid)
    """
    problems = [
        message
        for spec in STADIUM_SCHEMA
        for message in validate_field(data, spec)
    ]

    # A missing or non-list value has already been reported (or accepted) by
    # the schema pass above, so only inspect elements of an actual list.
    abbrevs = data.get("primary_team_abbrevs")
    if isinstance(abbrevs, list):
        problems.extend(
            f"primary_team_abbrevs[{position}] must be string, got {type(entry).__name__}"
            for position, entry in enumerate(abbrevs)
            if not isinstance(entry, str)
        )

    return problems
def validate_canonical_team(data: dict[str, Any]) -> list[str]:
    """Check a canonical team dictionary against TEAM_SCHEMA.

    Args:
        data: Team dictionary from to_canonical_dict()

    Returns:
        List of error messages (empty if valid)
    """
    # Flatten the per-field error lists, preserving schema order.
    return [
        message
        for spec in TEAM_SCHEMA
        for message in validate_field(data, spec)
    ]
def validate_canonical_game(data: dict[str, Any]) -> list[str]:
    """Check a canonical game dictionary against GAME_SCHEMA.

    Args:
        data: Game dictionary from to_canonical_dict()

    Returns:
        List of error messages (empty if valid)
    """
    problems: list[str] = []
    for field_spec in GAME_SCHEMA:
        # validate_field returns a (possibly empty) list per field.
        problems += validate_field(data, field_spec)
    return problems
def validate_and_raise(data: dict[str, Any], model_type: str) -> None:
    """Validate one canonical dictionary, raising if anything is wrong.

    Args:
        data: Dictionary from to_canonical_dict()
        model_type: One of 'stadium', 'team', 'game'

    Raises:
        SchemaValidationError: If validation fails
        ValueError: If model_type is unknown
    """
    dispatch = {
        "stadium": validate_canonical_stadium,
        "team": validate_canonical_team,
        "game": validate_canonical_game,
    }

    check = dispatch.get(model_type)
    if check is None:
        raise ValueError(f"Unknown model type: {model_type}")

    problems = check(data)
    if problems:
        raise SchemaValidationError(model_type, problems)
def validate_batch(
    items: list[dict[str, Any]],
    model_type: str,
    fail_fast: bool = True,
) -> list[tuple[int, list[str]]]:
    """Validate several canonical dictionaries of the same model type.

    Args:
        items: List of dictionaries from to_canonical_dict()
        model_type: One of 'stadium', 'team', 'game'
        fail_fast: If True, raise on first error; if False, collect all errors

    Returns:
        List of (index, errors) tuples for items with validation errors

    Raises:
        SchemaValidationError: If fail_fast=True and validation fails
        ValueError: If model_type is unknown
    """
    dispatch = {
        "stadium": validate_canonical_stadium,
        "team": validate_canonical_team,
        "game": validate_canonical_game,
    }

    check = dispatch.get(model_type)
    if check is None:
        raise ValueError(f"Unknown model type: {model_type}")

    failures = []
    for position, record in enumerate(items):
        problems = check(record)
        if not problems:
            continue
        if fail_fast:
            # Prefix each message with the item's index so the caller can
            # locate the offending record in the batch.
            raise SchemaValidationError(
                model_type,
                [f"Item {position}: {problem}" for problem in problems],
            )
        failures.append((position, problems))

    return failures