Files
Trey t 52d445bca4 feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan are complete:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:56:25 -06:00

473 lines
15 KiB
Python

"""Tests for the upload state manager."""
import json
import pytest
from datetime import datetime, timedelta
from pathlib import Path
from tempfile import TemporaryDirectory
from sportstime_parser.uploaders.state import (
RecordState,
UploadSession,
StateManager,
)
class TestRecordState:
    """Tests for the RecordState dataclass: defaults and dict round-tripping."""

    def test_create_record_state(self):
        """A RecordState built from only name and type uses the default fields."""
        state = RecordState(
            record_name="nba_2025_hou_okc_1021",
            record_type="Game",
        )

        assert state.record_name == "nba_2025_hou_okc_1021"
        assert state.record_type == "Game"
        assert state.status == "pending"
        assert state.uploaded_at is None
        assert state.record_change_tag is None
        assert state.error_message is None
        assert state.retry_count == 0

    def test_record_state_to_dict(self):
        """to_dict() exposes the supplied fields, with uploaded_at as ISO text."""
        # A fixed timestamp keeps the test deterministic and avoids
        # datetime.utcnow(), which is deprecated since Python 3.12. The
        # non-zero microsecond part keeps the sub-second portion of the ISO
        # string under test, just as a wall-clock value would.
        uploaded_at = datetime(2026, 1, 10, 12, 0, 0, 123456)
        state = RecordState(
            record_name="nba_2025_hou_okc_1021",
            record_type="Game",
            uploaded_at=uploaded_at,
            record_change_tag="abc123",
            status="uploaded",
        )

        data = state.to_dict()

        assert data["record_name"] == "nba_2025_hou_okc_1021"
        assert data["record_type"] == "Game"
        assert data["status"] == "uploaded"
        assert data["uploaded_at"] == uploaded_at.isoformat()
        assert data["record_change_tag"] == "abc123"

    def test_record_state_from_dict(self):
        """from_dict() rebuilds a RecordState, parsing uploaded_at from ISO text."""
        data = {
            "record_name": "nba_2025_hou_okc_1021",
            "record_type": "Game",
            "uploaded_at": "2026-01-10T12:00:00",
            "record_change_tag": "abc123",
            "status": "uploaded",
            "error_message": None,
            "retry_count": 0,
        }

        state = RecordState.from_dict(data)

        assert state.record_name == "nba_2025_hou_okc_1021"
        assert state.record_type == "Game"
        assert state.status == "uploaded"
        assert state.uploaded_at == datetime.fromisoformat("2026-01-10T12:00:00")
        assert state.record_change_tag == "abc123"
        # The payload also carries the failure bookkeeping fields; check them
        # so every key passed in is covered by an assertion.
        assert state.error_message is None
        assert state.retry_count == 0
class TestUploadSession:
    """Behavioural tests for the UploadSession dataclass."""

    @staticmethod
    def _nba_dev_session():
        """Return the empty NBA 2025 development session each test starts from."""
        return UploadSession(
            sport="nba",
            season=2025,
            environment="development",
        )

    def test_create_upload_session(self):
        """A new session keeps its identity fields and starts with no records."""
        upload = self._nba_dev_session()

        assert upload.sport == "nba"
        assert upload.season == 2025
        assert upload.environment == "development"
        assert upload.total_count == 0
        assert len(upload.records) == 0

    def test_add_record(self):
        """add_record() registers each record under its name with its type."""
        upload = self._nba_dev_session()

        entries = (("game_1", "Game"), ("game_2", "Game"), ("team_1", "Team"))
        for name, kind in entries:
            upload.add_record(name, kind)

        assert upload.total_count == 3
        assert len(upload.records) == 3
        assert "game_1" in upload.records
        assert upload.records["game_1"].record_type == "Game"

    def test_mark_uploaded(self):
        """mark_uploaded() sets the status, change tag and upload timestamp."""
        upload = self._nba_dev_session()
        upload.add_record("game_1", "Game")

        upload.mark_uploaded("game_1", "change_tag_123")

        record = upload.records["game_1"]
        assert record.status == "uploaded"
        assert record.record_change_tag == "change_tag_123"
        assert record.uploaded_at is not None

    def test_mark_failed(self):
        """mark_failed() stores the error message and counts the first failure."""
        upload = self._nba_dev_session()
        upload.add_record("game_1", "Game")

        upload.mark_failed("game_1", "Server error")

        record = upload.records["game_1"]
        assert record.status == "failed"
        assert record.error_message == "Server error"
        assert record.retry_count == 1

    def test_mark_failed_increments_retry_count(self):
        """Each mark_failed() call on the same record adds one to retry_count."""
        upload = self._nba_dev_session()
        upload.add_record("game_1", "Game")

        for message in ("Error 1", "Error 2", "Error 3"):
            upload.mark_failed("game_1", message)

        assert upload.records["game_1"].retry_count == 3

    def test_counts(self):
        """Uploaded, failed and pending counts each reflect one record here."""
        upload = self._nba_dev_session()
        for name in ("game_1", "game_2", "game_3"):
            upload.add_record(name, "Game")

        upload.mark_uploaded("game_1")
        upload.mark_failed("game_2", "Error")

        assert upload.uploaded_count == 1
        assert upload.failed_count == 1
        assert upload.pending_count == 1

    def test_is_complete(self):
        """is_complete only becomes true once every record has been uploaded."""
        upload = self._nba_dev_session()
        for name in ("game_1", "game_2"):
            upload.add_record(name, "Game")

        assert not upload.is_complete

        upload.mark_uploaded("game_1")
        assert not upload.is_complete

        upload.mark_uploaded("game_2")
        assert upload.is_complete

    def test_progress_percent(self):
        """One uploaded record out of four reports 25 percent progress."""
        upload = self._nba_dev_session()
        for name in ("game_1", "game_2", "game_3", "game_4"):
            upload.add_record(name, "Game")

        upload.mark_uploaded("game_1")

        assert upload.progress_percent == 25.0

    def test_get_pending_records(self):
        """get_pending_records() omits records that were uploaded or failed."""
        upload = self._nba_dev_session()
        for name in ("game_1", "game_2", "game_3"):
            upload.add_record(name, "Game")
        upload.mark_uploaded("game_1")
        upload.mark_failed("game_2", "Error")

        still_pending = upload.get_pending_records()

        assert still_pending == ["game_3"]

    def test_get_failed_records(self):
        """get_failed_records() returns exactly the names marked as failed."""
        upload = self._nba_dev_session()
        for name in ("game_1", "game_2", "game_3"):
            upload.add_record(name, "Game")
        upload.mark_failed("game_1", "Error 1")
        upload.mark_failed("game_3", "Error 3")

        failures = upload.get_failed_records()

        # Compared as a set, so the order of the returned names is not checked.
        assert set(failures) == {"game_1", "game_3"}

    def test_get_retryable_records(self):
        """Failed records that reached max_retries are not offered for retry."""
        upload = self._nba_dev_session()
        for name in ("game_1", "game_2", "game_3"):
            upload.add_record(name, "Game")

        # game_1 fails once, so it stays below the retry limit.
        upload.mark_failed("game_1", "Error")
        # game_2 fails three times, which reaches the limit passed below.
        for _ in range(3):
            upload.mark_failed("game_2", "Error")

        can_retry = upload.get_retryable_records(max_retries=3)

        assert can_retry == ["game_1"]

    def test_to_dict_and_from_dict(self):
        """A session survives a to_dict() / from_dict() round trip."""
        upload = self._nba_dev_session()
        for name in ("game_1", "game_2"):
            upload.add_record(name, "Game")
        upload.mark_uploaded("game_1", "tag_123")

        rebuilt = UploadSession.from_dict(upload.to_dict())

        assert rebuilt.sport == upload.sport
        assert rebuilt.season == upload.season
        assert rebuilt.environment == upload.environment
        assert rebuilt.total_count == upload.total_count
        assert rebuilt.uploaded_count == upload.uploaded_count
        assert rebuilt.records["game_1"].status == "uploaded"
class TestStateManager:
    """Tests for StateManager, each run against a throwaway state directory."""

    # Identity of the session most tests work with, in keyword and positional form.
    _NBA_DEV = {"sport": "nba", "season": 2025, "environment": "development"}
    _NBA_DEV_KEY = ("nba", 2025, "development")

    def test_create_session(self):
        """create_session() returns the populated session and writes its file."""
        with TemporaryDirectory() as tmpdir:
            state_dir = Path(tmpdir)
            manager = StateManager(state_dir=state_dir)

            created = manager.create_session(
                record_names=[
                    ("game_1", "Game"),
                    ("game_2", "Game"),
                    ("team_1", "Team"),
                ],
                **self._NBA_DEV,
            )

            assert created.sport == "nba"
            assert created.season == 2025
            assert created.total_count == 3

            # The state file is expected inside the directory given to the manager.
            expected_file = state_dir / "upload_state_nba_2025_development.json"
            assert expected_file.exists()

    def test_load_session(self):
        """A saved session can be loaded back with its progress intact."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))

            # Persist a session that already has one uploaded record.
            saved = manager.create_session(
                record_names=[("game_1", "Game")],
                **self._NBA_DEV,
            )
            saved.mark_uploaded("game_1", "tag_123")
            manager.save_session(saved)

            reloaded = manager.load_session(*self._NBA_DEV_KEY)

            assert reloaded is not None
            assert reloaded.sport == "nba"
            assert reloaded.records["game_1"].status == "uploaded"

    def test_load_nonexistent_session(self):
        """load_session() returns None when nothing was saved."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))

            missing = manager.load_session(*self._NBA_DEV_KEY)

            assert missing is None

    def test_delete_session(self):
        """delete_session() returns True and the session can no longer be loaded."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))
            manager.create_session(
                record_names=[("game_1", "Game")],
                **self._NBA_DEV,
            )

            deleted = manager.delete_session(*self._NBA_DEV_KEY)
            assert deleted is True

            # After deletion a load attempt must come back empty.
            after_delete = manager.load_session(*self._NBA_DEV_KEY)
            assert after_delete is None

    def test_delete_nonexistent_session(self):
        """delete_session() returns False when there is nothing to delete."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))

            deleted = manager.delete_session(*self._NBA_DEV_KEY)

            assert deleted is False

    def test_list_sessions(self):
        """list_sessions() reports one entry per stored session."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))

            # Two sessions that differ in sport, season and environment.
            manager.create_session(
                record_names=[("game_1", "Game")],
                **self._NBA_DEV,
            )
            manager.create_session(
                sport="mlb",
                season=2026,
                environment="production",
                record_names=[("game_2", "Game"), ("game_3", "Game")],
            )

            listed = manager.list_sessions()

            assert len(listed) == 2
            listed_sports = {entry["sport"] for entry in listed}
            assert listed_sports == {"nba", "mlb"}

    def test_get_session_or_create_new(self):
        """get_session_or_create() builds a new session when none is stored."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))

            fresh = manager.get_session_or_create(
                record_names=[("game_1", "Game")],
                resume=False,
                **self._NBA_DEV,
            )

            assert fresh.sport == "nba"
            assert fresh.total_count == 1

    def test_get_session_or_create_resume(self):
        """With resume=True, stored progress is kept and new records are added."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))

            # Store a two-record session with game_1 already uploaded.
            first_run = manager.create_session(
                record_names=[("game_1", "Game"), ("game_2", "Game")],
                **self._NBA_DEV,
            )
            first_run.mark_uploaded("game_1", "tag_123")
            manager.save_session(first_run)

            # Resume with the same records plus one new record, game_3.
            resumed = manager.get_session_or_create(
                record_names=[
                    ("game_1", "Game"),
                    ("game_2", "Game"),
                    ("game_3", "Game"),
                ],
                resume=True,
                **self._NBA_DEV,
            )

            # Earlier progress survives and the extra record is now tracked.
            assert resumed.records["game_1"].status == "uploaded"
            assert "game_3" in resumed.records
            assert resumed.total_count == 3

    def test_get_session_or_create_overwrite(self):
        """With resume=False, a stored session is replaced by a fresh one."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))

            # Store a session that has progress recorded on game_1.
            first_run = manager.create_session(
                record_names=[("game_1", "Game"), ("game_2", "Game")],
                **self._NBA_DEV,
            )
            first_run.mark_uploaded("game_1", "tag_123")
            manager.save_session(first_run)

            # Ask for the same sport, season and environment without resuming.
            replaced = manager.get_session_or_create(
                record_names=[("game_3", "Game")],
                resume=False,
                **self._NBA_DEV,
            )

            # Only the newly supplied record is present.
            assert replaced.total_count == 1
            assert "game_1" not in replaced.records
            assert "game_3" in replaced.records