feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
472
sportstime_parser/tests/test_uploaders/test_state.py
Normal file
472
sportstime_parser/tests/test_uploaders/test_state.py
Normal file
@@ -0,0 +1,472 @@
|
||||
"""Tests for the upload state manager."""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from sportstime_parser.uploaders.state import (
|
||||
RecordState,
|
||||
UploadSession,
|
||||
StateManager,
|
||||
)
|
||||
|
||||
|
||||
class TestRecordState:
    """Tests for the RecordState dataclass: defaults and dict (de)serialization."""

    def test_create_record_state(self):
        """A RecordState built from only a name and type gets the default field values."""
        state = RecordState(
            record_name="nba_2025_hou_okc_1021",
            record_type="Game",
        )

        assert state.record_name == "nba_2025_hou_okc_1021"
        assert state.record_type == "Game"
        assert state.status == "pending"
        assert state.uploaded_at is None
        assert state.record_change_tag is None
        assert state.error_message is None
        assert state.retry_count == 0

    def test_record_state_to_dict(self):
        """to_dict() carries the fields over, with uploaded_at as an ISO-format string."""
        # A fixed naive timestamp (including microseconds) keeps the test
        # deterministic and avoids datetime.utcnow(), deprecated since Python 3.12.
        uploaded_at = datetime(2026, 1, 10, 12, 0, 0, 123456)
        state = RecordState(
            record_name="nba_2025_hou_okc_1021",
            record_type="Game",
            uploaded_at=uploaded_at,
            record_change_tag="abc123",
            status="uploaded",
        )

        data = state.to_dict()

        assert data["record_name"] == "nba_2025_hou_okc_1021"
        assert data["record_type"] == "Game"
        assert data["status"] == "uploaded"
        assert data["uploaded_at"] == uploaded_at.isoformat()
        assert data["record_change_tag"] == "abc123"

    def test_record_state_from_dict(self):
        """from_dict() rebuilds a RecordState and parses uploaded_at into a datetime."""
        data = {
            "record_name": "nba_2025_hou_okc_1021",
            "record_type": "Game",
            "uploaded_at": "2026-01-10T12:00:00",
            "record_change_tag": "abc123",
            "status": "uploaded",
            "error_message": None,
            "retry_count": 0,
        }

        state = RecordState.from_dict(data)

        assert state.record_name == "nba_2025_hou_okc_1021"
        assert state.record_type == "Game"
        assert state.status == "uploaded"
        assert state.uploaded_at == datetime.fromisoformat("2026-01-10T12:00:00")
        assert state.record_change_tag == "abc123"
|
||||
|
||||
|
||||
class TestUploadSession:
    """Tests for UploadSession: record bookkeeping, counters, and serialization."""

    @staticmethod
    def _nba_session(*game_names):
        """Build the NBA 2025 development session used throughout these tests.

        Every name in ``game_names`` is registered, in order, as a "Game" record.
        """
        fresh = UploadSession(
            sport="nba",
            season=2025,
            environment="development",
        )
        for game_name in game_names:
            fresh.add_record(game_name, "Game")
        return fresh

    def test_create_upload_session(self):
        """A new session keeps its identifying fields and starts with no records."""
        created = self._nba_session()

        assert created.sport == "nba"
        assert created.season == 2025
        assert created.environment == "development"
        assert created.total_count == 0
        assert len(created.records) == 0

    def test_add_record(self):
        """add_record() stores each record under its name along with its type."""
        populated = self._nba_session("game_1", "game_2")
        populated.add_record("team_1", "Team")

        assert populated.total_count == 3
        assert len(populated.records) == 3
        assert "game_1" in populated.records
        assert populated.records["game_1"].record_type == "Game"

    def test_mark_uploaded(self):
        """mark_uploaded() sets the status, change tag, and an upload timestamp."""
        tracked = self._nba_session("game_1")

        tracked.mark_uploaded("game_1", "change_tag_123")

        record = tracked.records["game_1"]
        assert record.status == "uploaded"
        assert record.record_change_tag == "change_tag_123"
        assert record.uploaded_at is not None

    def test_mark_failed(self):
        """mark_failed() sets the status and error message and counts one retry."""
        tracked = self._nba_session("game_1")

        tracked.mark_failed("game_1", "Server error")

        record = tracked.records["game_1"]
        assert record.status == "failed"
        assert record.error_message == "Server error"
        assert record.retry_count == 1

    def test_mark_failed_increments_retry_count(self):
        """Each mark_failed() call on the same record adds one to retry_count."""
        tracked = self._nba_session("game_1")

        for message in ("Error 1", "Error 2", "Error 3"):
            tracked.mark_failed("game_1", message)

        assert tracked.records["game_1"].retry_count == 3

    def test_counts(self):
        """uploaded/failed/pending counts reflect one record in each state."""
        mixed = self._nba_session("game_1", "game_2", "game_3")

        mixed.mark_uploaded("game_1")
        mixed.mark_failed("game_2", "Error")

        assert mixed.uploaded_count == 1
        assert mixed.failed_count == 1
        assert mixed.pending_count == 1

    def test_is_complete(self):
        """is_complete is false until every record has been marked uploaded."""
        pair = self._nba_session("game_1", "game_2")

        assert not pair.is_complete

        pair.mark_uploaded("game_1")
        assert not pair.is_complete

        pair.mark_uploaded("game_2")
        assert pair.is_complete

    def test_progress_percent(self):
        """One uploaded record out of four reports 25.0 percent progress."""
        quartet = self._nba_session("game_1", "game_2", "game_3", "game_4")

        quartet.mark_uploaded("game_1")

        assert quartet.progress_percent == 25.0

    def test_get_pending_records(self):
        """get_pending_records() leaves out uploaded and failed records."""
        mixed = self._nba_session("game_1", "game_2", "game_3")

        mixed.mark_uploaded("game_1")
        mixed.mark_failed("game_2", "Error")

        still_pending = mixed.get_pending_records()

        assert still_pending == ["game_3"]

    def test_get_failed_records(self):
        """get_failed_records() returns the names of the records marked failed."""
        mixed = self._nba_session("game_1", "game_2", "game_3")

        mixed.mark_failed("game_1", "Error 1")
        mixed.mark_failed("game_3", "Error 3")

        failures = mixed.get_failed_records()

        assert set(failures) == {"game_1", "game_3"}

    def test_get_retryable_records(self):
        """A record that has failed max_retries times is not offered for retry."""
        mixed = self._nba_session("game_1", "game_2", "game_3")

        # game_1 fails a single time, staying under the limit.
        mixed.mark_failed("game_1", "Error")

        # game_2 fails three times, reaching the limit passed below.
        for _ in range(3):
            mixed.mark_failed("game_2", "Error")

        eligible = mixed.get_retryable_records(max_retries=3)

        assert eligible == ["game_1"]

    def test_to_dict_and_from_dict(self):
        """A session survives a to_dict() / from_dict() round trip."""
        source = self._nba_session("game_1", "game_2")
        source.mark_uploaded("game_1", "tag_123")

        payload = source.to_dict()
        restored = UploadSession.from_dict(payload)

        assert restored.sport == source.sport
        assert restored.season == source.season
        assert restored.environment == source.environment
        assert restored.total_count == source.total_count
        assert restored.uploaded_count == source.uploaded_count
        assert restored.records["game_1"].status == "uploaded"
|
||||
|
||||
|
||||
class TestStateManager:
    """Tests for StateManager: creating, loading, deleting, and listing sessions."""

    # Session key shared by most tests, as a positional tuple and as keyword arguments.
    _NBA_KEY = ("nba", 2025, "development")
    _NBA_KWARGS = {"sport": "nba", "season": 2025, "environment": "development"}

    def test_create_session(self):
        """create_session() returns a populated session and writes its state file."""
        with TemporaryDirectory() as tmpdir:
            state_dir = Path(tmpdir)
            manager = StateManager(state_dir=state_dir)

            created = manager.create_session(
                record_names=[
                    ("game_1", "Game"),
                    ("game_2", "Game"),
                    ("team_1", "Team"),
                ],
                **self._NBA_KWARGS,
            )

            assert created.sport == "nba"
            assert created.season == 2025
            assert created.total_count == 3

            # The state file must now exist on disk.
            expected_file = state_dir / "upload_state_nba_2025_development.json"
            assert expected_file.exists()

    def test_load_session(self):
        """load_session() returns the state previously written by save_session()."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))

            # Create a session, record progress on it, and save it.
            saved = manager.create_session(
                record_names=[("game_1", "Game")],
                **self._NBA_KWARGS,
            )
            saved.mark_uploaded("game_1", "tag_123")
            manager.save_session(saved)

            # Read it back through the manager.
            reloaded = manager.load_session(*self._NBA_KEY)

            assert reloaded is not None
            assert reloaded.sport == "nba"
            assert reloaded.records["game_1"].status == "uploaded"

    def test_load_nonexistent_session(self):
        """load_session() returns None when no session was ever created."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))

            missing = manager.load_session(*self._NBA_KEY)

            assert missing is None

    def test_delete_session(self):
        """delete_session() returns True and the session can no longer be loaded."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))

            # Something must exist before it can be deleted.
            manager.create_session(
                record_names=[("game_1", "Game")],
                **self._NBA_KWARGS,
            )

            was_deleted = manager.delete_session(*self._NBA_KEY)

            assert was_deleted is True

            # A subsequent load finds nothing.
            after_delete = manager.load_session(*self._NBA_KEY)
            assert after_delete is None

    def test_delete_nonexistent_session(self):
        """delete_session() returns False when there is nothing to delete."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))

            was_deleted = manager.delete_session(*self._NBA_KEY)

            assert was_deleted is False

    def test_list_sessions(self):
        """list_sessions() reports one entry per created session."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))

            # Two sessions that differ in sport, season, and environment.
            manager.create_session(
                record_names=[("game_1", "Game")],
                **self._NBA_KWARGS,
            )
            manager.create_session(
                sport="mlb",
                season=2026,
                environment="production",
                record_names=[("game_2", "Game"), ("game_3", "Game")],
            )

            listed = manager.list_sessions()

            assert len(listed) == 2
            listed_sports = {entry["sport"] for entry in listed}
            assert listed_sports == {"nba", "mlb"}

    def test_get_session_or_create_new(self):
        """get_session_or_create() builds a session when none exists yet."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))

            obtained = manager.get_session_or_create(
                record_names=[("game_1", "Game")],
                resume=False,
                **self._NBA_KWARGS,
            )

            assert obtained.sport == "nba"
            assert obtained.total_count == 1

    def test_get_session_or_create_resume(self):
        """With resume=True, saved progress is kept and new records are added."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))

            # First run: two records, one of which gets uploaded and saved.
            first_run = manager.create_session(
                record_names=[("game_1", "Game"), ("game_2", "Game")],
                **self._NBA_KWARGS,
            )
            first_run.mark_uploaded("game_1", "tag_123")
            manager.save_session(first_run)

            # Second run resumes and supplies one extra record.
            resumed = manager.get_session_or_create(
                record_names=[("game_1", "Game"), ("game_2", "Game"), ("game_3", "Game")],
                resume=True,
                **self._NBA_KWARGS,
            )

            # Earlier progress survives and the new record is present.
            assert resumed.records["game_1"].status == "uploaded"
            assert "game_3" in resumed.records
            assert resumed.total_count == 3

    def test_get_session_or_create_overwrite(self):
        """With resume=False, an existing session is replaced by a fresh one."""
        with TemporaryDirectory() as tmpdir:
            manager = StateManager(state_dir=Path(tmpdir))

            # First run: two records, one of which gets uploaded and saved.
            first_run = manager.create_session(
                record_names=[("game_1", "Game"), ("game_2", "Game")],
                **self._NBA_KWARGS,
            )
            first_run.mark_uploaded("game_1", "tag_123")
            manager.save_session(first_run)

            # Second run declines to resume.
            replacement = manager.get_session_or_create(
                record_names=[("game_3", "Game")],
                resume=False,
                **self._NBA_KWARGS,
            )

            # Only the newly supplied record is tracked.
            assert replacement.total_count == 1
            assert "game_1" not in replacement.records
            assert "game_3" in replacement.records
|
||||
Reference in New Issue
Block a user