Files
Trey t 52d445bca4 feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan are completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:56:25 -06:00

351 lines
12 KiB
Python

"""Tests for the record differ."""
import pytest
from datetime import datetime
from sportstime_parser.models.game import Game
from sportstime_parser.models.team import Team
from sportstime_parser.models.stadium import Stadium
from sportstime_parser.uploaders.diff import (
DiffAction,
RecordDiff,
DiffResult,
RecordDiffer,
game_to_cloudkit_record,
team_to_cloudkit_record,
stadium_to_cloudkit_record,
)
from sportstime_parser.uploaders.cloudkit import RecordType
class TestRecordDiff:
    """Checks that a RecordDiff keeps the values it was constructed with."""

    def test_create_record_diff(self):
        """A RecordDiff exposes its record name, record type and action."""
        # One mapping drives both construction and verification, so the
        # expected values are written down exactly once.
        expected = {
            "record_name": "nba_2025_hou_okc_1021",
            "record_type": RecordType.GAME,
            "action": DiffAction.CREATE,
        }

        record_diff = RecordDiff(**expected)

        for attribute, value in expected.items():
            assert getattr(record_diff, attribute) == value, attribute
class TestDiffResult:
    """Checks the counters exposed by DiffResult."""

    def test_empty_result(self):
        """A freshly constructed DiffResult reports zero for every counter."""
        result = DiffResult()

        counters = (
            "create_count",
            "update_count",
            "delete_count",
            "unchanged_count",
            "total_changes",
        )
        for counter in counters:
            assert getattr(result, counter) == 0, counter

    def test_counts(self):
        """Each counter reflects how many diffs were appended to its list."""
        result = DiffResult()

        # (target list, record name, action) for every diff to register:
        # two creates, one update, one delete and one unchanged record.
        planned = [
            (result.creates, "game_1", DiffAction.CREATE),
            (result.creates, "game_2", DiffAction.CREATE),
            (result.updates, "game_3", DiffAction.UPDATE),
            (result.deletes, "game_4", DiffAction.DELETE),
            (result.unchanged, "game_5", DiffAction.UNCHANGED),
        ]
        for bucket, name, action in planned:
            bucket.append(
                RecordDiff(
                    record_name=name,
                    record_type=RecordType.GAME,
                    action=action,
                )
            )

        assert result.create_count == 2
        assert result.update_count == 1
        assert result.delete_count == 1
        assert result.unchanged_count == 1
        # total_changes excludes the unchanged record: 2 + 1 + 1.
        assert result.total_changes == 4
class TestRecordDiffer:
    """Exercises RecordDiffer against hand-built local and remote records."""

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _epoch_millis(moment):
        """Return ``moment`` as whole milliseconds since the epoch."""
        return int(moment.timestamp() * 1000)

    @staticmethod
    def _remote_game(record_name, change_tag, **fields):
        """Build a remote "Game" record dict for the differ.

        Each keyword argument is a ``(value, type_name)`` pair and becomes
        one ``{"value": ..., "type": ...}`` entry under ``"fields"``, in the
        order the keywords were passed.
        """
        return {
            "recordName": record_name,
            "recordType": "Game",
            "fields": {
                key: {"value": value, "type": type_name}
                for key, (value, type_name) in fields.items()
            },
            "recordChangeTag": change_tag,
        }

    # ------------------------------------------------------------------
    # Fixtures
    # ------------------------------------------------------------------

    @pytest.fixture
    def differ(self):
        """Provide a fresh RecordDiffer."""
        return RecordDiffer()

    @pytest.fixture
    def sample_game(self):
        """Provide a scheduled NBA game (Houston at Oklahoma City)."""
        tipoff = datetime(2025, 10, 21, 19, 0, 0)
        return Game(
            id="nba_2025_hou_okc_1021",
            sport="nba",
            season=2025,
            home_team_id="team_nba_okc",
            away_team_id="team_nba_hou",
            stadium_id="stadium_nba_paycom_center",
            game_date=tipoff,
            status="scheduled",
        )

    @pytest.fixture
    def sample_team(self):
        """Provide the Oklahoma City Thunder as a Team."""
        return Team(
            id="team_nba_okc",
            sport="nba",
            city="Oklahoma City",
            name="Thunder",
            full_name="Oklahoma City Thunder",
            abbreviation="OKC",
            conference="Western",
            division="Northwest",
        )

    @pytest.fixture
    def sample_stadium(self):
        """Provide Paycom Center as a Stadium."""
        return Stadium(
            id="stadium_nba_paycom_center",
            sport="nba",
            name="Paycom Center",
            city="Oklahoma City",
            state="OK",
            country="USA",
            latitude=35.4634,
            longitude=-97.5151,
            capacity=18203,
        )

    # ------------------------------------------------------------------
    # Games
    # ------------------------------------------------------------------

    def test_diff_games_create(self, differ, sample_game):
        """A local game with no remote counterpart is reported as a create."""
        outcome = differ.diff_games([sample_game], [])

        assert outcome.create_count == 1
        assert outcome.update_count == 0
        assert outcome.delete_count == 0
        assert outcome.creates[0].record_name == sample_game.id

    def test_diff_games_delete(self, differ, sample_game):
        """A remote game with no local counterpart is reported as a delete."""
        orphaned_remote = self._remote_game(
            sample_game.id,
            "abc123",
            sport=("nba", "STRING"),
            season=(2025, "INT64"),
        )

        outcome = differ.diff_games([], [orphaned_remote])

        assert outcome.create_count == 0
        assert outcome.delete_count == 1
        assert outcome.deletes[0].record_name == sample_game.id

    def test_diff_games_unchanged(self, differ, sample_game):
        """A remote record matching the local game is reported as unchanged."""
        matching_remote = self._remote_game(
            sample_game.id,
            "abc123",
            sport=("nba", "STRING"),
            season=(2025, "INT64"),
            home_team_id=("team_nba_okc", "STRING"),
            away_team_id=("team_nba_hou", "STRING"),
            stadium_id=("stadium_nba_paycom_center", "STRING"),
            game_date=(self._epoch_millis(sample_game.game_date), "TIMESTAMP"),
            game_number=(None, "INT64"),
            home_score=(None, "INT64"),
            away_score=(None, "INT64"),
            status=("scheduled", "STRING"),
        )

        outcome = differ.diff_games([sample_game], [matching_remote])

        assert outcome.create_count == 0
        assert outcome.update_count == 0
        assert outcome.unchanged_count == 1

    def test_diff_games_update(self, differ, sample_game):
        """A remote record with a different status is reported as an update."""
        # Identical to the local game except for the status field.
        stale_remote = self._remote_game(
            sample_game.id,
            "abc123",
            sport=("nba", "STRING"),
            season=(2025, "INT64"),
            home_team_id=("team_nba_okc", "STRING"),
            away_team_id=("team_nba_hou", "STRING"),
            stadium_id=("stadium_nba_paycom_center", "STRING"),
            game_date=(self._epoch_millis(sample_game.game_date), "TIMESTAMP"),
            game_number=(None, "INT64"),
            home_score=(None, "INT64"),
            away_score=(None, "INT64"),
            status=("postponed", "STRING"),
        )

        outcome = differ.diff_games([sample_game], [stale_remote])

        assert outcome.update_count == 1
        pending_update = outcome.updates[0]
        assert "status" in pending_update.changed_fields
        # The remote change tag must be carried onto the update diff.
        assert pending_update.record_change_tag == "abc123"

    # ------------------------------------------------------------------
    # Teams and stadiums
    # ------------------------------------------------------------------

    def test_diff_teams_create(self, differ, sample_team):
        """A local team with no remote counterpart is reported as a create."""
        outcome = differ.diff_teams([sample_team], [])

        assert outcome.create_count == 1
        assert outcome.creates[0].record_name == sample_team.id

    def test_diff_stadiums_create(self, differ, sample_stadium):
        """A local stadium with no remote counterpart is reported as a create."""
        outcome = differ.diff_stadiums([sample_stadium], [])

        assert outcome.create_count == 1
        assert outcome.creates[0].record_name == sample_stadium.id

    # ------------------------------------------------------------------
    # Upload selection
    # ------------------------------------------------------------------

    def test_get_records_to_upload(self, differ, sample_game):
        """get_records_to_upload returns both created and updated records."""
        second_game = Game(
            id="nba_2025_lal_lac_1022",
            sport="nba",
            season=2025,
            home_team_id="team_nba_lac",
            away_team_id="team_nba_lal",
            stadium_id="stadium_nba_crypto_com",
            game_date=datetime(2025, 10, 22, 19, 0, 0),
            status="scheduled",
        )

        # Only the second game exists remotely, and with a different status,
        # so the first game is a create and the second an update.
        stale_remote = self._remote_game(
            second_game.id,
            "xyz789",
            sport=("nba", "STRING"),
            season=(2025, "INT64"),
            home_team_id=("team_nba_lac", "STRING"),
            away_team_id=("team_nba_lal", "STRING"),
            stadium_id=("stadium_nba_crypto_com", "STRING"),
            game_date=(self._epoch_millis(second_game.game_date), "TIMESTAMP"),
            status=("postponed", "STRING"),
        )

        outcome = differ.diff_games([sample_game, second_game], [stale_remote])
        uploads = outcome.get_records_to_upload()

        assert len(uploads) == 2  # 1 create + 1 update
        uploaded_names = [upload.record_name for upload in uploads]
        assert sample_game.id in uploaded_names
        assert second_game.id in uploaded_names
class TestConvenienceFunctions:
    """Checks the module-level model-to-CloudKitRecord converters."""

    def test_game_to_cloudkit_record(self):
        """game_to_cloudkit_record keeps the game's id, type, sport and season."""
        opener = Game(
            id="nba_2025_hou_okc_1021",
            sport="nba",
            season=2025,
            home_team_id="team_nba_okc",
            away_team_id="team_nba_hou",
            stadium_id="stadium_nba_paycom_center",
            game_date=datetime(2025, 10, 21, 19, 0, 0),
            status="scheduled",
        )

        converted = game_to_cloudkit_record(opener)

        assert converted.record_name == opener.id
        assert converted.record_type == RecordType.GAME
        fields = converted.fields
        assert fields["sport"] == "nba"
        assert fields["season"] == 2025

    def test_team_to_cloudkit_record(self):
        """team_to_cloudkit_record keeps the team's id, type, city and name."""
        thunder = Team(
            id="team_nba_okc",
            sport="nba",
            city="Oklahoma City",
            name="Thunder",
            full_name="Oklahoma City Thunder",
            abbreviation="OKC",
        )

        converted = team_to_cloudkit_record(thunder)

        assert converted.record_name == thunder.id
        assert converted.record_type == RecordType.TEAM
        fields = converted.fields
        assert fields["city"] == "Oklahoma City"
        assert fields["name"] == "Thunder"

    def test_stadium_to_cloudkit_record(self):
        """stadium_to_cloudkit_record keeps the stadium's id, type, name and latitude."""
        arena = Stadium(
            id="stadium_nba_paycom_center",
            sport="nba",
            name="Paycom Center",
            city="Oklahoma City",
            state="OK",
            country="USA",
            latitude=35.4634,
            longitude=-97.5151,
        )

        converted = stadium_to_cloudkit_record(arena)

        assert converted.record_name == arena.id
        assert converted.record_type == RecordType.STADIUM
        fields = converted.fields
        assert fields["name"] == "Paycom Center"
        assert fields["latitude"] == 35.4634