feat(scripts): add sportstime-parser data pipeline

Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan are complete:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-20 18:56:25 -06:00
parent ac78042a7e
commit 52d445bca4
76 changed files with 25065 additions and 0 deletions

View File

@@ -0,0 +1,187 @@
"""Tests for canonical ID generation."""
import pytest
from datetime import datetime, date
from sportstime_parser.normalizers.canonical_id import (
generate_game_id,
generate_team_id,
generate_team_id_from_abbrev,
generate_stadium_id,
parse_game_id,
normalize_string,
)
class TestNormalizeString:
    """Tests for the ``normalize_string`` helper.

    Each test pins the exact normalized output expected for a small set
    of representative inputs.
    """

    def test_basic_normalization(self):
        """Mixed-case, single-spaced words become lowercase joined by ``_``."""
        assert normalize_string("New York") == "new_york"
        assert normalize_string("Los Angeles") == "los_angeles"

    def test_removes_special_characters(self):
        """Ampersands, periods and apostrophes are dropped from the result."""
        assert normalize_string("AT&T Stadium") == "att_stadium"
        assert normalize_string("St. Louis") == "st_louis"
        assert normalize_string("O'Brien Field") == "obrien_field"

    def test_collapses_whitespace(self):
        """Runs of spaces collapse to one separator; outer padding is ignored."""
        # The inputs deliberately contain *runs* of spaces. With only single
        # spaces this test would merely repeat test_basic_normalization and
        # never exercise the collapsing behaviour it is named for.
        assert normalize_string("New   York") == "new_york"
        assert normalize_string("  Los   Angeles  ") == "los_angeles"
        # Single leading/trailing space is still stripped.
        assert normalize_string(" Los Angeles ") == "los_angeles"

    def test_empty_string(self):
        """Empty and whitespace-only inputs normalize to the empty string."""
        assert normalize_string("") == ""
        assert normalize_string(" ") == ""
        assert normalize_string("   ") == ""

    def test_unicode_normalization(self):
        """Accented letters are reduced to their plain ASCII counterparts."""
        assert normalize_string("Café") == "cafe"
        assert normalize_string("José") == "jose"
class TestGenerateGameId:
    """Checks on the identifiers returned by ``generate_game_id``."""

    def test_basic_game_id(self):
        """A ``date`` input yields the expected sport/season/date/teams ID."""
        christmas = date(2025, 12, 25)
        result = generate_game_id(
            sport="nba",
            season=2025,
            away_abbrev="bos",
            home_abbrev="lal",
            game_date=christmas,
        )
        assert result == "game_nba_2025_20251225_bos_lal"

    def test_game_id_with_datetime(self):
        """A ``datetime`` input is accepted; the expected ID carries only the date."""
        first_pitch = datetime(2026, 4, 1, 19, 0)
        result = generate_game_id(
            sport="mlb",
            season=2026,
            away_abbrev="nyy",
            home_abbrev="bos",
            game_date=first_pitch,
        )
        assert result == "game_mlb_2026_20260401_nyy_bos"

    def test_game_id_with_game_number(self):
        """Doubleheader games get the game number appended as a suffix."""
        # Both halves of the doubleheader share every argument except
        # ``game_number``, so the common part is spelled out once.
        matchup = {
            "sport": "mlb",
            "season": 2026,
            "away_abbrev": "nyy",
            "home_abbrev": "bos",
            "game_date": date(2026, 7, 4),
        }
        opener = generate_game_id(**matchup, game_number=1)
        nightcap = generate_game_id(**matchup, game_number=2)
        assert opener == "game_mlb_2026_20260704_nyy_bos_1"
        assert nightcap == "game_mlb_2026_20260704_nyy_bos_2"

    def test_sport_lowercased(self):
        """Upper-case sport and abbreviations come out lower-cased in the ID."""
        result = generate_game_id(
            sport="NBA",
            season=2025,
            away_abbrev="BOS",
            home_abbrev="LAL",
            game_date=date(2025, 12, 25),
        )
        assert result == "game_nba_2025_20251225_bos_lal"
class TestParseGameId:
    """Checks on the mapping returned by ``parse_game_id``."""

    def test_parse_basic_game_id(self):
        """An ID without a suffix parses into its fields, with no game number."""
        fields = parse_game_id("game_nba_2025_20251225_bos_lal")
        wanted = (
            ("sport", "nba"),
            ("season", 2025),
            ("away_abbrev", "bos"),
            ("home_abbrev", "lal"),
            ("year", 2025),
            ("month", 12),
            ("day", 25),
        )
        for key, value in wanted:
            assert fields[key] == value
        assert fields["game_number"] is None

    def test_parse_game_id_with_game_number(self):
        """A trailing numeric suffix is parsed as the integer game number."""
        fields = parse_game_id("game_mlb_2026_20260704_nyy_bos_2")
        wanted = (
            ("sport", "mlb"),
            ("season", 2026),
            ("away_abbrev", "nyy"),
            ("home_abbrev", "bos"),
            ("year", 2026),
            ("month", 7),
            ("day", 4),
            ("game_number", 2),
        )
        for key, value in wanted:
            assert fields[key] == value

    def test_parse_invalid_game_id(self):
        """Malformed IDs are rejected with ``ValueError``."""
        malformed_ids = (
            "invalid",
            "nba_2025_bos",  # Missing game_ prefix
            "",
            "game_nba_2025_bos_lal",  # Missing date
        )
        for bad_id in malformed_ids:
            with pytest.raises(ValueError):
                parse_game_id(bad_id)
class TestGenerateTeamId:
    """Checks on the identifiers returned by ``generate_team_id``."""

    def test_basic_team_id(self):
        """City and name are combined into a ``team_<sport>_...`` identifier."""
        assert (
            generate_team_id(sport="nba", city="Los Angeles", name="Lakers")
            == "team_nba_los_angeles_lakers"
        )

    def test_team_id_normalizes_input(self):
        """Upper-case sport and mixed-case city/name come out lower-cased."""
        assert (
            generate_team_id(sport="NBA", city="New York", name="Yankees")
            == "team_nba_new_york_yankees"
        )
class TestGenerateTeamIdFromAbbrev:
    """Checks on the identifiers returned by ``generate_team_id_from_abbrev``."""

    def test_basic_team_id_from_abbrev(self):
        """An abbreviation produces a ``team_<sport>_<abbrev>`` identifier."""
        assert (
            generate_team_id_from_abbrev(sport="nba", abbreviation="LAL")
            == "team_nba_lal"
        )

    def test_lowercases_abbreviation(self):
        """Upper-case sport and abbreviation both appear lower-cased in the ID."""
        assert (
            generate_team_id_from_abbrev(sport="MLB", abbreviation="NYY")
            == "team_mlb_nyy"
        )
class TestGenerateStadiumId:
    """Checks on the identifiers returned by ``generate_stadium_id``."""

    def test_basic_stadium_id(self):
        """A plain two-word venue name yields ``stadium_<sport>_<name>``."""
        assert (
            generate_stadium_id(sport="mlb", name="Fenway Park")
            == "stadium_mlb_fenway_park"
        )

    def test_stadium_id_special_characters(self):
        """The ampersand in the venue name does not appear in the ID."""
        assert (
            generate_stadium_id(sport="nfl", name="AT&T Stadium")
            == "stadium_nfl_att_stadium"
        )

    def test_stadium_id_with_sponsor(self):
        """The period in a sponsor name such as ``Crypto.com`` is dropped."""
        assert (
            generate_stadium_id(sport="nba", name="Crypto.com Arena")
            == "stadium_nba_cryptocom_arena"
        )