feat(scripts): rewrite parser as modular Python CLI
Replace monolithic scraping scripts with sportstime_parser package:

- Multi-source scrapers with automatic fallback for 7 sports
- Canonical ID generation for games, teams, and stadiums
- Fuzzy matching with configurable thresholds for name resolution
- CloudKit Web Services uploader with JWT auth, diff-based updates
- Resumable uploads with checkpoint state persistence
- Validation reports with manual review items and suggested matches
- Comprehensive test suite (249 tests)

CLI: sportstime-parser scrape|validate|upload|status|retry|clear

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
183
Scripts/sportstime_parser/tests/test_canonical_id.py
Normal file
183
Scripts/sportstime_parser/tests/test_canonical_id.py
Normal file
@@ -0,0 +1,183 @@
|
||||
"""Tests for canonical ID generation."""
|
||||
|
||||
import pytest
|
||||
from datetime import datetime, date
|
||||
|
||||
from sportstime_parser.normalizers.canonical_id import (
|
||||
generate_game_id,
|
||||
generate_team_id,
|
||||
generate_team_id_from_abbrev,
|
||||
generate_stadium_id,
|
||||
parse_game_id,
|
||||
normalize_string,
|
||||
)
|
||||
|
||||
|
||||
class TestNormalizeString:
    """Tests for normalize_string function.

    Each test pins the expected output of ``normalize_string`` for one
    category of input: plain names, punctuation, whitespace runs, empty
    input, and accented characters.
    """

    def test_basic_normalization(self):
        """Test basic string normalization (lowercase, spaces to underscores)."""
        assert normalize_string("New York") == "new_york"
        assert normalize_string("Los Angeles") == "los_angeles"

    def test_removes_special_characters(self):
        """Test that special characters are removed."""
        assert normalize_string("AT&T Stadium") == "att_stadium"
        assert normalize_string("St. Louis") == "st_louis"
        assert normalize_string("O'Brien Field") == "obrien_field"

    def test_collapses_whitespace(self):
        """Test that multiple spaces are collapsed."""
        # The inputs must contain *runs* of spaces; single-spaced input would
        # merely repeat test_basic_normalization and prove nothing about
        # collapsing.
        assert normalize_string("New    York") == "new_york"
        assert normalize_string(" Los Angeles ") == "los_angeles"
        assert normalize_string("  Los   Angeles  ") == "los_angeles"

    def test_empty_string(self):
        """Test empty and whitespace-only string handling."""
        assert normalize_string("") == ""
        assert normalize_string(" ") == ""
        # A longer whitespace-only run must also normalize to empty.
        assert normalize_string("   ") == ""

    def test_unicode_normalization(self):
        """Test unicode characters are handled (accents expected to be stripped)."""
        assert normalize_string("Café") == "cafe"
        assert normalize_string("José") == "jose"
|
||||
|
||||
|
||||
class TestGenerateGameId:
    """Checks for generate_game_id: date types, doubleheaders, and casing."""

    def test_basic_game_id(self):
        """A lowercase matchup on a plain date yields the expected ID."""
        christmas = date(2025, 12, 25)
        result = generate_game_id(
            sport="nba",
            season=2025,
            away_abbrev="bos",
            home_abbrev="lal",
            game_date=christmas,
        )
        expected = "nba_2025_bos_lal_1225"
        assert result == expected

    def test_game_id_with_datetime(self):
        """A datetime is accepted; the expected ID carries only month and day."""
        first_pitch = datetime(2026, 4, 1, 19, 0)
        result = generate_game_id(
            sport="mlb",
            season=2026,
            away_abbrev="nyy",
            home_abbrev="bos",
            game_date=first_pitch,
        )
        expected = "mlb_2026_nyy_bos_0401"
        assert result == expected

    def test_game_id_with_game_number(self):
        """Both games of a doubleheader get a distinct numeric suffix."""
        # Everything except game_number is shared between the two games.
        matchup = {
            "sport": "mlb",
            "season": 2026,
            "away_abbrev": "nyy",
            "home_abbrev": "bos",
            "game_date": date(2026, 7, 4),
        }
        opener = generate_game_id(game_number=1, **matchup)
        nightcap = generate_game_id(game_number=2, **matchup)

        assert opener == "mlb_2026_nyy_bos_0704_1"
        assert nightcap == "mlb_2026_nyy_bos_0704_2"

    def test_sport_lowercased(self):
        """Uppercase sport and abbreviations come out lowercased in the ID."""
        result = generate_game_id(
            sport="NBA",
            season=2025,
            away_abbrev="BOS",
            home_abbrev="LAL",
            game_date=date(2025, 12, 25),
        )
        expected = "nba_2025_bos_lal_1225"
        assert result == expected
|
||||
|
||||
|
||||
class TestParseGameId:
    """Checks for parse_game_id: well-formed IDs and rejection of bad ones."""

    def test_parse_basic_game_id(self):
        """An ID without a game-number suffix parses into its components."""
        fields = parse_game_id("nba_2025_bos_lal_1225")
        wanted = {
            "sport": "nba",
            "season": 2025,
            "away_abbrev": "bos",
            "home_abbrev": "lal",
            "month": 12,
            "day": 25,
        }
        for key, value in wanted.items():
            assert fields[key] == value
        # No suffix on the ID, so the game number must be absent (None).
        assert fields["game_number"] is None

    def test_parse_game_id_with_game_number(self):
        """A trailing game number is parsed along with the other components."""
        fields = parse_game_id("mlb_2026_nyy_bos_0704_2")
        wanted = {
            "sport": "mlb",
            "season": 2026,
            "away_abbrev": "nyy",
            "home_abbrev": "bos",
            "month": 7,
            "day": 4,
            "game_number": 2,
        }
        for key, value in wanted.items():
            assert fields[key] == value

    def test_parse_invalid_game_id(self):
        """Malformed, truncated, and empty IDs each raise ValueError."""
        for bad_id in ("invalid", "nba_2025_bos", ""):
            with pytest.raises(ValueError):
                parse_game_id(bad_id)
|
||||
|
||||
|
||||
class TestGenerateTeamId:
    """Checks for generate_team_id (sport + city + name)."""

    def test_basic_team_id(self):
        """City and team name are combined into a prefixed team ID."""
        expected = "team_nba_los_angeles_lakers"
        result = generate_team_id(sport="nba", city="Los Angeles", name="Lakers")
        assert result == expected

    def test_team_id_normalizes_input(self):
        """Mixed-case sport, city, and name come out in normalized form."""
        expected = "team_nba_new_york_yankees"
        result = generate_team_id(sport="NBA", city="New York", name="Yankees")
        assert result == expected
|
||||
|
||||
|
||||
class TestGenerateTeamIdFromAbbrev:
    """Checks for generate_team_id_from_abbrev (sport + abbreviation)."""

    def test_basic_team_id_from_abbrev(self):
        """An abbreviation yields a prefixed, lowercase team ID."""
        expected = "team_nba_lal"
        result = generate_team_id_from_abbrev(sport="nba", abbreviation="LAL")
        assert result == expected

    def test_lowercases_abbreviation(self):
        """Uppercase sport and abbreviation are both lowercased in the ID."""
        expected = "team_mlb_nyy"
        result = generate_team_id_from_abbrev(sport="MLB", abbreviation="NYY")
        assert result == expected
|
||||
|
||||
|
||||
class TestGenerateStadiumId:
    """Checks for generate_stadium_id (sport + stadium name)."""

    def test_basic_stadium_id(self):
        """A plain two-word stadium name yields a prefixed, underscored ID."""
        expected = "stadium_mlb_fenway_park"
        result = generate_stadium_id(sport="mlb", name="Fenway Park")
        assert result == expected

    def test_stadium_id_special_characters(self):
        """An ampersand in the name does not appear in the ID."""
        expected = "stadium_nfl_att_stadium"
        result = generate_stadium_id(sport="nfl", name="AT&T Stadium")
        assert result == expected

    def test_stadium_id_with_sponsor(self):
        """A dotted sponsor name loses its dot in the ID."""
        expected = "stadium_nba_cryptocom_arena"
        result = generate_stadium_id(sport="nba", name="Crypto.com Arena")
        assert result == expected
|
||||
Reference in New Issue
Block a user