feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan are completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
226
sportstime_parser/tests/test_scrapers/test_wnba.py
Normal file
226
sportstime_parser/tests/test_scrapers/test_wnba.py
Normal file
@@ -0,0 +1,226 @@
|
||||
"""Tests for WNBA scraper."""
|
||||
|
||||
from datetime import datetime
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from sportstime_parser.scrapers.wnba import WNBAScraper, create_wnba_scraper
|
||||
from sportstime_parser.scrapers.base import RawGameData
|
||||
from sportstime_parser.tests.fixtures import (
|
||||
load_json_fixture,
|
||||
WNBA_ESPN_SCOREBOARD_JSON,
|
||||
)
|
||||
|
||||
|
||||
class TestWNBAScraperInit:
    """Construction-time behaviour of ``WNBAScraper``."""

    def test_creates_scraper_with_season(self):
        """Direct construction exposes the sport key and the requested season."""
        wnba = WNBAScraper(season=2026)

        assert wnba.sport == "wnba"
        assert wnba.season == 2026

    def test_factory_function_creates_scraper(self):
        """``create_wnba_scraper`` returns a ``WNBAScraper`` bound to the season."""
        built = create_wnba_scraper(season=2026)

        assert isinstance(built, WNBAScraper)
        assert built.season == 2026

    def test_expected_game_count(self):
        """The scraper reports 220 as its expected game count."""
        assert WNBAScraper(season=2026).expected_game_count == 220

    def test_sources_in_priority_order(self):
        """``_get_sources`` yields exactly the ESPN source."""
        source_names = WNBAScraper(season=2026)._get_sources()

        assert source_names == ["espn"]
|
||||
|
||||
|
||||
class TestESPNParsing:
    """Test ESPN API response parsing against the bundled scoreboard fixture."""

    @staticmethod
    def _parse_fixture():
        """Parse the WNBA ESPN scoreboard fixture with a 2026 scraper.

        Returns:
            The parsed games materialised as a list, so callers can both
            iterate the result and check that it is non-empty.
        """
        scraper = WNBAScraper(season=2026)
        data = load_json_fixture(WNBA_ESPN_SCOREBOARD_JSON)
        return list(scraper._parse_espn_response(data, "http://espn.com/api"))

    def test_parses_completed_games(self):
        """Test parsing completed games from ESPN."""
        games = self._parse_fixture()

        completed = [g for g in games if g.status == "final"]
        assert len(completed) == 2

        # Aces @ Liberty
        # Use a default so a missing game is reported as a readable assertion
        # failure instead of an opaque StopIteration error.
        lv_ny = next(
            (g for g in completed if g.away_team_raw == "Las Vegas Aces"),
            None,
        )
        assert lv_ny is not None, "no completed game with Las Vegas Aces away"
        assert lv_ny.home_team_raw == "New York Liberty"
        assert lv_ny.away_score == 88
        assert lv_ny.home_score == 92
        assert lv_ny.stadium_raw == "Barclays Center"

    def test_parses_scheduled_games(self):
        """Test parsing scheduled games from ESPN."""
        games = self._parse_fixture()

        scheduled = [g for g in games if g.status == "scheduled"]
        assert len(scheduled) == 1

        phx_sea = scheduled[0]
        assert phx_sea.away_team_raw == "Phoenix Mercury"
        assert phx_sea.home_team_raw == "Seattle Storm"
        assert phx_sea.stadium_raw == "Climate Pledge Arena"

    def test_parses_venue_info(self):
        """Test venue information is extracted for every parsed game."""
        games = self._parse_fixture()

        # Guard against a vacuous pass: with zero parsed games the loop below
        # would assert nothing and the test would succeed silently.
        assert games, "fixture produced no games"

        for game in games:
            assert game.stadium_raw is not None
|
||||
|
||||
|
||||
class TestGameNormalization:
    """Normalisation of raw games into canonical records."""

    def test_normalizes_games_with_canonical_ids(self):
        """A resolvable final game yields one record with canonical IDs and scores."""
        wnba = WNBAScraper(season=2026)

        aces_at_liberty = RawGameData(
            game_date=datetime(2026, 5, 20),
            home_team_raw="New York Liberty",
            away_team_raw="Las Vegas Aces",
            stadium_raw="Barclays Center",
            home_score=92,
            away_score=88,
            status="final",
            source_url="http://example.com",
        )

        normalized, pending_review = wnba._normalize_games([aces_at_liberty])

        assert len(normalized) == 1
        record = normalized[0]

        # Canonical game identity
        assert record.id == "wnba_2026_lv_ny_0520"
        assert record.sport == "wnba"
        assert record.season == 2026

        # Canonical team identity
        assert record.home_team_id == "team_wnba_ny"
        assert record.away_team_id == "team_wnba_lv"

        # Scores carried through unchanged
        assert record.home_score == 92
        assert record.away_score == 88

    def test_creates_review_items_for_unresolved_teams(self):
        """An unknown home team produces no game and at least one review item."""
        wnba = WNBAScraper(season=2026)

        unknown_home = RawGameData(
            game_date=datetime(2026, 5, 20),
            home_team_raw="Unknown Team XYZ",
            away_team_raw="Las Vegas Aces",
            stadium_raw="Barclays Center",
            status="scheduled",
        )

        normalized, pending_review = wnba._normalize_games([unknown_home])

        # The unresolved team must keep the game out of the output ...
        assert len(normalized) == 0

        # ... while still being surfaced for manual review.
        assert len(pending_review) >= 1
|
||||
|
||||
|
||||
class TestTeamAndStadiumScraping:
    """Checks on the team and stadium records the scraper returns."""

    def test_scrapes_all_wnba_teams(self):
        """``scrape_teams`` returns 13 uniquely identified, fully populated teams."""
        roster = WNBAScraper(season=2026).scrape_teams()

        # 13 teams expected (count includes the Golden State Valkyries)
        assert len(roster) == 13

        # No two teams may share an ID
        distinct_ids = {entry.id for entry in roster}
        assert len(distinct_ids) == 13

        # Every team carries its required fields
        for entry in roster:
            assert entry.id.startswith("team_wnba_")
            assert entry.sport == "wnba"
            assert entry.city
            assert entry.name
            assert entry.full_name
            assert entry.abbreviation

    def test_scrapes_all_wnba_stadiums(self):
        """``scrape_stadiums`` returns 13 uniquely identified, fully populated stadiums."""
        venues = WNBAScraper(season=2026).scrape_stadiums()

        # One stadium per team is expected
        assert len(venues) == 13

        # No two stadiums may share an ID
        distinct_ids = {venue.id for venue in venues}
        assert len(distinct_ids) == 13

        # Every stadium carries its required fields
        for venue in venues:
            assert venue.id.startswith("stadium_wnba_")
            assert venue.sport == "wnba"
            assert venue.name
            assert venue.city
            assert venue.state
            assert venue.country == "USA"
            assert venue.latitude != 0
            assert venue.longitude != 0
|
||||
|
||||
|
||||
class TestScrapeFallback:
    """Failure handling when the ESPN source raises."""

    def test_returns_failure_when_espn_fails(self):
        """A raising ``_scrape_espn`` makes ``scrape_games`` report failure."""
        wnba = WNBAScraper(season=2026)
        espn_down = patch.object(
            wnba, "_scrape_espn", side_effect=Exception("ESPN failed")
        )

        # The scrape must run while the patch is active.
        with espn_down:
            outcome = wnba.scrape_games()

        assert not outcome.success
        assert "All sources failed" in outcome.error_message
|
||||
|
||||
|
||||
class TestSeasonMonths:
    """Checks on the (year, month) pairs produced for a season."""

    def test_gets_correct_season_months(self):
        """A 2026 season spans six months, from (2026, 5) to (2026, 10)."""
        season_span = WNBAScraper(season=2026)._get_season_months()

        # May, Jun, Jul, Aug, Sep, Oct
        assert len(season_span) == 6

        opening_month = season_span[0]
        closing_month = season_span[-1]

        # Season opens in May of the season year ...
        assert opening_month == (2026, 5)

        # ... and closes in October.
        assert closing_month == (2026, 10)
|
||||
Reference in New Issue
Block a user