feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit. Includes: - Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL - Canonical ID system for teams, stadiums, and games - Fuzzy matching with manual alias support - CloudKit uploader with batch operations and deduplication - Comprehensive test suite with fixtures - WNBA abbreviation aliases for improved team resolution - Alias validation script to detect orphan references All 5 phases of data remediation plan completed: - Phase 1: Alias fixes (team/stadium alias additions) - Phase 2: NHL stadium coordinate fixes - Phase 3: Re-scrape validation - Phase 4: iOS bundle update - Phase 5: Code quality improvements (WNBA aliases) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
1
sportstime_parser/tests/test_scrapers/__init__.py
Normal file
1
sportstime_parser/tests/test_scrapers/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Tests for scrapers module."""
|
||||
257
sportstime_parser/tests/test_scrapers/test_mlb.py
Normal file
257
sportstime_parser/tests/test_scrapers/test_mlb.py
Normal file
@@ -0,0 +1,257 @@
|
||||
"""Tests for MLB scraper."""
|
||||
|
||||
from datetime import datetime
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from sportstime_parser.scrapers.mlb import MLBScraper, create_mlb_scraper
|
||||
from sportstime_parser.scrapers.base import RawGameData
|
||||
from sportstime_parser.tests.fixtures import (
|
||||
load_json_fixture,
|
||||
MLB_ESPN_SCOREBOARD_JSON,
|
||||
)
|
||||
|
||||
|
||||
class TestMLBScraperInit:
    """Initialization behavior of MLBScraper and its factory function."""

    def test_creates_scraper_with_season(self):
        """Scraper records the sport key and the requested season."""
        mlb = MLBScraper(season=2026)
        assert (mlb.sport, mlb.season) == ("mlb", 2026)

    def test_factory_function_creates_scraper(self):
        """create_mlb_scraper builds an MLBScraper for the given season."""
        mlb = create_mlb_scraper(season=2026)
        assert isinstance(mlb, MLBScraper)
        assert mlb.season == 2026

    def test_expected_game_count(self):
        """A full MLB regular season is 2430 games (30 teams * 162 / 2)."""
        assert MLBScraper(season=2026).expected_game_count == 2430

    def test_sources_in_priority_order(self):
        """Baseball-Reference is tried first, then the MLB API, then ESPN."""
        priority = ["baseball_reference", "mlb_api", "espn"]
        assert MLBScraper(season=2026)._get_sources() == priority
|
||||
|
||||
|
||||
class TestESPNParsing:
    """Parsing of ESPN scoreboard API responses for MLB."""

    def _parsed_games(self):
        """Load the MLB ESPN fixture and run it through the parser."""
        scraper = MLBScraper(season=2026)
        payload = load_json_fixture(MLB_ESPN_SCOREBOARD_JSON)
        return scraper._parse_espn_response(payload, "http://espn.com/api")

    def test_parses_completed_games(self):
        """Final games carry both scores and the venue."""
        finals = [g for g in self._parsed_games() if g.status == "final"]
        assert len(finals) == 2

        # Yankees @ Red Sox
        game = next(g for g in finals if g.away_team_raw == "New York Yankees")
        assert game.home_team_raw == "Boston Red Sox"
        assert (game.away_score, game.home_score) == (3, 5)
        assert game.stadium_raw == "Fenway Park"

    def test_parses_scheduled_games(self):
        """Scheduled games come through with teams and venue but no result."""
        upcoming = [g for g in self._parsed_games() if g.status == "scheduled"]
        assert len(upcoming) == 1

        game = upcoming[0]
        assert game.away_team_raw == "Los Angeles Dodgers"
        assert game.home_team_raw == "San Francisco Giants"
        assert game.stadium_raw == "Oracle Park"

    def test_parses_venue_info(self):
        """Every parsed game includes a raw stadium name."""
        for game in self._parsed_games():
            assert game.stadium_raw is not None
|
||||
|
||||
|
||||
class TestGameNormalization:
    """Normalization of raw MLB games into canonical records."""

    def test_normalizes_games_with_canonical_ids(self):
        """A resolvable matchup yields one game with canonical IDs."""
        raw = RawGameData(
            game_date=datetime(2026, 4, 15),
            home_team_raw="Boston Red Sox",
            away_team_raw="New York Yankees",
            stadium_raw="Fenway Park",
            home_score=5,
            away_score=3,
            status="final",
            source_url="http://example.com",
        )

        games, _review = MLBScraper(season=2026)._normalize_games([raw])

        assert len(games) == 1
        game = games[0]

        # Canonical ID shape: sport_season_away_home_MMDD
        assert game.id == "mlb_2026_nyy_bos_0415"
        assert game.sport == "mlb"
        assert game.season == 2026

        # Teams resolve to canonical team IDs.
        assert game.home_team_id == "team_mlb_bos"
        assert game.away_team_id == "team_mlb_nyy"

        # Scores pass through unchanged.
        assert (game.home_score, game.away_score) == (5, 3)

    def test_creates_review_items_for_unresolved_teams(self):
        """An unresolvable team name yields a review item, not a game."""
        raw = RawGameData(
            game_date=datetime(2026, 4, 15),
            home_team_raw="Unknown Team XYZ",
            away_team_raw="Boston Red Sox",
            stadium_raw="Fenway Park",
            status="scheduled",
        )

        games, review_items = MLBScraper(season=2026)._normalize_games([raw])

        # No game is emitted for the unresolved home team ...
        assert len(games) == 0
        # ... but the problem is surfaced for manual review.
        assert len(review_items) >= 1
|
||||
|
||||
|
||||
class TestTeamAndStadiumScraping:
    """Static team and stadium catalogs for MLB."""

    def test_scrapes_all_mlb_teams(self):
        """All 30 MLB teams are returned with unique IDs and full fields."""
        teams = MLBScraper(season=2026).scrape_teams()

        assert len(teams) == 30
        assert len({t.id for t in teams}) == 30  # IDs must be unique

        for team in teams:
            assert team.id.startswith("team_mlb_")
            assert team.sport == "mlb"
            # Descriptive fields must all be non-empty.
            assert team.city
            assert team.name
            assert team.full_name
            assert team.abbreviation

    def test_teams_have_leagues_and_divisions(self):
        """Teams split 15/15 between the American and National leagues."""
        teams = MLBScraper(season=2026).scrape_teams()

        american = [t for t in teams if t.conference == "American"]
        national = [t for t in teams if t.conference == "National"]

        assert len(american) == 15
        assert len(national) == 15

    def test_scrapes_all_mlb_stadiums(self):
        """All 30 stadiums are returned with location and coordinates."""
        stadiums = MLBScraper(season=2026).scrape_stadiums()

        assert len(stadiums) == 30
        assert len({s.id for s in stadiums}) == 30  # IDs must be unique

        for stadium in stadiums:
            assert stadium.id.startswith("stadium_mlb_")
            assert stadium.sport == "mlb"
            assert stadium.name
            assert stadium.city
            assert stadium.state
            assert stadium.country in ["USA", "Canada"]
            # Zero coordinates would indicate missing geodata.
            assert stadium.latitude != 0
            assert stadium.longitude != 0
|
||||
|
||||
|
||||
class TestScrapeFallback:
    """Fallback across data sources when earlier ones fail."""

    def test_falls_back_to_next_source_on_failure(self):
        """When BR and the MLB API both raise, ESPN data is used instead."""
        scraper = MLBScraper(season=2026)

        espn_games = [
            RawGameData(
                game_date=datetime(2026, 4, 15),
                home_team_raw="Boston Red Sox",
                away_team_raw="New York Yankees",
                stadium_raw="Fenway Park",
                status="scheduled",
            )
        ]

        with patch.object(scraper, '_scrape_baseball_reference') as mock_br, \
                patch.object(scraper, '_scrape_mlb_api') as mock_mlb, \
                patch.object(scraper, '_scrape_espn') as mock_espn:
            # First two sources blow up; ESPN succeeds.
            mock_br.side_effect = Exception("Connection failed")
            mock_mlb.side_effect = Exception("API error")
            mock_espn.return_value = espn_games

            result = scraper.scrape_games()

        assert result.success
        assert result.source == "espn"
        # Every source up to the successful one was attempted, in order.
        assert mock_br.called
        assert mock_mlb.called
        assert mock_espn.called
|
||||
|
||||
|
||||
class TestSeasonMonths:
    """Month-range computation for the MLB season."""

    def test_gets_correct_season_months(self):
        """MLB runs March through November of the season year."""
        months = MLBScraper(season=2026)._get_season_months()

        # Nine months: Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov.
        assert len(months) == 9
        assert months[0] == (2026, 3)    # opens in March
        assert months[-1] == (2026, 11)  # closes in November
|
||||
251
sportstime_parser/tests/test_scrapers/test_mls.py
Normal file
251
sportstime_parser/tests/test_scrapers/test_mls.py
Normal file
@@ -0,0 +1,251 @@
|
||||
"""Tests for MLS scraper."""
|
||||
|
||||
from datetime import datetime
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from sportstime_parser.scrapers.mls import MLSScraper, create_mls_scraper
|
||||
from sportstime_parser.scrapers.base import RawGameData
|
||||
from sportstime_parser.tests.fixtures import (
|
||||
load_json_fixture,
|
||||
MLS_ESPN_SCOREBOARD_JSON,
|
||||
)
|
||||
|
||||
|
||||
class TestMLSScraperInit:
    """Initialization behavior of MLSScraper and its factory function."""

    def test_creates_scraper_with_season(self):
        """Scraper records the sport key and the requested season."""
        mls = MLSScraper(season=2026)
        assert (mls.sport, mls.season) == ("mls", 2026)

    def test_factory_function_creates_scraper(self):
        """create_mls_scraper builds an MLSScraper for the given season."""
        mls = create_mls_scraper(season=2026)
        assert isinstance(mls, MLSScraper)
        assert mls.season == 2026

    def test_expected_game_count(self):
        """The expected MLS regular-season game count is 493."""
        assert MLSScraper(season=2026).expected_game_count == 493

    def test_sources_in_priority_order(self):
        """ESPN is tried first, FBref second."""
        assert MLSScraper(season=2026)._get_sources() == ["espn", "fbref"]
|
||||
|
||||
|
||||
class TestESPNParsing:
    """Parsing of ESPN scoreboard API responses for MLS."""

    def _parsed_games(self):
        """Load the MLS ESPN fixture and run it through the parser."""
        scraper = MLSScraper(season=2026)
        payload = load_json_fixture(MLS_ESPN_SCOREBOARD_JSON)
        return scraper._parse_espn_response(payload, "http://espn.com/api")

    def test_parses_completed_games(self):
        """Final games carry both scores and the venue."""
        finals = [g for g in self._parsed_games() if g.status == "final"]
        assert len(finals) == 2

        # Galaxy @ LAFC
        match = next(g for g in finals if g.away_team_raw == "LA Galaxy")
        assert match.home_team_raw == "Los Angeles FC"
        assert (match.away_score, match.home_score) == (2, 3)
        assert match.stadium_raw == "BMO Stadium"

    def test_parses_scheduled_games(self):
        """Scheduled games come through with teams and venue but no result."""
        upcoming = [g for g in self._parsed_games() if g.status == "scheduled"]
        assert len(upcoming) == 1

        match = upcoming[0]
        assert match.away_team_raw == "New York Red Bulls"
        assert match.home_team_raw == "Atlanta United FC"
        assert match.stadium_raw == "Mercedes-Benz Stadium"

    def test_parses_venue_info(self):
        """Every parsed game includes a raw stadium name."""
        for match in self._parsed_games():
            assert match.stadium_raw is not None
|
||||
|
||||
|
||||
class TestGameNormalization:
    """Normalization of raw MLS games into canonical records."""

    def test_normalizes_games_with_canonical_ids(self):
        """A resolvable matchup yields one game with canonical IDs."""
        raw = RawGameData(
            game_date=datetime(2026, 3, 15),
            home_team_raw="Los Angeles FC",
            away_team_raw="LA Galaxy",
            stadium_raw="BMO Stadium",
            home_score=3,
            away_score=2,
            status="final",
            source_url="http://example.com",
        )

        games, _review = MLSScraper(season=2026)._normalize_games([raw])

        assert len(games) == 1
        match = games[0]

        # Canonical ID shape: sport_season_away_home_MMDD
        assert match.id == "mls_2026_lag_lafc_0315"
        assert match.sport == "mls"
        assert match.season == 2026

        # Teams resolve to canonical team IDs.
        assert match.home_team_id == "team_mls_lafc"
        assert match.away_team_id == "team_mls_lag"

        # Scores pass through unchanged.
        assert (match.home_score, match.away_score) == (3, 2)

    def test_creates_review_items_for_unresolved_teams(self):
        """An unresolvable team name yields a review item, not a game."""
        raw = RawGameData(
            game_date=datetime(2026, 3, 15),
            home_team_raw="Unknown Team XYZ",
            away_team_raw="LA Galaxy",
            stadium_raw="BMO Stadium",
            status="scheduled",
        )

        games, review_items = MLSScraper(season=2026)._normalize_games([raw])

        # No game is emitted for the unresolved home team ...
        assert len(games) == 0
        # ... but the problem is surfaced for manual review.
        assert len(review_items) >= 1
|
||||
|
||||
|
||||
class TestTeamAndStadiumScraping:
    """Static team and stadium catalogs for MLS."""

    def test_scrapes_all_mls_teams(self):
        """At least 29 MLS teams are returned, unique and fully described."""
        teams = MLSScraper(season=2026).scrape_teams()

        # League size varies with expansion, so a lower bound is used.
        assert len(teams) >= 29
        assert len({t.id for t in teams}) == len(teams)  # IDs must be unique

        for team in teams:
            assert team.id.startswith("team_mls_")
            assert team.sport == "mls"
            # Descriptive fields must all be non-empty.
            assert team.city
            assert team.name
            assert team.full_name
            assert team.abbreviation

    def test_teams_have_conferences(self):
        """Both conferences are populated with at least 14 teams each."""
        teams = MLSScraper(season=2026).scrape_teams()

        eastern = [t for t in teams if t.conference == "Eastern"]
        western = [t for t in teams if t.conference == "Western"]

        assert len(eastern) >= 14
        assert len(western) >= 14

    def test_scrapes_all_mls_stadiums(self):
        """At least 29 stadiums are returned with location and coordinates."""
        stadiums = MLSScraper(season=2026).scrape_stadiums()

        assert len(stadiums) >= 29

        for stadium in stadiums:
            assert stadium.id.startswith("stadium_mls_")
            assert stadium.sport == "mls"
            assert stadium.name
            assert stadium.city
            assert stadium.state
            assert stadium.country in ["USA", "Canada"]
            # Zero coordinates would indicate missing geodata.
            assert stadium.latitude != 0
            assert stadium.longitude != 0
|
||||
|
||||
|
||||
class TestScrapeFallback:
    """Fallback across data sources when earlier ones fail."""

    def test_falls_back_to_next_source_on_failure(self):
        """When ESPN raises, FBref data is used instead."""
        scraper = MLSScraper(season=2026)

        fbref_games = [
            RawGameData(
                game_date=datetime(2026, 3, 15),
                home_team_raw="Los Angeles FC",
                away_team_raw="LA Galaxy",
                stadium_raw="BMO Stadium",
                status="scheduled",
            )
        ]

        with patch.object(scraper, '_scrape_espn') as mock_espn, \
                patch.object(scraper, '_scrape_fbref') as mock_fbref:
            # Primary source blows up; the fallback succeeds.
            mock_espn.side_effect = Exception("Connection failed")
            mock_fbref.return_value = fbref_games

            result = scraper.scrape_games()

        assert result.success
        assert result.source == "fbref"
        # Both sources were attempted, in priority order.
        assert mock_espn.called
        assert mock_fbref.called
|
||||
|
||||
|
||||
class TestSeasonMonths:
    """Month-range computation for the MLS season."""

    def test_gets_correct_season_months(self):
        """MLS runs February through November of the season year."""
        months = MLSScraper(season=2026)._get_season_months()

        # Ten months: Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov.
        assert len(months) == 10
        assert months[0] == (2026, 2)    # opens in February
        assert months[-1] == (2026, 11)  # closes in November
|
||||
428
sportstime_parser/tests/test_scrapers/test_nba.py
Normal file
428
sportstime_parser/tests/test_scrapers/test_nba.py
Normal file
@@ -0,0 +1,428 @@
|
||||
"""Tests for NBA scraper."""
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from sportstime_parser.scrapers.nba import NBAScraper, create_nba_scraper
|
||||
from sportstime_parser.scrapers.base import RawGameData
|
||||
from sportstime_parser.tests.fixtures import (
|
||||
load_fixture,
|
||||
load_json_fixture,
|
||||
NBA_BR_OCTOBER_HTML,
|
||||
NBA_BR_EDGE_CASES_HTML,
|
||||
NBA_ESPN_SCOREBOARD_JSON,
|
||||
)
|
||||
|
||||
|
||||
class TestNBAScraperInit:
    """Initialization behavior of NBAScraper and its factory function."""

    def test_creates_scraper_with_season(self):
        """Scraper records the sport key and the requested season."""
        nba = NBAScraper(season=2025)
        assert (nba.sport, nba.season) == ("nba", 2025)

    def test_factory_function_creates_scraper(self):
        """create_nba_scraper builds an NBAScraper for the given season."""
        nba = create_nba_scraper(season=2025)
        assert isinstance(nba, NBAScraper)
        assert nba.season == 2025

    def test_expected_game_count(self):
        """A full NBA regular season is 1230 games (30 teams * 82 / 2)."""
        assert NBAScraper(season=2025).expected_game_count == 1230

    def test_sources_in_priority_order(self):
        """Basketball-Reference is tried first, then ESPN, then CBS."""
        priority = ["basketball_reference", "espn", "cbs"]
        assert NBAScraper(season=2025)._get_sources() == priority
|
||||
|
||||
|
||||
class TestBasketballReferenceParsing:
    """Parsing of Basketball-Reference schedule HTML."""

    def _parsed_games(self, url="http://example.com"):
        """Parse the October fixture through a fresh scraper."""
        scraper = NBAScraper(season=2025)
        html = load_fixture(NBA_BR_OCTOBER_HTML)
        return scraper._parse_basketball_reference(html, url)

    def test_parses_completed_games(self):
        """Completed games carry scores, venue, and 'final' status."""
        games = self._parsed_games()

        # Fixture holds seven games total, two of them completed.
        assert len(games) == 7
        finals = [g for g in games if g.status == "final"]
        assert len(finals) == 2

        # Boston @ Cleveland
        game = next(g for g in games if g.away_team_raw == "Boston Celtics")
        assert game.home_team_raw == "Cleveland Cavaliers"
        assert (game.away_score, game.home_score) == (112, 108)
        assert game.stadium_raw == "Rocket Mortgage FieldHouse"
        assert game.status == "final"

    def test_parses_scheduled_games(self):
        """Scheduled games have teams and venue but no scores yet."""
        upcoming = [g for g in self._parsed_games() if g.status == "scheduled"]
        assert len(upcoming) == 5

        # Houston @ OKC
        game = next(g for g in upcoming if g.away_team_raw == "Houston Rockets")
        assert game.home_team_raw == "Oklahoma City Thunder"
        assert game.away_score is None
        assert game.home_score is None
        assert game.stadium_raw == "Paycom Center"

    def test_parses_game_dates_correctly(self):
        """The first fixture game falls on October 22, 2025."""
        first = self._parsed_games()[0]
        when = (first.game_date.year, first.game_date.month, first.game_date.day)
        assert when == (2025, 10, 22)

    def test_tracks_source_url(self):
        """Each parsed game records the URL it was scraped from."""
        url = "http://basketball-reference.com/test"
        assert all(g.source_url == url for g in self._parsed_games(url))
|
||||
|
||||
|
||||
class TestBasketballReferenceEdgeCases:
    """Edge-case statuses and venues in Basketball-Reference HTML."""

    def _parsed_games(self):
        """Parse the edge-case fixture through a fresh scraper."""
        scraper = NBAScraper(season=2025)
        html = load_fixture(NBA_BR_EDGE_CASES_HTML)
        return scraper._parse_basketball_reference(html, "http://example.com")

    def test_parses_postponed_games(self):
        """A postponed game keeps its matchup with status 'postponed'."""
        postponed = [g for g in self._parsed_games() if g.status == "postponed"]
        assert len(postponed) == 1
        assert postponed[0].away_team_raw == "Los Angeles Lakers"
        assert postponed[0].home_team_raw == "Phoenix Suns"

    def test_parses_cancelled_games(self):
        """A cancelled game is flagged with status 'cancelled'."""
        cancelled = [g for g in self._parsed_games() if g.status == "cancelled"]
        assert len(cancelled) == 1
        assert cancelled[0].away_team_raw == "Portland Trail Blazers"

    def test_parses_neutral_site_games(self):
        """Neutral-site games (Mexico City) parse with their actual venue."""
        games = self._parsed_games()
        mexico_city = next(g for g in games if g.stadium_raw == "Arena CDMX")
        assert mexico_city.away_team_raw == "Miami Heat"
        assert mexico_city.home_team_raw == "Washington Wizards"
        assert mexico_city.status == "final"

    def test_parses_overtime_games(self):
        """High-scoring overtime finals retain both scores."""
        ot_game = next(g for g in self._parsed_games() if g.away_score == 147)
        assert ot_game.home_score == 150
        assert ot_game.status == "final"
|
||||
|
||||
|
||||
class TestESPNParsing:
    """Parsing of ESPN scoreboard API responses for the NBA."""

    def _parsed_games(self):
        """Load the NBA ESPN fixture and run it through the parser."""
        scraper = NBAScraper(season=2025)
        payload = load_json_fixture(NBA_ESPN_SCOREBOARD_JSON)
        return scraper._parse_espn_response(payload, "http://espn.com/api")

    def test_parses_completed_games(self):
        """Final games carry both scores and the venue."""
        finals = [g for g in self._parsed_games() if g.status == "final"]
        assert len(finals) == 2

        # Boston @ Cleveland
        game = next(g for g in finals if g.away_team_raw == "Boston Celtics")
        assert game.home_team_raw == "Cleveland Cavaliers"
        assert (game.away_score, game.home_score) == (112, 108)
        assert game.stadium_raw == "Rocket Mortgage FieldHouse"

    def test_parses_scheduled_games(self):
        """Scheduled games come through with teams and venue but no result."""
        upcoming = [g for g in self._parsed_games() if g.status == "scheduled"]
        assert len(upcoming) == 1

        game = upcoming[0]
        assert game.away_team_raw == "Houston Rockets"
        assert game.home_team_raw == "Oklahoma City Thunder"
        assert game.stadium_raw == "Paycom Center"

    def test_parses_venue_info(self):
        """Every parsed game includes a raw stadium name."""
        for game in self._parsed_games():
            assert game.stadium_raw is not None
|
||||
|
||||
|
||||
class TestGameNormalization:
    """Normalization of raw NBA games into canonical records."""

    def test_normalizes_games_with_canonical_ids(self):
        """A resolvable matchup yields one game with canonical IDs."""
        raw = RawGameData(
            game_date=datetime(2025, 10, 22),
            home_team_raw="Cleveland Cavaliers",
            away_team_raw="Boston Celtics",
            stadium_raw="Rocket Mortgage FieldHouse",
            home_score=108,
            away_score=112,
            status="final",
            source_url="http://example.com",
        )

        games, _review = NBAScraper(season=2025)._normalize_games([raw])

        assert len(games) == 1
        game = games[0]

        # Canonical ID shape: sport_season_away_home_MMDD
        assert game.id == "nba_2025_bos_cle_1022"
        assert game.sport == "nba"
        assert game.season == 2025

        # Teams resolve to canonical team IDs.
        assert game.home_team_id == "team_nba_cle"
        assert game.away_team_id == "team_nba_bos"

        # Scores pass through unchanged.
        assert (game.home_score, game.away_score) == (108, 112)

    def test_detects_doubleheaders(self):
        """Two same-day matchups get game numbers 1 and 2 in their IDs."""
        def knicks_at_boston(hour, home_pts, away_pts):
            # Same matchup and venue, differing only in tip-off hour/scores.
            return RawGameData(
                game_date=datetime(2025, 4, 1, hour, 0),
                home_team_raw="Boston Celtics",
                away_team_raw="New York Knicks",
                stadium_raw="TD Garden",
                status="final",
                home_score=home_pts,
                away_score=away_pts,
            )

        raw_games = [knicks_at_boston(13, 105, 98), knicks_at_boston(19, 110, 102)]
        games, _ = NBAScraper(season=2025)._normalize_games(raw_games)

        assert len(games) == 2
        assert sorted(g.game_number for g in games) == [1, 2]

        # The game-number suffix disambiguates the canonical IDs.
        assert sorted(g.id for g in games) == [
            "nba_2025_nyk_bos_0401_1",
            "nba_2025_nyk_bos_0401_2",
        ]

    def test_creates_review_items_for_unresolved_teams(self):
        """An unresolvable team name yields a review item, not a game."""
        raw = RawGameData(
            game_date=datetime(2025, 10, 22),
            home_team_raw="Unknown Team XYZ",
            away_team_raw="Boston Celtics",
            stadium_raw="TD Garden",
            status="scheduled",
        )

        games, review_items = NBAScraper(season=2025)._normalize_games([raw])

        # No game is emitted for the unresolved home team ...
        assert len(games) == 0
        # ... but the problem is surfaced for manual review.
        assert len(review_items) >= 1
|
||||
|
||||
|
||||
class TestTeamAndStadiumScraping:
    """Static team and stadium catalogs for the NBA."""

    def test_scrapes_all_nba_teams(self):
        """All 30 NBA teams are returned with unique IDs and full fields."""
        teams = NBAScraper(season=2025).scrape_teams()

        assert len(teams) == 30
        assert len({t.id for t in teams}) == 30  # IDs must be unique

        for team in teams:
            assert team.id.startswith("team_nba_")
            assert team.sport == "nba"
            # Descriptive fields must all be non-empty.
            assert team.city
            assert team.name
            assert team.full_name
            assert team.abbreviation

    def test_teams_have_conferences_and_divisions(self):
        """Teams split 15/15 between the Eastern and Western conferences."""
        teams = NBAScraper(season=2025).scrape_teams()

        eastern = [t for t in teams if t.conference == "Eastern"]
        western = [t for t in teams if t.conference == "Western"]

        assert len(eastern) == 15
        assert len(western) == 15

    def test_scrapes_all_nba_stadiums(self):
        """All 30 stadiums are returned with location and coordinates."""
        stadiums = NBAScraper(season=2025).scrape_stadiums()

        assert len(stadiums) == 30
        assert len({s.id for s in stadiums}) == 30  # IDs must be unique

        for stadium in stadiums:
            assert stadium.id.startswith("stadium_nba_")
            assert stadium.sport == "nba"
            assert stadium.name
            assert stadium.city
            assert stadium.state
            assert stadium.country in ["USA", "Canada"]
            # Zero coordinates would indicate missing geodata.
            assert stadium.latitude != 0
            assert stadium.longitude != 0
|
||||
|
||||
|
||||
class TestScrapeFallback:
    """Fallback across data sources when earlier ones fail."""

    def test_falls_back_to_next_source_on_failure(self):
        """When Basketball-Reference raises, ESPN data is used instead."""
        scraper = NBAScraper(season=2025)

        espn_games = [
            RawGameData(
                game_date=datetime(2025, 10, 22),
                home_team_raw="Cleveland Cavaliers",
                away_team_raw="Boston Celtics",
                stadium_raw="Rocket Mortgage FieldHouse",
                status="scheduled",
            )
        ]

        with patch.object(scraper, '_scrape_basketball_reference') as mock_br, \
                patch.object(scraper, '_scrape_espn') as mock_espn:
            # Primary source blows up; the fallback succeeds.
            mock_br.side_effect = Exception("Connection failed")
            mock_espn.return_value = espn_games

            result = scraper.scrape_games()

        # ESPN supplied the data after BR failed.
        assert result.success
        assert result.source == "espn"
        assert mock_br.called
        assert mock_espn.called

    def test_returns_failure_when_all_sources_fail(self):
        """When every source raises, the result reports all failures."""
        scraper = NBAScraper(season=2025)

        with patch.object(scraper, '_scrape_basketball_reference') as mock_br, \
                patch.object(scraper, '_scrape_espn') as mock_espn, \
                patch.object(scraper, '_scrape_cbs') as mock_cbs:
            mock_br.side_effect = Exception("BR failed")
            mock_espn.side_effect = Exception("ESPN failed")
            mock_cbs.side_effect = Exception("CBS failed")

            result = scraper.scrape_games()

        assert not result.success
        # The error message aggregates every source's failure.
        assert "All sources failed" in result.error_message
        assert "CBS failed" in result.error_message
|
||||
|
||||
|
||||
class TestSeasonMonths:
    """Month-window computation for the NBA calendar."""

    def test_gets_correct_season_months(self):
        """An NBA season spans October through the following June."""
        months = NBAScraper(season=2025)._get_season_months()

        # Nine months: Oct-Dec of the season year, Jan-Jun of the next.
        assert len(months) == 9
        assert months[0] == (2025, 10)
        assert months[-1] == (2026, 6)

        # Year rollover happens between December and January.
        assert months[2] == (2025, 12)
        assert months[3] == (2026, 1)
|
||||
310
sportstime_parser/tests/test_scrapers/test_nfl.py
Normal file
310
sportstime_parser/tests/test_scrapers/test_nfl.py
Normal file
@@ -0,0 +1,310 @@
|
||||
"""Tests for NFL scraper."""
|
||||
|
||||
from datetime import datetime
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from sportstime_parser.scrapers.nfl import NFLScraper, create_nfl_scraper
|
||||
from sportstime_parser.scrapers.base import RawGameData
|
||||
from sportstime_parser.tests.fixtures import (
|
||||
load_json_fixture,
|
||||
NFL_ESPN_SCOREBOARD_JSON,
|
||||
)
|
||||
|
||||
|
||||
class TestNFLScraperInit:
    """Construction and static configuration of NFLScraper."""

    def test_creates_scraper_with_season(self):
        """The constructor records the sport key and requested season."""
        nfl = NFLScraper(season=2025)
        assert nfl.sport == "nfl"
        assert nfl.season == 2025

    def test_factory_function_creates_scraper(self):
        """create_nfl_scraper() returns an NFLScraper for the given season."""
        nfl = create_nfl_scraper(season=2025)
        assert isinstance(nfl, NFLScraper)
        assert nfl.season == 2025

    def test_expected_game_count(self):
        """A full NFL regular season is 272 games."""
        assert NFLScraper(season=2025).expected_game_count == 272

    def test_sources_in_priority_order(self):
        """ESPN is preferred, then Pro Football Reference, then CBS."""
        assert NFLScraper(season=2025)._get_sources() == [
            "espn",
            "pro_football_reference",
            "cbs",
        ]
|
||||
|
||||
|
||||
class TestESPNParsing:
    """Parsing of the NFL ESPN scoreboard fixture into RawGameData."""

    def test_parses_completed_games(self):
        """Final games carry both scores and the venue name."""
        nfl = NFLScraper(season=2025)
        payload = load_json_fixture(NFL_ESPN_SCOREBOARD_JSON)
        parsed = nfl._parse_espn_response(payload, "http://espn.com/api")

        finals = [g for g in parsed if g.status == "final"]
        assert len(finals) == 2

        # Locate the Chiefs @ Ravens result.
        kc_bal = next(g for g in finals if g.away_team_raw == "Kansas City Chiefs")
        assert kc_bal.home_team_raw == "Baltimore Ravens"
        assert kc_bal.away_score == 27
        assert kc_bal.home_score == 20
        assert kc_bal.stadium_raw == "M&T Bank Stadium"

    def test_parses_scheduled_games(self):
        """Future games come through with status 'scheduled' and a venue."""
        nfl = NFLScraper(season=2025)
        payload = load_json_fixture(NFL_ESPN_SCOREBOARD_JSON)
        parsed = nfl._parse_espn_response(payload, "http://espn.com/api")

        upcoming = [g for g in parsed if g.status == "scheduled"]
        assert len(upcoming) == 1

        dal_cle = upcoming[0]
        assert dal_cle.away_team_raw == "Dallas Cowboys"
        assert dal_cle.home_team_raw == "Cleveland Browns"
        assert dal_cle.stadium_raw == "Cleveland Browns Stadium"

    def test_parses_venue_info(self):
        """No parsed game is missing its raw stadium name."""
        nfl = NFLScraper(season=2025)
        payload = load_json_fixture(NFL_ESPN_SCOREBOARD_JSON)
        parsed = nfl._parse_espn_response(payload, "http://espn.com/api")

        assert all(g.stadium_raw is not None for g in parsed)
|
||||
|
||||
|
||||
class TestGameNormalization:
    """Canonical ID generation and review-item creation for NFL games."""

    def test_normalizes_games_with_canonical_ids(self):
        """A resolvable matchup yields one fully populated Game record."""
        nfl = NFLScraper(season=2025)

        raw = RawGameData(
            game_date=datetime(2025, 9, 7),
            home_team_raw="Baltimore Ravens",
            away_team_raw="Kansas City Chiefs",
            stadium_raw="M&T Bank Stadium",
            home_score=20,
            away_score=27,
            status="final",
            source_url="http://example.com",
        )
        games, review_items = nfl._normalize_games([raw])

        assert len(games) == 1
        game = games[0]

        # Canonical ID pattern: sport_season_away_home_MMDD.
        assert game.id == "nfl_2025_kc_bal_0907"
        assert game.sport == "nfl"
        assert game.season == 2025

        # Both sides resolved to canonical team IDs.
        assert game.home_team_id == "team_nfl_bal"
        assert game.away_team_id == "team_nfl_kc"

        # Scores pass through untouched.
        assert game.home_score == 20
        assert game.away_score == 27

    def test_creates_review_items_for_unresolved_teams(self):
        """An unknown team name drops the game but queues a review item."""
        nfl = NFLScraper(season=2025)

        raw = RawGameData(
            game_date=datetime(2025, 9, 7),
            home_team_raw="Unknown Team XYZ",
            away_team_raw="Kansas City Chiefs",
            stadium_raw="Arrowhead Stadium",
            status="scheduled",
        )
        games, review_items = nfl._normalize_games([raw])

        # No game is emitted, but the failure is recorded for manual review.
        assert len(games) == 0
        assert len(review_items) >= 1
|
||||
|
||||
|
||||
class TestTeamAndStadiumScraping:
    """Static NFL team and stadium data."""

    def test_scrapes_all_nfl_teams(self):
        """All 32 NFL franchises come back with unique IDs and full fields."""
        teams = NFLScraper(season=2025).scrape_teams()

        # League size and ID uniqueness.
        assert len(teams) == 32
        assert len({t.id for t in teams}) == 32

        for t in teams:
            assert t.id.startswith("team_nfl_")
            assert t.sport == "nfl"
            assert t.city
            assert t.name
            assert t.full_name
            assert t.abbreviation

    def test_teams_have_conferences_and_divisions(self):
        """The league splits evenly into 16 AFC and 16 NFC teams."""
        teams = NFLScraper(season=2025).scrape_teams()

        by_conf = {}
        for t in teams:
            by_conf.setdefault(t.conference, []).append(t)

        assert len(by_conf.get("AFC", [])) == 16
        assert len(by_conf.get("NFC", [])) == 16

    def test_scrapes_all_nfl_stadiums(self):
        """Stadium records exist for (nearly) every team and are complete."""
        venues = NFLScraper(season=2025).scrape_stadiums()

        # Some franchises share a building, so allow slightly fewer than 32.
        assert len(venues) >= 30

        for v in venues:
            assert v.id.startswith("stadium_nfl_")
            assert v.sport == "nfl"
            assert v.name
            assert v.city
            assert v.state
            assert v.country == "USA"
            # Coordinates must be real values, not a 0/0 placeholder.
            assert v.latitude != 0
            assert v.longitude != 0
|
||||
|
||||
|
||||
class TestScrapeFallback:
    """Behaviour of the NFL source-priority fallback chain."""

    def test_falls_back_to_next_source_on_failure(self):
        """When ESPN raises, Pro Football Reference is tried and wins."""
        nfl = NFLScraper(season=2025)

        with patch.object(nfl, '_scrape_espn') as espn, \
                patch.object(nfl, '_scrape_pro_football_reference') as pfr:
            # Primary source errors out.
            espn.side_effect = Exception("Connection failed")
            # Secondary source answers with one game.
            pfr.return_value = [
                RawGameData(
                    game_date=datetime(2025, 9, 7),
                    home_team_raw="Baltimore Ravens",
                    away_team_raw="Kansas City Chiefs",
                    stadium_raw="M&T Bank Stadium",
                    status="scheduled",
                )
            ]

            outcome = nfl.scrape_games()

            assert outcome.success
            assert outcome.source == "pro_football_reference"
            assert espn.called
            assert pfr.called
|
||||
|
||||
|
||||
class TestSeasonMonths:
    """Month-window computation for the NFL calendar."""

    def test_gets_correct_season_months(self):
        """An NFL season spans September through the following February."""
        months = NFLScraper(season=2025)._get_season_months()

        # Six months: Sep-Dec of the season year, Jan-Feb of the next.
        assert len(months) == 6
        assert months[0] == (2025, 9)
        assert months[-1] == (2026, 2)

        # Year rollover happens between December and January.
        assert months[3] == (2025, 12)
        assert months[4] == (2026, 1)
|
||||
|
||||
|
||||
class TestInternationalFiltering:
    """Filtering of non-US NFL games.

    Note: Filtering happens in _parse_espn_response, not _normalize_games.
    """

    def test_filters_london_games_during_parsing(self):
        """A neutral-site London fixture is dropped by the ESPN parser."""
        nfl = NFLScraper(season=2025)

        # Minimal ESPN-shaped payload containing a single London game.
        london_event = {
            "date": "2025-10-15T09:30:00Z",
            "competitions": [
                {
                    "neutralSite": True,
                    "venue": {
                        "fullName": "London Stadium",
                        "address": {"city": "London", "country": "UK"},
                    },
                    "competitors": [
                        {"homeAway": "home", "team": {"displayName": "Jacksonville Jaguars"}},
                        {"homeAway": "away", "team": {"displayName": "Buffalo Bills"}},
                    ],
                }
            ],
        }

        parsed = nfl._parse_espn_response({"events": [london_event]}, "http://espn.com/api")

        # The international game never makes it out of the parser.
        assert len(parsed) == 0

    def test_keeps_us_games(self):
        """A regular US fixture survives normalization."""
        nfl = NFLScraper(season=2025)

        raw = RawGameData(
            game_date=datetime(2025, 9, 7),
            home_team_raw="Baltimore Ravens",
            away_team_raw="Kansas City Chiefs",
            stadium_raw="M&T Bank Stadium",
            status="scheduled",
        )
        games, _ = nfl._normalize_games([raw])

        assert len(games) == 1
|
||||
317
sportstime_parser/tests/test_scrapers/test_nhl.py
Normal file
317
sportstime_parser/tests/test_scrapers/test_nhl.py
Normal file
@@ -0,0 +1,317 @@
|
||||
"""Tests for NHL scraper."""
|
||||
|
||||
from datetime import datetime
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from sportstime_parser.scrapers.nhl import NHLScraper, create_nhl_scraper
|
||||
from sportstime_parser.scrapers.base import RawGameData
|
||||
from sportstime_parser.tests.fixtures import (
|
||||
load_json_fixture,
|
||||
NHL_ESPN_SCOREBOARD_JSON,
|
||||
)
|
||||
|
||||
|
||||
class TestNHLScraperInit:
    """Construction and static configuration of NHLScraper."""

    def test_creates_scraper_with_season(self):
        """The constructor records the sport key and requested season."""
        nhl = NHLScraper(season=2025)
        assert nhl.sport == "nhl"
        assert nhl.season == 2025

    def test_factory_function_creates_scraper(self):
        """create_nhl_scraper() returns an NHLScraper for the given season."""
        nhl = create_nhl_scraper(season=2025)
        assert isinstance(nhl, NHLScraper)
        assert nhl.season == 2025

    def test_expected_game_count(self):
        """A full NHL regular season is 1312 games."""
        assert NHLScraper(season=2025).expected_game_count == 1312

    def test_sources_in_priority_order(self):
        """Hockey Reference is preferred, then the NHL API, then ESPN."""
        assert NHLScraper(season=2025)._get_sources() == [
            "hockey_reference",
            "nhl_api",
            "espn",
        ]
|
||||
|
||||
|
||||
class TestESPNParsing:
    """Parsing of the NHL ESPN scoreboard fixture into RawGameData."""

    def test_parses_completed_games(self):
        """Final games carry both scores and the venue name."""
        nhl = NHLScraper(season=2025)
        payload = load_json_fixture(NHL_ESPN_SCOREBOARD_JSON)
        parsed = nhl._parse_espn_response(payload, "http://espn.com/api")

        finals = [g for g in parsed if g.status == "final"]
        assert len(finals) == 2

        # Locate the Penguins @ Bruins result.
        pit_bos = next(g for g in finals if g.away_team_raw == "Pittsburgh Penguins")
        assert pit_bos.home_team_raw == "Boston Bruins"
        assert pit_bos.away_score == 2
        assert pit_bos.home_score == 4
        assert pit_bos.stadium_raw == "TD Garden"

    def test_parses_scheduled_games(self):
        """Future games come through with status 'scheduled' and a venue."""
        nhl = NHLScraper(season=2025)
        payload = load_json_fixture(NHL_ESPN_SCOREBOARD_JSON)
        parsed = nhl._parse_espn_response(payload, "http://espn.com/api")

        upcoming = [g for g in parsed if g.status == "scheduled"]
        assert len(upcoming) == 1

        vgk_lak = upcoming[0]
        assert vgk_lak.away_team_raw == "Vegas Golden Knights"
        assert vgk_lak.home_team_raw == "Los Angeles Kings"
        assert vgk_lak.stadium_raw == "Crypto.com Arena"

    def test_parses_venue_info(self):
        """No parsed game is missing its raw stadium name."""
        nhl = NHLScraper(season=2025)
        payload = load_json_fixture(NHL_ESPN_SCOREBOARD_JSON)
        parsed = nhl._parse_espn_response(payload, "http://espn.com/api")

        assert all(g.stadium_raw is not None for g in parsed)
|
||||
|
||||
|
||||
class TestGameNormalization:
    """Canonical ID generation and review-item creation for NHL games."""

    def test_normalizes_games_with_canonical_ids(self):
        """A resolvable matchup yields one fully populated Game record."""
        nhl = NHLScraper(season=2025)

        raw = RawGameData(
            game_date=datetime(2025, 10, 8),
            home_team_raw="Boston Bruins",
            away_team_raw="Pittsburgh Penguins",
            stadium_raw="TD Garden",
            home_score=4,
            away_score=2,
            status="final",
            source_url="http://example.com",
        )
        games, review_items = nhl._normalize_games([raw])

        assert len(games) == 1
        game = games[0]

        # Canonical ID pattern: sport_season_away_home_MMDD.
        assert game.id == "nhl_2025_pit_bos_1008"
        assert game.sport == "nhl"
        assert game.season == 2025

        # Both sides resolved to canonical team IDs.
        assert game.home_team_id == "team_nhl_bos"
        assert game.away_team_id == "team_nhl_pit"

        # Scores pass through untouched.
        assert game.home_score == 4
        assert game.away_score == 2

    def test_creates_review_items_for_unresolved_teams(self):
        """An unknown team name drops the game but queues a review item."""
        nhl = NHLScraper(season=2025)

        raw = RawGameData(
            game_date=datetime(2025, 10, 8),
            home_team_raw="Unknown Team XYZ",
            away_team_raw="Boston Bruins",
            stadium_raw="TD Garden",
            status="scheduled",
        )
        games, review_items = nhl._normalize_games([raw])

        # No game is emitted, but the failure is recorded for manual review.
        assert len(games) == 0
        assert len(review_items) >= 1
|
||||
|
||||
|
||||
class TestTeamAndStadiumScraping:
    """Static NHL team and stadium data."""

    def test_scrapes_all_nhl_teams(self):
        """All 32 NHL franchises come back with unique IDs and full fields."""
        teams = NHLScraper(season=2025).scrape_teams()

        # League size and ID uniqueness.
        assert len(teams) == 32
        assert len({t.id for t in teams}) == 32

        for t in teams:
            assert t.id.startswith("team_nhl_")
            assert t.sport == "nhl"
            assert t.city
            assert t.name
            assert t.full_name
            assert t.abbreviation

    def test_teams_have_conferences_and_divisions(self):
        """The league splits evenly into 16 Eastern and 16 Western teams."""
        teams = NHLScraper(season=2025).scrape_teams()

        by_conf = {}
        for t in teams:
            by_conf.setdefault(t.conference, []).append(t)

        assert len(by_conf.get("Eastern", [])) == 16
        assert len(by_conf.get("Western", [])) == 16

    def test_scrapes_all_nhl_stadiums(self):
        """Every franchise has a distinct, fully populated home arena."""
        arenas = NHLScraper(season=2025).scrape_stadiums()

        # One arena per team, each with a distinct canonical ID.
        assert len(arenas) == 32
        assert len({a.id for a in arenas}) == 32

        for a in arenas:
            assert a.id.startswith("stadium_nhl_")
            assert a.sport == "nhl"
            assert a.name
            assert a.city
            assert a.state
            assert a.country in ["USA", "Canada"]
            # Coordinates must be real values, not a 0/0 placeholder.
            assert a.latitude != 0
            assert a.longitude != 0
|
||||
|
||||
|
||||
class TestScrapeFallback:
    """Behaviour of the NHL source-priority fallback chain."""

    def test_falls_back_to_next_source_on_failure(self):
        """When Hockey Reference and the NHL API both raise, ESPN wins."""
        nhl = NHLScraper(season=2025)

        with patch.object(nhl, '_scrape_hockey_reference') as hr, \
                patch.object(nhl, '_scrape_nhl_api') as api, \
                patch.object(nhl, '_scrape_espn') as espn:
            # First two sources error out.
            hr.side_effect = Exception("Connection failed")
            api.side_effect = Exception("API error")
            # Third source answers with one game.
            espn.return_value = [
                RawGameData(
                    game_date=datetime(2025, 10, 8),
                    home_team_raw="Boston Bruins",
                    away_team_raw="Pittsburgh Penguins",
                    stadium_raw="TD Garden",
                    status="scheduled",
                )
            ]

            outcome = nhl.scrape_games()

            # All three sources were attempted and ESPN supplied the data.
            assert outcome.success
            assert outcome.source == "espn"
            assert hr.called
            assert api.called
            assert espn.called
|
||||
|
||||
|
||||
class TestSeasonMonths:
    """Month-window computation for the NHL calendar."""

    def test_gets_correct_season_months(self):
        """An NHL season spans October through the following June."""
        months = NHLScraper(season=2025)._get_season_months()

        # Nine months: Oct-Dec of the season year, Jan-Jun of the next.
        assert len(months) == 9
        assert months[0] == (2025, 10)
        assert months[-1] == (2026, 6)

        # Year rollover happens between December and January.
        assert months[2] == (2025, 12)
        assert months[3] == (2026, 1)
|
||||
|
||||
|
||||
class TestInternationalFiltering:
    """Filtering of non-North-American NHL games.

    Note: Filtering happens in _parse_espn_response, not _normalize_games.
    """

    def test_filters_european_games_during_parsing(self):
        """A neutral-site Prague (Global Series) game is dropped by the parser."""
        nhl = NHLScraper(season=2025)

        # Minimal ESPN-shaped payload containing a single Prague game.
        prague_event = {
            "date": "2025-10-10T18:00:00Z",
            "competitions": [
                {
                    "neutralSite": True,
                    "venue": {
                        "fullName": "O2 Arena, Prague",
                        "address": {"city": "Prague", "country": "Czech Republic"},
                    },
                    "competitors": [
                        {"homeAway": "home", "team": {"displayName": "Florida Panthers"}},
                        {"homeAway": "away", "team": {"displayName": "Dallas Stars"}},
                    ],
                }
            ],
        }

        parsed = nhl._parse_espn_response({"events": [prague_event]}, "http://espn.com/api")

        # The European game never makes it out of the parser.
        assert len(parsed) == 0

    def test_keeps_north_american_games(self):
        """A regular North American fixture survives normalization."""
        nhl = NHLScraper(season=2025)

        raw = RawGameData(
            game_date=datetime(2025, 10, 8),
            home_team_raw="Boston Bruins",
            away_team_raw="Pittsburgh Penguins",
            stadium_raw="TD Garden",
            status="scheduled",
        )
        games, _ = nhl._normalize_games([raw])

        assert len(games) == 1
|
||||
226
sportstime_parser/tests/test_scrapers/test_nwsl.py
Normal file
226
sportstime_parser/tests/test_scrapers/test_nwsl.py
Normal file
@@ -0,0 +1,226 @@
|
||||
"""Tests for NWSL scraper."""
|
||||
|
||||
from datetime import datetime
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from sportstime_parser.scrapers.nwsl import NWSLScraper, create_nwsl_scraper
|
||||
from sportstime_parser.scrapers.base import RawGameData
|
||||
from sportstime_parser.tests.fixtures import (
|
||||
load_json_fixture,
|
||||
NWSL_ESPN_SCOREBOARD_JSON,
|
||||
)
|
||||
|
||||
|
||||
class TestNWSLScraperInit:
    """Construction and static configuration of NWSLScraper."""

    def test_creates_scraper_with_season(self):
        """The constructor records the sport key and requested season."""
        nwsl = NWSLScraper(season=2026)
        assert nwsl.sport == "nwsl"
        assert nwsl.season == 2026

    def test_factory_function_creates_scraper(self):
        """create_nwsl_scraper() returns an NWSLScraper for the given season."""
        nwsl = create_nwsl_scraper(season=2026)
        assert isinstance(nwsl, NWSLScraper)
        assert nwsl.season == 2026

    def test_expected_game_count(self):
        """A full NWSL regular season is 182 games."""
        assert NWSLScraper(season=2026).expected_game_count == 182

    def test_sources_in_priority_order(self):
        """ESPN is the only configured NWSL source."""
        assert NWSLScraper(season=2026)._get_sources() == ["espn"]
|
||||
|
||||
|
||||
class TestESPNParsing:
    """Parsing of the NWSL ESPN scoreboard fixture into RawGameData."""

    def test_parses_completed_games(self):
        """Final games carry both scores and the venue name."""
        nwsl = NWSLScraper(season=2026)
        payload = load_json_fixture(NWSL_ESPN_SCOREBOARD_JSON)
        parsed = nwsl._parse_espn_response(payload, "http://espn.com/api")

        finals = [g for g in parsed if g.status == "final"]
        assert len(finals) == 2

        # Locate the Angel City @ Thorns result.
        la_por = next(g for g in finals if g.away_team_raw == "Angel City FC")
        assert la_por.home_team_raw == "Portland Thorns FC"
        assert la_por.away_score == 1
        assert la_por.home_score == 2
        assert la_por.stadium_raw == "Providence Park"

    def test_parses_scheduled_games(self):
        """Future games come through with status 'scheduled' and a venue."""
        nwsl = NWSLScraper(season=2026)
        payload = load_json_fixture(NWSL_ESPN_SCOREBOARD_JSON)
        parsed = nwsl._parse_espn_response(payload, "http://espn.com/api")

        upcoming = [g for g in parsed if g.status == "scheduled"]
        assert len(upcoming) == 1

        sd_bay = upcoming[0]
        assert sd_bay.away_team_raw == "San Diego Wave FC"
        assert sd_bay.home_team_raw == "Bay FC"
        assert sd_bay.stadium_raw == "PayPal Park"

    def test_parses_venue_info(self):
        """No parsed game is missing its raw stadium name."""
        nwsl = NWSLScraper(season=2026)
        payload = load_json_fixture(NWSL_ESPN_SCOREBOARD_JSON)
        parsed = nwsl._parse_espn_response(payload, "http://espn.com/api")

        assert all(g.stadium_raw is not None for g in parsed)
|
||||
|
||||
|
||||
class TestGameNormalization:
    """Canonical ID generation and review-item creation for NWSL games."""

    def test_normalizes_games_with_canonical_ids(self):
        """A resolvable matchup yields one fully populated Game record."""
        nwsl = NWSLScraper(season=2026)

        raw = RawGameData(
            game_date=datetime(2026, 4, 10),
            home_team_raw="Portland Thorns FC",
            away_team_raw="Angel City FC",
            stadium_raw="Providence Park",
            home_score=2,
            away_score=1,
            status="final",
            source_url="http://example.com",
        )
        games, review_items = nwsl._normalize_games([raw])

        assert len(games) == 1
        game = games[0]

        # Canonical ID pattern: sport_season_away_home_MMDD.
        assert game.id == "nwsl_2026_anf_por_0410"
        assert game.sport == "nwsl"
        assert game.season == 2026

        # Both sides resolved to canonical team IDs.
        assert game.home_team_id == "team_nwsl_por"
        assert game.away_team_id == "team_nwsl_anf"

        # Scores pass through untouched.
        assert game.home_score == 2
        assert game.away_score == 1

    def test_creates_review_items_for_unresolved_teams(self):
        """An unknown team name drops the game but queues a review item."""
        nwsl = NWSLScraper(season=2026)

        raw = RawGameData(
            game_date=datetime(2026, 4, 10),
            home_team_raw="Unknown Team XYZ",
            away_team_raw="Portland Thorns FC",
            stadium_raw="Providence Park",
            status="scheduled",
        )
        games, review_items = nwsl._normalize_games([raw])

        # No game is emitted, but the failure is recorded for manual review.
        assert len(games) == 0
        assert len(review_items) >= 1
|
||||
|
||||
|
||||
class TestTeamAndStadiumScraping:
    """Static NWSL team and stadium data."""

    def test_scrapes_all_nwsl_teams(self):
        """All 14 NWSL clubs come back with unique IDs and full fields."""
        teams = NWSLScraper(season=2026).scrape_teams()

        # League size and ID uniqueness.
        assert len(teams) == 14
        assert len({t.id for t in teams}) == 14

        for t in teams:
            assert t.id.startswith("team_nwsl_")
            assert t.sport == "nwsl"
            assert t.city
            assert t.name
            assert t.full_name
            assert t.abbreviation

    def test_scrapes_all_nwsl_stadiums(self):
        """Every club has a distinct, fully populated home ground."""
        grounds = NWSLScraper(season=2026).scrape_stadiums()

        # One venue per club, each with a distinct canonical ID.
        assert len(grounds) == 14
        assert len({s.id for s in grounds}) == 14

        for s in grounds:
            assert s.id.startswith("stadium_nwsl_")
            assert s.sport == "nwsl"
            assert s.name
            assert s.city
            assert s.state
            assert s.country == "USA"
            # Coordinates must be real values, not a 0/0 placeholder.
            assert s.latitude != 0
            assert s.longitude != 0
|
||||
|
||||
|
||||
class TestScrapeFallback:
    """Fallback behaviour for NWSL, which has a single source (ESPN)."""

    def test_returns_failure_when_espn_fails(self):
        """With ESPN down there is nothing to fall back to, so scraping fails."""
        nwsl = NWSLScraper(season=2026)

        with patch.object(nwsl, '_scrape_espn') as espn:
            espn.side_effect = Exception("ESPN failed")

            outcome = nwsl.scrape_games()

            assert not outcome.success
            assert "All sources failed" in outcome.error_message
|
||||
|
||||
|
||||
class TestSeasonMonths:
    """Month-window computation for the NWSL calendar."""

    def test_gets_correct_season_months(self):
        """An NWSL season spans March through November of the same year."""
        months = NWSLScraper(season=2026)._get_season_months()

        # Nine months, all within the season year: Mar..Nov.
        assert len(months) == 9
        assert months[0] == (2026, 3)
        assert months[-1] == (2026, 11)
|
||||
226
sportstime_parser/tests/test_scrapers/test_wnba.py
Normal file
226
sportstime_parser/tests/test_scrapers/test_wnba.py
Normal file
@@ -0,0 +1,226 @@
|
||||
"""Tests for WNBA scraper."""
|
||||
|
||||
from datetime import datetime
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from sportstime_parser.scrapers.wnba import WNBAScraper, create_wnba_scraper
|
||||
from sportstime_parser.scrapers.base import RawGameData
|
||||
from sportstime_parser.tests.fixtures import (
|
||||
load_json_fixture,
|
||||
WNBA_ESPN_SCOREBOARD_JSON,
|
||||
)
|
||||
|
||||
|
||||
class TestWNBAScraperInit:
    """Construction and static configuration of WNBAScraper."""

    def test_creates_scraper_with_season(self):
        """The constructor records the sport key and requested season."""
        wnba = WNBAScraper(season=2026)
        assert wnba.sport == "wnba"
        assert wnba.season == 2026

    def test_factory_function_creates_scraper(self):
        """create_wnba_scraper() returns a WNBAScraper for the given season."""
        wnba = create_wnba_scraper(season=2026)
        assert isinstance(wnba, WNBAScraper)
        assert wnba.season == 2026

    def test_expected_game_count(self):
        """A full WNBA regular season is 220 games."""
        assert WNBAScraper(season=2026).expected_game_count == 220

    def test_sources_in_priority_order(self):
        """ESPN is the only configured WNBA source."""
        assert WNBAScraper(season=2026)._get_sources() == ["espn"]
|
||||
|
||||
|
||||
class TestESPNParsing:
    """Parsing of the canned ESPN scoreboard API response."""

    def _parsed_games(self):
        # Shared helper: run the fixture payload through the ESPN parser.
        scraper = WNBAScraper(season=2026)
        payload = load_json_fixture(WNBA_ESPN_SCOREBOARD_JSON)
        return scraper._parse_espn_response(payload, "http://espn.com/api")

    def test_parses_completed_games(self):
        """Completed games come back with teams, venue, and final scores."""
        finals = [g for g in self._parsed_games() if g.status == "final"]
        assert len(finals) == 2

        # Aces @ Liberty
        game = next(g for g in finals if g.away_team_raw == "Las Vegas Aces")
        assert game.home_team_raw == "New York Liberty"
        assert game.away_score == 88
        assert game.home_score == 92
        assert game.stadium_raw == "Barclays Center"

    def test_parses_scheduled_games(self):
        """Scheduled (future) games are parsed without scores."""
        upcoming = [g for g in self._parsed_games() if g.status == "scheduled"]
        assert len(upcoming) == 1

        (game,) = upcoming
        assert game.away_team_raw == "Phoenix Mercury"
        assert game.home_team_raw == "Seattle Storm"
        assert game.stadium_raw == "Climate Pledge Arena"

    def test_parses_venue_info(self):
        """Every parsed game carries a raw stadium string."""
        assert all(g.stadium_raw is not None for g in self._parsed_games())
||||
class TestGameNormalization:
    """Normalization of raw games into canonical-ID game records."""

    def test_normalizes_games_with_canonical_ids(self):
        """A resolvable raw game becomes one normalized game with canonical IDs."""
        scraper = WNBAScraper(season=2026)

        raw = RawGameData(
            game_date=datetime(2026, 5, 20),
            home_team_raw="New York Liberty",
            away_team_raw="Las Vegas Aces",
            stadium_raw="Barclays Center",
            home_score=92,
            away_score=88,
            status="final",
            source_url="http://example.com",
        )
        games, review_items = scraper._normalize_games([raw])

        assert len(games) == 1
        game = games[0]

        # Canonical ID encodes sport, season, away/home abbreviations, MMDD.
        assert game.id == "wnba_2026_lv_ny_0520"
        assert game.sport == "wnba"
        assert game.season == 2026

        # Team references resolve to canonical team identifiers.
        assert game.home_team_id == "team_wnba_ny"
        assert game.away_team_id == "team_wnba_lv"

        # Scores pass through normalization untouched.
        assert (game.home_score, game.away_score) == (92, 88)

    def test_creates_review_items_for_unresolved_teams(self):
        """An unresolvable team drops the game but records a review item."""
        scraper = WNBAScraper(season=2026)

        unresolvable = RawGameData(
            game_date=datetime(2026, 5, 20),
            home_team_raw="Unknown Team XYZ",
            away_team_raw="Las Vegas Aces",
            stadium_raw="Barclays Center",
            status="scheduled",
        )
        games, review_items = scraper._normalize_games([unresolvable])

        # The game itself is suppressed because the home team is unknown...
        assert not games
        # ...while at least one review item flags it for manual follow-up.
        assert len(review_items) >= 1
||||
class TestTeamAndStadiumScraping:
    """Static team and stadium data exposed by the WNBA scraper."""

    def test_scrapes_all_wnba_teams(self):
        """All 13 WNBA teams are returned with unique IDs and full fields."""
        teams = WNBAScraper(season=2026).scrape_teams()

        # 13 franchises (including Golden State Valkyries), all distinct.
        assert len(teams) == 13
        assert len({team.id for team in teams}) == 13

        for team in teams:
            assert team.id.startswith("team_wnba_")
            assert team.sport == "wnba"
            # Every descriptive field must be populated (non-empty).
            assert team.city
            assert team.name
            assert team.full_name
            assert team.abbreviation

    def test_scrapes_all_wnba_stadiums(self):
        """One home venue per team, each with real location data."""
        stadiums = WNBAScraper(season=2026).scrape_stadiums()

        # One stadium per franchise, all IDs distinct.
        assert len(stadiums) == 13
        assert len({stadium.id for stadium in stadiums}) == 13

        for stadium in stadiums:
            assert stadium.id.startswith("stadium_wnba_")
            assert stadium.sport == "wnba"
            assert stadium.name
            assert stadium.city
            assert stadium.state
            assert stadium.country == "USA"
            # Zero coordinates would indicate missing geodata.
            assert stadium.latitude != 0
            assert stadium.longitude != 0
||||
class TestScrapeFallback:
    """Fallback behavior: WNBA has a single source (ESPN), so its failure is fatal."""

    def test_returns_failure_when_espn_fails(self):
        """A failing ESPN source must yield an unsuccessful scrape result."""
        scraper = WNBAScraper(season=2026)

        # Make the only source raise so no fallback can rescue the scrape.
        with patch.object(
            scraper, '_scrape_espn', side_effect=Exception("ESPN failed")
        ):
            result = scraper.scrape_games()

        assert not result.success
        assert "All sources failed" in result.error_message
||||
class TestSeasonMonths:
    """Season month calculation for the WNBA calendar."""

    def test_gets_correct_season_months(self):
        """The month list must cover the WNBA season, May through October."""
        months = WNBAScraper(season=2026)._get_season_months()

        # Six months total: May, Jun, Jul, Aug, Sep, Oct.
        assert len(months) == 6
        # Opens in May of the season year...
        assert months[0] == (2026, 5)
        # ...and closes out in October.
        assert months[-1] == (2026, 10)
Reference in New Issue
Block a user