Files
SportstimeAPI/sportstime_parser/tests/test_scrapers/test_nfl.py
Trey t 52d445bca4 feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:56:25 -06:00

311 lines
10 KiB
Python

"""Tests for NFL scraper."""
from datetime import datetime
from unittest.mock import patch
import pytest
from sportstime_parser.scrapers.nfl import NFLScraper, create_nfl_scraper
from sportstime_parser.scrapers.base import RawGameData
from sportstime_parser.tests.fixtures import (
load_json_fixture,
NFL_ESPN_SCOREBOARD_JSON,
)
class TestNFLScraperInit:
    """Checks on how an ``NFLScraper`` is constructed and configured."""

    def test_creates_scraper_with_season(self):
        """Direct construction records the sport key and the season."""
        nfl = NFLScraper(season=2025)

        assert nfl.sport == "nfl"
        assert nfl.season == 2025

    def test_factory_function_creates_scraper(self):
        """``create_nfl_scraper`` hands back an ``NFLScraper`` for the season."""
        built = create_nfl_scraper(season=2025)

        assert isinstance(built, NFLScraper)
        assert built.season == 2025

    def test_expected_game_count(self):
        """The scraper advertises 272 expected games."""
        assert NFLScraper(season=2025).expected_game_count == 272

    def test_sources_in_priority_order(self):
        """``_get_sources`` lists ESPN first, then PFR, then CBS."""
        expected_order = ["espn", "pro_football_reference", "cbs"]

        assert NFLScraper(season=2025)._get_sources() == expected_order
class TestESPNParsing:
    """Test ESPN API response parsing.

    Every test parses the shared ``NFL_ESPN_SCOREBOARD_JSON`` fixture with a
    2025-season scraper; the assertions below expect that fixture to yield
    two games with status "final" and one with status "scheduled".
    """

    @staticmethod
    def _parse_fixture_games():
        """Parse the ESPN scoreboard fixture and return the raw games."""
        scraper = NFLScraper(season=2025)
        data = load_json_fixture(NFL_ESPN_SCOREBOARD_JSON)
        return scraper._parse_espn_response(data, "http://espn.com/api")

    def test_parses_completed_games(self):
        """Test parsing completed games from ESPN."""
        games = self._parse_fixture_games()
        completed = [g for g in games if g.status == "final"]
        assert len(completed) == 2

        # Chiefs @ Ravens. Use a default so a missing game surfaces as a
        # readable assertion failure instead of a bare StopIteration.
        kc_bal = next(
            (g for g in completed if g.away_team_raw == "Kansas City Chiefs"),
            None,
        )
        assert kc_bal is not None, "Chiefs @ Ravens game missing from parsed results"
        assert kc_bal.home_team_raw == "Baltimore Ravens"
        assert kc_bal.away_score == 27
        assert kc_bal.home_score == 20
        assert kc_bal.stadium_raw == "M&T Bank Stadium"

    def test_parses_scheduled_games(self):
        """Test parsing scheduled games from ESPN."""
        games = self._parse_fixture_games()
        scheduled = [g for g in games if g.status == "scheduled"]
        assert len(scheduled) == 1

        dal_cle = scheduled[0]
        assert dal_cle.away_team_raw == "Dallas Cowboys"
        assert dal_cle.home_team_raw == "Cleveland Browns"
        assert dal_cle.stadium_raw == "Cleveland Browns Stadium"

    def test_parses_venue_info(self):
        """Test venue information is extracted for every parsed game."""
        games = self._parse_fixture_games()

        # Guard against a vacuous pass: without this, an empty result would
        # skip the loop entirely and the test would succeed.
        assert games, "fixture produced no games to check"
        for game in games:
            assert game.stadium_raw is not None
class TestGameNormalization:
    """Checks on turning raw scraped games into normalized game records."""

    def test_normalizes_games_with_canonical_ids(self):
        """A resolvable matchup yields one game with canonical identifiers."""
        chiefs_at_ravens = RawGameData(
            game_date=datetime(2025, 9, 7),
            home_team_raw="Baltimore Ravens",
            away_team_raw="Kansas City Chiefs",
            stadium_raw="M&T Bank Stadium",
            home_score=20,
            away_score=27,
            status="final",
            source_url="http://example.com",
        )

        normalized, _ = NFLScraper(season=2025)._normalize_games([chiefs_at_ravens])

        assert len(normalized) == 1
        result = normalized[0]

        # Game-level identity: canonical id, sport key and season.
        assert result.id == "nfl_2025_kc_bal_0907"
        assert result.sport == "nfl"
        assert result.season == 2025

        # Both sides resolve to canonical team ids.
        assert result.home_team_id == "team_nfl_bal"
        assert result.away_team_id == "team_nfl_kc"

        # Scores are carried through unchanged.
        assert result.home_score == 20
        assert result.away_score == 27

    def test_creates_review_items_for_unresolved_teams(self):
        """An unrecognized home team produces a review item and no game."""
        unknown_home = RawGameData(
            game_date=datetime(2025, 9, 7),
            home_team_raw="Unknown Team XYZ",
            away_team_raw="Kansas City Chiefs",
            stadium_raw="Arrowhead Stadium",
            status="scheduled",
        )

        normalized, flagged = NFLScraper(season=2025)._normalize_games([unknown_home])

        # The game is dropped because one team could not be resolved...
        assert len(normalized) == 0
        # ...and at least one review item is recorded for it.
        assert len(flagged) >= 1
class TestTeamAndStadiumScraping:
    """Checks on the team and stadium reference data the scraper returns."""

    def test_scrapes_all_nfl_teams(self):
        """``scrape_teams`` returns 32 distinct, fully populated teams."""
        roster = NFLScraper(season=2025).scrape_teams()

        # 32 NFL teams, with no id used twice.
        assert len(roster) == 32
        distinct_ids = {entry.id for entry in roster}
        assert len(distinct_ids) == 32

        # Each team carries the required fields.
        for entry in roster:
            assert entry.id.startswith("team_nfl_")
            assert entry.sport == "nfl"
            assert entry.city
            assert entry.name
            assert entry.full_name
            assert entry.abbreviation

    def test_teams_have_conferences_and_divisions(self):
        """Teams split evenly between the AFC and NFC conferences.

        Only the conference field is checked here.
        """
        roster = NFLScraper(season=2025).scrape_teams()

        afc_count = sum(1 for entry in roster if entry.conference == "AFC")
        nfc_count = sum(1 for entry in roster if entry.conference == "NFC")

        assert afc_count == 16
        assert nfc_count == 16

    def test_scrapes_all_nfl_stadiums(self):
        """``scrape_stadiums`` returns at least 30 fully populated stadiums."""
        venues = NFLScraper(season=2025).scrape_stadiums()

        # Fewer stadiums than teams is allowed because some teams share one.
        assert len(venues) >= 30

        # Each stadium carries the required fields and non-zero coordinates.
        for venue in venues:
            assert venue.id.startswith("stadium_nfl_")
            assert venue.sport == "nfl"
            assert venue.name
            assert venue.city
            assert venue.state
            assert venue.country == "USA"
            assert venue.latitude != 0
            assert venue.longitude != 0
class TestScrapeFallback:
    """Checks on falling through to the next source when one fails."""

    def test_falls_back_to_next_source_on_failure(self):
        """When ESPN raises, the result comes from Pro Football Reference."""
        nfl = NFLScraper(season=2025)
        pfr_payload = [
            RawGameData(
                game_date=datetime(2025, 9, 7),
                home_team_raw="Baltimore Ravens",
                away_team_raw="Kansas City Chiefs",
                stadium_raw="M&T Bank Stadium",
                status="scheduled",
            )
        ]

        with patch.object(nfl, "_scrape_espn") as espn_stub:
            with patch.object(nfl, "_scrape_pro_football_reference") as pfr_stub:
                # First source fails, second source supplies one game.
                espn_stub.side_effect = Exception("Connection failed")
                pfr_stub.return_value = pfr_payload

                outcome = nfl.scrape_games()

                assert outcome.success
                assert outcome.source == "pro_football_reference"
                # Both sources must have been attempted.
                assert espn_stub.called
                assert pfr_stub.called
class TestSeasonMonths:
    """Checks on the (year, month) pairs the scraper reports for a season."""

    def test_gets_correct_season_months(self):
        """The 2025 season spans September 2025 through February 2026."""
        season_months = NFLScraper(season=2025)._get_season_months()

        # Sep, Oct, Nov, Dec, Jan, Feb
        assert len(season_months) == 6

        # Starts in September of the season year, ends the following February.
        opening, closing = season_months[0], season_months[-1]
        assert opening == (2025, 9)
        assert closing == (2026, 2)

        # The year rolls over between the fourth and fifth entries.
        december, january = season_months[3], season_months[4]
        assert december == (2025, 12)
        assert january == (2026, 1)
class TestInternationalFiltering:
    """Checks on dropping games played outside the US.

    Note: Filtering happens in _parse_espn_response, not _normalize_games.
    """

    def test_filters_london_games_during_parsing(self):
        """An ESPN event hosted at London Stadium yields no parsed games."""
        # NOTE(review): this hand-built event carries only the fields shown
        # below; confirm the parser drops it because of its location rather
        # than because of a field it does not include.
        london_venue = {
            "fullName": "London Stadium",
            "address": {"city": "London", "country": "UK"},
        }
        matchup = [
            {"homeAway": "home", "team": {"displayName": "Jacksonville Jaguars"}},
            {"homeAway": "away", "team": {"displayName": "Buffalo Bills"}},
        ]
        london_event = {
            "date": "2025-10-15T09:30:00Z",
            "competitions": [
                {
                    "neutralSite": True,
                    "venue": london_venue,
                    "competitors": matchup,
                }
            ],
        }
        payload = {"events": [london_event]}

        parsed = NFLScraper(season=2025)._parse_espn_response(
            payload, "http://espn.com/api"
        )

        # The London game must not survive parsing.
        assert len(parsed) == 0

    def test_keeps_us_games(self):
        """A domestic matchup comes through normalization as one game."""
        # NOTE(review): this goes through _normalize_games, not the ESPN
        # parser where the class docstring says filtering happens.
        domestic = RawGameData(
            game_date=datetime(2025, 9, 7),
            home_team_raw="Baltimore Ravens",
            away_team_raw="Kansas City Chiefs",
            stadium_raw="M&T Bank Stadium",
            status="scheduled",
        )

        kept, _ = NFLScraper(season=2025)._normalize_games([domestic])

        assert len(kept) == 1