Files
Sportstime/Scripts/sportstime_parser/scrapers/nhl.py
Trey t 4d097883a6 fix(data): add timezone handling for Sports-Reference scrapers and new stadiums
- Add ET timezone (America/New_York) to all Sports-Reference scrapers:
  - NBA: Basketball-Reference times parsed as ET
  - NFL: Pro-Football-Reference times parsed as ET
  - NHL: Hockey-Reference times parsed as ET
  - MLB: Baseball-Reference times parsed as ET
- Document source timezones in scraper docstrings
- Add 11 new stadiums to STADIUM_MAPPINGS:
  - NFL: 5 international venues (Corinthians Arena, Croke Park,
    Olympic Stadium Berlin, Santiago Bernabéu, Tom Benson Hall of Fame)
  - MLS: 4 alternate venues (Miami Freedom Park, Citi Field,
    LA Memorial Coliseum, M&T Bank Stadium)
  - NWSL: 2 alternate venues (Northwestern Medicine Field, ONE Spokane)
- Add 15 stadium aliases for MLS/NWSL team-based lookups
- Fix CanonicalSyncService to sync timezone identifier to SwiftData
- Update debug logging to use stadium timezone for display

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-21 16:04:45 -06:00

684 lines
22 KiB
Python

"""NHL scraper implementation with multi-source fallback."""
from datetime import date, datetime, timedelta
from typing import Optional
from zoneinfo import ZoneInfo

from bs4 import BeautifulSoup

from .base import BaseScraper, RawGameData, ScrapeResult
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..normalizers.canonical_id import generate_game_id
from ..normalizers.team_resolver import (
    TeamResolver,
    TEAM_MAPPINGS,
    get_team_resolver,
)
from ..normalizers.stadium_resolver import (
    StadiumResolver,
    STADIUM_MAPPINGS,
    get_stadium_resolver,
)
from ..utils.logging import get_logger, log_game, log_warning
# Cities hosting overseas games; games played in any of these are dropped
# from the scraped schedule.
INTERNATIONAL_LOCATIONS = {
    "Prague",
    "Stockholm",
    "Helsinki",
    "Tampere",
    "Gothenburg",
}

# Month names, in season order (October through June), as used in
# Hockey-Reference month URLs.
HR_MONTHS = [
    "october",
    "november",
    "december",
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
]
class NHLScraper(BaseScraper):
    """NHL schedule scraper with multi-source fallback.

    Sources (in priority order):
        1. Hockey-Reference - Most reliable for NHL
        2. NHL API - Official NHL data
        3. ESPN API - Backup option

    Source Timezones:
        - hockey_reference: Eastern Time (ET) - times displayed as "7:00p"
        - nhl_api: UTC - ISO 8601 format with "Z" suffix (startTimeUTC field)
        - espn: UTC - ISO 8601 format with "Z" suffix
    """

    # IANA zone used to interpret Hockey-Reference dates and clock times.
    _HR_TIMEZONE_NAME = "America/New_York"

    def __init__(self, season: int, **kwargs):
        """Initialize NHL scraper.

        Args:
            season: Season start year (e.g., 2025 for 2025-26)
            **kwargs: Forwarded unchanged to ``BaseScraper.__init__``.
        """
        super().__init__("nhl", season, **kwargs)
        self._team_resolver = get_team_resolver("nhl")
        self._stadium_resolver = get_stadium_resolver("nhl")

    # ------------------------------------------------------------------
    # Source selection
    # ------------------------------------------------------------------

    def _get_sources(self) -> list[str]:
        """Return source list in priority order."""
        return ["hockey_reference", "nhl_api", "espn"]

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build the URL for a source.

        Args:
            source: One of ``"hockey_reference"``, ``"nhl_api"`` or ``"espn"``.
            **kwargs: Source-specific values:
                ``year`` (hockey_reference; defaults to ``season + 1``),
                ``start_date`` (nhl_api; ``YYYY-MM-DD`` string),
                ``date`` (espn; ``YYYYMMDD`` string).
                Other keys (e.g. ``month``, ``end_date``) are accepted for
                compatibility but do not affect the URL.

        Returns:
            The fully formatted URL.

        Raises:
            ValueError: If ``source`` is not a known source name.
        """
        if source == "hockey_reference":
            year = kwargs.get("year", self.season + 1)
            return f"https://www.hockey-reference.com/leagues/NHL_{year}_games.html"
        if source == "nhl_api":
            start_date = kwargs.get("start_date", "")
            return f"https://api-web.nhle.com/v1/schedule/{start_date}"
        if source == "espn":
            date_str = kwargs.get("date", "")
            return f"https://site.api.espn.com/apis/site/v2/sports/hockey/nhl/scoreboard?dates={date_str}"
        raise ValueError(f"Unknown source: {source}")

    def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
        """Scrape games from a specific source.

        Raises:
            ValueError: If ``source`` is not a known source name.
        """
        if source == "hockey_reference":
            return self._scrape_hockey_reference()
        if source == "nhl_api":
            return self._scrape_nhl_api()
        if source == "espn":
            return self._scrape_espn()
        raise ValueError(f"Unknown source: {source}")

    # ------------------------------------------------------------------
    # Hockey-Reference
    # ------------------------------------------------------------------

    def _scrape_hockey_reference(self) -> list[RawGameData]:
        """Scrape games from Hockey-Reference.

        HR has a single schedule page per season, addressed by the season's
        end year.

        Raises:
            Exception: Any fetch/parse failure is logged and re-raised so the
                caller can fall back to the next source.
        """
        end_year = self.season + 1
        url = self._get_source_url("hockey_reference", year=end_year)
        try:
            html = self.session.get_html(url)
            return self._parse_hockey_reference(html, url)
        except Exception as e:
            self._logger.error(f"Failed to scrape Hockey-Reference: {e}")
            raise

    @staticmethod
    def _mentions_international_location(text: Optional[str]) -> bool:
        """Return True if ``text`` contains any name in INTERNATIONAL_LOCATIONS."""
        if not text:
            return False
        return any(loc in text for loc in INTERNATIONAL_LOCATIONS)

    def _parse_hockey_reference(
        self,
        html: str,
        source_url: str,
    ) -> list[RawGameData]:
        """Parse Hockey-Reference schedule HTML.

        Args:
            html: Raw HTML of the season schedule page.
            source_url: URL the HTML came from (recorded on each game).

        Returns:
            Parsed games, excluding rows that fail to parse and games played
            at an international location. Empty if the schedule table is
            missing.
        """
        soup = BeautifulSoup(html, "lxml")
        games: list[RawGameData] = []

        # Find the schedule table
        table = soup.find("table", id="games")
        if not table:
            return games
        tbody = table.find("tbody")
        if not tbody:
            return games

        for row in tbody.find_all("tr"):
            # Skip repeated header rows embedded in the table body
            if "thead" in (row.get("class") or []):
                continue
            try:
                game = self._parse_hr_row(row, source_url)
                if not game:
                    continue
                # Filter international games. HR rows carry no stadium, so
                # the venue check alone never matches; also look at the row's
                # notes cell.
                # NOTE(review): assumes the notes column uses
                # data-stat="game_remarks" - confirm against live HTML. If the
                # cell is absent this check is simply a no-op.
                notes_cell = row.find("td", {"data-stat": "game_remarks"})
                notes = notes_cell.get_text(strip=True) if notes_cell else ""
                if self._mentions_international_location(
                    game.stadium_raw
                ) or self._mentions_international_location(notes):
                    continue
                games.append(game)
            except Exception as e:
                # Best effort: one bad row must not abort the whole page.
                self._logger.debug(f"Failed to parse HR row: {e}")
                continue
        return games

    @staticmethod
    def _parse_hr_clock(time_text: str) -> Optional[tuple[int, int]]:
        """Convert an HR clock string to 24-hour ``(hour, minute)``.

        Accepts "7:00p", "10:30a" and the longer "7:00 PM" / "7:00pm" forms.
        Parsing is done by hand so it does not depend on the process locale.

        Returns:
            ``(hour, minute)`` or ``None`` if the text is not a valid
            12-hour clock time with an AM/PM marker.
        """
        text = time_text.strip().lower().replace(" ", "")
        # Longer suffixes first so "pm" is not mistaken for a bare "m" tail.
        for suffix, is_pm in (("pm", True), ("am", False), ("p", True), ("a", False)):
            if not text.endswith(suffix):
                continue
            hour_text, sep, minute_text = text[: -len(suffix)].partition(":")
            if not sep:
                return None
            try:
                hour = int(hour_text)
                minute = int(minute_text)
            except ValueError:
                return None
            if not (1 <= hour <= 12 and 0 <= minute <= 59):
                return None
            # 12 AM -> 0, 12 PM -> 12, otherwise add 12 for PM.
            return hour % 12 + (12 if is_pm else 0), minute
        return None

    @staticmethod
    def _hr_cell_int(row, stat: str) -> Optional[int]:
        """Return the integer in the ``td`` with the given data-stat, or None."""
        cell = row.find("td", {"data-stat": stat})
        text = cell.get_text(strip=True) if cell else ""
        if not text:
            return None
        try:
            return int(text)
        except ValueError:
            return None

    def _parse_hr_row(
        self,
        row,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single Hockey-Reference table row.

        Args:
            row: A ``<tr>`` element from the schedule table.
            source_url: URL the row came from.

        Returns:
            The parsed game, or ``None`` if the row has no usable date or
            team names. ``game_date`` is always timezone-aware (ET); when the
            start time is missing or unparsable it is midnight ET.
        """
        # Get date (format: "2025-10-15")
        date_cell = row.find("th", {"data-stat": "date_game"})
        if not date_cell:
            return None
        date_text = date_cell.get_text(strip=True)
        if not date_text:
            return None
        try:
            game_date = datetime.strptime(date_text, "%Y-%m-%d")
        except ValueError:
            return None

        # Hockey-Reference uses ET. Attach the zone unconditionally so rows
        # without a time are not naive while the rest are aware.
        game_date = game_date.replace(tzinfo=ZoneInfo(self._HR_TIMEZONE_NAME))

        # Get game start time (format: "7:00p" or "10:30p")
        time_cell = row.find("td", {"data-stat": "time_game"})
        time_text = time_cell.get_text(strip=True) if time_cell else ""
        if time_text:
            clock = self._parse_hr_clock(time_text)
            if clock is None:
                self._logger.debug(f"Could not parse time: {time_text}, using midnight")
            else:
                game_date = game_date.replace(hour=clock[0], minute=clock[1])

        # Get teams
        visitor_cell = row.find("td", {"data-stat": "visitor_team_name"})
        home_cell = row.find("td", {"data-stat": "home_team_name"})
        if not visitor_cell or not home_cell:
            return None
        away_team = visitor_cell.get_text(strip=True)
        home_team = home_cell.get_text(strip=True)
        if not away_team or not home_team:
            return None

        # Get scores (blank cells mean the game has not been played)
        away_score = self._hr_cell_int(row, "visitor_goals")
        home_score = self._hr_cell_int(row, "home_goals")

        # Determine status: a recorded home score or an OT/SO marker both
        # mean the game is over.
        status = "final" if home_score is not None else "scheduled"
        overtimes_cell = row.find("td", {"data-stat": "overtimes"})
        if overtimes_cell and overtimes_cell.get_text(strip=True):
            status = "final"

        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=None,  # HR doesn't have stadium
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )

    # ------------------------------------------------------------------
    # NHL API
    # ------------------------------------------------------------------

    def _scrape_nhl_api(self) -> list[RawGameData]:
        """Scrape games from NHL API.

        The schedule endpoint answers with a ``gameWeek`` window starting at
        the requested date, so the season is walked in 7-day steps from the
        first day of the first season month to the end of the last one.
        Requesting only the 1st of each month would miss most of the month.

        Returns:
            All games parsed from the weekly responses, de-duplicated on
            (start time, home team, away team). Weeks that fail to download
            or parse are logged at debug level and skipped.
        """
        all_games: list[RawGameData] = []
        months = list(self._get_season_months())
        if not months:
            return all_games

        first_year, first_month = min(months)
        last_year, last_month = max(months)
        week_start = date(first_year, first_month, 1)
        # Exclusive upper bound: first day of the month after the last one.
        if last_month == 12:
            season_end = date(last_year + 1, 1, 1)
        else:
            season_end = date(last_year, last_month + 1, 1)

        seen: set[tuple] = set()
        while week_start < season_end:
            url = self._get_source_url(
                "nhl_api", start_date=week_start.strftime("%Y-%m-%d")
            )
            try:
                data = self.session.get_json(url)
                for game in self._parse_nhl_api_response(data, url):
                    # Guard against overlapping windows returning a game twice.
                    key = (game.game_date, game.home_team_raw, game.away_team_raw)
                    if key in seen:
                        continue
                    seen.add(key)
                    all_games.append(game)
            except Exception as e:
                self._logger.debug(f"NHL API error for week of {week_start}: {e}")
            week_start += timedelta(days=7)
        return all_games

    def _parse_nhl_api_response(
        self,
        data: dict,
        source_url: str,
    ) -> list[RawGameData]:
        """Parse NHL API response.

        Args:
            data: Decoded JSON; games are read from
                ``data["gameWeek"][*]["games"]``.
            source_url: URL the payload came from.

        Returns:
            Games that parsed successfully; failures are logged and skipped.
        """
        games: list[RawGameData] = []
        for day in data.get("gameWeek", []):
            for game_data in day.get("games", []):
                try:
                    game = self._parse_nhl_api_game(game_data, source_url)
                    if game:
                        games.append(game)
                except Exception as e:
                    self._logger.debug(f"Failed to parse NHL API game: {e}")
                    continue
        return games

    @staticmethod
    def _nhl_api_default_text(container: dict, key: str) -> str:
        """Return ``container[key]["default"]`` or "" if any level is missing/None."""
        return (container.get(key) or {}).get("default", "") or ""

    def _parse_nhl_api_game(
        self,
        game: dict,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single NHL API game.

        Returns:
            The parsed game, or ``None`` when the start time is missing or
            malformed, or when either team name cannot be determined.
        """
        # Get date (UTC, ISO 8601 with "Z" suffix)
        start_time = game.get("startTimeUTC", "")
        if not start_time:
            return None
        try:
            game_date = datetime.fromisoformat(start_time.replace("Z", "+00:00"))
        except ValueError:
            return None

        # Get teams: prefer placeName, fall back to full name for both sides
        away_team_data = game.get("awayTeam") or {}
        home_team_data = game.get("homeTeam") or {}
        away_team = self._nhl_api_default_text(away_team_data, "placeName")
        home_team = self._nhl_api_default_text(home_team_data, "placeName")
        if not away_team or not home_team:
            away_team = self._nhl_api_default_text(away_team_data, "name")
            home_team = self._nhl_api_default_text(home_team_data, "name")
        if not away_team or not home_team:
            return None

        # Get scores (absent for games not yet played)
        away_score = away_team_data.get("score")
        home_score = home_team_data.get("score")

        # Get venue; tolerate an explicit null instead of dropping the game
        stadium = (game.get("venue") or {}).get("default")

        # Get status
        game_state = (game.get("gameState") or "").lower()
        if game_state in ("final", "off"):
            status = "final"
        elif game_state == "postponed":
            status = "postponed"
        elif game_state in ("cancelled", "canceled"):
            status = "cancelled"
        else:
            status = "scheduled"

        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=stadium,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )

    # ------------------------------------------------------------------
    # ESPN
    # ------------------------------------------------------------------

    def _scrape_espn(self) -> list[RawGameData]:
        """Scrape games from ESPN API, one scoreboard request per calendar day.

        Days that fail to download or parse are logged at debug level and
        skipped.
        """
        all_games: list[RawGameData] = []
        for year, month in self._get_season_months():
            # Number of days in the month = distance to the 1st of the next one
            if month == 12:
                next_month = date(year + 1, 1, 1)
            else:
                next_month = date(year, month + 1, 1)
            days_in_month = (next_month - date(year, month, 1)).days

            for day in range(1, days_in_month + 1):
                try:
                    date_str = date(year, month, day).strftime("%Y%m%d")
                    url = self._get_source_url("espn", date=date_str)
                    data = self.session.get_json(url)
                    all_games.extend(self._parse_espn_response(data, url))
                except Exception as e:
                    self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
                    continue
        return all_games

    def _parse_espn_response(
        self,
        data: dict,
        source_url: str,
    ) -> list[RawGameData]:
        """Parse ESPN API response.

        Args:
            data: Decoded scoreboard JSON; games are read from ``data["events"]``.
            source_url: URL the payload came from.

        Returns:
            Events that parsed successfully; failures are logged and skipped.
        """
        games: list[RawGameData] = []
        for event in data.get("events", []):
            try:
                game = self._parse_espn_event(event, source_url)
                if game:
                    games.append(game)
            except Exception as e:
                self._logger.debug(f"Failed to parse ESPN event: {e}")
                continue
        return games

    @staticmethod
    def _coerce_score(value) -> Optional[int]:
        """Convert an ESPN score value to ``int``; missing/blank/invalid -> None."""
        if value is None or value == "":
            return None
        try:
            return int(value)
        except (ValueError, TypeError):
            return None

    def _parse_espn_event(
        self,
        event: dict,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single ESPN event.

        Returns:
            The parsed game, or ``None`` when the date is missing/malformed,
            there is no competition, the game is a neutral-site game in an
            international location, there are not exactly two competitors,
            or either team name is missing.
        """
        # Get date (UTC, ISO 8601 with "Z" suffix)
        date_str = event.get("date", "")
        if not date_str:
            return None
        try:
            game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
        except ValueError:
            return None

        # Get competitions
        competitions = event.get("competitions", [])
        if not competitions:
            return None
        competition = competitions[0]
        venue = competition.get("venue", {})

        # Check for neutral site (international games like Global Series)
        if competition.get("neutralSite"):
            venue_city = venue.get("address", {}).get("city", "")
            if venue_city in INTERNATIONAL_LOCATIONS:
                return None

        # Get teams
        competitors = competition.get("competitors", [])
        if len(competitors) != 2:
            return None

        home_team = None
        away_team = None
        home_score = None
        away_score = None
        for competitor in competitors:
            team_name = competitor.get("team", {}).get("displayName", "")
            score = self._coerce_score(competitor.get("score"))
            if competitor.get("homeAway") == "home":
                home_team, home_score = team_name, score
            else:
                away_team, away_score = team_name, score
        if not home_team or not away_team:
            return None

        # Get venue
        stadium = venue.get("fullName")

        # Get status
        status_name = (
            competition.get("status", {}).get("type", {}).get("name", "").lower()
        )
        if status_name == "status_final":
            status = "final"
        elif status_name == "status_postponed":
            status = "postponed"
        elif status_name == "status_canceled":
            status = "cancelled"
        else:
            status = "scheduled"

        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=stadium,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )

    # ------------------------------------------------------------------
    # Normalization
    # ------------------------------------------------------------------

    def _normalize_games(
        self,
        raw_games: list[RawGameData],
    ) -> tuple[list[Game], list[ManualReviewItem]]:
        """Normalize raw games to Game objects with canonical IDs.

        Returns:
            ``(games, review_items)``: every game that resolved, plus all
            manual-review items raised along the way (including those from
            games that were dropped).
        """
        games: list[Game] = []
        review_items: list[ManualReviewItem] = []
        for raw in raw_games:
            game, item_reviews = self._normalize_single_game(raw)
            if game:
                games.append(game)
                log_game(
                    self.sport,
                    game.id,
                    game.home_team_id,
                    game.away_team_id,
                    game.game_date.strftime("%Y-%m-%d"),
                    game.status,
                )
            review_items.extend(item_reviews)
        return games, review_items

    def _normalize_single_game(
        self,
        raw: RawGameData,
    ) -> tuple[Optional[Game], list[ManualReviewItem]]:
        """Normalize a single raw game.

        Returns:
            ``(game, review_items)``. ``game`` is ``None`` when either team
            cannot be resolved to a canonical ID.
        """
        review_items: list[ManualReviewItem] = []
        check_date = raw.game_date.date()

        # Resolve home team
        home_result = self._team_resolver.resolve(
            raw.home_team_raw,
            check_date=check_date,
            source_url=raw.source_url,
        )
        if home_result.review_item:
            review_items.append(home_result.review_item)
        if not home_result.canonical_id:
            log_warning(f"Could not resolve home team: {raw.home_team_raw}")
            return None, review_items

        # Resolve away team
        away_result = self._team_resolver.resolve(
            raw.away_team_raw,
            check_date=check_date,
            source_url=raw.source_url,
        )
        if away_result.review_item:
            review_items.append(away_result.review_item)
        if not away_result.canonical_id:
            log_warning(f"Could not resolve away team: {raw.away_team_raw}")
            return None, review_items

        # Resolve stadium
        stadium_id = None
        if raw.stadium_raw:
            stadium_result = self._stadium_resolver.resolve(
                raw.stadium_raw,
                check_date=check_date,
                source_url=raw.source_url,
            )
            if stadium_result.review_item:
                review_items.append(stadium_result.review_item)
            stadium_id = stadium_result.canonical_id

        # Fallback: Use home team's default stadium if no venue provided.
        # This is common for Hockey-Reference which doesn't have venue data.
        if not stadium_id:
            for team_id, _, _, default_stadium in TEAM_MAPPINGS.get("nhl", {}).values():
                if team_id == home_result.canonical_id:
                    stadium_id = default_stadium
                    break

        # Get abbreviations for game ID
        home_abbrev = self._get_abbreviation(home_result.canonical_id)
        away_abbrev = self._get_abbreviation(away_result.canonical_id)

        # Generate canonical game ID
        game_id = generate_game_id(
            sport=self.sport,
            season=self.season,
            away_abbrev=away_abbrev,
            home_abbrev=home_abbrev,
            game_date=raw.game_date,
            game_number=None,  # NHL doesn't have doubleheaders
        )

        game = Game(
            id=game_id,
            sport=self.sport,
            season=self.season,
            home_team_id=home_result.canonical_id,
            away_team_id=away_result.canonical_id,
            stadium_id=stadium_id or "",
            game_date=raw.game_date,
            game_number=None,
            home_score=raw.home_score,
            away_score=raw.away_score,
            status=raw.status,
            source_url=raw.source_url,
            raw_home_team=raw.home_team_raw,
            raw_away_team=raw.away_team_raw,
            raw_stadium=raw.stadium_raw,
        )
        return game, review_items

    def _get_abbreviation(self, team_id: str) -> str:
        """Extract abbreviation from team ID (the text after the last "_")."""
        return team_id.rsplit("_", 1)[-1]

    # ------------------------------------------------------------------
    # Static data
    # ------------------------------------------------------------------

    def scrape_teams(self) -> list[Team]:
        """Get all NHL teams from hardcoded mappings.

        Returns:
            One ``Team`` per distinct team ID in ``TEAM_MAPPINGS["nhl"]``,
            built from the first abbreviation that maps to that ID. Teams
            whose abbreviation is not in the division table get ``None`` for
            conference and division.
        """
        teams: list[Team] = []
        seen: set[str] = set()

        # NHL conference/division structure. "UTA" sits alongside the legacy
        # "ARI" entry so the relocated Utah franchise gets a division too.
        divisions = {
            "Atlantic": ("Eastern", ["BOS", "BUF", "DET", "FLA", "MTL", "OTT", "TB", "TOR"]),
            "Metropolitan": ("Eastern", ["CAR", "CBJ", "NJ", "NYI", "NYR", "PHI", "PIT", "WAS"]),
            "Central": ("Western", ["ARI", "CHI", "COL", "DAL", "MIN", "NSH", "STL", "UTA", "WPG"]),
            "Pacific": ("Western", ["ANA", "CGY", "EDM", "LA", "SJ", "SEA", "VAN", "VGK"]),
        }

        # Build reverse lookup: abbreviation -> (conference, division)
        team_divisions: dict[str, tuple[str, str]] = {
            abbrev: (conf, div)
            for div, (conf, abbrevs) in divisions.items()
            for abbrev in abbrevs
        }

        # Final words that are only half of a two-word nickname
        # (Red Wings, Blue Jackets, Golden Knights, Maple Leafs, Hockey Club).
        two_word_tails = {"Wings", "Jackets", "Knights", "Leafs", "Club"}

        for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("nhl", {}).items():
            if team_id in seen:
                continue
            seen.add(team_id)

            # Parse team name: last word, or last two for multi-word names
            parts = full_name.split()
            team_name = parts[-1] if parts else full_name
            if team_name in two_word_tails:
                team_name = " ".join(parts[-2:])

            # Get conference and division
            conf, div = team_divisions.get(abbrev, (None, None))

            teams.append(
                Team(
                    id=team_id,
                    sport="nhl",
                    city=city,
                    name=team_name,
                    full_name=full_name,
                    abbreviation=abbrev,
                    conference=conf,
                    division=div,
                    stadium_id=stadium_id,
                )
            )
        return teams

    def scrape_stadiums(self) -> list[Stadium]:
        """Get all NHL stadiums from hardcoded mappings.

        Returns:
            One ``Stadium`` per entry in ``STADIUM_MAPPINGS["nhl"]``; surface
            and roof type are fixed to ``"ice"`` and ``"dome"`` for every
            venue.
        """
        return [
            Stadium(
                id=stadium_id,
                sport="nhl",
                name=info.name,
                city=info.city,
                state=info.state,
                country=info.country,
                latitude=info.latitude,
                longitude=info.longitude,
                timezone=info.timezone,
                surface="ice",
                roof_type="dome",
            )
            for stadium_id, info in STADIUM_MAPPINGS.get("nhl", {}).items()
        ]
def create_nhl_scraper(season: int) -> NHLScraper:
    """Build an NHL scraper for one season.

    Args:
        season: Season start year, passed straight through to ``NHLScraper``.

    Returns:
        A newly constructed ``NHLScraper``.
    """
    scraper = NHLScraper(season=season)
    return scraper