"""NHL scraper implementation with multi-source fallback."""

import calendar
from datetime import datetime, date
from typing import Optional

from bs4 import BeautifulSoup

from .base import BaseScraper, RawGameData, ScrapeResult
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..normalizers.canonical_id import generate_game_id
from ..normalizers.team_resolver import (
    TeamResolver,
    TEAM_MAPPINGS,
    get_team_resolver,
)
from ..normalizers.stadium_resolver import (
    StadiumResolver,
    STADIUM_MAPPINGS,
    get_stadium_resolver,
)
from ..utils.logging import get_logger, log_game, log_warning


# International game locations to filter out
INTERNATIONAL_LOCATIONS = {"Prague", "Stockholm", "Helsinki", "Tampere", "Gothenburg"}

# Hockey Reference month URLs
HR_MONTHS = [
    "october",
    "november",
    "december",
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
]


class NHLScraper(BaseScraper):
    """NHL schedule scraper with multi-source fallback.

    Sources (in priority order):
    1. Hockey-Reference - Most reliable for NHL
    2. NHL API - Official NHL data
    3. ESPN API - Backup option
    """

    def __init__(self, season: int, **kwargs):
        """Initialize NHL scraper.

        Args:
            season: Season start year (e.g., 2025 for 2025-26)
            **kwargs: Forwarded unchanged to ``BaseScraper.__init__``.
        """
        super().__init__("nhl", season, **kwargs)
        self._team_resolver = get_team_resolver("nhl")
        self._stadium_resolver = get_stadium_resolver("nhl")

    def _get_sources(self) -> list[str]:
        """Return source list in priority order."""
        return ["hockey_reference", "nhl_api", "espn"]

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build URL for a source.

        Args:
            source: One of ``"hockey_reference"``, ``"nhl_api"`` or ``"espn"``.
            **kwargs: Source-specific values. Only these are used:
                ``year`` (hockey_reference, defaults to ``season + 1``),
                ``start_date`` (nhl_api, ``YYYY-MM-DD`` string) and
                ``date`` (espn, ``YYYYMMDD`` string). Any other keyword
                (e.g. ``month`` or ``end_date``) is accepted but ignored.

        Returns:
            The URL to request for the given source.

        Raises:
            ValueError: If ``source`` is not a known source name.
        """
        if source == "hockey_reference":
            year = kwargs.get("year", self.season + 1)
            return f"https://www.hockey-reference.com/leagues/NHL_{year}_games.html"
        if source == "nhl_api":
            start_date = kwargs.get("start_date", "")
            return f"https://api-web.nhle.com/v1/schedule/{start_date}"
        if source == "espn":
            date_str = kwargs.get("date", "")
            return f"https://site.api.espn.com/apis/site/v2/sports/hockey/nhl/scoreboard?dates={date_str}"
        raise ValueError(f"Unknown source: {source}")

    def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
        """Scrape games from a specific source.

        Raises:
            ValueError: If ``source`` is not a known source name.
        """
        if source == "hockey_reference":
            return self._scrape_hockey_reference()
        if source == "nhl_api":
            return self._scrape_nhl_api()
        if source == "espn":
            return self._scrape_espn()
        raise ValueError(f"Unknown source: {source}")

    @staticmethod
    def _to_int_or_none(value) -> Optional[int]:
        """Convert a raw score value to ``int``, or ``None`` if not possible.

        ``None``, the empty string and anything ``int()`` rejects all map to
        ``None`` so that a non-numeric value never leaks into a score field.
        """
        if value is None or value == "":
            return None
        try:
            return int(value)
        except (ValueError, TypeError):
            return None

    def _scrape_hockey_reference(self) -> list[RawGameData]:
        """Scrape games from Hockey-Reference.

        HR has a single schedule page per season.

        Raises:
            Exception: Any error from fetching or parsing is logged and
                re-raised.
        """
        end_year = self.season + 1
        url = self._get_source_url("hockey_reference", year=end_year)

        try:
            html = self.session.get_html(url)
            return self._parse_hockey_reference(html, url)
        except Exception as e:
            self._logger.error(f"Failed to scrape Hockey-Reference: {e}")
            raise

    def _parse_hockey_reference(
        self,
        html: str,
        source_url: str,
    ) -> list[RawGameData]:
        """Parse Hockey-Reference schedule HTML.

        Returns an empty list when the ``games`` table or its ``tbody`` is
        missing. Rows that fail to parse are logged at debug level and skipped.
        """
        soup = BeautifulSoup(html, "lxml")
        games: list[RawGameData] = []

        # Find the schedule table
        table = soup.find("table", id="games")
        if not table:
            return games

        tbody = table.find("tbody")
        if not tbody:
            return games

        for row in tbody.find_all("tr"):
            # Skip header rows
            if "thead" in (row.get("class") or []):
                continue

            try:
                game = self._parse_hr_row(row, source_url)
            except Exception as e:
                self._logger.debug(f"Failed to parse HR row: {e}")
                continue

            if not game:
                continue

            # Filter international games.
            # NOTE(review): _parse_hr_row always sets stadium_raw=None, so this
            # filter cannot match for Hockey-Reference rows as written.
            if game.stadium_raw and any(
                loc in game.stadium_raw for loc in INTERNATIONAL_LOCATIONS
            ):
                continue

            games.append(game)

        return games

    def _parse_hr_row(
        self,
        row,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single Hockey-Reference table row.

        Returns:
            A ``RawGameData`` for the row, or ``None`` when the date or either
            team name is missing or the date is not in ``YYYY-MM-DD`` form.
        """
        # Get date
        date_cell = row.find("th", {"data-stat": "date_game"})
        if date_cell is None:
            return None

        date_text = date_cell.get_text(strip=True)
        if not date_text:
            return None

        # Parse date (format: "2025-10-15")
        try:
            game_date = datetime.strptime(date_text, "%Y-%m-%d")
        except ValueError:
            return None

        # Get teams
        visitor_cell = row.find("td", {"data-stat": "visitor_team_name"})
        home_cell = row.find("td", {"data-stat": "home_team_name"})
        if visitor_cell is None or home_cell is None:
            return None

        away_team = visitor_cell.get_text(strip=True)
        home_team = home_cell.get_text(strip=True)
        if not away_team or not home_team:
            return None

        # Get scores (empty or non-numeric cells yield None)
        visitor_goals_cell = row.find("td", {"data-stat": "visitor_goals"})
        home_goals_cell = row.find("td", {"data-stat": "home_goals"})

        away_score = None
        home_score = None
        if visitor_goals_cell is not None:
            away_score = self._to_int_or_none(visitor_goals_cell.get_text(strip=True))
        if home_goals_cell is not None:
            home_score = self._to_int_or_none(home_goals_cell.get_text(strip=True))

        # Determine status: a home score means the game has been played
        status = "final" if home_score is not None else "scheduled"

        # Check for OT/SO: a non-empty overtime marker also means the game is final
        overtimes_cell = row.find("td", {"data-stat": "overtimes"})
        if overtimes_cell is not None and overtimes_cell.get_text(strip=True):
            status = "final"

        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=None,  # HR doesn't have stadium
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )

    def _scrape_nhl_api(self) -> list[RawGameData]:
        """Scrape games from NHL API.

        The response is parsed as a ``gameWeek`` list, so a single request for
        the first day of a month would miss most of that month. Each month is
        therefore walked in 7-day strides (days 1, 8, 15, 22, 29). Windows can
        overlap at month boundaries, so games are de-duplicated on
        (start time, home team, away team).

        Failed requests are logged at debug level and skipped.
        """
        all_games: list[RawGameData] = []
        seen: set[tuple] = set()

        for year, month in self._get_season_months():
            days_in_month = calendar.monthrange(year, month)[1]

            for day in range(1, days_in_month + 1, 7):
                start_date = date(year, month, day)
                url = self._get_source_url(
                    "nhl_api",
                    start_date=start_date.strftime("%Y-%m-%d"),
                )

                try:
                    data = self.session.get_json(url)
                    games = self._parse_nhl_api_response(data, url)
                except Exception as e:
                    self._logger.debug(f"NHL API error for {year}-{month}-{day}: {e}")
                    continue

                for game in games:
                    key = (game.game_date, game.home_team_raw, game.away_team_raw)
                    if key in seen:
                        continue
                    seen.add(key)
                    all_games.append(game)

        return all_games

    def _parse_nhl_api_response(
        self,
        data: dict,
        source_url: str,
    ) -> list[RawGameData]:
        """Parse NHL API response.

        Walks ``data["gameWeek"]`` and, within each entry, its ``games`` list.
        Games that fail to parse are logged at debug level and skipped.
        """
        games: list[RawGameData] = []

        for day_entry in data.get("gameWeek", []):
            for game_data in day_entry.get("games", []):
                try:
                    game = self._parse_nhl_api_game(game_data, source_url)
                except Exception as e:
                    self._logger.debug(f"Failed to parse NHL API game: {e}")
                    continue

                if game:
                    games.append(game)

        return games

    def _parse_nhl_api_game(
        self,
        game: dict,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single NHL API game.

        Returns:
            A ``RawGameData`` for the game, or ``None`` when the start time is
            missing/unparseable or a team name cannot be found.
        """
        # Get date
        start_time = game.get("startTimeUTC", "")
        if not start_time:
            return None

        try:
            # "Z" is rewritten so fromisoformat accepts it; the result is
            # timezone-aware (UTC).
            # NOTE(review): the Hockey-Reference path produces a naive datetime
            # parsed from a plain date; confirm downstream date handling treats
            # both consistently.
            game_date = datetime.fromisoformat(start_time.replace("Z", "+00:00"))
        except ValueError:
            return None

        # Get teams
        away_team_data = game.get("awayTeam", {})
        home_team_data = game.get("homeTeam", {})

        away_team = away_team_data.get("placeName", {}).get("default", "")
        home_team = home_team_data.get("placeName", {}).get("default", "")

        if not away_team or not home_team:
            # Try full name
            away_team = away_team_data.get("name", {}).get("default", "")
            home_team = home_team_data.get("name", {}).get("default", "")

        if not away_team or not home_team:
            return None

        # Get scores
        away_score = away_team_data.get("score")
        home_score = home_team_data.get("score")

        # Get venue
        stadium = game.get("venue", {}).get("default")

        # Get status
        game_state = game.get("gameState", "").lower()
        if game_state in ("final", "off"):
            status = "final"
        elif game_state == "postponed":
            status = "postponed"
        elif game_state in ("cancelled", "canceled"):
            status = "cancelled"
        else:
            status = "scheduled"

        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=stadium,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )

    def _scrape_espn(self) -> list[RawGameData]:
        """Scrape games from ESPN API.

        Issues one scoreboard request per calendar day of every season month.
        Failed days are logged at debug level and skipped.
        """
        all_games: list[RawGameData] = []

        for year, month in self._get_season_months():
            # Get number of days in month
            days_in_month = calendar.monthrange(year, month)[1]

            for day in range(1, days_in_month + 1):
                try:
                    game_date = date(year, month, day)
                    date_str = game_date.strftime("%Y%m%d")
                    url = self._get_source_url("espn", date=date_str)

                    data = self.session.get_json(url)
                    games = self._parse_espn_response(data, url)
                    all_games.extend(games)
                except Exception as e:
                    self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
                    continue

        return all_games

    def _parse_espn_response(
        self,
        data: dict,
        source_url: str,
    ) -> list[RawGameData]:
        """Parse ESPN API response.

        Events that fail to parse are logged at debug level and skipped.
        """
        games: list[RawGameData] = []

        for event in data.get("events", []):
            try:
                game = self._parse_espn_event(event, source_url)
            except Exception as e:
                self._logger.debug(f"Failed to parse ESPN event: {e}")
                continue

            if game:
                games.append(game)

        return games

    def _parse_espn_event(
        self,
        event: dict,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single ESPN event.

        Returns:
            A ``RawGameData`` for the event, or ``None`` when the date is
            missing/unparseable, there is no competition, the game is a
            neutral-site game in one of ``INTERNATIONAL_LOCATIONS``, there are
            not exactly two competitors, or a team name is missing.
        """
        # Get date
        date_str = event.get("date", "")
        if not date_str:
            return None

        try:
            game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
        except ValueError:
            return None

        # Get competitions
        competitions = event.get("competitions", [])
        if not competitions:
            return None

        competition = competitions[0]
        venue = competition.get("venue", {})

        # Check for neutral site (international games like Global Series)
        if competition.get("neutralSite"):
            venue_city = venue.get("address", {}).get("city", "")
            if venue_city in INTERNATIONAL_LOCATIONS:
                return None

        # Get teams
        competitors = competition.get("competitors", [])
        if len(competitors) != 2:
            return None

        home_team = None
        away_team = None
        home_score = None
        away_score = None

        for competitor in competitors:
            team_name = competitor.get("team", {}).get("displayName", "")
            # Empty or non-numeric scores become None instead of leaking through
            score = self._to_int_or_none(competitor.get("score"))

            if competitor.get("homeAway") == "home":
                home_team = team_name
                home_score = score
            else:
                away_team = team_name
                away_score = score

        if not home_team or not away_team:
            return None

        # Get venue
        stadium = venue.get("fullName")

        # Get status
        status_name = (
            competition.get("status", {}).get("type", {}).get("name", "").lower()
        )
        if status_name == "status_final":
            status = "final"
        elif status_name == "status_postponed":
            status = "postponed"
        elif status_name == "status_canceled":
            status = "cancelled"
        else:
            status = "scheduled"

        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=stadium,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )

    def _normalize_games(
        self,
        raw_games: list[RawGameData],
    ) -> tuple[list[Game], list[ManualReviewItem]]:
        """Normalize raw games to Game objects with canonical IDs.

        Returns:
            A ``(games, review_items)`` tuple. Review items are collected for
            every raw game, including ones that could not be normalized.
        """
        games: list[Game] = []
        review_items: list[ManualReviewItem] = []

        for raw in raw_games:
            game, item_reviews = self._normalize_single_game(raw)

            if game:
                games.append(game)
                log_game(
                    self.sport,
                    game.id,
                    game.home_team_id,
                    game.away_team_id,
                    game.game_date.strftime("%Y-%m-%d"),
                    game.status,
                )

            review_items.extend(item_reviews)

        return games, review_items

    def _normalize_single_game(
        self,
        raw: RawGameData,
    ) -> tuple[Optional[Game], list[ManualReviewItem]]:
        """Normalize a single raw game.

        Returns:
            A ``(game, review_items)`` tuple. ``game`` is ``None`` when either
            team cannot be resolved to a canonical ID. An unresolved stadium
            does not drop the game; its ``stadium_id`` becomes ``""``.
        """
        review_items: list[ManualReviewItem] = []
        check_date = raw.game_date.date()

        # Resolve home team
        home_result = self._team_resolver.resolve(
            raw.home_team_raw,
            check_date=check_date,
            source_url=raw.source_url,
        )
        if home_result.review_item:
            review_items.append(home_result.review_item)
        if not home_result.canonical_id:
            log_warning(f"Could not resolve home team: {raw.home_team_raw}")
            return None, review_items

        # Resolve away team
        away_result = self._team_resolver.resolve(
            raw.away_team_raw,
            check_date=check_date,
            source_url=raw.source_url,
        )
        if away_result.review_item:
            review_items.append(away_result.review_item)
        if not away_result.canonical_id:
            log_warning(f"Could not resolve away team: {raw.away_team_raw}")
            return None, review_items

        # Resolve stadium
        stadium_id = None
        if raw.stadium_raw:
            stadium_result = self._stadium_resolver.resolve(
                raw.stadium_raw,
                check_date=check_date,
                source_url=raw.source_url,
            )
            if stadium_result.review_item:
                review_items.append(stadium_result.review_item)
            stadium_id = stadium_result.canonical_id

        # Get abbreviations for game ID
        home_abbrev = self._get_abbreviation(home_result.canonical_id)
        away_abbrev = self._get_abbreviation(away_result.canonical_id)

        # Generate canonical game ID
        game_id = generate_game_id(
            sport=self.sport,
            season=self.season,
            away_abbrev=away_abbrev,
            home_abbrev=home_abbrev,
            game_date=raw.game_date,
            game_number=None,  # NHL doesn't have doubleheaders
        )

        game = Game(
            id=game_id,
            sport=self.sport,
            season=self.season,
            home_team_id=home_result.canonical_id,
            away_team_id=away_result.canonical_id,
            stadium_id=stadium_id or "",
            game_date=raw.game_date,
            game_number=None,
            home_score=raw.home_score,
            away_score=raw.away_score,
            status=raw.status,
            source_url=raw.source_url,
            raw_home_team=raw.home_team_raw,
            raw_away_team=raw.away_team_raw,
            raw_stadium=raw.stadium_raw,
        )

        return game, review_items

    def _get_abbreviation(self, team_id: str) -> str:
        """Extract abbreviation from team ID.

        Returns the text after the last underscore in ``team_id`` (or the
        whole string when it contains no underscore).
        """
        parts = team_id.split("_")
        return parts[-1] if parts else ""

    def scrape_teams(self) -> list[Team]:
        """Get all NHL teams from hardcoded mappings.

        Iterates ``TEAM_MAPPINGS["nhl"]`` and emits one ``Team`` per distinct
        team ID; the first abbreviation seen for a team ID is the one used.
        Teams whose abbreviation is not in the division table get
        ``conference=None`` and ``division=None``.
        """
        teams: list[Team] = []
        seen: set[str] = set()

        # NHL conference/division structure.
        # "UTA" (Utah) sits in the Central Division from 2024-25, replacing
        # Arizona; "ARI" is kept so earlier seasons still resolve.
        divisions = {
            "Atlantic": ("Eastern", ["BOS", "BUF", "DET", "FLA", "MTL", "OTT", "TB", "TOR"]),
            "Metropolitan": ("Eastern", ["CAR", "CBJ", "NJ", "NYI", "NYR", "PHI", "PIT", "WAS"]),
            "Central": ("Western", ["ARI", "CHI", "COL", "DAL", "MIN", "NSH", "STL", "UTA", "WPG"]),
            "Pacific": ("Western", ["ANA", "CGY", "EDM", "LA", "SJ", "SEA", "VAN", "VGK"]),
        }

        # Build reverse lookup: abbreviation -> (conference, division)
        team_divisions: dict[str, tuple[str, str]] = {
            abbrev: (conf, div)
            for div, (conf, abbrevs) in divisions.items()
            for abbrev in abbrevs
        }

        for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("nhl", {}).items():
            if team_id in seen:
                continue
            seen.add(team_id)

            # Parse team name
            parts = full_name.split()
            team_name = parts[-1] if parts else full_name
            # Handle multi-word names
            if team_name in ("Wings", "Jackets", "Knights", "Leafs"):
                team_name = " ".join(parts[-2:])

            # Get conference and division
            conf, div = team_divisions.get(abbrev, (None, None))

            teams.append(
                Team(
                    id=team_id,
                    sport="nhl",
                    city=city,
                    name=team_name,
                    full_name=full_name,
                    abbreviation=abbrev,
                    conference=conf,
                    division=div,
                    stadium_id=stadium_id,
                )
            )

        return teams

    def scrape_stadiums(self) -> list[Stadium]:
        """Get all NHL stadiums from hardcoded mappings.

        Builds one ``Stadium`` per entry in ``STADIUM_MAPPINGS["nhl"]``; every
        stadium is given ``surface="ice"`` and ``roof_type="dome"``.
        """
        nhl_stadiums = STADIUM_MAPPINGS.get("nhl", {})

        return [
            Stadium(
                id=stadium_id,
                sport="nhl",
                name=info.name,
                city=info.city,
                state=info.state,
                country=info.country,
                latitude=info.latitude,
                longitude=info.longitude,
                surface="ice",
                roof_type="dome",
            )
            for stadium_id, info in nhl_stadiums.items()
        ]


def create_nhl_scraper(season: int) -> NHLScraper:
    """Factory function to create an NHL scraper."""
    return NHLScraper(season=season)