"""NBA scraper implementation with multi-source fallback."""

from datetime import datetime, date, timezone
from typing import Optional
from bs4 import BeautifulSoup
import re

from .base import BaseScraper, RawGameData, ScrapeResult
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..normalizers.canonical_id import generate_game_id
from ..normalizers.team_resolver import (
    TeamResolver,
    TEAM_MAPPINGS,
    get_team_resolver,
)
from ..normalizers.stadium_resolver import (
    StadiumResolver,
    STADIUM_MAPPINGS,
    get_stadium_resolver,
)
from ..normalizers.timezone import parse_datetime
from ..utils.logging import get_logger, log_game, log_warning


# Month name to number mapping
MONTH_MAP = {
    "january": 1,
    "february": 2,
    "march": 3,
    "april": 4,
    "may": 5,
    "june": 6,
    "july": 7,
    "august": 8,
    "september": 9,
    "october": 10,
    "november": 11,
    "december": 12,
}

# Basketball Reference month URLs, in season order (October through June)
BR_MONTHS = [
    "october",
    "november",
    "december",
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
]

# Date formats tried, in order, when parsing a Basketball-Reference date cell
_BR_DATE_FORMATS = ("%a, %b %d, %Y", "%B %d, %Y")

# ESPN status type names (lower-cased) mapped to this scraper's status strings;
# any name not listed here is treated as "scheduled"
_ESPN_STATUS_MAP = {
    "status_final": "final",
    "status_postponed": "postponed",
    "status_canceled": "cancelled",
}


class NBAScraper(BaseScraper):
    """NBA schedule scraper with multi-source fallback.

    Sources (in priority order):
    1. Basketball-Reference - one schedule page per month of the season
    2. ESPN API - one scoreboard request per calendar day
    3. CBS Sports - URL is known but the scraper is not implemented, so it
       is not part of the active source list returned by ``_get_sources``
    """

    def __init__(self, season: int, **kwargs):
        """Initialize NBA scraper.

        Args:
            season: Season start year (e.g., 2025 for 2025-26)
            **kwargs: Forwarded unchanged to ``BaseScraper.__init__``
        """
        super().__init__("nba", season, **kwargs)
        self._team_resolver = get_team_resolver("nba")
        self._stadium_resolver = get_stadium_resolver("nba")

    def _get_sources(self) -> list[str]:
        """Return source list in priority order."""
        # CBS scraper not yet implemented - TODO for future
        return ["basketball_reference", "espn"]

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build URL for a source.

        Args:
            source: One of "basketball_reference", "espn" or "cbs".
            **kwargs: Source-specific options:
                - basketball_reference: ``month`` (default "october") and
                  ``year`` (default ``self.season + 1``, the season's end year)
                - espn: ``date`` as a YYYYMMDD string (default "")

        Returns:
            The URL to request for that source.

        Raises:
            ValueError: If ``source`` is not a known source name.
        """
        if source == "basketball_reference":
            month = kwargs.get("month", "october")
            year = kwargs.get("year", self.season + 1)
            return f"https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html"

        if source == "espn":
            date_str = kwargs.get("date", "")
            return f"https://site.api.espn.com/apis/site/v2/sports/basketball/nba/scoreboard?dates={date_str}"

        if source == "cbs":
            return "https://www.cbssports.com/nba/schedule/"

        raise ValueError(f"Unknown source: {source}")

    def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
        """Scrape games from a specific source.

        Raises:
            ValueError: If ``source`` is not a known source name.
            NotImplementedError: If ``source`` is "cbs".
        """
        if source == "basketball_reference":
            return self._scrape_basketball_reference()
        if source == "espn":
            return self._scrape_espn()
        if source == "cbs":
            return self._scrape_cbs()
        raise ValueError(f"Unknown source: {source}")

    def _scrape_basketball_reference(self) -> list[RawGameData]:
        """Scrape games from Basketball-Reference.

        BR organizes games by month with separate pages.
        Format: https://www.basketball-reference.com/leagues/NBA_YYYY_games-month.html
        where YYYY is the ending year of the season.

        Bails early if the first few months have no data (season doesn't exist).

        Returns:
            All games parsed from the monthly pages, in ``BR_MONTHS`` order.
        """
        all_games: list[RawGameData] = []
        end_year = self.season + 1
        consecutive_empty_months = 0

        for month in BR_MONTHS:
            url = self._get_source_url("basketball_reference", month=month, year=end_year)

            try:
                html = self.session.get_html(url)
                games = self._parse_basketball_reference(html, url)
            except Exception as e:
                # Deliberately best-effort: a month page may simply not exist,
                # so a failed fetch/parse is counted as an empty month.
                self._logger.debug(f"No data for {month}: {e}")
                games = []

            if games:
                all_games.extend(games)
                consecutive_empty_months = 0
                self._logger.debug(f"Found {len(games)} games in {month}")
            else:
                consecutive_empty_months += 1

            # If first 3 months (Oct, Nov, Dec) all have no data, season doesn't exist.
            # `not all_games` keeps this from firing on a late-season gap.
            if consecutive_empty_months >= 3 and not all_games:
                self._logger.info(f"No games found in first {consecutive_empty_months} months, season likely doesn't exist")
                break

        return all_games

    def _parse_basketball_reference(
        self,
        html: str,
        source_url: str,
    ) -> list[RawGameData]:
        """Parse Basketball-Reference schedule HTML.

        Table structure:
        - th[data-stat="date_game"]: Date (e.g., "Tue, Oct 22, 2024")
        - td[data-stat="visitor_team_name"]: Away team
        - td[data-stat="home_team_name"]: Home team
        - td[data-stat="visitor_pts"]: Away score
        - td[data-stat="home_pts"]: Home score
        - td[data-stat="arena_name"]: Arena/stadium name

        Args:
            html: Raw HTML of one monthly schedule page.
            source_url: URL the HTML came from; stored on every parsed game.

        Returns:
            Parsed games; empty if the page has no ``table#schedule`` or no
            ``tbody``. Rows that fail to parse are skipped.
        """
        soup = BeautifulSoup(html, "lxml")
        games: list[RawGameData] = []

        # Find the schedule table
        table = soup.find("table", id="schedule")
        if not table:
            return games

        tbody = table.find("tbody")
        if not tbody:
            return games

        for row in tbody.find_all("tr"):
            # Skip header rows repeated inside the table body
            if "thead" in (row.get("class") or []):
                continue

            try:
                game = self._parse_br_row(row, source_url)
            except Exception as e:
                # One malformed row must not abort the whole page
                self._logger.debug(f"Failed to parse row: {e}")
                continue

            if game:
                games.append(game)

        return games

    @staticmethod
    def _to_int(value) -> Optional[int]:
        """Convert a scraped score value to int, or None if missing/invalid."""
        if value is None or value == "":
            return None
        try:
            return int(value)
        except (ValueError, TypeError):
            return None

    def _parse_br_row(
        self,
        row,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single Basketball-Reference table row.

        Args:
            row: A ``<tr>`` element from the schedule table body.
            source_url: URL of the page the row came from.

        Returns:
            The parsed game, or None when the row lacks a parseable date or
            either team name. The resulting ``game_date`` is a naive datetime
            (date only, midnight).
        """
        # Get date
        date_cell = row.find("th", {"data-stat": "date_game"})
        if date_cell is None:
            return None

        date_text = date_cell.get_text(strip=True)
        if not date_text:
            return None

        # Parse date (primary format: "Tue, Oct 22, 2024")
        game_date = None
        for fmt in _BR_DATE_FORMATS:
            try:
                game_date = datetime.strptime(date_text, fmt)
                break
            except ValueError:
                continue
        if game_date is None:
            self._logger.debug(f"Could not parse date: {date_text}")
            return None

        # Get teams
        away_cell = row.find("td", {"data-stat": "visitor_team_name"})
        home_cell = row.find("td", {"data-stat": "home_team_name"})
        if away_cell is None or home_cell is None:
            return None

        away_team = away_cell.get_text(strip=True)
        home_team = home_cell.get_text(strip=True)
        if not away_team or not home_team:
            return None

        # Get scores (may be empty for future games)
        away_score_cell = row.find("td", {"data-stat": "visitor_pts"})
        home_score_cell = row.find("td", {"data-stat": "home_pts"})
        away_score = (
            self._to_int(away_score_cell.get_text(strip=True))
            if away_score_cell is not None
            else None
        )
        home_score = (
            self._to_int(home_score_cell.get_text(strip=True))
            if home_score_cell is not None
            else None
        )

        # Get arena
        arena_cell = row.find("td", {"data-stat": "arena_name"})
        arena = arena_cell.get_text(strip=True) if arena_cell is not None else None

        # Determine status: a recorded home score means the game was played
        status = "final" if home_score is not None else "scheduled"

        # Check for postponed/cancelled; remarks override the score-based status
        notes_cell = row.find("td", {"data-stat": "game_remarks"})
        if notes_cell is not None:
            notes = notes_cell.get_text(strip=True).lower()
            if "postponed" in notes:
                status = "postponed"
            elif "cancelled" in notes or "canceled" in notes:
                status = "cancelled"

        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=arena,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )

    def _scrape_espn(self) -> list[RawGameData]:
        """Scrape games from ESPN API.

        The scoreboard endpoint is queried once per calendar day for every
        (year, month) pair yielded by ``self._get_season_months()``.

        Bails out early once ``max_empty_days`` consecutive days produced no
        games; a failed request counts as an empty day.

        Returns:
            All games collected up to the end of the season or the bail-out.
        """
        all_games: list[RawGameData] = []
        consecutive_empty_days = 0
        max_empty_days = 45  # Bail after ~1.5 months of no games

        # NOTE(review): _get_season_months is not defined in this file;
        # presumably provided by BaseScraper - confirm.
        for year, month in self._get_season_months():
            # Number of days in the month = distance to the 1st of next month
            if month == 12:
                next_month = date(year + 1, 1, 1)
            else:
                next_month = date(year, month + 1, 1)
            days_in_month = (next_month - date(year, month, 1)).days

            for day in range(1, days_in_month + 1):
                failed = False
                try:
                    game_date = date(year, month, day)
                    date_str = game_date.strftime("%Y%m%d")
                    url = self._get_source_url("espn", date=date_str)
                    data = self.session.get_json(url)
                    games = self._parse_espn_response(data, url)
                except Exception as e:
                    # Best-effort: a single bad day should not stop the scrape
                    self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
                    games = []
                    failed = True

                if games:
                    all_games.extend(games)
                    consecutive_empty_days = 0
                    continue

                consecutive_empty_days += 1

                # Bail early if no games found for a long stretch
                if consecutive_empty_days >= max_empty_days:
                    if failed:
                        self._logger.info("Too many consecutive failures, stopping ESPN scrape")
                    else:
                        self._logger.info(f"No games found for {max_empty_days} consecutive days, stopping ESPN scrape")
                    return all_games

        return all_games

    def _parse_espn_response(
        self,
        data: dict,
        source_url: str,
    ) -> list[RawGameData]:
        """Parse ESPN API response.

        Args:
            data: Decoded JSON payload; games are read from its "events" list.
            source_url: URL the payload came from; stored on every parsed game.

        Returns:
            Parsed games. Events that fail to parse are skipped.
        """
        games: list[RawGameData] = []

        for event in data.get("events", []):
            try:
                game = self._parse_espn_event(event, source_url)
            except Exception as e:
                self._logger.debug(f"Failed to parse ESPN event: {e}")
                continue

            if game:
                games.append(game)

        return games

    def _parse_espn_event(
        self,
        event: dict,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single ESPN event.

        Args:
            event: One entry of the response's "events" list.
            source_url: URL the event came from.

        Returns:
            The parsed game, or None when the event has no parseable date, no
            competition, not exactly two competitors, or a missing team name.
        """
        # Get date
        date_str = event.get("date", "")
        if not date_str:
            return None

        try:
            # ESPN uses ISO format; a trailing "Z" is rewritten to an explicit
            # UTC offset, so the result is timezone-aware when "Z" is present.
            # NOTE(review): Basketball-Reference rows yield naive datetimes, so
            # the two sources may disagree on the calendar date of late games -
            # confirm how downstream code reconciles them.
            game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
        except ValueError:
            return None

        # Get competitions (usually just one)
        competitions = event.get("competitions", [])
        if not competitions:
            return None
        competition = competitions[0]

        # Get teams
        competitors = competition.get("competitors", [])
        if len(competitors) != 2:
            return None

        home_team = None
        away_team = None
        home_score = None
        away_score = None

        for competitor in competitors:
            team_name = competitor.get("team", {}).get("displayName", "")
            # Missing/empty/unparseable scores all normalize to None
            score = self._to_int(competitor.get("score"))

            if competitor.get("homeAway") == "home":
                home_team = team_name
                home_score = score
            else:
                away_team = team_name
                away_score = score

        if not home_team or not away_team:
            return None

        # Get venue
        arena = competition.get("venue", {}).get("fullName")

        # Get status
        status_name = competition.get("status", {}).get("type", {}).get("name", "").lower()
        status = _ESPN_STATUS_MAP.get(status_name, "scheduled")

        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=arena,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )

    def _scrape_cbs(self) -> list[RawGameData]:
        """Scrape games from CBS Sports.

        CBS Sports is a backup source with less structured data.

        Raises:
            NotImplementedError: Always; the CBS scraper does not exist yet.
        """
        # CBS Sports scraping would go here
        raise NotImplementedError("CBS scraper not implemented")

    def _normalize_games(
        self,
        raw_games: list[RawGameData],
    ) -> tuple[list[Game], list[ManualReviewItem]]:
        """Normalize raw games to Game objects with canonical IDs.

        Raw games sharing the same date and the same raw away/home team names
        are treated as a doubleheader and numbered 1..n in input order.

        Args:
            raw_games: Games as produced by one of the source scrapers.

        Returns:
            ``(games, review_items)``: the successfully normalized games and
            every manual-review item raised while resolving them.
        """
        games: list[Game] = []
        review_items: list[ManualReviewItem] = []

        # Group by date + matchup for doubleheader detection
        games_by_matchup: dict[str, list[RawGameData]] = {}
        for raw in raw_games:
            date_key = raw.game_date.strftime("%Y%m%d")
            matchup_key = f"{date_key}_{raw.away_team_raw}_{raw.home_team_raw}"
            games_by_matchup.setdefault(matchup_key, []).append(raw)

        # Process games with doubleheader detection
        for matchup_games in games_by_matchup.values():
            is_doubleheader = len(matchup_games) > 1

            for index, raw in enumerate(matchup_games, start=1):
                game_number = index if is_doubleheader else None
                game, item_reviews = self._normalize_single_game(raw, game_number)

                if game:
                    games.append(game)
                    log_game(
                        self.sport,
                        game.id,
                        game.home_team_id,
                        game.away_team_id,
                        game.game_date.strftime("%Y-%m-%d"),
                        game.status,
                    )

                review_items.extend(item_reviews)

        return games, review_items

    def _default_stadium_id(self, team_canonical_id: str) -> Optional[str]:
        """Return the default NBA stadium ID for a team, or None if unknown.

        Prefers the stadium ID stored with the team in ``TEAM_MAPPINGS`` (when
        it is a known NBA stadium); otherwise falls back to the first stadium
        whose city appears in the team's city string.
        """
        abbrev = self._get_abbreviation(team_canonical_id).upper()
        nba_stadiums = STADIUM_MAPPINGS.get("nba", {})

        # TEAM_MAPPINGS values are (team_id, full_name, city, stadium_id)
        mapping = TEAM_MAPPINGS.get("nba", {}).get(abbrev)
        if mapping and mapping[3] in nba_stadiums:
            return mapping[3]

        # City match is ambiguous for cities with more than one arena, so it
        # is only used when the direct mapping is unavailable.
        team_info = self._team_resolver.get_team_info(abbrev)
        if team_info:
            # presumably team_info[2] is the team's city - verify against TeamResolver
            team_city = team_info[2].lower()
            for sid, sinfo in nba_stadiums.items():
                if sinfo.city.lower() in team_city:
                    return sid

        return None

    def _normalize_single_game(
        self,
        raw: RawGameData,
        game_number: Optional[int],
    ) -> tuple[Optional[Game], list[ManualReviewItem]]:
        """Normalize a single raw game.

        Args:
            raw: The scraped game.
            game_number: 1-based doubleheader index, or None for a lone game.

        Returns:
            ``(game, review_items)``. ``game`` is None when either team cannot
            be resolved; review items collected so far are still returned.
        """
        review_items: list[ManualReviewItem] = []
        check_date = raw.game_date.date()

        # Resolve home team
        home_result = self._team_resolver.resolve(
            raw.home_team_raw,
            check_date=check_date,
            source_url=raw.source_url,
        )
        if home_result.review_item:
            review_items.append(home_result.review_item)
        if not home_result.canonical_id:
            log_warning(f"Could not resolve home team: {raw.home_team_raw}")
            return None, review_items

        # Resolve away team
        away_result = self._team_resolver.resolve(
            raw.away_team_raw,
            check_date=check_date,
            source_url=raw.source_url,
        )
        if away_result.review_item:
            review_items.append(away_result.review_item)
        if not away_result.canonical_id:
            log_warning(f"Could not resolve away team: {raw.away_team_raw}")
            return None, review_items

        # Resolve stadium (optional - use home team's stadium if not found)
        stadium_id = None
        if raw.stadium_raw:
            stadium_result = self._stadium_resolver.resolve(
                raw.stadium_raw,
                check_date=check_date,
                source_url=raw.source_url,
            )
            if stadium_result.review_item:
                review_items.append(stadium_result.review_item)
            stadium_id = stadium_result.canonical_id

        # If no stadium found, use home team's default stadium
        if not stadium_id:
            stadium_id = self._default_stadium_id(home_result.canonical_id)

        # Get abbreviations for game ID
        home_abbrev = self._get_abbreviation(home_result.canonical_id)
        away_abbrev = self._get_abbreviation(away_result.canonical_id)

        # Generate canonical game ID
        game_id = generate_game_id(
            sport=self.sport,
            season=self.season,
            away_abbrev=away_abbrev,
            home_abbrev=home_abbrev,
            game_date=raw.game_date,
            game_number=game_number,
        )

        game = Game(
            id=game_id,
            sport=self.sport,
            season=self.season,
            home_team_id=home_result.canonical_id,
            away_team_id=away_result.canonical_id,
            stadium_id=stadium_id or "",  # empty string when no stadium is known
            game_date=raw.game_date,
            game_number=game_number,
            home_score=raw.home_score,
            away_score=raw.away_score,
            status=raw.status,
            source_url=raw.source_url,
            raw_home_team=raw.home_team_raw,
            raw_away_team=raw.away_team_raw,
            raw_stadium=raw.stadium_raw,
        )

        return game, review_items

    def _get_abbreviation(self, team_id: str) -> str:
        """Extract abbreviation from team ID (the part after the last "_")."""
        # team_nba_okc -> okc
        parts = team_id.split("_")
        return parts[-1] if parts else ""

    @staticmethod
    def _derive_team_name(full_name: str, city: str) -> str:
        """Return the nickname part of a team's full name.

        Strips the ``city`` prefix when ``full_name`` starts with it, which
        keeps multi-word nicknames intact ("Portland Trail Blazers" ->
        "Trail Blazers"). Otherwise falls back to the last word of the full
        name, or the full name itself when it is a single word.
        """
        prefix = f"{city} "
        if city and full_name.startswith(prefix):
            nickname = full_name[len(prefix):].strip()
            if nickname:
                return nickname

        parts = full_name.split()
        return parts[-1] if len(parts) >= 2 else full_name

    def scrape_teams(self) -> list[Team]:
        """Get all NBA teams from hardcoded mappings.

        Returns:
            One ``Team`` per distinct team ID in ``TEAM_MAPPINGS["nba"]``; when
            several abbreviations map to the same team ID, the first one
            encountered wins. Conference/division are None for abbreviations
            not listed in the division table below.
        """
        teams: list[Team] = []
        seen: set[str] = set()

        # NBA conference/division structure
        divisions = {
            "Atlantic": ("Eastern", ["BOS", "BKN", "NYK", "PHI", "TOR"]),
            "Central": ("Eastern", ["CHI", "CLE", "DET", "IND", "MIL"]),
            "Southeast": ("Eastern", ["ATL", "CHA", "MIA", "ORL", "WAS"]),
            "Northwest": ("Western", ["DEN", "MIN", "OKC", "POR", "UTA"]),
            "Pacific": ("Western", ["GSW", "LAC", "LAL", "PHX", "SAC"]),
            "Southwest": ("Western", ["DAL", "HOU", "MEM", "NOP", "SAS"]),
        }

        # Build reverse lookup: abbreviation -> (conference, division)
        team_divisions: dict[str, tuple[str, str]] = {
            abbrev: (conf, div)
            for div, (conf, abbrevs) in divisions.items()
            for abbrev in abbrevs
        }

        for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("nba", {}).items():
            if team_id in seen:
                continue
            seen.add(team_id)

            # Get conference and division
            conf, div = team_divisions.get(abbrev, (None, None))

            teams.append(
                Team(
                    id=team_id,
                    sport="nba",
                    city=city,
                    name=self._derive_team_name(full_name, city),
                    full_name=full_name,
                    abbreviation=abbrev,
                    conference=conf,
                    division=div,
                    stadium_id=stadium_id,
                )
            )

        return teams

    def scrape_stadiums(self) -> list[Stadium]:
        """Get all NBA stadiums from hardcoded mappings.

        Returns:
            One ``Stadium`` per entry in ``STADIUM_MAPPINGS["nba"]``; surface
            and roof type are fixed to "hardwood" and "dome" for every entry.
        """
        return [
            Stadium(
                id=stadium_id,
                sport="nba",
                name=info.name,
                city=info.city,
                state=info.state,
                country=info.country,
                latitude=info.latitude,
                longitude=info.longitude,
                surface="hardwood",
                roof_type="dome",
            )
            for stadium_id, info in STADIUM_MAPPINGS.get("nba", {}).items()
        ]


def create_nba_scraper(season: int) -> NBAScraper:
    """Factory function to create an NBA scraper.

    Args:
        season: Season start year (e.g., 2025 for 2025-26)
    """
    return NBAScraper(season=season)