Files
Sportstime/Scripts/sportstime_parser/scrapers/nba.py
Trey t 8ea3e6112a feat(scripts): complete data pipeline remediation
Scripts changes:
- Add WNBA abbreviation aliases to team_resolver.py
- Fix NHL stadium coordinates in stadium_resolver.py
- Add validate_aliases.py script for orphan detection
- Update scrapers with improved error handling
- Add DATA_AUDIT.md and REMEDIATION_PLAN.md documentation
- Update alias JSON files with new mappings

iOS bundle updates:
- Update games_canonical.json with latest scraped data
- Update teams_canonical.json and stadiums_canonical.json
- Sync alias files with Scripts versions

All 5 remediation phases complete.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:58:47 -06:00

662 lines
22 KiB
Python

"""NBA scraper implementation with multi-source fallback."""
from datetime import datetime, date, timezone
from typing import Optional
from bs4 import BeautifulSoup
import re
from .base import BaseScraper, RawGameData, ScrapeResult
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..normalizers.canonical_id import generate_game_id
from ..normalizers.team_resolver import (
TeamResolver,
TEAM_MAPPINGS,
get_team_resolver,
)
from ..normalizers.stadium_resolver import (
StadiumResolver,
STADIUM_MAPPINGS,
get_stadium_resolver,
)
from ..normalizers.timezone import parse_datetime
from ..utils.logging import get_logger, log_game, log_warning
# Month name to number mapping (lowercase full month names -> 1..12).
# NOTE(review): not referenced anywhere in this module's visible code;
# it may be imported by other modules - confirm before removing.
MONTH_MAP = {
    "january": 1, "february": 2, "march": 3, "april": 4,
    "may": 5, "june": 6, "july": 7, "august": 8,
    "september": 9, "october": 10, "november": 11, "december": 12,
}

# Basketball Reference month URLs: the site publishes one schedule page per
# month, and the NBA season runs October through June, in this order.
BR_MONTHS = [
    "october", "november", "december",
    "january", "february", "march", "april", "may", "june",
]
class NBAScraper(BaseScraper):
    """NBA schedule scraper with multi-source fallback.

    Sources (in priority order):
        1. Basketball-Reference - Most reliable, complete historical data
        2. ESPN API - Good for current/future seasons
        3. CBS Sports - Backup option (scraper not yet implemented)
    """
def __init__(self, season: int, **kwargs):
    """Set up an NBA scraper for one season.

    Args:
        season: Season start year (e.g. 2025 for the 2025-26 season).
        **kwargs: Forwarded unchanged to BaseScraper.
    """
    super().__init__("nba", season, **kwargs)
    # Resolvers map raw source strings onto canonical team/stadium IDs.
    self._stadium_resolver = get_stadium_resolver("nba")
    self._team_resolver = get_team_resolver("nba")
def _get_sources(self) -> list[str]:
"""Return source list in priority order."""
# CBS scraper not yet implemented - TODO for future
return ["basketball_reference", "espn"]
def _get_source_url(self, source: str, **kwargs) -> str:
"""Build URL for a source."""
if source == "basketball_reference":
month = kwargs.get("month", "october")
year = kwargs.get("year", self.season + 1)
return f"https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html"
elif source == "espn":
date_str = kwargs.get("date", "")
return f"https://site.api.espn.com/apis/site/v2/sports/basketball/nba/scoreboard?dates={date_str}"
elif source == "cbs":
return "https://www.cbssports.com/nba/schedule/"
raise ValueError(f"Unknown source: {source}")
def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
"""Scrape games from a specific source."""
if source == "basketball_reference":
return self._scrape_basketball_reference()
elif source == "espn":
return self._scrape_espn()
elif source == "cbs":
return self._scrape_cbs()
else:
raise ValueError(f"Unknown source: {source}")
def _scrape_basketball_reference(self) -> list[RawGameData]:
    """Scrape the whole season's schedule from Basketball-Reference.

    BR splits a season into one page per month:
    https://www.basketball-reference.com/leagues/NBA_YYYY_games-month.html
    where YYYY is the season's ending year. If the first three months
    (Oct/Nov/Dec) yield nothing at all, the season is assumed not to
    exist and the scrape stops early.
    """
    collected: list[RawGameData] = []
    end_year = self.season + 1
    empty_streak = 0
    for month in BR_MONTHS:
        url = self._get_source_url("basketball_reference", month=month, year=end_year)
        try:
            page = self.session.get_html(url)
            month_games = self._parse_basketball_reference(page, url)
        except Exception as e:
            # Some months may not exist (e.g., no games in August)
            self._logger.debug(f"No data for {month}: {e}")
            empty_streak += 1
        else:
            if month_games:
                collected.extend(month_games)
                empty_streak = 0
                self._logger.debug(f"Found {len(month_games)} games in {month}")
            else:
                empty_streak += 1
        # Three empty months with nothing collected => season doesn't exist.
        if empty_streak >= 3 and not collected:
            self._logger.info(f"No games found in first {empty_streak} months, season likely doesn't exist")
            break
    return collected
def _parse_basketball_reference(self, html: str, source_url: str) -> list[RawGameData]:
    """Parse one Basketball-Reference monthly schedule page.

    Relevant cells (keyed by their data-stat attribute):
        th date_game          - date, e.g. "Tue, Oct 22, 2024"
        td visitor_team_name  - away team
        td home_team_name     - home team
        td visitor_pts        - away score
        td home_pts           - home score
        td arena_name         - arena/stadium name
    """
    parsed: list[RawGameData] = []
    document = BeautifulSoup(html, "lxml")
    schedule = document.find("table", id="schedule")
    body = schedule.find("tbody") if schedule else None
    if body is None:
        return parsed
    for row in body.find_all("tr"):
        # BR repeats header rows mid-table; skip them.
        if row.get("class") and "thead" in row.get("class", []):
            continue
        try:
            entry = self._parse_br_row(row, source_url)
        except Exception as e:
            self._logger.debug(f"Failed to parse row: {e}")
            continue
        if entry:
            parsed.append(entry)
    return parsed
def _parse_br_row(
self,
row,
source_url: str,
) -> Optional[RawGameData]:
"""Parse a single Basketball-Reference table row."""
# Get date
date_cell = row.find("th", {"data-stat": "date_game"})
if not date_cell:
return None
date_text = date_cell.get_text(strip=True)
if not date_text:
return None
# Parse date (format: "Tue, Oct 22, 2024")
try:
game_date = datetime.strptime(date_text, "%a, %b %d, %Y")
except ValueError:
# Try alternative format
try:
game_date = datetime.strptime(date_text, "%B %d, %Y")
except ValueError:
self._logger.debug(f"Could not parse date: {date_text}")
return None
# Get teams
away_cell = row.find("td", {"data-stat": "visitor_team_name"})
home_cell = row.find("td", {"data-stat": "home_team_name"})
if not away_cell or not home_cell:
return None
away_team = away_cell.get_text(strip=True)
home_team = home_cell.get_text(strip=True)
if not away_team or not home_team:
return None
# Get scores (may be empty for future games)
away_score_cell = row.find("td", {"data-stat": "visitor_pts"})
home_score_cell = row.find("td", {"data-stat": "home_pts"})
away_score = None
home_score = None
if away_score_cell and away_score_cell.get_text(strip=True):
try:
away_score = int(away_score_cell.get_text(strip=True))
except ValueError:
pass
if home_score_cell and home_score_cell.get_text(strip=True):
try:
home_score = int(home_score_cell.get_text(strip=True))
except ValueError:
pass
# Get arena
arena_cell = row.find("td", {"data-stat": "arena_name"})
arena = arena_cell.get_text(strip=True) if arena_cell else None
# Determine status
status = "final" if home_score is not None else "scheduled"
# Check for postponed/cancelled
notes_cell = row.find("td", {"data-stat": "game_remarks"})
if notes_cell:
notes = notes_cell.get_text(strip=True).lower()
if "postponed" in notes:
status = "postponed"
elif "cancelled" in notes or "canceled" in notes:
status = "cancelled"
return RawGameData(
game_date=game_date,
home_team_raw=home_team,
away_team_raw=away_team,
stadium_raw=arena,
home_score=home_score,
away_score=away_score,
status=status,
source_url=source_url,
)
def _scrape_espn(self) -> list[RawGameData]:
    """Scrape the season day-by-day from the ESPN scoreboard API.

    The API is queried once per calendar day across the season's months.
    The scrape aborts early after 45 consecutive empty or failed days
    (roughly 1.5 months without any games).
    """
    collected: list[RawGameData] = []
    empty_streak = 0
    bail_threshold = 45  # ~1.5 months of consecutive no-game days
    for year, month in self._get_season_months():
        # Month length = first day of next month minus first day of this one.
        first_of_next = date(year + 1, 1, 1) if month == 12 else date(year, month + 1, 1)
        day_count = (first_of_next - date(year, month, 1)).days
        for day in range(1, day_count + 1):
            try:
                stamp = date(year, month, day).strftime("%Y%m%d")
                url = self._get_source_url("espn", date=stamp)
                payload = self.session.get_json(url)
                day_games = self._parse_espn_response(payload, url)
                if day_games:
                    collected.extend(day_games)
                    empty_streak = 0
                else:
                    empty_streak += 1
                # Bail early if no games found for a long stretch
                if empty_streak >= bail_threshold:
                    self._logger.info(f"No games found for {bail_threshold} consecutive days, stopping ESPN scrape")
                    return collected
            except Exception as e:
                self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
                empty_streak += 1
                if empty_streak >= bail_threshold:
                    self._logger.info("Too many consecutive failures, stopping ESPN scrape")
                    return collected
    return collected
def _parse_espn_response(
self,
data: dict,
source_url: str,
) -> list[RawGameData]:
"""Parse ESPN API response."""
games: list[RawGameData] = []
events = data.get("events", [])
for event in events:
try:
game = self._parse_espn_event(event, source_url)
if game:
games.append(game)
except Exception as e:
self._logger.debug(f"Failed to parse ESPN event: {e}")
continue
return games
def _parse_espn_event(
self,
event: dict,
source_url: str,
) -> Optional[RawGameData]:
"""Parse a single ESPN event."""
# Get date
date_str = event.get("date", "")
if not date_str:
return None
try:
# ESPN uses ISO format
game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
except ValueError:
return None
# Get competitions (usually just one)
competitions = event.get("competitions", [])
if not competitions:
return None
competition = competitions[0]
# Get teams
competitors = competition.get("competitors", [])
if len(competitors) != 2:
return None
home_team = None
away_team = None
home_score = None
away_score = None
for competitor in competitors:
team_info = competitor.get("team", {})
team_name = team_info.get("displayName", "")
is_home = competitor.get("homeAway") == "home"
score = competitor.get("score")
if score:
try:
score = int(score)
except (ValueError, TypeError):
score = None
if is_home:
home_team = team_name
home_score = score
else:
away_team = team_name
away_score = score
if not home_team or not away_team:
return None
# Get venue
venue = competition.get("venue", {})
arena = venue.get("fullName")
# Get status
status_info = competition.get("status", {})
status_type = status_info.get("type", {})
status_name = status_type.get("name", "").lower()
if status_name == "status_final":
status = "final"
elif status_name == "status_postponed":
status = "postponed"
elif status_name == "status_canceled":
status = "cancelled"
else:
status = "scheduled"
return RawGameData(
game_date=game_date,
home_team_raw=home_team,
away_team_raw=away_team,
stadium_raw=arena,
home_score=home_score,
away_score=away_score,
status=status,
source_url=source_url,
)
def _scrape_cbs(self) -> list[RawGameData]:
"""Scrape games from CBS Sports.
CBS Sports is a backup source with less structured data.
"""
# CBS Sports scraping would go here
# For now, return empty to fall back to other sources
raise NotImplementedError("CBS scraper not implemented")
def _normalize_games(
self,
raw_games: list[RawGameData],
) -> tuple[list[Game], list[ManualReviewItem]]:
"""Normalize raw games to Game objects with canonical IDs."""
games: list[Game] = []
review_items: list[ManualReviewItem] = []
# Track games by date for doubleheader detection
games_by_date: dict[str, list[RawGameData]] = {}
for raw in raw_games:
date_key = raw.game_date.strftime("%Y%m%d")
matchup_key = f"{date_key}_{raw.away_team_raw}_{raw.home_team_raw}"
if matchup_key not in games_by_date:
games_by_date[matchup_key] = []
games_by_date[matchup_key].append(raw)
# Process games with doubleheader detection
for matchup_key, matchup_games in games_by_date.items():
is_doubleheader = len(matchup_games) > 1
for i, raw in enumerate(matchup_games):
game_number = (i + 1) if is_doubleheader else None
game, item_reviews = self._normalize_single_game(raw, game_number)
if game:
games.append(game)
log_game(
self.sport,
game.id,
game.home_team_id,
game.away_team_id,
game.game_date.strftime("%Y-%m-%d"),
game.status,
)
review_items.extend(item_reviews)
return games, review_items
def _normalize_single_game(
    self,
    raw: RawGameData,
    game_number: Optional[int],
) -> tuple[Optional[Game], list[ManualReviewItem]]:
    """Normalize a single raw game.

    Resolves both teams and the stadium to canonical IDs, then builds a
    Game with a deterministic canonical ID. Team resolution is mandatory:
    if either side fails, (None, review_items) is returned. Stadium
    resolution is best-effort with a fallback to the home team's arena
    matched by city; an unresolved stadium yields stadium_id "".

    Args:
        raw: Scraped game data with source-specific team/stadium strings.
        game_number: 1-based game number for doubleheaders, else None.

    Returns:
        Tuple of (Game or None, manual-review items collected while
        resolving names).
    """
    review_items: list[ManualReviewItem] = []
    # Resolve home team
    home_result = self._team_resolver.resolve(
        raw.home_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )
    if home_result.review_item:
        review_items.append(home_result.review_item)
    if not home_result.canonical_id:
        log_warning(f"Could not resolve home team: {raw.home_team_raw}")
        return None, review_items
    # Resolve away team
    away_result = self._team_resolver.resolve(
        raw.away_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )
    if away_result.review_item:
        review_items.append(away_result.review_item)
    if not away_result.canonical_id:
        log_warning(f"Could not resolve away team: {raw.away_team_raw}")
        return None, review_items
    # Resolve stadium (optional - use home team's stadium if not found)
    stadium_id = None
    if raw.stadium_raw:
        stadium_result = self._stadium_resolver.resolve(
            raw.stadium_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )
        if stadium_result.review_item:
            review_items.append(stadium_result.review_item)
        stadium_id = stadium_result.canonical_id
    # If no stadium found, use home team's default stadium
    if not stadium_id:
        # Look up home team's stadium from mappings
        # (canonical IDs look like "team_nba_okc"; last segment is the
        # abbreviation, uppercased for the mapping lookup).
        home_abbrev = home_result.canonical_id.split("_")[-1].upper()
        team_info = self._team_resolver.get_team_info(home_abbrev)
        if team_info:
            # Try to find stadium by team's home arena
            for sid, sinfo in STADIUM_MAPPINGS.get("nba", {}).items():
                # Match by city
                # NOTE(review): team_info[2] is presumably the team's city,
                # matching the (team_id, full_name, city, stadium_id) tuple
                # layout used by TEAM_MAPPINGS in scrape_teams - confirm.
                if sinfo.city.lower() in team_info[2].lower():
                    stadium_id = sid
                    break
    # Get abbreviations for game ID
    home_abbrev = self._get_abbreviation(home_result.canonical_id)
    away_abbrev = self._get_abbreviation(away_result.canonical_id)
    # Generate canonical game ID
    game_id = generate_game_id(
        sport=self.sport,
        season=self.season,
        away_abbrev=away_abbrev,
        home_abbrev=home_abbrev,
        game_date=raw.game_date,
        game_number=game_number,
    )
    game = Game(
        id=game_id,
        sport=self.sport,
        season=self.season,
        home_team_id=home_result.canonical_id,
        away_team_id=away_result.canonical_id,
        stadium_id=stadium_id or "",  # empty string when no arena matched
        game_date=raw.game_date,
        game_number=game_number,
        home_score=raw.home_score,
        away_score=raw.away_score,
        status=raw.status,
        source_url=raw.source_url,
        raw_home_team=raw.home_team_raw,
        raw_away_team=raw.away_team_raw,
        raw_stadium=raw.stadium_raw,
    )
    return game, review_items
def _get_abbreviation(self, team_id: str) -> str:
"""Extract abbreviation from team ID."""
# team_nba_okc -> okc
parts = team_id.split("_")
return parts[-1] if parts else ""
def scrape_teams(self) -> list[Team]:
    """Get all NBA teams from hardcoded mappings.

    Emits one Team per unique canonical ID (multiple abbreviation aliases
    can map to one team), with conference/division filled in from the
    static NBA alignment below.

    Bug fix: the nickname is now derived by stripping the city prefix
    from the full name instead of taking the last word, so multi-word
    nicknames (e.g. Portland "Trail Blazers") are no longer truncated to
    their final word ("Blazers"). The old special cases (Oklahoma City,
    Golden State, etc.) all reduced to the last word and are subsumed.
    """
    teams: list[Team] = []
    seen: set[str] = set()
    # Static NBA conference/division alignment.
    divisions = {
        "Atlantic": ("Eastern", ["BOS", "BKN", "NYK", "PHI", "TOR"]),
        "Central": ("Eastern", ["CHI", "CLE", "DET", "IND", "MIL"]),
        "Southeast": ("Eastern", ["ATL", "CHA", "MIA", "ORL", "WAS"]),
        "Northwest": ("Western", ["DEN", "MIN", "OKC", "POR", "UTA"]),
        "Pacific": ("Western", ["GSW", "LAC", "LAL", "PHX", "SAC"]),
        "Southwest": ("Western", ["DAL", "HOU", "MEM", "NOP", "SAS"]),
    }
    # Reverse lookup: abbreviation -> (conference, division).
    team_divisions: dict[str, tuple[str, str]] = {}
    for div, (conf, abbrevs) in divisions.items():
        for abbrev in abbrevs:
            team_divisions[abbrev] = (conf, div)
    for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("nba", {}).items():
        if team_id in seen:
            continue
        seen.add(team_id)
        # Derive the nickname by removing the city prefix from the full
        # name ("Portland Trail Blazers" minus "Portland" -> "Trail Blazers").
        if full_name.startswith(city) and len(full_name) > len(city):
            team_name = full_name[len(city):].strip()
        else:
            # City isn't a literal prefix (e.g. "LA" vs "Los Angeles"):
            # fall back to the last word, matching the previous behavior.
            parts = full_name.split()
            team_name = parts[-1] if len(parts) >= 2 else full_name
        # Teams outside the static alignment get (None, None).
        conf, div = team_divisions.get(abbrev, (None, None))
        team = Team(
            id=team_id,
            sport="nba",
            city=city,
            name=team_name,
            full_name=full_name,
            abbreviation=abbrev,
            conference=conf,
            division=div,
            stadium_id=stadium_id,
        )
        teams.append(team)
    return teams
def scrape_stadiums(self) -> list[Stadium]:
    """Get all NBA stadiums from the hardcoded STADIUM_MAPPINGS.

    NOTE(review): every arena is emitted with surface="hardwood" and
    roof_type="dome"; "dome" may not describe all NBA arenas - confirm.
    """
    return [
        Stadium(
            id=stadium_id,
            sport="nba",
            name=info.name,
            city=info.city,
            state=info.state,
            country=info.country,
            latitude=info.latitude,
            longitude=info.longitude,
            surface="hardwood",
            roof_type="dome",
        )
        for stadium_id, info in STADIUM_MAPPINGS.get("nba", {}).items()
    ]
def create_nba_scraper(season: int) -> NBAScraper:
    """Factory: build an NBAScraper for the given season start year.

    Args:
        season: Season start year (e.g. 2025 for the 2025-26 season).
    """
    return NBAScraper(season=season)