Files
Sportstime/Scripts/sportstime_parser/scrapers/nba.py
Trey t 8ea3e6112a feat(scripts): complete data pipeline remediation
Scripts changes:
- Add WNBA abbreviation aliases to team_resolver.py
- Fix NHL stadium coordinates in stadium_resolver.py
- Add validate_aliases.py script for orphan detection
- Update scrapers with improved error handling
- Add DATA_AUDIT.md and REMEDIATION_PLAN.md documentation
- Update alias JSON files with new mappings

iOS bundle updates:
- Update games_canonical.json with latest scraped data
- Update teams_canonical.json and stadiums_canonical.json
- Sync alias files with Scripts versions

All 5 remediation phases complete.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:58:47 -06:00

662 lines
22 KiB
Python

"""NBA scraper implementation with multi-source fallback."""
from datetime import datetime, date, timezone
from typing import Optional
from bs4 import BeautifulSoup
import re
from .base import BaseScraper, RawGameData, ScrapeResult
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..normalizers.canonical_id import generate_game_id
from ..normalizers.team_resolver import (
TeamResolver,
TEAM_MAPPINGS,
get_team_resolver,
)
from ..normalizers.stadium_resolver import (
StadiumResolver,
STADIUM_MAPPINGS,
get_stadium_resolver,
)
from ..normalizers.timezone import parse_datetime
from ..utils.logging import get_logger, log_game, log_warning
# Month name to number mapping (lowercase full month names -> 1..12).
# NOTE(review): not referenced anywhere in this module's visible code;
# it may be imported by other modules - confirm before removing.
MONTH_MAP = {
    "january": 1, "february": 2, "march": 3, "april": 4,
    "may": 5, "june": 6, "july": 7, "august": 8,
    "september": 9, "october": 10, "november": 11, "december": 12,
}

# Basketball Reference month URLs: the site publishes one schedule page per
# month, and the NBA season runs October through June, in this order.
BR_MONTHS = [
    "october", "november", "december",
    "january", "february", "march", "april", "may", "june",
]
class NBAScraper(BaseScraper):
    """NBA schedule scraper with multi-source fallback.

    Sources (in priority order):
        1. Basketball-Reference - Most reliable, complete historical data
        2. ESPN API - Good for current/future seasons
        3. CBS Sports - Backup option (scraper not yet implemented)
    """
def __init__(self, season: int, **kwargs):
    """Set up an NBA scraper for one season.

    Args:
        season: Season start year (e.g. 2025 for the 2025-26 season).
        **kwargs: Forwarded unchanged to BaseScraper.
    """
    super().__init__("nba", season, **kwargs)
    # Resolvers map raw source strings onto canonical team/stadium IDs.
    self._stadium_resolver = get_stadium_resolver("nba")
    self._team_resolver = get_team_resolver("nba")
def _get_sources(self) -> list[str]:
"""Return source list in priority order."""
# CBS scraper not yet implemented - TODO for future
return ["basketball_reference", "espn"]
def _get_source_url(self, source: str, **kwargs) -> str:
"""Build URL for a source."""
if source == "basketball_reference":
month = kwargs.get("month", "october")
year = kwargs.get("year", self.season + 1)
return f"https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html"
elif source == "espn":
date_str = kwargs.get("date", "")
return f"https://site.api.espn.com/apis/site/v2/sports/basketball/nba/scoreboard?dates={date_str}"
elif source == "cbs":
return "https://www.cbssports.com/nba/schedule/"
raise ValueError(f"Unknown source: {source}")
def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
"""Scrape games from a specific source."""
if source == "basketball_reference":
return self._scrape_basketball_reference()
elif source == "espn":
return self._scrape_espn()
elif source == "cbs":
return self._scrape_cbs()
else:
raise ValueError(f"Unknown source: {source}")
def _scrape_basketball_reference(self) -> list[RawGameData]:
    """Scrape the whole season's schedule from Basketball-Reference.

    BR splits a season into one page per month:
    https://www.basketball-reference.com/leagues/NBA_YYYY_games-month.html
    where YYYY is the season's ending year. If the first three months
    (Oct/Nov/Dec) yield nothing at all, the season is assumed not to
    exist and the scrape stops early.
    """
    collected: list[RawGameData] = []
    end_year = self.season + 1
    empty_streak = 0
    for month in BR_MONTHS:
        url = self._get_source_url("basketball_reference", month=month, year=end_year)
        try:
            page = self.session.get_html(url)
            month_games = self._parse_basketball_reference(page, url)
        except Exception as e:
            # Some months may not exist (e.g., no games in August)
            self._logger.debug(f"No data for {month}: {e}")
            empty_streak += 1
        else:
            if month_games:
                collected.extend(month_games)
                empty_streak = 0
                self._logger.debug(f"Found {len(month_games)} games in {month}")
            else:
                empty_streak += 1
        # Three empty months with nothing collected => season doesn't exist.
        if empty_streak >= 3 and not collected:
            self._logger.info(f"No games found in first {empty_streak} months, season likely doesn't exist")
            break
    return collected
def _parse_basketball_reference(self, html: str, source_url: str) -> list[RawGameData]:
    """Parse one Basketball-Reference monthly schedule page.

    Relevant cells (keyed by their data-stat attribute):
        th date_game          - date, e.g. "Tue, Oct 22, 2024"
        td visitor_team_name  - away team
        td home_team_name     - home team
        td visitor_pts        - away score
        td home_pts           - home score
        td arena_name         - arena/stadium name
    """
    parsed: list[RawGameData] = []
    document = BeautifulSoup(html, "lxml")
    schedule = document.find("table", id="schedule")
    body = schedule.find("tbody") if schedule else None
    if body is None:
        return parsed
    for row in body.find_all("tr"):
        # BR repeats header rows mid-table; skip them.
        if row.get("class") and "thead" in row.get("class", []):
            continue
        try:
            entry = self._parse_br_row(row, source_url)
        except Exception as e:
            self._logger.debug(f"Failed to parse row: {e}")
            continue
        if entry:
            parsed.append(entry)
    return parsed
def _parse_br_row(
self,
row,
source_url: str,
) -> Optional[RawGameData]:
"""Parse a single Basketball-Reference table row."""
# Get date
date_cell = row.find("th", {"data-stat": "date_game"})
if not date_cell:
return None
date_text = date_cell.get_text(strip=True)
if not date_text:
return None
# Parse date (format: "Tue, Oct 22, 2024")
try:
game_date = datetime.strptime(date_text, "%a, %b %d, %Y")
except ValueError:
# Try alternative format
try:
game_date = datetime.strptime(date_text, "%B %d, %Y")
except ValueError:
self._logger.debug(f"Could not parse date: {date_text}")
return None
# Get teams
away_cell = row.find("td", {"data-stat": "visitor_team_name"})
home_cell = row.find("td", {"data-stat": "home_team_name"})
if not away_cell or not home_cell:
return None
away_team = away_cell.get_text(strip=True)
home_team = home_cell.get_text(strip=True)
if not away_team or not home_team:
return None
# Get scores (may be empty for future games)
away_score_cell = row.find("td", {"data-stat": "visitor_pts"})
home_score_cell = row.find("td", {"data-stat": "home_pts"})
away_score = None
home_score = None
if away_score_cell and away_score_cell.get_text(strip=True):
try:
away_score = int(away_score_cell.get_text(strip=True))
except ValueError:
pass
if home_score_cell and home_score_cell.get_text(strip=True):
try:
home_score = int(home_score_cell.get_text(strip=True))
except ValueError:
pass
# Get arena
arena_cell = row.find("td", {"data-stat": "arena_name"})
arena = arena_cell.get_text(strip=True) if arena_cell else None
# Determine status
status = "final" if home_score is not None else "scheduled"
# Check for postponed/cancelled
notes_cell = row.find("td", {"data-stat": "game_remarks"})
if notes_cell:
notes = notes_cell.get_text(strip=True).lower()
if "postponed" in notes:
status = "postponed"
elif "cancelled" in notes or "canceled" in notes:
status = "cancelled"
return RawGameData(
game_date=game_date,
home_team_raw=home_team,
away_team_raw=away_team,
stadium_raw=arena,
home_score=home_score,
away_score=away_score,
status=status,
source_url=source_url,
)
def _scrape_espn(self) -> list[RawGameData]:
    """Scrape the season day-by-day from the ESPN scoreboard API.

    The API is queried once per calendar day across the season's months.
    The scrape aborts early after 45 consecutive empty or failed days
    (roughly 1.5 months without any games).
    """
    collected: list[RawGameData] = []
    empty_streak = 0
    bail_threshold = 45  # ~1.5 months of consecutive no-game days
    for year, month in self._get_season_months():
        # Month length = first day of next month minus first day of this one.
        first_of_next = date(year + 1, 1, 1) if month == 12 else date(year, month + 1, 1)
        day_count = (first_of_next - date(year, month, 1)).days
        for day in range(1, day_count + 1):
            try:
                stamp = date(year, month, day).strftime("%Y%m%d")
                url = self._get_source_url("espn", date=stamp)
                payload = self.session.get_json(url)
                day_games = self._parse_espn_response(payload, url)
                if day_games:
                    collected.extend(day_games)
                    empty_streak = 0
                else:
                    empty_streak += 1
                # Bail early if no games found for a long stretch
                if empty_streak >= bail_threshold:
                    self._logger.info(f"No games found for {bail_threshold} consecutive days, stopping ESPN scrape")
                    return collected
            except Exception as e:
                self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
                empty_streak += 1
                if empty_streak >= bail_threshold:
                    self._logger.info("Too many consecutive failures, stopping ESPN scrape")
                    return collected
    return collected
def _parse_espn_response(
self,
data: dict,
source_url: str,
) -> list[RawGameData]:
"""Parse ESPN API response."""
games: list[RawGameData] = []
events = data.get("events", [])
for event in events:
try:
game = self._parse_espn_event(event, source_url)
if game:
games.append(game)
except Exception as e:
self._logger.debug(f"Failed to parse ESPN event: {e}")
continue
return games
def _parse_espn_event(
self,
event: dict,
source_url: str,
) -> Optional[RawGameData]:
"""Parse a single ESPN event."""
# Get date
date_str = event.get("date", "")
if not date_str:
return None
try:
# ESPN uses ISO format
game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
except ValueError:
return None
# Get competitions (usually just one)
competitions = event.get("competitions", [])
if not competitions:
return None
competition = competitions[0]
# Get teams
competitors = competition.get("competitors", [])
if len(competitors) != 2:
return None
home_team = None
away_team = None
home_score = None
away_score = None
for competitor in competitors:
team_info = competitor.get("team", {})
team_name = team_info.get("displayName", "")
is_home = competitor.get("homeAway") == "home"
score = competitor.get("score")
if score:
try:
score = int(score)
except (ValueError, TypeError):
score = None
if is_home:
home_team = team_name
home_score = score
else:
away_team = team_name
away_score = score
if not home_team or not away_team:
return None
# Get venue
venue = competition.get("venue", {})
arena = venue.get("fullName")
# Get status
status_info = competition.get("status", {})
status_type = status_info.get("type", {})
status_name = status_type.get("name", "").lower()
if status_name == "status_final":
status = "final"
elif status_name == "status_postponed":
status = "postponed"
elif status_name == "status_canceled":
status = "cancelled"
else:
status = "scheduled"
return RawGameData(
game_date=game_date,
home_team_raw=home_team,
away_team_raw=away_team,
stadium_raw=arena,
home_score=home_score,
away_score=away_score,
status=status,
source_url=source_url,
)
def _scrape_cbs(self) -> list[RawGameData]:
"""Scrape games from CBS Sports.
CBS Sports is a backup source with less structured data.
"""
# CBS Sports scraping would go here
# For now, return empty to fall back to other sources
raise NotImplementedError("CBS scraper not implemented")
def _normalize_games(
self,
raw_games: list[RawGameData],
) -> tuple[list[Game], list[ManualReviewItem]]:
"""Normalize raw games to Game objects with canonical IDs."""
games: list[Game] = []
review_items: list[ManualReviewItem] = []
# Track games by date for doubleheader detection
games_by_date: dict[str, list[RawGameData]] = {}
for raw in raw_games:
date_key = raw.game_date.strftime("%Y%m%d")
matchup_key = f"{date_key}_{raw.away_team_raw}_{raw.home_team_raw}"
if matchup_key not in games_by_date:
games_by_date[matchup_key] = []
games_by_date[matchup_key].append(raw)
# Process games with doubleheader detection
for matchup_key, matchup_games in games_by_date.items():
is_doubleheader = len(matchup_games) > 1
for i, raw in enumerate(matchup_games):
game_number = (i + 1) if is_doubleheader else None
game, item_reviews = self._normalize_single_game(raw, game_number)
if game:
games.append(game)
log_game(
self.sport,
game.id,
game.home_team_id,
game.away_team_id,
game.game_date.strftime("%Y-%m-%d"),
game.status,
)
review_items.extend(item_reviews)
return games, review_items
def _normalize_single_game(
    self,
    raw: RawGameData,
    game_number: Optional[int],
) -> tuple[Optional[Game], list[ManualReviewItem]]:
    """Normalize a single raw game.

    Resolves both teams and the stadium to canonical IDs, then builds a
    Game with a deterministic canonical ID. Team resolution is mandatory:
    if either side fails, (None, review_items) is returned. Stadium
    resolution is best-effort with a fallback to the home team's arena
    matched by city; an unresolved stadium yields stadium_id "".

    Args:
        raw: Scraped game data with source-specific team/stadium strings.
        game_number: 1-based game number for doubleheaders, else None.

    Returns:
        Tuple of (Game or None, manual-review items collected while
        resolving names).
    """
    review_items: list[ManualReviewItem] = []
    # Resolve home team
    home_result = self._team_resolver.resolve(
        raw.home_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )
    if home_result.review_item:
        review_items.append(home_result.review_item)
    if not home_result.canonical_id:
        log_warning(f"Could not resolve home team: {raw.home_team_raw}")
        return None, review_items
    # Resolve away team
    away_result = self._team_resolver.resolve(
        raw.away_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )
    if away_result.review_item:
        review_items.append(away_result.review_item)
    if not away_result.canonical_id:
        log_warning(f"Could not resolve away team: {raw.away_team_raw}")
        return None, review_items
    # Resolve stadium (optional - use home team's stadium if not found)
    stadium_id = None
    if raw.stadium_raw:
        stadium_result = self._stadium_resolver.resolve(
            raw.stadium_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )
        if stadium_result.review_item:
            review_items.append(stadium_result.review_item)
        stadium_id = stadium_result.canonical_id
    # If no stadium found, use home team's default stadium
    if not stadium_id:
        # Look up home team's stadium from mappings
        # (canonical IDs look like "team_nba_okc"; last segment is the
        # abbreviation, uppercased for the mapping lookup).
        home_abbrev = home_result.canonical_id.split("_")[-1].upper()
        team_info = self._team_resolver.get_team_info(home_abbrev)
        if team_info:
            # Try to find stadium by team's home arena
            for sid, sinfo in STADIUM_MAPPINGS.get("nba", {}).items():
                # Match by city
                # NOTE(review): team_info[2] is presumably the team's city,
                # matching the (team_id, full_name, city, stadium_id) tuple
                # layout used by TEAM_MAPPINGS in scrape_teams - confirm.
                if sinfo.city.lower() in team_info[2].lower():
                    stadium_id = sid
                    break
    # Get abbreviations for game ID
    home_abbrev = self._get_abbreviation(home_result.canonical_id)
    away_abbrev = self._get_abbreviation(away_result.canonical_id)
    # Generate canonical game ID
    game_id = generate_game_id(
        sport=self.sport,
        season=self.season,
        away_abbrev=away_abbrev,
        home_abbrev=home_abbrev,
        game_date=raw.game_date,
        game_number=game_number,
    )
    game = Game(
        id=game_id,
        sport=self.sport,
        season=self.season,
        home_team_id=home_result.canonical_id,
        away_team_id=away_result.canonical_id,
        stadium_id=stadium_id or "",  # empty string when no arena matched
        game_date=raw.game_date,
        game_number=game_number,
        home_score=raw.home_score,
        away_score=raw.away_score,
        status=raw.status,
        source_url=raw.source_url,
        raw_home_team=raw.home_team_raw,
        raw_away_team=raw.away_team_raw,
        raw_stadium=raw.stadium_raw,
    )
    return game, review_items
def _get_abbreviation(self, team_id: str) -> str:
"""Extract abbreviation from team ID."""
# team_nba_okc -> okc
parts = team_id.split("_")
return parts[-1] if parts else ""
def scrape_teams(self) -> list[Team]:
    """Get all NBA teams from hardcoded mappings.

    Emits one Team per unique canonical ID (multiple abbreviation aliases
    can map to one team), with conference/division filled in from the
    static NBA alignment below.

    Bug fix: the nickname is now derived by stripping the city prefix
    from the full name instead of taking the last word, so multi-word
    nicknames (e.g. Portland "Trail Blazers") are no longer truncated to
    their final word ("Blazers"). The old special cases (Oklahoma City,
    Golden State, etc.) all reduced to the last word and are subsumed.
    """
    teams: list[Team] = []
    seen: set[str] = set()
    # Static NBA conference/division alignment.
    divisions = {
        "Atlantic": ("Eastern", ["BOS", "BKN", "NYK", "PHI", "TOR"]),
        "Central": ("Eastern", ["CHI", "CLE", "DET", "IND", "MIL"]),
        "Southeast": ("Eastern", ["ATL", "CHA", "MIA", "ORL", "WAS"]),
        "Northwest": ("Western", ["DEN", "MIN", "OKC", "POR", "UTA"]),
        "Pacific": ("Western", ["GSW", "LAC", "LAL", "PHX", "SAC"]),
        "Southwest": ("Western", ["DAL", "HOU", "MEM", "NOP", "SAS"]),
    }
    # Reverse lookup: abbreviation -> (conference, division).
    team_divisions: dict[str, tuple[str, str]] = {}
    for div, (conf, abbrevs) in divisions.items():
        for abbrev in abbrevs:
            team_divisions[abbrev] = (conf, div)
    for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("nba", {}).items():
        if team_id in seen:
            continue
        seen.add(team_id)
        # Derive the nickname by removing the city prefix from the full
        # name ("Portland Trail Blazers" minus "Portland" -> "Trail Blazers").
        if full_name.startswith(city) and len(full_name) > len(city):
            team_name = full_name[len(city):].strip()
        else:
            # City isn't a literal prefix (e.g. "LA" vs "Los Angeles"):
            # fall back to the last word, matching the previous behavior.
            parts = full_name.split()
            team_name = parts[-1] if len(parts) >= 2 else full_name
        # Teams outside the static alignment get (None, None).
        conf, div = team_divisions.get(abbrev, (None, None))
        team = Team(
            id=team_id,
            sport="nba",
            city=city,
            name=team_name,
            full_name=full_name,
            abbreviation=abbrev,
            conference=conf,
            division=div,
            stadium_id=stadium_id,
        )
        teams.append(team)
    return teams
def scrape_stadiums(self) -> list[Stadium]:
    """Get all NBA stadiums from the hardcoded STADIUM_MAPPINGS.

    NOTE(review): every arena is emitted with surface="hardwood" and
    roof_type="dome"; "dome" may not describe all NBA arenas - confirm.
    """
    return [
        Stadium(
            id=stadium_id,
            sport="nba",
            name=info.name,
            city=info.city,
            state=info.state,
            country=info.country,
            latitude=info.latitude,
            longitude=info.longitude,
            surface="hardwood",
            roof_type="dome",
        )
        for stadium_id, info in STADIUM_MAPPINGS.get("nba", {}).items()
    ]
def create_nba_scraper(season: int) -> NBAScraper:
    """Factory: build an NBAScraper for the given season start year.

    Args:
        season: Season start year (e.g. 2025 for the 2025-26 season).
    """
    return NBAScraper(season=season)