Added year_opened field to stadium data across all 4 sport modules: - MLB: 30 ballparks (1912-2023) - NBA: 30 arenas (1968-2024) - NHL: 32 arenas (1968-2021) - NFL: 30 stadiums (1924-2020) Updated Stadium object creation in all modules to pass year_opened. Stadium dataclass already supported the field. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
413 lines
18 KiB
Python
413 lines
18 KiB
Python
#!/usr/bin/env python3
|
|
"""
NBA schedule and stadium scrapers for SportsTime.

This module provides:
- NBA game scrapers (Basketball-Reference, ESPN, CBS Sports)
- NBA stadium scrapers (hardcoded with coordinates)
- Multi-source fallback configurations
"""
|
|
|
|
from datetime import datetime, timedelta
|
|
from typing import Optional
|
|
|
|
import requests
|
|
|
|
# Support both direct execution and import from parent directory
|
|
try:
|
|
from core import (
|
|
Game,
|
|
Stadium,
|
|
ScraperSource,
|
|
StadiumScraperSource,
|
|
fetch_page,
|
|
scrape_with_fallback,
|
|
scrape_stadiums_with_fallback,
|
|
)
|
|
except ImportError:
|
|
from Scripts.core import (
|
|
Game,
|
|
Stadium,
|
|
ScraperSource,
|
|
StadiumScraperSource,
|
|
fetch_page,
|
|
scrape_with_fallback,
|
|
scrape_stadiums_with_fallback,
|
|
)
|
|
|
|
|
|
# Names exported by `from <this module> import *`, grouped by role.
__all__ = [
    'NBA_TEAMS',                          # team data
    'scrape_nba_basketball_reference',    # game scrapers
    'scrape_nba_espn',
    'scrape_nba_cbssports',
    'scrape_nba_stadiums',                # stadium scrapers
    'NBA_GAME_SOURCES',                   # source configurations
    'NBA_STADIUM_SOURCES',
    'scrape_nba_games',                   # convenience functions
    'get_nba_season_string',
]
|
|
|
|
|
|
# =============================================================================
|
|
# TEAM MAPPINGS
|
|
# =============================================================================
|
|
|
|
# Team abbreviation -> {'name', 'city', 'arena'}.
# Each row below is (abbreviation, team name, city, home arena); dict insertion
# order follows row order.
NBA_TEAMS = {
    abbrev: {'name': team_name, 'city': city, 'arena': arena}
    for abbrev, team_name, city, arena in (
        ('ATL', 'Atlanta Hawks', 'Atlanta', 'State Farm Arena'),
        ('BOS', 'Boston Celtics', 'Boston', 'TD Garden'),
        ('BRK', 'Brooklyn Nets', 'Brooklyn', 'Barclays Center'),
        ('CHO', 'Charlotte Hornets', 'Charlotte', 'Spectrum Center'),
        ('CHI', 'Chicago Bulls', 'Chicago', 'United Center'),
        ('CLE', 'Cleveland Cavaliers', 'Cleveland', 'Rocket Mortgage FieldHouse'),
        ('DAL', 'Dallas Mavericks', 'Dallas', 'American Airlines Center'),
        ('DEN', 'Denver Nuggets', 'Denver', 'Ball Arena'),
        ('DET', 'Detroit Pistons', 'Detroit', 'Little Caesars Arena'),
        ('GSW', 'Golden State Warriors', 'San Francisco', 'Chase Center'),
        ('HOU', 'Houston Rockets', 'Houston', 'Toyota Center'),
        ('IND', 'Indiana Pacers', 'Indianapolis', 'Gainbridge Fieldhouse'),
        ('LAC', 'Los Angeles Clippers', 'Inglewood', 'Intuit Dome'),
        ('LAL', 'Los Angeles Lakers', 'Los Angeles', 'Crypto.com Arena'),
        ('MEM', 'Memphis Grizzlies', 'Memphis', 'FedExForum'),
        ('MIA', 'Miami Heat', 'Miami', 'Kaseya Center'),
        ('MIL', 'Milwaukee Bucks', 'Milwaukee', 'Fiserv Forum'),
        ('MIN', 'Minnesota Timberwolves', 'Minneapolis', 'Target Center'),
        ('NOP', 'New Orleans Pelicans', 'New Orleans', 'Smoothie King Center'),
        ('NYK', 'New York Knicks', 'New York', 'Madison Square Garden'),
        ('OKC', 'Oklahoma City Thunder', 'Oklahoma City', 'Paycom Center'),
        ('ORL', 'Orlando Magic', 'Orlando', 'Kia Center'),
        ('PHI', 'Philadelphia 76ers', 'Philadelphia', 'Wells Fargo Center'),
        ('PHO', 'Phoenix Suns', 'Phoenix', 'Footprint Center'),
        ('POR', 'Portland Trail Blazers', 'Portland', 'Moda Center'),
        ('SAC', 'Sacramento Kings', 'Sacramento', 'Golden 1 Center'),
        ('SAS', 'San Antonio Spurs', 'San Antonio', 'Frost Bank Center'),
        ('TOR', 'Toronto Raptors', 'Toronto', 'Scotiabank Arena'),
        ('UTA', 'Utah Jazz', 'Salt Lake City', 'Delta Center'),
        ('WAS', 'Washington Wizards', 'Washington', 'Capital One Arena'),
    )
}
|
|
|
|
|
|
def get_nba_team_abbrev(team_name: str) -> str:
    """
    Get NBA team abbreviation from a full or partial team name.

    Lookup order (all comparisons are case-insensitive and ignore
    leading/trailing whitespace):
      1. exact match against a team's full name in NBA_TEAMS;
      2. first team (in NBA_TEAMS order) whose full name contains the input;
      3. fallback: the first three characters of the input, upper-cased.

    Args:
        team_name: Team name as scraped, e.g. "Boston Celtics" or "Celtics".

    Returns:
        The NBA_TEAMS key for the matched team, or the 3-character fallback
        (an empty string when the input is empty or whitespace only).
    """
    cleaned = team_name.strip()
    query = cleaned.lower()

    # An empty string is a substring of every string, so without this guard
    # a blank name would "match" the first team in NBA_TEAMS.
    if not query:
        return ''

    # Exact matches take priority over any substring match.
    for abbrev, info in NBA_TEAMS.items():
        if info['name'].lower() == query:
            return abbrev

    for abbrev, info in NBA_TEAMS.items():
        if query in info['name'].lower():
            return abbrev

    # Return first 3 letters as fallback
    return cleaned[:3].upper()
|
|
|
|
|
|
def get_nba_season_string(season: int) -> str:
    """
    Format a season as "<start year>-<end year suffix>", e.g. "2024-25".

    Args:
        season: The ending year of the season (e.g., 2025 for 2024-25 season)

    Returns:
        Season string like "2024-25"
    """
    start_year = season - 1
    # Everything after the first two characters of the ending year
    # ("25" for 2025).
    end_suffix = str(season)[2:]
    return "{}-{}".format(start_year, end_suffix)
|
|
|
|
|
|
# =============================================================================
|
|
# GAME SCRAPERS
|
|
# =============================================================================
|
|
|
|
def _parse_basketball_reference_row(row, season: int) -> Optional[Game]:
    """
    Build a Game from one row of a Basketball-Reference schedule table.

    Args:
        row: A <tr> element from the schedule table body.
        season: Season ending year (e.g., 2025 for 2024-25 season).

    Returns:
        A Game, or None when the row lacks a date cell, lacks either team
        cell, or has a date that does not match the '%a, %b %d, %Y' format.
    """
    # Parse date
    date_cell = row.find('th', {'data-stat': 'date_game'})
    if not date_cell:
        return None
    date_link = date_cell.find('a')
    date_str = date_link.text if date_link else date_cell.text

    # Parse time
    time_cell = row.find('td', {'data-stat': 'game_start_time'})
    time_str = time_cell.text.strip() if time_cell else None

    # Parse teams
    visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
    home_cell = row.find('td', {'data-stat': 'home_team_name'})
    if not visitor_cell or not home_cell:
        return None

    visitor_link = visitor_cell.find('a')
    home_link = home_cell.find('a')
    # Strip so stray whitespace in the markup cannot defeat the
    # abbreviation lookup or leak into the stored team names.
    away_team = (visitor_link.text if visitor_link else visitor_cell.text).strip()
    home_team = (home_link.text if home_link else home_cell.text).strip()

    # Parse arena
    arena_cell = row.find('td', {'data-stat': 'arena_name'})
    arena = arena_cell.text.strip() if arena_cell else ''

    # Convert date; rows whose date text is not in the expected format are
    # skipped. Only ValueError is caught so that KeyboardInterrupt and
    # SystemExit are never swallowed here.
    try:
        parsed_date = datetime.strptime(date_str.strip(), '%a, %b %d, %Y')
    except ValueError:
        return None
    date_formatted = parsed_date.strftime('%Y-%m-%d')

    # Generate game ID
    away_abbrev = get_nba_team_abbrev(away_team)
    home_abbrev = get_nba_team_abbrev(home_team)
    game_id = f"nba_{date_formatted}_{away_abbrev}_{home_abbrev}".lower().replace(' ', '')

    return Game(
        id=game_id,
        sport='NBA',
        season=get_nba_season_string(season),
        date=date_formatted,
        time=time_str,
        home_team=home_team,
        away_team=away_team,
        home_team_abbrev=home_abbrev,
        away_team_abbrev=away_abbrev,
        venue=arena,
        source='basketball-reference.com'
    )


def scrape_nba_basketball_reference(season: int) -> list[Game]:
    """
    Scrape NBA schedule from Basketball-Reference.

    URL: https://www.basketball-reference.com/leagues/NBA_{YEAR}_games-{month}.html
    Season year is the ending year (e.g., 2025 for 2024-25 season)

    One page is requested per month from October through June. Months whose
    page cannot be fetched, or that have no 'schedule' table, are skipped.
    Rows that fail to parse are reported on stdout and skipped.

    Args:
        season: Season ending year.

    Returns:
        List of Game objects found across all month pages (possibly empty).
    """
    games = []
    months = ['october', 'november', 'december', 'january', 'february', 'march', 'april', 'may', 'june']

    print(f"Scraping NBA {season} from Basketball-Reference...")

    for month in months:
        url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html"
        soup = fetch_page(url, 'basketball-reference.com')
        if not soup:
            continue

        table = soup.find('table', {'id': 'schedule'})
        if not table:
            continue

        tbody = table.find('tbody')
        if not tbody:
            continue

        for row in tbody.find_all('tr'):
            # Skip header rows repeated inside the table body.
            row_classes = row.get('class')
            if row_classes and 'thead' in row_classes:
                continue

            if len(row.find_all(['td', 'th'])) < 6:
                continue

            # Best effort: one malformed row must not abort the whole scrape.
            try:
                game = _parse_basketball_reference_row(row, season)
            except Exception as e:
                print(f" Error parsing row: {e}")
                continue

            if game is not None:
                games.append(game)

    print(f" Found {len(games)} games from Basketball-Reference")
    return games
|
|
|
|
|
|
def scrape_nba_espn(season: int) -> list[Game]:
    """
    Scrape NBA schedule from ESPN.

    URL: https://www.espn.com/nba/schedule/_/date/{YYYYMMDD}

    NOTE(review): this is a stub. Each weekly schedule page is fetched but
    never parsed, so the returned list is always empty.

    Args:
        season: Season ending year (e.g., 2025 for 2024-25 season).

    Returns:
        List of Game objects (currently always empty).
    """
    games = []
    print(f"Scraping NBA {season} from ESPN...")

    # Season window: October 1 of the previous year through June 30.
    first_day = datetime(season - 1, 10, 1)
    last_day = datetime(season, 6, 30)
    span_days = (last_day - first_day).days

    # Sample one page per week to respect rate limits.
    for offset in range(0, span_days + 1, 7):
        day = first_day + timedelta(days=offset)
        url = "https://www.espn.com/nba/schedule/_/date/" + day.strftime('%Y%m%d')

        page = fetch_page(url, 'espn.com')
        if not page:
            continue
        # ESPN uses JavaScript rendering, so only the statically served
        # markup is available here; a full implementation would need
        # Selenium. Nothing is extracted from the page yet.

    print(f" Found {len(games)} games from ESPN")
    return games
|
|
|
|
|
|
def scrape_nba_cbssports(season: int) -> list[Game]:
    """
    Scrape NBA games from the CBS Sports schedule page (HTML tables).

    The same fixed URL is requested regardless of `season`; `season` is only
    used for the progress message and the season label stored on each Game.

    NOTE(review): game dates are a placeholder (the date the scrape runs),
    not the date shown on the page, so every returned game carries the same
    date and repeated matchups share a game ID. Real date extraction from
    the table sections is still to be implemented.

    Args:
        season: Season ending year (e.g., 2025 for 2024-25 season).

    Returns:
        List of Game objects; empty if the page could not be fetched.
    """
    games = []
    print(f"Fetching NBA {season} from CBS Sports...")

    # CBS Sports has a schedule endpoint
    url = "https://www.cbssports.com/nba/schedule/"

    soup = fetch_page(url, 'cbssports.com')
    if not soup:
        return games

    # Placeholder date, computed once so every row of this run gets the same
    # value even if the scrape crosses midnight.
    date_formatted = datetime.now().strftime('%Y-%m-%d')

    # Find all game rows
    for table in soup.find_all('table', class_='TableBase-table'):
        for row in table.find_all('tr'):
            try:
                cells = row.find_all('td')
                if len(cells) < 2:
                    continue

                # Parse teams from row: first link is the away team, second
                # is the home team.
                team_cells = row.find_all('a', class_='TeamName')
                if len(team_cells) < 2:
                    continue

                away_team = team_cells[0].get_text(strip=True)
                home_team = team_cells[1].get_text(strip=True)

                away_abbrev = get_nba_team_abbrev(away_team)
                home_abbrev = get_nba_team_abbrev(home_team)
                game_id = f"nba_{date_formatted}_{away_abbrev}_{home_abbrev}".lower().replace(' ', '')

                game = Game(
                    id=game_id,
                    sport='NBA',
                    season=get_nba_season_string(season),
                    date=date_formatted,
                    time=None,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=home_abbrev,
                    away_team_abbrev=away_abbrev,
                    venue='',
                    source='cbssports.com'
                )
                games.append(game)

            except Exception as e:
                # Best effort: report the bad row (as the Basketball-Reference
                # scraper does) instead of dropping it silently.
                print(f" Error parsing row: {e}")
                continue

    print(f" Found {len(games)} games from CBS Sports")
    return games
|
|
|
|
|
|
# =============================================================================
|
|
# STADIUM SCRAPERS
|
|
# =============================================================================
|
|
|
|
def scrape_nba_stadiums() -> list[Stadium]:
    """
    Return NBA arena data from a hardcoded table (no network access).

    Each arena entry carries city, state/province code, latitude/longitude,
    capacity, the abbreviations of the teams that play there, and the year
    the building opened.

    Returns:
        One Stadium per entry in the table, in table order. The Stadium id
        is "nba_" plus the lower-cased arena name with spaces replaced by
        underscores, truncated to 30 characters.
    """
    print("\nNBA STADIUMS")
    print("-" * 40)
    print(" Loading NBA arenas...")

    nba_arenas = {
        'State Farm Arena': {'city': 'Atlanta', 'state': 'GA', 'lat': 33.7573, 'lng': -84.3963, 'capacity': 18118, 'teams': ['ATL'], 'year_opened': 1999},
        'TD Garden': {'city': 'Boston', 'state': 'MA', 'lat': 42.3662, 'lng': -71.0621, 'capacity': 19156, 'teams': ['BOS'], 'year_opened': 1995},
        'Barclays Center': {'city': 'Brooklyn', 'state': 'NY', 'lat': 40.6826, 'lng': -73.9754, 'capacity': 17732, 'teams': ['BRK'], 'year_opened': 2012},
        'Spectrum Center': {'city': 'Charlotte', 'state': 'NC', 'lat': 35.2251, 'lng': -80.8392, 'capacity': 19077, 'teams': ['CHO'], 'year_opened': 2005},
        'United Center': {'city': 'Chicago', 'state': 'IL', 'lat': 41.8807, 'lng': -87.6742, 'capacity': 20917, 'teams': ['CHI'], 'year_opened': 1994},
        'Rocket Mortgage FieldHouse': {'city': 'Cleveland', 'state': 'OH', 'lat': 41.4965, 'lng': -81.6882, 'capacity': 19432, 'teams': ['CLE'], 'year_opened': 1994},
        'American Airlines Center': {'city': 'Dallas', 'state': 'TX', 'lat': 32.7905, 'lng': -96.8103, 'capacity': 19200, 'teams': ['DAL'], 'year_opened': 2001},
        'Ball Arena': {'city': 'Denver', 'state': 'CO', 'lat': 39.7487, 'lng': -105.0077, 'capacity': 19520, 'teams': ['DEN'], 'year_opened': 1999},
        'Little Caesars Arena': {'city': 'Detroit', 'state': 'MI', 'lat': 42.3411, 'lng': -83.0553, 'capacity': 20332, 'teams': ['DET'], 'year_opened': 2017},
        'Chase Center': {'city': 'San Francisco', 'state': 'CA', 'lat': 37.7680, 'lng': -122.3879, 'capacity': 18064, 'teams': ['GSW'], 'year_opened': 2019},
        'Toyota Center': {'city': 'Houston', 'state': 'TX', 'lat': 29.7508, 'lng': -95.3621, 'capacity': 18055, 'teams': ['HOU'], 'year_opened': 2003},
        'Gainbridge Fieldhouse': {'city': 'Indianapolis', 'state': 'IN', 'lat': 39.7640, 'lng': -86.1555, 'capacity': 17923, 'teams': ['IND'], 'year_opened': 1999},
        'Intuit Dome': {'city': 'Inglewood', 'state': 'CA', 'lat': 33.9425, 'lng': -118.3419, 'capacity': 18000, 'teams': ['LAC'], 'year_opened': 2024},
        'Crypto.com Arena': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0430, 'lng': -118.2673, 'capacity': 18997, 'teams': ['LAL'], 'year_opened': 1999},
        'FedExForum': {'city': 'Memphis', 'state': 'TN', 'lat': 35.1382, 'lng': -90.0506, 'capacity': 17794, 'teams': ['MEM'], 'year_opened': 2004},
        'Kaseya Center': {'city': 'Miami', 'state': 'FL', 'lat': 25.7814, 'lng': -80.1870, 'capacity': 19600, 'teams': ['MIA'], 'year_opened': 1999},
        'Fiserv Forum': {'city': 'Milwaukee', 'state': 'WI', 'lat': 43.0451, 'lng': -87.9174, 'capacity': 17341, 'teams': ['MIL'], 'year_opened': 2018},
        'Target Center': {'city': 'Minneapolis', 'state': 'MN', 'lat': 44.9795, 'lng': -93.2761, 'capacity': 18978, 'teams': ['MIN'], 'year_opened': 1990},
        'Smoothie King Center': {'city': 'New Orleans', 'state': 'LA', 'lat': 29.9490, 'lng': -90.0821, 'capacity': 16867, 'teams': ['NOP'], 'year_opened': 1999},
        'Madison Square Garden': {'city': 'New York', 'state': 'NY', 'lat': 40.7505, 'lng': -73.9934, 'capacity': 19812, 'teams': ['NYK'], 'year_opened': 1968},
        'Paycom Center': {'city': 'Oklahoma City', 'state': 'OK', 'lat': 35.4634, 'lng': -97.5151, 'capacity': 18203, 'teams': ['OKC'], 'year_opened': 2002},
        # Kia Center (formerly Amway Center) opened in 2010; 1989 was the
        # opening year of the older Orlando Arena it replaced.
        'Kia Center': {'city': 'Orlando', 'state': 'FL', 'lat': 28.5392, 'lng': -81.3839, 'capacity': 18846, 'teams': ['ORL'], 'year_opened': 2010},
        'Wells Fargo Center': {'city': 'Philadelphia', 'state': 'PA', 'lat': 39.9012, 'lng': -75.1720, 'capacity': 20478, 'teams': ['PHI'], 'year_opened': 1996},
        'Footprint Center': {'city': 'Phoenix', 'state': 'AZ', 'lat': 33.4457, 'lng': -112.0712, 'capacity': 17071, 'teams': ['PHO'], 'year_opened': 1992},
        'Moda Center': {'city': 'Portland', 'state': 'OR', 'lat': 45.5316, 'lng': -122.6668, 'capacity': 19393, 'teams': ['POR'], 'year_opened': 1995},
        'Golden 1 Center': {'city': 'Sacramento', 'state': 'CA', 'lat': 38.5802, 'lng': -121.4997, 'capacity': 17608, 'teams': ['SAC'], 'year_opened': 2016},
        'Frost Bank Center': {'city': 'San Antonio', 'state': 'TX', 'lat': 29.4270, 'lng': -98.4375, 'capacity': 18418, 'teams': ['SAS'], 'year_opened': 2002},
        'Scotiabank Arena': {'city': 'Toronto', 'state': 'ON', 'lat': 43.6435, 'lng': -79.3791, 'capacity': 19800, 'teams': ['TOR'], 'year_opened': 1999},
        'Delta Center': {'city': 'Salt Lake City', 'state': 'UT', 'lat': 40.7683, 'lng': -111.9011, 'capacity': 18306, 'teams': ['UTA'], 'year_opened': 1991},
        'Capital One Arena': {'city': 'Washington', 'state': 'DC', 'lat': 38.8982, 'lng': -77.0209, 'capacity': 20356, 'teams': ['WAS'], 'year_opened': 1997},
    }

    stadiums = [
        Stadium(
            id=f"nba_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='NBA',
            team_abbrevs=info['teams'],
            source='nba_hardcoded',
            year_opened=info.get('year_opened')
        )
        for name, info in nba_arenas.items()
    ]

    print(f" ✓ Found {len(stadiums)} NBA arenas")
    return stadiums
|
|
|
|
|
|
# =============================================================================
|
|
# SOURCE CONFIGURATIONS
|
|
# =============================================================================
|
|
|
|
# Game sources handed to scrape_with_fallback, listed in ascending
# `priority` value. `min_games` is the per-source threshold passed to
# ScraperSource.
NBA_GAME_SOURCES = [
    ScraperSource(
        'Basketball-Reference',
        scrape_nba_basketball_reference,
        priority=1,
        min_games=100,
    ),
    ScraperSource(
        'CBS Sports',
        scrape_nba_cbssports,
        priority=2,
        min_games=50,
    ),
    ScraperSource(
        'ESPN',
        scrape_nba_espn,
        priority=3,
        min_games=50,
    ),
]
|
|
|
|
# Stadium sources: the hardcoded arena table is the only one configured.
NBA_STADIUM_SOURCES = [
    StadiumScraperSource(
        'Hardcoded',
        scrape_nba_stadiums,
        priority=1,
        min_venues=25,
    ),
]
|
|
|
|
|
|
# =============================================================================
|
|
# CONVENIENCE FUNCTIONS
|
|
# =============================================================================
|
|
|
|
def scrape_nba_games(season: int) -> list[Game]:
    """
    Print a schedule banner, then scrape NBA games via the configured sources.

    The work is delegated to scrape_with_fallback with NBA_GAME_SOURCES.

    Args:
        season: Season ending year (e.g., 2025 for 2024-25 season)

    Returns:
        Whatever scrape_with_fallback returns for the 'NBA' sport, this
        season and NBA_GAME_SOURCES.
    """
    season_label = get_nba_season_string(season)
    banner_rule = "-" * 40

    print(f"\nNBA {season_label} SCHEDULE")
    print(banner_rule)

    games = scrape_with_fallback('NBA', season, NBA_GAME_SOURCES)
    return games
|