#!/usr/bin/env python3
"""
Sports Schedule Scraper for SportsTime App
Scrapes NBA, MLB, NHL schedules from multiple sources for cross-validation.

Usage:
    python scrape_schedules.py --sport nba --season 2025
    python scrape_schedules.py --sport all --season 2025
    python scrape_schedules.py --stadiums-only
"""

import argparse
import json
import re
import time
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Rate limiting
REQUEST_DELAY = 3.0  # seconds between requests to same domain
last_request_time = {}  # domain -> time.time() of the most recent request


def rate_limit(domain: str):
    """Sleep as needed so requests to `domain` are REQUEST_DELAY seconds apart.

    The timestamp stored for the domain is taken after any sleep, so the
    next call measures from the moment this request is actually released.
    """
    previous = last_request_time.get(domain)
    if previous is not None:
        elapsed = time.time() - previous
        if elapsed < REQUEST_DELAY:
            time.sleep(REQUEST_DELAY - elapsed)
    last_request_time[domain] = time.time()


def fetch_page(url: str, domain: str) -> Optional[BeautifulSoup]:
    """Fetch `url` (rate limited per `domain`) and parse it as HTML.

    Returns:
        The parsed document, or None when the request fails (connection
        error, timeout, or a non-2xx status). Failures are printed, not
        raised, so callers can skip the page and carry on.
    """
    rate_limit(domain)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    return BeautifulSoup(response.content, 'html.parser')


# =============================================================================
# DATA CLASSES
# =============================================================================

@dataclass
class Game:
    """One scheduled game as reported by a single source."""

    id: str
    sport: str
    season: str
    date: str  # YYYY-MM-DD
    time: Optional[str]  # HH:MM (24hr, ET)
    home_team: str
    away_team: str
    home_team_abbrev: str
    away_team_abbrev: str
    venue: str
    source: str
    is_playoff: bool = False
    broadcast: Optional[str] = None


@dataclass
class Stadium:
    """One venue, either fetched from HIFLD or built from the team tables."""

    id: str
    name: str
    city: str
    state: str
    latitude: float
    longitude: float
    capacity: int
    sport: str
    team_abbrevs: list
    source: str
    year_opened: Optional[int] = None


# =============================================================================
# TEAM MAPPINGS
# =============================================================================

NBA_TEAMS = {
    'ATL': {'name': 'Atlanta Hawks', 'city': 'Atlanta', 'arena': 'State Farm Arena'},
    'BOS': {'name': 'Boston Celtics', 'city': 'Boston', 'arena': 'TD Garden'},
    'BRK': {'name': 'Brooklyn Nets', 'city': 'Brooklyn', 'arena': 'Barclays Center'},
    'CHO': {'name': 'Charlotte Hornets', 'city': 'Charlotte', 'arena': 'Spectrum Center'},
    'CHI': {'name': 'Chicago Bulls', 'city': 'Chicago', 'arena': 'United Center'},
    'CLE': {'name': 'Cleveland Cavaliers', 'city': 'Cleveland', 'arena': 'Rocket Mortgage FieldHouse'},
    'DAL': {'name': 'Dallas Mavericks', 'city': 'Dallas', 'arena': 'American Airlines Center'},
    'DEN': {'name': 'Denver Nuggets', 'city': 'Denver', 'arena': 'Ball Arena'},
    'DET': {'name': 'Detroit Pistons', 'city': 'Detroit', 'arena': 'Little Caesars Arena'},
    'GSW': {'name': 'Golden State Warriors', 'city': 'San Francisco', 'arena': 'Chase Center'},
    'HOU': {'name': 'Houston Rockets', 'city': 'Houston', 'arena': 'Toyota Center'},
    'IND': {'name': 'Indiana Pacers', 'city': 'Indianapolis', 'arena': 'Gainbridge Fieldhouse'},
    'LAC': {'name': 'Los Angeles Clippers', 'city': 'Inglewood', 'arena': 'Intuit Dome'},
    'LAL': {'name': 'Los Angeles Lakers', 'city': 'Los Angeles', 'arena': 'Crypto.com Arena'},
    'MEM': {'name': 'Memphis Grizzlies', 'city': 'Memphis', 'arena': 'FedExForum'},
    'MIA': {'name': 'Miami Heat', 'city': 'Miami', 'arena': 'Kaseya Center'},
    'MIL': {'name': 'Milwaukee Bucks', 'city': 'Milwaukee', 'arena': 'Fiserv Forum'},
    'MIN': {'name': 'Minnesota Timberwolves', 'city': 'Minneapolis', 'arena': 'Target Center'},
    'NOP': {'name': 'New Orleans Pelicans', 'city': 'New Orleans', 'arena': 'Smoothie King Center'},
    'NYK': {'name': 'New York Knicks', 'city': 'New York', 'arena': 'Madison Square Garden'},
    'OKC': {'name': 'Oklahoma City Thunder', 'city': 'Oklahoma City', 'arena': 'Paycom Center'},
    'ORL': {'name': 'Orlando Magic', 'city': 'Orlando', 'arena': 'Kia Center'},
    'PHI': {'name': 'Philadelphia 76ers', 'city': 'Philadelphia', 'arena': 'Wells Fargo Center'},
    'PHO': {'name': 'Phoenix Suns', 'city': 'Phoenix', 'arena': 'Footprint Center'},
    'POR': {'name': 'Portland Trail Blazers', 'city': 'Portland', 'arena': 'Moda Center'},
    'SAC': {'name': 'Sacramento Kings', 'city': 'Sacramento', 'arena': 'Golden 1 Center'},
    'SAS': {'name': 'San Antonio Spurs', 'city': 'San Antonio', 'arena': 'Frost Bank Center'},
    'TOR': {'name': 'Toronto Raptors', 'city': 'Toronto', 'arena': 'Scotiabank Arena'},
    'UTA': {'name': 'Utah Jazz', 'city': 'Salt Lake City', 'arena': 'Delta Center'},
    'WAS': {'name': 'Washington Wizards', 'city': 'Washington', 'arena': 'Capital One Arena'},
}

MLB_TEAMS = {
    'ARI': {'name': 'Arizona Diamondbacks', 'city': 'Phoenix', 'stadium': 'Chase Field'},
    'ATL': {'name': 'Atlanta Braves', 'city': 'Atlanta', 'stadium': 'Truist Park'},
    'BAL': {'name': 'Baltimore Orioles', 'city': 'Baltimore', 'stadium': 'Oriole Park at Camden Yards'},
    'BOS': {'name': 'Boston Red Sox', 'city': 'Boston', 'stadium': 'Fenway Park'},
    'CHC': {'name': 'Chicago Cubs', 'city': 'Chicago', 'stadium': 'Wrigley Field'},
    'CHW': {'name': 'Chicago White Sox', 'city': 'Chicago', 'stadium': 'Guaranteed Rate Field'},
    'CIN': {'name': 'Cincinnati Reds', 'city': 'Cincinnati', 'stadium': 'Great American Ball Park'},
    'CLE': {'name': 'Cleveland Guardians', 'city': 'Cleveland', 'stadium': 'Progressive Field'},
    'COL': {'name': 'Colorado Rockies', 'city': 'Denver', 'stadium': 'Coors Field'},
    'DET': {'name': 'Detroit Tigers', 'city': 'Detroit', 'stadium': 'Comerica Park'},
    'HOU': {'name': 'Houston Astros', 'city': 'Houston', 'stadium': 'Minute Maid Park'},
    'KCR': {'name': 'Kansas City Royals', 'city': 'Kansas City', 'stadium': 'Kauffman Stadium'},
    'LAA': {'name': 'Los Angeles Angels', 'city': 'Anaheim', 'stadium': 'Angel Stadium'},
    'LAD': {'name': 'Los Angeles Dodgers', 'city': 'Los Angeles', 'stadium': 'Dodger Stadium'},
    'MIA': {'name': 'Miami Marlins', 'city': 'Miami', 'stadium': 'LoanDepot Park'},
    'MIL': {'name': 'Milwaukee Brewers', 'city': 'Milwaukee', 'stadium': 'American Family Field'},
    'MIN': {'name': 'Minnesota Twins', 'city': 'Minneapolis', 'stadium': 'Target Field'},
    'NYM': {'name': 'New York Mets', 'city': 'New York', 'stadium': 'Citi Field'},
    'NYY': {'name': 'New York Yankees', 'city': 'New York', 'stadium': 'Yankee Stadium'},
    'OAK': {'name': 'Oakland Athletics', 'city': 'Sacramento', 'stadium': 'Sutter Health Park'},
    'PHI': {'name': 'Philadelphia Phillies', 'city': 'Philadelphia', 'stadium': 'Citizens Bank Park'},
    'PIT': {'name': 'Pittsburgh Pirates', 'city': 'Pittsburgh', 'stadium': 'PNC Park'},
    'SDP': {'name': 'San Diego Padres', 'city': 'San Diego', 'stadium': 'Petco Park'},
    'SFG': {'name': 'San Francisco Giants', 'city': 'San Francisco', 'stadium': 'Oracle Park'},
    'SEA': {'name': 'Seattle Mariners', 'city': 'Seattle', 'stadium': 'T-Mobile Park'},
    'STL': {'name': 'St. Louis Cardinals', 'city': 'St. Louis', 'stadium': 'Busch Stadium'},
    'TBR': {'name': 'Tampa Bay Rays', 'city': 'St. Petersburg', 'stadium': 'Tropicana Field'},
    'TEX': {'name': 'Texas Rangers', 'city': 'Arlington', 'stadium': 'Globe Life Field'},
    'TOR': {'name': 'Toronto Blue Jays', 'city': 'Toronto', 'stadium': 'Rogers Centre'},
    'WSN': {'name': 'Washington Nationals', 'city': 'Washington', 'stadium': 'Nationals Park'},
}

NHL_TEAMS = {
    'ANA': {'name': 'Anaheim Ducks', 'city': 'Anaheim', 'arena': 'Honda Center'},
    # NOTE(review): the key is still 'ARI' although the entry describes the
    # Utah franchise; confirm which abbreviation downstream consumers expect
    # before renaming, because the key ends up in stadium and game IDs.
    'ARI': {'name': 'Utah Hockey Club', 'city': 'Salt Lake City', 'arena': 'Delta Center'},
    'BOS': {'name': 'Boston Bruins', 'city': 'Boston', 'arena': 'TD Garden'},
    'BUF': {'name': 'Buffalo Sabres', 'city': 'Buffalo', 'arena': 'KeyBank Center'},
    'CGY': {'name': 'Calgary Flames', 'city': 'Calgary', 'arena': 'Scotiabank Saddledome'},
    'CAR': {'name': 'Carolina Hurricanes', 'city': 'Raleigh', 'arena': 'PNC Arena'},
    'CHI': {'name': 'Chicago Blackhawks', 'city': 'Chicago', 'arena': 'United Center'},
    'COL': {'name': 'Colorado Avalanche', 'city': 'Denver', 'arena': 'Ball Arena'},
    'CBJ': {'name': 'Columbus Blue Jackets', 'city': 'Columbus', 'arena': 'Nationwide Arena'},
    'DAL': {'name': 'Dallas Stars', 'city': 'Dallas', 'arena': 'American Airlines Center'},
    'DET': {'name': 'Detroit Red Wings', 'city': 'Detroit', 'arena': 'Little Caesars Arena'},
    'EDM': {'name': 'Edmonton Oilers', 'city': 'Edmonton', 'arena': 'Rogers Place'},
    'FLA': {'name': 'Florida Panthers', 'city': 'Sunrise', 'arena': 'Amerant Bank Arena'},
    'LAK': {'name': 'Los Angeles Kings', 'city': 'Los Angeles', 'arena': 'Crypto.com Arena'},
    'MIN': {'name': 'Minnesota Wild', 'city': 'St. Paul', 'arena': 'Xcel Energy Center'},
    'MTL': {'name': 'Montreal Canadiens', 'city': 'Montreal', 'arena': 'Bell Centre'},
    'NSH': {'name': 'Nashville Predators', 'city': 'Nashville', 'arena': 'Bridgestone Arena'},
    'NJD': {'name': 'New Jersey Devils', 'city': 'Newark', 'arena': 'Prudential Center'},
    'NYI': {'name': 'New York Islanders', 'city': 'Elmont', 'arena': 'UBS Arena'},
    'NYR': {'name': 'New York Rangers', 'city': 'New York', 'arena': 'Madison Square Garden'},
    'OTT': {'name': 'Ottawa Senators', 'city': 'Ottawa', 'arena': 'Canadian Tire Centre'},
    'PHI': {'name': 'Philadelphia Flyers', 'city': 'Philadelphia', 'arena': 'Wells Fargo Center'},
    'PIT': {'name': 'Pittsburgh Penguins', 'city': 'Pittsburgh', 'arena': 'PPG Paints Arena'},
    'SJS': {'name': 'San Jose Sharks', 'city': 'San Jose', 'arena': 'SAP Center'},
    'SEA': {'name': 'Seattle Kraken', 'city': 'Seattle', 'arena': 'Climate Pledge Arena'},
    'STL': {'name': 'St. Louis Blues', 'city': 'St. Louis', 'arena': 'Enterprise Center'},
    'TBL': {'name': 'Tampa Bay Lightning', 'city': 'Tampa', 'arena': 'Amalie Arena'},
    'TOR': {'name': 'Toronto Maple Leafs', 'city': 'Toronto', 'arena': 'Scotiabank Arena'},
    'VAN': {'name': 'Vancouver Canucks', 'city': 'Vancouver', 'arena': 'Rogers Arena'},
    'VGK': {'name': 'Vegas Golden Knights', 'city': 'Las Vegas', 'arena': 'T-Mobile Arena'},
    'WSH': {'name': 'Washington Capitals', 'city': 'Washington', 'arena': 'Capital One Arena'},
    'WPG': {'name': 'Winnipeg Jets', 'city': 'Winnipeg', 'arena': 'Canada Life Centre'},
}


# =============================================================================
# PARSING HELPERS
# =============================================================================

# Matches 12-hour clock strings such as "7:30p", "7:30 PM" or "12:00a".
_CLOCK_12H = re.compile(r'(\d{1,2}):(\d{2})\s*([ap])m?', re.IGNORECASE)


def _cell_text(cell) -> str:
    """Return the stripped text of a table cell, preferring its first link."""
    link = cell.find('a')
    return (link.text if link else cell.text).strip()


def _to_24_hour(clock: Optional[str]) -> Optional[str]:
    """Convert a 12-hour clock string to HH:MM (24hr).

    Returns None for empty input. Text that does not look like a 12-hour
    time is returned stripped but otherwise unchanged, so no information
    is lost when a source uses an unexpected format.
    """
    if not clock or not clock.strip():
        return None
    text = clock.strip()
    match = _CLOCK_12H.fullmatch(text)
    if match is None:
        return text
    hour = int(match.group(1)) % 12  # 12am -> 0, 12pm -> 0 (+12 below)
    if match.group(3).lower() == 'p':
        hour += 12
    return f"{hour:02d}:{match.group(2)}"


def _utc_iso_to_eastern_hhmm(timestamp: str) -> Optional[str]:
    """Convert an ISO-8601 UTC timestamp ("...T23:05:00Z") to HH:MM in ET.

    Returns None when the value carries no time component. If the value
    is not in the expected "Z" form, or the Eastern time zone database is
    unavailable on this machine, the raw HH:MM slice is returned instead
    of failing.
    """
    if 'T' not in timestamp:
        return None
    try:
        utc_moment = datetime.strptime(
            timestamp, '%Y-%m-%dT%H:%M:%SZ'
        ).replace(tzinfo=timezone.utc)
        eastern = utc_moment.astimezone(ZoneInfo('America/New_York'))
    except (ValueError, ZoneInfoNotFoundError):
        return timestamp.split('T')[1][:5]
    return eastern.strftime('%H:%M')


# =============================================================================
# SCRAPERS - NBA
# =============================================================================

def scrape_nba_basketball_reference(season: int) -> list[Game]:
    """
    Scrape NBA schedule from Basketball-Reference.
    URL: https://www.basketball-reference.com/leagues/NBA_{YEAR}_games-{month}.html
    Season year is the ending year (e.g., 2025 for 2024-25 season)

    Months whose page or schedule table is missing are skipped. Rows that
    fail to parse are reported and skipped. Start times are normalised to
    HH:MM (24hr).
    """
    games = []
    months = ['october', 'november', 'december', 'january', 'february',
              'march', 'april', 'may', 'june']

    print(f"Scraping NBA {season} from Basketball-Reference...")

    for month in months:
        url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html"
        soup = fetch_page(url, 'basketball-reference.com')
        if soup is None:
            continue

        table = soup.find('table', {'id': 'schedule'})
        if table is None:
            continue

        tbody = table.find('tbody')
        if tbody is None:
            continue

        for row in tbody.find_all('tr'):
            # Repeated header rows inside the body carry the 'thead' class.
            if 'thead' in (row.get('class') or []):
                continue

            if len(row.find_all(['td', 'th'])) < 6:
                continue

            try:
                # Parse date
                date_cell = row.find('th', {'data-stat': 'date_game'})
                if date_cell is None:
                    continue
                date_str = _cell_text(date_cell)

                # Parse time (empty cells become None)
                time_cell = row.find('td', {'data-stat': 'game_start_time'})
                time_str = _to_24_hour(time_cell.text) if time_cell else None

                # Parse teams
                visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
                home_cell = row.find('td', {'data-stat': 'home_team_name'})
                if visitor_cell is None or home_cell is None:
                    continue
                away_team = _cell_text(visitor_cell)
                home_team = _cell_text(home_cell)

                # Parse arena
                arena_cell = row.find('td', {'data-stat': 'arena_name'})
                arena = arena_cell.text.strip() if arena_cell else ''

                # Convert date; rows without a parseable date are not games
                try:
                    parsed_date = datetime.strptime(date_str, '%a, %b %d, %Y')
                except ValueError:
                    continue
                date_formatted = parsed_date.strftime('%Y-%m-%d')

                # Generate game ID
                game_id = f"nba_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

                games.append(Game(
                    id=game_id,
                    sport='NBA',
                    season=f"{season-1}-{str(season)[2:]}",
                    date=date_formatted,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=get_team_abbrev(home_team, 'NBA'),
                    away_team_abbrev=get_team_abbrev(away_team, 'NBA'),
                    venue=arena,
                    source='basketball-reference.com'
                ))
            except Exception as e:
                print(f" Error parsing row: {e}")
                continue

    print(f" Found {len(games)} games from Basketball-Reference")
    return games


def scrape_nba_espn(season: int) -> list[Game]:
    """
    Scrape NBA schedule from ESPN.
    URL: https://www.espn.com/nba/schedule/_/date/{YYYYMMDD}

    NOTE: this is a placeholder. It requests one schedule page per week of
    the season but does not extract any games from the responses, so the
    returned list is always empty.
    """
    games = []
    print(f"Scraping NBA {season} from ESPN...")

    # Determine date range for season
    start_date = datetime(season - 1, 10, 1)  # October of previous year
    end_date = datetime(season, 6, 30)  # June of season year

    current_date = start_date
    while current_date <= end_date:
        date_str = current_date.strftime('%Y%m%d')
        url = f"https://www.espn.com/nba/schedule/_/date/{date_str}"
        soup = fetch_page(url, 'espn.com')
        if soup:
            # ESPN uses JavaScript rendering, so we need to parse what's available
            # This is a simplified version - full implementation would need Selenium
            pass
        current_date += timedelta(days=7)  # Sample weekly to respect rate limits

    print(f" Found {len(games)} games from ESPN")
    return games


# =============================================================================
# SCRAPERS - MLB
# =============================================================================

def scrape_mlb_baseball_reference(season: int) -> list[Game]:
    """
    Scrape MLB schedule from Baseball-Reference.
    URL: https://www.baseball-reference.com/leagues/majors/{YEAR}-schedule.shtml

    Games are listed under h3 date headers; each `p.game` element is
    attributed to the most recent header whose text parsed as a date.
    Game entries that fail to parse are reported and skipped.
    """
    games = []
    url = f"https://www.baseball-reference.com/leagues/majors/{season}-schedule.shtml"

    print(f"Scraping MLB {season} from Baseball-Reference...")
    soup = fetch_page(url, 'baseball-reference.com')
    if soup is None:
        return games

    # Baseball-Reference groups games by date in h3 headers
    current_date = None
    date_formats = ('%A, %B %d, %Y', '%B %d, %Y', '%a, %b %d, %Y')

    # Find the schedule section, falling back to the whole document
    schedule_div = soup.find('div', {'id': 'all_schedule'}) or soup

    # Process all elements in document order to track date context
    for element in schedule_div.find_all(['h3', 'p', 'div']):
        # Check for date header
        if element.name == 'h3':
            date_text = element.get_text(strip=True)
            # Parse date like "Thursday, March 27, 2025"; a header that
            # matches no format leaves the current date untouched.
            for fmt in date_formats:
                try:
                    current_date = datetime.strptime(date_text, fmt).strftime('%Y-%m-%d')
                except ValueError:
                    continue
                break

        # Check for game entries
        elif element.name == 'p' and 'game' in element.get('class', []):
            if not current_date:
                continue
            try:
                links = element.find_all('a')
                if len(links) < 2:
                    continue
                away_team = links[0].text.strip()
                home_team = links[1].text.strip()

                # Generate unique game ID
                away_abbrev = get_team_abbrev(away_team, 'MLB')
                home_abbrev = get_team_abbrev(home_team, 'MLB')
                game_id = f"mlb_br_{current_date}_{away_abbrev}_{home_abbrev}".lower()

                games.append(Game(
                    id=game_id,
                    sport='MLB',
                    season=str(season),
                    date=current_date,
                    time=None,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=home_abbrev,
                    away_team_abbrev=away_abbrev,
                    venue='',
                    source='baseball-reference.com'
                ))
            except Exception as e:
                print(f" Error parsing game: {e}")
                continue

    print(f" Found {len(games)} games from Baseball-Reference")
    return games


def scrape_mlb_statsapi(season: int) -> list[Game]:
    """
    Fetch MLB schedule from official Stats API (JSON).
    URL: https://statsapi.mlb.com/api/v1/schedule?sportId=1&season={YEAR}&gameType=R

    The `gameDate` timestamp is converted from UTC to Eastern so that
    `Game.time` matches its documented HH:MM (24hr, ET) meaning. A failed
    request yields an empty list. A malformed game entry is reported and
    skipped.
    """
    games = []
    url = f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&season={season}&gameType=R&hydrate=team,venue"

    print(f"Fetching MLB {season} from Stats API...")

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()

        for date_entry in data.get('dates', []):
            game_date = date_entry.get('date', '')
            for game_data in date_entry.get('games', []):
                try:
                    teams = game_data.get('teams', {})
                    away = teams.get('away', {}).get('team', {})
                    home = teams.get('home', {}).get('team', {})
                    venue = game_data.get('venue', {})

                    time_str = _utc_iso_to_eastern_hhmm(game_data.get('gameDate', ''))

                    games.append(Game(
                        id=f"mlb_{game_data.get('gamePk', '')}",
                        sport='MLB',
                        season=str(season),
                        date=game_date,
                        time=time_str,
                        home_team=home.get('name', ''),
                        away_team=away.get('name', ''),
                        home_team_abbrev=home.get('abbreviation', ''),
                        away_team_abbrev=away.get('abbreviation', ''),
                        venue=venue.get('name', ''),
                        source='statsapi.mlb.com'
                    ))
                except Exception as e:
                    print(f" Error parsing game: {e}")
                    continue
    except Exception as e:
        print(f" Error fetching MLB API: {e}")

    print(f" Found {len(games)} games from MLB Stats API")
    return games


# =============================================================================
# SCRAPERS - NHL
# =============================================================================

def scrape_nhl_hockey_reference(season: int) -> list[Game]:
    """
    Scrape NHL schedule from Hockey-Reference.
    URL: https://www.hockey-reference.com/leagues/NHL_{YEAR}_games.html

    Season year is the ending year, as in the NBA scraper. Rows without a
    parseable YYYY-MM-DD date are skipped. Rows that raise while parsing
    are reported and skipped.
    """
    games = []
    url = f"https://www.hockey-reference.com/leagues/NHL_{season}_games.html"

    print(f"Scraping NHL {season} from Hockey-Reference...")
    soup = fetch_page(url, 'hockey-reference.com')
    if soup is None:
        return games

    table = soup.find('table', {'id': 'games'})
    if table is None:
        print(" Could not find games table")
        return games

    tbody = table.find('tbody')
    if tbody is None:
        return games

    for row in tbody.find_all('tr'):
        try:
            if len(row.find_all(['td', 'th'])) < 5:
                continue

            # Parse date
            date_cell = row.find('th', {'data-stat': 'date_game'})
            if date_cell is None:
                continue
            date_str = _cell_text(date_cell)

            # Parse teams
            visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
            home_cell = row.find('td', {'data-stat': 'home_team_name'})
            if visitor_cell is None or home_cell is None:
                continue
            away_team = _cell_text(visitor_cell)
            home_team = _cell_text(home_cell)

            # Convert date (round trip validates and normalises it)
            try:
                parsed_date = datetime.strptime(date_str, '%Y-%m-%d')
            except ValueError:
                continue
            date_formatted = parsed_date.strftime('%Y-%m-%d')

            game_id = f"nhl_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

            games.append(Game(
                id=game_id,
                sport='NHL',
                season=f"{season-1}-{str(season)[2:]}",
                date=date_formatted,
                time=None,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=get_team_abbrev(home_team, 'NHL'),
                away_team_abbrev=get_team_abbrev(away_team, 'NHL'),
                venue='',
                source='hockey-reference.com'
            ))
        except Exception as e:
            print(f" Error parsing row: {e}")
            continue

    print(f" Found {len(games)} games from Hockey-Reference")
    return games


def scrape_nhl_api(season: int) -> list[Game]:
    """
    Fetch NHL schedule from official API (JSON).
    URL: https://api-web.nhle.com/v1/schedule/{YYYY-MM-DD}

    NOTE: not implemented yet; always returns an empty list.
    """
    games = []
    print(f"Fetching NHL {season} from NHL API...")
    # NHL API provides club schedules
    # We'd need to iterate through dates or teams
    # Simplified implementation here
    return games


# =============================================================================
# STADIUM SCRAPER
# =============================================================================

def scrape_stadiums_hifld() -> list[Stadium]:
    """
    Fetch stadium data from HIFLD Open Data (US Government).
    Returns GeoJSON with coordinates.

    Only NBA, MLB and NHL venues are kept. A failed request yields an
    empty list. A single malformed feature is reported and skipped
    without discarding the venues that follow it.
    """
    stadiums = []
    url = "https://services1.arcgis.com/Hp6G80Pky0om7QvQ/arcgis/rest/services/Major_Sport_Venues/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json"
    wanted_leagues = ('NBA', 'MLB', 'NHL')

    print("Fetching stadiums from HIFLD Open Data...")

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()

        for feature in data.get('features', []):
            try:
                # `or {}` also covers keys that are present but null
                attrs = feature.get('attributes') or {}
                geom = feature.get('geometry') or {}

                # Filter for NBA, MLB, NHL venues
                league = attrs.get('LEAGUE', '')
                if league not in wanted_leagues:
                    continue

                stadiums.append(Stadium(
                    id=f"hifld_{attrs.get('OBJECTID', '')}",
                    name=attrs.get('NAME', ''),
                    city=attrs.get('CITY', ''),
                    state=attrs.get('STATE', ''),
                    latitude=geom.get('y', 0),
                    longitude=geom.get('x', 0),
                    capacity=attrs.get('CAPACITY', 0) or 0,
                    sport=league,
                    team_abbrevs=[attrs.get('TEAM', '')],
                    source='hifld.gov',
                    year_opened=attrs.get('YEAR_OPEN')
                ))
            except Exception as e:
                print(f" Error parsing HIFLD feature: {e}")
                continue
    except Exception as e:
        print(f" Error fetching HIFLD data: {e}")

    print(f" Found {len(stadiums)} stadiums from HIFLD")
    return stadiums


def generate_stadiums_from_teams() -> list[Stadium]:
    """
    Generate stadium data from team mappings with manual coordinates.
    This serves as a fallback/validation source.

    One Stadium is produced per team, so a venue shared by two teams
    appears once for each of them. A venue missing from the coordinate
    tables gets latitude/longitude 0, an empty state and capacity 0.
    """
    stadiums = []

    # NBA Arenas with coordinates (manually curated): (lat, lon)
    nba_coords = {
        'State Farm Arena': (33.7573, -84.3963),
        'TD Garden': (42.3662, -71.0621),
        'Barclays Center': (40.6826, -73.9754),
        'Spectrum Center': (35.2251, -80.8392),
        'United Center': (41.8807, -87.6742),
        'Rocket Mortgage FieldHouse': (41.4965, -81.6882),
        'American Airlines Center': (32.7905, -96.8103),
        'Ball Arena': (39.7487, -105.0077),
        'Little Caesars Arena': (42.3411, -83.0553),
        'Chase Center': (37.7680, -122.3879),
        'Toyota Center': (29.7508, -95.3621),
        'Gainbridge Fieldhouse': (39.7640, -86.1555),
        'Intuit Dome': (33.9425, -118.3419),
        'Crypto.com Arena': (34.0430, -118.2673),
        'FedExForum': (35.1382, -90.0506),
        'Kaseya Center': (25.7814, -80.1870),
        'Fiserv Forum': (43.0451, -87.9174),
        'Target Center': (44.9795, -93.2761),
        'Smoothie King Center': (29.9490, -90.0821),
        'Madison Square Garden': (40.7505, -73.9934),
        'Paycom Center': (35.4634, -97.5151),
        'Kia Center': (28.5392, -81.3839),
        'Wells Fargo Center': (39.9012, -75.1720),
        'Footprint Center': (33.4457, -112.0712),
        'Moda Center': (45.5316, -122.6668),
        'Golden 1 Center': (38.5802, -121.4997),
        'Frost Bank Center': (29.4270, -98.4375),
        'Scotiabank Arena': (43.6435, -79.3791),
        'Delta Center': (40.7683, -111.9011),
        'Capital One Arena': (38.8982, -77.0209),
    }

    for abbrev, info in NBA_TEAMS.items():
        arena = info['arena']
        latitude, longitude = nba_coords.get(arena, (0, 0))
        stadiums.append(Stadium(
            id=f"manual_nba_{abbrev.lower()}",
            name=arena,
            city=info['city'],
            state='',
            latitude=latitude,
            longitude=longitude,
            capacity=0,
            sport='NBA',
            team_abbrevs=[abbrev],
            source='manual'
        ))

    # MLB Stadiums with coordinates: (lat, lon, state/province, capacity)
    mlb_coords = {
        'Chase Field': (33.4453, -112.0667, 'AZ', 48686),
        'Truist Park': (33.8907, -84.4678, 'GA', 41084),
        'Oriole Park at Camden Yards': (39.2838, -76.6218, 'MD', 45971),
        'Fenway Park': (42.3467, -71.0972, 'MA', 37755),
        'Wrigley Field': (41.9484, -87.6553, 'IL', 41649),
        'Guaranteed Rate Field': (41.8299, -87.6338, 'IL', 40615),
        'Great American Ball Park': (39.0979, -84.5082, 'OH', 42319),
        'Progressive Field': (41.4962, -81.6852, 'OH', 34830),
        'Coors Field': (39.7559, -104.9942, 'CO', 50144),
        'Comerica Park': (42.3390, -83.0485, 'MI', 41083),
        'Minute Maid Park': (29.7573, -95.3555, 'TX', 41168),
        'Kauffman Stadium': (39.0517, -94.4803, 'MO', 37903),
        'Angel Stadium': (33.8003, -117.8827, 'CA', 45517),
        'Dodger Stadium': (34.0739, -118.2400, 'CA', 56000),
        'LoanDepot Park': (25.7781, -80.2196, 'FL', 36742),
        'American Family Field': (43.0280, -87.9712, 'WI', 41900),
        'Target Field': (44.9817, -93.2776, 'MN', 38544),
        'Citi Field': (40.7571, -73.8458, 'NY', 41922),
        'Yankee Stadium': (40.8296, -73.9262, 'NY', 46537),
        'Sutter Health Park': (38.5802, -121.5097, 'CA', 14014),
        'Citizens Bank Park': (39.9061, -75.1665, 'PA', 42792),
        'PNC Park': (40.4469, -80.0057, 'PA', 38362),
        'Petco Park': (32.7076, -117.1570, 'CA', 40209),
        'Oracle Park': (37.7786, -122.3893, 'CA', 41265),
        'T-Mobile Park': (47.5914, -122.3325, 'WA', 47929),
        'Busch Stadium': (38.6226, -90.1928, 'MO', 45494),
        'Tropicana Field': (27.7682, -82.6534, 'FL', 25000),
        'Globe Life Field': (32.7473, -97.0845, 'TX', 40300),
        'Rogers Centre': (43.6414, -79.3894, 'ON', 49282),
        'Nationals Park': (38.8730, -77.0074, 'DC', 41339),
    }

    for abbrev, info in MLB_TEAMS.items():
        stadium_name = info['stadium']
        latitude, longitude, state, capacity = mlb_coords.get(stadium_name, (0, 0, '', 0))
        stadiums.append(Stadium(
            id=f"manual_mlb_{abbrev.lower()}",
            name=stadium_name,
            city=info['city'],
            state=state,
            latitude=latitude,
            longitude=longitude,
            capacity=capacity,
            sport='MLB',
            team_abbrevs=[abbrev],
            source='manual'
        ))

    # NHL Arenas with coordinates: (lat, lon, state/province, capacity)
    nhl_coords = {
        'Honda Center': (33.8078, -117.8765, 'CA', 17174),
        'Delta Center': (40.7683, -111.9011, 'UT', 18306),
        'TD Garden': (42.3662, -71.0621, 'MA', 17565),
        'KeyBank Center': (42.8750, -78.8764, 'NY', 19070),
        'Scotiabank Saddledome': (51.0374, -114.0519, 'AB', 19289),
        'PNC Arena': (35.8034, -78.7220, 'NC', 18680),
        'United Center': (41.8807, -87.6742, 'IL', 19717),
        'Ball Arena': (39.7487, -105.0077, 'CO', 18007),
        'Nationwide Arena': (39.9693, -83.0061, 'OH', 18500),
        'American Airlines Center': (32.7905, -96.8103, 'TX', 18532),
        'Little Caesars Arena': (42.3411, -83.0553, 'MI', 19515),
        'Rogers Place': (53.5469, -113.4978, 'AB', 18347),
        'Amerant Bank Arena': (26.1584, -80.3256, 'FL', 19250),
        'Crypto.com Arena': (34.0430, -118.2673, 'CA', 18230),
        'Xcel Energy Center': (44.9448, -93.1010, 'MN', 17954),
        'Bell Centre': (45.4961, -73.5693, 'QC', 21302),
        'Bridgestone Arena': (36.1592, -86.7785, 'TN', 17159),
        'Prudential Center': (40.7334, -74.1712, 'NJ', 16514),
        'UBS Arena': (40.7161, -73.7246, 'NY', 17255),
        'Madison Square Garden': (40.7505, -73.9934, 'NY', 18006),
        'Canadian Tire Centre': (45.2969, -75.9272, 'ON', 18652),
        'Wells Fargo Center': (39.9012, -75.1720, 'PA', 19543),
        'PPG Paints Arena': (40.4395, -79.9892, 'PA', 18387),
        'SAP Center': (37.3327, -121.9010, 'CA', 17562),
        'Climate Pledge Arena': (47.6221, -122.3540, 'WA', 17100),
        'Enterprise Center': (38.6268, -90.2025, 'MO', 18096),
        'Amalie Arena': (27.9426, -82.4519, 'FL', 19092),
        'Scotiabank Arena': (43.6435, -79.3791, 'ON', 18819),
        'Rogers Arena': (49.2778, -123.1089, 'BC', 18910),
        'T-Mobile Arena': (36.1028, -115.1784, 'NV', 17500),
        'Capital One Arena': (38.8982, -77.0209, 'DC', 18573),
        'Canada Life Centre': (49.8928, -97.1436, 'MB', 15321),
    }

    for abbrev, info in NHL_TEAMS.items():
        arena_name = info['arena']
        latitude, longitude, state, capacity = nhl_coords.get(arena_name, (0, 0, '', 0))
        stadiums.append(Stadium(
            id=f"manual_nhl_{abbrev.lower()}",
            name=arena_name,
            city=info['city'],
            state=state,
            latitude=latitude,
            longitude=longitude,
            capacity=capacity,
            sport='NHL',
            team_abbrevs=[abbrev],
            source='manual'
        ))

    return stadiums

# =============================================================================
# HELPERS
# =============================================================================

def assign_stable_ids(games: list[Game], sport: str, season: str) -> list[Game]:
    """
    Assign stable IDs based on matchup + occurrence number within season.
    Format: {sport}_{season}_{away}_{home}_{num}
    This ensures IDs don't change when games are rescheduled.

    Args:
        games: Games to renumber; each game's `id` is overwritten in place.
        sport: Sport label, lower-cased for the ID prefix.
        season: Season label; dashes are removed ("2024-25" -> "202425").

    Returns:
        The same list object, in its original order.
    """
    from collections import defaultdict

    # Dashes are dropped so the ID has a single separator character,
    # e.g. "2024-25" -> "202425", "2025" -> "2025".
    season_str = season.replace('-', '')
    prefix = sport.lower()

    # Group games by matchup (away @ home)
    matchups = defaultdict(list)
    for game in games:
        matchups[f"{game.away_team_abbrev}_{game.home_team_abbrev}"].append(game)

    # Number each matchup's games chronologically. The sort is stable, so
    # same-day games keep their input order.
    for matchup_games in matchups.values():
        matchup_games.sort(key=lambda g: g.date)
        for occurrence, game in enumerate(matchup_games, 1):
            away = game.away_team_abbrev.lower()
            home = game.home_team_abbrev.lower()
            game.id = f"{prefix}_{season_str}_{away}_{home}_{occurrence}"

    return games


def get_team_abbrev(team_name: str, sport: str) -> str:
    """Get team abbreviation from full name.

    An exact (case-insensitive) name match always wins. Otherwise the
    first team whose full name contains `team_name` is used. If nothing
    matches, including for a blank name or unknown sport, the first three
    characters of the name are returned upper-cased.
    """
    teams = {'NBA': NBA_TEAMS, 'MLB': MLB_TEAMS, 'NHL': NHL_TEAMS}.get(sport, {})
    wanted = team_name.strip().lower()

    # Pass 1: exact match, so a partial hit on an earlier team cannot
    # pre-empt the right answer.
    for abbrev, info in teams.items():
        if info['name'].lower() == wanted:
            return abbrev

    # Pass 2: substring match. Skipped for blank input because the empty
    # string is a substring of every name.
    if wanted:
        for abbrev, info in teams.items():
            if wanted in info['name'].lower():
                return abbrev

    # Return first 3 letters as fallback
    return team_name[:3].upper()


def validate_games(games_by_source: dict) -> dict:
    """
    Cross-validate games from multiple sources.
    Returns discrepancies.

    The first source in `games_by_source` is the reference. For every
    other source, each reference game is either reported under
    'missing_in_source' or compared on date, time and venue. A field is
    reported as a mismatch only when both sources provide a non-empty
    value and the values differ. Games are matched by `id`, so sources
    must share an ID scheme for the comparison to be meaningful.
    """
    discrepancies = {
        'missing_in_source': [],
        'date_mismatch': [],
        'time_mismatch': [],
        'venue_mismatch': [],
    }

    sources = list(games_by_source)
    if len(sources) < 2:
        return discrepancies

    primary = sources[0]
    primary_games = {g.id: g for g in games_by_source[primary]}
    compared_fields = (
        ('date', 'date_mismatch'),
        ('time', 'time_mismatch'),
        ('venue', 'venue_mismatch'),
    )

    for source in sources[1:]:
        secondary_games = {g.id: g for g in games_by_source[source]}

        for game_id, game in primary_games.items():
            counterpart = secondary_games.get(game_id)
            if counterpart is None:
                discrepancies['missing_in_source'].append({
                    'game_id': game_id,
                    'present_in': primary,
                    'missing_in': source
                })
                continue

            for field_name, bucket in compared_fields:
                ours = getattr(game, field_name)
                theirs = getattr(counterpart, field_name)
                # A source that lacks the field (None or '') is not a
                # disagreement, only missing information.
                if ours and theirs and ours != theirs:
                    discrepancies[bucket].append({
                        'game_id': game_id,
                        'source_a': primary,
                        'value_a': ours,
                        'source_b': source,
                        'value_b': theirs
                    })

    return discrepancies


def export_to_json(games: list[Game], stadiums: list[Stadium], output_dir: Path):
    """Export scraped data to JSON files.

    Writes games.json and stadiums.json into `output_dir`, creating the
    directory if needed. A CSV copy of each non-empty collection is
    written too.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Export games
    games_data = [asdict(g) for g in games]
    with open(output_dir / 'games.json', 'w', encoding='utf-8') as f:
        json.dump(games_data, f, indent=2)

    # Export stadiums
    stadiums_data = [asdict(s) for s in stadiums]
    with open(output_dir / 'stadiums.json', 'w', encoding='utf-8') as f:
        json.dump(stadiums_data, f, indent=2)

    # Export as CSV for easy viewing
    if games:
        pd.DataFrame(games_data).to_csv(output_dir / 'games.csv', index=False)
    if stadiums:
        pd.DataFrame(stadiums_data).to_csv(output_dir / 'stadiums.csv', index=False)

    print(f"\nExported to {output_dir}")


# =============================================================================
# MAIN
# =============================================================================

def main():
    """Parse the command line, scrape the requested data and export it."""
    parser = argparse.ArgumentParser(description='Scrape sports schedules')
    parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'all'], default='all')
    parser.add_argument('--season', type=int, default=2025, help='Season year (ending year)')
    parser.add_argument('--stadiums-only', action='store_true', help='Only scrape stadium data')
    parser.add_argument('--output', type=str, default='./data', help='Output directory')

    args = parser.parse_args()
    output_dir = Path(args.output)

    all_games = []
    all_stadiums = []

    # Scrape stadiums
    print("\n" + "="*60)
    print("SCRAPING STADIUMS")
    print("="*60)

    all_stadiums.extend(scrape_stadiums_hifld())
    all_stadiums.extend(generate_stadiums_from_teams())

    if args.stadiums_only:
        export_to_json([], all_stadiums, output_dir)
        return

    # Scrape schedules
    if args.sport in ['nba', 'all']:
        print("\n" + "="*60)
        print(f"SCRAPING NBA {args.season}")
        print("="*60)

        nba_games_br = scrape_nba_basketball_reference(args.season)
        nba_season = f"{args.season-1}-{str(args.season)[2:]}"  # e.g., "2024-25"
        nba_games_br = assign_stable_ids(nba_games_br, 'NBA', nba_season)
        all_games.extend(nba_games_br)

    if args.sport in ['mlb', 'all']:
        print("\n" + "="*60)
        print(f"SCRAPING MLB {args.season}")
        print("="*60)

        mlb_games_api = scrape_mlb_statsapi(args.season)
        # MLB API uses official gamePk which is already stable - no reassignment needed
        all_games.extend(mlb_games_api)

    if args.sport in ['nhl', 'all']:
        print("\n" + "="*60)
        print(f"SCRAPING NHL {args.season}")
        print("="*60)

        nhl_games_hr = scrape_nhl_hockey_reference(args.season)
        nhl_season = f"{args.season-1}-{str(args.season)[2:]}"  # e.g., "2024-25"
        nhl_games_hr = assign_stable_ids(nhl_games_hr, 'NHL', nhl_season)
        all_games.extend(nhl_games_hr)

    # Export
    print("\n" + "="*60)
    print("EXPORTING DATA")
    print("="*60)

    export_to_json(all_games, all_stadiums, output_dir)

    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Total games scraped: {len(all_games)}")
    print(f"Total stadiums: {len(all_stadiums)}")

    # Games by sport
    by_sport = {}
    for g in all_games:
        by_sport[g.sport] = by_sport.get(g.sport, 0) + 1
    for sport, count in by_sport.items():
        print(f" {sport}: {count} games")


if __name__ == '__main__':
    main()