Files
Sportstime/Scripts/scrape_schedules.py
Trey t 9088b46563 Initial commit: SportsTime trip planning app
- Three-scenario planning engine (A: date range, B: selected games, C: directional routes)
- GeographicRouteExplorer with anchor game support for route exploration
- Shared ItineraryBuilder for travel segment calculation
- TravelEstimator for driving time/distance estimation
- SwiftUI views for trip creation and detail display
- CloudKit integration for schedule data
- Python scraping scripts for sports schedules

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-07 00:46:40 -06:00

971 lines
38 KiB
Python

#!/usr/bin/env python3
"""
Sports Schedule Scraper for SportsTime App
Scrapes NBA, MLB, NHL schedules from multiple sources for cross-validation.
Usage:
    python scrape_schedules.py --sport nba --season 2025
    python scrape_schedules.py --sport all --season 2025
    python scrape_schedules.py --stadiums-only
"""
import argparse
import json
import time
import re
from datetime import datetime, timedelta
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import Optional
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Rate limiting
REQUEST_DELAY = 3.0  # seconds between requests to same domain
# Maps a domain key to the monotonic-clock reading taken when the most recent
# request for that domain was released.
last_request_time = {}


def rate_limit(domain: str):
    """
    Enforce rate limiting per domain.

    Sleeps just long enough that at least REQUEST_DELAY seconds separate two
    consecutive calls made with the same *domain*, then records the release
    time. The first call for a domain never sleeps.

    Args:
        domain: Key identifying the site being throttled.
    """
    previous = last_request_time.get(domain)
    if previous is not None:
        # A monotonic clock is used so that wall-clock adjustments (NTP steps,
        # manual changes) can neither skip the delay nor inflate the sleep.
        remaining = REQUEST_DELAY - (time.monotonic() - previous)
        if remaining > 0:
            time.sleep(remaining)
    last_request_time[domain] = time.monotonic()
def fetch_page(url: str, domain: str) -> Optional[BeautifulSoup]:
    """
    Download *url* and return it as parsed HTML.

    The call first waits on the per-domain rate limiter, then issues a GET
    with a browser-like User-Agent and a 30 second timeout.

    Args:
        url: Address of the page to download.
        domain: Rate-limiter key for the site being contacted.

    Returns:
        The parsed document, or None when the request, the HTTP status check
        or the parsing raised (the error is printed, not propagated).
    """
    rate_limit(domain)
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    parsed_page = None
    try:
        reply = requests.get(url, headers=browser_headers, timeout=30)
        reply.raise_for_status()
        parsed_page = BeautifulSoup(reply.content, 'html.parser')
    except Exception as err:
        # Best-effort fetch: callers treat None as "skip this page".
        print(f"Error fetching {url}: {err}")
    return parsed_page
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class Game:
    """One scheduled game as reported by a single source."""

    # Identifier; scrapers set a provisional value that may be reassigned later.
    id: str
    # League code, e.g. 'NBA', 'MLB' or 'NHL'.
    sport: str
    # Season label, e.g. '2024-25' or '2025'.
    season: str
    # Calendar date, YYYY-MM-DD.
    date: str
    # Start time, HH:MM (24hr, ET); None when the source gives no time.
    time: Optional[str]
    # Full team names as published by the source.
    home_team: str
    away_team: str
    # Team abbreviations.
    home_team_abbrev: str
    away_team_abbrev: str
    # Venue name; empty string when the source does not provide one.
    venue: str
    # Host name of the site or API the record came from.
    source: str
    is_playoff: bool = False
    broadcast: Optional[str] = None
@dataclass
class Stadium:
    """One venue record, either fetched from a data service or hand-curated."""

    # Identifier, prefixed by where the record came from.
    id: str
    name: str
    city: str
    # State or province code; empty string when unknown.
    state: str
    # Coordinates in decimal degrees; 0 is used when unknown.
    latitude: float
    longitude: float
    # Seating capacity; 0 when unknown.
    capacity: int
    # League code of the tenant, e.g. 'NBA', 'MLB' or 'NHL'.
    sport: str
    # Teams that play at the venue.
    team_abbrevs: list
    # Origin of the record.
    source: str
    year_opened: Optional[int] = None
# =============================================================================
# TEAM MAPPINGS
# =============================================================================
# NBA teams keyed by three-letter abbreviation. Each entry carries the team's
# full name (matched against scraped team names by get_team_abbrev), its home
# city and its home arena (both read by generate_stadiums_from_teams).
# NOTE(review): keys such as 'BRK', 'CHO' and 'PHO' presumably follow the
# abbreviations used by the scraped reference site - confirm before comparing
# against other sources.
NBA_TEAMS = {
    'ATL': {'name': 'Atlanta Hawks', 'city': 'Atlanta', 'arena': 'State Farm Arena'},
    'BOS': {'name': 'Boston Celtics', 'city': 'Boston', 'arena': 'TD Garden'},
    'BRK': {'name': 'Brooklyn Nets', 'city': 'Brooklyn', 'arena': 'Barclays Center'},
    'CHO': {'name': 'Charlotte Hornets', 'city': 'Charlotte', 'arena': 'Spectrum Center'},
    'CHI': {'name': 'Chicago Bulls', 'city': 'Chicago', 'arena': 'United Center'},
    'CLE': {'name': 'Cleveland Cavaliers', 'city': 'Cleveland', 'arena': 'Rocket Mortgage FieldHouse'},
    'DAL': {'name': 'Dallas Mavericks', 'city': 'Dallas', 'arena': 'American Airlines Center'},
    'DEN': {'name': 'Denver Nuggets', 'city': 'Denver', 'arena': 'Ball Arena'},
    'DET': {'name': 'Detroit Pistons', 'city': 'Detroit', 'arena': 'Little Caesars Arena'},
    'GSW': {'name': 'Golden State Warriors', 'city': 'San Francisco', 'arena': 'Chase Center'},
    'HOU': {'name': 'Houston Rockets', 'city': 'Houston', 'arena': 'Toyota Center'},
    'IND': {'name': 'Indiana Pacers', 'city': 'Indianapolis', 'arena': 'Gainbridge Fieldhouse'},
    'LAC': {'name': 'Los Angeles Clippers', 'city': 'Inglewood', 'arena': 'Intuit Dome'},
    'LAL': {'name': 'Los Angeles Lakers', 'city': 'Los Angeles', 'arena': 'Crypto.com Arena'},
    'MEM': {'name': 'Memphis Grizzlies', 'city': 'Memphis', 'arena': 'FedExForum'},
    'MIA': {'name': 'Miami Heat', 'city': 'Miami', 'arena': 'Kaseya Center'},
    'MIL': {'name': 'Milwaukee Bucks', 'city': 'Milwaukee', 'arena': 'Fiserv Forum'},
    'MIN': {'name': 'Minnesota Timberwolves', 'city': 'Minneapolis', 'arena': 'Target Center'},
    'NOP': {'name': 'New Orleans Pelicans', 'city': 'New Orleans', 'arena': 'Smoothie King Center'},
    'NYK': {'name': 'New York Knicks', 'city': 'New York', 'arena': 'Madison Square Garden'},
    'OKC': {'name': 'Oklahoma City Thunder', 'city': 'Oklahoma City', 'arena': 'Paycom Center'},
    'ORL': {'name': 'Orlando Magic', 'city': 'Orlando', 'arena': 'Kia Center'},
    'PHI': {'name': 'Philadelphia 76ers', 'city': 'Philadelphia', 'arena': 'Wells Fargo Center'},
    'PHO': {'name': 'Phoenix Suns', 'city': 'Phoenix', 'arena': 'Footprint Center'},
    'POR': {'name': 'Portland Trail Blazers', 'city': 'Portland', 'arena': 'Moda Center'},
    'SAC': {'name': 'Sacramento Kings', 'city': 'Sacramento', 'arena': 'Golden 1 Center'},
    'SAS': {'name': 'San Antonio Spurs', 'city': 'San Antonio', 'arena': 'Frost Bank Center'},
    'TOR': {'name': 'Toronto Raptors', 'city': 'Toronto', 'arena': 'Scotiabank Arena'},
    'UTA': {'name': 'Utah Jazz', 'city': 'Salt Lake City', 'arena': 'Delta Center'},
    'WAS': {'name': 'Washington Wizards', 'city': 'Washington', 'arena': 'Capital One Arena'},
}
# MLB teams keyed by three-letter abbreviation. Each entry carries the team's
# full name (matched against scraped team names by get_team_abbrev), its home
# city and its home ballpark under the 'stadium' key (the NBA and NHL tables
# use 'arena' instead).
MLB_TEAMS = {
    'ARI': {'name': 'Arizona Diamondbacks', 'city': 'Phoenix', 'stadium': 'Chase Field'},
    'ATL': {'name': 'Atlanta Braves', 'city': 'Atlanta', 'stadium': 'Truist Park'},
    'BAL': {'name': 'Baltimore Orioles', 'city': 'Baltimore', 'stadium': 'Oriole Park at Camden Yards'},
    'BOS': {'name': 'Boston Red Sox', 'city': 'Boston', 'stadium': 'Fenway Park'},
    'CHC': {'name': 'Chicago Cubs', 'city': 'Chicago', 'stadium': 'Wrigley Field'},
    'CHW': {'name': 'Chicago White Sox', 'city': 'Chicago', 'stadium': 'Guaranteed Rate Field'},
    'CIN': {'name': 'Cincinnati Reds', 'city': 'Cincinnati', 'stadium': 'Great American Ball Park'},
    'CLE': {'name': 'Cleveland Guardians', 'city': 'Cleveland', 'stadium': 'Progressive Field'},
    'COL': {'name': 'Colorado Rockies', 'city': 'Denver', 'stadium': 'Coors Field'},
    'DET': {'name': 'Detroit Tigers', 'city': 'Detroit', 'stadium': 'Comerica Park'},
    'HOU': {'name': 'Houston Astros', 'city': 'Houston', 'stadium': 'Minute Maid Park'},
    'KCR': {'name': 'Kansas City Royals', 'city': 'Kansas City', 'stadium': 'Kauffman Stadium'},
    'LAA': {'name': 'Los Angeles Angels', 'city': 'Anaheim', 'stadium': 'Angel Stadium'},
    'LAD': {'name': 'Los Angeles Dodgers', 'city': 'Los Angeles', 'stadium': 'Dodger Stadium'},
    'MIA': {'name': 'Miami Marlins', 'city': 'Miami', 'stadium': 'LoanDepot Park'},
    'MIL': {'name': 'Milwaukee Brewers', 'city': 'Milwaukee', 'stadium': 'American Family Field'},
    'MIN': {'name': 'Minnesota Twins', 'city': 'Minneapolis', 'stadium': 'Target Field'},
    'NYM': {'name': 'New York Mets', 'city': 'New York', 'stadium': 'Citi Field'},
    'NYY': {'name': 'New York Yankees', 'city': 'New York', 'stadium': 'Yankee Stadium'},
    'OAK': {'name': 'Oakland Athletics', 'city': 'Sacramento', 'stadium': 'Sutter Health Park'},
    'PHI': {'name': 'Philadelphia Phillies', 'city': 'Philadelphia', 'stadium': 'Citizens Bank Park'},
    'PIT': {'name': 'Pittsburgh Pirates', 'city': 'Pittsburgh', 'stadium': 'PNC Park'},
    'SDP': {'name': 'San Diego Padres', 'city': 'San Diego', 'stadium': 'Petco Park'},
    'SFG': {'name': 'San Francisco Giants', 'city': 'San Francisco', 'stadium': 'Oracle Park'},
    'SEA': {'name': 'Seattle Mariners', 'city': 'Seattle', 'stadium': 'T-Mobile Park'},
    'STL': {'name': 'St. Louis Cardinals', 'city': 'St. Louis', 'stadium': 'Busch Stadium'},
    'TBR': {'name': 'Tampa Bay Rays', 'city': 'St. Petersburg', 'stadium': 'Tropicana Field'},
    'TEX': {'name': 'Texas Rangers', 'city': 'Arlington', 'stadium': 'Globe Life Field'},
    'TOR': {'name': 'Toronto Blue Jays', 'city': 'Toronto', 'stadium': 'Rogers Centre'},
    'WSN': {'name': 'Washington Nationals', 'city': 'Washington', 'stadium': 'Nationals Park'},
}
# NHL teams keyed by three-letter abbreviation. Each entry carries the team's
# full name (matched against scraped team names by get_team_abbrev), its home
# city and its home arena (both read by generate_stadiums_from_teams).
# NOTE(review): the 'ARI' key maps to Utah Hockey Club / Delta Center -
# presumably a carry-over from the franchise's previous abbreviation; confirm
# which key downstream consumers expect before changing it, since it feeds the
# generated game and stadium IDs.
NHL_TEAMS = {
    'ANA': {'name': 'Anaheim Ducks', 'city': 'Anaheim', 'arena': 'Honda Center'},
    'ARI': {'name': 'Utah Hockey Club', 'city': 'Salt Lake City', 'arena': 'Delta Center'},
    'BOS': {'name': 'Boston Bruins', 'city': 'Boston', 'arena': 'TD Garden'},
    'BUF': {'name': 'Buffalo Sabres', 'city': 'Buffalo', 'arena': 'KeyBank Center'},
    'CGY': {'name': 'Calgary Flames', 'city': 'Calgary', 'arena': 'Scotiabank Saddledome'},
    'CAR': {'name': 'Carolina Hurricanes', 'city': 'Raleigh', 'arena': 'PNC Arena'},
    'CHI': {'name': 'Chicago Blackhawks', 'city': 'Chicago', 'arena': 'United Center'},
    'COL': {'name': 'Colorado Avalanche', 'city': 'Denver', 'arena': 'Ball Arena'},
    'CBJ': {'name': 'Columbus Blue Jackets', 'city': 'Columbus', 'arena': 'Nationwide Arena'},
    'DAL': {'name': 'Dallas Stars', 'city': 'Dallas', 'arena': 'American Airlines Center'},
    'DET': {'name': 'Detroit Red Wings', 'city': 'Detroit', 'arena': 'Little Caesars Arena'},
    'EDM': {'name': 'Edmonton Oilers', 'city': 'Edmonton', 'arena': 'Rogers Place'},
    'FLA': {'name': 'Florida Panthers', 'city': 'Sunrise', 'arena': 'Amerant Bank Arena'},
    'LAK': {'name': 'Los Angeles Kings', 'city': 'Los Angeles', 'arena': 'Crypto.com Arena'},
    'MIN': {'name': 'Minnesota Wild', 'city': 'St. Paul', 'arena': 'Xcel Energy Center'},
    'MTL': {'name': 'Montreal Canadiens', 'city': 'Montreal', 'arena': 'Bell Centre'},
    'NSH': {'name': 'Nashville Predators', 'city': 'Nashville', 'arena': 'Bridgestone Arena'},
    'NJD': {'name': 'New Jersey Devils', 'city': 'Newark', 'arena': 'Prudential Center'},
    'NYI': {'name': 'New York Islanders', 'city': 'Elmont', 'arena': 'UBS Arena'},
    'NYR': {'name': 'New York Rangers', 'city': 'New York', 'arena': 'Madison Square Garden'},
    'OTT': {'name': 'Ottawa Senators', 'city': 'Ottawa', 'arena': 'Canadian Tire Centre'},
    'PHI': {'name': 'Philadelphia Flyers', 'city': 'Philadelphia', 'arena': 'Wells Fargo Center'},
    'PIT': {'name': 'Pittsburgh Penguins', 'city': 'Pittsburgh', 'arena': 'PPG Paints Arena'},
    'SJS': {'name': 'San Jose Sharks', 'city': 'San Jose', 'arena': 'SAP Center'},
    'SEA': {'name': 'Seattle Kraken', 'city': 'Seattle', 'arena': 'Climate Pledge Arena'},
    'STL': {'name': 'St. Louis Blues', 'city': 'St. Louis', 'arena': 'Enterprise Center'},
    'TBL': {'name': 'Tampa Bay Lightning', 'city': 'Tampa', 'arena': 'Amalie Arena'},
    'TOR': {'name': 'Toronto Maple Leafs', 'city': 'Toronto', 'arena': 'Scotiabank Arena'},
    'VAN': {'name': 'Vancouver Canucks', 'city': 'Vancouver', 'arena': 'Rogers Arena'},
    'VGK': {'name': 'Vegas Golden Knights', 'city': 'Las Vegas', 'arena': 'T-Mobile Arena'},
    'WSH': {'name': 'Washington Capitals', 'city': 'Washington', 'arena': 'Capital One Arena'},
    'WPG': {'name': 'Winnipeg Jets', 'city': 'Winnipeg', 'arena': 'Canada Life Centre'},
}
# =============================================================================
# SCRAPERS - NBA
# =============================================================================
def scrape_nba_basketball_reference(season: int) -> list[Game]:
    """
    Scrape NBA schedule from Basketball-Reference.

    URL: https://www.basketball-reference.com/leagues/NBA_{YEAR}_games-{month}.html
    Season year is the ending year (e.g., 2025 for 2024-25 season)

    One page is requested per month, October through June. A month whose page
    cannot be fetched, or that has no ``schedule`` table, is skipped.

    Args:
        season: Ending year of the season to scrape.

    Returns:
        Games in page order. ``time`` holds the start-time cell text exactly
        as published, ``venue`` the arena cell text, and ``id`` a provisional
        value built from the date and the first three characters of each
        team name.
    """
    games = []
    months = ['october', 'november', 'december', 'january', 'february', 'march', 'april', 'may', 'june']
    season_label = f"{season-1}-{str(season)[2:]}"
    print(f"Scraping NBA {season} from Basketball-Reference...")
    for month in months:
        url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html"
        soup = fetch_page(url, 'basketball-reference.com')
        if not soup:
            continue
        table = soup.find('table', {'id': 'schedule'})
        if not table:
            continue
        tbody = table.find('tbody')
        if not tbody:
            continue
        for row in tbody.find_all('tr'):
            # Rows carrying the 'thead' class are repeated header rows.
            if 'thead' in (row.get('class') or []):
                continue
            cells = row.find_all(['td', 'th'])
            if len(cells) < 6:
                continue
            try:
                # Parse date
                date_cell = row.find('th', {'data-stat': 'date_game'})
                if not date_cell:
                    continue
                date_link = date_cell.find('a')
                date_str = date_link.text if date_link else date_cell.text
                # Parse time
                time_cell = row.find('td', {'data-stat': 'game_start_time'})
                time_str = time_cell.text.strip() if time_cell else None
                # Parse teams
                visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
                home_cell = row.find('td', {'data-stat': 'home_team_name'})
                if not visitor_cell or not home_cell:
                    continue
                visitor_link = visitor_cell.find('a')
                home_link = home_cell.find('a')
                away_team = visitor_link.text if visitor_link else visitor_cell.text
                home_team = home_link.text if home_link else home_cell.text
                # Parse arena
                arena_cell = row.find('td', {'data-stat': 'arena_name'})
                arena = arena_cell.text.strip() if arena_cell else ''
                # Convert date; rows whose date text does not match the
                # expected "Tue, Oct 22, 2024" layout are skipped.
                try:
                    parsed_date = datetime.strptime(date_str.strip(), '%a, %b %d, %Y')
                except ValueError:
                    continue
                date_formatted = parsed_date.strftime('%Y-%m-%d')
                # Generate game ID
                game_id = f"nba_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')
                game = Game(
                    id=game_id,
                    sport='NBA',
                    season=season_label,
                    date=date_formatted,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=get_team_abbrev(home_team, 'NBA'),
                    away_team_abbrev=get_team_abbrev(away_team, 'NBA'),
                    venue=arena,
                    source='basketball-reference.com'
                )
                games.append(game)
            except Exception as e:
                # Best-effort scrape: report the bad row and keep going.
                print(f" Error parsing row: {e}")
                continue
    print(f" Found {len(games)} games from Basketball-Reference")
    return games
def scrape_nba_espn(season: int) -> list[Game]:
    """
    Scrape NBA schedule from ESPN (not implemented yet).

    URL: https://www.espn.com/nba/schedule/_/date/{YYYYMMDD}

    No parsing exists for the ESPN pages, so this always returns an empty
    list. The earlier version still requested one page per week from
    October 1 of ``season - 1`` through June 30 of ``season`` and discarded
    every response; those requests are no longer issued because they cost
    minutes of rate-limited waiting and produced nothing.

    Args:
        season: Ending year of the season (e.g., 2025 for 2024-25).

    Returns:
        An empty list.
    """
    games = []
    print(f"Scraping NBA {season} from ESPN...")
    # Per the original implementation note, ESPN renders the schedule with
    # JavaScript, so a full implementation would need a browser-driven fetch
    # (e.g. Selenium) rather than a plain HTTP GET.
    print(f" Found {len(games)} games from ESPN")
    return games
# =============================================================================
# SCRAPERS - MLB
# =============================================================================
def scrape_mlb_baseball_reference(season: int) -> list[Game]:
    """
    Scrape MLB schedule from Baseball-Reference.

    URL: https://www.baseball-reference.com/leagues/majors/{YEAR}-schedule.shtml

    The page is walked in document order: each ``h3`` header that parses as a
    date sets the date for the ``p.game`` entries that follow it. The first
    two links of an entry are read as the away and home team.

    Args:
        season: Season year to scrape.

    Returns:
        Games in page order, with ``time`` set to None and ``venue`` left
        empty. Returns an empty list when the page cannot be fetched.
    """
    games = []
    url = f"https://www.baseball-reference.com/leagues/majors/{season}-schedule.shtml"
    print(f"Scraping MLB {season} from Baseball-Reference...")
    soup = fetch_page(url, 'baseball-reference.com')
    if not soup:
        return games
    # Header layouts tried in order, e.g. "Thursday, March 27, 2025".
    date_formats = ('%A, %B %d, %Y', '%B %d, %Y', '%a, %b %d, %Y')
    # Baseball-Reference groups games by date in h3 headers
    current_date = None
    # Find the schedule section; fall back to the whole document.
    schedule_div = soup.find('div', {'id': 'all_schedule'})
    if not schedule_div:
        schedule_div = soup
    # Process headers and entries in document order to track date context
    for element in schedule_div.find_all(['h3', 'p']):
        if element.name == 'h3':
            date_text = element.get_text(strip=True)
            # NOTE(review): a header that matches none of the formats leaves
            # the previous date in effect - confirm that is intended.
            for fmt in date_formats:
                try:
                    parsed = datetime.strptime(date_text, fmt)
                except ValueError:
                    continue
                current_date = parsed.strftime('%Y-%m-%d')
                break
        elif element.name == 'p' and 'game' in element.get('class', []):
            if not current_date:
                continue
            try:
                links = element.find_all('a')
                if len(links) < 2:
                    continue
                away_team = links[0].text.strip()
                home_team = links[1].text.strip()
                # Generate unique game ID
                away_abbrev = get_team_abbrev(away_team, 'MLB')
                home_abbrev = get_team_abbrev(home_team, 'MLB')
                game_id = f"mlb_br_{current_date}_{away_abbrev}_{home_abbrev}".lower()
                game = Game(
                    id=game_id,
                    sport='MLB',
                    season=str(season),
                    date=current_date,
                    time=None,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=home_abbrev,
                    away_team_abbrev=away_abbrev,
                    venue='',
                    source='baseball-reference.com'
                )
                games.append(game)
            except Exception as e:
                # Best-effort scrape: report the bad entry and keep going.
                print(f" Error parsing game entry: {e}")
                continue
    print(f" Found {len(games)} games from Baseball-Reference")
    return games
def scrape_mlb_statsapi(season: int) -> list[Game]:
    """
    Fetch MLB schedule from official Stats API (JSON).

    URL: https://statsapi.mlb.com/api/v1/schedule?sportId=1&season={YEAR}&gameType=R

    The request also asks for team and venue details (``hydrate=team,venue``)
    so names and abbreviations can be read straight from the response.

    Args:
        season: Season year to request.

    Returns:
        One Game per entry in the response, with ``id`` set to
        ``mlb_{gamePk}``. A request or decoding failure is printed and yields
        the games collected so far (normally none).
    """
    games = []
    url = f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&season={season}&gameType=R&hydrate=team,venue"
    print(f"Fetching MLB {season} from Stats API...")
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        for date_entry in data.get('dates', []):
            game_date = date_entry.get('date', '')
            for game_data in date_entry.get('games', []):
                try:
                    # "or {}" guards against keys that are present but null.
                    teams = game_data.get('teams') or {}
                    away = (teams.get('away') or {}).get('team') or {}
                    home = (teams.get('home') or {}).get('team') or {}
                    venue = game_data.get('venue') or {}
                    # Keep the HH:MM part of the "...T..." timestamp.
                    # NOTE(review): if gameDate is a UTC timestamp, this stores
                    # UTC rather than the ET that Game.time documents - confirm.
                    game_time = game_data.get('gameDate') or ''
                    if 'T' in game_time:
                        time_str = game_time.split('T')[1][:5]
                    else:
                        time_str = None
                    game = Game(
                        id=f"mlb_{game_data.get('gamePk', '')}",
                        sport='MLB',
                        season=str(season),
                        date=game_date,
                        time=time_str,
                        home_team=home.get('name', ''),
                        away_team=away.get('name', ''),
                        home_team_abbrev=home.get('abbreviation', ''),
                        away_team_abbrev=away.get('abbreviation', ''),
                        venue=venue.get('name', ''),
                        source='statsapi.mlb.com'
                    )
                    games.append(game)
                except Exception as e:
                    # Best-effort: report the malformed entry and keep going
                    # instead of dropping it without a trace.
                    print(f" Error parsing game entry: {e}")
                    continue
    except Exception as e:
        print(f" Error fetching MLB API: {e}")
    print(f" Found {len(games)} games from MLB Stats API")
    return games
# =============================================================================
# SCRAPERS - NHL
# =============================================================================
def scrape_nhl_hockey_reference(season: int) -> list[Game]:
    """
    Scrape NHL schedule from Hockey-Reference.

    URL: https://www.hockey-reference.com/leagues/NHL_{YEAR}_games.html

    Args:
        season: Ending year of the season (e.g., 2025 for 2024-25).

    Returns:
        Games in page order, with ``time`` set to None and ``venue`` left
        empty; ``id`` is a provisional value built from the date and the
        first three characters of each team name. Returns an empty list when
        the page or its ``games`` table is unavailable.
    """
    games = []
    url = f"https://www.hockey-reference.com/leagues/NHL_{season}_games.html"
    season_label = f"{season-1}-{str(season)[2:]}"
    print(f"Scraping NHL {season} from Hockey-Reference...")
    soup = fetch_page(url, 'hockey-reference.com')
    if not soup:
        return games
    table = soup.find('table', {'id': 'games'})
    if not table:
        print(" Could not find games table")
        return games
    tbody = table.find('tbody')
    if not tbody:
        return games
    for row in tbody.find_all('tr'):
        try:
            cells = row.find_all(['td', 'th'])
            if len(cells) < 5:
                continue
            # Parse date
            date_cell = row.find('th', {'data-stat': 'date_game'})
            if not date_cell:
                continue
            date_link = date_cell.find('a')
            date_str = date_link.text if date_link else date_cell.text
            # Parse teams
            visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
            home_cell = row.find('td', {'data-stat': 'home_team_name'})
            if not visitor_cell or not home_cell:
                continue
            visitor_link = visitor_cell.find('a')
            home_link = home_cell.find('a')
            away_team = visitor_link.text if visitor_link else visitor_cell.text
            home_team = home_link.text if home_link else home_cell.text
            # Validate the date; rows whose date text is not YYYY-MM-DD are
            # skipped.
            try:
                parsed_date = datetime.strptime(date_str.strip(), '%Y-%m-%d')
            except ValueError:
                continue
            date_formatted = parsed_date.strftime('%Y-%m-%d')
            game_id = f"nhl_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')
            game = Game(
                id=game_id,
                sport='NHL',
                season=season_label,
                date=date_formatted,
                time=None,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=get_team_abbrev(home_team, 'NHL'),
                away_team_abbrev=get_team_abbrev(away_team, 'NHL'),
                venue='',
                source='hockey-reference.com'
            )
            games.append(game)
        except Exception as e:
            # Best-effort scrape: report the bad row and keep going.
            print(f" Error parsing row: {e}")
            continue
    print(f" Found {len(games)} games from Hockey-Reference")
    return games
def scrape_nhl_api(season: int) -> list[Game]:
    """
    Fetch NHL schedule from official API (JSON) - placeholder.

    URL: https://api-web.nhle.com/v1/schedule/{YYYY-MM-DD}

    Nothing is requested yet: the function announces itself and hands back an
    empty list.
    """
    print(f"Fetching NHL {season} from NHL API...")
    # Still to do: the endpoint is per date (or per club), so a real
    # implementation has to iterate over dates or teams.
    collected = []
    return collected
# =============================================================================
# STADIUM SCRAPER
# =============================================================================
def scrape_stadiums_hifld() -> list[Stadium]:
    """
    Fetch stadium data from HIFLD Open Data (US Government).

    Queries the Major_Sport_Venues feature service (``f=json``, coordinates
    requested in spatial reference 4326) and keeps the NBA, MLB and NHL
    venues.

    Returns:
        One Stadium per matching feature. ``team_abbrevs`` holds the raw TEAM
        attribute (NOTE(review): this looks like it may be a team name rather
        than an abbreviation - confirm against the service). A request or
        decoding failure is printed and yields the stadiums collected so far.
    """
    stadiums = []
    url = "https://services1.arcgis.com/Hp6G80Pky0om7QvQ/arcgis/rest/services/Major_Sport_Venues/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json"
    # Leagues this scraper covers; any other league in the feed is dropped.
    supported_leagues = ('NBA', 'MLB', 'NHL')
    print("Fetching stadiums from HIFLD Open Data...")
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        for feature in data.get('features', []):
            # "or {}" keeps one feature with a null section from raising and
            # ending the whole loop through the handler below.
            attrs = feature.get('attributes') or {}
            geom = feature.get('geometry') or {}
            # Filter for NBA, MLB, NHL venues
            league = attrs.get('LEAGUE', '')
            if league not in supported_leagues:
                continue
            stadium = Stadium(
                id=f"hifld_{attrs.get('OBJECTID', '')}",
                name=attrs.get('NAME', ''),
                city=attrs.get('CITY', ''),
                state=attrs.get('STATE', ''),
                latitude=geom.get('y', 0),
                longitude=geom.get('x', 0),
                capacity=attrs.get('CAPACITY', 0) or 0,
                sport=league,
                team_abbrevs=[attrs.get('TEAM', '')],
                source='hifld.gov',
                year_opened=attrs.get('YEAR_OPEN')
            )
            stadiums.append(stadium)
    except Exception as e:
        print(f" Error fetching HIFLD data: {e}")
    print(f" Found {len(stadiums)} stadiums from HIFLD")
    return stadiums
def _manual_stadiums(sport: str, teams: dict, venue_key: str, venue_data: dict) -> list[Stadium]:
    """Build one 'manual' Stadium per team from a curated venue table."""
    defaults = (0, 0, '', 0)  # latitude, longitude, state, capacity
    built = []
    for abbrev, info in teams.items():
        venue_name = info[venue_key]
        # Entries are (lat, lon) or (lat, lon, state, capacity); missing
        # entries and missing trailing fields fall back to the defaults.
        entry = tuple(venue_data.get(venue_name, ()))
        latitude, longitude, state, capacity = entry + defaults[len(entry):]
        built.append(Stadium(
            id=f"manual_{sport.lower()}_{abbrev.lower()}",
            name=venue_name,
            city=info['city'],
            state=state,
            latitude=latitude,
            longitude=longitude,
            capacity=capacity,
            sport=sport,
            team_abbrevs=[abbrev],
            source='manual'
        ))
    return built


def generate_stadiums_from_teams() -> list[Stadium]:
    """
    Generate stadium data from team mappings with manual coordinates.
    This serves as a fallback/validation source.

    Returns:
        One Stadium per team in NBA_TEAMS, MLB_TEAMS and NHL_TEAMS, in that
        order, so a venue shared by two teams appears once per team. NBA
        records carry coordinates only (empty state, zero capacity); MLB and
        NHL records also carry state/province and capacity.
    """
    # NBA Arenas with coordinates (manually curated): (latitude, longitude)
    nba_coords = {
        'State Farm Arena': (33.7573, -84.3963),
        'TD Garden': (42.3662, -71.0621),
        'Barclays Center': (40.6826, -73.9754),
        'Spectrum Center': (35.2251, -80.8392),
        'United Center': (41.8807, -87.6742),
        'Rocket Mortgage FieldHouse': (41.4965, -81.6882),
        'American Airlines Center': (32.7905, -96.8103),
        'Ball Arena': (39.7487, -105.0077),
        'Little Caesars Arena': (42.3411, -83.0553),
        'Chase Center': (37.7680, -122.3879),
        'Toyota Center': (29.7508, -95.3621),
        'Gainbridge Fieldhouse': (39.7640, -86.1555),
        'Intuit Dome': (33.9425, -118.3419),
        'Crypto.com Arena': (34.0430, -118.2673),
        'FedExForum': (35.1382, -90.0506),
        'Kaseya Center': (25.7814, -80.1870),
        'Fiserv Forum': (43.0451, -87.9174),
        'Target Center': (44.9795, -93.2761),
        'Smoothie King Center': (29.9490, -90.0821),
        'Madison Square Garden': (40.7505, -73.9934),
        'Paycom Center': (35.4634, -97.5151),
        'Kia Center': (28.5392, -81.3839),
        'Wells Fargo Center': (39.9012, -75.1720),
        'Footprint Center': (33.4457, -112.0712),
        'Moda Center': (45.5316, -122.6668),
        'Golden 1 Center': (38.5802, -121.4997),
        'Frost Bank Center': (29.4270, -98.4375),
        'Scotiabank Arena': (43.6435, -79.3791),
        'Delta Center': (40.7683, -111.9011),
        'Capital One Arena': (38.8982, -77.0209),
    }
    # MLB Stadiums with coordinates: (latitude, longitude, state, capacity)
    mlb_coords = {
        'Chase Field': (33.4453, -112.0667, 'AZ', 48686),
        'Truist Park': (33.8907, -84.4678, 'GA', 41084),
        'Oriole Park at Camden Yards': (39.2838, -76.6218, 'MD', 45971),
        'Fenway Park': (42.3467, -71.0972, 'MA', 37755),
        'Wrigley Field': (41.9484, -87.6553, 'IL', 41649),
        'Guaranteed Rate Field': (41.8299, -87.6338, 'IL', 40615),
        'Great American Ball Park': (39.0979, -84.5082, 'OH', 42319),
        'Progressive Field': (41.4962, -81.6852, 'OH', 34830),
        'Coors Field': (39.7559, -104.9942, 'CO', 50144),
        'Comerica Park': (42.3390, -83.0485, 'MI', 41083),
        'Minute Maid Park': (29.7573, -95.3555, 'TX', 41168),
        'Kauffman Stadium': (39.0517, -94.4803, 'MO', 37903),
        'Angel Stadium': (33.8003, -117.8827, 'CA', 45517),
        'Dodger Stadium': (34.0739, -118.2400, 'CA', 56000),
        'LoanDepot Park': (25.7781, -80.2196, 'FL', 36742),
        'American Family Field': (43.0280, -87.9712, 'WI', 41900),
        'Target Field': (44.9817, -93.2776, 'MN', 38544),
        'Citi Field': (40.7571, -73.8458, 'NY', 41922),
        'Yankee Stadium': (40.8296, -73.9262, 'NY', 46537),
        'Sutter Health Park': (38.5802, -121.5097, 'CA', 14014),
        'Citizens Bank Park': (39.9061, -75.1665, 'PA', 42792),
        'PNC Park': (40.4469, -80.0057, 'PA', 38362),
        'Petco Park': (32.7076, -117.1570, 'CA', 40209),
        'Oracle Park': (37.7786, -122.3893, 'CA', 41265),
        'T-Mobile Park': (47.5914, -122.3325, 'WA', 47929),
        'Busch Stadium': (38.6226, -90.1928, 'MO', 45494),
        'Tropicana Field': (27.7682, -82.6534, 'FL', 25000),
        'Globe Life Field': (32.7473, -97.0845, 'TX', 40300),
        'Rogers Centre': (43.6414, -79.3894, 'ON', 49282),
        'Nationals Park': (38.8730, -77.0074, 'DC', 41339),
    }
    # NHL Arenas with coordinates: (latitude, longitude, state, capacity)
    nhl_coords = {
        'Honda Center': (33.8078, -117.8765, 'CA', 17174),
        'Delta Center': (40.7683, -111.9011, 'UT', 18306),
        'TD Garden': (42.3662, -71.0621, 'MA', 17565),
        'KeyBank Center': (42.8750, -78.8764, 'NY', 19070),
        'Scotiabank Saddledome': (51.0374, -114.0519, 'AB', 19289),
        'PNC Arena': (35.8034, -78.7220, 'NC', 18680),
        'United Center': (41.8807, -87.6742, 'IL', 19717),
        'Ball Arena': (39.7487, -105.0077, 'CO', 18007),
        'Nationwide Arena': (39.9693, -83.0061, 'OH', 18500),
        'American Airlines Center': (32.7905, -96.8103, 'TX', 18532),
        'Little Caesars Arena': (42.3411, -83.0553, 'MI', 19515),
        'Rogers Place': (53.5469, -113.4978, 'AB', 18347),
        'Amerant Bank Arena': (26.1584, -80.3256, 'FL', 19250),
        'Crypto.com Arena': (34.0430, -118.2673, 'CA', 18230),
        'Xcel Energy Center': (44.9448, -93.1010, 'MN', 17954),
        'Bell Centre': (45.4961, -73.5693, 'QC', 21302),
        'Bridgestone Arena': (36.1592, -86.7785, 'TN', 17159),
        'Prudential Center': (40.7334, -74.1712, 'NJ', 16514),
        'UBS Arena': (40.7161, -73.7246, 'NY', 17255),
        'Madison Square Garden': (40.7505, -73.9934, 'NY', 18006),
        'Canadian Tire Centre': (45.2969, -75.9272, 'ON', 18652),
        'Wells Fargo Center': (39.9012, -75.1720, 'PA', 19543),
        'PPG Paints Arena': (40.4395, -79.9892, 'PA', 18387),
        'SAP Center': (37.3327, -121.9010, 'CA', 17562),
        'Climate Pledge Arena': (47.6221, -122.3540, 'WA', 17100),
        'Enterprise Center': (38.6268, -90.2025, 'MO', 18096),
        'Amalie Arena': (27.9426, -82.4519, 'FL', 19092),
        'Scotiabank Arena': (43.6435, -79.3791, 'ON', 18819),
        'Rogers Arena': (49.2778, -123.1089, 'BC', 18910),
        'T-Mobile Arena': (36.1028, -115.1784, 'NV', 17500),
        'Capital One Arena': (38.8982, -77.0209, 'DC', 18573),
        'Canada Life Centre': (49.8928, -97.1436, 'MB', 15321),
    }
    stadiums = []
    stadiums.extend(_manual_stadiums('NBA', NBA_TEAMS, 'arena', nba_coords))
    stadiums.extend(_manual_stadiums('MLB', MLB_TEAMS, 'stadium', mlb_coords))
    stadiums.extend(_manual_stadiums('NHL', NHL_TEAMS, 'arena', nhl_coords))
    return stadiums
# =============================================================================
# HELPERS
# =============================================================================
def assign_stable_ids(games: list[Game], sport: str, season: str) -> list[Game]:
    """
    Assign stable IDs based on matchup + occurrence number within season.
    Format: {sport}_{season}_{away}_{home}_{num}

    Games are grouped by (away, home) abbreviation pair, each group is sorted
    by date, and the 1-based position in that order becomes ``num``. An ID
    therefore survives a reschedule as long as the game keeps its place in
    the date order of its matchup. Games of one matchup that share a date
    keep their input order (the sort is stable).

    Args:
        games: Games to relabel; each game's ``id`` is overwritten in place.
        sport: League code; lower-cased in the ID.
        season: Season label; hyphens are removed, so "2024-25" becomes
            "202425" and "2025" stays "2025".

    Returns:
        The same list object that was passed in.
    """
    from collections import defaultdict

    # Identical for every game, so build it once.
    id_prefix = f"{sport.lower()}_{season.replace('-', '')}"
    # Group games by matchup (away @ home)
    matchups = defaultdict(list)
    for game in games:
        matchups[(game.away_team_abbrev, game.home_team_abbrev)].append(game)
    # Sort each matchup by date and assign occurrence number
    for (away, home), matchup_games in matchups.items():
        matchup_games.sort(key=lambda g: g.date)
        for occurrence, game in enumerate(matchup_games, 1):
            game.id = f"{id_prefix}_{away.lower()}_{home.lower()}_{occurrence}"
    return games
def get_team_abbrev(team_name: str, sport: str) -> str:
    """
    Get team abbreviation from full name.

    Matching ignores case and surrounding whitespace. An exact match on the
    full team name always wins; only when none exists is the first team whose
    name contains *team_name* used.

    Args:
        team_name: Team name as scraped.
        sport: 'NBA', 'MLB' or 'NHL'; any other value matches no team.

    Returns:
        The abbreviation key of the matched team, otherwise the first three
        characters of the trimmed name in upper case (empty for a blank name).
    """
    teams = {'NBA': NBA_TEAMS, 'MLB': MLB_TEAMS, 'NHL': NHL_TEAMS}.get(sport, {})
    trimmed = team_name.strip()
    wanted = trimmed.lower()
    # A blank name must not reach the substring test: '' is contained in
    # every string and would match the first team in the table.
    if wanted:
        for abbrev, info in teams.items():
            if info['name'].lower() == wanted:
                return abbrev
        # Partial names run as a second pass so they cannot pre-empt an
        # exact match that sits later in the table.
        for abbrev, info in teams.items():
            if wanted in info['name'].lower():
                return abbrev
    # Return first 3 letters as fallback
    return trimmed[:3].upper()
def validate_games(games_by_source: dict) -> dict:
    """
    Cross-validate games from multiple sources.

    The first source in *games_by_source* is the primary; every other source
    is compared against it. Games are paired by ``id``, so the sources must
    already share an ID scheme for the comparison to be meaningful.

    Args:
        games_by_source: Maps a source name to its list of games.

    Returns:
        A dict of discrepancy lists:
        ``missing_in_source`` - primary games absent from a secondary source,
        as ``{'game_id', 'present_in', 'missing_in'}``;
        ``date_mismatch`` / ``time_mismatch`` / ``venue_mismatch`` - paired
        games whose field differs, as ``{'game_id', 'primary_source',
        'primary_value', 'secondary_source', 'secondary_value'}``. A field is
        compared only when both sources supply a non-empty value.
        All lists are empty when fewer than two sources are given.
    """
    discrepancies = {
        'missing_in_source': [],
        'date_mismatch': [],
        'time_mismatch': [],
        'venue_mismatch': [],
    }
    sources = list(games_by_source.keys())
    if len(sources) < 2:
        return discrepancies
    # (Game attribute, discrepancy bucket) pairs checked for paired games.
    compared_fields = (
        ('date', 'date_mismatch'),
        ('time', 'time_mismatch'),
        ('venue', 'venue_mismatch'),
    )
    primary = sources[0]
    primary_games = {g.id: g for g in games_by_source[primary]}
    for source in sources[1:]:
        secondary_games = {g.id: g for g in games_by_source[source]}
        for game_id, game in primary_games.items():
            if game_id not in secondary_games:
                discrepancies['missing_in_source'].append({
                    'game_id': game_id,
                    'present_in': primary,
                    'missing_in': source
                })
                continue
            other = secondary_games[game_id]
            for attr, bucket in compared_fields:
                primary_value = getattr(game, attr)
                secondary_value = getattr(other, attr)
                # Sources that omit a field leave it empty or None; that is
                # missing data, not a disagreement.
                if primary_value and secondary_value and primary_value != secondary_value:
                    discrepancies[bucket].append({
                        'game_id': game_id,
                        'primary_source': primary,
                        'primary_value': primary_value,
                        'secondary_source': source,
                        'secondary_value': secondary_value
                    })
    return discrepancies
def export_to_json(games: list[Game], stadiums: list[Stadium], output_dir: Path):
    """
    Write the scraped records to *output_dir*.

    ``games.json`` and ``stadiums.json`` are always written (an empty list
    becomes ``[]``); ``games.csv`` / ``stadiums.csv`` are written only for a
    non-empty list. The directory is created if it does not exist.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    datasets = (('games', games), ('stadiums', stadiums))
    serialised = []
    # JSON first, one file per dataset
    for stem, records in datasets:
        rows = [asdict(record) for record in records]
        with open(output_dir / f'{stem}.json', 'w') as handle:
            json.dump(rows, handle, indent=2)
        serialised.append((stem, rows))
    # CSV copies for easy viewing, skipped for empty datasets
    for stem, rows in serialised:
        if not rows:
            continue
        pd.DataFrame(rows).to_csv(output_dir / f'{stem}.csv', index=False)
    print(f"\nExported to {output_dir}")
# =============================================================================
# MAIN
# =============================================================================
def main():
    """
    Command-line entry point.

    Collects stadium data, then (unless --stadiums-only is given) scrapes the
    schedules selected by --sport for --season, exports everything to the
    --output directory and prints a per-sport summary.
    """
    parser = argparse.ArgumentParser(description='Scrape sports schedules')
    parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'all'], default='all')
    parser.add_argument('--season', type=int, default=2025, help='Season year (ending year)')
    parser.add_argument('--stadiums-only', action='store_true', help='Only scrape stadium data')
    parser.add_argument('--output', type=str, default='./data', help='Output directory')
    args = parser.parse_args()

    def banner(title):
        # Section header framed by two 60-character rules.
        print("\n" + "="*60)
        print(title)
        print("="*60)

    def selected(code):
        return args.sport in [code, 'all']

    destination = Path(args.output)
    year = args.season
    # Label for leagues whose season spans two years, e.g., "2024-25"
    split_label = f"{year-1}-{str(year)[2:]}"
    schedule = []

    # Stadiums: fetched records first, then the hand-curated ones
    banner("SCRAPING STADIUMS")
    venues = []
    venues.extend(scrape_stadiums_hifld())
    venues.extend(generate_stadiums_from_teams())
    if args.stadiums_only:
        export_to_json([], venues, destination)
        return

    # Schedules
    if selected('nba'):
        banner(f"SCRAPING NBA {year}")
        nba_games = scrape_nba_basketball_reference(year)
        schedule.extend(assign_stable_ids(nba_games, 'NBA', split_label))
    if selected('mlb'):
        banner(f"SCRAPING MLB {year}")
        # IDs come from the API's own gamePk, so they are left untouched
        schedule.extend(scrape_mlb_statsapi(year))
    if selected('nhl'):
        banner(f"SCRAPING NHL {year}")
        nhl_games = scrape_nhl_hockey_reference(year)
        schedule.extend(assign_stable_ids(nhl_games, 'NHL', split_label))

    # Export
    banner("EXPORTING DATA")
    export_to_json(schedule, venues, destination)

    # Summary
    banner("SUMMARY")
    print(f"Total games scraped: {len(schedule)}")
    print(f"Total stadiums: {len(venues)}")
    # Games by sport, in order of first appearance
    tally = {}
    for entry in schedule:
        tally[entry.sport] = tally.setdefault(entry.sport, 0) + 1
    for league, total in tally.items():
        print(f" {league}: {total} games")


if __name__ == '__main__':
    main()