- Three-scenario planning engine (A: date range, B: selected games, C: directional routes) - GeographicRouteExplorer with anchor game support for route exploration - Shared ItineraryBuilder for travel segment calculation - TravelEstimator for driving time/distance estimation - SwiftUI views for trip creation and detail display - CloudKit integration for schedule data - Python scraping scripts for sports schedules 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
971 lines
38 KiB
Python
971 lines
38 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Sports Schedule Scraper for SportsTime App
|
|
Scrapes NBA, MLB, NHL schedules from multiple sources for cross-validation.
|
|
|
|
Usage:
|
|
python scrape_schedules.py --sport nba --season 2025
|
|
python scrape_schedules.py --sport all --season 2025
|
|
python scrape_schedules.py --stadiums-only
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import time
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from dataclasses import dataclass, asdict
|
|
from typing import Optional
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import pandas as pd
|
|
|
|
# Rate limiting
REQUEST_DELAY = 3.0  # seconds between requests to same domain
last_request_time = {}  # domain -> time.monotonic() reading taken after the latest request


def rate_limit(domain: str) -> None:
    """Enforce rate limiting per domain.

    Sleeps just long enough that at least REQUEST_DELAY seconds separate two
    consecutive calls for the same ``domain``, then records the current
    timestamp for that domain. The first call for a domain never sleeps.

    Args:
        domain: Key used to group requests (e.g. 'espn.com').
    """
    previous = last_request_time.get(domain)
    if previous is not None:
        # A monotonic clock is used so that wall-clock adjustments (NTP, DST)
        # can never produce a negative elapsed time and an over-long sleep.
        remaining = REQUEST_DELAY - (time.monotonic() - previous)
        if remaining > 0:
            time.sleep(remaining)
    last_request_time[domain] = time.monotonic()
|
|
|
|
|
|
def fetch_page(url: str, domain: str) -> Optional[BeautifulSoup]:
    """Download ``url`` and return it parsed, honouring the per-domain delay.

    Args:
        url: Page to request.
        domain: Rate-limit bucket passed to ``rate_limit`` before the request.

    Returns:
        The page parsed with the 'html.parser' backend, or None when the
        request or parsing raised (the error is printed, not re-raised).
    """
    rate_limit(domain)

    # A browser-like User-Agent is sent with every request.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }

    try:
        reply = requests.get(url, headers=browser_headers, timeout=30)
        reply.raise_for_status()
        parsed_page = BeautifulSoup(reply.content, 'html.parser')
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None
    return parsed_page
|
|
|
|
|
|
# =============================================================================
|
|
# DATA CLASSES
|
|
# =============================================================================
|
|
|
|
@dataclass
class Game:
    """One scheduled game as reported by a single source."""

    # Identity
    id: str
    sport: str
    season: str

    # Scheduling
    date: str  # YYYY-MM-DD
    time: Optional[str]  # HH:MM (24hr, ET)

    # Participants
    home_team: str
    away_team: str
    home_team_abbrev: str
    away_team_abbrev: str

    # Location and provenance
    venue: str
    source: str

    # Optional extras
    is_playoff: bool = False
    broadcast: Optional[str] = None
|
|
|
|
|
|
@dataclass
class Stadium:
    """One venue record, either fetched from a data source or hand-curated."""

    # Identity
    id: str
    name: str

    # Location
    city: str
    state: str
    latitude: float
    longitude: float

    # Venue details
    capacity: int
    sport: str
    team_abbrevs: list

    # Provenance
    source: str

    # Optional extras
    year_opened: Optional[int] = None
|
|
|
|
|
|
# =============================================================================
|
|
# TEAM MAPPINGS
|
|
# =============================================================================
|
|
|
|
# NBA teams keyed by abbreviation; each value holds 'name', 'city' and 'arena'.
NBA_TEAMS = {
    abbrev: {'name': name, 'city': city, 'arena': arena}
    for abbrev, name, city, arena in (
        ('ATL', 'Atlanta Hawks', 'Atlanta', 'State Farm Arena'),
        ('BOS', 'Boston Celtics', 'Boston', 'TD Garden'),
        ('BRK', 'Brooklyn Nets', 'Brooklyn', 'Barclays Center'),
        ('CHO', 'Charlotte Hornets', 'Charlotte', 'Spectrum Center'),
        ('CHI', 'Chicago Bulls', 'Chicago', 'United Center'),
        ('CLE', 'Cleveland Cavaliers', 'Cleveland', 'Rocket Mortgage FieldHouse'),
        ('DAL', 'Dallas Mavericks', 'Dallas', 'American Airlines Center'),
        ('DEN', 'Denver Nuggets', 'Denver', 'Ball Arena'),
        ('DET', 'Detroit Pistons', 'Detroit', 'Little Caesars Arena'),
        ('GSW', 'Golden State Warriors', 'San Francisco', 'Chase Center'),
        ('HOU', 'Houston Rockets', 'Houston', 'Toyota Center'),
        ('IND', 'Indiana Pacers', 'Indianapolis', 'Gainbridge Fieldhouse'),
        ('LAC', 'Los Angeles Clippers', 'Inglewood', 'Intuit Dome'),
        ('LAL', 'Los Angeles Lakers', 'Los Angeles', 'Crypto.com Arena'),
        ('MEM', 'Memphis Grizzlies', 'Memphis', 'FedExForum'),
        ('MIA', 'Miami Heat', 'Miami', 'Kaseya Center'),
        ('MIL', 'Milwaukee Bucks', 'Milwaukee', 'Fiserv Forum'),
        ('MIN', 'Minnesota Timberwolves', 'Minneapolis', 'Target Center'),
        ('NOP', 'New Orleans Pelicans', 'New Orleans', 'Smoothie King Center'),
        ('NYK', 'New York Knicks', 'New York', 'Madison Square Garden'),
        ('OKC', 'Oklahoma City Thunder', 'Oklahoma City', 'Paycom Center'),
        ('ORL', 'Orlando Magic', 'Orlando', 'Kia Center'),
        ('PHI', 'Philadelphia 76ers', 'Philadelphia', 'Wells Fargo Center'),
        ('PHO', 'Phoenix Suns', 'Phoenix', 'Footprint Center'),
        ('POR', 'Portland Trail Blazers', 'Portland', 'Moda Center'),
        ('SAC', 'Sacramento Kings', 'Sacramento', 'Golden 1 Center'),
        ('SAS', 'San Antonio Spurs', 'San Antonio', 'Frost Bank Center'),
        ('TOR', 'Toronto Raptors', 'Toronto', 'Scotiabank Arena'),
        ('UTA', 'Utah Jazz', 'Salt Lake City', 'Delta Center'),
        ('WAS', 'Washington Wizards', 'Washington', 'Capital One Arena'),
    )
}
|
|
|
|
# MLB teams keyed by abbreviation; each value holds 'name', 'city' and 'stadium'.
MLB_TEAMS = {
    abbrev: {'name': name, 'city': city, 'stadium': stadium}
    for abbrev, name, city, stadium in (
        ('ARI', 'Arizona Diamondbacks', 'Phoenix', 'Chase Field'),
        ('ATL', 'Atlanta Braves', 'Atlanta', 'Truist Park'),
        ('BAL', 'Baltimore Orioles', 'Baltimore', 'Oriole Park at Camden Yards'),
        ('BOS', 'Boston Red Sox', 'Boston', 'Fenway Park'),
        ('CHC', 'Chicago Cubs', 'Chicago', 'Wrigley Field'),
        ('CHW', 'Chicago White Sox', 'Chicago', 'Guaranteed Rate Field'),
        ('CIN', 'Cincinnati Reds', 'Cincinnati', 'Great American Ball Park'),
        ('CLE', 'Cleveland Guardians', 'Cleveland', 'Progressive Field'),
        ('COL', 'Colorado Rockies', 'Denver', 'Coors Field'),
        ('DET', 'Detroit Tigers', 'Detroit', 'Comerica Park'),
        ('HOU', 'Houston Astros', 'Houston', 'Minute Maid Park'),
        ('KCR', 'Kansas City Royals', 'Kansas City', 'Kauffman Stadium'),
        ('LAA', 'Los Angeles Angels', 'Anaheim', 'Angel Stadium'),
        ('LAD', 'Los Angeles Dodgers', 'Los Angeles', 'Dodger Stadium'),
        ('MIA', 'Miami Marlins', 'Miami', 'LoanDepot Park'),
        ('MIL', 'Milwaukee Brewers', 'Milwaukee', 'American Family Field'),
        ('MIN', 'Minnesota Twins', 'Minneapolis', 'Target Field'),
        ('NYM', 'New York Mets', 'New York', 'Citi Field'),
        ('NYY', 'New York Yankees', 'New York', 'Yankee Stadium'),
        ('OAK', 'Oakland Athletics', 'Sacramento', 'Sutter Health Park'),
        ('PHI', 'Philadelphia Phillies', 'Philadelphia', 'Citizens Bank Park'),
        ('PIT', 'Pittsburgh Pirates', 'Pittsburgh', 'PNC Park'),
        ('SDP', 'San Diego Padres', 'San Diego', 'Petco Park'),
        ('SFG', 'San Francisco Giants', 'San Francisco', 'Oracle Park'),
        ('SEA', 'Seattle Mariners', 'Seattle', 'T-Mobile Park'),
        ('STL', 'St. Louis Cardinals', 'St. Louis', 'Busch Stadium'),
        ('TBR', 'Tampa Bay Rays', 'St. Petersburg', 'Tropicana Field'),
        ('TEX', 'Texas Rangers', 'Arlington', 'Globe Life Field'),
        ('TOR', 'Toronto Blue Jays', 'Toronto', 'Rogers Centre'),
        ('WSN', 'Washington Nationals', 'Washington', 'Nationals Park'),
    )
}
|
|
|
|
# NHL teams keyed by abbreviation; each value holds 'name', 'city' and 'arena'.
NHL_TEAMS = {
    abbrev: {'name': name, 'city': city, 'arena': arena}
    for abbrev, name, city, arena in (
        ('ANA', 'Anaheim Ducks', 'Anaheim', 'Honda Center'),
        ('ARI', 'Utah Hockey Club', 'Salt Lake City', 'Delta Center'),
        ('BOS', 'Boston Bruins', 'Boston', 'TD Garden'),
        ('BUF', 'Buffalo Sabres', 'Buffalo', 'KeyBank Center'),
        ('CGY', 'Calgary Flames', 'Calgary', 'Scotiabank Saddledome'),
        ('CAR', 'Carolina Hurricanes', 'Raleigh', 'PNC Arena'),
        ('CHI', 'Chicago Blackhawks', 'Chicago', 'United Center'),
        ('COL', 'Colorado Avalanche', 'Denver', 'Ball Arena'),
        ('CBJ', 'Columbus Blue Jackets', 'Columbus', 'Nationwide Arena'),
        ('DAL', 'Dallas Stars', 'Dallas', 'American Airlines Center'),
        ('DET', 'Detroit Red Wings', 'Detroit', 'Little Caesars Arena'),
        ('EDM', 'Edmonton Oilers', 'Edmonton', 'Rogers Place'),
        ('FLA', 'Florida Panthers', 'Sunrise', 'Amerant Bank Arena'),
        ('LAK', 'Los Angeles Kings', 'Los Angeles', 'Crypto.com Arena'),
        ('MIN', 'Minnesota Wild', 'St. Paul', 'Xcel Energy Center'),
        ('MTL', 'Montreal Canadiens', 'Montreal', 'Bell Centre'),
        ('NSH', 'Nashville Predators', 'Nashville', 'Bridgestone Arena'),
        ('NJD', 'New Jersey Devils', 'Newark', 'Prudential Center'),
        ('NYI', 'New York Islanders', 'Elmont', 'UBS Arena'),
        ('NYR', 'New York Rangers', 'New York', 'Madison Square Garden'),
        ('OTT', 'Ottawa Senators', 'Ottawa', 'Canadian Tire Centre'),
        ('PHI', 'Philadelphia Flyers', 'Philadelphia', 'Wells Fargo Center'),
        ('PIT', 'Pittsburgh Penguins', 'Pittsburgh', 'PPG Paints Arena'),
        ('SJS', 'San Jose Sharks', 'San Jose', 'SAP Center'),
        ('SEA', 'Seattle Kraken', 'Seattle', 'Climate Pledge Arena'),
        ('STL', 'St. Louis Blues', 'St. Louis', 'Enterprise Center'),
        ('TBL', 'Tampa Bay Lightning', 'Tampa', 'Amalie Arena'),
        ('TOR', 'Toronto Maple Leafs', 'Toronto', 'Scotiabank Arena'),
        ('VAN', 'Vancouver Canucks', 'Vancouver', 'Rogers Arena'),
        ('VGK', 'Vegas Golden Knights', 'Las Vegas', 'T-Mobile Arena'),
        ('WSH', 'Washington Capitals', 'Washington', 'Capital One Arena'),
        ('WPG', 'Winnipeg Jets', 'Winnipeg', 'Canada Life Centre'),
    )
}
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPERS - NBA
|
|
# =============================================================================
|
|
|
|
def scrape_nba_basketball_reference(season: int) -> list[Game]:
    """
    Scrape NBA schedule from Basketball-Reference.

    URL: https://www.basketball-reference.com/leagues/NBA_{YEAR}_games-{month}.html
    Season year is the ending year (e.g., 2025 for 2024-25 season)

    One page per month (October through June) is requested through
    ``fetch_page``; months whose page or schedule table is missing are skipped.

    Args:
        season: Ending year of the season.

    Returns:
        Every game that could be parsed, in page order. Rows whose date does
        not match '%a, %b %d, %Y' are skipped silently; any other row-level
        error is printed and the row is skipped.
    """
    games = []
    months = ['october', 'november', 'december', 'january', 'february', 'march', 'april', 'may', 'june']
    # Same label for every game, so build it once (e.g. 2025 -> "2024-25").
    season_label = f"{season-1}-{str(season)[2:]}"

    print(f"Scraping NBA {season} from Basketball-Reference...")

    for month in months:
        url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html"
        soup = fetch_page(url, 'basketball-reference.com')

        if not soup:
            continue

        table = soup.find('table', {'id': 'schedule'})
        if not table:
            continue

        tbody = table.find('tbody')
        if not tbody:
            continue

        for row in tbody.find_all('tr'):
            # Rows carrying the 'thead' class are skipped.
            if row.get('class') and 'thead' in row.get('class'):
                continue

            cells = row.find_all(['td', 'th'])
            if len(cells) < 6:
                continue

            try:
                # Parse date
                date_cell = row.find('th', {'data-stat': 'date_game'})
                if not date_cell:
                    continue
                date_link = date_cell.find('a')
                date_str = date_link.text if date_link else date_cell.text

                # Parse time
                time_cell = row.find('td', {'data-stat': 'game_start_time'})
                time_str = time_cell.text.strip() if time_cell else None

                # Parse teams
                visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
                home_cell = row.find('td', {'data-stat': 'home_team_name'})

                if not visitor_cell or not home_cell:
                    continue

                visitor_link = visitor_cell.find('a')
                home_link = home_cell.find('a')

                away_team = visitor_link.text if visitor_link else visitor_cell.text
                home_team = home_link.text if home_link else home_cell.text

                # Parse arena
                arena_cell = row.find('td', {'data-stat': 'arena_name'})
                arena = arena_cell.text.strip() if arena_cell else ''

                # Convert date; only a format mismatch is an expected failure,
                # so nothing broader than ValueError is caught here.
                try:
                    parsed_date = datetime.strptime(date_str.strip(), '%a, %b %d, %Y')
                except ValueError:
                    continue
                date_formatted = parsed_date.strftime('%Y-%m-%d')

                # Generate game ID
                # NOTE(review): the ID uses the first three characters of each
                # team name, so two games on one date can share an ID (e.g.
                # "Los ..." at "New ..."). Left unchanged because the ID format
                # may be consumed elsewhere - confirm before changing.
                game_id = f"nba_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

                game = Game(
                    id=game_id,
                    sport='NBA',
                    season=season_label,
                    date=date_formatted,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=get_team_abbrev(home_team, 'NBA'),
                    away_team_abbrev=get_team_abbrev(away_team, 'NBA'),
                    venue=arena,
                    source='basketball-reference.com'
                )
                games.append(game)

            except Exception as e:
                # Best-effort scrape: report the bad row and keep going.
                print(f" Error parsing row: {e}")
                continue

    print(f" Found {len(games)} games from Basketball-Reference")
    return games
|
|
|
|
|
|
def scrape_nba_espn(season: int) -> list[Game]:
    """
    Scrape NBA schedule from ESPN.

    URL: https://www.espn.com/nba/schedule/_/date/{YYYYMMDD}

    Currently a placeholder: one schedule page per week between October 1 of
    the previous year and June 30 of ``season`` is requested, but nothing is
    extracted from the pages, so the returned list is always empty.
    """
    games = []
    print(f"Scraping NBA {season} from ESPN...")

    # Season window: October of the previous year through June of the season year
    first_day = datetime(season - 1, 10, 1)
    last_day = datetime(season, 6, 30)
    one_week = timedelta(days=7)  # Sample weekly to respect rate limits

    day = first_day
    while day <= last_day:
        url = f"https://www.espn.com/nba/schedule/_/date/{day:%Y%m%d}"

        # ESPN uses JavaScript rendering, so we need to parse what's available
        # This is a simplified version - full implementation would need Selenium
        # The page is requested but its content is not used yet.
        fetch_page(url, 'espn.com')

        day += one_week

    print(f" Found {len(games)} games from ESPN")
    return games
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPERS - MLB
|
|
# =============================================================================
|
|
|
|
def scrape_mlb_baseball_reference(season: int) -> list[Game]:
    """
    Scrape MLB schedule from Baseball-Reference.

    URL: https://www.baseball-reference.com/leagues/majors/{YEAR}-schedule.shtml

    The page is walked in document order: each ``<h3>`` sets the date for the
    ``<p class="game">`` entries that follow it. The first two links inside a
    game entry are read as away team and home team.

    Args:
        season: Season year used in the URL and stored on every game.

    Returns:
        Parsed games (time is None and venue is '' for this source). An empty
        list when the page could not be fetched.
    """
    games = []
    url = f"https://www.baseball-reference.com/leagues/majors/{season}-schedule.shtml"

    print(f"Scraping MLB {season} from Baseball-Reference...")
    soup = fetch_page(url, 'baseball-reference.com')

    if not soup:
        return games

    # Baseball-Reference groups games by date in h3 headers
    current_date = None
    # Header formats tried in order, e.g. "Thursday, March 27, 2025"
    date_formats = ('%A, %B %d, %Y', '%B %d, %Y', '%a, %b %d, %Y')

    # Find the schedule section
    schedule_div = soup.find('div', {'id': 'all_schedule'})
    if not schedule_div:
        schedule_div = soup

    # Process all elements to track date context
    for element in schedule_div.find_all(['h3', 'p', 'div']):
        # Check for date header
        if element.name == 'h3':
            date_text = element.get_text(strip=True)
            # Forget the previous date first: if this header is not a
            # recognisable date, the games below it must be skipped rather
            # than filed under the preceding header's date.
            current_date = None
            for fmt in date_formats:
                try:
                    parsed = datetime.strptime(date_text, fmt)
                except ValueError:
                    continue
                current_date = parsed.strftime('%Y-%m-%d')
                break

        # Check for game entries
        elif element.name == 'p' and 'game' in element.get('class', []):
            if not current_date:
                continue

            try:
                links = element.find_all('a')
                if len(links) >= 2:
                    away_team = links[0].text.strip()
                    home_team = links[1].text.strip()

                    # Generate unique game ID
                    away_abbrev = get_team_abbrev(away_team, 'MLB')
                    home_abbrev = get_team_abbrev(home_team, 'MLB')
                    game_id = f"mlb_br_{current_date}_{away_abbrev}_{home_abbrev}".lower()

                    game = Game(
                        id=game_id,
                        sport='MLB',
                        season=str(season),
                        date=current_date,
                        time=None,
                        home_team=home_team,
                        away_team=away_team,
                        home_team_abbrev=home_abbrev,
                        away_team_abbrev=away_abbrev,
                        venue='',
                        source='baseball-reference.com'
                    )
                    games.append(game)

            except Exception as e:
                # Best-effort scrape: report the bad entry instead of hiding it.
                print(f" Error parsing game entry: {e}")
                continue

    print(f" Found {len(games)} games from Baseball-Reference")
    return games
|
|
|
|
|
|
def _utc_iso_to_eastern_hhmm(game_time: str) -> Optional[str]:
    """Return the time of an ISO-8601 timestamp as 'HH:MM' in US Eastern time.

    Returns None when ``game_time`` has no 'T' separator. If the timestamp
    cannot be parsed or the time-zone database is unavailable, the raw
    'HH:MM' slice of the input is returned unchanged.
    """
    if 'T' not in game_time:
        return None

    raw_hhmm = game_time.split('T')[1][:5]
    try:
        from datetime import timezone
        from zoneinfo import ZoneInfo

        # fromisoformat() only accepts a trailing 'Z' on newer Python versions.
        moment = datetime.fromisoformat(game_time.replace('Z', '+00:00'))
        if moment.tzinfo is None:
            # NOTE(review): timestamps without an offset are assumed to be UTC.
            moment = moment.replace(tzinfo=timezone.utc)
        return moment.astimezone(ZoneInfo('America/New_York')).strftime('%H:%M')
    except (ImportError, ValueError, KeyError):
        # KeyError covers a missing 'America/New_York' zone (no tz database).
        return raw_hhmm


def scrape_mlb_statsapi(season: int) -> list[Game]:
    """
    Fetch MLB schedule from official Stats API (JSON).

    URL: https://statsapi.mlb.com/api/v1/schedule?sportId=1&season={YEAR}&gameType=R

    Args:
        season: Season year passed to the API and stored on every game.

    Returns:
        Parsed regular-season games. ``Game.time`` is converted from the
        API's ``gameDate`` timestamp to US Eastern 'HH:MM', matching the
        convention noted on the ``Game.time`` field. A request failure is
        printed and yields whatever was parsed so far (normally an empty
        list); a failure on a single game is printed and that game skipped.
    """
    games = []
    url = f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&season={season}&gameType=R&hydrate=team,venue"

    print(f"Fetching MLB {season} from Stats API...")

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()

        for date_entry in data.get('dates', []):
            game_date = date_entry.get('date', '')

            for game_data in date_entry.get('games', []):
                try:
                    teams = game_data.get('teams', {})
                    away = teams.get('away', {}).get('team', {})
                    home = teams.get('home', {}).get('team', {})
                    venue = game_data.get('venue', {})

                    # gameDate is an ISO timestamp (UTC); Game.time is ET.
                    time_str = _utc_iso_to_eastern_hhmm(game_data.get('gameDate', ''))

                    game = Game(
                        id=f"mlb_{game_data.get('gamePk', '')}",
                        sport='MLB',
                        season=str(season),
                        date=game_date,
                        time=time_str,
                        home_team=home.get('name', ''),
                        away_team=away.get('name', ''),
                        home_team_abbrev=home.get('abbreviation', ''),
                        away_team_abbrev=away.get('abbreviation', ''),
                        venue=venue.get('name', ''),
                        source='statsapi.mlb.com'
                    )
                    games.append(game)

                except Exception as e:
                    # Best-effort: report the malformed game and keep going.
                    print(f" Error parsing game: {e}")
                    continue

    except Exception as e:
        print(f" Error fetching MLB API: {e}")

    print(f" Found {len(games)} games from MLB Stats API")
    return games
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPERS - NHL
|
|
# =============================================================================
|
|
|
|
def scrape_nhl_hockey_reference(season: int) -> list[Game]:
    """
    Scrape NHL schedule from Hockey-Reference.

    URL: https://www.hockey-reference.com/leagues/NHL_{YEAR}_games.html

    Args:
        season: Season year used in the URL; the stored season label is
            "{season-1}-{last two digits of season}".

    Returns:
        Parsed games (time is None and venue is '' for this source). An empty
        list when the page, the 'games' table or its body is missing. Rows
        whose date is not 'YYYY-MM-DD' are skipped silently; any other
        row-level error is printed and the row is skipped.
    """
    games = []
    url = f"https://www.hockey-reference.com/leagues/NHL_{season}_games.html"
    # Same label for every game, so build it once (e.g. 2025 -> "2024-25").
    season_label = f"{season-1}-{str(season)[2:]}"

    print(f"Scraping NHL {season} from Hockey-Reference...")
    soup = fetch_page(url, 'hockey-reference.com')

    if not soup:
        return games

    table = soup.find('table', {'id': 'games'})
    if not table:
        print(" Could not find games table")
        return games

    tbody = table.find('tbody')
    if not tbody:
        return games

    for row in tbody.find_all('tr'):
        try:
            cells = row.find_all(['td', 'th'])
            if len(cells) < 5:
                continue

            # Parse date
            date_cell = row.find('th', {'data-stat': 'date_game'})
            if not date_cell:
                continue
            date_link = date_cell.find('a')
            date_str = date_link.text if date_link else date_cell.text

            # Parse teams
            visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
            home_cell = row.find('td', {'data-stat': 'home_team_name'})

            if not visitor_cell or not home_cell:
                continue

            visitor_link = visitor_cell.find('a')
            home_link = home_cell.find('a')

            away_team = visitor_link.text if visitor_link else visitor_cell.text
            home_team = home_link.text if home_link else home_cell.text

            # Convert date; only a format mismatch is an expected failure,
            # so nothing broader than ValueError is caught here.
            try:
                parsed_date = datetime.strptime(date_str.strip(), '%Y-%m-%d')
            except ValueError:
                continue
            date_formatted = parsed_date.strftime('%Y-%m-%d')

            # NOTE(review): the ID uses the first three characters of each
            # team name, so two games on one date can share an ID (e.g.
            # "Col..." at "New ..."). Left unchanged because the ID format may
            # be consumed elsewhere - confirm before changing.
            game_id = f"nhl_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

            game = Game(
                id=game_id,
                sport='NHL',
                season=season_label,
                date=date_formatted,
                time=None,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=get_team_abbrev(home_team, 'NHL'),
                away_team_abbrev=get_team_abbrev(away_team, 'NHL'),
                venue='',
                source='hockey-reference.com'
            )
            games.append(game)

        except Exception as e:
            # Best-effort scrape: report the bad row instead of hiding it.
            print(f" Error parsing row: {e}")
            continue

    print(f" Found {len(games)} games from Hockey-Reference")
    return games
|
|
|
|
|
|
def scrape_nhl_api(season: int) -> list[Game]:
    """
    Fetch NHL schedule from official API (JSON).

    URL: https://api-web.nhle.com/v1/schedule/{YYYY-MM-DD}

    Placeholder: announces the fetch and returns a new empty list; no request
    is made yet.
    """
    print(f"Fetching NHL {season} from NHL API...")

    # NHL API provides club schedules
    # We'd need to iterate through dates or teams
    # Simplified implementation here
    return []
|
|
|
|
|
|
# =============================================================================
|
|
# STADIUM SCRAPER
|
|
# =============================================================================
|
|
|
|
def scrape_stadiums_hifld() -> list[Stadium]:
    """
    Fetch stadium data from HIFLD Open Data (US Government).

    Queries the Major_Sport_Venues feature service as JSON and keeps only
    features whose LEAGUE attribute is NBA, MLB or NHL.

    Returns:
        One Stadium per kept feature, with latitude/longitude taken from the
        feature geometry's 'y'/'x' values (0 when absent). A request or
        decoding failure is printed and yields whatever was collected so far.
    """
    stadiums = []
    url = "https://services1.arcgis.com/Hp6G80Pky0om7QvQ/arcgis/rest/services/Major_Sport_Venues/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json"
    # Filter for NBA, MLB, NHL venues (every other league is dropped)
    supported_leagues = ('NBA', 'MLB', 'NHL')

    print("Fetching stadiums from HIFLD Open Data...")

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()

        for feature in data.get('features', []):
            # "or {}" also covers keys that are present with a null value,
            # which would otherwise raise and abort the whole loop.
            attrs = feature.get('attributes') or {}
            geom = feature.get('geometry') or {}

            league = attrs.get('LEAGUE', '')
            if league not in supported_leagues:
                continue

            stadium = Stadium(
                id=f"hifld_{attrs.get('OBJECTID', '')}",
                name=attrs.get('NAME', ''),
                city=attrs.get('CITY', ''),
                state=attrs.get('STATE', ''),
                latitude=geom.get('y', 0),
                longitude=geom.get('x', 0),
                capacity=attrs.get('CAPACITY', 0) or 0,
                sport=league,
                team_abbrevs=[attrs.get('TEAM', '')],
                source='hifld.gov',
                year_opened=attrs.get('YEAR_OPEN')
            )
            stadiums.append(stadium)

    except Exception as e:
        print(f" Error fetching HIFLD data: {e}")

    print(f" Found {len(stadiums)} stadiums from HIFLD")
    return stadiums
|
|
|
|
|
|
def generate_stadiums_from_teams() -> list[Stadium]:
    """
    Generate stadium data from team mappings with manual coordinates.

    This serves as a fallback/validation source.

    Returns:
        One Stadium per team, NBA first, then MLB, then NHL, each in the order
        of its team mapping. A venue missing from the coordinate tables gets
        latitude/longitude 0, an empty state and capacity 0. NBA entries
        always have an empty state and capacity 0 (the NBA table only holds
        coordinates).
    """
    # NBA Arenas with coordinates (manually curated): name -> (lat, lon)
    nba_coords = {
        'State Farm Arena': (33.7573, -84.3963),
        'TD Garden': (42.3662, -71.0621),
        'Barclays Center': (40.6826, -73.9754),
        'Spectrum Center': (35.2251, -80.8392),
        'United Center': (41.8807, -87.6742),
        'Rocket Mortgage FieldHouse': (41.4965, -81.6882),
        'American Airlines Center': (32.7905, -96.8103),
        'Ball Arena': (39.7487, -105.0077),
        'Little Caesars Arena': (42.3411, -83.0553),
        'Chase Center': (37.7680, -122.3879),
        'Toyota Center': (29.7508, -95.3621),
        'Gainbridge Fieldhouse': (39.7640, -86.1555),
        'Intuit Dome': (33.9425, -118.3419),
        'Crypto.com Arena': (34.0430, -118.2673),
        'FedExForum': (35.1382, -90.0506),
        'Kaseya Center': (25.7814, -80.1870),
        'Fiserv Forum': (43.0451, -87.9174),
        'Target Center': (44.9795, -93.2761),
        'Smoothie King Center': (29.9490, -90.0821),
        'Madison Square Garden': (40.7505, -73.9934),
        'Paycom Center': (35.4634, -97.5151),
        'Kia Center': (28.5392, -81.3839),
        'Wells Fargo Center': (39.9012, -75.1720),
        'Footprint Center': (33.4457, -112.0712),
        'Moda Center': (45.5316, -122.6668),
        'Golden 1 Center': (38.5802, -121.4997),
        'Frost Bank Center': (29.4270, -98.4375),
        'Scotiabank Arena': (43.6435, -79.3791),
        'Delta Center': (40.7683, -111.9011),
        'Capital One Arena': (38.8982, -77.0209),
    }

    # MLB Stadiums with coordinates: name -> (lat, lon, state, capacity)
    mlb_coords = {
        'Chase Field': (33.4453, -112.0667, 'AZ', 48686),
        'Truist Park': (33.8907, -84.4678, 'GA', 41084),
        'Oriole Park at Camden Yards': (39.2838, -76.6218, 'MD', 45971),
        'Fenway Park': (42.3467, -71.0972, 'MA', 37755),
        'Wrigley Field': (41.9484, -87.6553, 'IL', 41649),
        'Guaranteed Rate Field': (41.8299, -87.6338, 'IL', 40615),
        'Great American Ball Park': (39.0979, -84.5082, 'OH', 42319),
        'Progressive Field': (41.4962, -81.6852, 'OH', 34830),
        'Coors Field': (39.7559, -104.9942, 'CO', 50144),
        'Comerica Park': (42.3390, -83.0485, 'MI', 41083),
        'Minute Maid Park': (29.7573, -95.3555, 'TX', 41168),
        'Kauffman Stadium': (39.0517, -94.4803, 'MO', 37903),
        'Angel Stadium': (33.8003, -117.8827, 'CA', 45517),
        'Dodger Stadium': (34.0739, -118.2400, 'CA', 56000),
        'LoanDepot Park': (25.7781, -80.2196, 'FL', 36742),
        'American Family Field': (43.0280, -87.9712, 'WI', 41900),
        'Target Field': (44.9817, -93.2776, 'MN', 38544),
        'Citi Field': (40.7571, -73.8458, 'NY', 41922),
        'Yankee Stadium': (40.8296, -73.9262, 'NY', 46537),
        'Sutter Health Park': (38.5802, -121.5097, 'CA', 14014),
        'Citizens Bank Park': (39.9061, -75.1665, 'PA', 42792),
        'PNC Park': (40.4469, -80.0057, 'PA', 38362),
        'Petco Park': (32.7076, -117.1570, 'CA', 40209),
        'Oracle Park': (37.7786, -122.3893, 'CA', 41265),
        'T-Mobile Park': (47.5914, -122.3325, 'WA', 47929),
        'Busch Stadium': (38.6226, -90.1928, 'MO', 45494),
        'Tropicana Field': (27.7682, -82.6534, 'FL', 25000),
        'Globe Life Field': (32.7473, -97.0845, 'TX', 40300),
        'Rogers Centre': (43.6414, -79.3894, 'ON', 49282),
        'Nationals Park': (38.8730, -77.0074, 'DC', 41339),
    }

    # NHL Arenas with coordinates: name -> (lat, lon, state/province, capacity)
    nhl_coords = {
        'Honda Center': (33.8078, -117.8765, 'CA', 17174),
        'Delta Center': (40.7683, -111.9011, 'UT', 18306),
        'TD Garden': (42.3662, -71.0621, 'MA', 17565),
        'KeyBank Center': (42.8750, -78.8764, 'NY', 19070),
        'Scotiabank Saddledome': (51.0374, -114.0519, 'AB', 19289),
        'PNC Arena': (35.8034, -78.7220, 'NC', 18680),
        'United Center': (41.8807, -87.6742, 'IL', 19717),
        'Ball Arena': (39.7487, -105.0077, 'CO', 18007),
        'Nationwide Arena': (39.9693, -83.0061, 'OH', 18500),
        'American Airlines Center': (32.7905, -96.8103, 'TX', 18532),
        'Little Caesars Arena': (42.3411, -83.0553, 'MI', 19515),
        'Rogers Place': (53.5469, -113.4978, 'AB', 18347),
        'Amerant Bank Arena': (26.1584, -80.3256, 'FL', 19250),
        'Crypto.com Arena': (34.0430, -118.2673, 'CA', 18230),
        'Xcel Energy Center': (44.9448, -93.1010, 'MN', 17954),
        'Bell Centre': (45.4961, -73.5693, 'QC', 21302),
        'Bridgestone Arena': (36.1592, -86.7785, 'TN', 17159),
        'Prudential Center': (40.7334, -74.1712, 'NJ', 16514),
        'UBS Arena': (40.7161, -73.7246, 'NY', 17255),
        'Madison Square Garden': (40.7505, -73.9934, 'NY', 18006),
        'Canadian Tire Centre': (45.2969, -75.9272, 'ON', 18652),
        'Wells Fargo Center': (39.9012, -75.1720, 'PA', 19543),
        'PPG Paints Arena': (40.4395, -79.9892, 'PA', 18387),
        'SAP Center': (37.3327, -121.9010, 'CA', 17562),
        'Climate Pledge Arena': (47.6221, -122.3540, 'WA', 17100),
        'Enterprise Center': (38.6268, -90.2025, 'MO', 18096),
        'Amalie Arena': (27.9426, -82.4519, 'FL', 19092),
        'Scotiabank Arena': (43.6435, -79.3791, 'ON', 18819),
        'Rogers Arena': (49.2778, -123.1089, 'BC', 18910),
        'T-Mobile Arena': (36.1028, -115.1784, 'NV', 17500),
        'Capital One Arena': (38.8982, -77.0209, 'DC', 18573),
        'Canada Life Centre': (49.8928, -97.1436, 'MB', 15321),
    }

    def build(league, code, team, venue_name, lat, lon, region, seats):
        # Assemble one hand-curated Stadium record.
        return Stadium(
            id=f"manual_{league.lower()}_{code.lower()}",
            name=venue_name,
            city=team['city'],
            state=region,
            latitude=lat,
            longitude=lon,
            capacity=seats,
            sport=league,
            team_abbrevs=[code],
            source='manual'
        )

    venues = []

    # NBA: the table has coordinates only, so state and capacity stay blank.
    for code, team in NBA_TEAMS.items():
        venue_name = team['arena']
        lat, lon = nba_coords.get(venue_name, (0, 0))
        venues.append(build('NBA', code, team, venue_name, lat, lon, '', 0))

    # MLB and NHL share the same four-field table layout.
    unknown_venue = (0, 0, '', 0)
    for league, teams, venue_key, table in (
        ('MLB', MLB_TEAMS, 'stadium', mlb_coords),
        ('NHL', NHL_TEAMS, 'arena', nhl_coords),
    ):
        for code, team in teams.items():
            venue_name = team[venue_key]
            lat, lon, region, seats = table.get(venue_name, unknown_venue)
            venues.append(build(league, code, team, venue_name, lat, lon, region, seats))

    return venues
|
|
|
|
|
|
# =============================================================================
|
|
# HELPERS
|
|
# =============================================================================
|
|
|
|
def assign_stable_ids(games: list[Game], sport: str, season: str) -> list[Game]:
    """
    Assign stable IDs based on matchup + occurrence number within season.

    Format: {sport}_{season}_{away}_{home}_{num}

    The ID does not contain the date, so a rescheduled game keeps its ID as
    long as its position among the same matchup's games does not change.
    Games of one matchup are numbered from 1 in date order; games on the same
    date (doubleheaders) are ordered by start time, and ties beyond that keep
    their input order.

    Args:
        games: Games to renumber; each game's ``id`` is overwritten in place.
        sport: League name, lower-cased in the ID.
        season: Season label; dashes are removed in the ID.

    Returns:
        The same ``games`` list (same objects, same order).
    """
    from collections import defaultdict

    # Strip the dash from the season label (e.g., "2024-25" -> "202425",
    # "2025" -> "2025"); identical for every game, so computed once.
    id_prefix = f"{sport.lower()}_{season.replace('-', '')}"

    # Group games by matchup (away @ home)
    matchups = defaultdict(list)
    for game in games:
        matchups[f"{game.away_team_abbrev}_{game.home_team_abbrev}"].append(game)

    # Sort each matchup chronologically and assign occurrence number
    for matchup_games in matchups.values():
        # Missing times sort first; list.sort is stable for remaining ties.
        matchup_games.sort(key=lambda g: (g.date, g.time or ''))
        for i, game in enumerate(matchup_games, 1):
            away = game.away_team_abbrev.lower()
            home = game.home_team_abbrev.lower()
            game.id = f"{id_prefix}_{away}_{home}_{i}"

    return games
|
|
|
|
|
|
def get_team_abbrev(team_name: str, sport: str) -> str:
    """
    Resolve a team's abbreviation from its name for the given sport.

    Lookup order:
      1. Case-insensitive exact match against each team's full name.
      2. Case-insensitive substring match; the first team in table order
         whose full name contains the text wins.
      3. Fallback: the first three characters of the name, upper-cased.

    Exact matches are searched across the whole table before any substring
    match is considered, so a partial hit on an earlier team cannot shadow
    an exact hit on a later one. Blank names skip matching altogether: the
    empty string is a substring of every name and would otherwise resolve
    to whichever team happens to be listed first.

    Args:
        team_name: Team name to look up; surrounding whitespace is ignored.
        sport: ``'NBA'``, ``'MLB'`` or ``'NHL'``. For any other value no
            table is consulted and only the fallback applies.

    Returns:
        The matching key from the sport's team table, or the three-character
        fallback (an empty string when the name is blank).
    """
    teams = {'NBA': NBA_TEAMS, 'MLB': MLB_TEAMS, 'NHL': NHL_TEAMS}.get(sport, {})

    cleaned = team_name.strip()
    wanted = cleaned.lower()

    if wanted:
        # Lower-case each full name once; both passes reuse the result.
        full_names = {abbrev: info['name'].lower() for abbrev, info in teams.items()}

        # Pass 1: exact match anywhere in the table takes priority.
        for abbrev, full_name in full_names.items():
            if full_name == wanted:
                return abbrev

        # Pass 2: partial match, first team in table order wins.
        for abbrev, full_name in full_names.items():
            if wanted in full_name:
                return abbrev

    # Return first 3 letters as fallback
    return cleaned[:3].upper()
|
|
|
|
|
|
def validate_games(games_by_source: dict) -> dict:
    """
    Compare the first source's games against every other source by game ID.

    The first key of ``games_by_source`` is treated as the reference. For
    each remaining source, every reference game ID that the source lacks is
    recorded under ``'missing_in_source'``. The check is one-directional:
    games that appear only in a non-reference source are not reported.

    The ``'date_mismatch'``, ``'time_mismatch'`` and ``'venue_mismatch'``
    lists are part of the returned structure but this function never adds
    anything to them.

    Args:
        games_by_source: Mapping of source name to a list of game objects,
            each exposing an ``id`` attribute.

    Returns:
        Dict holding the four discrepancy lists. All of them are empty when
        fewer than two sources are supplied.
    """
    discrepancies = {
        category: []
        for category in (
            'missing_in_source',
            'date_mismatch',
            'time_mismatch',
            'venue_mismatch',
        )
    }

    source_names = list(games_by_source)
    if len(source_names) < 2:
        # Nothing to compare against.
        return discrepancies

    reference, *others = source_names
    # Keyed by ID so duplicate IDs in the reference are reported only once.
    reference_games = {g.id: g for g in games_by_source[reference]}

    missing = discrepancies['missing_in_source']
    for other in others:
        known_ids = {g.id for g in games_by_source[other]}
        missing += [
            {'game_id': game_id, 'present_in': reference, 'missing_in': other}
            for game_id in reference_games
            if game_id not in known_ids
        ]

    return discrepancies
|
|
|
|
|
|
def export_to_json(games: list[Game], stadiums: list[Stadium], output_dir: Path):
    """
    Write games and stadiums to ``output_dir`` as JSON, plus CSV when non-empty.

    ``games.json`` and ``stadiums.json`` are always written (an empty input
    produces ``[]``). ``games.csv`` / ``stadiums.csv`` are only written for
    a non-empty input list. The directory is created if it does not exist.

    Args:
        games: Game dataclass instances to export.
        stadiums: Stadium dataclass instances to export.
        output_dir: Destination directory.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    datasets = (
        ('games.json', 'games.csv', games),
        ('stadiums.json', 'stadiums.csv', stadiums),
    )

    # JSON first, games before stadiums; keep the converted rows for the CSV pass.
    converted = []
    for json_name, csv_name, records in datasets:
        rows = [asdict(record) for record in records]
        with open(output_dir / json_name, 'w') as handle:
            json.dump(rows, handle, indent=2)
        converted.append((csv_name, records, rows))

    # CSV copies for easy viewing; skipped when there is nothing to show.
    for csv_name, records, rows in converted:
        if records:
            pd.DataFrame(rows).to_csv(output_dir / csv_name, index=False)

    print(f"\nExported to {output_dir}")
|
|
|
|
|
|
# =============================================================================
|
|
# MAIN
|
|
# =============================================================================
|
|
|
|
def main():
    """
    Command-line entry point: scrape stadiums and schedules, then export them.

    Stadium data is always collected first. With ``--stadiums-only`` the
    stadiums are exported together with an empty game list and the run ends.
    Otherwise the leagues selected by ``--sport`` are scraped in the order
    NBA, MLB, NHL, everything is exported to the ``--output`` directory, and
    a per-sport game count is printed.
    """
    def banner(title):
        # Section heading framed by 60-character rules.
        rule = "=" * 60
        print("\n" + rule)
        print(title)
        print(rule)

    parser = argparse.ArgumentParser(description='Scrape sports schedules')
    parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'all'], default='all')
    parser.add_argument('--season', type=int, default=2025, help='Season year (ending year)')
    parser.add_argument('--stadiums-only', action='store_true', help='Only scrape stadium data')
    parser.add_argument('--output', type=str, default='./data', help='Output directory')
    args = parser.parse_args()

    output_dir = Path(args.output)
    season = args.season
    selected = args.sport

    # Stadiums come from two sources, concatenated in this order.
    banner("SCRAPING STADIUMS")
    all_stadiums = [*scrape_stadiums_hifld(), *generate_stadiums_from_teams()]

    if args.stadiums_only:
        export_to_json([], all_stadiums, output_dir)
        return

    # NBA and NHL seasons span two calendar years: 2025 -> "2024-25".
    split_season = f"{season - 1}-{str(season)[2:]}"
    all_games = []

    if selected in ('nba', 'all'):
        banner(f"SCRAPING NBA {season}")
        nba_games = scrape_nba_basketball_reference(season)
        all_games.extend(assign_stable_ids(nba_games, 'NBA', split_season))

    if selected in ('mlb', 'all'):
        banner(f"SCRAPING MLB {season}")
        # MLB API uses official gamePk which is already stable - no reassignment needed
        all_games.extend(scrape_mlb_statsapi(season))

    if selected in ('nhl', 'all'):
        banner(f"SCRAPING NHL {season}")
        nhl_games = scrape_nhl_hockey_reference(season)
        all_games.extend(assign_stable_ids(nhl_games, 'NHL', split_season))

    banner("EXPORTING DATA")
    export_to_json(all_games, all_stadiums, output_dir)

    banner("SUMMARY")
    print(f"Total games scraped: {len(all_games)}")
    print(f"Total stadiums: {len(all_stadiums)}")

    # Per-sport tally, listed in order of first appearance.
    tallies = {}
    for game in all_games:
        tallies[game.sport] = tallies.get(game.sport, 0) + 1
    for sport_name, total in tallies.items():
        print(f" {sport_name}: {total} games")
|
|
|
|
|
|
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|