#!/usr/bin/env python3
"""
MLB schedule and stadium scrapers for SportsTime.

This module provides:
- MLB game scrapers (Baseball-Reference, Stats API, ESPN)
- MLB stadium scrapers (MLBScoreBot, GeoJSON, hardcoded)
- Multi-source fallback configurations
"""

from datetime import datetime
from typing import Optional

import requests

# Support both direct execution and import from parent directory
try:
    from core import (
        Game,
        Stadium,
        ScraperSource,
        StadiumScraperSource,
        fetch_page,
        scrape_with_fallback,
        scrape_stadiums_with_fallback,
    )
except ImportError:
    from Scripts.core import (
        Game,
        Stadium,
        ScraperSource,
        StadiumScraperSource,
        fetch_page,
        scrape_with_fallback,
        scrape_stadiums_with_fallback,
    )

__all__ = [
    # Team data
    'MLB_TEAMS',
    # Game scrapers
    'scrape_mlb_baseball_reference',
    'scrape_mlb_statsapi',
    'scrape_mlb_espn',
    # Stadium scrapers
    'scrape_mlb_stadiums_scorebot',
    'scrape_mlb_stadiums_geojson',
    'scrape_mlb_stadiums_hardcoded',
    'scrape_mlb_stadiums',
    # Source configurations
    'MLB_GAME_SOURCES',
    'MLB_STADIUM_SOURCES',
    # Convenience function
    'scrape_mlb_games',
]


# =============================================================================
# TEAM MAPPINGS
# =============================================================================

MLB_TEAMS = {
    'ARI': {'name': 'Arizona Diamondbacks', 'city': 'Phoenix', 'stadium': 'Chase Field'},
    'ATL': {'name': 'Atlanta Braves', 'city': 'Atlanta', 'stadium': 'Truist Park'},
    'BAL': {'name': 'Baltimore Orioles', 'city': 'Baltimore', 'stadium': 'Oriole Park at Camden Yards'},
    'BOS': {'name': 'Boston Red Sox', 'city': 'Boston', 'stadium': 'Fenway Park'},
    'CHC': {'name': 'Chicago Cubs', 'city': 'Chicago', 'stadium': 'Wrigley Field'},
    'CHW': {'name': 'Chicago White Sox', 'city': 'Chicago', 'stadium': 'Guaranteed Rate Field'},
    'CIN': {'name': 'Cincinnati Reds', 'city': 'Cincinnati', 'stadium': 'Great American Ball Park'},
    'CLE': {'name': 'Cleveland Guardians', 'city': 'Cleveland', 'stadium': 'Progressive Field'},
    'COL': {'name': 'Colorado Rockies', 'city': 'Denver', 'stadium': 'Coors Field'},
    'DET': {'name': 'Detroit Tigers', 'city': 'Detroit', 'stadium': 'Comerica Park'},
    'HOU': {'name': 'Houston Astros', 'city': 'Houston', 'stadium': 'Minute Maid Park'},
    'KCR': {'name': 'Kansas City Royals', 'city': 'Kansas City', 'stadium': 'Kauffman Stadium'},
    'LAA': {'name': 'Los Angeles Angels', 'city': 'Anaheim', 'stadium': 'Angel Stadium'},
    'LAD': {'name': 'Los Angeles Dodgers', 'city': 'Los Angeles', 'stadium': 'Dodger Stadium'},
    'MIA': {'name': 'Miami Marlins', 'city': 'Miami', 'stadium': 'LoanDepot Park'},
    'MIL': {'name': 'Milwaukee Brewers', 'city': 'Milwaukee', 'stadium': 'American Family Field'},
    'MIN': {'name': 'Minnesota Twins', 'city': 'Minneapolis', 'stadium': 'Target Field'},
    'NYM': {'name': 'New York Mets', 'city': 'New York', 'stadium': 'Citi Field'},
    'NYY': {'name': 'New York Yankees', 'city': 'New York', 'stadium': 'Yankee Stadium'},
    'OAK': {'name': 'Oakland Athletics', 'city': 'Sacramento', 'stadium': 'Sutter Health Park'},
    'PHI': {'name': 'Philadelphia Phillies', 'city': 'Philadelphia', 'stadium': 'Citizens Bank Park'},
    'PIT': {'name': 'Pittsburgh Pirates', 'city': 'Pittsburgh', 'stadium': 'PNC Park'},
    'SDP': {'name': 'San Diego Padres', 'city': 'San Diego', 'stadium': 'Petco Park'},
    'SFG': {'name': 'San Francisco Giants', 'city': 'San Francisco', 'stadium': 'Oracle Park'},
    'SEA': {'name': 'Seattle Mariners', 'city': 'Seattle', 'stadium': 'T-Mobile Park'},
    'STL': {'name': 'St. Louis Cardinals', 'city': 'St. Louis', 'stadium': 'Busch Stadium'},
    'TBR': {'name': 'Tampa Bay Rays', 'city': 'St. Petersburg', 'stadium': 'Tropicana Field'},
    'TEX': {'name': 'Texas Rangers', 'city': 'Arlington', 'stadium': 'Globe Life Field'},
    'TOR': {'name': 'Toronto Blue Jays', 'city': 'Toronto', 'stadium': 'Rogers Centre'},
    'WSN': {'name': 'Washington Nationals', 'city': 'Washington', 'stadium': 'Nationals Park'},
}


def get_mlb_team_abbrev(team_name: str) -> str:
    """Get MLB team abbreviation from full name.

    Matches case-insensitively; a substring match is accepted (the
    substring test also covers exact equality, so a single check
    suffices).  Unknown names fall back to the first three letters
    upper-cased so callers always receive a string.
    """
    needle = team_name.lower()
    for abbrev, info in MLB_TEAMS.items():
        if needle in info['name'].lower():
            return abbrev
    # Return first 3 letters as fallback
    return team_name[:3].upper()


# =============================================================================
# GAME SCRAPERS
# =============================================================================

def scrape_mlb_baseball_reference(season: int) -> list[Game]:
    """
    Scrape MLB schedule from Baseball-Reference.

    URL: https://www.baseball-reference.com/leagues/majors/{YEAR}-schedule.shtml

    Args:
        season: Season year (e.g., 2026)

    Returns:
        List of Game objects (empty on fetch failure).
    """
    games: list[Game] = []
    url = f"https://www.baseball-reference.com/leagues/majors/{season}-schedule.shtml"
    print(f"Scraping MLB {season} from Baseball-Reference...")

    soup = fetch_page(url, 'baseball-reference.com')
    if not soup:
        return games

    # Baseball-Reference groups games by date in h3 headers; track the
    # most recent date header as context for the game paragraphs below it.
    current_date: Optional[str] = None

    # Find the schedule section; fall back to the whole page if absent.
    schedule_div = soup.find('div', {'id': 'all_schedule'})
    if not schedule_div:
        schedule_div = soup

    # Process all elements to track date context
    for element in schedule_div.find_all(['h3', 'p', 'div']):
        # Check for date header
        if element.name == 'h3':
            date_text = element.get_text(strip=True)
            # Parse date like "Thursday, March 27, 2025"; several formats
            # appear in the wild, so try each and keep the first that fits.
            for fmt in ('%A, %B %d, %Y', '%B %d, %Y', '%a, %b %d, %Y'):
                try:
                    parsed = datetime.strptime(date_text, fmt)
                except ValueError:
                    # Only strptime format mismatches are expected here;
                    # anything else should surface rather than be hidden.
                    continue
                current_date = parsed.strftime('%Y-%m-%d')
                break
        # Check for game entries
        elif element.name == 'p' and 'game' in element.get('class', []):
            if not current_date:
                # Game paragraph before any date header: no context, skip.
                continue
            try:
                links = element.find_all('a')
                if len(links) >= 2:
                    away_team = links[0].text.strip()
                    home_team = links[1].text.strip()
                    # Generate unique game ID
                    away_abbrev = get_mlb_team_abbrev(away_team)
                    home_abbrev = get_mlb_team_abbrev(home_team)
                    game_id = f"mlb_br_{current_date}_{away_abbrev}_{home_abbrev}".lower()
                    game = Game(
                        id=game_id,
                        sport='MLB',
                        season=str(season),
                        date=current_date,
                        time=None,
                        home_team=home_team,
                        away_team=away_team,
                        home_team_abbrev=home_abbrev,
                        away_team_abbrev=away_abbrev,
                        venue='',
                        source='baseball-reference.com'
                    )
                    games.append(game)
            except Exception:
                # Malformed game paragraph: skip it, keep scraping the rest.
                continue

    print(f" Found {len(games)} games from Baseball-Reference")
    return games


def scrape_mlb_statsapi(season: int) -> list[Game]:
    """
    Fetch MLB schedule from official Stats API (JSON).

    URL: https://statsapi.mlb.com/api/v1/schedule?sportId=1&season={YEAR}&gameType=R

    Args:
        season: Season year (e.g., 2026)

    Returns:
        List of Game objects (empty on fetch failure).
    """
    games: list[Game] = []
    url = (
        f"https://statsapi.mlb.com/api/v1/schedule"
        f"?sportId=1&season={season}&gameType=R&hydrate=team,venue"
    )
    print(f"Fetching MLB {season} from Stats API...")

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()

        for date_entry in data.get('dates', []):
            game_date = date_entry.get('date', '')
            for game_data in date_entry.get('games', []):
                try:
                    teams = game_data.get('teams', {})
                    away = teams.get('away', {}).get('team', {})
                    home = teams.get('home', {}).get('team', {})
                    venue = game_data.get('venue', {})

                    # gameDate is ISO 8601; keep just HH:MM when present.
                    game_time = game_data.get('gameDate', '')
                    if 'T' in game_time:
                        time_str = game_time.split('T')[1][:5]
                    else:
                        time_str = None

                    game = Game(
                        id='',  # Will be assigned by assign_stable_ids
                        sport='MLB',
                        season=str(season),
                        date=game_date,
                        time=time_str,
                        home_team=home.get('name', ''),
                        away_team=away.get('name', ''),
                        home_team_abbrev=home.get('abbreviation', ''),
                        away_team_abbrev=away.get('abbreviation', ''),
                        venue=venue.get('name', ''),
                        source='statsapi.mlb.com'
                    )
                    games.append(game)
                except Exception:
                    # Skip malformed game entries, keep the rest of the day.
                    continue
    except Exception as e:
        print(f" Error fetching MLB API: {e}")

    print(f" Found {len(games)} games from MLB Stats API")
    return games


def scrape_mlb_espn(season: int) -> list[Game]:
    """Fetch MLB schedule from ESPN API.

    Args:
        season: Season year (e.g., 2026)

    Returns:
        List of Game objects (empty on fetch failure).
    """
    games: list[Game] = []
    print(f"Fetching MLB {season} from ESPN API...")

    # MLB regular season: Late March - Early October
    start = f"{season}0320"
    end = f"{season}1010"

    url = "https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard"
    params = {
        'dates': f"{start}-{end}",
        'limit': 1000
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()

        events = data.get('events', [])
        for event in events:
            try:
                # Event date is ISO 8601: YYYY-MM-DDTHH:MM...
                date_str = event.get('date', '')[:10]
                time_str = event.get('date', '')[11:16] if len(event.get('date', '')) > 11 else None

                competitions = event.get('competitions', [{}])
                if not competitions:
                    continue
                comp = competitions[0]

                competitors = comp.get('competitors', [])
                if len(competitors) < 2:
                    continue

                home_team = away_team = home_abbrev = away_abbrev = None
                for team in competitors:
                    team_data = team.get('team', {})
                    team_name = team_data.get('displayName', team_data.get('name', ''))
                    team_abbrev = team_data.get('abbreviation', '')
                    if team.get('homeAway') == 'home':
                        home_team = team_name
                        home_abbrev = team_abbrev
                    else:
                        away_team = team_name
                        away_abbrev = team_abbrev

                if not home_team or not away_team:
                    continue

                venue = comp.get('venue', {}).get('fullName', '')
                game_id = f"mlb_{date_str}_{away_abbrev}_{home_abbrev}".lower()

                game = Game(
                    id=game_id,
                    sport='MLB',
                    season=str(season),
                    date=date_str,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=home_abbrev or get_mlb_team_abbrev(home_team),
                    away_team_abbrev=away_abbrev or get_mlb_team_abbrev(away_team),
                    venue=venue,
                    source='espn.com'
                )
                games.append(game)
            except Exception:
                # Skip malformed events, keep processing the rest.
                continue

        print(f" Found {len(games)} games from ESPN")
    except Exception as e:
        print(f"Error fetching ESPN MLB: {e}")

    return games
# =============================================================================
# STADIUM SCRAPERS
# =============================================================================

def scrape_mlb_stadiums_scorebot() -> list[Stadium]:
    """
    Source 1: MLBScoreBot/ballparks GitHub (public domain).

    Returns:
        List of Stadium objects.

    Raises:
        requests.RequestException: on network/HTTP failure (the
        multi-source fallback machinery handles retries/fallbacks).
    """
    stadiums: list[Stadium] = []
    url = "https://raw.githubusercontent.com/MLBScoreBot/ballparks/main/ballparks.json"

    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    for name, info in data.items():
        stadium = Stadium(
            id=f"mlb_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info.get('city', ''),
            state=info.get('state', ''),
            # Coordinates divided by 1e6 — values appear to be stored
            # scaled (microdegrees) in this dataset; TODO confirm.
            latitude=info.get('lat', 0) / 1000000 if info.get('lat') else 0,
            longitude=info.get('long', 0) / 1000000 if info.get('long') else 0,
            capacity=info.get('capacity', 0),
            sport='MLB',
            team_abbrevs=[info.get('team', '')],
            source='github.com/MLBScoreBot'
        )
        stadiums.append(stadium)

    return stadiums


def scrape_mlb_stadiums_geojson() -> list[Stadium]:
    """
    Source 2: cageyjames/GeoJSON-Ballparks GitHub.

    Returns:
        List of Stadium objects (MLB ballparks only).

    Raises:
        requests.RequestException: on network/HTTP failure.
    """
    stadiums: list[Stadium] = []
    url = "https://raw.githubusercontent.com/cageyjames/GeoJSON-Ballparks/master/ballparks.geojson"

    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    for feature in data.get('features', []):
        props = feature.get('properties', {})
        # GeoJSON coordinates are [longitude, latitude].
        coords = feature.get('geometry', {}).get('coordinates', [0, 0])

        # Only include MLB stadiums (filter by League)
        if props.get('League', '').upper() != 'MLB':
            continue

        stadium = Stadium(
            id=f"mlb_{props.get('Ballpark', '').lower().replace(' ', '_')[:30]}",
            name=props.get('Ballpark', ''),
            city=props.get('City', ''),
            state=props.get('State', ''),
            latitude=coords[1] if len(coords) > 1 else 0,
            longitude=coords[0] if len(coords) > 0 else 0,
            capacity=0,  # Not in this dataset
            sport='MLB',
            team_abbrevs=[props.get('Team', '')],
            source='github.com/cageyjames'
        )
        stadiums.append(stadium)

    return stadiums


def scrape_mlb_stadiums_hardcoded() -> list[Stadium]:
    """
    Source 3: Hardcoded MLB ballparks (fallback).

    Always succeeds; requires no network access.

    Returns:
        List of Stadium objects for all 30 MLB ballparks.
    """
    mlb_ballparks = {
        'Chase Field': {'city': 'Phoenix', 'state': 'AZ', 'lat': 33.4453, 'lng': -112.0667, 'capacity': 48519, 'teams': ['ARI'], 'year_opened': 1998},
        'Truist Park': {'city': 'Atlanta', 'state': 'GA', 'lat': 33.8907, 'lng': -84.4677, 'capacity': 41084, 'teams': ['ATL'], 'year_opened': 2017},
        'Oriole Park at Camden Yards': {'city': 'Baltimore', 'state': 'MD', 'lat': 39.2839, 'lng': -76.6216, 'capacity': 44970, 'teams': ['BAL'], 'year_opened': 1992},
        'Fenway Park': {'city': 'Boston', 'state': 'MA', 'lat': 42.3467, 'lng': -71.0972, 'capacity': 37755, 'teams': ['BOS'], 'year_opened': 1912},
        'Wrigley Field': {'city': 'Chicago', 'state': 'IL', 'lat': 41.9484, 'lng': -87.6553, 'capacity': 41649, 'teams': ['CHC'], 'year_opened': 1914},
        'Guaranteed Rate Field': {'city': 'Chicago', 'state': 'IL', 'lat': 41.8299, 'lng': -87.6338, 'capacity': 40615, 'teams': ['CHW'], 'year_opened': 1991},
        'Great American Ball Park': {'city': 'Cincinnati', 'state': 'OH', 'lat': 39.0979, 'lng': -84.5082, 'capacity': 42319, 'teams': ['CIN'], 'year_opened': 2003},
        'Progressive Field': {'city': 'Cleveland', 'state': 'OH', 'lat': 41.4958, 'lng': -81.6853, 'capacity': 34830, 'teams': ['CLE'], 'year_opened': 1994},
        'Coors Field': {'city': 'Denver', 'state': 'CO', 'lat': 39.7559, 'lng': -104.9942, 'capacity': 50144, 'teams': ['COL'], 'year_opened': 1995},
        'Comerica Park': {'city': 'Detroit', 'state': 'MI', 'lat': 42.3390, 'lng': -83.0485, 'capacity': 41083, 'teams': ['DET'], 'year_opened': 2000},
        'Minute Maid Park': {'city': 'Houston', 'state': 'TX', 'lat': 29.7573, 'lng': -95.3555, 'capacity': 41168, 'teams': ['HOU'], 'year_opened': 2000},
        'Kauffman Stadium': {'city': 'Kansas City', 'state': 'MO', 'lat': 39.0517, 'lng': -94.4803, 'capacity': 37903, 'teams': ['KCR'], 'year_opened': 1973},
        'Angel Stadium': {'city': 'Anaheim', 'state': 'CA', 'lat': 33.8003, 'lng': -117.8827, 'capacity': 45517, 'teams': ['LAA'], 'year_opened': 1966},
        'Dodger Stadium': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0739, 'lng': -118.2400, 'capacity': 56000, 'teams': ['LAD'], 'year_opened': 1962},
        'LoanDepot Park': {'city': 'Miami', 'state': 'FL', 'lat': 25.7781, 'lng': -80.2196, 'capacity': 36742, 'teams': ['MIA'], 'year_opened': 2012},
        'American Family Field': {'city': 'Milwaukee', 'state': 'WI', 'lat': 43.0280, 'lng': -87.9712, 'capacity': 41900, 'teams': ['MIL'], 'year_opened': 2001},
        'Target Field': {'city': 'Minneapolis', 'state': 'MN', 'lat': 44.9818, 'lng': -93.2775, 'capacity': 38544, 'teams': ['MIN'], 'year_opened': 2010},
        'Citi Field': {'city': 'Queens', 'state': 'NY', 'lat': 40.7571, 'lng': -73.8458, 'capacity': 41922, 'teams': ['NYM'], 'year_opened': 2009},
        'Yankee Stadium': {'city': 'Bronx', 'state': 'NY', 'lat': 40.8296, 'lng': -73.9262, 'capacity': 46537, 'teams': ['NYY'], 'year_opened': 2009},
        'Sutter Health Park': {'city': 'Sacramento', 'state': 'CA', 'lat': 38.5803, 'lng': -121.5108, 'capacity': 14014, 'teams': ['OAK'], 'year_opened': 2000},
        'Citizens Bank Park': {'city': 'Philadelphia', 'state': 'PA', 'lat': 39.9061, 'lng': -75.1665, 'capacity': 42901, 'teams': ['PHI'], 'year_opened': 2004},
        'PNC Park': {'city': 'Pittsburgh', 'state': 'PA', 'lat': 40.4469, 'lng': -80.0057, 'capacity': 38362, 'teams': ['PIT'], 'year_opened': 2001},
        'Petco Park': {'city': 'San Diego', 'state': 'CA', 'lat': 32.7073, 'lng': -117.1566, 'capacity': 40209, 'teams': ['SDP'], 'year_opened': 2004},
        'Oracle Park': {'city': 'San Francisco', 'state': 'CA', 'lat': 37.7786, 'lng': -122.3893, 'capacity': 41915, 'teams': ['SFG'], 'year_opened': 2000},
        'T-Mobile Park': {'city': 'Seattle', 'state': 'WA', 'lat': 47.5914, 'lng': -122.3325, 'capacity': 47929, 'teams': ['SEA'], 'year_opened': 1999},
        'Busch Stadium': {'city': 'St. Louis', 'state': 'MO', 'lat': 38.6226, 'lng': -90.1928, 'capacity': 45538, 'teams': ['STL'], 'year_opened': 2006},
        'Tropicana Field': {'city': 'St. Petersburg', 'state': 'FL', 'lat': 27.7682, 'lng': -82.6534, 'capacity': 25000, 'teams': ['TBR'], 'year_opened': 1990},
        'Globe Life Field': {'city': 'Arlington', 'state': 'TX', 'lat': 32.7473, 'lng': -97.0844, 'capacity': 40300, 'teams': ['TEX'], 'year_opened': 2020},
        'Rogers Centre': {'city': 'Toronto', 'state': 'ON', 'lat': 43.6414, 'lng': -79.3894, 'capacity': 49282, 'teams': ['TOR'], 'year_opened': 1989},
        'Nationals Park': {'city': 'Washington', 'state': 'DC', 'lat': 38.8729, 'lng': -77.0074, 'capacity': 41339, 'teams': ['WSN'], 'year_opened': 2008},
    }

    stadiums: list[Stadium] = []
    for name, info in mlb_ballparks.items():
        stadium = Stadium(
            id=f"mlb_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='MLB',
            team_abbrevs=info['teams'],
            source='mlb_hardcoded',
            year_opened=info.get('year_opened')
        )
        stadiums.append(stadium)

    return stadiums


def scrape_mlb_stadiums() -> list[Stadium]:
    """
    Fetch MLB stadium data with multi-source fallback.

    Returns:
        List of Stadium objects from the first source that yields
        at least the configured minimum number of venues.
    """
    print("\nMLB STADIUMS")
    print("-" * 40)
    # Reuse the module-level source configuration instead of duplicating
    # the list here; MLB_STADIUM_SOURCES is bound before any call occurs.
    return scrape_stadiums_with_fallback('MLB', MLB_STADIUM_SOURCES)


# =============================================================================
# SOURCE CONFIGURATIONS
# =============================================================================

MLB_GAME_SOURCES = [
    ScraperSource('MLB Stats API', scrape_mlb_statsapi, priority=1, min_games=100),
    ScraperSource('Baseball-Reference', scrape_mlb_baseball_reference, priority=2, min_games=100),
    ScraperSource('ESPN', scrape_mlb_espn, priority=3, min_games=100),
]

MLB_STADIUM_SOURCES = [
    StadiumScraperSource('MLBScoreBot', scrape_mlb_stadiums_scorebot, priority=1, min_venues=25),
    StadiumScraperSource('GeoJSON-Ballparks', scrape_mlb_stadiums_geojson, priority=2, min_venues=25),
    StadiumScraperSource('Hardcoded', scrape_mlb_stadiums_hardcoded, priority=3, min_venues=25),
]


# =============================================================================
# CONVENIENCE FUNCTIONS
# =============================================================================

def scrape_mlb_games(season: int) -> list[Game]:
    """
    Scrape MLB games for a season using multi-source fallback.

    Args:
        season: Season year (e.g., 2026)

    Returns:
        List of Game objects from the first successful source
    """
    print(f"\nMLB {season} SCHEDULE")
    print("-" * 40)
    return scrape_with_fallback('MLB', season, MLB_GAME_SOURCES)