From cdf4c775ffd08534d7ad9360bbc9c9253eee613e Mon Sep 17 00:00:00 2001 From: Trey t Date: Fri, 9 Jan 2026 23:59:04 -0600 Subject: [PATCH] feat(01-01): create mlb.py sport module - MLB_TEAMS dictionary with all 30 teams - Game scrapers: Baseball-Reference, MLB Stats API, ESPN - Stadium scrapers: MLBScoreBot, GeoJSON, hardcoded fallback - MLB_GAME_SOURCES and MLB_STADIUM_SOURCES configurations - scrape_mlb_games() convenience function Co-Authored-By: Claude Opus 4.5 --- Scripts/mlb.py | 509 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 509 insertions(+) create mode 100644 Scripts/mlb.py diff --git a/Scripts/mlb.py b/Scripts/mlb.py new file mode 100644 index 0000000..c708ea3 --- /dev/null +++ b/Scripts/mlb.py @@ -0,0 +1,509 @@ +#!/usr/bin/env python3 +""" +MLB schedule and stadium scrapers for SportsTime. + +This module provides: +- MLB game scrapers (Baseball-Reference, Stats API, ESPN) +- MLB stadium scrapers (MLBScoreBot, GeoJSON, hardcoded) +- Multi-source fallback configurations +""" + +from datetime import datetime +from typing import Optional + +import requests + +# Support both direct execution and import from parent directory +try: + from core import ( + Game, + Stadium, + ScraperSource, + StadiumScraperSource, + fetch_page, + scrape_with_fallback, + scrape_stadiums_with_fallback, + ) +except ImportError: + from Scripts.core import ( + Game, + Stadium, + ScraperSource, + StadiumScraperSource, + fetch_page, + scrape_with_fallback, + scrape_stadiums_with_fallback, + ) + + +__all__ = [ + # Team data + 'MLB_TEAMS', + # Game scrapers + 'scrape_mlb_baseball_reference', + 'scrape_mlb_statsapi', + 'scrape_mlb_espn', + # Stadium scrapers + 'scrape_mlb_stadiums_scorebot', + 'scrape_mlb_stadiums_geojson', + 'scrape_mlb_stadiums_hardcoded', + 'scrape_mlb_stadiums', + # Source configurations + 'MLB_GAME_SOURCES', + 'MLB_STADIUM_SOURCES', + # Convenience function + 'scrape_mlb_games', +] + + +# 
# =============================================================================
# TEAM MAPPINGS
# =============================================================================

# Keyed by team abbreviation; each value holds the team's full name, its home
# city and the name of its home stadium.
MLB_TEAMS = {
    'ARI': {'name': 'Arizona Diamondbacks', 'city': 'Phoenix', 'stadium': 'Chase Field'},
    'ATL': {'name': 'Atlanta Braves', 'city': 'Atlanta', 'stadium': 'Truist Park'},
    'BAL': {'name': 'Baltimore Orioles', 'city': 'Baltimore', 'stadium': 'Oriole Park at Camden Yards'},
    'BOS': {'name': 'Boston Red Sox', 'city': 'Boston', 'stadium': 'Fenway Park'},
    'CHC': {'name': 'Chicago Cubs', 'city': 'Chicago', 'stadium': 'Wrigley Field'},
    'CHW': {'name': 'Chicago White Sox', 'city': 'Chicago', 'stadium': 'Guaranteed Rate Field'},
    'CIN': {'name': 'Cincinnati Reds', 'city': 'Cincinnati', 'stadium': 'Great American Ball Park'},
    'CLE': {'name': 'Cleveland Guardians', 'city': 'Cleveland', 'stadium': 'Progressive Field'},
    'COL': {'name': 'Colorado Rockies', 'city': 'Denver', 'stadium': 'Coors Field'},
    'DET': {'name': 'Detroit Tigers', 'city': 'Detroit', 'stadium': 'Comerica Park'},
    'HOU': {'name': 'Houston Astros', 'city': 'Houston', 'stadium': 'Minute Maid Park'},
    'KCR': {'name': 'Kansas City Royals', 'city': 'Kansas City', 'stadium': 'Kauffman Stadium'},
    'LAA': {'name': 'Los Angeles Angels', 'city': 'Anaheim', 'stadium': 'Angel Stadium'},
    'LAD': {'name': 'Los Angeles Dodgers', 'city': 'Los Angeles', 'stadium': 'Dodger Stadium'},
    'MIA': {'name': 'Miami Marlins', 'city': 'Miami', 'stadium': 'LoanDepot Park'},
    'MIL': {'name': 'Milwaukee Brewers', 'city': 'Milwaukee', 'stadium': 'American Family Field'},
    'MIN': {'name': 'Minnesota Twins', 'city': 'Minneapolis', 'stadium': 'Target Field'},
    'NYM': {'name': 'New York Mets', 'city': 'New York', 'stadium': 'Citi Field'},
    'NYY': {'name': 'New York Yankees', 'city': 'New York', 'stadium': 'Yankee Stadium'},
    'OAK': {'name': 'Oakland Athletics', 'city': 'Sacramento', 'stadium': 'Sutter Health Park'},
    'PHI': {'name': 'Philadelphia Phillies', 'city': 'Philadelphia', 'stadium': 'Citizens Bank Park'},
    'PIT': {'name': 'Pittsburgh Pirates', 'city': 'Pittsburgh', 'stadium': 'PNC Park'},
    'SDP': {'name': 'San Diego Padres', 'city': 'San Diego', 'stadium': 'Petco Park'},
    'SFG': {'name': 'San Francisco Giants', 'city': 'San Francisco', 'stadium': 'Oracle Park'},
    'SEA': {'name': 'Seattle Mariners', 'city': 'Seattle', 'stadium': 'T-Mobile Park'},
    'STL': {'name': 'St. Louis Cardinals', 'city': 'St. Louis', 'stadium': 'Busch Stadium'},
    'TBR': {'name': 'Tampa Bay Rays', 'city': 'St. Petersburg', 'stadium': 'Tropicana Field'},
    'TEX': {'name': 'Texas Rangers', 'city': 'Arlington', 'stadium': 'Globe Life Field'},
    'TOR': {'name': 'Toronto Blue Jays', 'city': 'Toronto', 'stadium': 'Rogers Centre'},
    'WSN': {'name': 'Washington Nationals', 'city': 'Washington', 'stadium': 'Nationals Park'},
}


def get_mlb_team_abbrev(team_name: str) -> str:
    """
    Get the MLB team abbreviation for a full or partial team name.

    Matching is case-insensitive and ignores surrounding whitespace. An exact
    match against a name in MLB_TEAMS wins; otherwise the first team (in
    MLB_TEAMS order) whose name contains the given text is used.

    Args:
        team_name: Full team name (e.g. 'Chicago Cubs') or a fragment of one.

    Returns:
        The matching key of MLB_TEAMS; if nothing matches, the first three
        characters of the stripped name, upper-cased. An empty, blank or
        None name returns ''.
    """
    cleaned = (team_name or '').strip()
    needle = cleaned.lower()

    # The empty string is a substring of every name, so without this guard a
    # blank input would silently resolve to the first team in the table.
    if not needle:
        return ''

    for abbrev, info in MLB_TEAMS.items():
        if info['name'].lower() == needle:
            return abbrev

    for abbrev, info in MLB_TEAMS.items():
        if needle in info['name'].lower():
            return abbrev

    # Return first 3 letters as fallback
    return cleaned[:3].upper()


# =============================================================================
# GAME SCRAPERS
# =============================================================================

# strptime layouts tried, in order, against Baseball-Reference date headers
# such as "Thursday, March 27, 2025".
_BR_DATE_FORMATS = ('%A, %B %d, %Y', '%B %d, %Y', '%a, %b %d, %Y')


def _parse_br_date(date_text: str) -> Optional[str]:
    """Return date_text as 'YYYY-MM-DD', or None if no known layout matches."""
    for fmt in _BR_DATE_FORMATS:
        try:
            return datetime.strptime(date_text, fmt).strftime('%Y-%m-%d')
        except ValueError:
            # Wrong layout for this header; try the next one.
            continue
    return None


def scrape_mlb_baseball_reference(season: int) -> list[Game]:
    """
    Scrape MLB schedule from Baseball-Reference.

    URL: https://www.baseball-reference.com/leagues/majors/{YEAR}-schedule.shtml

    Args:
        season: Season year used to build the schedule URL.

    Returns:
        One Game per parsed schedule entry, or an empty list when fetch_page
        returns nothing. Each Game has time=None and venue='' because this
        scraper does not extract them.
    """
    games = []
    url = f"https://www.baseball-reference.com/leagues/majors/{season}-schedule.shtml"

    print(f"Scraping MLB {season} from Baseball-Reference...")
    soup = fetch_page(url, 'baseball-reference.com')

    if not soup:
        return games

    # Find the schedule section; fall back to the whole page if it is absent
    schedule_div = soup.find('div', {'id': 'all_schedule'})
    if not schedule_div:
        schedule_div = soup

    # Baseball-Reference groups games by date in h3 headers, so the most
    # recently parsed header supplies the date for the entries that follow.
    current_date = None
    skipped = 0

    for element in schedule_div.find_all(['h3', 'p', 'div']):
        if element.name == 'h3':
            parsed_date = _parse_br_date(element.get_text(strip=True))
            # A header that is not a date leaves the previous date in force.
            if parsed_date:
                current_date = parsed_date
            continue

        if element.name != 'p' or 'game' not in element.get('class', []):
            continue
        if not current_date:
            continue

        try:
            links = element.find_all('a')
            if len(links) < 2:
                continue

            # The first link is read as the away team, the second as home.
            away_team = links[0].text.strip()
            home_team = links[1].text.strip()

            # Generate unique game ID
            away_abbrev = get_mlb_team_abbrev(away_team)
            home_abbrev = get_mlb_team_abbrev(home_team)
            game_id = f"mlb_br_{current_date}_{away_abbrev}_{home_abbrev}".lower()

            games.append(Game(
                id=game_id,
                sport='MLB',
                season=str(season),
                date=current_date,
                time=None,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=home_abbrev,
                away_team_abbrev=away_abbrev,
                venue='',
                source='baseball-reference.com'
            ))
        except Exception:
            # Best-effort: one malformed entry must not abort the scrape,
            # but it is counted so the loss is visible in the output.
            skipped += 1
            continue

    if skipped:
        print(f"  Skipped {skipped} malformed game entries")
    print(f"  Found {len(games)} games from Baseball-Reference")
    return games


def scrape_mlb_statsapi(season: int) -> list[Game]:
    """
    Fetch MLB schedule from official Stats API (JSON).

    URL: https://statsapi.mlb.com/api/v1/schedule?sportId=1&season={YEAR}&gameType=R&hydrate=team,venue

    Args:
        season: Season year passed as the API's ``season`` query parameter.

    Returns:
        One Game per schedule entry that could be parsed. Request or decoding
        failures are printed and yield whatever was collected so far
        (normally an empty list). Game ids are left as '' here.
    """
    games = []
    url = f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&season={season}&gameType=R&hydrate=team,venue"

    print(f"Fetching MLB {season} from Stats API...")

    skipped = 0
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()

        for date_entry in data.get('dates', []):
            game_date = date_entry.get('date', '')

            for game_data in date_entry.get('games', []):
                try:
                    teams = game_data.get('teams', {})
                    away = teams.get('away', {}).get('team', {})
                    home = teams.get('home', {}).get('team', {})
                    venue = game_data.get('venue', {})

                    # Keep only HH:MM from a '<date>T<time>' timestamp.
                    # NOTE(review): if gameDate is UTC, this time is UTC while
                    # game_date comes from the date bucket — confirm.
                    game_time = game_data.get('gameDate', '')
                    time_str = game_time.split('T')[1][:5] if 'T' in game_time else None

                    games.append(Game(
                        id='',  # Will be assigned by assign_stable_ids
                        sport='MLB',
                        season=str(season),
                        date=game_date,
                        time=time_str,
                        home_team=home.get('name', ''),
                        away_team=away.get('name', ''),
                        home_team_abbrev=home.get('abbreviation', ''),
                        away_team_abbrev=away.get('abbreviation', ''),
                        venue=venue.get('name', ''),
                        source='statsapi.mlb.com'
                    ))
                except Exception:
                    # Best-effort per game; count instead of hiding the loss.
                    skipped += 1
                    continue

    except Exception as e:
        print(f"  Error fetching MLB API: {e}")

    if skipped:
        print(f"  Skipped {skipped} malformed game entries")
    print(f"  Found {len(games)} games from MLB Stats API")
    return games


def scrape_mlb_espn(season: int) -> list[Game]:
    """
    Fetch MLB schedule from ESPN API.

    Requests the scoreboard endpoint for March 20 through October 10 of the
    given season.

    Args:
        season: Season year used to build the requested date range.

    Returns:
        One Game per event that has both a home and an away competitor.
        Request or decoding failures are printed and yield whatever was
        collected so far (normally an empty list).
    """
    games = []
    print(f"Fetching MLB {season} from ESPN API...")

    # MLB regular season: Late March - Early October
    start = f"{season}0320"
    end = f"{season}1010"

    url = "https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard"
    params = {
        'dates': f"{start}-{end}",
        'limit': 1000
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()

        skipped = 0
        for event in data.get('events', []):
            try:
                # Slice a '<YYYY-MM-DD>T<HH:MM>...' timestamp into date and time.
                raw_date = event.get('date', '')
                date_str = raw_date[:10]
                time_str = raw_date[11:16] if len(raw_date) > 11 else None

                competitions = event.get('competitions') or []
                if not competitions:
                    continue

                comp = competitions[0]
                competitors = comp.get('competitors', [])
                if len(competitors) < 2:
                    continue

                home_team = away_team = home_abbrev = away_abbrev = None

                for team in competitors:
                    team_data = team.get('team', {})
                    team_name = team_data.get('displayName', team_data.get('name', ''))
                    team_abbrev = team_data.get('abbreviation', '')

                    if team.get('homeAway') == 'home':
                        home_team = team_name
                        home_abbrev = team_abbrev
                    else:
                        away_team = team_name
                        away_abbrev = team_abbrev

                if not home_team or not away_team:
                    continue

                # Resolve missing abbreviations before building the id so the
                # id always agrees with the abbrev fields stored on the Game.
                home_abbrev = home_abbrev or get_mlb_team_abbrev(home_team)
                away_abbrev = away_abbrev or get_mlb_team_abbrev(away_team)

                venue = comp.get('venue', {}).get('fullName', '')

                game_id = f"mlb_{date_str}_{away_abbrev}_{home_abbrev}".lower()

                games.append(Game(
                    id=game_id,
                    sport='MLB',
                    season=str(season),
                    date=date_str,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=home_abbrev,
                    away_team_abbrev=away_abbrev,
                    venue=venue,
                    source='espn.com'
                ))
            except Exception:
                # Best-effort per event; count instead of hiding the loss.
                skipped += 1
                continue

        if skipped:
            print(f"  Skipped {skipped} malformed events")
        print(f"  Found {len(games)} games from ESPN")

    except Exception as e:
        print(f"Error fetching ESPN MLB: {e}")

    return games


# =============================================================================
# STADIUM SCRAPERS
# =============================================================================

def _mlb_stadium_id(name: str) -> str:
    """Build a stadium id: 'mlb_' + lower-cased, underscore-joined name cut to 30 chars."""
    return f"mlb_{name.lower().replace(' ', '_')[:30]}"


def scrape_mlb_stadiums_scorebot() -> list[Stadium]:
    """
    Source 1: MLBScoreBot/ballparks GitHub (public domain).

    Returns:
        One Stadium per top-level key of the downloaded JSON object.

    Raises:
        requests.RequestException: If the download fails or returns an HTTP
            error status; nothing is caught here.
    """
    url = "https://raw.githubusercontent.com/MLBScoreBot/ballparks/main/ballparks.json"

    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    stadiums = []
    for name, info in data.items():
        # Coordinates are divided by 1,000,000; presumably the dataset stores
        # integer micro-degrees — TODO confirm. Missing or zero values become 0.
        raw_lat = info.get('lat')
        raw_long = info.get('long')

        stadiums.append(Stadium(
            id=_mlb_stadium_id(name),
            name=name,
            city=info.get('city', ''),
            state=info.get('state', ''),
            latitude=raw_lat / 1000000 if raw_lat else 0,
            longitude=raw_long / 1000000 if raw_long else 0,
            capacity=info.get('capacity', 0),
            sport='MLB',
            team_abbrevs=[info.get('team', '')],
            source='github.com/MLBScoreBot'
        ))

    return stadiums


def scrape_mlb_stadiums_geojson() -> list[Stadium]:
    """
    Source 2: cageyjames/GeoJSON-Ballparks GitHub.

    Returns:
        One Stadium per feature whose 'League' property is 'MLB'
        (case-insensitive). Capacity is always 0.

    Raises:
        requests.RequestException: If the download fails or returns an HTTP
            error status; nothing is caught here.
    """
    url = "https://raw.githubusercontent.com/cageyjames/GeoJSON-Ballparks/master/ballparks.geojson"

    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    stadiums = []
    for feature in data.get('features', []):
        props = feature.get('properties', {})
        coords = feature.get('geometry', {}).get('coordinates', [0, 0])

        # Only include MLB stadiums (filter by League)
        if props.get('League', '').upper() != 'MLB':
            continue

        ballpark = props.get('Ballpark', '')
        stadiums.append(Stadium(
            id=_mlb_stadium_id(ballpark),
            name=ballpark,
            city=props.get('City', ''),
            state=props.get('State', ''),
            # GeoJSON positions are ordered [longitude, latitude].
            latitude=coords[1] if len(coords) > 1 else 0,
            longitude=coords[0] if len(coords) > 0 else 0,
            capacity=0,  # Not in this dataset
            sport='MLB',
            team_abbrevs=[props.get('Team', '')],
            source='github.com/cageyjames'
        ))

    return stadiums


def scrape_mlb_stadiums_hardcoded() -> list[Stadium]:
    """
    Source 3: Hardcoded MLB ballparks (fallback).

    Returns:
        One Stadium per entry of the built-in table (30 ballparks); performs
        no network access.
    """
    mlb_ballparks = {
        'Chase Field': {'city': 'Phoenix', 'state': 'AZ', 'lat': 33.4453, 'lng': -112.0667, 'capacity': 48519, 'teams': ['ARI']},
        'Truist Park': {'city': 'Atlanta', 'state': 'GA', 'lat': 33.8907, 'lng': -84.4677, 'capacity': 41084, 'teams': ['ATL']},
        'Oriole Park at Camden Yards': {'city': 'Baltimore', 'state': 'MD', 'lat': 39.2839, 'lng': -76.6216, 'capacity': 44970, 'teams': ['BAL']},
        'Fenway Park': {'city': 'Boston', 'state': 'MA', 'lat': 42.3467, 'lng': -71.0972, 'capacity': 37755, 'teams': ['BOS']},
        'Wrigley Field': {'city': 'Chicago', 'state': 'IL', 'lat': 41.9484, 'lng': -87.6553, 'capacity': 41649, 'teams': ['CHC']},
        'Guaranteed Rate Field': {'city': 'Chicago', 'state': 'IL', 'lat': 41.8299, 'lng': -87.6338, 'capacity': 40615, 'teams': ['CHW']},
        'Great American Ball Park': {'city': 'Cincinnati', 'state': 'OH', 'lat': 39.0979, 'lng': -84.5082, 'capacity': 42319, 'teams': ['CIN']},
        'Progressive Field': {'city': 'Cleveland', 'state': 'OH', 'lat': 41.4958, 'lng': -81.6853, 'capacity': 34830, 'teams': ['CLE']},
        'Coors Field': {'city': 'Denver', 'state': 'CO', 'lat': 39.7559, 'lng': -104.9942, 'capacity': 50144, 'teams': ['COL']},
        'Comerica Park': {'city': 'Detroit', 'state': 'MI', 'lat': 42.3390, 'lng': -83.0485, 'capacity': 41083, 'teams': ['DET']},
        'Minute Maid Park': {'city': 'Houston', 'state': 'TX', 'lat': 29.7573, 'lng': -95.3555, 'capacity': 41168, 'teams': ['HOU']},
        'Kauffman Stadium': {'city': 'Kansas City', 'state': 'MO', 'lat': 39.0517, 'lng': -94.4803, 'capacity': 37903, 'teams': ['KCR']},
        'Angel Stadium': {'city': 'Anaheim', 'state': 'CA', 'lat': 33.8003, 'lng': -117.8827, 'capacity': 45517, 'teams': ['LAA']},
        'Dodger Stadium': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0739, 'lng': -118.2400, 'capacity': 56000, 'teams': ['LAD']},
        'LoanDepot Park': {'city': 'Miami', 'state': 'FL', 'lat': 25.7781, 'lng': -80.2196, 'capacity': 36742, 'teams': ['MIA']},
        'American Family Field': {'city': 'Milwaukee', 'state': 'WI', 'lat': 43.0280, 'lng': -87.9712, 'capacity': 41900, 'teams': ['MIL']},
        'Target Field': {'city': 'Minneapolis', 'state': 'MN', 'lat': 44.9818, 'lng': -93.2775, 'capacity': 38544, 'teams': ['MIN']},
        'Citi Field': {'city': 'Queens', 'state': 'NY', 'lat': 40.7571, 'lng': -73.8458, 'capacity': 41922, 'teams': ['NYM']},
        'Yankee Stadium': {'city': 'Bronx', 'state': 'NY', 'lat': 40.8296, 'lng': -73.9262, 'capacity': 46537, 'teams': ['NYY']},
        'Sutter Health Park': {'city': 'Sacramento', 'state': 'CA', 'lat': 38.5803, 'lng': -121.5108, 'capacity': 14014, 'teams': ['OAK']},
        'Citizens Bank Park': {'city': 'Philadelphia', 'state': 'PA', 'lat': 39.9061, 'lng': -75.1665, 'capacity': 42901, 'teams': ['PHI']},
        'PNC Park': {'city': 'Pittsburgh', 'state': 'PA', 'lat': 40.4469, 'lng': -80.0057, 'capacity': 38362, 'teams': ['PIT']},
        'Petco Park': {'city': 'San Diego', 'state': 'CA', 'lat': 32.7073, 'lng': -117.1566, 'capacity': 40209, 'teams': ['SDP']},
        'Oracle Park': {'city': 'San Francisco', 'state': 'CA', 'lat': 37.7786, 'lng': -122.3893, 'capacity': 41915, 'teams': ['SFG']},
        'T-Mobile Park': {'city': 'Seattle', 'state': 'WA', 'lat': 47.5914, 'lng': -122.3325, 'capacity': 47929, 'teams': ['SEA']},
        'Busch Stadium': {'city': 'St. Louis', 'state': 'MO', 'lat': 38.6226, 'lng': -90.1928, 'capacity': 45538, 'teams': ['STL']},
        'Tropicana Field': {'city': 'St. Petersburg', 'state': 'FL', 'lat': 27.7682, 'lng': -82.6534, 'capacity': 25000, 'teams': ['TBR']},
        'Globe Life Field': {'city': 'Arlington', 'state': 'TX', 'lat': 32.7473, 'lng': -97.0844, 'capacity': 40300, 'teams': ['TEX']},
        'Rogers Centre': {'city': 'Toronto', 'state': 'ON', 'lat': 43.6414, 'lng': -79.3894, 'capacity': 49282, 'teams': ['TOR']},
        'Nationals Park': {'city': 'Washington', 'state': 'DC', 'lat': 38.8729, 'lng': -77.0074, 'capacity': 41339, 'teams': ['WSN']},
    }

    return [
        Stadium(
            id=_mlb_stadium_id(name),
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='MLB',
            team_abbrevs=info['teams'],
            source='mlb_hardcoded'
        )
        for name, info in mlb_ballparks.items()
    ]


def scrape_mlb_stadiums() -> list[Stadium]:
    """
    Fetch MLB stadium data with multi-source fallback.

    Prints a section header, then hands the sources listed in
    MLB_STADIUM_SOURCES to scrape_stadiums_with_fallback.

    Returns:
        Whatever scrape_stadiums_with_fallback returns for 'MLB'.
    """
    print("\nMLB STADIUMS")
    print("-" * 40)

    # MLB_STADIUM_SOURCES is defined further down; it is resolved at call
    # time. A copy is passed so the helper receives a fresh list on every
    # call and the shared module-level list is never handed out directly.
    return scrape_stadiums_with_fallback('MLB', list(MLB_STADIUM_SOURCES))


# =============================================================================
# SOURCE CONFIGURATIONS
# =============================================================================

# Game sources, listed with ascending priority numbers; min_games is passed
# through to ScraperSource unchanged.
MLB_GAME_SOURCES = [
    ScraperSource('MLB Stats API', scrape_mlb_statsapi, priority=1, min_games=100),
    ScraperSource('Baseball-Reference', scrape_mlb_baseball_reference, priority=2, min_games=100),
    ScraperSource('ESPN', scrape_mlb_espn, priority=3, min_games=100),
]

# Stadium sources, listed with ascending priority numbers; the hardcoded
# table is last.
MLB_STADIUM_SOURCES = [
    StadiumScraperSource('MLBScoreBot', scrape_mlb_stadiums_scorebot, priority=1, min_venues=25),
    StadiumScraperSource('GeoJSON-Ballparks', scrape_mlb_stadiums_geojson, priority=2, min_venues=25),
    StadiumScraperSource('Hardcoded', scrape_mlb_stadiums_hardcoded, priority=3, min_venues=25),
]


# =============================================================================
# CONVENIENCE FUNCTIONS
# =============================================================================

def scrape_mlb_games(season: int) -> list[Game]:
    """
    Scrape MLB games for a season using multi-source fallback.

    Prints a section header, then delegates to scrape_with_fallback with
    MLB_GAME_SOURCES.

    Args:
        season: Season year (e.g., 2026)

    Returns:
        Whatever scrape_with_fallback returns for 'MLB' and this season.
    """
    print(f"\nMLB {season} SCHEDULE")
    print("-" * 40)

    return scrape_with_fallback('MLB', season, MLB_GAME_SOURCES)