Added year_opened field to stadium data across all 4 sport modules: - MLB: 30 ballparks (1912-2023) - NBA: 30 arenas (1968-2024) - NHL: 32 arenas (1968-2021) - NFL: 30 stadiums (1924-2020) Updated Stadium object creation in all modules to pass year_opened. Stadium dataclass already supported the field. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
511 lines
22 KiB
Python
511 lines
22 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
MLB schedule and stadium scrapers for SportsTime.
|
|
|
|
This module provides:
|
|
- MLB game scrapers (Baseball-Reference, Stats API, ESPN)
|
|
- MLB stadium scrapers (MLBScoreBot, GeoJSON, hardcoded)
|
|
- Multi-source fallback configurations
|
|
"""
|
|
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
|
|
import requests
|
|
|
|
# Support both direct execution and import from parent directory
|
|
try:
|
|
from core import (
|
|
Game,
|
|
Stadium,
|
|
ScraperSource,
|
|
StadiumScraperSource,
|
|
fetch_page,
|
|
scrape_with_fallback,
|
|
scrape_stadiums_with_fallback,
|
|
)
|
|
except ImportError:
|
|
from Scripts.core import (
|
|
Game,
|
|
Stadium,
|
|
ScraperSource,
|
|
StadiumScraperSource,
|
|
fetch_page,
|
|
scrape_with_fallback,
|
|
scrape_stadiums_with_fallback,
|
|
)
|
|
|
|
|
|
# Names exported by `from <module> import *`, grouped by role.
__all__ = [
    # Team data
    'MLB_TEAMS',

    # Game scrapers
    'scrape_mlb_baseball_reference',
    'scrape_mlb_statsapi',
    'scrape_mlb_espn',

    # Stadium scrapers
    'scrape_mlb_stadiums_scorebot',
    'scrape_mlb_stadiums_geojson',
    'scrape_mlb_stadiums_hardcoded',
    'scrape_mlb_stadiums',

    # Source configurations
    'MLB_GAME_SOURCES',
    'MLB_STADIUM_SOURCES',

    # Convenience function
    'scrape_mlb_games',
]
|
|
|
|
|
|
# =============================================================================
|
|
# TEAM MAPPINGS
|
|
# =============================================================================
|
|
|
|
# Team abbreviation -> {'name', 'city', 'stadium'}.
# Rows are (abbreviation, team name, city, home stadium); insertion order is
# preserved and matters to get_mlb_team_abbrev, which returns the first match.
MLB_TEAMS = {
    abbrev: {'name': name, 'city': city, 'stadium': stadium}
    for abbrev, name, city, stadium in (
        ('ARI', 'Arizona Diamondbacks', 'Phoenix', 'Chase Field'),
        ('ATL', 'Atlanta Braves', 'Atlanta', 'Truist Park'),
        ('BAL', 'Baltimore Orioles', 'Baltimore', 'Oriole Park at Camden Yards'),
        ('BOS', 'Boston Red Sox', 'Boston', 'Fenway Park'),
        ('CHC', 'Chicago Cubs', 'Chicago', 'Wrigley Field'),
        ('CHW', 'Chicago White Sox', 'Chicago', 'Guaranteed Rate Field'),
        ('CIN', 'Cincinnati Reds', 'Cincinnati', 'Great American Ball Park'),
        ('CLE', 'Cleveland Guardians', 'Cleveland', 'Progressive Field'),
        ('COL', 'Colorado Rockies', 'Denver', 'Coors Field'),
        ('DET', 'Detroit Tigers', 'Detroit', 'Comerica Park'),
        ('HOU', 'Houston Astros', 'Houston', 'Minute Maid Park'),
        ('KCR', 'Kansas City Royals', 'Kansas City', 'Kauffman Stadium'),
        ('LAA', 'Los Angeles Angels', 'Anaheim', 'Angel Stadium'),
        ('LAD', 'Los Angeles Dodgers', 'Los Angeles', 'Dodger Stadium'),
        ('MIA', 'Miami Marlins', 'Miami', 'LoanDepot Park'),
        ('MIL', 'Milwaukee Brewers', 'Milwaukee', 'American Family Field'),
        ('MIN', 'Minnesota Twins', 'Minneapolis', 'Target Field'),
        ('NYM', 'New York Mets', 'New York', 'Citi Field'),
        ('NYY', 'New York Yankees', 'New York', 'Yankee Stadium'),
        ('OAK', 'Oakland Athletics', 'Sacramento', 'Sutter Health Park'),
        ('PHI', 'Philadelphia Phillies', 'Philadelphia', 'Citizens Bank Park'),
        ('PIT', 'Pittsburgh Pirates', 'Pittsburgh', 'PNC Park'),
        ('SDP', 'San Diego Padres', 'San Diego', 'Petco Park'),
        ('SFG', 'San Francisco Giants', 'San Francisco', 'Oracle Park'),
        ('SEA', 'Seattle Mariners', 'Seattle', 'T-Mobile Park'),
        ('STL', 'St. Louis Cardinals', 'St. Louis', 'Busch Stadium'),
        ('TBR', 'Tampa Bay Rays', 'St. Petersburg', 'Tropicana Field'),
        ('TEX', 'Texas Rangers', 'Arlington', 'Globe Life Field'),
        ('TOR', 'Toronto Blue Jays', 'Toronto', 'Rogers Centre'),
        ('WSN', 'Washington Nationals', 'Washington', 'Nationals Park'),
    )
}
|
|
|
|
|
|
def get_mlb_team_abbrev(team_name: str) -> str:
    """Get MLB team abbreviation from full name.

    Matching is case-insensitive and ignores surrounding whitespace. An exact
    match against any team's full name wins; otherwise the first team (in
    MLB_TEAMS order) whose full name contains the input is used.

    Args:
        team_name: Full or partial team name, e.g. 'Boston Red Sox' or 'Red Sox'.

    Returns:
        The matching key of MLB_TEAMS, or the first three characters of the
        stripped input, upper-cased, when no team matches (an empty string
        for blank input).
    """
    cleaned = team_name.strip()
    needle = cleaned.lower()

    # An empty string is a substring of every name, so without this guard
    # blank input would resolve to whichever team is listed first.
    if not needle:
        return ''

    # Pass 1: exact full-name match across all teams.
    for abbrev, info in MLB_TEAMS.items():
        if info['name'].lower() == needle:
            return abbrev

    # Pass 2: partial match (e.g. nickname only); first hit wins.
    for abbrev, info in MLB_TEAMS.items():
        if needle in info['name'].lower():
            return abbrev

    # Return first 3 letters as fallback
    return cleaned[:3].upper()
|
|
|
|
|
|
# =============================================================================
|
|
# GAME SCRAPERS
|
|
# =============================================================================
|
|
|
|
def scrape_mlb_baseball_reference(season: int) -> list[Game]:
    """
    Scrape MLB schedule from Baseball-Reference.

    URL: https://www.baseball-reference.com/leagues/majors/{YEAR}-schedule.shtml

    The page is walked in document order: each <h3> is tried as a date
    header, and every <p class="game"> that follows is recorded under the
    most recently parsed date. The first two links of a game entry are read
    as the away team, then the home team.

    Args:
        season: Season year used to build the schedule URL.

    Returns:
        List of Game objects (time=None, venue=''); an empty list when
        fetch_page returns nothing.
    """
    games = []
    url = f"https://www.baseball-reference.com/leagues/majors/{season}-schedule.shtml"

    print(f"Scraping MLB {season} from Baseball-Reference...")
    soup = fetch_page(url, 'baseball-reference.com')

    if not soup:
        return games

    # Header layouts tried in order, e.g. "Thursday, March 27, 2025".
    date_formats = ('%A, %B %d, %Y', '%B %d, %Y', '%a, %b %d, %Y')

    # Baseball-Reference groups games by date in h3 headers
    current_date = None
    skipped = 0

    # Find the schedule section; fall back to the whole document.
    schedule_div = soup.find('div', {'id': 'all_schedule'})
    if not schedule_div:
        schedule_div = soup

    # Process all elements to track date context
    for element in schedule_div.find_all(['h3', 'p', 'div']):
        if element.name == 'h3':
            date_text = element.get_text(strip=True)
            for fmt in date_formats:
                try:
                    parsed = datetime.strptime(date_text, fmt)
                except ValueError:
                    # Only a format mismatch is expected here; anything else
                    # (including KeyboardInterrupt) must propagate.
                    continue
                current_date = parsed.strftime('%Y-%m-%d')
                break
            # A header that matches no format leaves current_date unchanged.

        elif element.name == 'p' and 'game' in element.get('class', []):
            # Games seen before any date header cannot be dated.
            if not current_date:
                continue

            try:
                links = element.find_all('a')
                if len(links) >= 2:
                    away_team = links[0].text.strip()
                    home_team = links[1].text.strip()

                    # Generate unique game ID
                    away_abbrev = get_mlb_team_abbrev(away_team)
                    home_abbrev = get_mlb_team_abbrev(home_team)
                    game_id = f"mlb_br_{current_date}_{away_abbrev}_{home_abbrev}".lower()

                    game = Game(
                        id=game_id,
                        sport='MLB',
                        season=str(season),
                        date=current_date,
                        time=None,
                        home_team=home_team,
                        away_team=away_team,
                        home_team_abbrev=home_abbrev,
                        away_team_abbrev=away_abbrev,
                        venue='',
                        source='baseball-reference.com'
                    )
                    games.append(game)

            except Exception:
                # Best effort: one malformed entry must not abort the scrape,
                # but it is counted so the loss is visible.
                skipped += 1
                continue

    if skipped:
        print(f" Skipped {skipped} malformed game entries from Baseball-Reference")
    print(f" Found {len(games)} games from Baseball-Reference")
    return games
|
|
|
|
|
|
def scrape_mlb_statsapi(season: int) -> list[Game]:
    """
    Fetch MLB schedule from official Stats API (JSON).

    URL: https://statsapi.mlb.com/api/v1/schedule?sportId=1&season={YEAR}&gameType=R&hydrate=team,venue

    Every game listed under each entry of the response's 'dates' array is
    turned into a Game. The date comes from the enclosing date entry; the
    time is the first five characters after the 'T' in the game's
    'gameDate' (None when there is no 'T').
    NOTE(review): the timezone of 'gameDate' is not established here - confirm.

    Args:
        season: Season year substituted into the request URL.

    Returns:
        List of Game objects with an empty id (assigned later by
        assign_stable_ids). A request/parse failure is printed and whatever
        was collected so far is returned.
    """
    collected = []
    endpoint = (
        "https://statsapi.mlb.com/api/v1/schedule"
        f"?sportId=1&season={season}&gameType=R&hydrate=team,venue"
    )

    print(f"Fetching MLB {season} from Stats API...")

    try:
        resp = requests.get(endpoint, timeout=30)
        resp.raise_for_status()
        payload = resp.json()

        for day in payload.get('dates', []):
            day_date = day.get('date', '')

            for entry in day.get('games', []):
                try:
                    sides = entry.get('teams', {})
                    away_info = sides.get('away', {}).get('team', {})
                    home_info = sides.get('home', {}).get('team', {})
                    venue_info = entry.get('venue', {})

                    stamp = entry.get('gameDate', '')
                    start_time = stamp.split('T')[1][:5] if 'T' in stamp else None

                    collected.append(
                        Game(
                            id='',  # Will be assigned by assign_stable_ids
                            sport='MLB',
                            season=str(season),
                            date=day_date,
                            time=start_time,
                            home_team=home_info.get('name', ''),
                            away_team=away_info.get('name', ''),
                            home_team_abbrev=home_info.get('abbreviation', ''),
                            away_team_abbrev=away_info.get('abbreviation', ''),
                            venue=venue_info.get('name', ''),
                            source='statsapi.mlb.com'
                        )
                    )

                except Exception:
                    # Best effort: skip a malformed game, keep the rest.
                    continue

    except Exception as err:
        print(f" Error fetching MLB API: {err}")

    print(f" Found {len(collected)} games from MLB Stats API")
    return collected
|
|
|
|
|
|
def scrape_mlb_espn(season: int) -> list[Game]:
    """Fetch MLB schedule from ESPN API.

    Requests the scoreboard endpoint for March 20 through October 10 of the
    given season and builds one Game per event from its first competition.
    The date and time are sliced from the event's 'date' string
    (characters 0-9 and 11-15).

    Team abbreviations fall back to get_mlb_team_abbrev when ESPN supplies
    none, and the game id is built from those resolved abbreviations so the
    id always agrees with the Game's abbrev fields.

    Args:
        season: Season year used to build the date range.

    Returns:
        List of Game objects; empty if the request fails (the error is
        printed, not raised).
    """
    games = []
    print(f"Fetching MLB {season} from ESPN API...")

    # MLB regular season: Late March - Early October
    start = f"{season}0320"
    end = f"{season}1010"

    url = "https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard"
    params = {
        'dates': f"{start}-{end}",
        'limit': 1000
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()

        events = data.get('events', [])

        for event in events:
            try:
                event_date = event.get('date', '')
                date_str = event_date[:10]
                time_str = event_date[11:16] if len(event_date) > 11 else None

                competitions = event.get('competitions', [{}])
                if not competitions:
                    continue

                comp = competitions[0]
                competitors = comp.get('competitors', [])

                if len(competitors) < 2:
                    continue

                home_team = away_team = home_abbrev = away_abbrev = None

                for team in competitors:
                    team_data = team.get('team', {})
                    team_name = team_data.get('displayName', team_data.get('name', ''))
                    team_abbrev = team_data.get('abbreviation', '')

                    # Anything not flagged 'home' is treated as the away side.
                    if team.get('homeAway') == 'home':
                        home_team = team_name
                        home_abbrev = team_abbrev
                    else:
                        away_team = team_name
                        away_abbrev = team_abbrev

                if not home_team or not away_team:
                    continue

                venue = comp.get('venue', {}).get('fullName', '')

                # Resolve missing abbreviations BEFORE building the id;
                # otherwise the id would embed 'none' or an empty segment
                # while the Game itself carried the fallback abbreviation.
                home_abbrev = home_abbrev or get_mlb_team_abbrev(home_team)
                away_abbrev = away_abbrev or get_mlb_team_abbrev(away_team)

                game_id = f"mlb_{date_str}_{away_abbrev}_{home_abbrev}".lower()

                game = Game(
                    id=game_id,
                    sport='MLB',
                    season=str(season),
                    date=date_str,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=home_abbrev,
                    away_team_abbrev=away_abbrev,
                    venue=venue,
                    source='espn.com'
                )
                games.append(game)

            except Exception:
                # Best effort: skip a malformed event, keep the rest.
                continue

        print(f" Found {len(games)} games from ESPN")

    except Exception as e:
        print(f"Error fetching ESPN MLB: {e}")

    return games
|
|
|
|
|
|
# =============================================================================
|
|
# STADIUM SCRAPERS
|
|
# =============================================================================
|
|
|
|
def scrape_mlb_stadiums_scorebot() -> list[Stadium]:
    """
    Source 1: MLBScoreBot/ballparks GitHub (public domain).

    Downloads ballparks.json and builds one Stadium per top-level key,
    using the key as the stadium name. HTTP and JSON errors are not caught
    here; they propagate to the caller.

    Returns:
        List of Stadium objects with source 'github.com/MLBScoreBot'.
    """
    feed_url = "https://raw.githubusercontent.com/MLBScoreBot/ballparks/main/ballparks.json"

    resp = requests.get(feed_url, timeout=30)
    resp.raise_for_status()
    ballparks = resp.json()

    def _scaled(raw):
        # Falsy coordinates (missing, None, 0) become 0; anything else is
        # divided by one million.
        # NOTE(review): presumably the feed stores micro-degrees - confirm.
        return raw / 1000000 if raw else 0

    return [
        Stadium(
            id=f"mlb_{park_name.lower().replace(' ', '_')[:30]}",
            name=park_name,
            city=details.get('city', ''),
            state=details.get('state', ''),
            latitude=_scaled(details.get('lat')),
            longitude=_scaled(details.get('long')),
            capacity=details.get('capacity', 0),
            sport='MLB',
            team_abbrevs=[details.get('team', '')],
            source='github.com/MLBScoreBot'
        )
        for park_name, details in ballparks.items()
    ]
|
|
|
|
|
|
def scrape_mlb_stadiums_geojson() -> list[Stadium]:
    """
    Source 2: cageyjames/GeoJSON-Ballparks GitHub.

    Downloads ballparks.geojson and keeps only features whose 'League'
    property equals 'MLB' (case-insensitive). Coordinates are read as
    [longitude, latitude]; capacity is always 0 because this dataset does
    not carry it. HTTP and JSON errors propagate to the caller.

    Returns:
        List of Stadium objects with source 'github.com/cageyjames'.
    """
    feed_url = "https://raw.githubusercontent.com/cageyjames/GeoJSON-Ballparks/master/ballparks.geojson"

    resp = requests.get(feed_url, timeout=30)
    resp.raise_for_status()
    collection = resp.json()

    found = []
    for feature in collection.get('features', []):
        attrs = feature.get('properties', {})
        lon_lat = feature.get('geometry', {}).get('coordinates', [0, 0])

        # Only include MLB stadiums (filter by League)
        if attrs.get('League', '').upper() != 'MLB':
            continue

        park_name = attrs.get('Ballpark', '')
        park_id = f"mlb_{park_name.lower().replace(' ', '_')[:30]}"

        # Missing positions fall back to 0.
        lat = lon_lat[1] if len(lon_lat) > 1 else 0
        lon = lon_lat[0] if len(lon_lat) > 0 else 0

        found.append(
            Stadium(
                id=park_id,
                name=park_name,
                city=attrs.get('City', ''),
                state=attrs.get('State', ''),
                latitude=lat,
                longitude=lon,
                capacity=0,  # Not in this dataset
                sport='MLB',
                team_abbrevs=[attrs.get('Team', '')],
                source='github.com/cageyjames'
            )
        )

    return found
|
|
|
|
|
|
def scrape_mlb_stadiums_hardcoded() -> list[Stadium]:
    """
    Source 3: Hardcoded MLB ballparks (fallback).

    Builds Stadium objects from an in-file table keyed by ballpark name.
    No network access is involved.

    Returns:
        One Stadium per table entry, in table order, with source
        'mlb_hardcoded' and year_opened taken from the table (None if an
        entry lacks it).
    """
    # name -> city, state, coordinates, capacity, team abbreviations, opening year
    mlb_ballparks = {
        'Chase Field': {'city': 'Phoenix', 'state': 'AZ', 'lat': 33.4453, 'lng': -112.0667, 'capacity': 48519, 'teams': ['ARI'], 'year_opened': 1998},
        'Truist Park': {'city': 'Atlanta', 'state': 'GA', 'lat': 33.8907, 'lng': -84.4677, 'capacity': 41084, 'teams': ['ATL'], 'year_opened': 2017},
        'Oriole Park at Camden Yards': {'city': 'Baltimore', 'state': 'MD', 'lat': 39.2839, 'lng': -76.6216, 'capacity': 44970, 'teams': ['BAL'], 'year_opened': 1992},
        'Fenway Park': {'city': 'Boston', 'state': 'MA', 'lat': 42.3467, 'lng': -71.0972, 'capacity': 37755, 'teams': ['BOS'], 'year_opened': 1912},
        'Wrigley Field': {'city': 'Chicago', 'state': 'IL', 'lat': 41.9484, 'lng': -87.6553, 'capacity': 41649, 'teams': ['CHC'], 'year_opened': 1914},
        'Guaranteed Rate Field': {'city': 'Chicago', 'state': 'IL', 'lat': 41.8299, 'lng': -87.6338, 'capacity': 40615, 'teams': ['CHW'], 'year_opened': 1991},
        'Great American Ball Park': {'city': 'Cincinnati', 'state': 'OH', 'lat': 39.0979, 'lng': -84.5082, 'capacity': 42319, 'teams': ['CIN'], 'year_opened': 2003},
        'Progressive Field': {'city': 'Cleveland', 'state': 'OH', 'lat': 41.4958, 'lng': -81.6853, 'capacity': 34830, 'teams': ['CLE'], 'year_opened': 1994},
        'Coors Field': {'city': 'Denver', 'state': 'CO', 'lat': 39.7559, 'lng': -104.9942, 'capacity': 50144, 'teams': ['COL'], 'year_opened': 1995},
        'Comerica Park': {'city': 'Detroit', 'state': 'MI', 'lat': 42.3390, 'lng': -83.0485, 'capacity': 41083, 'teams': ['DET'], 'year_opened': 2000},
        'Minute Maid Park': {'city': 'Houston', 'state': 'TX', 'lat': 29.7573, 'lng': -95.3555, 'capacity': 41168, 'teams': ['HOU'], 'year_opened': 2000},
        'Kauffman Stadium': {'city': 'Kansas City', 'state': 'MO', 'lat': 39.0517, 'lng': -94.4803, 'capacity': 37903, 'teams': ['KCR'], 'year_opened': 1973},
        'Angel Stadium': {'city': 'Anaheim', 'state': 'CA', 'lat': 33.8003, 'lng': -117.8827, 'capacity': 45517, 'teams': ['LAA'], 'year_opened': 1966},
        'Dodger Stadium': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0739, 'lng': -118.2400, 'capacity': 56000, 'teams': ['LAD'], 'year_opened': 1962},
        'LoanDepot Park': {'city': 'Miami', 'state': 'FL', 'lat': 25.7781, 'lng': -80.2196, 'capacity': 36742, 'teams': ['MIA'], 'year_opened': 2012},
        'American Family Field': {'city': 'Milwaukee', 'state': 'WI', 'lat': 43.0280, 'lng': -87.9712, 'capacity': 41900, 'teams': ['MIL'], 'year_opened': 2001},
        'Target Field': {'city': 'Minneapolis', 'state': 'MN', 'lat': 44.9818, 'lng': -93.2775, 'capacity': 38544, 'teams': ['MIN'], 'year_opened': 2010},
        'Citi Field': {'city': 'Queens', 'state': 'NY', 'lat': 40.7571, 'lng': -73.8458, 'capacity': 41922, 'teams': ['NYM'], 'year_opened': 2009},
        'Yankee Stadium': {'city': 'Bronx', 'state': 'NY', 'lat': 40.8296, 'lng': -73.9262, 'capacity': 46537, 'teams': ['NYY'], 'year_opened': 2009},
        'Sutter Health Park': {'city': 'Sacramento', 'state': 'CA', 'lat': 38.5803, 'lng': -121.5108, 'capacity': 14014, 'teams': ['OAK'], 'year_opened': 2000},
        'Citizens Bank Park': {'city': 'Philadelphia', 'state': 'PA', 'lat': 39.9061, 'lng': -75.1665, 'capacity': 42901, 'teams': ['PHI'], 'year_opened': 2004},
        'PNC Park': {'city': 'Pittsburgh', 'state': 'PA', 'lat': 40.4469, 'lng': -80.0057, 'capacity': 38362, 'teams': ['PIT'], 'year_opened': 2001},
        'Petco Park': {'city': 'San Diego', 'state': 'CA', 'lat': 32.7073, 'lng': -117.1566, 'capacity': 40209, 'teams': ['SDP'], 'year_opened': 2004},
        'Oracle Park': {'city': 'San Francisco', 'state': 'CA', 'lat': 37.7786, 'lng': -122.3893, 'capacity': 41915, 'teams': ['SFG'], 'year_opened': 2000},
        'T-Mobile Park': {'city': 'Seattle', 'state': 'WA', 'lat': 47.5914, 'lng': -122.3325, 'capacity': 47929, 'teams': ['SEA'], 'year_opened': 1999},
        'Busch Stadium': {'city': 'St. Louis', 'state': 'MO', 'lat': 38.6226, 'lng': -90.1928, 'capacity': 45538, 'teams': ['STL'], 'year_opened': 2006},
        'Tropicana Field': {'city': 'St. Petersburg', 'state': 'FL', 'lat': 27.7682, 'lng': -82.6534, 'capacity': 25000, 'teams': ['TBR'], 'year_opened': 1990},
        'Globe Life Field': {'city': 'Arlington', 'state': 'TX', 'lat': 32.7473, 'lng': -97.0844, 'capacity': 40300, 'teams': ['TEX'], 'year_opened': 2020},
        'Rogers Centre': {'city': 'Toronto', 'state': 'ON', 'lat': 43.6414, 'lng': -79.3894, 'capacity': 49282, 'teams': ['TOR'], 'year_opened': 1989},
        'Nationals Park': {'city': 'Washington', 'state': 'DC', 'lat': 38.8729, 'lng': -77.0074, 'capacity': 41339, 'teams': ['WSN'], 'year_opened': 2008},
    }

    return [
        Stadium(
            id=f"mlb_{park_name.lower().replace(' ', '_')[:30]}",
            name=park_name,
            city=details['city'],
            state=details['state'],
            latitude=details['lat'],
            longitude=details['lng'],
            capacity=details['capacity'],
            sport='MLB',
            team_abbrevs=details['teams'],
            source='mlb_hardcoded',
            year_opened=details.get('year_opened')
        )
        for park_name, details in mlb_ballparks.items()
    ]
|
|
|
|
|
|
def scrape_mlb_stadiums() -> list[Stadium]:
    """
    Fetch MLB stadium data with multi-source fallback.

    Prints a section banner, then delegates to scrape_stadiums_with_fallback
    with the module-level MLB_STADIUM_SOURCES list. This mirrors how
    scrape_mlb_games uses MLB_GAME_SOURCES, so the exported configuration is
    the single place where stadium sources are defined.

    Returns:
        Whatever scrape_stadiums_with_fallback returns for the 'MLB' sources.
    """
    print("\nMLB STADIUMS")
    print("-" * 40)

    # MLB_STADIUM_SOURCES is defined further down the module; it is looked up
    # at call time, so the later definition is fine.
    return scrape_stadiums_with_fallback('MLB', MLB_STADIUM_SOURCES)
|
|
|
|
|
|
# =============================================================================
|
|
# SOURCE CONFIGURATIONS
|
|
# =============================================================================
|
|
|
|
# Game schedule sources, listed by ascending priority number.
MLB_GAME_SOURCES = [
    ScraperSource(
        'MLB Stats API',
        scrape_mlb_statsapi,
        priority=1,
        min_games=100,
    ),
    ScraperSource(
        'Baseball-Reference',
        scrape_mlb_baseball_reference,
        priority=2,
        min_games=100,
    ),
    ScraperSource(
        'ESPN',
        scrape_mlb_espn,
        priority=3,
        min_games=100,
    ),
]
|
|
|
|
# Stadium sources, listed by ascending priority number; the hardcoded table is last.
MLB_STADIUM_SOURCES = [
    StadiumScraperSource(
        'MLBScoreBot',
        scrape_mlb_stadiums_scorebot,
        priority=1,
        min_venues=25,
    ),
    StadiumScraperSource(
        'GeoJSON-Ballparks',
        scrape_mlb_stadiums_geojson,
        priority=2,
        min_venues=25,
    ),
    StadiumScraperSource(
        'Hardcoded',
        scrape_mlb_stadiums_hardcoded,
        priority=3,
        min_venues=25,
    ),
]
|
|
|
|
|
|
# =============================================================================
|
|
# CONVENIENCE FUNCTIONS
|
|
# =============================================================================
|
|
|
|
def scrape_mlb_games(season: int) -> list[Game]:
    """
    Scrape MLB games for a season using multi-source fallback.

    Prints a section banner and hands the configured MLB_GAME_SOURCES to
    scrape_with_fallback.

    Args:
        season: Season year (e.g., 2026)

    Returns:
        The list of Game objects produced by scrape_with_fallback.
    """
    banner = f"\nMLB {season} SCHEDULE"
    rule = "-" * 40
    print(banner)
    print(rule)

    return scrape_with_fallback('MLB', season, MLB_GAME_SOURCES)
|