Files
Sportstime/Scripts/sportstime_parser/scrapers/mlb.py
Trey t 12ddca4d10 fix(data): populate stadium timezone in scrapers and CloudKit sync
Stadium timezones were always null because scrapers weren't passing
the timezone from STADIUM_MAPPINGS to the Stadium constructor. This
fix propagates timezone data through the entire pipeline: scrapers,
CloudKit uploader, and Swift CloudKit model.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:45:30 -06:00

687 lines
22 KiB
Python

"""MLB scraper implementation with multi-source fallback."""
import re
from datetime import datetime, date, timedelta
from typing import Optional

from bs4 import BeautifulSoup

from .base import BaseScraper, RawGameData, ScrapeResult
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..normalizers.canonical_id import generate_game_id
from ..normalizers.team_resolver import (
    TeamResolver,
    TEAM_MAPPINGS,
    get_team_resolver,
)
from ..normalizers.stadium_resolver import (
    StadiumResolver,
    STADIUM_MAPPINGS,
    get_stadium_resolver,
)
from ..normalizers.timezone import parse_datetime
from ..utils.logging import get_logger, log_game, log_warning
class MLBScraper(BaseScraper):
"""MLB schedule scraper with multi-source fallback.
Sources (in priority order):
1. Baseball-Reference - Most reliable, complete historical data
2. MLB Stats API - Official MLB data
3. ESPN API - Backup option
"""
def __init__(self, season: int, **kwargs):
"""Initialize MLB scraper.
Args:
season: Season year (e.g., 2026 for 2026 season)
"""
super().__init__("mlb", season, **kwargs)
self._team_resolver = get_team_resolver("mlb")
self._stadium_resolver = get_stadium_resolver("mlb")
def _get_sources(self) -> list[str]:
"""Return source list in priority order."""
# MLB API is best - returns full schedule in one request
# ESPN caps at ~25 results for baseball
# Baseball-Reference requires HTML parsing
return ["mlb_api", "espn", "baseball_reference"]
def _get_source_url(self, source: str, **kwargs) -> str:
"""Build URL for a source."""
if source == "baseball_reference":
month = kwargs.get("month", "april")
# Baseball-Reference uses season year in URL
return f"https://www.baseball-reference.com/leagues/majors/{self.season}-schedule.shtml"
elif source == "mlb_api":
start_date = kwargs.get("start_date", "")
end_date = kwargs.get("end_date", "")
return f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&startDate={start_date}&endDate={end_date}"
elif source == "espn":
date_str = kwargs.get("date", "")
return f"https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard?dates={date_str}"
raise ValueError(f"Unknown source: {source}")
def _get_season_months(self) -> list[tuple[int, int]]:
"""Get the months to scrape for MLB season.
MLB season runs March/April through October/November.
"""
months = []
# Spring training / early season
for month in range(3, 12): # March-November
months.append((self.season, month))
return months
def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
"""Scrape games from a specific source."""
if source == "baseball_reference":
return self._scrape_baseball_reference()
elif source == "mlb_api":
return self._scrape_mlb_api()
elif source == "espn":
return self._scrape_espn()
else:
raise ValueError(f"Unknown source: {source}")
def _scrape_baseball_reference(self) -> list[RawGameData]:
"""Scrape games from Baseball-Reference.
BR has a single schedule page per season.
Format: https://www.baseball-reference.com/leagues/majors/YYYY-schedule.shtml
"""
url = self._get_source_url("baseball_reference")
try:
html = self.session.get_html(url)
games = self._parse_baseball_reference(html, url)
return games
except Exception as e:
self._logger.error(f"Failed to scrape Baseball-Reference: {e}")
raise
def _parse_baseball_reference(
self,
html: str,
source_url: str,
) -> list[RawGameData]:
"""Parse Baseball-Reference schedule HTML.
Structure: Games are organized by date in div elements.
Each game row has: date, away team, away score, home team, home score, venue.
"""
soup = BeautifulSoup(html, "lxml")
games: list[RawGameData] = []
# Find all game divs - they use class "game" or similar
# Baseball-Reference uses <p class="game"> for each game
game_paragraphs = soup.find_all("p", class_="game")
current_date = None
for elem in soup.find_all(["h3", "p"]):
# H3 contains date headers
if elem.name == "h3":
date_text = elem.get_text(strip=True)
try:
# Format: "Thursday, April 1, 2026"
current_date = datetime.strptime(date_text, "%A, %B %d, %Y")
except ValueError:
continue
elif elem.name == "p" and "game" in elem.get("class", []):
if current_date is None:
continue
try:
game = self._parse_br_game(elem, current_date, source_url)
if game:
games.append(game)
except Exception as e:
self._logger.debug(f"Failed to parse game: {e}")
continue
return games
def _parse_br_game(
self,
elem,
game_date: datetime,
source_url: str,
) -> Optional[RawGameData]:
"""Parse a single Baseball-Reference game element."""
text = elem.get_text(" ", strip=True)
# Parse game text - formats vary:
# "Team A (5) @ Team B (3)" or "Team A @ Team B"
# Also handles doubleheader notation
# Find all links - usually team names
links = elem.find_all("a")
if len(links) < 2:
return None
# First link is away team, second is home team
away_team = links[0].get_text(strip=True)
home_team = links[1].get_text(strip=True)
# Try to extract scores from text
away_score = None
home_score = None
# Look for score pattern "(N)"
import re
score_pattern = r"\((\d+)\)"
scores = re.findall(score_pattern, text)
if len(scores) >= 2:
try:
away_score = int(scores[0])
home_score = int(scores[1])
except (ValueError, IndexError):
pass
# Determine status
status = "final" if home_score is not None else "scheduled"
# Check for postponed/cancelled
text_lower = text.lower()
if "postponed" in text_lower:
status = "postponed"
elif "cancelled" in text_lower or "canceled" in text_lower:
status = "cancelled"
# Extract venue if present (usually after @ symbol)
stadium = None
if len(links) > 2:
# Third link might be stadium
stadium = links[2].get_text(strip=True)
return RawGameData(
game_date=game_date,
home_team_raw=home_team,
away_team_raw=away_team,
stadium_raw=stadium,
home_score=home_score,
away_score=away_score,
status=status,
source_url=source_url,
)
def _scrape_mlb_api(self) -> list[RawGameData]:
"""Scrape games from MLB Stats API using full season query."""
# Build date range for entire season (March-November)
season_months = self._get_season_months()
start_year, start_month = season_months[0]
end_year, end_month = season_months[-1]
# Get last day of end month
if end_month == 12:
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
else:
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
start_date = date(start_year, start_month, 1)
url = f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&startDate={start_date.strftime('%Y-%m-%d')}&endDate={end_date.strftime('%Y-%m-%d')}"
self._logger.info(f"Fetching MLB schedule: {start_date} to {end_date}")
try:
data = self.session.get_json(url)
return self._parse_mlb_api_response(data, url)
except Exception as e:
self._logger.error(f"MLB API error: {e}")
return []
def _parse_mlb_api_response(
self,
data: dict,
source_url: str,
) -> list[RawGameData]:
"""Parse MLB Stats API response."""
games: list[RawGameData] = []
dates = data.get("dates", [])
for date_entry in dates:
for game in date_entry.get("games", []):
try:
raw_game = self._parse_mlb_api_game(game, source_url)
if raw_game:
games.append(raw_game)
except Exception as e:
self._logger.debug(f"Failed to parse MLB API game: {e}")
continue
return games
def _parse_mlb_api_game(
self,
game: dict,
source_url: str,
) -> Optional[RawGameData]:
"""Parse a single MLB API game."""
# Get game date/time
game_date_str = game.get("gameDate", "")
if not game_date_str:
return None
try:
game_date = datetime.fromisoformat(game_date_str.replace("Z", "+00:00"))
except ValueError:
return None
# Get teams
teams = game.get("teams", {})
away_data = teams.get("away", {})
home_data = teams.get("home", {})
away_team_info = away_data.get("team", {})
home_team_info = home_data.get("team", {})
away_team = away_team_info.get("name", "")
home_team = home_team_info.get("name", "")
if not away_team or not home_team:
return None
# Get scores
away_score = away_data.get("score")
home_score = home_data.get("score")
# Get venue
venue = game.get("venue", {})
stadium = venue.get("name")
# Get status
status_data = game.get("status", {})
abstract_game_state = status_data.get("abstractGameState", "").lower()
detailed_state = status_data.get("detailedState", "").lower()
if abstract_game_state == "final":
status = "final"
elif "postponed" in detailed_state:
status = "postponed"
elif "cancelled" in detailed_state or "canceled" in detailed_state:
status = "cancelled"
else:
status = "scheduled"
# Check for doubleheader
game_number = game.get("gameNumber")
if game.get("doubleHeader") == "Y":
game_number = game.get("gameNumber", 1)
return RawGameData(
game_date=game_date,
home_team_raw=home_team,
away_team_raw=away_team,
stadium_raw=stadium,
home_score=home_score,
away_score=away_score,
status=status,
source_url=source_url,
game_number=game_number if game.get("doubleHeader") == "Y" else None,
)
def _scrape_espn(self) -> list[RawGameData]:
"""Scrape games from ESPN API using date range query."""
# Build date range for entire season (March-November)
season_months = self._get_season_months()
start_year, start_month = season_months[0]
end_year, end_month = season_months[-1]
# Get last day of end month
if end_month == 12:
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
else:
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
start_date = date(start_year, start_month, 1)
date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
url = f"https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard?limit=3000&dates={date_range}"
self._logger.info(f"Fetching MLB schedule: {date_range}")
try:
data = self.session.get_json(url)
return self._parse_espn_response(data, url)
except Exception as e:
self._logger.error(f"ESPN error: {e}")
return []
def _parse_espn_response(
self,
data: dict,
source_url: str,
) -> list[RawGameData]:
"""Parse ESPN API response."""
games: list[RawGameData] = []
events = data.get("events", [])
for event in events:
try:
game = self._parse_espn_event(event, source_url)
if game:
games.append(game)
except Exception as e:
self._logger.debug(f"Failed to parse ESPN event: {e}")
continue
return games
def _parse_espn_event(
self,
event: dict,
source_url: str,
) -> Optional[RawGameData]:
"""Parse a single ESPN event."""
# Get date
date_str = event.get("date", "")
if not date_str:
return None
try:
game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
except ValueError:
return None
# Get competitions
competitions = event.get("competitions", [])
if not competitions:
return None
competition = competitions[0]
# Get teams
competitors = competition.get("competitors", [])
if len(competitors) != 2:
return None
home_team = None
away_team = None
home_score = None
away_score = None
for competitor in competitors:
team_info = competitor.get("team", {})
team_name = team_info.get("displayName", "")
is_home = competitor.get("homeAway") == "home"
score = competitor.get("score")
if score:
try:
score = int(score)
except (ValueError, TypeError):
score = None
if is_home:
home_team = team_name
home_score = score
else:
away_team = team_name
away_score = score
if not home_team or not away_team:
return None
# Get venue
venue = competition.get("venue", {})
stadium = venue.get("fullName")
# Get status
status_info = competition.get("status", {})
status_type = status_info.get("type", {})
status_name = status_type.get("name", "").lower()
if status_name == "status_final":
status = "final"
elif status_name == "status_postponed":
status = "postponed"
elif status_name == "status_canceled":
status = "cancelled"
else:
status = "scheduled"
return RawGameData(
game_date=game_date,
home_team_raw=home_team,
away_team_raw=away_team,
stadium_raw=stadium,
home_score=home_score,
away_score=away_score,
status=status,
source_url=source_url,
)
def _normalize_games(
self,
raw_games: list[RawGameData],
) -> tuple[list[Game], list[ManualReviewItem]]:
"""Normalize raw games to Game objects with canonical IDs."""
games: list[Game] = []
review_items: list[ManualReviewItem] = []
# Track games by date/matchup for doubleheader detection
games_by_matchup: dict[str, list[RawGameData]] = {}
for raw in raw_games:
date_key = raw.game_date.strftime("%Y%m%d")
matchup_key = f"{date_key}_{raw.away_team_raw}_{raw.home_team_raw}"
if matchup_key not in games_by_matchup:
games_by_matchup[matchup_key] = []
games_by_matchup[matchup_key].append(raw)
# Process games with doubleheader detection
for matchup_key, matchup_games in games_by_matchup.items():
is_doubleheader = len(matchup_games) > 1
# Sort by time if doubleheader
if is_doubleheader:
matchup_games.sort(key=lambda g: g.game_date)
for i, raw in enumerate(matchup_games):
# Use provided game_number or calculate from order
game_number = raw.game_number or ((i + 1) if is_doubleheader else None)
game, item_reviews = self._normalize_single_game(raw, game_number)
if game:
games.append(game)
log_game(
self.sport,
game.id,
game.home_team_id,
game.away_team_id,
game.game_date.strftime("%Y-%m-%d"),
game.status,
)
review_items.extend(item_reviews)
return games, review_items
def _normalize_single_game(
self,
raw: RawGameData,
game_number: Optional[int],
) -> tuple[Optional[Game], list[ManualReviewItem]]:
"""Normalize a single raw game."""
review_items: list[ManualReviewItem] = []
# Resolve home team
home_result = self._team_resolver.resolve(
raw.home_team_raw,
check_date=raw.game_date.date(),
source_url=raw.source_url,
)
if home_result.review_item:
review_items.append(home_result.review_item)
if not home_result.canonical_id:
log_warning(f"Could not resolve home team: {raw.home_team_raw}")
return None, review_items
# Resolve away team
away_result = self._team_resolver.resolve(
raw.away_team_raw,
check_date=raw.game_date.date(),
source_url=raw.source_url,
)
if away_result.review_item:
review_items.append(away_result.review_item)
if not away_result.canonical_id:
log_warning(f"Could not resolve away team: {raw.away_team_raw}")
return None, review_items
# Resolve stadium
stadium_id = None
if raw.stadium_raw:
stadium_result = self._stadium_resolver.resolve(
raw.stadium_raw,
check_date=raw.game_date.date(),
source_url=raw.source_url,
)
if stadium_result.review_item:
review_items.append(stadium_result.review_item)
stadium_id = stadium_result.canonical_id
# Get abbreviations for game ID
home_abbrev = self._get_abbreviation(home_result.canonical_id)
away_abbrev = self._get_abbreviation(away_result.canonical_id)
# Generate canonical game ID
game_id = generate_game_id(
sport=self.sport,
season=self.season,
away_abbrev=away_abbrev,
home_abbrev=home_abbrev,
game_date=raw.game_date,
game_number=game_number,
)
game = Game(
id=game_id,
sport=self.sport,
season=self.season,
home_team_id=home_result.canonical_id,
away_team_id=away_result.canonical_id,
stadium_id=stadium_id or "",
game_date=raw.game_date,
game_number=game_number,
home_score=raw.home_score,
away_score=raw.away_score,
status=raw.status,
source_url=raw.source_url,
raw_home_team=raw.home_team_raw,
raw_away_team=raw.away_team_raw,
raw_stadium=raw.stadium_raw,
)
return game, review_items
def _get_abbreviation(self, team_id: str) -> str:
"""Extract abbreviation from team ID."""
# team_mlb_nyy -> nyy
parts = team_id.split("_")
return parts[-1] if parts else ""
def scrape_teams(self) -> list[Team]:
"""Get all MLB teams from hardcoded mappings."""
teams: list[Team] = []
seen: set[str] = set()
# MLB league/division structure
divisions = {
"AL East": ("American", ["BAL", "BOS", "NYY", "TB", "TOR"]),
"AL Central": ("American", ["CHW", "CLE", "DET", "KC", "MIN"]),
"AL West": ("American", ["HOU", "LAA", "OAK", "SEA", "TEX"]),
"NL East": ("National", ["ATL", "MIA", "NYM", "PHI", "WSN"]),
"NL Central": ("National", ["CHC", "CIN", "MIL", "PIT", "STL"]),
"NL West": ("National", ["ARI", "COL", "LAD", "SD", "SF"]),
}
# Build reverse lookup
team_divisions: dict[str, tuple[str, str]] = {}
for div, (league, abbrevs) in divisions.items():
for abbrev in abbrevs:
team_divisions[abbrev] = (league, div)
for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("mlb", {}).items():
if team_id in seen:
continue
seen.add(team_id)
# Parse team name from full name
parts = full_name.split()
if len(parts) >= 2:
team_name = parts[-1]
# Handle multi-word team names
if team_name in ["Sox", "Jays"]:
team_name = " ".join(parts[-2:])
else:
team_name = full_name
# Get league and division
league, div = team_divisions.get(abbrev, (None, None))
team = Team(
id=team_id,
sport="mlb",
city=city,
name=team_name,
full_name=full_name,
abbreviation=abbrev,
conference=league, # MLB uses "league" but we map to conference field
division=div,
stadium_id=stadium_id,
)
teams.append(team)
return teams
def scrape_stadiums(self) -> list[Stadium]:
"""Get all MLB stadiums from hardcoded mappings."""
stadiums: list[Stadium] = []
mlb_stadiums = STADIUM_MAPPINGS.get("mlb", {})
for stadium_id, info in mlb_stadiums.items():
stadium = Stadium(
id=stadium_id,
sport="mlb",
name=info.name,
city=info.city,
state=info.state,
country=info.country,
latitude=info.latitude,
longitude=info.longitude,
timezone=info.timezone,
surface="grass", # Most MLB stadiums
roof_type="open", # Most MLB stadiums
)
stadiums.append(stadium)
return stadiums
def create_mlb_scraper(season: int) -> MLBScraper:
    """Factory: build an MLBScraper for the given season year."""
    return MLBScraper(season)