- Add ET timezone (America/New_York) to all Sports-Reference scrapers:
  - NBA: Basketball-Reference times parsed as ET
  - NFL: Pro-Football-Reference times parsed as ET
  - NHL: Hockey-Reference times parsed as ET
  - MLB: Baseball-Reference times parsed as ET
- Document source timezones in scraper docstrings
- Add 11 new stadiums to STADIUM_MAPPINGS:
  - NFL: 5 international venues (Corinthians Arena, Croke Park,
    Olympic Stadium Berlin, Santiago Bernabéu, Tom Benson Hall of Fame)
  - MLS: 4 alternate venues (Miami Freedom Park, Citi Field,
    LA Memorial Coliseum, M&T Bank Stadium)
  - NWSL: 2 alternate venues (Northwestern Medicine Field, ONE Spokane)
- Add 15 stadium aliases for MLS/NWSL team-based lookups
- Fix CanonicalSyncService to sync timezone identifier to SwiftData
- Update debug logging to use stadium timezone for display
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
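The timezone work listed above reduces to attaching America/New_York to the naive clock times that Sports-Reference pages display. A minimal sketch of that conversion using only the standard library (the helper name parse_et_time is illustrative, not part of the codebase):

from datetime import datetime
from zoneinfo import ZoneInfo

def parse_et_time(date_text: str, time_text: str) -> datetime:
    # Combine a schedule-page date ("Wednesday, April 1, 2026") and a time
    # ("7:05 PM ET") into a timezone-aware datetime in US Eastern Time.
    day = datetime.strptime(date_text, "%A, %B %d, %Y")
    clock = datetime.strptime(time_text.replace(" ET", "").strip(), "%I:%M %p")
    return day.replace(hour=clock.hour, minute=clock.minute, tzinfo=ZoneInfo("America/New_York"))

parse_et_time("Wednesday, April 1, 2026", "7:05 PM ET")
# -> 2026-04-01 19:05 EDT (UTC-04:00)

The MLB scraper below applies this pattern inside _parse_br_game; per the commit message, the NBA/NFL/NHL scrapers handle their Sports-Reference sources the same way.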
"""MLB scraper implementation with multi-source fallback."""

import re
from datetime import datetime, date, timedelta
from typing import Optional
from zoneinfo import ZoneInfo

from bs4 import BeautifulSoup

from .base import BaseScraper, RawGameData, ScrapeResult
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..normalizers.canonical_id import generate_game_id
from ..normalizers.team_resolver import (
    TeamResolver,
    TEAM_MAPPINGS,
    get_team_resolver,
)
from ..normalizers.stadium_resolver import (
    StadiumResolver,
    STADIUM_MAPPINGS,
    get_stadium_resolver,
)
from ..normalizers.timezone import parse_datetime
from ..utils.logging import get_logger, log_game, log_warning


class MLBScraper(BaseScraper):
    """MLB schedule scraper with multi-source fallback.

    Sources (in priority order):
    1. MLB Stats API - Official MLB data (primary)
    2. ESPN API - Backup option
    3. Baseball-Reference - Complete historical data

    Source Timezones:
    - mlb_api: UTC - ISO 8601 format with "Z" suffix (gameDate field)
    - espn: UTC - ISO 8601 format with "Z" suffix
    - baseball_reference: Eastern Time (ET) - times displayed as "7:05 PM ET"
    """

    def __init__(self, season: int, **kwargs):
        """Initialize MLB scraper.

        Args:
            season: Season year (e.g., 2026 for the 2026 season)
        """
        super().__init__("mlb", season, **kwargs)
        self._team_resolver = get_team_resolver("mlb")
        self._stadium_resolver = get_stadium_resolver("mlb")

    def _get_sources(self) -> list[str]:
        """Return source list in priority order."""
        # MLB API is best - returns full schedule in one request
        # ESPN caps at ~25 results for baseball
        # Baseball-Reference requires HTML parsing
        return ["mlb_api", "espn", "baseball_reference"]

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build URL for a source."""
        if source == "baseball_reference":
            # Baseball-Reference uses season year in URL
            return f"https://www.baseball-reference.com/leagues/majors/{self.season}-schedule.shtml"

        elif source == "mlb_api":
            start_date = kwargs.get("start_date", "")
            end_date = kwargs.get("end_date", "")
            return f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&startDate={start_date}&endDate={end_date}"

        elif source == "espn":
            date_str = kwargs.get("date", "")
            return f"https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard?dates={date_str}"

        raise ValueError(f"Unknown source: {source}")

    def _get_season_months(self) -> list[tuple[int, int]]:
        """Get the months to scrape for MLB season.

        MLB season runs March/April through October/November.
        """
        months = []

        # Spring training / early season through the fall
        for month in range(3, 12):  # March-November
            months.append((self.season, month))

        return months

    def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
        """Scrape games from a specific source."""
        if source == "baseball_reference":
            return self._scrape_baseball_reference()
        elif source == "mlb_api":
            return self._scrape_mlb_api()
        elif source == "espn":
            return self._scrape_espn()
        else:
            raise ValueError(f"Unknown source: {source}")

    def _scrape_baseball_reference(self) -> list[RawGameData]:
        """Scrape games from Baseball-Reference.

        BR has a single schedule page per season.
        Format: https://www.baseball-reference.com/leagues/majors/YYYY-schedule.shtml
        """
        url = self._get_source_url("baseball_reference")

        try:
            html = self.session.get_html(url)
            games = self._parse_baseball_reference(html, url)
            return games

        except Exception as e:
            self._logger.error(f"Failed to scrape Baseball-Reference: {e}")
            raise

    def _parse_baseball_reference(
        self,
        html: str,
        source_url: str,
    ) -> list[RawGameData]:
        """Parse Baseball-Reference schedule HTML.

        Structure: games are grouped by date. Each date has an <h3> header
        followed by one <p class="game"> element per game containing the
        away team, home team, scores (if final), and venue.
        """
        soup = BeautifulSoup(html, "lxml")
        games: list[RawGameData] = []

        current_date = None

        # Walk <h3> date headers and <p class="game"> rows in document order
        # so each game row is attached to the preceding date header.
        for elem in soup.find_all(["h3", "p"]):
            # H3 contains date headers
            if elem.name == "h3":
                date_text = elem.get_text(strip=True)
                try:
                    # Format: "Thursday, April 1, 2026"
                    current_date = datetime.strptime(date_text, "%A, %B %d, %Y")
                except ValueError:
                    continue

            elif elem.name == "p" and "game" in elem.get("class", []):
                if current_date is None:
                    continue

                try:
                    # Extract game time from the element if present
                    # Baseball-Reference may have time in a span or in the text
                    game_time_for_row = None
                    time_elem = elem.find("span", class_="game_time")
                    if time_elem:
                        time_text = time_elem.get_text(strip=True)
                        if time_text:
                            try:
                                # Parse time like "7:05 PM ET" or "1:10 PM"
                                # Remove timezone suffix if present
                                time_clean = time_text.replace(" ET", "").replace(" PT", "").replace(" CT", "").replace(" MT", "").strip()
                                game_time_for_row = datetime.strptime(time_clean, "%I:%M %p")
                            except ValueError:
                                pass

                    game = self._parse_br_game(elem, current_date, source_url, game_time_for_row)
                    if game:
                        games.append(game)
                except Exception as e:
                    self._logger.debug(f"Failed to parse game: {e}")
                    continue

        return games

    def _parse_br_game(
        self,
        elem,
        game_date: datetime,
        source_url: str,
        game_time: Optional[datetime] = None,
    ) -> Optional[RawGameData]:
        """Parse a single Baseball-Reference game element."""
        text = elem.get_text(" ", strip=True)

        # Parse game text - formats vary:
        # "Team A (5) @ Team B (3)" or "Team A @ Team B"
        # Also handles doubleheader notation

        # Find all links - usually team names
        links = elem.find_all("a")
        if len(links) < 2:
            return None

        # First link is away team, second is home team
        away_team = links[0].get_text(strip=True)
        home_team = links[1].get_text(strip=True)

        # Try to extract scores from text
        away_score = None
        home_score = None

        # Look for score pattern "(N)"
        score_pattern = r"\((\d+)\)"
        scores = re.findall(score_pattern, text)

        if len(scores) >= 2:
            try:
                away_score = int(scores[0])
                home_score = int(scores[1])
            except (ValueError, IndexError):
                pass

        # Determine status
        status = "final" if home_score is not None else "scheduled"

        # Check for postponed/cancelled
        text_lower = text.lower()
        if "postponed" in text_lower:
            status = "postponed"
        elif "cancelled" in text_lower or "canceled" in text_lower:
            status = "cancelled"

        # Extract venue if present (usually after @ symbol)
        stadium = None
        if len(links) > 2:
            # Third link might be stadium
            stadium = links[2].get_text(strip=True)

        # Combine date and time if time was provided, with ET timezone
        # (Baseball-Reference lists times in Eastern Time)
        final_game_date = game_date
        if game_time:
            final_game_date = game_date.replace(
                hour=game_time.hour,
                minute=game_time.minute,
                tzinfo=ZoneInfo("America/New_York"),
            )

        return RawGameData(
            game_date=final_game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=stadium,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )

    def _scrape_mlb_api(self) -> list[RawGameData]:
        """Scrape games from MLB Stats API using full season query."""
        # Build date range for entire season (March-November)
        season_months = self._get_season_months()
        start_year, start_month = season_months[0]
        end_year, end_month = season_months[-1]

        # Get last day of end month
        if end_month == 12:
            end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
        else:
            end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)

        start_date = date(start_year, start_month, 1)

        url = f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&startDate={start_date.strftime('%Y-%m-%d')}&endDate={end_date.strftime('%Y-%m-%d')}"
        self._logger.info(f"Fetching MLB schedule: {start_date} to {end_date}")

        try:
            data = self.session.get_json(url)
            return self._parse_mlb_api_response(data, url)
        except Exception as e:
            self._logger.error(f"MLB API error: {e}")
            return []

    def _parse_mlb_api_response(
        self,
        data: dict,
        source_url: str,
    ) -> list[RawGameData]:
        """Parse MLB Stats API response."""
        games: list[RawGameData] = []

        dates = data.get("dates", [])

        for date_entry in dates:
            for game in date_entry.get("games", []):
                try:
                    raw_game = self._parse_mlb_api_game(game, source_url)
                    if raw_game:
                        games.append(raw_game)
                except Exception as e:
                    self._logger.debug(f"Failed to parse MLB API game: {e}")
                    continue

        return games

    def _parse_mlb_api_game(
        self,
        game: dict,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single MLB API game."""
        # Get game date/time
        game_date_str = game.get("gameDate", "")
        if not game_date_str:
            return None

        try:
            game_date = datetime.fromisoformat(game_date_str.replace("Z", "+00:00"))
        except ValueError:
            return None

        # Get teams
        teams = game.get("teams", {})
        away_data = teams.get("away", {})
        home_data = teams.get("home", {})

        away_team_info = away_data.get("team", {})
        home_team_info = home_data.get("team", {})

        away_team = away_team_info.get("name", "")
        home_team = home_team_info.get("name", "")

        if not away_team or not home_team:
            return None

        # Get scores
        away_score = away_data.get("score")
        home_score = home_data.get("score")

        # Get venue
        venue = game.get("venue", {})
        stadium = venue.get("name")

        # Get status
        status_data = game.get("status", {})
        abstract_game_state = status_data.get("abstractGameState", "").lower()
        detailed_state = status_data.get("detailedState", "").lower()

        if abstract_game_state == "final":
            status = "final"
        elif "postponed" in detailed_state:
            status = "postponed"
        elif "cancelled" in detailed_state or "canceled" in detailed_state:
            status = "cancelled"
        else:
            status = "scheduled"

        # Only keep a game number for doubleheaders (doubleHeader == "Y")
        game_number = game.get("gameNumber", 1) if game.get("doubleHeader") == "Y" else None

        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=stadium,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
            game_number=game_number,
        )

    def _scrape_espn(self) -> list[RawGameData]:
        """Scrape games from ESPN API using date range query."""
        # Build date range for entire season (March-November)
        season_months = self._get_season_months()
        start_year, start_month = season_months[0]
        end_year, end_month = season_months[-1]

        # Get last day of end month
        if end_month == 12:
            end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
        else:
            end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)

        start_date = date(start_year, start_month, 1)
        date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"

        url = f"https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard?limit=3000&dates={date_range}"
        self._logger.info(f"Fetching MLB schedule: {date_range}")

        try:
            data = self.session.get_json(url)
            return self._parse_espn_response(data, url)
        except Exception as e:
            self._logger.error(f"ESPN error: {e}")
            return []

    def _parse_espn_response(
        self,
        data: dict,
        source_url: str,
    ) -> list[RawGameData]:
        """Parse ESPN API response."""
        games: list[RawGameData] = []

        events = data.get("events", [])

        for event in events:
            try:
                game = self._parse_espn_event(event, source_url)
                if game:
                    games.append(game)
            except Exception as e:
                self._logger.debug(f"Failed to parse ESPN event: {e}")
                continue

        return games

    def _parse_espn_event(
        self,
        event: dict,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single ESPN event."""
        # Get date
        date_str = event.get("date", "")
        if not date_str:
            return None

        try:
            game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
        except ValueError:
            return None

        # Get competitions
        competitions = event.get("competitions", [])
        if not competitions:
            return None

        competition = competitions[0]

        # Get teams
        competitors = competition.get("competitors", [])
        if len(competitors) != 2:
            return None

        home_team = None
        away_team = None
        home_score = None
        away_score = None

        for competitor in competitors:
            team_info = competitor.get("team", {})
            team_name = team_info.get("displayName", "")
            is_home = competitor.get("homeAway") == "home"
            score = competitor.get("score")

            if score:
                try:
                    score = int(score)
                except (ValueError, TypeError):
                    score = None

            if is_home:
                home_team = team_name
                home_score = score
            else:
                away_team = team_name
                away_score = score

        if not home_team or not away_team:
            return None

        # Get venue
        venue = competition.get("venue", {})
        stadium = venue.get("fullName")

        # Get status
        status_info = competition.get("status", {})
        status_type = status_info.get("type", {})
        status_name = status_type.get("name", "").lower()

        if status_name == "status_final":
            status = "final"
        elif status_name == "status_postponed":
            status = "postponed"
        elif status_name == "status_canceled":
            status = "cancelled"
        else:
            status = "scheduled"

        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=stadium,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )

    def _normalize_games(
        self,
        raw_games: list[RawGameData],
    ) -> tuple[list[Game], list[ManualReviewItem]]:
        """Normalize raw games to Game objects with canonical IDs."""
        games: list[Game] = []
        review_items: list[ManualReviewItem] = []

        # Track games by date/matchup for doubleheader detection
        games_by_matchup: dict[str, list[RawGameData]] = {}

        for raw in raw_games:
            date_key = raw.game_date.strftime("%Y%m%d")
            matchup_key = f"{date_key}_{raw.away_team_raw}_{raw.home_team_raw}"

            if matchup_key not in games_by_matchup:
                games_by_matchup[matchup_key] = []
            games_by_matchup[matchup_key].append(raw)

        # Process games with doubleheader detection
        for matchup_key, matchup_games in games_by_matchup.items():
            is_doubleheader = len(matchup_games) > 1

            # Sort by time if doubleheader
            if is_doubleheader:
                matchup_games.sort(key=lambda g: g.game_date)

            for i, raw in enumerate(matchup_games):
                # Use provided game_number or calculate from order
                game_number = raw.game_number or ((i + 1) if is_doubleheader else None)

                game, item_reviews = self._normalize_single_game(raw, game_number)

                if game:
                    games.append(game)
                    log_game(
                        self.sport,
                        game.id,
                        game.home_team_id,
                        game.away_team_id,
                        game.game_date.strftime("%Y-%m-%d"),
                        game.status,
                    )

                review_items.extend(item_reviews)

        return games, review_items

    def _normalize_single_game(
        self,
        raw: RawGameData,
        game_number: Optional[int],
    ) -> tuple[Optional[Game], list[ManualReviewItem]]:
        """Normalize a single raw game."""
        review_items: list[ManualReviewItem] = []

        # Resolve home team
        home_result = self._team_resolver.resolve(
            raw.home_team_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )

        if home_result.review_item:
            review_items.append(home_result.review_item)

        if not home_result.canonical_id:
            log_warning(f"Could not resolve home team: {raw.home_team_raw}")
            return None, review_items

        # Resolve away team
        away_result = self._team_resolver.resolve(
            raw.away_team_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )

        if away_result.review_item:
            review_items.append(away_result.review_item)

        if not away_result.canonical_id:
            log_warning(f"Could not resolve away team: {raw.away_team_raw}")
            return None, review_items

        # Resolve stadium
        stadium_id = None

        if raw.stadium_raw:
            stadium_result = self._stadium_resolver.resolve(
                raw.stadium_raw,
                check_date=raw.game_date.date(),
                source_url=raw.source_url,
            )

            if stadium_result.review_item:
                review_items.append(stadium_result.review_item)

            stadium_id = stadium_result.canonical_id

        # Get abbreviations for game ID
        home_abbrev = self._get_abbreviation(home_result.canonical_id)
        away_abbrev = self._get_abbreviation(away_result.canonical_id)

        # Generate canonical game ID
        game_id = generate_game_id(
            sport=self.sport,
            season=self.season,
            away_abbrev=away_abbrev,
            home_abbrev=home_abbrev,
            game_date=raw.game_date,
            game_number=game_number,
        )

        game = Game(
            id=game_id,
            sport=self.sport,
            season=self.season,
            home_team_id=home_result.canonical_id,
            away_team_id=away_result.canonical_id,
            stadium_id=stadium_id or "",
            game_date=raw.game_date,
            game_number=game_number,
            home_score=raw.home_score,
            away_score=raw.away_score,
            status=raw.status,
            source_url=raw.source_url,
            raw_home_team=raw.home_team_raw,
            raw_away_team=raw.away_team_raw,
            raw_stadium=raw.stadium_raw,
        )

        return game, review_items

    def _get_abbreviation(self, team_id: str) -> str:
        """Extract abbreviation from team ID."""
        # team_mlb_nyy -> nyy
        parts = team_id.split("_")
        return parts[-1] if parts else ""

    def scrape_teams(self) -> list[Team]:
        """Get all MLB teams from hardcoded mappings."""
        teams: list[Team] = []
        seen: set[str] = set()

        # MLB league/division structure
        divisions = {
            "AL East": ("American", ["BAL", "BOS", "NYY", "TB", "TOR"]),
            "AL Central": ("American", ["CHW", "CLE", "DET", "KC", "MIN"]),
            "AL West": ("American", ["HOU", "LAA", "OAK", "SEA", "TEX"]),
            "NL East": ("National", ["ATL", "MIA", "NYM", "PHI", "WSN"]),
            "NL Central": ("National", ["CHC", "CIN", "MIL", "PIT", "STL"]),
            "NL West": ("National", ["ARI", "COL", "LAD", "SD", "SF"]),
        }

        # Build reverse lookup
        team_divisions: dict[str, tuple[str, str]] = {}
        for div, (league, abbrevs) in divisions.items():
            for abbrev in abbrevs:
                team_divisions[abbrev] = (league, div)

        for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("mlb", {}).items():
            if team_id in seen:
                continue
            seen.add(team_id)

            # Parse team name from full name
            parts = full_name.split()
            if len(parts) >= 2:
                team_name = parts[-1]
                # Handle multi-word team names
                if team_name in ["Sox", "Jays"]:
                    team_name = " ".join(parts[-2:])
            else:
                team_name = full_name

            # Get league and division
            league, div = team_divisions.get(abbrev, (None, None))

            team = Team(
                id=team_id,
                sport="mlb",
                city=city,
                name=team_name,
                full_name=full_name,
                abbreviation=abbrev,
                conference=league,  # MLB uses "league" but we map to conference field
                division=div,
                stadium_id=stadium_id,
            )
            teams.append(team)

        return teams

    def scrape_stadiums(self) -> list[Stadium]:
        """Get all MLB stadiums from hardcoded mappings."""
        stadiums: list[Stadium] = []

        mlb_stadiums = STADIUM_MAPPINGS.get("mlb", {})
        for stadium_id, info in mlb_stadiums.items():
            stadium = Stadium(
                id=stadium_id,
                sport="mlb",
                name=info.name,
                city=info.city,
                state=info.state,
                country=info.country,
                latitude=info.latitude,
                longitude=info.longitude,
                timezone=info.timezone,
                surface="grass",  # Most MLB stadiums
                roof_type="open",  # Most MLB stadiums
            )
            stadiums.append(stadium)

        return stadiums


def create_mlb_scraper(season: int) -> MLBScraper:
    """Factory function to create an MLB scraper."""
    return MLBScraper(season=season)
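# Usage sketch (illustrative; assumes the package's resolver mappings and the
# base scraper's HTTP session are configured as elsewhere in the project):
#
#     scraper = create_mlb_scraper(2026)
#     teams = scraper.scrape_teams()        # Team objects built from TEAM_MAPPINGS["mlb"]
#     stadiums = scraper.scrape_stadiums()  # Stadium objects from STADIUM_MAPPINGS["mlb"]
#     raw = scraper._scrape_games_from_source("mlb_api")   # internal helper; RawGameData list
#     games, reviews = scraper._normalize_games(raw)       # canonical Game objects + review items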