feat(scripts): add sportstime-parser data pipeline

Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-20 18:56:25 -06:00
parent ac78042a7e
commit 52d445bca4
76 changed files with 25065 additions and 0 deletions

View File

@@ -0,0 +1,46 @@
"""Scrapers for fetching sports data from various sources."""
from .base import (
BaseScraper,
RawGameData,
ScrapeResult,
ScraperError,
PartialDataError,
)
from .nba import NBAScraper, create_nba_scraper
from .mlb import MLBScraper, create_mlb_scraper
from .nfl import NFLScraper, create_nfl_scraper
from .nhl import NHLScraper, create_nhl_scraper
from .mls import MLSScraper, create_mls_scraper
from .wnba import WNBAScraper, create_wnba_scraper
from .nwsl import NWSLScraper, create_nwsl_scraper
__all__ = [
# Base
"BaseScraper",
"RawGameData",
"ScrapeResult",
"ScraperError",
"PartialDataError",
# NBA
"NBAScraper",
"create_nba_scraper",
# MLB
"MLBScraper",
"create_mlb_scraper",
# NFL
"NFLScraper",
"create_nfl_scraper",
# NHL
"NHLScraper",
"create_nhl_scraper",
# MLS
"MLSScraper",
"create_mls_scraper",
# WNBA
"WNBAScraper",
"create_wnba_scraper",
# NWSL
"NWSLScraper",
"create_nwsl_scraper",
]

View File

@@ -0,0 +1,335 @@
"""Base scraper class for all sport scrapers."""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import date, datetime
from typing import Optional
from ..config import EXPECTED_GAME_COUNTS
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..utils.http import RateLimitedSession, get_session
from ..utils.logging import get_logger, log_error, log_warning
from ..utils.progress import ScrapeProgress
@dataclass
class RawGameData:
    """Raw game data before normalization.

    This intermediate format holds data as scraped from sources,
    before team/stadium resolution and canonical ID generation.
    """

    game_date: datetime  # game start as parsed from the source
    home_team_raw: str  # home team name exactly as the source printed it
    away_team_raw: str  # away team name exactly as the source printed it
    stadium_raw: Optional[str] = None  # venue name, when the source provides one
    home_score: Optional[int] = None  # None when the source reports no score
    away_score: Optional[int] = None  # None when the source reports no score
    # Scrapers set one of: "scheduled", "final", "postponed", "cancelled".
    status: str = "scheduled"
    source_url: Optional[str] = None  # URL the row came from (for manual review)
    game_number: Optional[int] = None  # For doubleheaders
@dataclass
class ScrapeResult:
    """Result of a scraping operation.

    Attributes:
        games: List of normalized Game objects
        teams: List of Team objects
        stadiums: List of Stadium objects
        review_items: Items requiring manual review
        source: Name of the source used
        success: Whether scraping succeeded
        error_message: Error message if failed
    """

    games: list[Game] = field(default_factory=list)
    teams: list[Team] = field(default_factory=list)
    stadiums: list[Stadium] = field(default_factory=list)
    review_items: list[ManualReviewItem] = field(default_factory=list)
    source: str = ""
    success: bool = True
    error_message: Optional[str] = None

    @property
    def game_count(self) -> int:
        """Number of normalized games."""
        return len(self.games)

    @property
    def team_count(self) -> int:
        """Number of teams."""
        return len(self.teams)

    @property
    def stadium_count(self) -> int:
        """Number of stadiums."""
        return len(self.stadiums)

    @property
    def review_count(self) -> int:
        """Number of items flagged for manual review."""
        return len(self.review_items)
class BaseScraper(ABC):
    """Abstract base class for sport scrapers.

    Subclasses must implement:
    - _get_sources(): Return list of source names in priority order
    - _scrape_games_from_source(): Fetch raw games for one source
    - _normalize_games(): Turn raw games into Game objects
    - scrape_teams(): Fetch team information
    - scrape_stadiums(): Fetch stadium information

    (scrape_games() and scrape_all() are concrete and drive the hooks above.)

    Features:
    - Multi-source fallback (try sources in order)
    - Built-in rate limiting
    - Error handling with partial data discard
    - Progress tracking
    - Source URL tracking for manual review
    """

    def __init__(
        self,
        sport: str,
        season: int,
        session: Optional[RateLimitedSession] = None,
    ):
        """Initialize the scraper.

        Args:
            sport: Sport code (e.g., 'nba', 'mlb')
            season: Season start year (e.g., 2025 for 2025-26)
            session: Optional HTTP session (default: global session)
        """
        # Lowercased so EXPECTED_GAME_COUNTS lookups and IDs are consistent.
        self.sport = sport.lower()
        self.season = season
        # Fall back to the shared, rate-limited global session.
        self.session = session or get_session()
        self._logger = get_logger()
        # Created lazily in scrape_all(); None until then.
        self._progress: Optional[ScrapeProgress] = None

    @property
    def expected_game_count(self) -> int:
        """Get expected number of games for this sport (0 if unknown)."""
        return EXPECTED_GAME_COUNTS.get(self.sport, 0)

    @abstractmethod
    def _get_sources(self) -> list[str]:
        """Return list of source names in priority order.

        Returns:
            List of source identifiers (e.g., ['basketball_reference', 'espn', 'cbs'])
        """
        pass

    @abstractmethod
    def _scrape_games_from_source(
        self,
        source: str,
    ) -> list[RawGameData]:
        """Scrape games from a specific source.

        Args:
            source: Source identifier

        Returns:
            List of raw game data

        Raises:
            Exception: If scraping fails
        """
        pass

    @abstractmethod
    def _normalize_games(
        self,
        raw_games: list[RawGameData],
    ) -> tuple[list[Game], list[ManualReviewItem]]:
        """Normalize raw game data to Game objects.

        Args:
            raw_games: Raw scraped data

        Returns:
            Tuple of (normalized games, review items)
        """
        pass

    @abstractmethod
    def scrape_teams(self) -> list[Team]:
        """Fetch team information.

        Returns:
            List of Team objects
        """
        pass

    @abstractmethod
    def scrape_stadiums(self) -> list[Stadium]:
        """Fetch stadium information.

        Returns:
            List of Stadium objects
        """
        pass

    def scrape_games(self) -> ScrapeResult:
        """Scrape games with multi-source fallback.

        Tries each source in priority order. On failure, discards
        partial data and tries the next source.

        Returns:
            ScrapeResult with games, review items, and status
        """
        sources = self._get_sources()
        last_error: Optional[str] = None
        sources_tried = 0
        # Allow 3 sources to be tried. This enables NHL to fall back to NHL API
        # for venue data since Hockey Reference doesn't provide it.
        max_sources_to_try = 3
        for source in sources:
            self._logger.info(f"Trying source: {source}")
            sources_tried += 1
            try:
                # Scrape raw data
                raw_games = self._scrape_games_from_source(source)
                if not raw_games:
                    log_warning(f"No games found from {source}")
                    # If multiple sources return nothing, the schedule likely doesn't exist
                    if sources_tried >= max_sources_to_try:
                        return ScrapeResult(
                            success=False,
                            error_message=f"No schedule data available (tried {sources_tried} sources)",
                        )
                    continue
                self._logger.info(f"Found {len(raw_games)} raw games from {source}")
                # Normalize data
                games, review_items = self._normalize_games(raw_games)
                self._logger.info(
                    f"Normalized {len(games)} games, {len(review_items)} need review"
                )
                # First source that yields games wins; remaining sources are skipped.
                return ScrapeResult(
                    games=games,
                    review_items=review_items,
                    source=source,
                    success=True,
                )
            except Exception as e:
                # Partial data from this source is discarded entirely.
                last_error = str(e)
                log_error(f"Failed to scrape from {source}: {e}", exc_info=True)
                # If we've tried enough sources, bail out
                if sources_tried >= max_sources_to_try:
                    break
                continue
        # All sources failed (or fewer than max_sources_to_try all came up empty).
        return ScrapeResult(
            success=False,
            error_message=f"All sources failed. Last error: {last_error}",
        )

    def scrape_all(self) -> ScrapeResult:
        """Scrape games, teams, and stadiums.

        Returns:
            Complete ScrapeResult with all data
        """
        self._progress = ScrapeProgress(self.sport, self.season)
        self._progress.start()
        try:
            # Scrape games
            result = self.scrape_games()
            if not result.success:
                self._progress.log_error(result.error_message or "Unknown error")
                self._progress.finish()
                return result
            # Scrape teams
            teams = self.scrape_teams()
            result.teams = teams
            # Scrape stadiums
            stadiums = self.scrape_stadiums()
            result.stadiums = stadiums
            # Update progress
            self._progress.games_count = result.game_count
            self._progress.teams_count = result.team_count
            self._progress.stadiums_count = result.stadium_count
            self._progress.errors_count = result.review_count
            self._progress.finish()
            return result
        except Exception as e:
            log_error(f"Scraping failed: {e}", exc_info=True)
            # Always close out the progress display, even on failure.
            self._progress.finish()
            return ScrapeResult(
                success=False,
                error_message=str(e),
            )

    def _get_season_months(self) -> list[tuple[int, int]]:
        """Get the months to scrape for this sport's season.

        Returns:
            List of (year, month) tuples
        """
        # Default implementation for sports with fall-spring seasons
        # (NBA, NHL, etc.). Calendar-year sports override this.
        months = []
        # Fall months of season start year
        for month in range(10, 13):  # Oct-Dec
            months.append((self.season, month))
        # Winter-spring months of following year
        for month in range(1, 7):  # Jan-Jun
            months.append((self.season + 1, month))
        return months

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build a source URL with parameters.

        Subclasses should override this to build URLs for their sources.

        Args:
            source: Source identifier
            **kwargs: URL parameters

        Returns:
            Complete URL string

        Raises:
            NotImplementedError: If the subclass has not provided a builder.
        """
        raise NotImplementedError(f"URL builder not implemented for {source}")
class ScraperError(Exception):
    """Raised when a scraping operation fails for a given source."""

    def __init__(self, source: str, message: str):
        """Store the failing source and message; str(self) is "[source] message"."""
        self.source = source
        self.message = message
        formatted = "[{}] {}".format(source, message)
        super().__init__(formatted)
class PartialDataError(ScraperError):
    """Raised when a source returned some, but not all, expected data."""

    def __init__(self, source: str, message: str, partial_count: int):
        """Record how many items were retrieved before delegating to ScraperError."""
        self.partial_count = partial_count
        detail = "{} (got {} items)".format(message, partial_count)
        super().__init__(source, detail)

View File

@@ -0,0 +1,685 @@
"""MLB scraper implementation with multi-source fallback."""
from datetime import datetime, date, timedelta
from typing import Optional
from bs4 import BeautifulSoup
from .base import BaseScraper, RawGameData, ScrapeResult
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..normalizers.canonical_id import generate_game_id
from ..normalizers.team_resolver import (
TeamResolver,
TEAM_MAPPINGS,
get_team_resolver,
)
from ..normalizers.stadium_resolver import (
StadiumResolver,
STADIUM_MAPPINGS,
get_stadium_resolver,
)
from ..normalizers.timezone import parse_datetime
from ..utils.logging import get_logger, log_game, log_warning
class MLBScraper(BaseScraper):
    """MLB schedule scraper with multi-source fallback.

    Sources (in priority order, matching _get_sources):
    1. MLB Stats API - Official MLB data, full season in one request
    2. ESPN API - Backup option
    3. Baseball-Reference - HTML-parsing fallback
    """

    def __init__(self, season: int, **kwargs):
        """Initialize MLB scraper.

        Args:
            season: Season year (e.g., 2026 for 2026 season)
            **kwargs: Forwarded to BaseScraper (e.g., a custom session)
        """
        super().__init__("mlb", season, **kwargs)
        # Resolvers translate raw source names into canonical team/stadium IDs.
        self._team_resolver = get_team_resolver("mlb")
        self._stadium_resolver = get_stadium_resolver("mlb")

    def _get_sources(self) -> list[str]:
        """Return source list in priority order."""
        # MLB API is best - returns full schedule in one request
        # ESPN caps at ~25 results for baseball
        # Baseball-Reference requires HTML parsing
        return ["mlb_api", "espn", "baseball_reference"]

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build URL for a source.

        Args:
            source: Source identifier
            **kwargs: URL parameters (start_date/end_date for mlb_api,
                date for espn)

        Raises:
            ValueError: If the source is not recognized.
        """
        if source == "baseball_reference":
            # NOTE(review): 'month' is read but never used in the URL below.
            month = kwargs.get("month", "april")
            # Baseball-Reference uses season year in URL
            return f"https://www.baseball-reference.com/leagues/majors/{self.season}-schedule.shtml"
        elif source == "mlb_api":
            start_date = kwargs.get("start_date", "")
            end_date = kwargs.get("end_date", "")
            return f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&startDate={start_date}&endDate={end_date}"
        elif source == "espn":
            date_str = kwargs.get("date", "")
            return f"https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard?dates={date_str}"
        raise ValueError(f"Unknown source: {source}")

    def _get_season_months(self) -> list[tuple[int, int]]:
        """Get the months to scrape for MLB season.

        MLB season runs March/April through October/November.

        Returns:
            List of (year, month) tuples, all within the season year.
        """
        months = []
        # Spring training / early season
        for month in range(3, 12):  # March-November
            months.append((self.season, month))
        return months

    def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
        """Scrape games from a specific source."""
        if source == "baseball_reference":
            return self._scrape_baseball_reference()
        elif source == "mlb_api":
            return self._scrape_mlb_api()
        elif source == "espn":
            return self._scrape_espn()
        else:
            raise ValueError(f"Unknown source: {source}")

    def _scrape_baseball_reference(self) -> list[RawGameData]:
        """Scrape games from Baseball-Reference.

        BR has a single schedule page per season.
        Format: https://www.baseball-reference.com/leagues/majors/YYYY-schedule.shtml
        """
        url = self._get_source_url("baseball_reference")
        try:
            html = self.session.get_html(url)
            games = self._parse_baseball_reference(html, url)
            return games
        except Exception as e:
            # Re-raise so the caller records the failure and moves on.
            self._logger.error(f"Failed to scrape Baseball-Reference: {e}")
            raise

    def _parse_baseball_reference(
        self,
        html: str,
        source_url: str,
    ) -> list[RawGameData]:
        """Parse Baseball-Reference schedule HTML.

        Structure: Games are organized by date in div elements.
        Each game row has: date, away team, away score, home team, home score, venue.

        Args:
            html: Raw schedule page HTML
            source_url: URL the HTML came from (attached to each game)
        """
        soup = BeautifulSoup(html, "lxml")
        games: list[RawGameData] = []
        # Find all game divs - they use class "game" or similar
        # Baseball-Reference uses <p class="game"> for each game
        # NOTE(review): 'game_paragraphs' is never used; the walk below
        # re-discovers the same <p class="game"> elements in document order.
        game_paragraphs = soup.find_all("p", class_="game")
        current_date = None
        # Walk headers and paragraphs in document order so each game row is
        # associated with the most recent date header seen.
        for elem in soup.find_all(["h3", "p"]):
            # H3 contains date headers
            if elem.name == "h3":
                date_text = elem.get_text(strip=True)
                try:
                    # Format: "Thursday, April 1, 2026"
                    current_date = datetime.strptime(date_text, "%A, %B %d, %Y")
                except ValueError:
                    continue
            elif elem.name == "p" and "game" in elem.get("class", []):
                if current_date is None:
                    # Game row before any date header: nothing to assign.
                    continue
                try:
                    game = self._parse_br_game(elem, current_date, source_url)
                    if game:
                        games.append(game)
                except Exception as e:
                    self._logger.debug(f"Failed to parse game: {e}")
                    continue
        return games

    def _parse_br_game(
        self,
        elem,
        game_date: datetime,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single Baseball-Reference game element.

        Returns:
            RawGameData, or None when the element lacks two team links.
        """
        text = elem.get_text(" ", strip=True)
        # Parse game text - formats vary:
        # "Team A (5) @ Team B (3)" or "Team A @ Team B"
        # Also handles doubleheader notation
        # Find all links - usually team names
        links = elem.find_all("a")
        if len(links) < 2:
            return None
        # First link is away team, second is home team
        away_team = links[0].get_text(strip=True)
        home_team = links[1].get_text(strip=True)
        # Try to extract scores from text
        away_score = None
        home_score = None
        # Look for score pattern "(N)"
        import re  # NOTE(review): consider hoisting this to module level
        score_pattern = r"\((\d+)\)"
        scores = re.findall(score_pattern, text)
        if len(scores) >= 2:
            try:
                away_score = int(scores[0])
                home_score = int(scores[1])
            except (ValueError, IndexError):
                pass
        # Determine status
        status = "final" if home_score is not None else "scheduled"
        # Check for postponed/cancelled
        text_lower = text.lower()
        if "postponed" in text_lower:
            status = "postponed"
        elif "cancelled" in text_lower or "canceled" in text_lower:
            status = "cancelled"
        # Extract venue if present (usually after @ symbol)
        stadium = None
        if len(links) > 2:
            # Third link might be stadium
            stadium = links[2].get_text(strip=True)
        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=stadium,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )

    def _scrape_mlb_api(self) -> list[RawGameData]:
        """Scrape games from MLB Stats API using full season query."""
        # Build date range for entire season (March-November)
        season_months = self._get_season_months()
        start_year, start_month = season_months[0]
        end_year, end_month = season_months[-1]
        # Get last day of end month
        if end_month == 12:
            end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
        else:
            # Day before the 1st of the following month.
            end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
        start_date = date(start_year, start_month, 1)
        url = f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&startDate={start_date.strftime('%Y-%m-%d')}&endDate={end_date.strftime('%Y-%m-%d')}"
        self._logger.info(f"Fetching MLB schedule: {start_date} to {end_date}")
        try:
            data = self.session.get_json(url)
            return self._parse_mlb_api_response(data, url)
        except Exception as e:
            # Empty list (not a raise): scrape_games() treats an empty result
            # as "try the next source".
            self._logger.error(f"MLB API error: {e}")
            return []

    def _parse_mlb_api_response(
        self,
        data: dict,
        source_url: str,
    ) -> list[RawGameData]:
        """Parse MLB Stats API response.

        Games that fail to parse are logged at debug level and skipped.
        """
        games: list[RawGameData] = []
        dates = data.get("dates", [])
        # Response groups games under per-date entries.
        for date_entry in dates:
            for game in date_entry.get("games", []):
                try:
                    raw_game = self._parse_mlb_api_game(game, source_url)
                    if raw_game:
                        games.append(raw_game)
                except Exception as e:
                    self._logger.debug(f"Failed to parse MLB API game: {e}")
                    continue
        return games

    def _parse_mlb_api_game(
        self,
        game: dict,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single MLB API game.

        Returns:
            RawGameData, or None when the date or either team name is missing.
        """
        # Get game date/time
        game_date_str = game.get("gameDate", "")
        if not game_date_str:
            return None
        try:
            # API dates use a trailing "Z"; fromisoformat needs an explicit offset.
            game_date = datetime.fromisoformat(game_date_str.replace("Z", "+00:00"))
        except ValueError:
            return None
        # Get teams
        teams = game.get("teams", {})
        away_data = teams.get("away", {})
        home_data = teams.get("home", {})
        away_team_info = away_data.get("team", {})
        home_team_info = home_data.get("team", {})
        away_team = away_team_info.get("name", "")
        home_team = home_team_info.get("name", "")
        if not away_team or not home_team:
            return None
        # Get scores
        away_score = away_data.get("score")
        home_score = home_data.get("score")
        # Get venue
        venue = game.get("venue", {})
        stadium = venue.get("name")
        # Get status
        status_data = game.get("status", {})
        abstract_game_state = status_data.get("abstractGameState", "").lower()
        detailed_state = status_data.get("detailedState", "").lower()
        if abstract_game_state == "final":
            status = "final"
        elif "postponed" in detailed_state:
            status = "postponed"
        elif "cancelled" in detailed_state or "canceled" in detailed_state:
            status = "cancelled"
        else:
            status = "scheduled"
        # Check for doubleheader
        # NOTE(review): the first assignment is effectively dead — game_number
        # is only passed through below when doubleHeader == "Y".
        game_number = game.get("gameNumber")
        if game.get("doubleHeader") == "Y":
            game_number = game.get("gameNumber", 1)
        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=stadium,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
            game_number=game_number if game.get("doubleHeader") == "Y" else None,
        )

    def _scrape_espn(self) -> list[RawGameData]:
        """Scrape games from ESPN API using date range query."""
        # Build date range for entire season (March-November)
        season_months = self._get_season_months()
        start_year, start_month = season_months[0]
        end_year, end_month = season_months[-1]
        # Get last day of end month
        if end_month == 12:
            end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
        else:
            # Day before the 1st of the following month.
            end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
        start_date = date(start_year, start_month, 1)
        date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
        url = f"https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard?limit=3000&dates={date_range}"
        self._logger.info(f"Fetching MLB schedule: {date_range}")
        try:
            data = self.session.get_json(url)
            return self._parse_espn_response(data, url)
        except Exception as e:
            # Empty list signals "no data from this source" to scrape_games().
            self._logger.error(f"ESPN error: {e}")
            return []

    def _parse_espn_response(
        self,
        data: dict,
        source_url: str,
    ) -> list[RawGameData]:
        """Parse ESPN API response.

        Events that fail to parse are logged at debug level and skipped.
        """
        games: list[RawGameData] = []
        events = data.get("events", [])
        for event in events:
            try:
                game = self._parse_espn_event(event, source_url)
                if game:
                    games.append(game)
            except Exception as e:
                self._logger.debug(f"Failed to parse ESPN event: {e}")
                continue
        return games

    def _parse_espn_event(
        self,
        event: dict,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single ESPN event.

        Returns:
            RawGameData, or None when date/competition/team data is missing.
        """
        # Get date
        date_str = event.get("date", "")
        if not date_str:
            return None
        try:
            # ESPN dates use a trailing "Z"; fromisoformat needs an offset.
            game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
        except ValueError:
            return None
        # Get competitions
        competitions = event.get("competitions", [])
        if not competitions:
            return None
        competition = competitions[0]
        # Get teams
        competitors = competition.get("competitors", [])
        if len(competitors) != 2:
            return None
        home_team = None
        away_team = None
        home_score = None
        away_score = None
        for competitor in competitors:
            team_info = competitor.get("team", {})
            team_name = team_info.get("displayName", "")
            is_home = competitor.get("homeAway") == "home"
            score = competitor.get("score")
            # NOTE(review): falsy scores are treated as missing; ESPN scores
            # appear to be strings (int() below), so "0" still parses — confirm.
            if score:
                try:
                    score = int(score)
                except (ValueError, TypeError):
                    score = None
            if is_home:
                home_team = team_name
                home_score = score
            else:
                away_team = team_name
                away_score = score
        if not home_team or not away_team:
            return None
        # Get venue
        venue = competition.get("venue", {})
        stadium = venue.get("fullName")
        # Get status
        status_info = competition.get("status", {})
        status_type = status_info.get("type", {})
        status_name = status_type.get("name", "").lower()
        if status_name == "status_final":
            status = "final"
        elif status_name == "status_postponed":
            status = "postponed"
        elif status_name == "status_canceled":
            status = "cancelled"
        else:
            status = "scheduled"
        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=stadium,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )

    def _normalize_games(
        self,
        raw_games: list[RawGameData],
    ) -> tuple[list[Game], list[ManualReviewItem]]:
        """Normalize raw games to Game objects with canonical IDs.

        Groups games by (date, matchup) first so doubleheaders receive
        distinct game numbers even when the source didn't supply them.
        """
        games: list[Game] = []
        review_items: list[ManualReviewItem] = []
        # Track games by date/matchup for doubleheader detection
        games_by_matchup: dict[str, list[RawGameData]] = {}
        for raw in raw_games:
            date_key = raw.game_date.strftime("%Y%m%d")
            matchup_key = f"{date_key}_{raw.away_team_raw}_{raw.home_team_raw}"
            if matchup_key not in games_by_matchup:
                games_by_matchup[matchup_key] = []
            games_by_matchup[matchup_key].append(raw)
        # Process games with doubleheader detection
        for matchup_key, matchup_games in games_by_matchup.items():
            is_doubleheader = len(matchup_games) > 1
            # Sort by time if doubleheader
            if is_doubleheader:
                matchup_games.sort(key=lambda g: g.game_date)
            for i, raw in enumerate(matchup_games):
                # Use provided game_number or calculate from order
                game_number = raw.game_number or ((i + 1) if is_doubleheader else None)
                game, item_reviews = self._normalize_single_game(raw, game_number)
                if game:
                    games.append(game)
                    # Log each accepted game for audit/debugging.
                    log_game(
                        self.sport,
                        game.id,
                        game.home_team_id,
                        game.away_team_id,
                        game.game_date.strftime("%Y-%m-%d"),
                        game.status,
                    )
                # Review items are kept even when the game itself is dropped.
                review_items.extend(item_reviews)
        return games, review_items

    def _normalize_single_game(
        self,
        raw: RawGameData,
        game_number: Optional[int],
    ) -> tuple[Optional[Game], list[ManualReviewItem]]:
        """Normalize a single raw game.

        Returns:
            Tuple of (Game, review items); Game is None when either team
            fails to resolve to a canonical ID.
        """
        review_items: list[ManualReviewItem] = []
        # Resolve home team
        home_result = self._team_resolver.resolve(
            raw.home_team_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )
        if home_result.review_item:
            review_items.append(home_result.review_item)
        if not home_result.canonical_id:
            # Unresolvable team: drop the game, keep the review items.
            log_warning(f"Could not resolve home team: {raw.home_team_raw}")
            return None, review_items
        # Resolve away team
        away_result = self._team_resolver.resolve(
            raw.away_team_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )
        if away_result.review_item:
            review_items.append(away_result.review_item)
        if not away_result.canonical_id:
            log_warning(f"Could not resolve away team: {raw.away_team_raw}")
            return None, review_items
        # Resolve stadium
        stadium_id = None
        if raw.stadium_raw:
            stadium_result = self._stadium_resolver.resolve(
                raw.stadium_raw,
                check_date=raw.game_date.date(),
                source_url=raw.source_url,
            )
            if stadium_result.review_item:
                review_items.append(stadium_result.review_item)
            # An unresolved stadium is not fatal; Game gets an empty stadium_id.
            stadium_id = stadium_result.canonical_id
        # Get abbreviations for game ID
        home_abbrev = self._get_abbreviation(home_result.canonical_id)
        away_abbrev = self._get_abbreviation(away_result.canonical_id)
        # Generate canonical game ID
        game_id = generate_game_id(
            sport=self.sport,
            season=self.season,
            away_abbrev=away_abbrev,
            home_abbrev=home_abbrev,
            game_date=raw.game_date,
            game_number=game_number,
        )
        game = Game(
            id=game_id,
            sport=self.sport,
            season=self.season,
            home_team_id=home_result.canonical_id,
            away_team_id=away_result.canonical_id,
            stadium_id=stadium_id or "",
            game_date=raw.game_date,
            game_number=game_number,
            home_score=raw.home_score,
            away_score=raw.away_score,
            status=raw.status,
            source_url=raw.source_url,
            raw_home_team=raw.home_team_raw,
            raw_away_team=raw.away_team_raw,
            raw_stadium=raw.stadium_raw,
        )
        return game, review_items

    def _get_abbreviation(self, team_id: str) -> str:
        """Extract abbreviation from team ID (last underscore segment)."""
        # team_mlb_nyy -> nyy
        parts = team_id.split("_")
        return parts[-1] if parts else ""

    def scrape_teams(self) -> list[Team]:
        """Get all MLB teams from hardcoded mappings."""
        teams: list[Team] = []
        seen: set[str] = set()
        # MLB league/division structure
        divisions = {
            "AL East": ("American", ["BAL", "BOS", "NYY", "TB", "TOR"]),
            "AL Central": ("American", ["CHW", "CLE", "DET", "KC", "MIN"]),
            "AL West": ("American", ["HOU", "LAA", "OAK", "SEA", "TEX"]),
            "NL East": ("National", ["ATL", "MIA", "NYM", "PHI", "WSN"]),
            "NL Central": ("National", ["CHC", "CIN", "MIL", "PIT", "STL"]),
            "NL West": ("National", ["ARI", "COL", "LAD", "SD", "SF"]),
        }
        # Build reverse lookup
        team_divisions: dict[str, tuple[str, str]] = {}
        for div, (league, abbrevs) in divisions.items():
            for abbrev in abbrevs:
                team_divisions[abbrev] = (league, div)
        for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("mlb", {}).items():
            if team_id in seen:
                # Multiple aliases can map to one team; emit each team once.
                continue
            seen.add(team_id)
            # Parse team name from full name
            parts = full_name.split()
            if len(parts) >= 2:
                team_name = parts[-1]
                # Handle multi-word team names
                # (e.g. "Red Sox", "White Sox", "Blue Jays")
                if team_name in ["Sox", "Jays"]:
                    team_name = " ".join(parts[-2:])
            else:
                team_name = full_name
            # Get league and division
            league, div = team_divisions.get(abbrev, (None, None))
            team = Team(
                id=team_id,
                sport="mlb",
                city=city,
                name=team_name,
                full_name=full_name,
                abbreviation=abbrev,
                conference=league,  # MLB uses "league" but we map to conference field
                division=div,
                stadium_id=stadium_id,
            )
            teams.append(team)
        return teams

    def scrape_stadiums(self) -> list[Stadium]:
        """Get all MLB stadiums from hardcoded mappings."""
        stadiums: list[Stadium] = []
        mlb_stadiums = STADIUM_MAPPINGS.get("mlb", {})
        for stadium_id, info in mlb_stadiums.items():
            stadium = Stadium(
                id=stadium_id,
                sport="mlb",
                name=info.name,
                city=info.city,
                state=info.state,
                country=info.country,
                latitude=info.latitude,
                longitude=info.longitude,
                surface="grass",  # Most MLB stadiums
                roof_type="open",  # Most MLB stadiums
            )
            stadiums.append(stadium)
        return stadiums
def create_mlb_scraper(season: int) -> MLBScraper:
    """Build and return an MLBScraper for the given season year."""
    scraper = MLBScraper(season=season)
    return scraper

View File

@@ -0,0 +1,400 @@
"""MLS scraper implementation with multi-source fallback."""
from datetime import datetime, date, timedelta
from typing import Optional
from .base import BaseScraper, RawGameData, ScrapeResult
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..normalizers.canonical_id import generate_game_id
from ..normalizers.team_resolver import (
TeamResolver,
TEAM_MAPPINGS,
get_team_resolver,
)
from ..normalizers.stadium_resolver import (
StadiumResolver,
STADIUM_MAPPINGS,
get_stadium_resolver,
)
from ..utils.logging import get_logger, log_game, log_warning
class MLSScraper(BaseScraper):
"""MLS schedule scraper with multi-source fallback.
Sources (in priority order):
1. ESPN API - Most reliable for MLS
2. FBref - Backup option
"""
    def __init__(self, season: int, **kwargs):
        """Initialize MLS scraper.

        Args:
            season: Season year (e.g., 2026 for 2026 season)
            **kwargs: Forwarded to BaseScraper (e.g., a custom session)
        """
        super().__init__("mls", season, **kwargs)
        # Resolvers translate raw source names into canonical MLS IDs.
        self._team_resolver = get_team_resolver("mls")
        self._stadium_resolver = get_stadium_resolver("mls")
def _get_sources(self) -> list[str]:
"""Return source list in priority order."""
# FBref scraper not yet implemented - TODO for future
return ["espn"]
def _get_source_url(self, source: str, **kwargs) -> str:
"""Build URL for a source."""
if source == "espn":
date_str = kwargs.get("date", "")
return f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.1/scoreboard?dates={date_str}"
elif source == "fbref":
return f"https://fbref.com/en/comps/22/{self.season}/schedule/{self.season}-Major-League-Soccer-Scores-and-Fixtures"
raise ValueError(f"Unknown source: {source}")
def _get_season_months(self) -> list[tuple[int, int]]:
"""Get the months to scrape for MLS season.
MLS season runs February/March through October/November.
"""
months = []
# MLS runs within a calendar year
for month in range(2, 12): # Feb-Nov
months.append((self.season, month))
return months
def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
"""Scrape games from a specific source."""
if source == "espn":
return self._scrape_espn()
elif source == "fbref":
return self._scrape_fbref()
else:
raise ValueError(f"Unknown source: {source}")
    def _scrape_espn(self) -> list[RawGameData]:
        """Scrape games from ESPN API using date range query."""
        # Build date range for entire season (Feb-November)
        season_months = self._get_season_months()
        start_year, start_month = season_months[0]
        end_year, end_month = season_months[-1]
        # Get last day of end month
        if end_month == 12:
            end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
        else:
            # Day before the 1st of the following month.
            end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
        start_date = date(start_year, start_month, 1)
        date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
        url = f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.1/scoreboard?limit=1000&dates={date_range}"
        self._logger.info(f"Fetching MLS schedule: {date_range}")
        try:
            data = self.session.get_json(url)
            return self._parse_espn_response(data, url)
        except Exception as e:
            # Empty list (not a raise): scrape_games() treats an empty result
            # as "try the next source".
            self._logger.error(f"ESPN error: {e}")
            return []
def _parse_espn_response(
self,
data: dict,
source_url: str,
) -> list[RawGameData]:
"""Parse ESPN API response."""
games: list[RawGameData] = []
events = data.get("events", [])
for event in events:
try:
game = self._parse_espn_event(event, source_url)
if game:
games.append(game)
except Exception as e:
self._logger.debug(f"Failed to parse ESPN event: {e}")
continue
return games
    def _parse_espn_event(
        self,
        event: dict,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single ESPN event.

        Returns:
            RawGameData, or None when date/competition/team data is missing.
        """
        # Get date
        date_str = event.get("date", "")
        if not date_str:
            return None
        try:
            # ESPN dates use a trailing "Z"; fromisoformat needs an offset.
            game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
        except ValueError:
            return None
        # Get competitions
        competitions = event.get("competitions", [])
        if not competitions:
            return None
        competition = competitions[0]
        # Get teams
        competitors = competition.get("competitors", [])
        if len(competitors) != 2:
            return None
        home_team = None
        away_team = None
        home_score = None
        away_score = None
        for competitor in competitors:
            team_info = competitor.get("team", {})
            team_name = team_info.get("displayName", "")
            is_home = competitor.get("homeAway") == "home"
            score = competitor.get("score")
            # NOTE(review): falsy scores are treated as missing; ESPN scores
            # appear to be strings (int() below), so "0" still parses — confirm.
            if score:
                try:
                    score = int(score)
                except (ValueError, TypeError):
                    score = None
            if is_home:
                home_team = team_name
                home_score = score
            else:
                away_team = team_name
                away_score = score
        if not home_team or not away_team:
            return None
        # Get venue
        venue = competition.get("venue", {})
        stadium = venue.get("fullName")
        # Get status
        status_info = competition.get("status", {})
        status_type = status_info.get("type", {})
        status_name = status_type.get("name", "").lower()
        if status_name == "status_final":
            status = "final"
        elif status_name == "status_postponed":
            status = "postponed"
        elif status_name == "status_canceled":
            status = "cancelled"
        else:
            status = "scheduled"
        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=stadium,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )
    def _scrape_fbref(self) -> list[RawGameData]:
        """Scrape games from FBref.

        Raises:
            NotImplementedError: always; FBref support is a stub so the
                multi-source fallback moves on to the next source.
        """
        # FBref scraping would go here
        raise NotImplementedError("FBref scraper not implemented")
def _normalize_games(
self,
raw_games: list[RawGameData],
) -> tuple[list[Game], list[ManualReviewItem]]:
"""Normalize raw games to Game objects with canonical IDs."""
games: list[Game] = []
review_items: list[ManualReviewItem] = []
for raw in raw_games:
game, item_reviews = self._normalize_single_game(raw)
if game:
games.append(game)
log_game(
self.sport,
game.id,
game.home_team_id,
game.away_team_id,
game.game_date.strftime("%Y-%m-%d"),
game.status,
)
review_items.extend(item_reviews)
return games, review_items
def _normalize_single_game(
self,
raw: RawGameData,
) -> tuple[Optional[Game], list[ManualReviewItem]]:
"""Normalize a single raw game."""
review_items: list[ManualReviewItem] = []
# Resolve home team
home_result = self._team_resolver.resolve(
raw.home_team_raw,
check_date=raw.game_date.date(),
source_url=raw.source_url,
)
if home_result.review_item:
review_items.append(home_result.review_item)
if not home_result.canonical_id:
log_warning(f"Could not resolve home team: {raw.home_team_raw}")
return None, review_items
# Resolve away team
away_result = self._team_resolver.resolve(
raw.away_team_raw,
check_date=raw.game_date.date(),
source_url=raw.source_url,
)
if away_result.review_item:
review_items.append(away_result.review_item)
if not away_result.canonical_id:
log_warning(f"Could not resolve away team: {raw.away_team_raw}")
return None, review_items
# Resolve stadium
stadium_id = None
if raw.stadium_raw:
stadium_result = self._stadium_resolver.resolve(
raw.stadium_raw,
check_date=raw.game_date.date(),
source_url=raw.source_url,
)
if stadium_result.review_item:
review_items.append(stadium_result.review_item)
stadium_id = stadium_result.canonical_id
# Get abbreviations for game ID
home_abbrev = self._get_abbreviation(home_result.canonical_id)
away_abbrev = self._get_abbreviation(away_result.canonical_id)
# Generate canonical game ID
game_id = generate_game_id(
sport=self.sport,
season=self.season,
away_abbrev=away_abbrev,
home_abbrev=home_abbrev,
game_date=raw.game_date,
game_number=None,
)
game = Game(
id=game_id,
sport=self.sport,
season=self.season,
home_team_id=home_result.canonical_id,
away_team_id=away_result.canonical_id,
stadium_id=stadium_id or "",
game_date=raw.game_date,
game_number=None,
home_score=raw.home_score,
away_score=raw.away_score,
status=raw.status,
source_url=raw.source_url,
raw_home_team=raw.home_team_raw,
raw_away_team=raw.away_team_raw,
raw_stadium=raw.stadium_raw,
)
return game, review_items
def _get_abbreviation(self, team_id: str) -> str:
"""Extract abbreviation from team ID."""
parts = team_id.split("_")
return parts[-1] if parts else ""
def scrape_teams(self) -> list[Team]:
"""Get all MLS teams from hardcoded mappings."""
teams: list[Team] = []
seen: set[str] = set()
# MLS conference structure
conferences = {
"Eastern": ["ATL", "CLT", "CHI", "CIN", "CLB", "DC", "MIA", "MTL", "NE", "NYC", "RB", "ORL", "PHI", "TOR"],
"Western": ["AUS", "COL", "DAL", "HOU", "LAG", "LAFC", "MIN", "NSH", "POR", "SLC", "SD", "SJ", "SEA", "SKC", "STL", "VAN"],
}
# Build reverse lookup
team_conferences: dict[str, str] = {}
for conf, abbrevs in conferences.items():
for abbrev in abbrevs:
team_conferences[abbrev] = conf
for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("mls", {}).items():
if team_id in seen:
continue
seen.add(team_id)
# Parse team name
team_name = full_name
# Get conference
conf = team_conferences.get(abbrev)
team = Team(
id=team_id,
sport="mls",
city=city,
name=team_name,
full_name=full_name,
abbreviation=abbrev,
conference=conf,
division=None, # MLS doesn't have divisions
stadium_id=stadium_id,
)
teams.append(team)
return teams
def scrape_stadiums(self) -> list[Stadium]:
"""Get all MLS stadiums from hardcoded mappings."""
stadiums: list[Stadium] = []
mls_stadiums = STADIUM_MAPPINGS.get("mls", {})
for stadium_id, info in mls_stadiums.items():
stadium = Stadium(
id=stadium_id,
sport="mls",
name=info.name,
city=info.city,
state=info.state,
country=info.country,
latitude=info.latitude,
longitude=info.longitude,
surface="grass",
roof_type="open",
)
stadiums.append(stadium)
return stadiums
def create_mls_scraper(season: int) -> MLSScraper:
    """Build an MLSScraper for the given season start year."""
    return MLSScraper(season=season)

View File

@@ -0,0 +1,661 @@
"""NBA scraper implementation with multi-source fallback."""
from datetime import datetime, date, timezone
from typing import Optional
from bs4 import BeautifulSoup
import re
from .base import BaseScraper, RawGameData, ScrapeResult
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..normalizers.canonical_id import generate_game_id
from ..normalizers.team_resolver import (
TeamResolver,
TEAM_MAPPINGS,
get_team_resolver,
)
from ..normalizers.stadium_resolver import (
StadiumResolver,
STADIUM_MAPPINGS,
get_stadium_resolver,
)
from ..normalizers.timezone import parse_datetime
from ..utils.logging import get_logger, log_game, log_warning
# Month name to number mapping (lowercase name -> 1-12).
# NOTE(review): not referenced elsewhere in this module's visible code --
# confirm it is used by a date-parsing helper before removing.
MONTH_MAP = {
    "january": 1, "february": 2, "march": 3, "april": 4,
    "may": 5, "june": 6, "july": 7, "august": 8,
    "september": 9, "october": 10, "november": 11, "december": 12,
}
# Basketball Reference month URLs, in season order: an NBA season runs
# from the October tip-off through the June Finals.
BR_MONTHS = [
    "october", "november", "december",
    "january", "february", "march", "april", "may", "june",
]
class NBAScraper(BaseScraper):
"""NBA schedule scraper with multi-source fallback.
Sources (in priority order):
1. Basketball-Reference - Most reliable, complete historical data
2. ESPN API - Good for current/future seasons
3. CBS Sports - Backup option
"""
    def __init__(self, season: int, **kwargs):
        """Initialize NBA scraper.

        Args:
            season: Season start year (e.g., 2025 for 2025-26)
            **kwargs: Forwarded to BaseScraper (e.g. session options).
        """
        super().__init__("nba", season, **kwargs)
        # Resolvers map source-spelled team/arena names to canonical IDs.
        self._team_resolver = get_team_resolver("nba")
        self._stadium_resolver = get_stadium_resolver("nba")
def _get_sources(self) -> list[str]:
"""Return source list in priority order."""
# CBS scraper not yet implemented - TODO for future
return ["basketball_reference", "espn"]
def _get_source_url(self, source: str, **kwargs) -> str:
"""Build URL for a source."""
if source == "basketball_reference":
month = kwargs.get("month", "october")
year = kwargs.get("year", self.season + 1)
return f"https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html"
elif source == "espn":
date_str = kwargs.get("date", "")
return f"https://site.api.espn.com/apis/site/v2/sports/basketball/nba/scoreboard?dates={date_str}"
elif source == "cbs":
return "https://www.cbssports.com/nba/schedule/"
raise ValueError(f"Unknown source: {source}")
def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
"""Scrape games from a specific source."""
if source == "basketball_reference":
return self._scrape_basketball_reference()
elif source == "espn":
return self._scrape_espn()
elif source == "cbs":
return self._scrape_cbs()
else:
raise ValueError(f"Unknown source: {source}")
def _scrape_basketball_reference(self) -> list[RawGameData]:
"""Scrape games from Basketball-Reference.
BR organizes games by month with separate pages.
Format: https://www.basketball-reference.com/leagues/NBA_YYYY_games-month.html
where YYYY is the ending year of the season.
Bails early if first few months have no data (season doesn't exist).
"""
all_games: list[RawGameData] = []
end_year = self.season + 1
consecutive_empty_months = 0
for month in BR_MONTHS:
url = self._get_source_url("basketball_reference", month=month, year=end_year)
try:
html = self.session.get_html(url)
games = self._parse_basketball_reference(html, url)
if games:
all_games.extend(games)
consecutive_empty_months = 0
self._logger.debug(f"Found {len(games)} games in {month}")
else:
consecutive_empty_months += 1
except Exception as e:
# Some months may not exist (e.g., no games in August)
self._logger.debug(f"No data for {month}: {e}")
consecutive_empty_months += 1
# If first 3 months (Oct, Nov, Dec) all have no data, season doesn't exist
if consecutive_empty_months >= 3 and not all_games:
self._logger.info(f"No games found in first {consecutive_empty_months} months, season likely doesn't exist")
break
return all_games
def _parse_basketball_reference(
self,
html: str,
source_url: str,
) -> list[RawGameData]:
"""Parse Basketball-Reference schedule HTML.
Table structure:
- th[data-stat="date_game"]: Date (e.g., "Tue, Oct 22, 2024")
- td[data-stat="visitor_team_name"]: Away team
- td[data-stat="home_team_name"]: Home team
- td[data-stat="visitor_pts"]: Away score
- td[data-stat="home_pts"]: Home score
- td[data-stat="arena_name"]: Arena/stadium name
"""
soup = BeautifulSoup(html, "lxml")
games: list[RawGameData] = []
# Find the schedule table
table = soup.find("table", id="schedule")
if not table:
return games
tbody = table.find("tbody")
if not tbody:
return games
for row in tbody.find_all("tr"):
# Skip header rows
if row.get("class") and "thead" in row.get("class", []):
continue
try:
game = self._parse_br_row(row, source_url)
if game:
games.append(game)
except Exception as e:
self._logger.debug(f"Failed to parse row: {e}")
continue
return games
def _parse_br_row(
self,
row,
source_url: str,
) -> Optional[RawGameData]:
"""Parse a single Basketball-Reference table row."""
# Get date
date_cell = row.find("th", {"data-stat": "date_game"})
if not date_cell:
return None
date_text = date_cell.get_text(strip=True)
if not date_text:
return None
# Parse date (format: "Tue, Oct 22, 2024")
try:
game_date = datetime.strptime(date_text, "%a, %b %d, %Y")
except ValueError:
# Try alternative format
try:
game_date = datetime.strptime(date_text, "%B %d, %Y")
except ValueError:
self._logger.debug(f"Could not parse date: {date_text}")
return None
# Get teams
away_cell = row.find("td", {"data-stat": "visitor_team_name"})
home_cell = row.find("td", {"data-stat": "home_team_name"})
if not away_cell or not home_cell:
return None
away_team = away_cell.get_text(strip=True)
home_team = home_cell.get_text(strip=True)
if not away_team or not home_team:
return None
# Get scores (may be empty for future games)
away_score_cell = row.find("td", {"data-stat": "visitor_pts"})
home_score_cell = row.find("td", {"data-stat": "home_pts"})
away_score = None
home_score = None
if away_score_cell and away_score_cell.get_text(strip=True):
try:
away_score = int(away_score_cell.get_text(strip=True))
except ValueError:
pass
if home_score_cell and home_score_cell.get_text(strip=True):
try:
home_score = int(home_score_cell.get_text(strip=True))
except ValueError:
pass
# Get arena
arena_cell = row.find("td", {"data-stat": "arena_name"})
arena = arena_cell.get_text(strip=True) if arena_cell else None
# Determine status
status = "final" if home_score is not None else "scheduled"
# Check for postponed/cancelled
notes_cell = row.find("td", {"data-stat": "game_remarks"})
if notes_cell:
notes = notes_cell.get_text(strip=True).lower()
if "postponed" in notes:
status = "postponed"
elif "cancelled" in notes or "canceled" in notes:
status = "cancelled"
return RawGameData(
game_date=game_date,
home_team_raw=home_team,
away_team_raw=away_team,
stadium_raw=arena,
home_score=home_score,
away_score=away_score,
status=status,
source_url=source_url,
)
def _scrape_espn(self) -> list[RawGameData]:
"""Scrape games from ESPN API.
ESPN API returns games for a specific date range.
We iterate through each day of the season.
Bails out early if no games found after checking first month.
"""
all_games: list[RawGameData] = []
consecutive_empty_days = 0
max_empty_days = 45 # Bail after ~1.5 months of no games
for year, month in self._get_season_months():
# Get number of days in month
if month == 12:
next_month = date(year + 1, 1, 1)
else:
next_month = date(year, month + 1, 1)
days_in_month = (next_month - date(year, month, 1)).days
for day in range(1, days_in_month + 1):
try:
game_date = date(year, month, day)
date_str = game_date.strftime("%Y%m%d")
url = self._get_source_url("espn", date=date_str)
data = self.session.get_json(url)
games = self._parse_espn_response(data, url)
if games:
all_games.extend(games)
consecutive_empty_days = 0
else:
consecutive_empty_days += 1
# Bail early if no games found for a long stretch
if consecutive_empty_days >= max_empty_days:
self._logger.info(f"No games found for {max_empty_days} consecutive days, stopping ESPN scrape")
return all_games
except Exception as e:
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
consecutive_empty_days += 1
if consecutive_empty_days >= max_empty_days:
self._logger.info(f"Too many consecutive failures, stopping ESPN scrape")
return all_games
continue
return all_games
def _parse_espn_response(
self,
data: dict,
source_url: str,
) -> list[RawGameData]:
"""Parse ESPN API response."""
games: list[RawGameData] = []
events = data.get("events", [])
for event in events:
try:
game = self._parse_espn_event(event, source_url)
if game:
games.append(game)
except Exception as e:
self._logger.debug(f"Failed to parse ESPN event: {e}")
continue
return games
def _parse_espn_event(
self,
event: dict,
source_url: str,
) -> Optional[RawGameData]:
"""Parse a single ESPN event."""
# Get date
date_str = event.get("date", "")
if not date_str:
return None
try:
# ESPN uses ISO format
game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
except ValueError:
return None
# Get competitions (usually just one)
competitions = event.get("competitions", [])
if not competitions:
return None
competition = competitions[0]
# Get teams
competitors = competition.get("competitors", [])
if len(competitors) != 2:
return None
home_team = None
away_team = None
home_score = None
away_score = None
for competitor in competitors:
team_info = competitor.get("team", {})
team_name = team_info.get("displayName", "")
is_home = competitor.get("homeAway") == "home"
score = competitor.get("score")
if score:
try:
score = int(score)
except (ValueError, TypeError):
score = None
if is_home:
home_team = team_name
home_score = score
else:
away_team = team_name
away_score = score
if not home_team or not away_team:
return None
# Get venue
venue = competition.get("venue", {})
arena = venue.get("fullName")
# Get status
status_info = competition.get("status", {})
status_type = status_info.get("type", {})
status_name = status_type.get("name", "").lower()
if status_name == "status_final":
status = "final"
elif status_name == "status_postponed":
status = "postponed"
elif status_name == "status_canceled":
status = "cancelled"
else:
status = "scheduled"
return RawGameData(
game_date=game_date,
home_team_raw=home_team,
away_team_raw=away_team,
stadium_raw=arena,
home_score=home_score,
away_score=away_score,
status=status,
source_url=source_url,
)
    def _scrape_cbs(self) -> list[RawGameData]:
        """Scrape games from CBS Sports.

        CBS Sports is a backup source with less structured data.

        Raises:
            NotImplementedError: always; this stub lets the multi-source
                fallback move on to the next configured source.
        """
        # CBS Sports scraping would go here
        # For now, return empty to fall back to other sources
        raise NotImplementedError("CBS scraper not implemented")
def _normalize_games(
self,
raw_games: list[RawGameData],
) -> tuple[list[Game], list[ManualReviewItem]]:
"""Normalize raw games to Game objects with canonical IDs."""
games: list[Game] = []
review_items: list[ManualReviewItem] = []
# Track games by date for doubleheader detection
games_by_date: dict[str, list[RawGameData]] = {}
for raw in raw_games:
date_key = raw.game_date.strftime("%Y%m%d")
matchup_key = f"{date_key}_{raw.away_team_raw}_{raw.home_team_raw}"
if matchup_key not in games_by_date:
games_by_date[matchup_key] = []
games_by_date[matchup_key].append(raw)
# Process games with doubleheader detection
for matchup_key, matchup_games in games_by_date.items():
is_doubleheader = len(matchup_games) > 1
for i, raw in enumerate(matchup_games):
game_number = (i + 1) if is_doubleheader else None
game, item_reviews = self._normalize_single_game(raw, game_number)
if game:
games.append(game)
log_game(
self.sport,
game.id,
game.home_team_id,
game.away_team_id,
game.game_date.strftime("%Y-%m-%d"),
game.status,
)
review_items.extend(item_reviews)
return games, review_items
    def _normalize_single_game(
        self,
        raw: RawGameData,
        game_number: Optional[int],
    ) -> tuple[Optional[Game], list[ManualReviewItem]]:
        """Normalize a single raw game.

        Args:
            raw: Scraped record with source-spelled team/arena names.
            game_number: 1-based number for doubleheaders, else None.

        Returns:
            (game, review_items): game is None when either team fails to
            resolve; review_items carries fuzzy-match candidates that
            need manual confirmation.
        """
        review_items: list[ManualReviewItem] = []
        # Resolve home team
        home_result = self._team_resolver.resolve(
            raw.home_team_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )
        if home_result.review_item:
            review_items.append(home_result.review_item)
        if not home_result.canonical_id:
            log_warning(f"Could not resolve home team: {raw.home_team_raw}")
            return None, review_items
        # Resolve away team
        away_result = self._team_resolver.resolve(
            raw.away_team_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )
        if away_result.review_item:
            review_items.append(away_result.review_item)
        if not away_result.canonical_id:
            log_warning(f"Could not resolve away team: {raw.away_team_raw}")
            return None, review_items
        # Resolve stadium (optional - use home team's stadium if not found)
        stadium_id = None
        if raw.stadium_raw:
            stadium_result = self._stadium_resolver.resolve(
                raw.stadium_raw,
                check_date=raw.game_date.date(),
                source_url=raw.source_url,
            )
            if stadium_result.review_item:
                review_items.append(stadium_result.review_item)
            stadium_id = stadium_result.canonical_id
        # If no stadium found, use home team's default stadium
        if not stadium_id:
            # NOTE(review): assumes get_team_info() returns a tuple whose
            # index 2 is the city string -- confirm against TeamResolver.
            home_abbrev = home_result.canonical_id.split("_")[-1].upper()
            team_info = self._team_resolver.get_team_info(home_abbrev)
            if team_info:
                # First NBA venue whose city appears in the team's city wins.
                # NOTE(review): substring match may misfire for shared-city
                # teams (e.g. LAL/LAC) -- verify expected behavior.
                for sid, sinfo in STADIUM_MAPPINGS.get("nba", {}).items():
                    # Match by city
                    if sinfo.city.lower() in team_info[2].lower():
                        stadium_id = sid
                        break
        # Get abbreviations for game ID
        home_abbrev = self._get_abbreviation(home_result.canonical_id)
        away_abbrev = self._get_abbreviation(away_result.canonical_id)
        # Generate canonical game ID
        game_id = generate_game_id(
            sport=self.sport,
            season=self.season,
            away_abbrev=away_abbrev,
            home_abbrev=home_abbrev,
            game_date=raw.game_date,
            game_number=game_number,
        )
        game = Game(
            id=game_id,
            sport=self.sport,
            season=self.season,
            home_team_id=home_result.canonical_id,
            away_team_id=away_result.canonical_id,
            stadium_id=stadium_id or "",
            game_date=raw.game_date,
            game_number=game_number,
            home_score=raw.home_score,
            away_score=raw.away_score,
            status=raw.status,
            source_url=raw.source_url,
            raw_home_team=raw.home_team_raw,
            raw_away_team=raw.away_team_raw,
            raw_stadium=raw.stadium_raw,
        )
        return game, review_items
def _get_abbreviation(self, team_id: str) -> str:
"""Extract abbreviation from team ID."""
# team_nba_okc -> okc
parts = team_id.split("_")
return parts[-1] if parts else ""
def scrape_teams(self) -> list[Team]:
"""Get all NBA teams from hardcoded mappings."""
teams: list[Team] = []
seen: set[str] = set()
# NBA conference/division structure
divisions = {
"Atlantic": ("Eastern", ["BOS", "BKN", "NYK", "PHI", "TOR"]),
"Central": ("Eastern", ["CHI", "CLE", "DET", "IND", "MIL"]),
"Southeast": ("Eastern", ["ATL", "CHA", "MIA", "ORL", "WAS"]),
"Northwest": ("Western", ["DEN", "MIN", "OKC", "POR", "UTA"]),
"Pacific": ("Western", ["GSW", "LAC", "LAL", "PHX", "SAC"]),
"Southwest": ("Western", ["DAL", "HOU", "MEM", "NOP", "SAS"]),
}
# Build reverse lookup
team_divisions: dict[str, tuple[str, str]] = {}
for div, (conf, abbrevs) in divisions.items():
for abbrev in abbrevs:
team_divisions[abbrev] = (conf, div)
for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("nba", {}).items():
if team_id in seen:
continue
seen.add(team_id)
# Parse full name into city and name parts
parts = full_name.split()
if len(parts) >= 2:
# Handle special cases like "Oklahoma City Thunder"
if city == "Oklahoma City":
team_name = "Thunder"
elif city == "Golden State":
team_name = "Warriors"
elif city == "San Antonio":
team_name = "Spurs"
elif city == "New York":
team_name = parts[-1] # Knicks
elif city == "New Orleans":
team_name = "Pelicans"
elif city == "Los Angeles":
team_name = parts[-1] # Lakers or Clippers
else:
team_name = parts[-1]
else:
team_name = full_name
# Get conference and division
conf, div = team_divisions.get(abbrev, (None, None))
team = Team(
id=team_id,
sport="nba",
city=city,
name=team_name,
full_name=full_name,
abbreviation=abbrev,
conference=conf,
division=div,
stadium_id=stadium_id,
)
teams.append(team)
return teams
def scrape_stadiums(self) -> list[Stadium]:
"""Get all NBA stadiums from hardcoded mappings."""
stadiums: list[Stadium] = []
for stadium_id, info in STADIUM_MAPPINGS.get("nba", {}).items():
stadium = Stadium(
id=stadium_id,
sport="nba",
name=info.name,
city=info.city,
state=info.state,
country=info.country,
latitude=info.latitude,
longitude=info.longitude,
surface="hardwood",
roof_type="dome",
)
stadiums.append(stadium)
return stadiums
def create_nba_scraper(season: int) -> NBAScraper:
    """Build an NBAScraper for the given season start year."""
    return NBAScraper(season=season)

View File

@@ -0,0 +1,579 @@
"""NFL scraper implementation with multi-source fallback."""
from datetime import datetime, date
from typing import Optional
from bs4 import BeautifulSoup
from .base import BaseScraper, RawGameData, ScrapeResult
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..normalizers.canonical_id import generate_game_id
from ..normalizers.team_resolver import (
TeamResolver,
TEAM_MAPPINGS,
get_team_resolver,
)
from ..normalizers.stadium_resolver import (
StadiumResolver,
STADIUM_MAPPINGS,
get_stadium_resolver,
)
from ..utils.logging import get_logger, log_game, log_warning
# International game locations to filter out. Matched by substring against
# the venue name (and by exact city for neutral-site events) during ESPN
# parsing. NOTE(review): this list may lag the NFL's international slate
# (new host cities) -- keep in sync with the league schedule.
INTERNATIONAL_LOCATIONS = {"London", "Mexico City", "Frankfurt", "Munich", "São Paulo"}
class NFLScraper(BaseScraper):
"""NFL schedule scraper with multi-source fallback.
Sources (in priority order):
1. ESPN API - Most reliable for NFL
2. Pro-Football-Reference - Complete historical data
3. CBS Sports - Backup option
"""
    def __init__(self, season: int, **kwargs):
        """Initialize NFL scraper.

        Args:
            season: Season year (e.g., 2025 for 2025 season)
            **kwargs: Forwarded to BaseScraper (e.g. session options).
        """
        super().__init__("nfl", season, **kwargs)
        # Resolvers map source-spelled team/stadium names to canonical IDs.
        self._team_resolver = get_team_resolver("nfl")
        self._stadium_resolver = get_stadium_resolver("nfl")
def _get_sources(self) -> list[str]:
"""Return source list in priority order."""
# CBS scraper not yet implemented - TODO for future
return ["espn", "pro_football_reference"]
def _get_source_url(self, source: str, **kwargs) -> str:
"""Build URL for a source."""
if source == "espn":
week = kwargs.get("week", 1)
season_type = kwargs.get("season_type", 2) # 1=preseason, 2=regular, 3=postseason
return f"https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?seasontype={season_type}&week={week}"
elif source == "pro_football_reference":
return f"https://www.pro-football-reference.com/years/{self.season}/games.htm"
elif source == "cbs":
return "https://www.cbssports.com/nfl/schedule/"
raise ValueError(f"Unknown source: {source}")
def _get_season_months(self) -> list[tuple[int, int]]:
"""Get the months to scrape for NFL season.
NFL season runs September through February.
"""
months = []
# Regular season months
for month in range(9, 13): # Sept-Dec
months.append((self.season, month))
# Playoff months
for month in range(1, 3): # Jan-Feb
months.append((self.season + 1, month))
return months
def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
"""Scrape games from a specific source."""
if source == "espn":
return self._scrape_espn()
elif source == "pro_football_reference":
return self._scrape_pro_football_reference()
elif source == "cbs":
return self._scrape_cbs()
else:
raise ValueError(f"Unknown source: {source}")
def _scrape_espn(self) -> list[RawGameData]:
"""Scrape games from ESPN API.
ESPN NFL API uses week numbers.
"""
all_games: list[RawGameData] = []
# Scrape preseason (4 weeks)
for week in range(1, 5):
try:
url = self._get_source_url("espn", week=week, season_type=1)
data = self.session.get_json(url)
games = self._parse_espn_response(data, url)
all_games.extend(games)
except Exception as e:
self._logger.debug(f"ESPN preseason week {week} error: {e}")
continue
# Scrape regular season (18 weeks)
for week in range(1, 19):
try:
url = self._get_source_url("espn", week=week, season_type=2)
data = self.session.get_json(url)
games = self._parse_espn_response(data, url)
all_games.extend(games)
self._logger.debug(f"Found {len(games)} games in week {week}")
except Exception as e:
self._logger.debug(f"ESPN regular season week {week} error: {e}")
continue
# Scrape postseason (4 rounds)
for week in range(1, 5):
try:
url = self._get_source_url("espn", week=week, season_type=3)
data = self.session.get_json(url)
games = self._parse_espn_response(data, url)
all_games.extend(games)
except Exception as e:
self._logger.debug(f"ESPN postseason week {week} error: {e}")
continue
return all_games
def _parse_espn_response(
self,
data: dict,
source_url: str,
) -> list[RawGameData]:
"""Parse ESPN API response."""
games: list[RawGameData] = []
events = data.get("events", [])
for event in events:
try:
game = self._parse_espn_event(event, source_url)
if game:
# Filter international games
if game.stadium_raw and any(loc in game.stadium_raw for loc in INTERNATIONAL_LOCATIONS):
self._logger.debug(f"Skipping international game: {game.stadium_raw}")
continue
games.append(game)
except Exception as e:
self._logger.debug(f"Failed to parse ESPN event: {e}")
continue
return games
def _parse_espn_event(
self,
event: dict,
source_url: str,
) -> Optional[RawGameData]:
"""Parse a single ESPN event."""
# Get date
date_str = event.get("date", "")
if not date_str:
return None
try:
game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
except ValueError:
return None
# Get competitions
competitions = event.get("competitions", [])
if not competitions:
return None
competition = competitions[0]
# Check for neutral site (international games)
if competition.get("neutralSite"):
venue = competition.get("venue", {})
venue_city = venue.get("address", {}).get("city", "")
if venue_city in INTERNATIONAL_LOCATIONS:
return None
# Get teams
competitors = competition.get("competitors", [])
if len(competitors) != 2:
return None
home_team = None
away_team = None
home_score = None
away_score = None
for competitor in competitors:
team_info = competitor.get("team", {})
team_name = team_info.get("displayName", "")
is_home = competitor.get("homeAway") == "home"
score = competitor.get("score")
if score:
try:
score = int(score)
except (ValueError, TypeError):
score = None
if is_home:
home_team = team_name
home_score = score
else:
away_team = team_name
away_score = score
if not home_team or not away_team:
return None
# Get venue
venue = competition.get("venue", {})
stadium = venue.get("fullName")
# Get status
status_info = competition.get("status", {})
status_type = status_info.get("type", {})
status_name = status_type.get("name", "").lower()
if status_name == "status_final":
status = "final"
elif status_name == "status_postponed":
status = "postponed"
elif status_name == "status_canceled":
status = "cancelled"
else:
status = "scheduled"
return RawGameData(
game_date=game_date,
home_team_raw=home_team,
away_team_raw=away_team,
stadium_raw=stadium,
home_score=home_score,
away_score=away_score,
status=status,
source_url=source_url,
)
def _scrape_pro_football_reference(self) -> list[RawGameData]:
"""Scrape games from Pro-Football-Reference.
PFR has a single schedule page per season.
"""
url = self._get_source_url("pro_football_reference")
try:
html = self.session.get_html(url)
games = self._parse_pfr(html, url)
return games
except Exception as e:
self._logger.error(f"Failed to scrape Pro-Football-Reference: {e}")
raise
def _parse_pfr(
self,
html: str,
source_url: str,
) -> list[RawGameData]:
"""Parse Pro-Football-Reference schedule HTML."""
soup = BeautifulSoup(html, "lxml")
games: list[RawGameData] = []
# Find the schedule table
table = soup.find("table", id="games")
if not table:
return games
tbody = table.find("tbody")
if not tbody:
return games
for row in tbody.find_all("tr"):
# Skip header rows
if row.get("class") and "thead" in row.get("class", []):
continue
try:
game = self._parse_pfr_row(row, source_url)
if game:
games.append(game)
except Exception as e:
self._logger.debug(f"Failed to parse PFR row: {e}")
continue
return games
    def _parse_pfr_row(
        self,
        row,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single Pro-Football-Reference table row.

        Returns None when the row lacks a parsable date or the
        winner/loser team cells.

        NOTE(review): this assumes the "winner"/"loser" data-stat columns
        are always present. For games not yet played PFR's schedule table
        lists teams differently, and tie games have no distinct winner --
        confirm how both cases render before relying on this for future
        schedules.
        """
        # Get date
        date_cell = row.find("td", {"data-stat": "game_date"})
        if not date_cell:
            return None
        date_text = date_cell.get_text(strip=True)
        if not date_text:
            return None
        # Parse date
        try:
            # PFR uses YYYY-MM-DD format
            game_date = datetime.strptime(date_text, "%Y-%m-%d")
        except ValueError:
            return None
        # Get teams
        winner_cell = row.find("td", {"data-stat": "winner"})
        loser_cell = row.find("td", {"data-stat": "loser"})
        if not winner_cell or not loser_cell:
            return None
        winner = winner_cell.get_text(strip=True)
        loser = loser_cell.get_text(strip=True)
        if not winner or not loser:
            return None
        # Determine home/away: an "@" in game_location means the winner
        # was the visiting team.
        game_location = row.find("td", {"data-stat": "game_location"})
        at_home = game_location and "@" in game_location.get_text()
        if at_home:
            home_team = loser
            away_team = winner
        else:
            home_team = winner
            away_team = loser
        # Get scores; winner/loser points are swapped back into
        # home/away orientation using the same "@" flag.
        pts_win_cell = row.find("td", {"data-stat": "pts_win"})
        pts_lose_cell = row.find("td", {"data-stat": "pts_lose"})
        home_score = None
        away_score = None
        if pts_win_cell and pts_lose_cell:
            try:
                winner_pts = int(pts_win_cell.get_text(strip=True))
                loser_pts = int(pts_lose_cell.get_text(strip=True))
                if at_home:
                    home_score = loser_pts
                    away_score = winner_pts
                else:
                    home_score = winner_pts
                    away_score = loser_pts
            except ValueError:
                pass
        # Missing scores imply the game has not been played yet.
        status = "final" if home_score is not None else "scheduled"
        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=None,  # PFR doesn't always have stadium
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )
    def _scrape_cbs(self) -> list[RawGameData]:
        """Scrape games from CBS Sports.

        Raises:
            NotImplementedError: always; this stub lets the multi-source
                fallback move on to the next configured source.
        """
        raise NotImplementedError("CBS scraper not implemented")
def _normalize_games(
self,
raw_games: list[RawGameData],
) -> tuple[list[Game], list[ManualReviewItem]]:
"""Normalize raw games to Game objects with canonical IDs."""
games: list[Game] = []
review_items: list[ManualReviewItem] = []
for raw in raw_games:
game, item_reviews = self._normalize_single_game(raw)
if game:
games.append(game)
log_game(
self.sport,
game.id,
game.home_team_id,
game.away_team_id,
game.game_date.strftime("%Y-%m-%d"),
game.status,
)
review_items.extend(item_reviews)
return games, review_items
def _normalize_single_game(
self,
raw: RawGameData,
) -> tuple[Optional[Game], list[ManualReviewItem]]:
"""Normalize a single raw game."""
review_items: list[ManualReviewItem] = []
# Resolve home team
home_result = self._team_resolver.resolve(
raw.home_team_raw,
check_date=raw.game_date.date(),
source_url=raw.source_url,
)
if home_result.review_item:
review_items.append(home_result.review_item)
if not home_result.canonical_id:
log_warning(f"Could not resolve home team: {raw.home_team_raw}")
return None, review_items
# Resolve away team
away_result = self._team_resolver.resolve(
raw.away_team_raw,
check_date=raw.game_date.date(),
source_url=raw.source_url,
)
if away_result.review_item:
review_items.append(away_result.review_item)
if not away_result.canonical_id:
log_warning(f"Could not resolve away team: {raw.away_team_raw}")
return None, review_items
# Resolve stadium
stadium_id = None
if raw.stadium_raw:
stadium_result = self._stadium_resolver.resolve(
raw.stadium_raw,
check_date=raw.game_date.date(),
source_url=raw.source_url,
)
if stadium_result.review_item:
review_items.append(stadium_result.review_item)
stadium_id = stadium_result.canonical_id
# Get abbreviations for game ID
home_abbrev = self._get_abbreviation(home_result.canonical_id)
away_abbrev = self._get_abbreviation(away_result.canonical_id)
# Generate canonical game ID
game_id = generate_game_id(
sport=self.sport,
season=self.season,
away_abbrev=away_abbrev,
home_abbrev=home_abbrev,
game_date=raw.game_date,
game_number=None, # NFL doesn't have doubleheaders
)
game = Game(
id=game_id,
sport=self.sport,
season=self.season,
home_team_id=home_result.canonical_id,
away_team_id=away_result.canonical_id,
stadium_id=stadium_id or "",
game_date=raw.game_date,
game_number=None,
home_score=raw.home_score,
away_score=raw.away_score,
status=raw.status,
source_url=raw.source_url,
raw_home_team=raw.home_team_raw,
raw_away_team=raw.away_team_raw,
raw_stadium=raw.stadium_raw,
)
return game, review_items
def _get_abbreviation(self, team_id: str) -> str:
"""Extract abbreviation from team ID."""
parts = team_id.split("_")
return parts[-1] if parts else ""
def scrape_teams(self) -> list[Team]:
    """Get all NFL teams from hardcoded mappings."""
    # NFL conference/division structure, keyed by division name.
    division_layout = {
        "AFC East": ("AFC", ["BUF", "MIA", "NE", "NYJ"]),
        "AFC North": ("AFC", ["BAL", "CIN", "CLE", "PIT"]),
        "AFC South": ("AFC", ["HOU", "IND", "JAX", "TEN"]),
        "AFC West": ("AFC", ["DEN", "KC", "LV", "LAC"]),
        "NFC East": ("NFC", ["DAL", "NYG", "PHI", "WAS"]),
        "NFC North": ("NFC", ["CHI", "DET", "GB", "MIN"]),
        "NFC South": ("NFC", ["ATL", "CAR", "NO", "TB"]),
        "NFC West": ("NFC", ["ARI", "LAR", "SF", "SEA"]),
    }
    # Invert into abbreviation -> (conference, division).
    placement: dict[str, tuple[str, str]] = {
        ab: (conference, division)
        for division, (conference, members) in division_layout.items()
        for ab in members
    }
    roster: list[Team] = []
    emitted: set[str] = set()
    for ab, (tid, full, town, venue) in TEAM_MAPPINGS.get("nfl", {}).items():
        # The mapping may alias several keys to one team; emit each team once.
        if tid in emitted:
            continue
        emitted.add(tid)
        # Nickname is the last word of the full name (e.g. "Chiefs").
        words = full.split()
        nickname = words[-1] if words else full
        conference, division = placement.get(ab, (None, None))
        roster.append(
            Team(
                id=tid,
                sport="nfl",
                city=town,
                name=nickname,
                full_name=full,
                abbreviation=ab,
                conference=conference,
                division=division,
                stadium_id=venue,
            )
        )
    return roster
def scrape_stadiums(self) -> list[Stadium]:
    """Get all NFL stadiums from hardcoded mappings."""
    return [
        Stadium(
            id=venue_id,
            sport="nfl",
            name=details.name,
            city=details.city,
            state=details.state,
            country=details.country,
            latitude=details.latitude,
            longitude=details.longitude,
            surface="turf",  # Many NFL stadiums
            roof_type="open",  # Most outdoor
        )
        for venue_id, details in STADIUM_MAPPINGS.get("nfl", {}).items()
    ]
def create_nfl_scraper(season: int) -> NFLScraper:
    """Factory function to create an NFL scraper."""
    scraper = NFLScraper(season=season)
    return scraper

View File

@@ -0,0 +1,657 @@
"""NHL scraper implementation with multi-source fallback."""
from datetime import datetime, date
from typing import Optional
from bs4 import BeautifulSoup
from .base import BaseScraper, RawGameData, ScrapeResult
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..normalizers.canonical_id import generate_game_id
from ..normalizers.team_resolver import (
TeamResolver,
TEAM_MAPPINGS,
get_team_resolver,
)
from ..normalizers.stadium_resolver import (
StadiumResolver,
STADIUM_MAPPINGS,
get_stadium_resolver,
)
from ..utils.logging import get_logger, log_game, log_warning
# International game locations to filter out.
# These are NHL Global Series host cities; games played abroad are excluded
# because they have no canonical North American stadium mapping.
INTERNATIONAL_LOCATIONS = {"Prague", "Stockholm", "Helsinki", "Tampere", "Gothenburg"}
# Hockey Reference month URLs
# NOTE(review): not referenced by the visible scraper code — the
# Hockey-Reference path fetches a single season-wide page. Confirm this is
# dead before removing.
HR_MONTHS = [
    "october", "november", "december",
    "january", "february", "march", "april", "may", "june",
]
class NHLScraper(BaseScraper):
    """NHL schedule scraper with multi-source fallback.

    Sources (in priority order):
    1. Hockey-Reference - Most reliable for NHL
    2. NHL API - Official NHL data
    3. ESPN API - Backup option
    """

    def __init__(self, season: int, **kwargs):
        """Initialize NHL scraper.

        Args:
            season: Season start year (e.g., 2025 for 2025-26)
            **kwargs: Forwarded to BaseScraper (session/retry options).
        """
        super().__init__("nhl", season, **kwargs)
        # Resolvers translate raw source strings into canonical IDs.
        self._team_resolver = get_team_resolver("nhl")
        self._stadium_resolver = get_stadium_resolver("nhl")
def _get_sources(self) -> list[str]:
    """Return source list in priority order."""
    ordered_sources = ["hockey_reference", "nhl_api", "espn"]
    return ordered_sources
def _get_source_url(self, source: str, **kwargs) -> str:
    """Build URL for a source.

    Args:
        source: One of ``"hockey_reference"``, ``"nhl_api"``, ``"espn"``.
        **kwargs: Source-specific parameters: ``year`` (hockey_reference),
            ``start_date`` (nhl_api), ``date`` (espn).

    Returns:
        Fully-qualified URL for the requested source.

    Raises:
        ValueError: If ``source`` is not recognized.
    """
    # Removed unused locals: the old `month` (hockey_reference) and
    # `end_date` (nhl_api) kwargs were computed but never used.
    if source == "hockey_reference":
        # HR names a season by its *end* year (2025-26 -> NHL_2026).
        year = kwargs.get("year", self.season + 1)
        return f"https://www.hockey-reference.com/leagues/NHL_{year}_games.html"
    elif source == "nhl_api":
        start_date = kwargs.get("start_date", "")
        return f"https://api-web.nhle.com/v1/schedule/{start_date}"
    elif source == "espn":
        date_str = kwargs.get("date", "")
        return f"https://site.api.espn.com/apis/site/v2/sports/hockey/nhl/scoreboard?dates={date_str}"
    raise ValueError(f"Unknown source: {source}")
def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
    """Scrape games from a specific source."""
    # Dispatch table keeps source -> handler mapping in one place.
    handlers = {
        "hockey_reference": self._scrape_hockey_reference,
        "nhl_api": self._scrape_nhl_api,
        "espn": self._scrape_espn,
    }
    handler = handlers.get(source)
    if handler is None:
        raise ValueError(f"Unknown source: {source}")
    return handler()
def _scrape_hockey_reference(self) -> list[RawGameData]:
    """Scrape games from Hockey-Reference.

    HR has a single schedule page per season.
    """
    # HR keys the season page on its end year.
    url = self._get_source_url("hockey_reference", year=self.season + 1)
    try:
        page = self.session.get_html(url)
        return self._parse_hockey_reference(page, url)
    except Exception as e:
        self._logger.error(f"Failed to scrape Hockey-Reference: {e}")
        raise
def _parse_hockey_reference(
    self,
    html: str,
    source_url: str,
) -> list[RawGameData]:
    """Parse Hockey-Reference schedule HTML."""
    document = BeautifulSoup(html, "lxml")
    parsed: list[RawGameData] = []
    # The season schedule lives in a single table with id="games".
    schedule = document.find("table", id="games")
    body = schedule.find("tbody") if schedule else None
    if body is None:
        return parsed
    for row in body.find_all("tr"):
        # Repeated header rows inside tbody carry the "thead" class.
        if "thead" in (row.get("class") or []):
            continue
        try:
            candidate = self._parse_hr_row(row, source_url)
        except Exception as e:
            self._logger.debug(f"Failed to parse HR row: {e}")
            continue
        if candidate is None:
            continue
        # Drop neutral-site international games (Global Series).
        venue = candidate.stadium_raw
        if venue and any(city in venue for city in INTERNATIONAL_LOCATIONS):
            continue
        parsed.append(candidate)
    return parsed
def _parse_hr_row(
    self,
    row,
    source_url: str,
) -> Optional[RawGameData]:
    """Parse a single Hockey-Reference table row.

    Args:
        row: BeautifulSoup ``<tr>`` element from the schedule table.
        source_url: URL the row was scraped from (recorded on the result).

    Returns:
        Parsed game data, or ``None`` when the row lacks a date or teams.
    """
    # Get date
    date_cell = row.find("th", {"data-stat": "date_game"})
    if not date_cell:
        return None
    date_text = date_cell.get_text(strip=True)
    if not date_text:
        return None
    # Parse date (format: "2025-10-15")
    try:
        game_date = datetime.strptime(date_text, "%Y-%m-%d")
    except ValueError:
        return None
    # Get teams
    visitor_cell = row.find("td", {"data-stat": "visitor_team_name"})
    home_cell = row.find("td", {"data-stat": "home_team_name"})
    if not visitor_cell or not home_cell:
        return None
    away_team = visitor_cell.get_text(strip=True)
    home_team = home_cell.get_text(strip=True)
    if not away_team or not home_team:
        return None
    # Get scores; blank cells mean the game has not been played yet.
    visitor_goals_cell = row.find("td", {"data-stat": "visitor_goals"})
    home_goals_cell = row.find("td", {"data-stat": "home_goals"})
    away_score = None
    home_score = None
    if visitor_goals_cell and visitor_goals_cell.get_text(strip=True):
        try:
            away_score = int(visitor_goals_cell.get_text(strip=True))
        except ValueError:
            pass
    if home_goals_cell and home_goals_cell.get_text(strip=True):
        try:
            home_score = int(home_goals_cell.get_text(strip=True))
        except ValueError:
            pass
    # Determine status: a recorded home score means the game is complete.
    status = "final" if home_score is not None else "scheduled"
    # Check for OT/SO
    # NOTE(review): a populated OT column forces "final" even when no score
    # was parsed — confirm that is intended and not an accidental override.
    overtimes_cell = row.find("td", {"data-stat": "overtimes"})
    if overtimes_cell:
        ot_text = overtimes_cell.get_text(strip=True)
        if ot_text:
            status = "final"  # OT games are still final
    return RawGameData(
        game_date=game_date,
        home_team_raw=home_team,
        away_team_raw=away_team,
        stadium_raw=None,  # HR doesn't have stadium
        home_score=home_score,
        away_score=away_score,
        status=status,
        source_url=source_url,
    )
def _scrape_nhl_api(self) -> list[RawGameData]:
    """Scrape games from NHL API, one schedule request per season month."""
    collected: list[RawGameData] = []
    for year, month in self._get_season_months():
        month_start = date(year, month, 1)
        url = self._get_source_url(
            "nhl_api", start_date=month_start.strftime("%Y-%m-%d")
        )
        try:
            payload = self.session.get_json(url)
            collected.extend(self._parse_nhl_api_response(payload, url))
        except Exception as e:
            # Individual month failures are tolerated; continue the sweep.
            self._logger.debug(f"NHL API error for {year}-{month}: {e}")
            continue
    return collected
def _parse_nhl_api_response(
    self,
    data: dict,
    source_url: str,
) -> list[RawGameData]:
    """Parse NHL API response (a list of game weeks, each holding games)."""
    parsed: list[RawGameData] = []
    for week in data.get("gameWeek", []):
        for entry in week.get("games", []):
            try:
                candidate = self._parse_nhl_api_game(entry, source_url)
            except Exception as e:
                self._logger.debug(f"Failed to parse NHL API game: {e}")
                continue
            if candidate is not None:
                parsed.append(candidate)
    return parsed
def _parse_nhl_api_game(
    self,
    game: dict,
    source_url: str,
) -> Optional[RawGameData]:
    """Parse a single NHL API game.

    Args:
        game: One game object from the api-web.nhle.com schedule feed.
        source_url: URL the game was fetched from (recorded on the result).

    Returns:
        Parsed game data, or ``None`` if the date or teams are missing.
    """
    # Get date (ISO 8601 with a Z suffix, converted to an aware datetime)
    start_time = game.get("startTimeUTC", "")
    if not start_time:
        return None
    try:
        game_date = datetime.fromisoformat(start_time.replace("Z", "+00:00"))
    except ValueError:
        return None
    # Get teams — prefer placeName, fall back to the full name field.
    away_team_data = game.get("awayTeam", {})
    home_team_data = game.get("homeTeam", {})
    away_team = away_team_data.get("placeName", {}).get("default", "")
    home_team = home_team_data.get("placeName", {}).get("default", "")
    if not away_team or not home_team:
        # Try full name
        away_team = away_team_data.get("name", {}).get("default", "")
        home_team = home_team_data.get("name", {}).get("default", "")
    if not away_team or not home_team:
        return None
    # Get scores (absent for unplayed games)
    away_score = away_team_data.get("score")
    home_score = home_team_data.get("score")
    # Get venue
    venue = game.get("venue", {})
    stadium = venue.get("default")
    # Map the feed's gameState onto our status vocabulary.
    game_state = game.get("gameState", "").lower()
    if game_state in ["final", "off"]:
        status = "final"
    elif game_state == "postponed":
        status = "postponed"
    elif game_state in ["cancelled", "canceled"]:
        status = "cancelled"
    else:
        status = "scheduled"
    return RawGameData(
        game_date=game_date,
        home_team_raw=home_team,
        away_team_raw=away_team,
        stadium_raw=stadium,
        home_score=home_score,
        away_score=away_score,
        status=status,
        source_url=source_url,
    )
def _scrape_espn(self) -> list[RawGameData]:
    """Scrape games from ESPN API.

    Queries the ESPN scoreboard endpoint one calendar day at a time for
    every month of the season.

    Returns:
        All games parsed from ESPN; days that fail are skipped with a
        debug log rather than aborting the whole scrape.
    """
    from calendar import monthrange  # stdlib; local import keeps module imports unchanged

    all_games: list[RawGameData] = []
    for year, month in self._get_season_months():
        # monthrange handles month lengths and leap years, replacing the
        # hand-rolled next-month subtraction.
        days_in_month = monthrange(year, month)[1]
        for day in range(1, days_in_month + 1):
            try:
                game_date = date(year, month, day)
                date_str = game_date.strftime("%Y%m%d")
                url = self._get_source_url("espn", date=date_str)
                data = self.session.get_json(url)
                games = self._parse_espn_response(data, url)
                all_games.extend(games)
            except Exception as e:
                self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
                continue
    return all_games
def _parse_espn_response(
    self,
    data: dict,
    source_url: str,
) -> list[RawGameData]:
    """Parse ESPN API response (scoreboard "events" list)."""
    parsed: list[RawGameData] = []
    for event in data.get("events", []):
        try:
            candidate = self._parse_espn_event(event, source_url)
        except Exception as e:
            self._logger.debug(f"Failed to parse ESPN event: {e}")
            continue
        if candidate is not None:
            parsed.append(candidate)
    return parsed
def _parse_espn_event(
    self,
    event: dict,
    source_url: str,
) -> Optional[RawGameData]:
    """Parse a single ESPN event.

    Args:
        event: One entry from the scoreboard "events" array.
        source_url: URL the event came from (recorded on the result).

    Returns:
        Parsed game data, or ``None`` for malformed events and for
        neutral-site international (Global Series) games.
    """
    # Get date
    date_str = event.get("date", "")
    if not date_str:
        return None
    try:
        game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    except ValueError:
        return None
    # Get competitions
    competitions = event.get("competitions", [])
    if not competitions:
        return None
    competition = competitions[0]
    # Check for neutral site (international games like Global Series)
    if competition.get("neutralSite"):
        venue = competition.get("venue", {})
        venue_city = venue.get("address", {}).get("city", "")
        if venue_city in INTERNATIONAL_LOCATIONS:
            return None
    # Get teams
    competitors = competition.get("competitors", [])
    if len(competitors) != 2:
        return None
    home_team = None
    away_team = None
    home_score = None
    away_score = None
    for competitor in competitors:
        team_info = competitor.get("team", {})
        team_name = team_info.get("displayName", "")
        is_home = competitor.get("homeAway") == "home"
        # Fix: the previous truthiness check (`if score:`) dropped a
        # legitimate 0 score and stored "" verbatim. Use an explicit
        # None check and normalize anything unparseable to None.
        score = competitor.get("score")
        if score is not None:
            try:
                score = int(score)
            except (ValueError, TypeError):
                score = None
        if is_home:
            home_team = team_name
            home_score = score
        else:
            away_team = team_name
            away_score = score
    if not home_team or not away_team:
        return None
    # Get venue
    venue = competition.get("venue", {})
    stadium = venue.get("fullName")
    # Get status
    status_info = competition.get("status", {})
    status_type = status_info.get("type", {})
    status_name = status_type.get("name", "").lower()
    if status_name == "status_final":
        status = "final"
    elif status_name == "status_postponed":
        status = "postponed"
    elif status_name == "status_canceled":
        status = "cancelled"
    else:
        status = "scheduled"
    return RawGameData(
        game_date=game_date,
        home_team_raw=home_team,
        away_team_raw=away_team,
        stadium_raw=stadium,
        home_score=home_score,
        away_score=away_score,
        status=status,
        source_url=source_url,
    )
def _normalize_games(
    self,
    raw_games: list[RawGameData],
) -> tuple[list[Game], list[ManualReviewItem]]:
    """Normalize raw games to Game objects with canonical IDs."""
    normalized: list[Game] = []
    pending_review: list[ManualReviewItem] = []
    for entry in raw_games:
        game, flagged = self._normalize_single_game(entry)
        pending_review.extend(flagged)
        if game is None:
            continue
        normalized.append(game)
        log_game(
            self.sport,
            game.id,
            game.home_team_id,
            game.away_team_id,
            game.game_date.strftime("%Y-%m-%d"),
            game.status,
        )
    return normalized, pending_review
def _normalize_single_game(
    self,
    raw: RawGameData,
) -> tuple[Optional[Game], list[ManualReviewItem]]:
    """Normalize a single raw game.

    Resolves team and stadium strings to canonical IDs and builds a Game.
    Sources without venue data (Hockey-Reference) fall back to the home
    team's default stadium.

    Args:
        raw: Scraped game data with source-specific strings.

    Returns:
        Tuple of (game, review_items); ``game`` is ``None`` when either
        team cannot be resolved.
    """
    review_items: list[ManualReviewItem] = []
    # Resolve home team
    home_result = self._team_resolver.resolve(
        raw.home_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )
    if home_result.review_item:
        review_items.append(home_result.review_item)
    if not home_result.canonical_id:
        log_warning(f"Could not resolve home team: {raw.home_team_raw}")
        return None, review_items
    # Resolve away team
    away_result = self._team_resolver.resolve(
        raw.away_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )
    if away_result.review_item:
        review_items.append(away_result.review_item)
    if not away_result.canonical_id:
        log_warning(f"Could not resolve away team: {raw.away_team_raw}")
        return None, review_items
    # Resolve stadium
    stadium_id = None
    if raw.stadium_raw:
        stadium_result = self._stadium_resolver.resolve(
            raw.stadium_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )
        if stadium_result.review_item:
            review_items.append(stadium_result.review_item)
        stadium_id = stadium_result.canonical_id
    # Fallback: Use home team's default stadium if no venue provided
    # This is common for Hockey-Reference which doesn't have venue data
    # (removed a dead `home_abbrev` computation that was never used here)
    if not stadium_id:
        for team_id, _, _, default_stadium in TEAM_MAPPINGS.get("nhl", {}).values():
            if team_id == home_result.canonical_id:
                stadium_id = default_stadium
                break
    # Get abbreviations for game ID
    home_abbrev = self._get_abbreviation(home_result.canonical_id)
    away_abbrev = self._get_abbreviation(away_result.canonical_id)
    # Generate canonical game ID
    game_id = generate_game_id(
        sport=self.sport,
        season=self.season,
        away_abbrev=away_abbrev,
        home_abbrev=home_abbrev,
        game_date=raw.game_date,
        game_number=None,  # NHL doesn't have doubleheaders
    )
    game = Game(
        id=game_id,
        sport=self.sport,
        season=self.season,
        home_team_id=home_result.canonical_id,
        away_team_id=away_result.canonical_id,
        stadium_id=stadium_id or "",
        game_date=raw.game_date,
        game_number=None,
        home_score=raw.home_score,
        away_score=raw.away_score,
        status=raw.status,
        source_url=raw.source_url,
        raw_home_team=raw.home_team_raw,
        raw_away_team=raw.away_team_raw,
        raw_stadium=raw.stadium_raw,
    )
    return game, review_items
def _get_abbreviation(self, team_id: str) -> str:
    """Return the team abbreviation: the last ``_``-separated segment of the ID."""
    return team_id.rsplit("_", 1)[-1]
def scrape_teams(self) -> list[Team]:
    """Get all NHL teams from hardcoded mappings."""
    # NHL conference/division structure.
    # NOTE(review): verify this roster is current against TEAM_MAPPINGS.
    division_layout = {
        "Atlantic": ("Eastern", ["BOS", "BUF", "DET", "FLA", "MTL", "OTT", "TB", "TOR"]),
        "Metropolitan": ("Eastern", ["CAR", "CBJ", "NJ", "NYI", "NYR", "PHI", "PIT", "WAS"]),
        "Central": ("Western", ["ARI", "CHI", "COL", "DAL", "MIN", "NSH", "STL", "WPG"]),
        "Pacific": ("Western", ["ANA", "CGY", "EDM", "LA", "SJ", "SEA", "VAN", "VGK"]),
    }
    # Invert into abbreviation -> (conference, division).
    placement: dict[str, tuple[str, str]] = {
        ab: (conference, division)
        for division, (conference, members) in division_layout.items()
        for ab in members
    }
    roster: list[Team] = []
    emitted: set[str] = set()
    for ab, (tid, full, town, venue) in TEAM_MAPPINGS.get("nhl", {}).items():
        if tid in emitted:
            continue
        emitted.add(tid)
        # Nickname is the last word, widened to two words for the
        # two-word nicknames (Red Wings, Blue Jackets, Golden Knights,
        # Maple Leafs).
        words = full.split()
        nickname = words[-1] if words else full
        if nickname in ("Wings", "Jackets", "Knights", "Leafs"):
            nickname = " ".join(words[-2:])
        conference, division = placement.get(ab, (None, None))
        roster.append(
            Team(
                id=tid,
                sport="nhl",
                city=town,
                name=nickname,
                full_name=full,
                abbreviation=ab,
                conference=conference,
                division=division,
                stadium_id=venue,
            )
        )
    return roster
def scrape_stadiums(self) -> list[Stadium]:
    """Get all NHL stadiums from hardcoded mappings."""
    return [
        Stadium(
            id=venue_id,
            sport="nhl",
            name=details.name,
            city=details.city,
            state=details.state,
            country=details.country,
            latitude=details.latitude,
            longitude=details.longitude,
            surface="ice",
            roof_type="dome",
        )
        for venue_id, details in STADIUM_MAPPINGS.get("nhl", {}).items()
    ]
def create_nhl_scraper(season: int) -> NHLScraper:
    """Factory function to create an NHL scraper."""
    scraper = NHLScraper(season=season)
    return scraper

View File

@@ -0,0 +1,374 @@
"""NWSL scraper implementation with multi-source fallback."""
from datetime import datetime, date, timedelta
from typing import Optional
from .base import BaseScraper, RawGameData, ScrapeResult
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..normalizers.canonical_id import generate_game_id
from ..normalizers.team_resolver import (
TeamResolver,
TEAM_MAPPINGS,
get_team_resolver,
)
from ..normalizers.stadium_resolver import (
StadiumResolver,
STADIUM_MAPPINGS,
get_stadium_resolver,
)
from ..utils.logging import get_logger, log_game, log_warning
class NWSLScraper(BaseScraper):
    """NWSL schedule scraper with multi-source fallback.

    Sources (in priority order):
    1. ESPN API - Most reliable for NWSL
    2. NWSL official (via ESPN) - Backup option
    """

    def __init__(self, season: int, **kwargs):
        """Initialize NWSL scraper.

        Args:
            season: Season year (e.g., 2026 for 2026 season)
            **kwargs: Forwarded to BaseScraper (session/retry options).
        """
        super().__init__("nwsl", season, **kwargs)
        # Resolvers translate raw source strings into canonical IDs.
        self._team_resolver = get_team_resolver("nwsl")
        self._stadium_resolver = get_stadium_resolver("nwsl")
def _get_sources(self) -> list[str]:
    """Return source list in priority order (ESPN is the only source)."""
    ordered_sources = ["espn"]
    return ordered_sources
def _get_source_url(self, source: str, **kwargs) -> str:
    """Build URL for a source."""
    if source != "espn":
        raise ValueError(f"Unknown source: {source}")
    date_str = kwargs.get("date", "")
    return f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.nwsl/scoreboard?dates={date_str}"
def _get_season_months(self) -> list[tuple[int, int]]:
    """Get the months to scrape for NWSL season.

    NWSL season (regular season + playoffs) runs March through November.
    """
    return [(self.season, month) for month in range(3, 12)]
def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
    """Scrape games from a specific source."""
    if source != "espn":
        raise ValueError(f"Unknown source: {source}")
    return self._scrape_espn()
def _scrape_espn(self) -> list[RawGameData]:
    """Scrape games from ESPN API using date range query."""
    # Build one date window covering the whole season (March-November).
    months = self._get_season_months()
    first_year, first_month = months[0]
    last_year, last_month = months[-1]
    window_start = date(first_year, first_month, 1)
    # Last day of the final month: first day of the next month minus one.
    if last_month == 12:
        window_end = date(last_year + 1, 1, 1) - timedelta(days=1)
    else:
        window_end = date(last_year, last_month + 1, 1) - timedelta(days=1)
    date_range = f"{window_start.strftime('%Y%m%d')}-{window_end.strftime('%Y%m%d')}"
    url = f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.nwsl/scoreboard?limit=1000&dates={date_range}"
    self._logger.info(f"Fetching NWSL schedule: {date_range}")
    try:
        payload = self.session.get_json(url)
        return self._parse_espn_response(payload, url)
    except Exception as e:
        self._logger.error(f"ESPN error: {e}")
        return []
def _parse_espn_response(
    self,
    data: dict,
    source_url: str,
) -> list[RawGameData]:
    """Parse ESPN API response (scoreboard "events" list)."""
    parsed: list[RawGameData] = []
    for event in data.get("events", []):
        try:
            candidate = self._parse_espn_event(event, source_url)
        except Exception as e:
            self._logger.debug(f"Failed to parse ESPN event: {e}")
            continue
        if candidate is not None:
            parsed.append(candidate)
    return parsed
def _parse_espn_event(
    self,
    event: dict,
    source_url: str,
) -> Optional[RawGameData]:
    """Parse a single ESPN event.

    Args:
        event: One entry from the scoreboard "events" array.
        source_url: URL the event came from (recorded on the result).

    Returns:
        Parsed game data, or ``None`` for malformed events.
    """
    # Get date
    date_str = event.get("date", "")
    if not date_str:
        return None
    try:
        game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    except ValueError:
        return None
    # Get competitions
    competitions = event.get("competitions", [])
    if not competitions:
        return None
    competition = competitions[0]
    # Get teams
    competitors = competition.get("competitors", [])
    if len(competitors) != 2:
        return None
    home_team = None
    away_team = None
    home_score = None
    away_score = None
    for competitor in competitors:
        team_info = competitor.get("team", {})
        team_name = team_info.get("displayName", "")
        is_home = competitor.get("homeAway") == "home"
        # Fix: the previous truthiness check (`if score:`) dropped a
        # legitimate 0 score (common in soccer) and stored "" verbatim.
        # Use an explicit None check and normalize unparseable values.
        score = competitor.get("score")
        if score is not None:
            try:
                score = int(score)
            except (ValueError, TypeError):
                score = None
        if is_home:
            home_team = team_name
            home_score = score
        else:
            away_team = team_name
            away_score = score
    if not home_team or not away_team:
        return None
    # Get venue
    venue = competition.get("venue", {})
    stadium = venue.get("fullName")
    # Get status
    status_info = competition.get("status", {})
    status_type = status_info.get("type", {})
    status_name = status_type.get("name", "").lower()
    if status_name == "status_final":
        status = "final"
    elif status_name == "status_postponed":
        status = "postponed"
    elif status_name == "status_canceled":
        status = "cancelled"
    else:
        status = "scheduled"
    return RawGameData(
        game_date=game_date,
        home_team_raw=home_team,
        away_team_raw=away_team,
        stadium_raw=stadium,
        home_score=home_score,
        away_score=away_score,
        status=status,
        source_url=source_url,
    )
def _normalize_games(
    self,
    raw_games: list[RawGameData],
) -> tuple[list[Game], list[ManualReviewItem]]:
    """Normalize raw games to Game objects with canonical IDs."""
    normalized: list[Game] = []
    pending_review: list[ManualReviewItem] = []
    for entry in raw_games:
        game, flagged = self._normalize_single_game(entry)
        pending_review.extend(flagged)
        if game is None:
            continue
        normalized.append(game)
        log_game(
            self.sport,
            game.id,
            game.home_team_id,
            game.away_team_id,
            game.game_date.strftime("%Y-%m-%d"),
            game.status,
        )
    return normalized, pending_review
def _normalize_single_game(
    self,
    raw: RawGameData,
) -> tuple[Optional[Game], list[ManualReviewItem]]:
    """Normalize a single raw game.

    Resolves team and stadium strings to canonical IDs and builds a Game
    with a deterministic canonical game ID.

    Args:
        raw: Scraped game data carrying source-specific strings.

    Returns:
        Tuple of (game, review_items). ``game`` is ``None`` when either
        team cannot be resolved; fuzzy matches flagged for manual review
        are returned even on failure.
    """
    review_items: list[ManualReviewItem] = []
    # Resolve home team; check_date lets the resolver honor renames.
    home_result = self._team_resolver.resolve(
        raw.home_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )
    if home_result.review_item:
        review_items.append(home_result.review_item)
    if not home_result.canonical_id:
        log_warning(f"Could not resolve home team: {raw.home_team_raw}")
        return None, review_items
    # Resolve away team
    away_result = self._team_resolver.resolve(
        raw.away_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )
    if away_result.review_item:
        review_items.append(away_result.review_item)
    if not away_result.canonical_id:
        log_warning(f"Could not resolve away team: {raw.away_team_raw}")
        return None, review_items
    # Resolve stadium (optional — a missing venue does not reject the game)
    stadium_id = None
    if raw.stadium_raw:
        stadium_result = self._stadium_resolver.resolve(
            raw.stadium_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )
        if stadium_result.review_item:
            review_items.append(stadium_result.review_item)
        stadium_id = stadium_result.canonical_id
    # Get abbreviations for game ID
    home_abbrev = self._get_abbreviation(home_result.canonical_id)
    away_abbrev = self._get_abbreviation(away_result.canonical_id)
    # Generate canonical game ID
    game_id = generate_game_id(
        sport=self.sport,
        season=self.season,
        away_abbrev=away_abbrev,
        home_abbrev=home_abbrev,
        game_date=raw.game_date,
        game_number=None,
    )
    game = Game(
        id=game_id,
        sport=self.sport,
        season=self.season,
        home_team_id=home_result.canonical_id,
        away_team_id=away_result.canonical_id,
        stadium_id=stadium_id or "",
        game_date=raw.game_date,
        game_number=None,
        home_score=raw.home_score,
        away_score=raw.away_score,
        status=raw.status,
        source_url=raw.source_url,
        # Raw strings are preserved for auditing resolution issues.
        raw_home_team=raw.home_team_raw,
        raw_away_team=raw.away_team_raw,
        raw_stadium=raw.stadium_raw,
    )
    return game, review_items
def _get_abbreviation(self, team_id: str) -> str:
    """Return the team abbreviation: the last ``_``-separated segment of the ID."""
    return team_id.rsplit("_", 1)[-1]
def scrape_teams(self) -> list[Team]:
    """Get all NWSL teams from hardcoded mappings."""
    roster: list[Team] = []
    emitted: set[str] = set()
    for ab, (tid, full, town, venue) in TEAM_MAPPINGS.get("nwsl", {}).items():
        # The mapping may alias several keys to one team; emit each once.
        if tid in emitted:
            continue
        emitted.add(tid)
        roster.append(
            Team(
                id=tid,
                sport="nwsl",
                city=town,
                name=full,  # NWSL club names are used whole
                full_name=full,
                abbreviation=ab,
                conference=None,  # NWSL uses single table
                division=None,
                stadium_id=venue,
            )
        )
    return roster
def scrape_stadiums(self) -> list[Stadium]:
    """Get all NWSL stadiums from hardcoded mappings."""
    return [
        Stadium(
            id=venue_id,
            sport="nwsl",
            name=details.name,
            city=details.city,
            state=details.state,
            country=details.country,
            latitude=details.latitude,
            longitude=details.longitude,
            surface="grass",
            roof_type="open",
        )
        for venue_id, details in STADIUM_MAPPINGS.get("nwsl", {}).items()
    ]
def create_nwsl_scraper(season: int) -> NWSLScraper:
    """Factory function to create an NWSL scraper."""
    scraper = NWSLScraper(season=season)
    return scraper

View File

@@ -0,0 +1,375 @@
"""WNBA scraper implementation with multi-source fallback."""
from datetime import datetime, date, timedelta
from typing import Optional
from .base import BaseScraper, RawGameData, ScrapeResult
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..normalizers.canonical_id import generate_game_id
from ..normalizers.team_resolver import (
TeamResolver,
TEAM_MAPPINGS,
get_team_resolver,
)
from ..normalizers.stadium_resolver import (
StadiumResolver,
STADIUM_MAPPINGS,
get_stadium_resolver,
)
from ..utils.logging import get_logger, log_game, log_warning
class WNBAScraper(BaseScraper):
    """WNBA schedule scraper with multi-source fallback.

    Sources (in priority order):
    1. ESPN API - Most reliable for WNBA
    2. WNBA official (via ESPN) - Backup option
    """

    def __init__(self, season: int, **kwargs):
        """Initialize WNBA scraper.

        Args:
            season: Season year (e.g., 2026 for 2026 season)
            **kwargs: Forwarded to BaseScraper (session/retry options).
        """
        super().__init__("wnba", season, **kwargs)
        # Resolvers translate raw source strings into canonical IDs.
        self._team_resolver = get_team_resolver("wnba")
        self._stadium_resolver = get_stadium_resolver("wnba")
def _get_sources(self) -> list[str]:
    """Return source list in priority order (ESPN is the only source)."""
    ordered_sources = ["espn"]
    return ordered_sources
def _get_source_url(self, source: str, **kwargs) -> str:
    """Build URL for a source."""
    if source != "espn":
        raise ValueError(f"Unknown source: {source}")
    date_str = kwargs.get("date", "")
    return f"https://site.api.espn.com/apis/site/v2/sports/basketball/wnba/scoreboard?dates={date_str}"
def _get_season_months(self) -> list[tuple[int, int]]:
    """Get the months to scrape for WNBA season.

    WNBA season (regular season + playoffs) runs May through
    September/October.
    """
    return [(self.season, month) for month in range(5, 11)]
def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
    """Scrape games from a specific source."""
    if source != "espn":
        raise ValueError(f"Unknown source: {source}")
    return self._scrape_espn()
def _scrape_espn(self) -> list[RawGameData]:
    """Scrape games from ESPN API using date range query."""
    # Build one date window covering the whole season (May-October).
    months = self._get_season_months()
    first_year, first_month = months[0]
    last_year, last_month = months[-1]
    window_start = date(first_year, first_month, 1)
    # Last day of the final month: first day of the next month minus one.
    if last_month == 12:
        window_end = date(last_year + 1, 1, 1) - timedelta(days=1)
    else:
        window_end = date(last_year, last_month + 1, 1) - timedelta(days=1)
    date_range = f"{window_start.strftime('%Y%m%d')}-{window_end.strftime('%Y%m%d')}"
    url = f"https://site.api.espn.com/apis/site/v2/sports/basketball/wnba/scoreboard?limit=1000&dates={date_range}"
    self._logger.info(f"Fetching WNBA schedule: {date_range}")
    try:
        payload = self.session.get_json(url)
        return self._parse_espn_response(payload, url)
    except Exception as e:
        self._logger.error(f"ESPN error: {e}")
        return []
def _parse_espn_response(
    self,
    data: dict,
    source_url: str,
) -> list[RawGameData]:
    """Parse ESPN API response (scoreboard "events" list)."""
    parsed: list[RawGameData] = []
    for event in data.get("events", []):
        try:
            candidate = self._parse_espn_event(event, source_url)
        except Exception as e:
            self._logger.debug(f"Failed to parse ESPN event: {e}")
            continue
        if candidate is not None:
            parsed.append(candidate)
    return parsed
def _parse_espn_event(
    self,
    event: dict,
    source_url: str,
) -> Optional[RawGameData]:
    """Parse a single ESPN event.

    Args:
        event: One entry from the scoreboard "events" array.
        source_url: URL the event came from (recorded on the result).

    Returns:
        Parsed game data, or ``None`` for malformed events.
    """
    # Get date
    date_str = event.get("date", "")
    if not date_str:
        return None
    try:
        game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    except ValueError:
        return None
    # Get competitions
    competitions = event.get("competitions", [])
    if not competitions:
        return None
    competition = competitions[0]
    # Get teams
    competitors = competition.get("competitors", [])
    if len(competitors) != 2:
        return None
    home_team = None
    away_team = None
    home_score = None
    away_score = None
    for competitor in competitors:
        team_info = competitor.get("team", {})
        team_name = team_info.get("displayName", "")
        is_home = competitor.get("homeAway") == "home"
        # Fix: the previous truthiness check (`if score:`) dropped a
        # legitimate 0 score and stored "" verbatim. Use an explicit
        # None check and normalize unparseable values to None.
        score = competitor.get("score")
        if score is not None:
            try:
                score = int(score)
            except (ValueError, TypeError):
                score = None
        if is_home:
            home_team = team_name
            home_score = score
        else:
            away_team = team_name
            away_score = score
    if not home_team or not away_team:
        return None
    # Get venue
    venue = competition.get("venue", {})
    stadium = venue.get("fullName")
    # Get status
    status_info = competition.get("status", {})
    status_type = status_info.get("type", {})
    status_name = status_type.get("name", "").lower()
    if status_name == "status_final":
        status = "final"
    elif status_name == "status_postponed":
        status = "postponed"
    elif status_name == "status_canceled":
        status = "cancelled"
    else:
        status = "scheduled"
    return RawGameData(
        game_date=game_date,
        home_team_raw=home_team,
        away_team_raw=away_team,
        stadium_raw=stadium,
        home_score=home_score,
        away_score=away_score,
        status=status,
        source_url=source_url,
    )
def _normalize_games(
    self,
    raw_games: list[RawGameData],
) -> tuple[list[Game], list[ManualReviewItem]]:
    """Normalize raw games to Game objects with canonical IDs."""
    normalized: list[Game] = []
    pending_review: list[ManualReviewItem] = []
    for entry in raw_games:
        game, flagged = self._normalize_single_game(entry)
        pending_review.extend(flagged)
        if game is None:
            continue
        normalized.append(game)
        log_game(
            self.sport,
            game.id,
            game.home_team_id,
            game.away_team_id,
            game.game_date.strftime("%Y-%m-%d"),
            game.status,
        )
    return normalized, pending_review
def _normalize_single_game(
self,
raw: RawGameData,
) -> tuple[Optional[Game], list[ManualReviewItem]]:
"""Normalize a single raw game."""
review_items: list[ManualReviewItem] = []
# Resolve home team
home_result = self._team_resolver.resolve(
raw.home_team_raw,
check_date=raw.game_date.date(),
source_url=raw.source_url,
)
if home_result.review_item:
review_items.append(home_result.review_item)
if not home_result.canonical_id:
log_warning(f"Could not resolve home team: {raw.home_team_raw}")
return None, review_items
# Resolve away team
away_result = self._team_resolver.resolve(
raw.away_team_raw,
check_date=raw.game_date.date(),
source_url=raw.source_url,
)
if away_result.review_item:
review_items.append(away_result.review_item)
if not away_result.canonical_id:
log_warning(f"Could not resolve away team: {raw.away_team_raw}")
return None, review_items
# Resolve stadium
stadium_id = None
if raw.stadium_raw:
stadium_result = self._stadium_resolver.resolve(
raw.stadium_raw,
check_date=raw.game_date.date(),
source_url=raw.source_url,
)
if stadium_result.review_item:
review_items.append(stadium_result.review_item)
stadium_id = stadium_result.canonical_id
# Get abbreviations for game ID
home_abbrev = self._get_abbreviation(home_result.canonical_id)
away_abbrev = self._get_abbreviation(away_result.canonical_id)
# Generate canonical game ID
game_id = generate_game_id(
sport=self.sport,
season=self.season,
away_abbrev=away_abbrev,
home_abbrev=home_abbrev,
game_date=raw.game_date,
game_number=None,
)
game = Game(
id=game_id,
sport=self.sport,
season=self.season,
home_team_id=home_result.canonical_id,
away_team_id=away_result.canonical_id,
stadium_id=stadium_id or "",
game_date=raw.game_date,
game_number=None,
home_score=raw.home_score,
away_score=raw.away_score,
status=raw.status,
source_url=raw.source_url,
raw_home_team=raw.home_team_raw,
raw_away_team=raw.away_team_raw,
raw_stadium=raw.stadium_raw,
)
return game, review_items
def _get_abbreviation(self, team_id: str) -> str:
"""Extract abbreviation from team ID."""
parts = team_id.split("_")
return parts[-1] if parts else ""
def scrape_teams(self) -> list[Team]:
"""Get all WNBA teams from hardcoded mappings."""
teams: list[Team] = []
seen: set[str] = set()
for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("wnba", {}).items():
if team_id in seen:
continue
seen.add(team_id)
# Parse team name
parts = full_name.split()
team_name = parts[-1] if parts else full_name
team = Team(
id=team_id,
sport="wnba",
city=city,
name=team_name,
full_name=full_name,
abbreviation=abbrev,
conference=None, # WNBA uses single table now
division=None,
stadium_id=stadium_id,
)
teams.append(team)
return teams
def scrape_stadiums(self) -> list[Stadium]:
"""Get all WNBA stadiums from hardcoded mappings."""
stadiums: list[Stadium] = []
wnba_stadiums = STADIUM_MAPPINGS.get("wnba", {})
for stadium_id, info in wnba_stadiums.items():
stadium = Stadium(
id=stadium_id,
sport="wnba",
name=info.name,
city=info.city,
state=info.state,
country=info.country,
latitude=info.latitude,
longitude=info.longitude,
surface="hardwood",
roof_type="dome",
)
stadiums.append(stadium)
return stadiums
def create_wnba_scraper(season: int) -> WNBAScraper:
    """Build a WNBAScraper configured for the given season year."""
    scraper = WNBAScraper(season=season)
    return scraper