feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
46
sportstime_parser/scrapers/__init__.py
Normal file
46
sportstime_parser/scrapers/__init__.py
Normal file
@@ -0,0 +1,46 @@
|
||||
"""Scrapers for fetching sports data from various sources."""
|
||||
|
||||
from .base import (
|
||||
BaseScraper,
|
||||
RawGameData,
|
||||
ScrapeResult,
|
||||
ScraperError,
|
||||
PartialDataError,
|
||||
)
|
||||
from .nba import NBAScraper, create_nba_scraper
|
||||
from .mlb import MLBScraper, create_mlb_scraper
|
||||
from .nfl import NFLScraper, create_nfl_scraper
|
||||
from .nhl import NHLScraper, create_nhl_scraper
|
||||
from .mls import MLSScraper, create_mls_scraper
|
||||
from .wnba import WNBAScraper, create_wnba_scraper
|
||||
from .nwsl import NWSLScraper, create_nwsl_scraper
|
||||
|
||||
# Public API of the scrapers package: the shared base types plus one
# scraper class and factory function per supported league.
__all__ = [
    # Base
    "BaseScraper",
    "RawGameData",
    "ScrapeResult",
    "ScraperError",
    "PartialDataError",
    # NBA
    "NBAScraper",
    "create_nba_scraper",
    # MLB
    "MLBScraper",
    "create_mlb_scraper",
    # NFL
    "NFLScraper",
    "create_nfl_scraper",
    # NHL
    "NHLScraper",
    "create_nhl_scraper",
    # MLS
    "MLSScraper",
    "create_mls_scraper",
    # WNBA
    "WNBAScraper",
    "create_wnba_scraper",
    # NWSL
    "NWSLScraper",
    "create_nwsl_scraper",
]
|
||||
335
sportstime_parser/scrapers/base.py
Normal file
335
sportstime_parser/scrapers/base.py
Normal file
@@ -0,0 +1,335 @@
|
||||
"""Base scraper class for all sport scrapers."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import date, datetime
|
||||
from typing import Optional
|
||||
|
||||
from ..config import EXPECTED_GAME_COUNTS
|
||||
from ..models.game import Game
|
||||
from ..models.team import Team
|
||||
from ..models.stadium import Stadium
|
||||
from ..models.aliases import ManualReviewItem
|
||||
from ..utils.http import RateLimitedSession, get_session
|
||||
from ..utils.logging import get_logger, log_error, log_warning
|
||||
from ..utils.progress import ScrapeProgress
|
||||
|
||||
|
||||
@dataclass
class RawGameData:
    """Raw game data before normalization.

    This intermediate format holds data as scraped from sources,
    before team/stadium resolution and canonical ID generation.
    """

    # Scheduled start; may be naive (Baseball-Reference strptime) or
    # timezone-aware (API ISO-8601 timestamps) depending on the source.
    game_date: datetime
    # Team names exactly as the source printed them; resolved to
    # canonical IDs later by the normalizers.
    home_team_raw: str
    away_team_raw: str
    # Venue name as printed by the source, when provided.
    stadium_raw: Optional[str] = None
    # Final scores; None until the game has been played.
    home_score: Optional[int] = None
    away_score: Optional[int] = None
    # One of "scheduled", "final", "postponed", "cancelled".
    status: str = "scheduled"
    # URL the data was scraped from, kept for manual review.
    source_url: Optional[str] = None
    game_number: Optional[int] = None  # For doubleheaders
|
||||
|
||||
|
||||
@dataclass
class ScrapeResult:
    """Result of a scraping operation.

    Attributes:
        games: List of normalized Game objects
        teams: List of Team objects
        stadiums: List of Stadium objects
        review_items: Items requiring manual review
        source: Name of the source used
        success: Whether scraping succeeded
        error_message: Error message if failed
    """

    games: list[Game] = field(default_factory=list)
    teams: list[Team] = field(default_factory=list)
    stadiums: list[Stadium] = field(default_factory=list)
    review_items: list[ManualReviewItem] = field(default_factory=list)
    source: str = ""
    success: bool = True
    error_message: Optional[str] = None

    @property
    def game_count(self) -> int:
        """Number of normalized games."""
        return len(self.games)

    @property
    def team_count(self) -> int:
        """Number of teams."""
        return len(self.teams)

    @property
    def stadium_count(self) -> int:
        """Number of stadiums."""
        return len(self.stadiums)

    @property
    def review_count(self) -> int:
        """Number of items flagged for manual review."""
        return len(self.review_items)
|
||||
|
||||
|
||||
class BaseScraper(ABC):
    """Abstract base class for sport scrapers.

    Subclasses must implement:
    - scrape_games(): Fetch and normalize game schedule
    - scrape_teams(): Fetch team information
    - scrape_stadiums(): Fetch stadium information
    - _get_sources(): Return list of source names in priority order

    Features:
    - Multi-source fallback (try sources in order)
    - Built-in rate limiting
    - Error handling with partial data discard
    - Progress tracking
    - Source URL tracking for manual review
    """

    def __init__(
        self,
        sport: str,
        season: int,
        session: Optional[RateLimitedSession] = None,
    ):
        """Initialize the scraper.

        Args:
            sport: Sport code (e.g., 'nba', 'mlb')
            season: Season start year (e.g., 2025 for 2025-26)
            session: Optional HTTP session (default: global session)
        """
        self.sport = sport.lower()
        self.season = season
        self.session = session or get_session()
        self._logger = get_logger()
        # Created by scrape_all(); stays None when only scrape_games() is used.
        self._progress: Optional[ScrapeProgress] = None

    @property
    def expected_game_count(self) -> int:
        """Get expected number of games for this sport."""
        # Falls back to 0 when the sport has no configured expectation.
        return EXPECTED_GAME_COUNTS.get(self.sport, 0)

    @abstractmethod
    def _get_sources(self) -> list[str]:
        """Return list of source names in priority order.

        Returns:
            List of source identifiers (e.g., ['basketball_reference', 'espn', 'cbs'])
        """
        pass

    @abstractmethod
    def _scrape_games_from_source(
        self,
        source: str,
    ) -> list[RawGameData]:
        """Scrape games from a specific source.

        Args:
            source: Source identifier

        Returns:
            List of raw game data

        Raises:
            Exception: If scraping fails
        """
        pass

    @abstractmethod
    def _normalize_games(
        self,
        raw_games: list[RawGameData],
    ) -> tuple[list[Game], list[ManualReviewItem]]:
        """Normalize raw game data to Game objects.

        Args:
            raw_games: Raw scraped data

        Returns:
            Tuple of (normalized games, review items)
        """
        pass

    @abstractmethod
    def scrape_teams(self) -> list[Team]:
        """Fetch team information.

        Returns:
            List of Team objects
        """
        pass

    @abstractmethod
    def scrape_stadiums(self) -> list[Stadium]:
        """Fetch stadium information.

        Returns:
            List of Stadium objects
        """
        pass

    def scrape_games(self) -> ScrapeResult:
        """Scrape games with multi-source fallback.

        Tries each source in priority order. On failure, discards
        partial data and tries the next source.

        Returns:
            ScrapeResult with games, review items, and status
        """
        sources = self._get_sources()
        last_error: Optional[str] = None
        sources_tried = 0
        # Allow 3 sources to be tried. This enables NHL to fall back to NHL API
        # for venue data since Hockey Reference doesn't provide it.
        max_sources_to_try = 3

        for source in sources:
            self._logger.info(f"Trying source: {source}")
            sources_tried += 1

            try:
                # Scrape raw data
                raw_games = self._scrape_games_from_source(source)

                if not raw_games:
                    log_warning(f"No games found from {source}")
                    # If multiple sources return nothing, the schedule likely doesn't exist
                    if sources_tried >= max_sources_to_try:
                        return ScrapeResult(
                            success=False,
                            error_message=f"No schedule data available (tried {sources_tried} sources)",
                        )
                    continue

                self._logger.info(f"Found {len(raw_games)} raw games from {source}")

                # Normalize data
                games, review_items = self._normalize_games(raw_games)

                self._logger.info(
                    f"Normalized {len(games)} games, {len(review_items)} need review"
                )

                # First source that yields games wins; later sources are not consulted.
                return ScrapeResult(
                    games=games,
                    review_items=review_items,
                    source=source,
                    success=True,
                )

            except Exception as e:
                last_error = str(e)
                log_error(f"Failed to scrape from {source}: {e}", exc_info=True)
                # If we've tried enough sources, bail out
                if sources_tried >= max_sources_to_try:
                    break
                continue

        # All sources failed
        return ScrapeResult(
            success=False,
            error_message=f"All sources failed. Last error: {last_error}",
        )

    def scrape_all(self) -> ScrapeResult:
        """Scrape games, teams, and stadiums.

        Returns:
            Complete ScrapeResult with all data
        """
        self._progress = ScrapeProgress(self.sport, self.season)
        self._progress.start()

        try:
            # Scrape games
            result = self.scrape_games()

            if not result.success:
                self._progress.log_error(result.error_message or "Unknown error")
                self._progress.finish()
                return result

            # Scrape teams
            teams = self.scrape_teams()
            result.teams = teams

            # Scrape stadiums
            stadiums = self.scrape_stadiums()
            result.stadiums = stadiums

            # Update progress
            self._progress.games_count = result.game_count
            self._progress.teams_count = result.team_count
            self._progress.stadiums_count = result.stadium_count
            # NOTE(review): review items are recorded as "errors" in progress —
            # confirm that is the intended accounting in ScrapeProgress.
            self._progress.errors_count = result.review_count

            self._progress.finish()

            return result

        except Exception as e:
            log_error(f"Scraping failed: {e}", exc_info=True)
            self._progress.finish()

            return ScrapeResult(
                success=False,
                error_message=str(e),
            )

    def _get_season_months(self) -> list[tuple[int, int]]:
        """Get the months to scrape for this sport's season.

        Returns:
            List of (year, month) tuples
        """
        # Default implementation for sports with fall-spring seasons
        # (NBA, NHL, etc.)
        months = []

        # Fall months of season start year
        for month in range(10, 13):  # Oct-Dec
            months.append((self.season, month))

        # Winter-spring months of following year
        for month in range(1, 7):  # Jan-Jun
            months.append((self.season + 1, month))

        return months

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build a source URL with parameters.

        Subclasses should override this to build URLs for their sources.

        Args:
            source: Source identifier
            **kwargs: URL parameters

        Returns:
            Complete URL string
        """
        raise NotImplementedError(f"URL builder not implemented for {source}")
|
||||
|
||||
|
||||
class ScraperError(Exception):
    """Raised when fetching or parsing a scraping source fails."""

    def __init__(self, source: str, message: str):
        # Keep the raw pieces so callers can inspect them separately.
        self.source = source
        self.message = message
        # Prefix the exception text with the source for log readability.
        labelled = f"[{source}] {message}"
        super().__init__(labelled)
|
||||
|
||||
|
||||
class PartialDataError(ScraperError):
    """Raised when a source returned some, but not all, expected data."""

    def __init__(self, source: str, message: str, partial_count: int):
        # Number of items retrieved before the failure.
        self.partial_count = partial_count
        detail = f"{message} (got {partial_count} items)"
        super().__init__(source, detail)
|
||||
685
sportstime_parser/scrapers/mlb.py
Normal file
685
sportstime_parser/scrapers/mlb.py
Normal file
@@ -0,0 +1,685 @@
|
||||
"""MLB scraper implementation with multi-source fallback."""
|
||||
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
from ..models.game import Game
|
||||
from ..models.team import Team
|
||||
from ..models.stadium import Stadium
|
||||
from ..models.aliases import ManualReviewItem
|
||||
from ..normalizers.canonical_id import generate_game_id
|
||||
from ..normalizers.team_resolver import (
|
||||
TeamResolver,
|
||||
TEAM_MAPPINGS,
|
||||
get_team_resolver,
|
||||
)
|
||||
from ..normalizers.stadium_resolver import (
|
||||
StadiumResolver,
|
||||
STADIUM_MAPPINGS,
|
||||
get_stadium_resolver,
|
||||
)
|
||||
from ..normalizers.timezone import parse_datetime
|
||||
from ..utils.logging import get_logger, log_game, log_warning
|
||||
|
||||
|
||||
class MLBScraper(BaseScraper):
|
||||
"""MLB schedule scraper with multi-source fallback.
|
||||
|
||||
Sources (in priority order):
|
||||
1. Baseball-Reference - Most reliable, complete historical data
|
||||
2. MLB Stats API - Official MLB data
|
||||
3. ESPN API - Backup option
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
|
||||
"""Initialize MLB scraper.
|
||||
|
||||
Args:
|
||||
season: Season year (e.g., 2026 for 2026 season)
|
||||
"""
|
||||
super().__init__("mlb", season, **kwargs)
|
||||
self._team_resolver = get_team_resolver("mlb")
|
||||
self._stadium_resolver = get_stadium_resolver("mlb")
|
||||
|
||||
def _get_sources(self) -> list[str]:
|
||||
"""Return source list in priority order."""
|
||||
# MLB API is best - returns full schedule in one request
|
||||
# ESPN caps at ~25 results for baseball
|
||||
# Baseball-Reference requires HTML parsing
|
||||
return ["mlb_api", "espn", "baseball_reference"]
|
||||
|
||||
def _get_source_url(self, source: str, **kwargs) -> str:
|
||||
"""Build URL for a source."""
|
||||
if source == "baseball_reference":
|
||||
month = kwargs.get("month", "april")
|
||||
# Baseball-Reference uses season year in URL
|
||||
return f"https://www.baseball-reference.com/leagues/majors/{self.season}-schedule.shtml"
|
||||
|
||||
elif source == "mlb_api":
|
||||
start_date = kwargs.get("start_date", "")
|
||||
end_date = kwargs.get("end_date", "")
|
||||
return f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&startDate={start_date}&endDate={end_date}"
|
||||
|
||||
elif source == "espn":
|
||||
date_str = kwargs.get("date", "")
|
||||
return f"https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard?dates={date_str}"
|
||||
|
||||
raise ValueError(f"Unknown source: {source}")
|
||||
|
||||
def _get_season_months(self) -> list[tuple[int, int]]:
|
||||
"""Get the months to scrape for MLB season.
|
||||
|
||||
MLB season runs March/April through October/November.
|
||||
"""
|
||||
months = []
|
||||
|
||||
# Spring training / early season
|
||||
for month in range(3, 12): # March-November
|
||||
months.append((self.season, month))
|
||||
|
||||
return months
|
||||
|
||||
def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
|
||||
"""Scrape games from a specific source."""
|
||||
if source == "baseball_reference":
|
||||
return self._scrape_baseball_reference()
|
||||
elif source == "mlb_api":
|
||||
return self._scrape_mlb_api()
|
||||
elif source == "espn":
|
||||
return self._scrape_espn()
|
||||
else:
|
||||
raise ValueError(f"Unknown source: {source}")
|
||||
|
||||
def _scrape_baseball_reference(self) -> list[RawGameData]:
|
||||
"""Scrape games from Baseball-Reference.
|
||||
|
||||
BR has a single schedule page per season.
|
||||
Format: https://www.baseball-reference.com/leagues/majors/YYYY-schedule.shtml
|
||||
"""
|
||||
url = self._get_source_url("baseball_reference")
|
||||
|
||||
try:
|
||||
html = self.session.get_html(url)
|
||||
games = self._parse_baseball_reference(html, url)
|
||||
return games
|
||||
|
||||
except Exception as e:
|
||||
self._logger.error(f"Failed to scrape Baseball-Reference: {e}")
|
||||
raise
|
||||
|
||||
def _parse_baseball_reference(
|
||||
self,
|
||||
html: str,
|
||||
source_url: str,
|
||||
) -> list[RawGameData]:
|
||||
"""Parse Baseball-Reference schedule HTML.
|
||||
|
||||
Structure: Games are organized by date in div elements.
|
||||
Each game row has: date, away team, away score, home team, home score, venue.
|
||||
"""
|
||||
soup = BeautifulSoup(html, "lxml")
|
||||
games: list[RawGameData] = []
|
||||
|
||||
# Find all game divs - they use class "game" or similar
|
||||
# Baseball-Reference uses <p class="game"> for each game
|
||||
game_paragraphs = soup.find_all("p", class_="game")
|
||||
|
||||
current_date = None
|
||||
|
||||
for elem in soup.find_all(["h3", "p"]):
|
||||
# H3 contains date headers
|
||||
if elem.name == "h3":
|
||||
date_text = elem.get_text(strip=True)
|
||||
try:
|
||||
# Format: "Thursday, April 1, 2026"
|
||||
current_date = datetime.strptime(date_text, "%A, %B %d, %Y")
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
elif elem.name == "p" and "game" in elem.get("class", []):
|
||||
if current_date is None:
|
||||
continue
|
||||
|
||||
try:
|
||||
game = self._parse_br_game(elem, current_date, source_url)
|
||||
if game:
|
||||
games.append(game)
|
||||
except Exception as e:
|
||||
self._logger.debug(f"Failed to parse game: {e}")
|
||||
continue
|
||||
|
||||
return games
|
||||
|
||||
    def _parse_br_game(
        self,
        elem,
        game_date: datetime,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single Baseball-Reference game element.

        Args:
            elem: A <p class="game"> BeautifulSoup element.
            game_date: Date taken from the preceding <h3> header.
            source_url: Schedule-page URL, recorded on the game.

        Returns:
            RawGameData, or None when the element has fewer than two links
            (i.e. no recognizable away/home team pair).
        """
        text = elem.get_text(" ", strip=True)

        # Parse game text - formats vary:
        # "Team A (5) @ Team B (3)" or "Team A @ Team B"
        # Also handles doubleheader notation

        # Find all links - usually team names
        links = elem.find_all("a")
        if len(links) < 2:
            return None

        # First link is away team, second is home team
        # (assumed from BR's "away @ home" layout — positional, not verified).
        away_team = links[0].get_text(strip=True)
        home_team = links[1].get_text(strip=True)

        # Try to extract scores from text
        away_score = None
        home_score = None

        # Look for score pattern "(N)"
        import re
        score_pattern = r"\((\d+)\)"
        scores = re.findall(score_pattern, text)

        if len(scores) >= 2:
            try:
                away_score = int(scores[0])
                home_score = int(scores[1])
            except (ValueError, IndexError):
                pass

        # Determine status: a parsed home score implies the game was played.
        status = "final" if home_score is not None else "scheduled"

        # Check for postponed/cancelled (overrides the score-based status)
        text_lower = text.lower()
        if "postponed" in text_lower:
            status = "postponed"
        elif "cancelled" in text_lower or "canceled" in text_lower:
            status = "cancelled"

        # Extract venue if present (usually after @ symbol)
        stadium = None
        if len(links) > 2:
            # Third link might be stadium
            stadium = links[2].get_text(strip=True)

        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=stadium,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )
|
||||
|
||||
def _scrape_mlb_api(self) -> list[RawGameData]:
|
||||
"""Scrape games from MLB Stats API using full season query."""
|
||||
# Build date range for entire season (March-November)
|
||||
season_months = self._get_season_months()
|
||||
start_year, start_month = season_months[0]
|
||||
end_year, end_month = season_months[-1]
|
||||
|
||||
# Get last day of end month
|
||||
if end_month == 12:
|
||||
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
|
||||
else:
|
||||
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
|
||||
|
||||
start_date = date(start_year, start_month, 1)
|
||||
|
||||
url = f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&startDate={start_date.strftime('%Y-%m-%d')}&endDate={end_date.strftime('%Y-%m-%d')}"
|
||||
self._logger.info(f"Fetching MLB schedule: {start_date} to {end_date}")
|
||||
|
||||
try:
|
||||
data = self.session.get_json(url)
|
||||
return self._parse_mlb_api_response(data, url)
|
||||
except Exception as e:
|
||||
self._logger.error(f"MLB API error: {e}")
|
||||
return []
|
||||
|
||||
def _parse_mlb_api_response(
|
||||
self,
|
||||
data: dict,
|
||||
source_url: str,
|
||||
) -> list[RawGameData]:
|
||||
"""Parse MLB Stats API response."""
|
||||
games: list[RawGameData] = []
|
||||
|
||||
dates = data.get("dates", [])
|
||||
|
||||
for date_entry in dates:
|
||||
for game in date_entry.get("games", []):
|
||||
try:
|
||||
raw_game = self._parse_mlb_api_game(game, source_url)
|
||||
if raw_game:
|
||||
games.append(raw_game)
|
||||
except Exception as e:
|
||||
self._logger.debug(f"Failed to parse MLB API game: {e}")
|
||||
continue
|
||||
|
||||
return games
|
||||
|
||||
def _parse_mlb_api_game(
|
||||
self,
|
||||
game: dict,
|
||||
source_url: str,
|
||||
) -> Optional[RawGameData]:
|
||||
"""Parse a single MLB API game."""
|
||||
# Get game date/time
|
||||
game_date_str = game.get("gameDate", "")
|
||||
if not game_date_str:
|
||||
return None
|
||||
|
||||
try:
|
||||
game_date = datetime.fromisoformat(game_date_str.replace("Z", "+00:00"))
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
# Get teams
|
||||
teams = game.get("teams", {})
|
||||
away_data = teams.get("away", {})
|
||||
home_data = teams.get("home", {})
|
||||
|
||||
away_team_info = away_data.get("team", {})
|
||||
home_team_info = home_data.get("team", {})
|
||||
|
||||
away_team = away_team_info.get("name", "")
|
||||
home_team = home_team_info.get("name", "")
|
||||
|
||||
if not away_team or not home_team:
|
||||
return None
|
||||
|
||||
# Get scores
|
||||
away_score = away_data.get("score")
|
||||
home_score = home_data.get("score")
|
||||
|
||||
# Get venue
|
||||
venue = game.get("venue", {})
|
||||
stadium = venue.get("name")
|
||||
|
||||
# Get status
|
||||
status_data = game.get("status", {})
|
||||
abstract_game_state = status_data.get("abstractGameState", "").lower()
|
||||
detailed_state = status_data.get("detailedState", "").lower()
|
||||
|
||||
if abstract_game_state == "final":
|
||||
status = "final"
|
||||
elif "postponed" in detailed_state:
|
||||
status = "postponed"
|
||||
elif "cancelled" in detailed_state or "canceled" in detailed_state:
|
||||
status = "cancelled"
|
||||
else:
|
||||
status = "scheduled"
|
||||
|
||||
# Check for doubleheader
|
||||
game_number = game.get("gameNumber")
|
||||
if game.get("doubleHeader") == "Y":
|
||||
game_number = game.get("gameNumber", 1)
|
||||
|
||||
return RawGameData(
|
||||
game_date=game_date,
|
||||
home_team_raw=home_team,
|
||||
away_team_raw=away_team,
|
||||
stadium_raw=stadium,
|
||||
home_score=home_score,
|
||||
away_score=away_score,
|
||||
status=status,
|
||||
source_url=source_url,
|
||||
game_number=game_number if game.get("doubleHeader") == "Y" else None,
|
||||
)
|
||||
|
||||
def _scrape_espn(self) -> list[RawGameData]:
|
||||
"""Scrape games from ESPN API using date range query."""
|
||||
# Build date range for entire season (March-November)
|
||||
season_months = self._get_season_months()
|
||||
start_year, start_month = season_months[0]
|
||||
end_year, end_month = season_months[-1]
|
||||
|
||||
# Get last day of end month
|
||||
if end_month == 12:
|
||||
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
|
||||
else:
|
||||
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
|
||||
|
||||
start_date = date(start_year, start_month, 1)
|
||||
date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
|
||||
|
||||
url = f"https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard?limit=3000&dates={date_range}"
|
||||
self._logger.info(f"Fetching MLB schedule: {date_range}")
|
||||
|
||||
try:
|
||||
data = self.session.get_json(url)
|
||||
return self._parse_espn_response(data, url)
|
||||
except Exception as e:
|
||||
self._logger.error(f"ESPN error: {e}")
|
||||
return []
|
||||
|
||||
def _parse_espn_response(
|
||||
self,
|
||||
data: dict,
|
||||
source_url: str,
|
||||
) -> list[RawGameData]:
|
||||
"""Parse ESPN API response."""
|
||||
games: list[RawGameData] = []
|
||||
|
||||
events = data.get("events", [])
|
||||
|
||||
for event in events:
|
||||
try:
|
||||
game = self._parse_espn_event(event, source_url)
|
||||
if game:
|
||||
games.append(game)
|
||||
except Exception as e:
|
||||
self._logger.debug(f"Failed to parse ESPN event: {e}")
|
||||
continue
|
||||
|
||||
return games
|
||||
|
||||
    def _parse_espn_event(
        self,
        event: dict,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Parse a single ESPN event.

        Args:
            event: One entry of ``events`` from the scoreboard payload.
            source_url: URL the payload came from.

        Returns:
            RawGameData, or None when the event lacks a parseable date,
            a competition, exactly two competitors, or a team name.
        """
        # Get date (ISO-8601, typically with a trailing "Z")
        date_str = event.get("date", "")
        if not date_str:
            return None

        try:
            game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
        except ValueError:
            return None

        # Get competitions — only the first one is used.
        competitions = event.get("competitions", [])
        if not competitions:
            return None

        competition = competitions[0]

        # Get teams
        competitors = competition.get("competitors", [])
        if len(competitors) != 2:
            return None

        home_team = None
        away_team = None
        home_score = None
        away_score = None

        for competitor in competitors:
            team_info = competitor.get("team", {})
            team_name = team_info.get("displayName", "")
            is_home = competitor.get("homeAway") == "home"
            score = competitor.get("score")

            # NOTE(review): score appears to arrive as a string here — a
            # truthy value is coerced to int, anything unparseable becomes
            # None. Confirm against the ESPN payload.
            if score:
                try:
                    score = int(score)
                except (ValueError, TypeError):
                    score = None

            if is_home:
                home_team = team_name
                home_score = score
            else:
                away_team = team_name
                away_score = score

        if not home_team or not away_team:
            return None

        # Get venue
        venue = competition.get("venue", {})
        stadium = venue.get("fullName")

        # Get status and map ESPN's status names to our canonical strings.
        status_info = competition.get("status", {})
        status_type = status_info.get("type", {})
        status_name = status_type.get("name", "").lower()

        if status_name == "status_final":
            status = "final"
        elif status_name == "status_postponed":
            status = "postponed"
        elif status_name == "status_canceled":
            status = "cancelled"
        else:
            status = "scheduled"

        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=stadium,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )
|
||||
|
||||
def _normalize_games(
|
||||
self,
|
||||
raw_games: list[RawGameData],
|
||||
) -> tuple[list[Game], list[ManualReviewItem]]:
|
||||
"""Normalize raw games to Game objects with canonical IDs."""
|
||||
games: list[Game] = []
|
||||
review_items: list[ManualReviewItem] = []
|
||||
|
||||
# Track games by date/matchup for doubleheader detection
|
||||
games_by_matchup: dict[str, list[RawGameData]] = {}
|
||||
|
||||
for raw in raw_games:
|
||||
date_key = raw.game_date.strftime("%Y%m%d")
|
||||
matchup_key = f"{date_key}_{raw.away_team_raw}_{raw.home_team_raw}"
|
||||
|
||||
if matchup_key not in games_by_matchup:
|
||||
games_by_matchup[matchup_key] = []
|
||||
games_by_matchup[matchup_key].append(raw)
|
||||
|
||||
# Process games with doubleheader detection
|
||||
for matchup_key, matchup_games in games_by_matchup.items():
|
||||
is_doubleheader = len(matchup_games) > 1
|
||||
|
||||
# Sort by time if doubleheader
|
||||
if is_doubleheader:
|
||||
matchup_games.sort(key=lambda g: g.game_date)
|
||||
|
||||
for i, raw in enumerate(matchup_games):
|
||||
# Use provided game_number or calculate from order
|
||||
game_number = raw.game_number or ((i + 1) if is_doubleheader else None)
|
||||
|
||||
game, item_reviews = self._normalize_single_game(raw, game_number)
|
||||
|
||||
if game:
|
||||
games.append(game)
|
||||
log_game(
|
||||
self.sport,
|
||||
game.id,
|
||||
game.home_team_id,
|
||||
game.away_team_id,
|
||||
game.game_date.strftime("%Y-%m-%d"),
|
||||
game.status,
|
||||
)
|
||||
|
||||
review_items.extend(item_reviews)
|
||||
|
||||
return games, review_items
|
||||
|
||||
    def _normalize_single_game(
        self,
        raw: RawGameData,
        game_number: Optional[int],
    ) -> tuple[Optional[Game], list[ManualReviewItem]]:
        """Normalize a single raw game.

        Resolves both teams and (optionally) the stadium to canonical IDs,
        then builds a Game with a generated canonical game ID.

        Args:
            raw: The scraped game.
            game_number: Doubleheader game number, or None.

        Returns:
            (game, review items). The game is None when either team fails
            to resolve; review items raised during resolution are returned
            either way.
        """
        review_items: list[ManualReviewItem] = []

        # Resolve home team
        home_result = self._team_resolver.resolve(
            raw.home_team_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )

        if home_result.review_item:
            review_items.append(home_result.review_item)

        if not home_result.canonical_id:
            log_warning(f"Could not resolve home team: {raw.home_team_raw}")
            return None, review_items

        # Resolve away team
        away_result = self._team_resolver.resolve(
            raw.away_team_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )

        if away_result.review_item:
            review_items.append(away_result.review_item)

        if not away_result.canonical_id:
            log_warning(f"Could not resolve away team: {raw.away_team_raw}")
            return None, review_items

        # Resolve stadium — optional: an unresolved stadium does not drop
        # the game, it just leaves stadium_id empty.
        stadium_id = None

        if raw.stadium_raw:
            stadium_result = self._stadium_resolver.resolve(
                raw.stadium_raw,
                check_date=raw.game_date.date(),
                source_url=raw.source_url,
            )

            if stadium_result.review_item:
                review_items.append(stadium_result.review_item)

            stadium_id = stadium_result.canonical_id

        # Get abbreviations for game ID
        home_abbrev = self._get_abbreviation(home_result.canonical_id)
        away_abbrev = self._get_abbreviation(away_result.canonical_id)

        # Generate canonical game ID
        game_id = generate_game_id(
            sport=self.sport,
            season=self.season,
            away_abbrev=away_abbrev,
            home_abbrev=home_abbrev,
            game_date=raw.game_date,
            game_number=game_number,
        )

        game = Game(
            id=game_id,
            sport=self.sport,
            season=self.season,
            home_team_id=home_result.canonical_id,
            away_team_id=away_result.canonical_id,
            stadium_id=stadium_id or "",
            game_date=raw.game_date,
            game_number=game_number,
            home_score=raw.home_score,
            away_score=raw.away_score,
            status=raw.status,
            source_url=raw.source_url,
            # Raw source strings are retained for auditing/manual review.
            raw_home_team=raw.home_team_raw,
            raw_away_team=raw.away_team_raw,
            raw_stadium=raw.stadium_raw,
        )

        return game, review_items
|
||||
|
||||
def _get_abbreviation(self, team_id: str) -> str:
|
||||
"""Extract abbreviation from team ID."""
|
||||
# team_mlb_nyy -> nyy
|
||||
parts = team_id.split("_")
|
||||
return parts[-1] if parts else ""
|
||||
|
||||
def scrape_teams(self) -> list[Team]:
    """Get all MLB teams from hardcoded mappings.

    Returns:
        One Team per unique canonical team ID found in TEAM_MAPPINGS["mlb"].
    """
    teams: list[Team] = []
    seen: set[str] = set()  # several aliases can map to one team ID; emit each team once

    # MLB league/division structure
    divisions = {
        "AL East": ("American", ["BAL", "BOS", "NYY", "TB", "TOR"]),
        "AL Central": ("American", ["CHW", "CLE", "DET", "KC", "MIN"]),
        "AL West": ("American", ["HOU", "LAA", "OAK", "SEA", "TEX"]),
        "NL East": ("National", ["ATL", "MIA", "NYM", "PHI", "WSN"]),
        "NL Central": ("National", ["CHC", "CIN", "MIL", "PIT", "STL"]),
        "NL West": ("National", ["ARI", "COL", "LAD", "SD", "SF"]),
    }

    # Build reverse lookup: abbreviation -> (league, division)
    team_divisions: dict[str, tuple[str, str]] = {}
    for div, (league, abbrevs) in divisions.items():
        for abbrev in abbrevs:
            team_divisions[abbrev] = (league, div)

    for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("mlb", {}).items():
        if team_id in seen:
            continue
        seen.add(team_id)

        # Parse team name from full name (nickname is usually the last word)
        parts = full_name.split()
        if len(parts) >= 2:
            team_name = parts[-1]
            # Handle multi-word team names
            # ("Red Sox", "White Sox", "Blue Jays" end in Sox/Jays)
            if team_name in ["Sox", "Jays"]:
                team_name = " ".join(parts[-2:])
        else:
            team_name = full_name

        # Get league and division; (None, None) when the mapping
        # abbreviation is not in the division table above.
        league, div = team_divisions.get(abbrev, (None, None))

        team = Team(
            id=team_id,
            sport="mlb",
            city=city,
            name=team_name,
            full_name=full_name,
            abbreviation=abbrev,
            conference=league,  # MLB uses "league" but we map to conference field
            division=div,
            stadium_id=stadium_id,
        )
        teams.append(team)

    return teams
|
||||
|
||||
def scrape_stadiums(self) -> list[Stadium]:
    """Get all MLB stadiums from hardcoded mappings.

    Returns:
        One Stadium per entry in STADIUM_MAPPINGS["mlb"].
    """
    stadiums: list[Stadium] = []

    mlb_stadiums = STADIUM_MAPPINGS.get("mlb", {})
    for stadium_id, info in mlb_stadiums.items():
        stadium = Stadium(
            id=stadium_id,
            sport="mlb",
            name=info.name,
            city=info.city,
            state=info.state,
            country=info.country,
            latitude=info.latitude,
            longitude=info.longitude,
            # NOTE(review): blanket defaults applied to every park — a few MLB
            # stadiums have turf and/or retractable roofs; confirm before
            # relying on these two fields downstream.
            surface="grass",  # Most MLB stadiums
            roof_type="open",  # Most MLB stadiums
        )
        stadiums.append(stadium)

    return stadiums
|
||||
|
||||
|
||||
def create_mlb_scraper(season: int) -> MLBScraper:
    """Build and return an MLB scraper for the given season."""
    scraper = MLBScraper(season=season)
    return scraper
|
||||
400
sportstime_parser/scrapers/mls.py
Normal file
400
sportstime_parser/scrapers/mls.py
Normal file
@@ -0,0 +1,400 @@
|
||||
"""MLS scraper implementation with multi-source fallback."""
|
||||
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
from ..models.game import Game
|
||||
from ..models.team import Team
|
||||
from ..models.stadium import Stadium
|
||||
from ..models.aliases import ManualReviewItem
|
||||
from ..normalizers.canonical_id import generate_game_id
|
||||
from ..normalizers.team_resolver import (
|
||||
TeamResolver,
|
||||
TEAM_MAPPINGS,
|
||||
get_team_resolver,
|
||||
)
|
||||
from ..normalizers.stadium_resolver import (
|
||||
StadiumResolver,
|
||||
STADIUM_MAPPINGS,
|
||||
get_stadium_resolver,
|
||||
)
|
||||
from ..utils.logging import get_logger, log_game, log_warning
|
||||
|
||||
|
||||
class MLSScraper(BaseScraper):
|
||||
"""MLS schedule scraper with multi-source fallback.
|
||||
|
||||
Sources (in priority order):
|
||||
1. ESPN API - Most reliable for MLS
|
||||
2. FBref - Backup option
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
    """Initialize MLS scraper.

    Args:
        season: Season year (e.g., 2026 for 2026 season)
        **kwargs: Passed through unchanged to BaseScraper.
    """
    super().__init__("mls", season, **kwargs)
    # Resolvers that map raw team/stadium strings to canonical IDs.
    self._team_resolver = get_team_resolver("mls")
    self._stadium_resolver = get_stadium_resolver("mls")
|
||||
|
||||
def _get_sources(self) -> list[str]:
|
||||
"""Return source list in priority order."""
|
||||
# FBref scraper not yet implemented - TODO for future
|
||||
return ["espn"]
|
||||
|
||||
def _get_source_url(self, source: str, **kwargs) -> str:
|
||||
"""Build URL for a source."""
|
||||
if source == "espn":
|
||||
date_str = kwargs.get("date", "")
|
||||
return f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.1/scoreboard?dates={date_str}"
|
||||
|
||||
elif source == "fbref":
|
||||
return f"https://fbref.com/en/comps/22/{self.season}/schedule/{self.season}-Major-League-Soccer-Scores-and-Fixtures"
|
||||
|
||||
raise ValueError(f"Unknown source: {source}")
|
||||
|
||||
def _get_season_months(self) -> list[tuple[int, int]]:
|
||||
"""Get the months to scrape for MLS season.
|
||||
|
||||
MLS season runs February/March through October/November.
|
||||
"""
|
||||
months = []
|
||||
|
||||
# MLS runs within a calendar year
|
||||
for month in range(2, 12): # Feb-Nov
|
||||
months.append((self.season, month))
|
||||
|
||||
return months
|
||||
|
||||
def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
    """Scrape games from a specific source.

    Raises:
        ValueError: if *source* is not a known source name.
    """
    # Dispatch table instead of an if/elif chain.
    handlers = {
        "espn": self._scrape_espn,
        "fbref": self._scrape_fbref,
    }
    handler = handlers.get(source)
    if handler is None:
        raise ValueError(f"Unknown source: {source}")
    return handler()
|
||||
|
||||
def _scrape_espn(self) -> list[RawGameData]:
    """Scrape games from ESPN API using date range query."""
    # Build date range for entire season (Feb-November)
    season_months = self._get_season_months()
    start_year, start_month = season_months[0]
    end_year, end_month = season_months[-1]

    # Get last day of end month.
    # (The December branch is defensive; _get_season_months currently
    # ends in November.)
    if end_month == 12:
        end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
    else:
        end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)

    start_date = date(start_year, start_month, 1)
    date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"

    # Single request: the scoreboard endpoint accepts a YYYYMMDD-YYYYMMDD
    # range plus a result limit.
    url = f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.1/scoreboard?limit=1000&dates={date_range}"
    self._logger.info(f"Fetching MLS schedule: {date_range}")

    try:
        data = self.session.get_json(url)
        return self._parse_espn_response(data, url)
    except Exception as e:
        # Returning an empty list lets the caller fall back to another source.
        self._logger.error(f"ESPN error: {e}")
        return []
|
||||
|
||||
def _parse_espn_response(
    self,
    data: dict,
    source_url: str,
) -> list[RawGameData]:
    """Parse an ESPN API response into raw game rows."""
    parsed: list[RawGameData] = []

    for event in data.get("events", []):
        try:
            row = self._parse_espn_event(event, source_url)
        except Exception as e:
            # One malformed event must not abort the whole response.
            self._logger.debug(f"Failed to parse ESPN event: {e}")
            continue
        if row:
            parsed.append(row)

    return parsed
|
||||
|
||||
def _parse_espn_event(
    self,
    event: dict,
    source_url: str,
) -> Optional[RawGameData]:
    """Parse a single ESPN event.

    Returns None (rather than raising) whenever a required field is
    missing or unparseable, so callers can simply skip the event.
    """
    # Get date
    date_str = event.get("date", "")
    if not date_str:
        return None

    try:
        # ESPN dates are ISO 8601 with a trailing "Z" for UTC.
        game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    except ValueError:
        return None

    # Get competitions
    competitions = event.get("competitions", [])
    if not competitions:
        return None

    competition = competitions[0]

    # Get teams — exactly two competitors expected (home and away)
    competitors = competition.get("competitors", [])
    if len(competitors) != 2:
        return None

    home_team = None
    away_team = None
    home_score = None
    away_score = None

    for competitor in competitors:
        team_info = competitor.get("team", {})
        team_name = team_info.get("displayName", "")
        is_home = competitor.get("homeAway") == "home"
        score = competitor.get("score")

        # Scores arrive as strings; coerce to int, else leave as None.
        if score:
            try:
                score = int(score)
            except (ValueError, TypeError):
                score = None

        if is_home:
            home_team = team_name
            home_score = score
        else:
            away_team = team_name
            away_score = score

    if not home_team or not away_team:
        return None

    # Get venue
    venue = competition.get("venue", {})
    stadium = venue.get("fullName")

    # Get status and map ESPN status names onto this pipeline's vocabulary.
    status_info = competition.get("status", {})
    status_type = status_info.get("type", {})
    status_name = status_type.get("name", "").lower()

    if status_name == "status_final":
        status = "final"
    elif status_name == "status_postponed":
        status = "postponed"
    elif status_name == "status_canceled":
        status = "cancelled"
    else:
        status = "scheduled"

    return RawGameData(
        game_date=game_date,
        home_team_raw=home_team,
        away_team_raw=away_team,
        stadium_raw=stadium,
        home_score=home_score,
        away_score=away_score,
        status=status,
        source_url=source_url,
    )
|
||||
|
||||
def _scrape_fbref(self) -> list[RawGameData]:
    """Scrape games from FBref (not yet implemented)."""
    # Raising here signals the caller to try another source.
    raise NotImplementedError("FBref scraper not implemented")
|
||||
|
||||
def _normalize_games(
    self,
    raw_games: list[RawGameData],
) -> tuple[list[Game], list[ManualReviewItem]]:
    """Normalize raw games to Game objects with canonical IDs."""
    normalized: list[Game] = []
    reviews: list[ManualReviewItem] = []

    for raw in raw_games:
        game, game_reviews = self._normalize_single_game(raw)

        if game:
            normalized.append(game)
            log_game(
                self.sport,
                game.id,
                game.home_team_id,
                game.away_team_id,
                game.game_date.strftime("%Y-%m-%d"),
                game.status,
            )

        # Review items are collected even when the game itself was dropped.
        reviews.extend(game_reviews)

    return normalized, reviews
|
||||
|
||||
def _normalize_single_game(
    self,
    raw: RawGameData,
) -> tuple[Optional[Game], list[ManualReviewItem]]:
    """Normalize a single raw game.

    Returns:
        (game, review_items); game is None when either team cannot be
        resolved to a canonical ID. Review items are returned even on
        failure so unresolved names still surface for manual review.
    """
    review_items: list[ManualReviewItem] = []

    # Resolve home team
    home_result = self._team_resolver.resolve(
        raw.home_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )

    if home_result.review_item:
        review_items.append(home_result.review_item)

    if not home_result.canonical_id:
        log_warning(f"Could not resolve home team: {raw.home_team_raw}")
        return None, review_items

    # Resolve away team
    away_result = self._team_resolver.resolve(
        raw.away_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )

    if away_result.review_item:
        review_items.append(away_result.review_item)

    if not away_result.canonical_id:
        log_warning(f"Could not resolve away team: {raw.away_team_raw}")
        return None, review_items

    # Resolve stadium (optional: an unresolved stadium does not drop the game)
    stadium_id = None

    if raw.stadium_raw:
        stadium_result = self._stadium_resolver.resolve(
            raw.stadium_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )

        if stadium_result.review_item:
            review_items.append(stadium_result.review_item)

        stadium_id = stadium_result.canonical_id

    # Get abbreviations for game ID
    home_abbrev = self._get_abbreviation(home_result.canonical_id)
    away_abbrev = self._get_abbreviation(away_result.canonical_id)

    # Generate canonical game ID (game_number is always None for MLS —
    # no doubleheader numbering is applied here)
    game_id = generate_game_id(
        sport=self.sport,
        season=self.season,
        away_abbrev=away_abbrev,
        home_abbrev=home_abbrev,
        game_date=raw.game_date,
        game_number=None,
    )

    game = Game(
        id=game_id,
        sport=self.sport,
        season=self.season,
        home_team_id=home_result.canonical_id,
        away_team_id=away_result.canonical_id,
        stadium_id=stadium_id or "",
        game_date=raw.game_date,
        game_number=None,
        home_score=raw.home_score,
        away_score=raw.away_score,
        status=raw.status,
        source_url=raw.source_url,
        # Raw source strings are preserved for auditing/debugging.
        raw_home_team=raw.home_team_raw,
        raw_away_team=raw.away_team_raw,
        raw_stadium=raw.stadium_raw,
    )

    return game, review_items
|
||||
|
||||
def _get_abbreviation(self, team_id: str) -> str:
|
||||
"""Extract abbreviation from team ID."""
|
||||
parts = team_id.split("_")
|
||||
return parts[-1] if parts else ""
|
||||
|
||||
def scrape_teams(self) -> list[Team]:
    """Get all MLS teams from hardcoded mappings.

    Returns:
        One Team per unique canonical team ID found in TEAM_MAPPINGS["mls"].
    """
    teams: list[Team] = []
    seen: set[str] = set()  # several aliases can map to one team ID; emit each team once

    # MLS conference structure
    # NOTE(review): verify these abbreviations against TEAM_MAPPINGS["mls"] —
    # any mismatch silently leaves that team's conference as None below.
    conferences = {
        "Eastern": ["ATL", "CLT", "CHI", "CIN", "CLB", "DC", "MIA", "MTL", "NE", "NYC", "RB", "ORL", "PHI", "TOR"],
        "Western": ["AUS", "COL", "DAL", "HOU", "LAG", "LAFC", "MIN", "NSH", "POR", "SLC", "SD", "SJ", "SEA", "SKC", "STL", "VAN"],
    }

    # Build reverse lookup: abbreviation -> conference
    team_conferences: dict[str, str] = {}
    for conf, abbrevs in conferences.items():
        for abbrev in abbrevs:
            team_conferences[abbrev] = conf

    for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("mls", {}).items():
        if team_id in seen:
            continue
        seen.add(team_id)

        # Parse team name — MLS club names don't follow a city+nickname
        # split, so the full name is used as-is.
        team_name = full_name

        # Get conference (None when the abbreviation isn't in the table)
        conf = team_conferences.get(abbrev)

        team = Team(
            id=team_id,
            sport="mls",
            city=city,
            name=team_name,
            full_name=full_name,
            abbreviation=abbrev,
            conference=conf,
            division=None,  # MLS doesn't have divisions
            stadium_id=stadium_id,
        )
        teams.append(team)

    return teams
|
||||
|
||||
def scrape_stadiums(self) -> list[Stadium]:
    """Get all MLS stadiums from hardcoded mappings.

    Returns:
        One Stadium per entry in STADIUM_MAPPINGS["mls"].
    """
    stadiums: list[Stadium] = []

    mls_stadiums = STADIUM_MAPPINGS.get("mls", {})
    for stadium_id, info in mls_stadiums.items():
        stadium = Stadium(
            id=stadium_id,
            sport="mls",
            name=info.name,
            city=info.city,
            state=info.state,
            country=info.country,
            latitude=info.latitude,
            longitude=info.longitude,
            # NOTE(review): blanket defaults applied to every venue — some
            # MLS stadiums use turf or have roofs; confirm before relying
            # on these two fields downstream.
            surface="grass",
            roof_type="open",
        )
        stadiums.append(stadium)

    return stadiums
|
||||
|
||||
|
||||
def create_mls_scraper(season: int) -> MLSScraper:
    """Build and return an MLS scraper for the given season."""
    scraper = MLSScraper(season=season)
    return scraper
|
||||
661
sportstime_parser/scrapers/nba.py
Normal file
661
sportstime_parser/scrapers/nba.py
Normal file
@@ -0,0 +1,661 @@
|
||||
"""NBA scraper implementation with multi-source fallback."""
|
||||
|
||||
from datetime import datetime, date, timezone
|
||||
from typing import Optional
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
from ..models.game import Game
|
||||
from ..models.team import Team
|
||||
from ..models.stadium import Stadium
|
||||
from ..models.aliases import ManualReviewItem
|
||||
from ..normalizers.canonical_id import generate_game_id
|
||||
from ..normalizers.team_resolver import (
|
||||
TeamResolver,
|
||||
TEAM_MAPPINGS,
|
||||
get_team_resolver,
|
||||
)
|
||||
from ..normalizers.stadium_resolver import (
|
||||
StadiumResolver,
|
||||
STADIUM_MAPPINGS,
|
||||
get_stadium_resolver,
|
||||
)
|
||||
from ..normalizers.timezone import parse_datetime
|
||||
from ..utils.logging import get_logger, log_game, log_warning
|
||||
|
||||
|
||||
# Month name to number mapping
# NOTE(review): not referenced anywhere in this file's visible code —
# presumably used by date-parsing helpers; confirm before removing.
MONTH_MAP = {
    "january": 1, "february": 2, "march": 3, "april": 4,
    "may": 5, "june": 6, "july": 7, "august": 8,
    "september": 9, "october": 10, "november": 11, "december": 12,
}

# Basketball Reference month URLs.
# Ordered to match the NBA season's span: October through June.
BR_MONTHS = [
    "october", "november", "december",
    "january", "february", "march", "april", "may", "june",
]
|
||||
|
||||
|
||||
class NBAScraper(BaseScraper):
|
||||
"""NBA schedule scraper with multi-source fallback.
|
||||
|
||||
Sources (in priority order):
|
||||
1. Basketball-Reference - Most reliable, complete historical data
|
||||
2. ESPN API - Good for current/future seasons
|
||||
3. CBS Sports - Backup option
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
    """Initialize NBA scraper.

    Args:
        season: Season start year (e.g., 2025 for 2025-26)
        **kwargs: Passed through unchanged to BaseScraper.
    """
    super().__init__("nba", season, **kwargs)
    # Resolvers that map raw team/stadium strings to canonical IDs.
    self._team_resolver = get_team_resolver("nba")
    self._stadium_resolver = get_stadium_resolver("nba")
|
||||
|
||||
def _get_sources(self) -> list[str]:
|
||||
"""Return source list in priority order."""
|
||||
# CBS scraper not yet implemented - TODO for future
|
||||
return ["basketball_reference", "espn"]
|
||||
|
||||
def _get_source_url(self, source: str, **kwargs) -> str:
|
||||
"""Build URL for a source."""
|
||||
if source == "basketball_reference":
|
||||
month = kwargs.get("month", "october")
|
||||
year = kwargs.get("year", self.season + 1)
|
||||
return f"https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html"
|
||||
|
||||
elif source == "espn":
|
||||
date_str = kwargs.get("date", "")
|
||||
return f"https://site.api.espn.com/apis/site/v2/sports/basketball/nba/scoreboard?dates={date_str}"
|
||||
|
||||
elif source == "cbs":
|
||||
return "https://www.cbssports.com/nba/schedule/"
|
||||
|
||||
raise ValueError(f"Unknown source: {source}")
|
||||
|
||||
def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
    """Scrape games from a specific source.

    Raises:
        ValueError: if *source* is not a known source name.
    """
    # Dispatch table instead of an if/elif chain.
    handlers = {
        "basketball_reference": self._scrape_basketball_reference,
        "espn": self._scrape_espn,
        "cbs": self._scrape_cbs,
    }
    handler = handlers.get(source)
    if handler is None:
        raise ValueError(f"Unknown source: {source}")
    return handler()
|
||||
|
||||
def _scrape_basketball_reference(self) -> list[RawGameData]:
    """Scrape games from Basketball-Reference.

    BR organizes games by month with separate pages.
    Format: https://www.basketball-reference.com/leagues/NBA_YYYY_games-month.html
    where YYYY is the ending year of the season.
    Bails early if first few months have no data (season doesn't exist).
    """
    all_games: list[RawGameData] = []
    end_year = self.season + 1  # BR URLs use the season's ending year
    consecutive_empty_months = 0

    for month in BR_MONTHS:
        url = self._get_source_url("basketball_reference", month=month, year=end_year)

        try:
            html = self.session.get_html(url)
            games = self._parse_basketball_reference(html, url)

            if games:
                all_games.extend(games)
                consecutive_empty_months = 0  # streak only counts adjacent empty months
                self._logger.debug(f"Found {len(games)} games in {month}")
            else:
                consecutive_empty_months += 1

        except Exception as e:
            # Some months may not exist (e.g., no games in August)
            self._logger.debug(f"No data for {month}: {e}")
            consecutive_empty_months += 1

        # If first 3 months (Oct, Nov, Dec) all have no data, season doesn't exist
        if consecutive_empty_months >= 3 and not all_games:
            self._logger.info(f"No games found in first {consecutive_empty_months} months, season likely doesn't exist")
            break

    return all_games
|
||||
|
||||
def _parse_basketball_reference(
    self,
    html: str,
    source_url: str,
) -> list[RawGameData]:
    """Parse Basketball-Reference schedule HTML.

    Table structure:
    - th[data-stat="date_game"]: Date (e.g., "Tue, Oct 22, 2024")
    - td[data-stat="visitor_team_name"]: Away team
    - td[data-stat="home_team_name"]: Home team
    - td[data-stat="visitor_pts"]: Away score
    - td[data-stat="home_pts"]: Home score
    - td[data-stat="arena_name"]: Arena/stadium name
    """
    soup = BeautifulSoup(html, "lxml")
    games: list[RawGameData] = []

    # Find the schedule table; missing table means an empty/invalid page.
    table = soup.find("table", id="schedule")
    if not table:
        return games

    tbody = table.find("tbody")
    if not tbody:
        return games

    for row in tbody.find_all("tr"):
        # Skip header rows (BR repeats "thead"-classed rows inside the body)
        if row.get("class") and "thead" in row.get("class", []):
            continue

        try:
            game = self._parse_br_row(row, source_url)
            if game:
                games.append(game)
        except Exception as e:
            # One malformed row must not abort the whole month.
            self._logger.debug(f"Failed to parse row: {e}")
            continue

    return games
|
||||
|
||||
def _parse_br_row(
    self,
    row,
    source_url: str,
) -> Optional[RawGameData]:
    """Parse a single Basketball-Reference table row.

    Returns None when the row lacks a parseable date or either team name.
    """
    # Get date
    date_cell = row.find("th", {"data-stat": "date_game"})
    if not date_cell:
        return None

    date_text = date_cell.get_text(strip=True)
    if not date_text:
        return None

    # Parse date (format: "Tue, Oct 22, 2024")
    try:
        game_date = datetime.strptime(date_text, "%a, %b %d, %Y")
    except ValueError:
        # Try alternative format (e.g. "October 22, 2024")
        try:
            game_date = datetime.strptime(date_text, "%B %d, %Y")
        except ValueError:
            self._logger.debug(f"Could not parse date: {date_text}")
            return None

    # Get teams
    away_cell = row.find("td", {"data-stat": "visitor_team_name"})
    home_cell = row.find("td", {"data-stat": "home_team_name"})

    if not away_cell or not home_cell:
        return None

    away_team = away_cell.get_text(strip=True)
    home_team = home_cell.get_text(strip=True)

    if not away_team or not home_team:
        return None

    # Get scores (may be empty for future games)
    away_score_cell = row.find("td", {"data-stat": "visitor_pts"})
    home_score_cell = row.find("td", {"data-stat": "home_pts"})

    away_score = None
    home_score = None

    if away_score_cell and away_score_cell.get_text(strip=True):
        try:
            away_score = int(away_score_cell.get_text(strip=True))
        except ValueError:
            pass  # non-numeric score text is treated as "no score yet"

    if home_score_cell and home_score_cell.get_text(strip=True):
        try:
            home_score = int(home_score_cell.get_text(strip=True))
        except ValueError:
            pass

    # Get arena
    arena_cell = row.find("td", {"data-stat": "arena_name"})
    arena = arena_cell.get_text(strip=True) if arena_cell else None

    # Determine status: a recorded home score means the game was played
    status = "final" if home_score is not None else "scheduled"

    # Check for postponed/cancelled in the remarks column (overrides above)
    notes_cell = row.find("td", {"data-stat": "game_remarks"})
    if notes_cell:
        notes = notes_cell.get_text(strip=True).lower()
        if "postponed" in notes:
            status = "postponed"
        elif "cancelled" in notes or "canceled" in notes:
            status = "cancelled"

    return RawGameData(
        game_date=game_date,
        home_team_raw=home_team,
        away_team_raw=away_team,
        stadium_raw=arena,
        home_score=home_score,
        away_score=away_score,
        status=status,
        source_url=source_url,
    )
|
||||
|
||||
def _scrape_espn(self) -> list[RawGameData]:
    """Scrape games from ESPN API.

    ESPN API returns games for a specific date range.
    We iterate through each day of the season and bail out early once a
    long stretch of days (including failed requests) yields no games.

    Returns:
        Raw game rows parsed from every day that returned data.
    """
    all_games: list[RawGameData] = []
    consecutive_empty_days = 0
    max_empty_days = 45  # Bail after ~1.5 months of no games

    for year, month in self._get_season_months():
        # Number of days in the month = distance to the 1st of next month.
        if month == 12:
            next_month = date(year + 1, 1, 1)
        else:
            next_month = date(year, month + 1, 1)

        days_in_month = (next_month - date(year, month, 1)).days

        for day in range(1, days_in_month + 1):
            try:
                game_date = date(year, month, day)
                date_str = game_date.strftime("%Y%m%d")
                url = self._get_source_url("espn", date=date_str)

                data = self.session.get_json(url)
                games = self._parse_espn_response(data, url)

                if games:
                    all_games.extend(games)
                    consecutive_empty_days = 0
                else:
                    consecutive_empty_days += 1

                # Bail early if no games found for a long stretch
                if consecutive_empty_days >= max_empty_days:
                    self._logger.info(f"No games found for {max_empty_days} consecutive days, stopping ESPN scrape")
                    return all_games

            except Exception as e:
                # Failed requests count toward the empty-day streak so a
                # dead season doesn't iterate through every single day.
                self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
                consecutive_empty_days += 1

                if consecutive_empty_days >= max_empty_days:
                    # Fixed: was an f-string with no placeholders (ruff F541).
                    self._logger.info("Too many consecutive failures, stopping ESPN scrape")
                    return all_games
                # (A redundant trailing `continue` was removed here — it was
                # the last statement of the loop body and had no effect.)

    return all_games
|
||||
|
||||
def _parse_espn_response(
    self,
    data: dict,
    source_url: str,
) -> list[RawGameData]:
    """Parse an ESPN API response into raw game rows."""
    rows: list[RawGameData] = []

    for event in data.get("events", []):
        try:
            parsed = self._parse_espn_event(event, source_url)
        except Exception as e:
            # A single malformed event should not abort the whole response.
            self._logger.debug(f"Failed to parse ESPN event: {e}")
            continue
        if parsed:
            rows.append(parsed)

    return rows
|
||||
|
||||
def _parse_espn_event(
    self,
    event: dict,
    source_url: str,
) -> Optional[RawGameData]:
    """Parse a single ESPN event.

    Returns None (rather than raising) whenever a required field is
    missing or unparseable, so callers can simply skip the event.
    """
    # Get date
    date_str = event.get("date", "")
    if not date_str:
        return None

    try:
        # ESPN uses ISO format with a trailing "Z" for UTC
        game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    except ValueError:
        return None

    # Get competitions (usually just one)
    competitions = event.get("competitions", [])
    if not competitions:
        return None

    competition = competitions[0]

    # Get teams — exactly two competitors expected (home and away)
    competitors = competition.get("competitors", [])
    if len(competitors) != 2:
        return None

    home_team = None
    away_team = None
    home_score = None
    away_score = None

    for competitor in competitors:
        team_info = competitor.get("team", {})
        team_name = team_info.get("displayName", "")
        is_home = competitor.get("homeAway") == "home"
        score = competitor.get("score")

        # Scores arrive as strings; coerce to int, else leave as None.
        if score:
            try:
                score = int(score)
            except (ValueError, TypeError):
                score = None

        if is_home:
            home_team = team_name
            home_score = score
        else:
            away_team = team_name
            away_score = score

    if not home_team or not away_team:
        return None

    # Get venue
    venue = competition.get("venue", {})
    arena = venue.get("fullName")

    # Get status and map ESPN status names onto this pipeline's vocabulary.
    status_info = competition.get("status", {})
    status_type = status_info.get("type", {})
    status_name = status_type.get("name", "").lower()

    if status_name == "status_final":
        status = "final"
    elif status_name == "status_postponed":
        status = "postponed"
    elif status_name == "status_canceled":
        status = "cancelled"
    else:
        status = "scheduled"

    return RawGameData(
        game_date=game_date,
        home_team_raw=home_team,
        away_team_raw=away_team,
        stadium_raw=arena,
        home_score=home_score,
        away_score=away_score,
        status=status,
        source_url=source_url,
    )
|
||||
|
||||
def _scrape_cbs(self) -> list[RawGameData]:
    """Scrape games from CBS Sports (not yet implemented).

    CBS Sports is a backup source with less structured data.
    """
    # Raising here signals the caller to fall back to another source.
    raise NotImplementedError("CBS scraper not implemented")
|
||||
|
||||
def _normalize_games(
    self,
    raw_games: list[RawGameData],
) -> tuple[list[Game], list[ManualReviewItem]]:
    """Normalize raw games to Game objects with canonical IDs."""
    normalized: list[Game] = []
    reviews: list[ManualReviewItem] = []

    # Group raw games by (date, away, home) so repeat matchups on one day
    # can be numbered as doubleheader games.
    matchups: dict[str, list[RawGameData]] = {}
    for raw in raw_games:
        key = f"{raw.game_date.strftime('%Y%m%d')}_{raw.away_team_raw}_{raw.home_team_raw}"
        matchups.setdefault(key, []).append(raw)

    for same_day_games in matchups.values():
        multiple = len(same_day_games) > 1

        for index, raw in enumerate(same_day_games, start=1):
            number = index if multiple else None

            game, game_reviews = self._normalize_single_game(raw, number)

            if game:
                normalized.append(game)
                log_game(
                    self.sport,
                    game.id,
                    game.home_team_id,
                    game.away_team_id,
                    game.game_date.strftime("%Y-%m-%d"),
                    game.status,
                )

            # Review items are collected even when the game was dropped.
            reviews.extend(game_reviews)

    return normalized, reviews
|
||||
|
||||
def _normalize_single_game(
    self,
    raw: RawGameData,
    game_number: Optional[int],
) -> tuple[Optional[Game], list[ManualReviewItem]]:
    """Normalize a single raw game.

    Args:
        raw: Unresolved game row from a source scraper.
        game_number: 1-based index for doubleheaders, else None.

    Returns:
        (game, review_items); game is None when either team cannot be
        resolved. Review items are returned even on failure.
    """
    review_items: list[ManualReviewItem] = []

    # Resolve home team
    home_result = self._team_resolver.resolve(
        raw.home_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )

    if home_result.review_item:
        review_items.append(home_result.review_item)

    if not home_result.canonical_id:
        log_warning(f"Could not resolve home team: {raw.home_team_raw}")
        return None, review_items

    # Resolve away team
    away_result = self._team_resolver.resolve(
        raw.away_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )

    if away_result.review_item:
        review_items.append(away_result.review_item)

    if not away_result.canonical_id:
        log_warning(f"Could not resolve away team: {raw.away_team_raw}")
        return None, review_items

    # Resolve stadium (optional - use home team's stadium if not found)
    stadium_id = None

    if raw.stadium_raw:
        stadium_result = self._stadium_resolver.resolve(
            raw.stadium_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )

        if stadium_result.review_item:
            review_items.append(stadium_result.review_item)

        stadium_id = stadium_result.canonical_id

    # If no stadium found, use home team's default stadium
    if not stadium_id:
        # Look up home team's stadium from mappings
        home_abbrev = home_result.canonical_id.split("_")[-1].upper()
        team_info = self._team_resolver.get_team_info(home_abbrev)

        if team_info:
            # Try to find stadium by team's home arena.
            # NOTE(review): this is a substring match of stadium city
            # against team_info[2] (presumably the team's city); in
            # two-team cities it picks whichever arena iterates first —
            # confirm that's acceptable.
            for sid, sinfo in STADIUM_MAPPINGS.get("nba", {}).items():
                # Match by city
                if sinfo.city.lower() in team_info[2].lower():
                    stadium_id = sid
                    break

    # Get abbreviations for game ID
    home_abbrev = self._get_abbreviation(home_result.canonical_id)
    away_abbrev = self._get_abbreviation(away_result.canonical_id)

    # Generate canonical game ID
    game_id = generate_game_id(
        sport=self.sport,
        season=self.season,
        away_abbrev=away_abbrev,
        home_abbrev=home_abbrev,
        game_date=raw.game_date,
        game_number=game_number,
    )

    game = Game(
        id=game_id,
        sport=self.sport,
        season=self.season,
        home_team_id=home_result.canonical_id,
        away_team_id=away_result.canonical_id,
        stadium_id=stadium_id or "",
        game_date=raw.game_date,
        game_number=game_number,
        home_score=raw.home_score,
        away_score=raw.away_score,
        status=raw.status,
        source_url=raw.source_url,
        # Raw source strings are preserved for auditing/debugging.
        raw_home_team=raw.home_team_raw,
        raw_away_team=raw.away_team_raw,
        raw_stadium=raw.stadium_raw,
    )

    return game, review_items
|
||||
|
||||
def _get_abbreviation(self, team_id: str) -> str:
|
||||
"""Extract abbreviation from team ID."""
|
||||
# team_nba_okc -> okc
|
||||
parts = team_id.split("_")
|
||||
return parts[-1] if parts else ""
|
||||
|
||||
def scrape_teams(self) -> list[Team]:
    """Build the 30 NBA Team records from the hardcoded TEAM_MAPPINGS.

    TEAM_MAPPINGS may contain several alias abbreviations pointing at the
    same canonical team id; only the first occurrence of each id is kept.

    Returns:
        One Team per canonical NBA team id.
    """
    teams: list[Team] = []
    seen: set[str] = set()

    # NBA conference/division structure
    divisions = {
        "Atlantic": ("Eastern", ["BOS", "BKN", "NYK", "PHI", "TOR"]),
        "Central": ("Eastern", ["CHI", "CLE", "DET", "IND", "MIL"]),
        "Southeast": ("Eastern", ["ATL", "CHA", "MIA", "ORL", "WAS"]),
        "Northwest": ("Western", ["DEN", "MIN", "OKC", "POR", "UTA"]),
        "Pacific": ("Western", ["GSW", "LAC", "LAL", "PHX", "SAC"]),
        "Southwest": ("Western", ["DAL", "HOU", "MEM", "NOP", "SAS"]),
    }

    # Build reverse lookup: abbrev -> (conference, division)
    team_divisions: dict[str, tuple[str, str]] = {
        abbrev: (conf, div)
        for div, (conf, abbrevs) in divisions.items()
        for abbrev in abbrevs
    }

    for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("nba", {}).items():
        if team_id in seen:
            continue
        seen.add(team_id)

        # Derive the nickname by stripping the city prefix from the full
        # name. This handles multi-word cities ("Oklahoma City Thunder"
        # -> "Thunder") AND multi-word nicknames ("Portland Trail
        # Blazers" -> "Trail Blazers"), which the previous last-word
        # heuristic got wrong for Portland.
        if full_name.startswith(city) and len(full_name) > len(city):
            team_name = full_name[len(city):].strip()
        else:
            parts = full_name.split()
            team_name = parts[-1] if parts else full_name

        # Get conference and division (None, None) if abbrev is unknown.
        conf, div = team_divisions.get(abbrev, (None, None))

        team = Team(
            id=team_id,
            sport="nba",
            city=city,
            name=team_name,
            full_name=full_name,
            abbreviation=abbrev,
            conference=conf,
            division=div,
            stadium_id=stadium_id,
        )
        teams.append(team)

    return teams
|
||||
|
||||
def scrape_stadiums(self) -> list[Stadium]:
    """Build Stadium records for every NBA arena in STADIUM_MAPPINGS."""
    nba_stadiums = STADIUM_MAPPINGS.get("nba", {})
    return [
        Stadium(
            id=arena_id,
            sport="nba",
            name=info.name,
            city=info.city,
            state=info.state,
            country=info.country,
            latitude=info.latitude,
            longitude=info.longitude,
            surface="hardwood",  # every NBA court
            roof_type="dome",  # all NBA arenas are indoor
        )
        for arena_id, info in nba_stadiums.items()
    ]
|
||||
|
||||
|
||||
def create_nba_scraper(season: int) -> NBAScraper:
    """Factory: build an NBAScraper for the given season start year."""
    scraper = NBAScraper(season=season)
    return scraper
|
||||
579
sportstime_parser/scrapers/nfl.py
Normal file
579
sportstime_parser/scrapers/nfl.py
Normal file
@@ -0,0 +1,579 @@
|
||||
"""NFL scraper implementation with multi-source fallback."""
|
||||
|
||||
from datetime import datetime, date
|
||||
from typing import Optional
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
from ..models.game import Game
|
||||
from ..models.team import Team
|
||||
from ..models.stadium import Stadium
|
||||
from ..models.aliases import ManualReviewItem
|
||||
from ..normalizers.canonical_id import generate_game_id
|
||||
from ..normalizers.team_resolver import (
|
||||
TeamResolver,
|
||||
TEAM_MAPPINGS,
|
||||
get_team_resolver,
|
||||
)
|
||||
from ..normalizers.stadium_resolver import (
|
||||
StadiumResolver,
|
||||
STADIUM_MAPPINGS,
|
||||
get_stadium_resolver,
|
||||
)
|
||||
from ..utils.logging import get_logger, log_game, log_warning
|
||||
|
||||
|
||||
# International game locations to filter out
|
||||
INTERNATIONAL_LOCATIONS = {"London", "Mexico City", "Frankfurt", "Munich", "São Paulo"}
|
||||
|
||||
|
||||
class NFLScraper(BaseScraper):
|
||||
"""NFL schedule scraper with multi-source fallback.
|
||||
|
||||
Sources (in priority order):
|
||||
1. ESPN API - Most reliable for NFL
|
||||
2. Pro-Football-Reference - Complete historical data
|
||||
3. CBS Sports - Backup option
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
    """Initialize NFL scraper.

    Args:
        season: Season year (e.g., 2025 for 2025 season)
        **kwargs: Forwarded to BaseScraper — presumably session/retry
            options; confirm against BaseScraper's signature.
    """
    super().__init__("nfl", season, **kwargs)
    # Resolvers map raw source strings to canonical team/stadium IDs.
    self._team_resolver = get_team_resolver("nfl")
    self._stadium_resolver = get_stadium_resolver("nfl")
|
||||
|
||||
def _get_sources(self) -> list[str]:
|
||||
"""Return source list in priority order."""
|
||||
# CBS scraper not yet implemented - TODO for future
|
||||
return ["espn", "pro_football_reference"]
|
||||
|
||||
def _get_source_url(self, source: str, **kwargs) -> str:
|
||||
"""Build URL for a source."""
|
||||
if source == "espn":
|
||||
week = kwargs.get("week", 1)
|
||||
season_type = kwargs.get("season_type", 2) # 1=preseason, 2=regular, 3=postseason
|
||||
return f"https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?seasontype={season_type}&week={week}"
|
||||
|
||||
elif source == "pro_football_reference":
|
||||
return f"https://www.pro-football-reference.com/years/{self.season}/games.htm"
|
||||
|
||||
elif source == "cbs":
|
||||
return "https://www.cbssports.com/nfl/schedule/"
|
||||
|
||||
raise ValueError(f"Unknown source: {source}")
|
||||
|
||||
def _get_season_months(self) -> list[tuple[int, int]]:
|
||||
"""Get the months to scrape for NFL season.
|
||||
|
||||
NFL season runs September through February.
|
||||
"""
|
||||
months = []
|
||||
|
||||
# Regular season months
|
||||
for month in range(9, 13): # Sept-Dec
|
||||
months.append((self.season, month))
|
||||
|
||||
# Playoff months
|
||||
for month in range(1, 3): # Jan-Feb
|
||||
months.append((self.season + 1, month))
|
||||
|
||||
return months
|
||||
|
||||
def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
    """Dispatch to the scraper method matching *source*.

    Raises:
        ValueError: for an unrecognized source name.
    """
    handlers = {
        "espn": self._scrape_espn,
        "pro_football_reference": self._scrape_pro_football_reference,
        "cbs": self._scrape_cbs,
    }
    try:
        handler = handlers[source]
    except KeyError:
        raise ValueError(f"Unknown source: {source}") from None
    return handler()
|
||||
|
||||
def _scrape_espn(self) -> list[RawGameData]:
    """Scrape games from ESPN API.

    ESPN NFL API is organized by season type and week number. The three
    previously copy-pasted loops (preseason / regular / postseason) are
    collapsed into one table-driven loop; per-week failures are logged
    and skipped so one bad week never aborts the season.
    """
    all_games: list[RawGameData] = []

    # (season_type, number_of_weeks, label): 1=preseason (4 weeks),
    # 2=regular season (18 weeks), 3=postseason (4 rounds).
    phases = [
        (1, 4, "preseason"),
        (2, 18, "regular season"),
        (3, 4, "postseason"),
    ]

    for season_type, num_weeks, label in phases:
        for week in range(1, num_weeks + 1):
            try:
                url = self._get_source_url("espn", week=week, season_type=season_type)
                data = self.session.get_json(url)
                games = self._parse_espn_response(data, url)
                all_games.extend(games)
                # Preserve the original per-week count log for the
                # regular season only.
                if season_type == 2:
                    self._logger.debug(f"Found {len(games)} games in week {week}")
            except Exception as e:
                self._logger.debug(f"ESPN {label} week {week} error: {e}")
                continue

    return all_games
|
||||
|
||||
def _parse_espn_response(
    self,
    data: dict,
    source_url: str,
) -> list[RawGameData]:
    """Convert an ESPN scoreboard payload into RawGameData records.

    Unparseable events are logged and skipped; games hosted at known
    international locations are filtered out.
    """
    parsed: list[RawGameData] = []

    for event in data.get("events", []):
        try:
            candidate = self._parse_espn_event(event, source_url)
            if candidate:
                is_international = candidate.stadium_raw and any(
                    loc in candidate.stadium_raw for loc in INTERNATIONAL_LOCATIONS
                )
                if is_international:
                    self._logger.debug(f"Skipping international game: {candidate.stadium_raw}")
                    continue
                parsed.append(candidate)
        except Exception as exc:
            self._logger.debug(f"Failed to parse ESPN event: {exc}")
            continue

    return parsed
|
||||
|
||||
def _parse_espn_event(
    self,
    event: dict,
    source_url: str,
) -> Optional[RawGameData]:
    """Parse a single ESPN scoreboard event into a RawGameData record.

    Returns:
        None for events that are malformed, missing either team, or
        played at an international neutral site.
    """
    # Kickoff time, ISO-8601 with a trailing Z for UTC.
    date_str = event.get("date", "")
    if not date_str:
        return None

    try:
        game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    except ValueError:
        return None

    competitions = event.get("competitions", [])
    if not competitions:
        return None

    competition = competitions[0]

    # Neutral-site games in international cities (London etc.) are dropped.
    if competition.get("neutralSite"):
        venue = competition.get("venue", {})
        venue_city = venue.get("address", {}).get("city", "")
        if venue_city in INTERNATIONAL_LOCATIONS:
            return None

    competitors = competition.get("competitors", [])
    if len(competitors) != 2:
        return None

    home_team = None
    away_team = None
    home_score = None
    away_score = None

    for competitor in competitors:
        team_info = competitor.get("team", {})
        team_name = team_info.get("displayName", "")
        is_home = competitor.get("homeAway") == "home"

        # BUGFIX: the old `if score:` truthiness check let falsy
        # non-None values (e.g. an empty string) pass through
        # unconverted, leaking "" into home_score/away_score.
        # Normalize everything to int-or-None in one step; TypeError
        # covers a missing (None) score.
        try:
            score = int(competitor.get("score"))
        except (TypeError, ValueError):
            score = None

        if is_home:
            home_team = team_name
            home_score = score
        else:
            away_team = team_name
            away_score = score

    if not home_team or not away_team:
        return None

    venue = competition.get("venue", {})
    stadium = venue.get("fullName")

    # Map ESPN status names onto the pipeline's status vocabulary.
    status_info = competition.get("status", {})
    status_type = status_info.get("type", {})
    status_name = status_type.get("name", "").lower()

    if status_name == "status_final":
        status = "final"
    elif status_name == "status_postponed":
        status = "postponed"
    elif status_name == "status_canceled":
        status = "cancelled"
    else:
        status = "scheduled"

    return RawGameData(
        game_date=game_date,
        home_team_raw=home_team,
        away_team_raw=away_team,
        stadium_raw=stadium,
        home_score=home_score,
        away_score=away_score,
        status=status,
        source_url=source_url,
    )
|
||||
|
||||
def _scrape_pro_football_reference(self) -> list[RawGameData]:
    """Fetch and parse the single-season Pro-Football-Reference
    schedule page (one page covers the whole season)."""
    url = self._get_source_url("pro_football_reference")

    try:
        page = self.session.get_html(url)
        return self._parse_pfr(page, url)
    except Exception as exc:
        self._logger.error(f"Failed to scrape Pro-Football-Reference: {exc}")
        raise
|
||||
|
||||
def _parse_pfr(
    self,
    html: str,
    source_url: str,
) -> list[RawGameData]:
    """Parse Pro-Football-Reference schedule HTML.

    Walks the #games table row by row; rows that fail to parse are
    logged at debug level and skipped rather than aborting the page.

    Args:
        html: Raw schedule-page HTML.
        source_url: URL the HTML came from (stored on each record).

    Returns:
        All successfully parsed games; empty list if the table is absent.
    """
    soup = BeautifulSoup(html, "lxml")
    games: list[RawGameData] = []

    # Find the schedule table (PFR gives it a stable id="games").
    table = soup.find("table", id="games")
    if not table:
        return games

    tbody = table.find("tbody")
    if not tbody:
        return games

    for row in tbody.find_all("tr"):
        # Skip repeated in-body header rows (class="thead")
        if row.get("class") and "thead" in row.get("class", []):
            continue

        try:
            game = self._parse_pfr_row(row, source_url)
            if game:
                games.append(game)
        except Exception as e:
            self._logger.debug(f"Failed to parse PFR row: {e}")
            continue

    return games
|
||||
|
||||
def _parse_pfr_row(
    self,
    row,
    source_url: str,
) -> Optional[RawGameData]:
    """Parse a single Pro-Football-Reference table row.

    PFR lists teams as winner/loser rather than home/away; the
    game_location column's "@" marker is used to recover home/away.

    Args:
        row: A BeautifulSoup <tr> element from the #games table.
        source_url: URL the row came from (stored on the record).

    Returns:
        A RawGameData, or None when the row lacks a date or teams.
    """
    # Get date
    date_cell = row.find("td", {"data-stat": "game_date"})
    if not date_cell:
        return None

    date_text = date_cell.get_text(strip=True)
    if not date_text:
        return None

    # Parse date
    try:
        # PFR uses YYYY-MM-DD format
        game_date = datetime.strptime(date_text, "%Y-%m-%d")
    except ValueError:
        return None

    # Get teams (winner/loser columns, not home/away)
    winner_cell = row.find("td", {"data-stat": "winner"})
    loser_cell = row.find("td", {"data-stat": "loser"})

    if not winner_cell or not loser_cell:
        return None

    winner = winner_cell.get_text(strip=True)
    loser = loser_cell.get_text(strip=True)

    if not winner or not loser:
        return None

    # Determine home/away based on @ symbol.
    # NOTE(review): assumes PFR's "@" marks the WINNER as the road team
    # (so the loser hosted) — confirm against the live table.
    game_location = row.find("td", {"data-stat": "game_location"})
    at_home = game_location and "@" in game_location.get_text()

    if at_home:
        home_team = loser
        away_team = winner
    else:
        home_team = winner
        away_team = loser

    # Get scores (points are keyed to winner/loser, mapped back to
    # home/away using the same at_home flag as above)
    pts_win_cell = row.find("td", {"data-stat": "pts_win"})
    pts_lose_cell = row.find("td", {"data-stat": "pts_lose"})

    home_score = None
    away_score = None

    if pts_win_cell and pts_lose_cell:
        try:
            winner_pts = int(pts_win_cell.get_text(strip=True))
            loser_pts = int(pts_lose_cell.get_text(strip=True))

            if at_home:
                home_score = loser_pts
                away_score = winner_pts
            else:
                home_score = winner_pts
                away_score = loser_pts
        except ValueError:
            pass

    # Determine status: a row with scores is a completed game
    status = "final" if home_score is not None else "scheduled"

    return RawGameData(
        game_date=game_date,
        home_team_raw=home_team,
        away_team_raw=away_team,
        stadium_raw=None,  # PFR doesn't always have stadium
        home_score=home_score,
        away_score=away_score,
        status=status,
        source_url=source_url,
    )
|
||||
|
||||
def _scrape_cbs(self) -> list[RawGameData]:
    """CBS Sports scraper stub — not implemented yet (see _get_sources)."""
    raise NotImplementedError("CBS scraper not implemented")
|
||||
|
||||
def _normalize_games(
    self,
    raw_games: list[RawGameData],
) -> tuple[list[Game], list[ManualReviewItem]]:
    """Normalize raw scraped games into canonical Game objects.

    Games whose teams cannot be resolved are dropped, but their review
    items are still collected for the manual-review report.
    """
    normalized: list[Game] = []
    all_reviews: list[ManualReviewItem] = []

    for raw in raw_games:
        game, reviews = self._normalize_single_game(raw)

        if game:
            normalized.append(game)
            log_game(
                self.sport,
                game.id,
                game.home_team_id,
                game.away_team_id,
                game.game_date.strftime("%Y-%m-%d"),
                game.status,
            )

        all_reviews.extend(reviews)

    return normalized, all_reviews
|
||||
|
||||
def _normalize_single_game(
    self,
    raw: RawGameData,
) -> tuple[Optional[Game], list[ManualReviewItem]]:
    """Normalize a single raw game.

    Resolves both teams (required) and the stadium (optional) to
    canonical IDs, then builds a Game keyed by a deterministic
    canonical game id.

    Returns:
        (game, review_items) — game is None when either team fails to
        resolve; review_items are returned even for dropped games.
    """
    review_items: list[ManualReviewItem] = []

    # Resolve home team
    home_result = self._team_resolver.resolve(
        raw.home_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )

    if home_result.review_item:
        review_items.append(home_result.review_item)

    if not home_result.canonical_id:
        log_warning(f"Could not resolve home team: {raw.home_team_raw}")
        return None, review_items

    # Resolve away team
    away_result = self._team_resolver.resolve(
        raw.away_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )

    if away_result.review_item:
        review_items.append(away_result.review_item)

    if not away_result.canonical_id:
        log_warning(f"Could not resolve away team: {raw.away_team_raw}")
        return None, review_items

    # Resolve stadium (optional — many sources omit it; an unresolved
    # stadium does not drop the game)
    stadium_id = None

    if raw.stadium_raw:
        stadium_result = self._stadium_resolver.resolve(
            raw.stadium_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )

        if stadium_result.review_item:
            review_items.append(stadium_result.review_item)

        stadium_id = stadium_result.canonical_id

    # Get abbreviations for game ID
    home_abbrev = self._get_abbreviation(home_result.canonical_id)
    away_abbrev = self._get_abbreviation(away_result.canonical_id)

    # Generate canonical game ID
    game_id = generate_game_id(
        sport=self.sport,
        season=self.season,
        away_abbrev=away_abbrev,
        home_abbrev=home_abbrev,
        game_date=raw.game_date,
        game_number=None,  # NFL doesn't have doubleheaders
    )

    game = Game(
        id=game_id,
        sport=self.sport,
        season=self.season,
        home_team_id=home_result.canonical_id,
        away_team_id=away_result.canonical_id,
        stadium_id=stadium_id or "",  # "" signals unknown stadium
        game_date=raw.game_date,
        game_number=None,
        home_score=raw.home_score,
        away_score=raw.away_score,
        status=raw.status,
        source_url=raw.source_url,
        raw_home_team=raw.home_team_raw,
        raw_away_team=raw.away_team_raw,
        raw_stadium=raw.stadium_raw,
    )

    return game, review_items
|
||||
|
||||
def _get_abbreviation(self, team_id: str) -> str:
|
||||
"""Extract abbreviation from team ID."""
|
||||
parts = team_id.split("_")
|
||||
return parts[-1] if parts else ""
|
||||
|
||||
def scrape_teams(self) -> list[Team]:
    """Build the 32 NFL Team records from the hardcoded TEAM_MAPPINGS.

    Alias entries may repeat a canonical id; only the first occurrence
    of each id produces a Team.
    """
    # NFL conference/division structure
    divisions = {
        "AFC East": ("AFC", ["BUF", "MIA", "NE", "NYJ"]),
        "AFC North": ("AFC", ["BAL", "CIN", "CLE", "PIT"]),
        "AFC South": ("AFC", ["HOU", "IND", "JAX", "TEN"]),
        "AFC West": ("AFC", ["DEN", "KC", "LV", "LAC"]),
        "NFC East": ("NFC", ["DAL", "NYG", "PHI", "WAS"]),
        "NFC North": ("NFC", ["CHI", "DET", "GB", "MIN"]),
        "NFC South": ("NFC", ["ATL", "CAR", "NO", "TB"]),
        "NFC West": ("NFC", ["ARI", "LAR", "SF", "SEA"]),
    }

    # Reverse lookup: abbrev -> (conference, division)
    team_divisions: dict[str, tuple[str, str]] = {
        abbrev: (conf, div)
        for div, (conf, abbrevs) in divisions.items()
        for abbrev in abbrevs
    }

    teams: list[Team] = []
    emitted: set[str] = set()

    for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("nfl", {}).items():
        if team_id in emitted:
            continue
        emitted.add(team_id)

        # Nickname = last word of the full name
        # ("Green Bay Packers" -> "Packers") — single-word nicknames
        # hold for every current NFL franchise.
        name_parts = full_name.split()
        nickname = name_parts[-1] if name_parts else full_name

        conf, div = team_divisions.get(abbrev, (None, None))

        teams.append(
            Team(
                id=team_id,
                sport="nfl",
                city=city,
                name=nickname,
                full_name=full_name,
                abbreviation=abbrev,
                conference=conf,
                division=div,
                stadium_id=stadium_id,
            )
        )

    return teams
|
||||
|
||||
def scrape_stadiums(self) -> list[Stadium]:
    """Build Stadium records for every NFL venue in STADIUM_MAPPINGS.

    NOTE(review): surface/roof_type are blanket approximations
    ("turf"/"open") applied to every venue — the mapping table carries
    no per-stadium data for these fields.
    """
    return [
        Stadium(
            id=venue_id,
            sport="nfl",
            name=info.name,
            city=info.city,
            state=info.state,
            country=info.country,
            latitude=info.latitude,
            longitude=info.longitude,
            surface="turf",  # Many NFL stadiums
            roof_type="open",  # Most outdoor
        )
        for venue_id, info in STADIUM_MAPPINGS.get("nfl", {}).items()
    ]
|
||||
|
||||
|
||||
def create_nfl_scraper(season: int) -> NFLScraper:
    """Factory: build an NFLScraper for the given season year."""
    scraper = NFLScraper(season=season)
    return scraper
|
||||
657
sportstime_parser/scrapers/nhl.py
Normal file
657
sportstime_parser/scrapers/nhl.py
Normal file
@@ -0,0 +1,657 @@
|
||||
"""NHL scraper implementation with multi-source fallback."""
|
||||
|
||||
from datetime import datetime, date
|
||||
from typing import Optional
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
from ..models.game import Game
|
||||
from ..models.team import Team
|
||||
from ..models.stadium import Stadium
|
||||
from ..models.aliases import ManualReviewItem
|
||||
from ..normalizers.canonical_id import generate_game_id
|
||||
from ..normalizers.team_resolver import (
|
||||
TeamResolver,
|
||||
TEAM_MAPPINGS,
|
||||
get_team_resolver,
|
||||
)
|
||||
from ..normalizers.stadium_resolver import (
|
||||
StadiumResolver,
|
||||
STADIUM_MAPPINGS,
|
||||
get_stadium_resolver,
|
||||
)
|
||||
from ..utils.logging import get_logger, log_game, log_warning
|
||||
|
||||
|
||||
# International game locations to filter out
|
||||
INTERNATIONAL_LOCATIONS = {"Prague", "Stockholm", "Helsinki", "Tampere", "Gothenburg"}
|
||||
|
||||
# Hockey Reference month URLs
|
||||
HR_MONTHS = [
|
||||
"october", "november", "december",
|
||||
"january", "february", "march", "april", "may", "june",
|
||||
]
|
||||
|
||||
|
||||
class NHLScraper(BaseScraper):
|
||||
"""NHL schedule scraper with multi-source fallback.
|
||||
|
||||
Sources (in priority order):
|
||||
1. Hockey-Reference - Most reliable for NHL
|
||||
2. NHL API - Official NHL data
|
||||
3. ESPN API - Backup option
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
    """Initialize NHL scraper.

    Args:
        season: Season start year (e.g., 2025 for 2025-26)
        **kwargs: Forwarded to BaseScraper — presumably session/retry
            options; confirm against BaseScraper's signature.
    """
    super().__init__("nhl", season, **kwargs)
    # Resolvers map raw source strings to canonical team/stadium IDs.
    self._team_resolver = get_team_resolver("nhl")
    self._stadium_resolver = get_stadium_resolver("nhl")
|
||||
|
||||
def _get_sources(self) -> list[str]:
|
||||
"""Return source list in priority order."""
|
||||
return ["hockey_reference", "nhl_api", "espn"]
|
||||
|
||||
def _get_source_url(self, source: str, **kwargs) -> str:
|
||||
"""Build URL for a source."""
|
||||
if source == "hockey_reference":
|
||||
month = kwargs.get("month", "october")
|
||||
year = kwargs.get("year", self.season + 1)
|
||||
return f"https://www.hockey-reference.com/leagues/NHL_{year}_games.html"
|
||||
|
||||
elif source == "nhl_api":
|
||||
start_date = kwargs.get("start_date", "")
|
||||
end_date = kwargs.get("end_date", "")
|
||||
return f"https://api-web.nhle.com/v1/schedule/{start_date}"
|
||||
|
||||
elif source == "espn":
|
||||
date_str = kwargs.get("date", "")
|
||||
return f"https://site.api.espn.com/apis/site/v2/sports/hockey/nhl/scoreboard?dates={date_str}"
|
||||
|
||||
raise ValueError(f"Unknown source: {source}")
|
||||
|
||||
def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
    """Dispatch to the scraper method matching *source*.

    Raises:
        ValueError: for an unrecognized source name.
    """
    handlers = {
        "hockey_reference": self._scrape_hockey_reference,
        "nhl_api": self._scrape_nhl_api,
        "espn": self._scrape_espn,
    }
    try:
        handler = handlers[source]
    except KeyError:
        raise ValueError(f"Unknown source: {source}") from None
    return handler()
|
||||
|
||||
def _scrape_hockey_reference(self) -> list[RawGameData]:
    """Fetch and parse the full-season Hockey-Reference schedule page.

    HR keys the page by the season's END year (2025-26 -> NHL_2026).
    """
    url = self._get_source_url("hockey_reference", year=self.season + 1)

    try:
        page = self.session.get_html(url)
        return self._parse_hockey_reference(page, url)
    except Exception as exc:
        self._logger.error(f"Failed to scrape Hockey-Reference: {exc}")
        raise
|
||||
|
||||
def _parse_hockey_reference(
    self,
    html: str,
    source_url: str,
) -> list[RawGameData]:
    """Parse Hockey-Reference schedule HTML.

    Walks the #games table row by row. Rows that fail to parse are
    logged at debug level and skipped; games at known international
    locations are filtered out.

    Returns:
        All successfully parsed games; empty list if the table is absent.
    """
    soup = BeautifulSoup(html, "lxml")
    games: list[RawGameData] = []

    # Find the schedule table (HR gives it a stable id="games")
    table = soup.find("table", id="games")
    if not table:
        return games

    tbody = table.find("tbody")
    if not tbody:
        return games

    for row in tbody.find_all("tr"):
        # Skip repeated in-body header rows (class="thead")
        if row.get("class") and "thead" in row.get("class", []):
            continue

        try:
            game = self._parse_hr_row(row, source_url)
            if game:
                # Filter international games (Global Series venues).
                # NOTE: _parse_hr_row always sets stadium_raw=None, so
                # this filter is currently inert for HR rows.
                if game.stadium_raw and any(loc in game.stadium_raw for loc in INTERNATIONAL_LOCATIONS):
                    continue
                games.append(game)
        except Exception as e:
            self._logger.debug(f"Failed to parse HR row: {e}")
            continue

    return games
|
||||
|
||||
def _parse_hr_row(
    self,
    row,
    source_url: str,
) -> Optional[RawGameData]:
    """Parse a single Hockey-Reference table row.

    Unlike PFR, HR lists visitor/home columns directly, so no
    winner/loser remapping is needed.

    Returns:
        A RawGameData, or None when the row lacks a date or teams.
    """
    # Get date (HR puts the date in a <th>, not a <td>)
    date_cell = row.find("th", {"data-stat": "date_game"})
    if not date_cell:
        return None

    date_text = date_cell.get_text(strip=True)
    if not date_text:
        return None

    # Parse date (format: "2025-10-15")
    try:
        game_date = datetime.strptime(date_text, "%Y-%m-%d")
    except ValueError:
        return None

    # Get teams
    visitor_cell = row.find("td", {"data-stat": "visitor_team_name"})
    home_cell = row.find("td", {"data-stat": "home_team_name"})

    if not visitor_cell or not home_cell:
        return None

    away_team = visitor_cell.get_text(strip=True)
    home_team = home_cell.get_text(strip=True)

    if not away_team or not home_team:
        return None

    # Get scores (cells are empty for games not yet played)
    visitor_goals_cell = row.find("td", {"data-stat": "visitor_goals"})
    home_goals_cell = row.find("td", {"data-stat": "home_goals"})

    away_score = None
    home_score = None

    if visitor_goals_cell and visitor_goals_cell.get_text(strip=True):
        try:
            away_score = int(visitor_goals_cell.get_text(strip=True))
        except ValueError:
            pass

    if home_goals_cell and home_goals_cell.get_text(strip=True):
        try:
            home_score = int(home_goals_cell.get_text(strip=True))
        except ValueError:
            pass

    # Determine status: a home score implies a completed game
    status = "final" if home_score is not None else "scheduled"

    # Check for OT/SO. NOTE: this branch is effectively redundant —
    # when an OT marker is present the scores are too, so status is
    # already "final"; kept for safety against odd rows.
    overtimes_cell = row.find("td", {"data-stat": "overtimes"})
    if overtimes_cell:
        ot_text = overtimes_cell.get_text(strip=True)
        if ot_text:
            status = "final"  # OT games are still final

    return RawGameData(
        game_date=game_date,
        home_team_raw=home_team,
        away_team_raw=away_team,
        stadium_raw=None,  # HR doesn't have stadium
        home_score=home_score,
        away_score=away_score,
        status=status,
        source_url=source_url,
    )
|
||||
|
||||
def _scrape_nhl_api(self) -> list[RawGameData]:
    """Walk the season months and pull each schedule page from the
    official NHL API; failed months are logged and skipped."""
    collected: list[RawGameData] = []

    for year, month in self._get_season_months():
        first_of_month = date(year, month, 1)
        url = self._get_source_url(
            "nhl_api", start_date=first_of_month.strftime("%Y-%m-%d")
        )

        try:
            payload = self.session.get_json(url)
            collected.extend(self._parse_nhl_api_response(payload, url))
        except Exception as exc:
            self._logger.debug(f"NHL API error for {year}-{month}: {exc}")
            continue

    return collected
|
||||
|
||||
def _parse_nhl_api_response(
    self,
    data: dict,
    source_url: str,
) -> list[RawGameData]:
    """Flatten the NHL API's gameWeek structure into RawGameData
    records; unparseable entries are logged and skipped."""
    parsed: list[RawGameData] = []

    for week in data.get("gameWeek", []):
        for entry in week.get("games", []):
            try:
                record = self._parse_nhl_api_game(entry, source_url)
                if record:
                    parsed.append(record)
            except Exception as exc:
                self._logger.debug(f"Failed to parse NHL API game: {exc}")
                continue

    return parsed
|
||||
|
||||
def _parse_nhl_api_game(
    self,
    game: dict,
    source_url: str,
) -> Optional[RawGameData]:
    """Parse a single NHL API game.

    Team names come from placeName (city-style), falling back to the
    full team name when placeName is missing.

    Returns:
        A RawGameData, or None when the start time or teams are absent.
    """
    # Get date (ISO-8601 UTC with a trailing Z)
    start_time = game.get("startTimeUTC", "")
    if not start_time:
        return None

    try:
        game_date = datetime.fromisoformat(start_time.replace("Z", "+00:00"))
    except ValueError:
        return None

    # Get teams
    away_team_data = game.get("awayTeam", {})
    home_team_data = game.get("homeTeam", {})

    away_team = away_team_data.get("placeName", {}).get("default", "")
    home_team = home_team_data.get("placeName", {}).get("default", "")

    if not away_team or not home_team:
        # Try full name (note: this overwrites BOTH names, even when
        # only one placeName was missing)
        away_team = away_team_data.get("name", {}).get("default", "")
        home_team = home_team_data.get("name", {}).get("default", "")

    if not away_team or not home_team:
        return None

    # Get scores — passed through as-is; presumably ints or absent in
    # the API payload (TODO confirm against api-web.nhle.com)
    away_score = away_team_data.get("score")
    home_score = home_team_data.get("score")

    # Get venue
    venue = game.get("venue", {})
    stadium = venue.get("default")

    # Map NHL gameState values onto the pipeline's status vocabulary
    # ("OFF" is the API's post-game state, treated as final)
    game_state = game.get("gameState", "").lower()

    if game_state in ["final", "off"]:
        status = "final"
    elif game_state == "postponed":
        status = "postponed"
    elif game_state in ["cancelled", "canceled"]:
        status = "cancelled"
    else:
        status = "scheduled"

    return RawGameData(
        game_date=game_date,
        home_team_raw=home_team,
        away_team_raw=away_team,
        stadium_raw=stadium,
        home_score=home_score,
        away_score=away_score,
        status=status,
        source_url=source_url,
    )
|
||||
|
||||
def _scrape_espn(self) -> list[RawGameData]:
    """Scrape the ESPN scoreboard one day at a time across every month
    of the season; failed days are logged and skipped."""
    results: list[RawGameData] = []

    for year, month in self._get_season_months():
        # Month length via date arithmetic: distance from the 1st of
        # this month to the 1st of the next.
        first = date(year, month, 1)
        following = date(year + 1, 1, 1) if month == 12 else date(year, month + 1, 1)
        total_days = (following - first).days

        for day in range(1, total_days + 1):
            try:
                stamp = date(year, month, day).strftime("%Y%m%d")
                url = self._get_source_url("espn", date=stamp)

                payload = self.session.get_json(url)
                results.extend(self._parse_espn_response(payload, url))

            except Exception as exc:
                self._logger.debug(f"ESPN error for {year}-{month}-{day}: {exc}")
                continue

    return results
|
||||
|
||||
def _parse_espn_response(
    self,
    data: dict,
    source_url: str,
) -> list[RawGameData]:
    """Convert an ESPN scoreboard payload into a list of raw game records."""
    parsed: list[RawGameData] = []

    for event in data.get("events", []):
        try:
            record = self._parse_espn_event(event, source_url)
        except Exception as e:
            # Skip malformed events rather than failing the whole response.
            self._logger.debug(f"Failed to parse ESPN event: {e}")
            continue
        if record:
            parsed.append(record)

    return parsed
|
||||
|
||||
def _parse_espn_event(
    self,
    event: dict,
    source_url: str,
) -> Optional[RawGameData]:
    """Translate a single ESPN event into RawGameData, or None if unusable."""
    # A date is mandatory; ESPN emits ISO-8601 with a trailing "Z".
    date_str = event.get("date", "")
    if not date_str:
        return None
    try:
        game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    except ValueError:
        return None

    competitions = event.get("competitions", [])
    if not competitions:
        return None
    competition = competitions[0]

    # Skip neutral-site international games (e.g. Global Series).
    if competition.get("neutralSite"):
        venue = competition.get("venue", {})
        venue_city = venue.get("address", {}).get("city", "")
        if venue_city in INTERNATIONAL_LOCATIONS:
            return None

    competitors = competition.get("competitors", [])
    if len(competitors) != 2:
        return None

    home_team = None
    away_team = None
    home_score = None
    away_score = None

    for competitor in competitors:
        display_name = competitor.get("team", {}).get("displayName", "")
        score = competitor.get("score")
        # Scores arrive as strings; empty/missing values stay None.
        if score:
            try:
                score = int(score)
            except (ValueError, TypeError):
                score = None
        if competitor.get("homeAway") == "home":
            home_team, home_score = display_name, score
        else:
            away_team, away_score = display_name, score

    if not home_team or not away_team:
        return None

    stadium = competition.get("venue", {}).get("fullName")

    # Map ESPN status codes onto our canonical status vocabulary.
    status_name = competition.get("status", {}).get("type", {}).get("name", "").lower()
    status_lookup = {
        "status_final": "final",
        "status_postponed": "postponed",
        "status_canceled": "cancelled",
    }
    status = status_lookup.get(status_name, "scheduled")

    return RawGameData(
        game_date=game_date,
        home_team_raw=home_team,
        away_team_raw=away_team,
        stadium_raw=stadium,
        home_score=home_score,
        away_score=away_score,
        status=status,
        source_url=source_url,
    )
|
||||
|
||||
def _normalize_games(
    self,
    raw_games: list[RawGameData],
) -> tuple[list[Game], list[ManualReviewItem]]:
    """Normalize raw games into Game objects, collecting manual-review items."""
    games: list[Game] = []
    review_items: list[ManualReviewItem] = []

    for raw in raw_games:
        normalized, pending_reviews = self._normalize_single_game(raw)

        if normalized is not None:
            games.append(normalized)
            log_game(
                self.sport,
                normalized.id,
                normalized.home_team_id,
                normalized.away_team_id,
                normalized.game_date.strftime("%Y-%m-%d"),
                normalized.status,
            )

        review_items.extend(pending_reviews)

    return games, review_items
|
||||
|
||||
def _normalize_single_game(
    self,
    raw: RawGameData,
) -> tuple[Optional[Game], list[ManualReviewItem]]:
    """Normalize a single raw game to a Game with canonical IDs.

    Resolves both teams and the stadium. Returns (None, review_items)
    when either team cannot be resolved; the Game is only built once both
    team IDs are known.
    """
    review_items: list[ManualReviewItem] = []

    # Resolve home team
    home_result = self._team_resolver.resolve(
        raw.home_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )
    if home_result.review_item:
        review_items.append(home_result.review_item)
    if not home_result.canonical_id:
        log_warning(f"Could not resolve home team: {raw.home_team_raw}")
        return None, review_items

    # Resolve away team
    away_result = self._team_resolver.resolve(
        raw.away_team_raw,
        check_date=raw.game_date.date(),
        source_url=raw.source_url,
    )
    if away_result.review_item:
        review_items.append(away_result.review_item)
    if not away_result.canonical_id:
        log_warning(f"Could not resolve away team: {raw.away_team_raw}")
        return None, review_items

    # Resolve stadium (the source may omit venue data entirely)
    stadium_id = None
    if raw.stadium_raw:
        stadium_result = self._stadium_resolver.resolve(
            raw.stadium_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )
        if stadium_result.review_item:
            review_items.append(stadium_result.review_item)
        stadium_id = stadium_result.canonical_id

    # Fallback: use home team's default stadium if no venue provided.
    # This is common for Hockey-Reference which doesn't have venue data.
    # (Fix: the original computed an unused home_abbrev here and scanned
    # the mapping with an unused loop variable.)
    if not stadium_id:
        stadium_id = next(
            (
                default_stadium
                for team_id, _, _, default_stadium in TEAM_MAPPINGS.get("nhl", {}).values()
                if team_id == home_result.canonical_id
            ),
            None,
        )

    # Abbreviations feed the canonical game ID.
    home_abbrev = self._get_abbreviation(home_result.canonical_id)
    away_abbrev = self._get_abbreviation(away_result.canonical_id)

    game_id = generate_game_id(
        sport=self.sport,
        season=self.season,
        away_abbrev=away_abbrev,
        home_abbrev=home_abbrev,
        game_date=raw.game_date,
        game_number=None,  # NHL doesn't have doubleheaders
    )

    game = Game(
        id=game_id,
        sport=self.sport,
        season=self.season,
        home_team_id=home_result.canonical_id,
        away_team_id=away_result.canonical_id,
        stadium_id=stadium_id or "",
        game_date=raw.game_date,
        game_number=None,
        home_score=raw.home_score,
        away_score=raw.away_score,
        status=raw.status,
        source_url=raw.source_url,
        raw_home_team=raw.home_team_raw,
        raw_away_team=raw.away_team_raw,
        raw_stadium=raw.stadium_raw,
    )

    return game, review_items
|
||||
|
||||
def _get_abbreviation(self, team_id: str) -> str:
|
||||
"""Extract abbreviation from team ID."""
|
||||
parts = team_id.split("_")
|
||||
return parts[-1] if parts else ""
|
||||
|
||||
def scrape_teams(self) -> list[Team]:
    """Get all NHL teams from hardcoded mappings."""
    # NHL conference/division structure
    divisions = {
        "Atlantic": ("Eastern", ["BOS", "BUF", "DET", "FLA", "MTL", "OTT", "TB", "TOR"]),
        "Metropolitan": ("Eastern", ["CAR", "CBJ", "NJ", "NYI", "NYR", "PHI", "PIT", "WAS"]),
        "Central": ("Western", ["ARI", "CHI", "COL", "DAL", "MIN", "NSH", "STL", "WPG"]),
        "Pacific": ("Western", ["ANA", "CGY", "EDM", "LA", "SJ", "SEA", "VAN", "VGK"]),
    }

    # Reverse lookup: abbreviation -> (conference, division)
    placement: dict[str, tuple[str, str]] = {
        code: (conference, division)
        for division, (conference, codes) in divisions.items()
        for code in codes
    }

    teams: list[Team] = []
    seen: set[str] = set()

    for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("nhl", {}).items():
        if team_id in seen:
            continue
        seen.add(team_id)

        # Nickname is the last word of the full name, except for two-word
        # nicknames (Red Wings, Blue Jackets, Golden Knights, Maple Leafs).
        words = full_name.split()
        nickname = words[-1] if words else full_name
        if nickname in ["Wings", "Jackets", "Knights", "Leafs"]:
            nickname = " ".join(words[-2:])

        conference, division = placement.get(abbrev, (None, None))

        teams.append(
            Team(
                id=team_id,
                sport="nhl",
                city=city,
                name=nickname,
                full_name=full_name,
                abbreviation=abbrev,
                conference=conference,
                division=division,
                stadium_id=stadium_id,
            )
        )

    return teams
|
||||
|
||||
def scrape_stadiums(self) -> list[Stadium]:
    """Get all NHL stadiums from hardcoded mappings."""
    # All NHL arenas are indoor ice rinks, hence the fixed surface/roof.
    return [
        Stadium(
            id=stadium_id,
            sport="nhl",
            name=info.name,
            city=info.city,
            state=info.state,
            country=info.country,
            latitude=info.latitude,
            longitude=info.longitude,
            surface="ice",
            roof_type="dome",
        )
        for stadium_id, info in STADIUM_MAPPINGS.get("nhl", {}).items()
    ]
|
||||
|
||||
|
||||
def create_nhl_scraper(season: int) -> NHLScraper:
    """Factory function to create an NHL scraper for the given season year."""
    scraper = NHLScraper(season=season)
    return scraper
|
||||
374
sportstime_parser/scrapers/nwsl.py
Normal file
374
sportstime_parser/scrapers/nwsl.py
Normal file
@@ -0,0 +1,374 @@
|
||||
"""NWSL scraper implementation with multi-source fallback."""
|
||||
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
from ..models.game import Game
|
||||
from ..models.team import Team
|
||||
from ..models.stadium import Stadium
|
||||
from ..models.aliases import ManualReviewItem
|
||||
from ..normalizers.canonical_id import generate_game_id
|
||||
from ..normalizers.team_resolver import (
|
||||
TeamResolver,
|
||||
TEAM_MAPPINGS,
|
||||
get_team_resolver,
|
||||
)
|
||||
from ..normalizers.stadium_resolver import (
|
||||
StadiumResolver,
|
||||
STADIUM_MAPPINGS,
|
||||
get_stadium_resolver,
|
||||
)
|
||||
from ..utils.logging import get_logger, log_game, log_warning
|
||||
|
||||
|
||||
class NWSLScraper(BaseScraper):
    """NWSL schedule scraper with multi-source fallback.

    Sources (in priority order):
    1. ESPN API - Most reliable for NWSL
    2. NWSL official (via ESPN) - Backup option
    """

    def __init__(self, season: int, **kwargs):
        """Initialize NWSL scraper.

        Args:
            season: Season year (e.g., 2026 for 2026 season)
        """
        super().__init__("nwsl", season, **kwargs)
        self._team_resolver = get_team_resolver("nwsl")
        self._stadium_resolver = get_stadium_resolver("nwsl")

    def _get_sources(self) -> list[str]:
        """Return source list in priority order."""
        return ["espn"]

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build URL for a source."""
        if source != "espn":
            raise ValueError(f"Unknown source: {source}")
        date_str = kwargs.get("date", "")
        return f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.nwsl/scoreboard?dates={date_str}"

    def _get_season_months(self) -> list[tuple[int, int]]:
        """Get the months to scrape for NWSL season.

        NWSL season runs March through November.
        """
        # NWSL regular season + playoffs: March-Nov
        return [(self.season, month) for month in range(3, 12)]

    def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
        """Scrape games from a specific source."""
        if source != "espn":
            raise ValueError(f"Unknown source: {source}")
        return self._scrape_espn()

    def _scrape_espn(self) -> list[RawGameData]:
        """Scrape games from ESPN API using a single date-range query."""
        # Span the entire season (March-November) in one request.
        months = self._get_season_months()
        start_year, start_month = months[0]
        end_year, end_month = months[-1]

        start_date = date(start_year, start_month, 1)
        # Last day of the final month = day before the first of the next.
        if end_month == 12:
            end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
        else:
            end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)

        date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
        url = f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.nwsl/scoreboard?limit=1000&dates={date_range}"
        self._logger.info(f"Fetching NWSL schedule: {date_range}")

        try:
            payload = self.session.get_json(url)
            return self._parse_espn_response(payload, url)
        except Exception as e:
            self._logger.error(f"ESPN error: {e}")
            return []

    def _parse_espn_response(
        self,
        data: dict,
        source_url: str,
    ) -> list[RawGameData]:
        """Convert an ESPN scoreboard payload into raw game records."""
        parsed: list[RawGameData] = []

        for event in data.get("events", []):
            try:
                record = self._parse_espn_event(event, source_url)
            except Exception as e:
                # Skip malformed events rather than failing the response.
                self._logger.debug(f"Failed to parse ESPN event: {e}")
                continue
            if record:
                parsed.append(record)

        return parsed

    def _parse_espn_event(
        self,
        event: dict,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Translate one ESPN event into RawGameData, or None if unusable."""
        # A date is mandatory; ESPN emits ISO-8601 with a trailing "Z".
        date_str = event.get("date", "")
        if not date_str:
            return None
        try:
            game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
        except ValueError:
            return None

        competitions = event.get("competitions", [])
        if not competitions:
            return None
        competition = competitions[0]

        competitors = competition.get("competitors", [])
        if len(competitors) != 2:
            return None

        home_team = None
        away_team = None
        home_score = None
        away_score = None

        for competitor in competitors:
            display_name = competitor.get("team", {}).get("displayName", "")
            score = competitor.get("score")
            # Scores arrive as strings; empty/missing values stay None.
            if score:
                try:
                    score = int(score)
                except (ValueError, TypeError):
                    score = None
            if competitor.get("homeAway") == "home":
                home_team, home_score = display_name, score
            else:
                away_team, away_score = display_name, score

        if not home_team or not away_team:
            return None

        stadium = competition.get("venue", {}).get("fullName")

        # Map ESPN status codes onto our canonical vocabulary.
        status_name = competition.get("status", {}).get("type", {}).get("name", "").lower()
        status_lookup = {
            "status_final": "final",
            "status_postponed": "postponed",
            "status_canceled": "cancelled",
        }
        status = status_lookup.get(status_name, "scheduled")

        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=stadium,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )

    def _normalize_games(
        self,
        raw_games: list[RawGameData],
    ) -> tuple[list[Game], list[ManualReviewItem]]:
        """Normalize raw games to Game objects with canonical IDs."""
        games: list[Game] = []
        review_items: list[ManualReviewItem] = []

        for raw in raw_games:
            normalized, pending_reviews = self._normalize_single_game(raw)

            if normalized is not None:
                games.append(normalized)
                log_game(
                    self.sport,
                    normalized.id,
                    normalized.home_team_id,
                    normalized.away_team_id,
                    normalized.game_date.strftime("%Y-%m-%d"),
                    normalized.status,
                )

            review_items.extend(pending_reviews)

        return games, review_items

    def _normalize_single_game(
        self,
        raw: RawGameData,
    ) -> tuple[Optional[Game], list[ManualReviewItem]]:
        """Normalize a single raw game; None when a team cannot be resolved."""
        review_items: list[ManualReviewItem] = []

        # Resolve home team
        home_result = self._team_resolver.resolve(
            raw.home_team_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )
        if home_result.review_item:
            review_items.append(home_result.review_item)
        if not home_result.canonical_id:
            log_warning(f"Could not resolve home team: {raw.home_team_raw}")
            return None, review_items

        # Resolve away team
        away_result = self._team_resolver.resolve(
            raw.away_team_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )
        if away_result.review_item:
            review_items.append(away_result.review_item)
        if not away_result.canonical_id:
            log_warning(f"Could not resolve away team: {raw.away_team_raw}")
            return None, review_items

        # Resolve stadium (the source may omit venue data entirely)
        stadium_id = None
        if raw.stadium_raw:
            stadium_result = self._stadium_resolver.resolve(
                raw.stadium_raw,
                check_date=raw.game_date.date(),
                source_url=raw.source_url,
            )
            if stadium_result.review_item:
                review_items.append(stadium_result.review_item)
            stadium_id = stadium_result.canonical_id

        # Generate canonical game ID from team abbreviations
        game_id = generate_game_id(
            sport=self.sport,
            season=self.season,
            away_abbrev=self._get_abbreviation(away_result.canonical_id),
            home_abbrev=self._get_abbreviation(home_result.canonical_id),
            game_date=raw.game_date,
            game_number=None,
        )

        game = Game(
            id=game_id,
            sport=self.sport,
            season=self.season,
            home_team_id=home_result.canonical_id,
            away_team_id=away_result.canonical_id,
            stadium_id=stadium_id or "",
            game_date=raw.game_date,
            game_number=None,
            home_score=raw.home_score,
            away_score=raw.away_score,
            status=raw.status,
            source_url=raw.source_url,
            raw_home_team=raw.home_team_raw,
            raw_away_team=raw.away_team_raw,
            raw_stadium=raw.stadium_raw,
        )

        return game, review_items

    def _get_abbreviation(self, team_id: str) -> str:
        """Return the trailing underscore-delimited segment of a team ID."""
        return team_id.rpartition("_")[2]

    def scrape_teams(self) -> list[Team]:
        """Get all NWSL teams from hardcoded mappings."""
        teams: list[Team] = []
        seen: set[str] = set()

        for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("nwsl", {}).items():
            if team_id in seen:
                continue
            seen.add(team_id)

            teams.append(
                Team(
                    id=team_id,
                    sport="nwsl",
                    city=city,
                    name=full_name,  # NWSL club names are used whole
                    full_name=full_name,
                    abbreviation=abbrev,
                    conference=None,  # NWSL uses single table
                    division=None,
                    stadium_id=stadium_id,
                )
            )

        return teams

    def scrape_stadiums(self) -> list[Stadium]:
        """Get all NWSL stadiums from hardcoded mappings."""
        return [
            Stadium(
                id=stadium_id,
                sport="nwsl",
                name=info.name,
                city=info.city,
                state=info.state,
                country=info.country,
                latitude=info.latitude,
                longitude=info.longitude,
                surface="grass",
                roof_type="open",
            )
            for stadium_id, info in STADIUM_MAPPINGS.get("nwsl", {}).items()
        ]
|
||||
|
||||
|
||||
def create_nwsl_scraper(season: int) -> NWSLScraper:
    """Factory function to create an NWSL scraper for the given season year."""
    scraper = NWSLScraper(season=season)
    return scraper
|
||||
375
sportstime_parser/scrapers/wnba.py
Normal file
375
sportstime_parser/scrapers/wnba.py
Normal file
@@ -0,0 +1,375 @@
|
||||
"""WNBA scraper implementation with multi-source fallback."""
|
||||
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
from ..models.game import Game
|
||||
from ..models.team import Team
|
||||
from ..models.stadium import Stadium
|
||||
from ..models.aliases import ManualReviewItem
|
||||
from ..normalizers.canonical_id import generate_game_id
|
||||
from ..normalizers.team_resolver import (
|
||||
TeamResolver,
|
||||
TEAM_MAPPINGS,
|
||||
get_team_resolver,
|
||||
)
|
||||
from ..normalizers.stadium_resolver import (
|
||||
StadiumResolver,
|
||||
STADIUM_MAPPINGS,
|
||||
get_stadium_resolver,
|
||||
)
|
||||
from ..utils.logging import get_logger, log_game, log_warning
|
||||
|
||||
|
||||
class WNBAScraper(BaseScraper):
    """WNBA schedule scraper with multi-source fallback.

    Sources (in priority order):
    1. ESPN API - Most reliable for WNBA
    2. WNBA official (via ESPN) - Backup option
    """

    def __init__(self, season: int, **kwargs):
        """Initialize WNBA scraper.

        Args:
            season: Season year (e.g., 2026 for 2026 season)
        """
        super().__init__("wnba", season, **kwargs)
        self._team_resolver = get_team_resolver("wnba")
        self._stadium_resolver = get_stadium_resolver("wnba")

    def _get_sources(self) -> list[str]:
        """Return source list in priority order."""
        return ["espn"]

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build URL for a source."""
        if source != "espn":
            raise ValueError(f"Unknown source: {source}")
        date_str = kwargs.get("date", "")
        return f"https://site.api.espn.com/apis/site/v2/sports/basketball/wnba/scoreboard?dates={date_str}"

    def _get_season_months(self) -> list[tuple[int, int]]:
        """Get the months to scrape for WNBA season.

        WNBA season runs May through September/October.
        """
        # WNBA regular season + playoffs: May-Oct
        return [(self.season, month) for month in range(5, 11)]

    def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
        """Scrape games from a specific source."""
        if source != "espn":
            raise ValueError(f"Unknown source: {source}")
        return self._scrape_espn()

    def _scrape_espn(self) -> list[RawGameData]:
        """Scrape games from ESPN API using a single date-range query."""
        # Span the entire season (May-October) in one request.
        months = self._get_season_months()
        start_year, start_month = months[0]
        end_year, end_month = months[-1]

        start_date = date(start_year, start_month, 1)
        # Last day of the final month = day before the first of the next.
        if end_month == 12:
            end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
        else:
            end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)

        date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
        url = f"https://site.api.espn.com/apis/site/v2/sports/basketball/wnba/scoreboard?limit=1000&dates={date_range}"
        self._logger.info(f"Fetching WNBA schedule: {date_range}")

        try:
            payload = self.session.get_json(url)
            return self._parse_espn_response(payload, url)
        except Exception as e:
            self._logger.error(f"ESPN error: {e}")
            return []

    def _parse_espn_response(
        self,
        data: dict,
        source_url: str,
    ) -> list[RawGameData]:
        """Convert an ESPN scoreboard payload into raw game records."""
        parsed: list[RawGameData] = []

        for event in data.get("events", []):
            try:
                record = self._parse_espn_event(event, source_url)
            except Exception as e:
                # Skip malformed events rather than failing the response.
                self._logger.debug(f"Failed to parse ESPN event: {e}")
                continue
            if record:
                parsed.append(record)

        return parsed

    def _parse_espn_event(
        self,
        event: dict,
        source_url: str,
    ) -> Optional[RawGameData]:
        """Translate one ESPN event into RawGameData, or None if unusable."""
        # A date is mandatory; ESPN emits ISO-8601 with a trailing "Z".
        date_str = event.get("date", "")
        if not date_str:
            return None
        try:
            game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
        except ValueError:
            return None

        competitions = event.get("competitions", [])
        if not competitions:
            return None
        competition = competitions[0]

        competitors = competition.get("competitors", [])
        if len(competitors) != 2:
            return None

        home_team = None
        away_team = None
        home_score = None
        away_score = None

        for competitor in competitors:
            display_name = competitor.get("team", {}).get("displayName", "")
            score = competitor.get("score")
            # Scores arrive as strings; empty/missing values stay None.
            if score:
                try:
                    score = int(score)
                except (ValueError, TypeError):
                    score = None
            if competitor.get("homeAway") == "home":
                home_team, home_score = display_name, score
            else:
                away_team, away_score = display_name, score

        if not home_team or not away_team:
            return None

        stadium = competition.get("venue", {}).get("fullName")

        # Map ESPN status codes onto our canonical vocabulary.
        status_name = competition.get("status", {}).get("type", {}).get("name", "").lower()
        status_lookup = {
            "status_final": "final",
            "status_postponed": "postponed",
            "status_canceled": "cancelled",
        }
        status = status_lookup.get(status_name, "scheduled")

        return RawGameData(
            game_date=game_date,
            home_team_raw=home_team,
            away_team_raw=away_team,
            stadium_raw=stadium,
            home_score=home_score,
            away_score=away_score,
            status=status,
            source_url=source_url,
        )

    def _normalize_games(
        self,
        raw_games: list[RawGameData],
    ) -> tuple[list[Game], list[ManualReviewItem]]:
        """Normalize raw games to Game objects with canonical IDs."""
        games: list[Game] = []
        review_items: list[ManualReviewItem] = []

        for raw in raw_games:
            normalized, pending_reviews = self._normalize_single_game(raw)

            if normalized is not None:
                games.append(normalized)
                log_game(
                    self.sport,
                    normalized.id,
                    normalized.home_team_id,
                    normalized.away_team_id,
                    normalized.game_date.strftime("%Y-%m-%d"),
                    normalized.status,
                )

            review_items.extend(pending_reviews)

        return games, review_items

    def _normalize_single_game(
        self,
        raw: RawGameData,
    ) -> tuple[Optional[Game], list[ManualReviewItem]]:
        """Normalize a single raw game; None when a team cannot be resolved."""
        review_items: list[ManualReviewItem] = []

        # Resolve home team
        home_result = self._team_resolver.resolve(
            raw.home_team_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )
        if home_result.review_item:
            review_items.append(home_result.review_item)
        if not home_result.canonical_id:
            log_warning(f"Could not resolve home team: {raw.home_team_raw}")
            return None, review_items

        # Resolve away team
        away_result = self._team_resolver.resolve(
            raw.away_team_raw,
            check_date=raw.game_date.date(),
            source_url=raw.source_url,
        )
        if away_result.review_item:
            review_items.append(away_result.review_item)
        if not away_result.canonical_id:
            log_warning(f"Could not resolve away team: {raw.away_team_raw}")
            return None, review_items

        # Resolve stadium (the source may omit venue data entirely)
        stadium_id = None
        if raw.stadium_raw:
            stadium_result = self._stadium_resolver.resolve(
                raw.stadium_raw,
                check_date=raw.game_date.date(),
                source_url=raw.source_url,
            )
            if stadium_result.review_item:
                review_items.append(stadium_result.review_item)
            stadium_id = stadium_result.canonical_id

        # Generate canonical game ID from team abbreviations
        game_id = generate_game_id(
            sport=self.sport,
            season=self.season,
            away_abbrev=self._get_abbreviation(away_result.canonical_id),
            home_abbrev=self._get_abbreviation(home_result.canonical_id),
            game_date=raw.game_date,
            game_number=None,
        )

        game = Game(
            id=game_id,
            sport=self.sport,
            season=self.season,
            home_team_id=home_result.canonical_id,
            away_team_id=away_result.canonical_id,
            stadium_id=stadium_id or "",
            game_date=raw.game_date,
            game_number=None,
            home_score=raw.home_score,
            away_score=raw.away_score,
            status=raw.status,
            source_url=raw.source_url,
            raw_home_team=raw.home_team_raw,
            raw_away_team=raw.away_team_raw,
            raw_stadium=raw.stadium_raw,
        )

        return game, review_items

    def _get_abbreviation(self, team_id: str) -> str:
        """Return the trailing underscore-delimited segment of a team ID."""
        return team_id.rpartition("_")[2]

    def scrape_teams(self) -> list[Team]:
        """Get all WNBA teams from hardcoded mappings."""
        teams: list[Team] = []
        seen: set[str] = set()

        for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("wnba", {}).items():
            if team_id in seen:
                continue
            seen.add(team_id)

            # Nickname is the last word of the full name.
            words = full_name.split()
            nickname = words[-1] if words else full_name

            teams.append(
                Team(
                    id=team_id,
                    sport="wnba",
                    city=city,
                    name=nickname,
                    full_name=full_name,
                    abbreviation=abbrev,
                    conference=None,  # WNBA uses single table now
                    division=None,
                    stadium_id=stadium_id,
                )
            )

        return teams

    def scrape_stadiums(self) -> list[Stadium]:
        """Get all WNBA stadiums from hardcoded mappings."""
        return [
            Stadium(
                id=stadium_id,
                sport="wnba",
                name=info.name,
                city=info.city,
                state=info.state,
                country=info.country,
                latitude=info.latitude,
                longitude=info.longitude,
                surface="hardwood",
                roof_type="dome",
            )
            for stadium_id, info in STADIUM_MAPPINGS.get("wnba", {}).items()
        ]
|
||||
|
||||
|
||||
def create_wnba_scraper(season: int) -> WNBAScraper:
    """Factory function to create a WNBA scraper for the given season year."""
    scraper = WNBAScraper(season=season)
    return scraper
|
||||
Reference in New Issue
Block a user