"""NFL scraper implementation with multi-source fallback.""" from datetime import datetime, date from typing import Optional from bs4 import BeautifulSoup from .base import BaseScraper, RawGameData, ScrapeResult from ..models.game import Game from ..models.team import Team from ..models.stadium import Stadium from ..models.aliases import ManualReviewItem from ..normalizers.canonical_id import generate_game_id from ..normalizers.team_resolver import ( TeamResolver, TEAM_MAPPINGS, get_team_resolver, ) from ..normalizers.stadium_resolver import ( StadiumResolver, STADIUM_MAPPINGS, get_stadium_resolver, ) from ..utils.logging import get_logger, log_game, log_warning # International game locations to filter out INTERNATIONAL_LOCATIONS = {"London", "Mexico City", "Frankfurt", "Munich", "São Paulo"} class NFLScraper(BaseScraper): """NFL schedule scraper with multi-source fallback. Sources (in priority order): 1. ESPN API - Most reliable for NFL 2. Pro-Football-Reference - Complete historical data 3. CBS Sports - Backup option """ def __init__(self, season: int, **kwargs): """Initialize NFL scraper. Args: season: Season year (e.g., 2025 for 2025 season) """ super().__init__("nfl", season, **kwargs) self._team_resolver = get_team_resolver("nfl") self._stadium_resolver = get_stadium_resolver("nfl") def _get_sources(self) -> list[str]: """Return source list in priority order.""" return ["espn", "pro_football_reference", "cbs"] def _get_source_url(self, source: str, **kwargs) -> str: """Build URL for a source.""" if source == "espn": week = kwargs.get("week", 1) season_type = kwargs.get("season_type", 2) # 1=preseason, 2=regular, 3=postseason return f"https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?seasontype={season_type}&week={week}" elif source == "pro_football_reference": return f"https://www.pro-football-reference.com/years/{self.season}/games.htm" elif source == "cbs": return "https://www.cbssports.com/nfl/schedule/" raise ValueError(f"Unknown source: {source}") def _get_season_months(self) -> list[tuple[int, int]]: """Get the months to scrape for NFL season. NFL season runs September through February. """ months = [] # Regular season months for month in range(9, 13): # Sept-Dec months.append((self.season, month)) # Playoff months for month in range(1, 3): # Jan-Feb months.append((self.season + 1, month)) return months def _scrape_games_from_source(self, source: str) -> list[RawGameData]: """Scrape games from a specific source.""" if source == "espn": return self._scrape_espn() elif source == "pro_football_reference": return self._scrape_pro_football_reference() elif source == "cbs": return self._scrape_cbs() else: raise ValueError(f"Unknown source: {source}") def _scrape_espn(self) -> list[RawGameData]: """Scrape games from ESPN API. ESPN NFL API uses week numbers. """ all_games: list[RawGameData] = [] # Scrape preseason (4 weeks) for week in range(1, 5): try: url = self._get_source_url("espn", week=week, season_type=1) data = self.session.get_json(url) games = self._parse_espn_response(data, url) all_games.extend(games) except Exception as e: self._logger.debug(f"ESPN preseason week {week} error: {e}") continue # Scrape regular season (18 weeks) for week in range(1, 19): try: url = self._get_source_url("espn", week=week, season_type=2) data = self.session.get_json(url) games = self._parse_espn_response(data, url) all_games.extend(games) self._logger.debug(f"Found {len(games)} games in week {week}") except Exception as e: self._logger.debug(f"ESPN regular season week {week} error: {e}") continue # Scrape postseason (4 rounds) for week in range(1, 5): try: url = self._get_source_url("espn", week=week, season_type=3) data = self.session.get_json(url) games = self._parse_espn_response(data, url) all_games.extend(games) except Exception as e: self._logger.debug(f"ESPN postseason week {week} error: {e}") continue return all_games def _parse_espn_response( self, data: dict, source_url: str, ) -> list[RawGameData]: """Parse ESPN API response.""" games: list[RawGameData] = [] events = data.get("events", []) for event in events: try: game = self._parse_espn_event(event, source_url) if game: # Filter international games if game.stadium_raw and any(loc in game.stadium_raw for loc in INTERNATIONAL_LOCATIONS): self._logger.debug(f"Skipping international game: {game.stadium_raw}") continue games.append(game) except Exception as e: self._logger.debug(f"Failed to parse ESPN event: {e}") continue return games def _parse_espn_event( self, event: dict, source_url: str, ) -> Optional[RawGameData]: """Parse a single ESPN event.""" # Get date date_str = event.get("date", "") if not date_str: return None try: game_date = datetime.fromisoformat(date_str.replace("Z", "+00:00")) except ValueError: return None # Get competitions competitions = event.get("competitions", []) if not competitions: return None competition = competitions[0] # Check for neutral site (international games) if competition.get("neutralSite"): venue = competition.get("venue", {}) venue_city = venue.get("address", {}).get("city", "") if venue_city in INTERNATIONAL_LOCATIONS: return None # Get teams competitors = competition.get("competitors", []) if len(competitors) != 2: return None home_team = None away_team = None home_score = None away_score = None for competitor in competitors: team_info = competitor.get("team", {}) team_name = team_info.get("displayName", "") is_home = competitor.get("homeAway") == "home" score = competitor.get("score") if score: try: score = int(score) except (ValueError, TypeError): score = None if is_home: home_team = team_name home_score = score else: away_team = team_name away_score = score if not home_team or not away_team: return None # Get venue venue = competition.get("venue", {}) stadium = venue.get("fullName") # Get status status_info = competition.get("status", {}) status_type = status_info.get("type", {}) status_name = status_type.get("name", "").lower() if status_name == "status_final": status = "final" elif status_name == "status_postponed": status = "postponed" elif status_name == "status_canceled": status = "cancelled" else: status = "scheduled" return RawGameData( game_date=game_date, home_team_raw=home_team, away_team_raw=away_team, stadium_raw=stadium, home_score=home_score, away_score=away_score, status=status, source_url=source_url, ) def _scrape_pro_football_reference(self) -> list[RawGameData]: """Scrape games from Pro-Football-Reference. PFR has a single schedule page per season. """ url = self._get_source_url("pro_football_reference") try: html = self.session.get_html(url) games = self._parse_pfr(html, url) return games except Exception as e: self._logger.error(f"Failed to scrape Pro-Football-Reference: {e}") raise def _parse_pfr( self, html: str, source_url: str, ) -> list[RawGameData]: """Parse Pro-Football-Reference schedule HTML.""" soup = BeautifulSoup(html, "lxml") games: list[RawGameData] = [] # Find the schedule table table = soup.find("table", id="games") if not table: return games tbody = table.find("tbody") if not tbody: return games for row in tbody.find_all("tr"): # Skip header rows if row.get("class") and "thead" in row.get("class", []): continue try: game = self._parse_pfr_row(row, source_url) if game: games.append(game) except Exception as e: self._logger.debug(f"Failed to parse PFR row: {e}") continue return games def _parse_pfr_row( self, row, source_url: str, ) -> Optional[RawGameData]: """Parse a single Pro-Football-Reference table row.""" # Get date date_cell = row.find("td", {"data-stat": "game_date"}) if not date_cell: return None date_text = date_cell.get_text(strip=True) if not date_text: return None # Parse date try: # PFR uses YYYY-MM-DD format game_date = datetime.strptime(date_text, "%Y-%m-%d") except ValueError: return None # Get teams winner_cell = row.find("td", {"data-stat": "winner"}) loser_cell = row.find("td", {"data-stat": "loser"}) if not winner_cell or not loser_cell: return None winner = winner_cell.get_text(strip=True) loser = loser_cell.get_text(strip=True) if not winner or not loser: return None # Determine home/away based on @ symbol game_location = row.find("td", {"data-stat": "game_location"}) at_home = game_location and "@" in game_location.get_text() if at_home: home_team = loser away_team = winner else: home_team = winner away_team = loser # Get scores pts_win_cell = row.find("td", {"data-stat": "pts_win"}) pts_lose_cell = row.find("td", {"data-stat": "pts_lose"}) home_score = None away_score = None if pts_win_cell and pts_lose_cell: try: winner_pts = int(pts_win_cell.get_text(strip=True)) loser_pts = int(pts_lose_cell.get_text(strip=True)) if at_home: home_score = loser_pts away_score = winner_pts else: home_score = winner_pts away_score = loser_pts except ValueError: pass # Determine status status = "final" if home_score is not None else "scheduled" return RawGameData( game_date=game_date, home_team_raw=home_team, away_team_raw=away_team, stadium_raw=None, # PFR doesn't always have stadium home_score=home_score, away_score=away_score, status=status, source_url=source_url, ) def _scrape_cbs(self) -> list[RawGameData]: """Scrape games from CBS Sports.""" raise NotImplementedError("CBS scraper not implemented") def _normalize_games( self, raw_games: list[RawGameData], ) -> tuple[list[Game], list[ManualReviewItem]]: """Normalize raw games to Game objects with canonical IDs.""" games: list[Game] = [] review_items: list[ManualReviewItem] = [] for raw in raw_games: game, item_reviews = self._normalize_single_game(raw) if game: games.append(game) log_game( self.sport, game.id, game.home_team_id, game.away_team_id, game.game_date.strftime("%Y-%m-%d"), game.status, ) review_items.extend(item_reviews) return games, review_items def _normalize_single_game( self, raw: RawGameData, ) -> tuple[Optional[Game], list[ManualReviewItem]]: """Normalize a single raw game.""" review_items: list[ManualReviewItem] = [] # Resolve home team home_result = self._team_resolver.resolve( raw.home_team_raw, check_date=raw.game_date.date(), source_url=raw.source_url, ) if home_result.review_item: review_items.append(home_result.review_item) if not home_result.canonical_id: log_warning(f"Could not resolve home team: {raw.home_team_raw}") return None, review_items # Resolve away team away_result = self._team_resolver.resolve( raw.away_team_raw, check_date=raw.game_date.date(), source_url=raw.source_url, ) if away_result.review_item: review_items.append(away_result.review_item) if not away_result.canonical_id: log_warning(f"Could not resolve away team: {raw.away_team_raw}") return None, review_items # Resolve stadium stadium_id = None if raw.stadium_raw: stadium_result = self._stadium_resolver.resolve( raw.stadium_raw, check_date=raw.game_date.date(), source_url=raw.source_url, ) if stadium_result.review_item: review_items.append(stadium_result.review_item) stadium_id = stadium_result.canonical_id # Get abbreviations for game ID home_abbrev = self._get_abbreviation(home_result.canonical_id) away_abbrev = self._get_abbreviation(away_result.canonical_id) # Generate canonical game ID game_id = generate_game_id( sport=self.sport, season=self.season, away_abbrev=away_abbrev, home_abbrev=home_abbrev, game_date=raw.game_date, game_number=None, # NFL doesn't have doubleheaders ) game = Game( id=game_id, sport=self.sport, season=self.season, home_team_id=home_result.canonical_id, away_team_id=away_result.canonical_id, stadium_id=stadium_id or "", game_date=raw.game_date, game_number=None, home_score=raw.home_score, away_score=raw.away_score, status=raw.status, source_url=raw.source_url, raw_home_team=raw.home_team_raw, raw_away_team=raw.away_team_raw, raw_stadium=raw.stadium_raw, ) return game, review_items def _get_abbreviation(self, team_id: str) -> str: """Extract abbreviation from team ID.""" parts = team_id.split("_") return parts[-1] if parts else "" def scrape_teams(self) -> list[Team]: """Get all NFL teams from hardcoded mappings.""" teams: list[Team] = [] seen: set[str] = set() # NFL conference/division structure divisions = { "AFC East": ("AFC", ["BUF", "MIA", "NE", "NYJ"]), "AFC North": ("AFC", ["BAL", "CIN", "CLE", "PIT"]), "AFC South": ("AFC", ["HOU", "IND", "JAX", "TEN"]), "AFC West": ("AFC", ["DEN", "KC", "LV", "LAC"]), "NFC East": ("NFC", ["DAL", "NYG", "PHI", "WAS"]), "NFC North": ("NFC", ["CHI", "DET", "GB", "MIN"]), "NFC South": ("NFC", ["ATL", "CAR", "NO", "TB"]), "NFC West": ("NFC", ["ARI", "LAR", "SF", "SEA"]), } # Build reverse lookup team_divisions: dict[str, tuple[str, str]] = {} for div, (conf, abbrevs) in divisions.items(): for abbrev in abbrevs: team_divisions[abbrev] = (conf, div) for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("nfl", {}).items(): if team_id in seen: continue seen.add(team_id) # Parse team name parts = full_name.split() team_name = parts[-1] if parts else full_name # Get conference and division conf, div = team_divisions.get(abbrev, (None, None)) team = Team( id=team_id, sport="nfl", city=city, name=team_name, full_name=full_name, abbreviation=abbrev, conference=conf, division=div, stadium_id=stadium_id, ) teams.append(team) return teams def scrape_stadiums(self) -> list[Stadium]: """Get all NFL stadiums from hardcoded mappings.""" stadiums: list[Stadium] = [] nfl_stadiums = STADIUM_MAPPINGS.get("nfl", {}) for stadium_id, info in nfl_stadiums.items(): stadium = Stadium( id=stadium_id, sport="nfl", name=info.name, city=info.city, state=info.state, country=info.country, latitude=info.latitude, longitude=info.longitude, surface="turf", # Many NFL stadiums roof_type="open", # Most outdoor ) stadiums.append(stadium) return stadiums def create_nfl_scraper(season: int) -> NFLScraper: """Factory function to create an NFL scraper.""" return NFLScraper(season=season)