Files
Sportstime/Scripts/sportstime_parser/scrapers/nfl.py
Trey t 4d097883a6 fix(data): add timezone handling for Sports-Reference scrapers and new stadiums
- Add ET timezone (America/New_York) to all Sports-Reference scrapers:
  - NBA: Basketball-Reference times parsed as ET
  - NFL: Pro-Football-Reference times parsed as ET
  - NHL: Hockey-Reference times parsed as ET
  - MLB: Baseball-Reference times parsed as ET
- Document source timezones in scraper docstrings
- Add 11 new stadiums to STADIUM_MAPPINGS:
  - NFL: 5 international venues (Corinthians Arena, Croke Park,
    Olympic Stadium Berlin, Santiago Bernabéu, Tom Benson Hall of Fame)
  - MLS: 4 alternate venues (Miami Freedom Park, Citi Field,
    LA Memorial Coliseum, M&T Bank Stadium)
  - NWSL: 2 alternate venues (Northwestern Medicine Field, ONE Spokane)
- Add 15 stadium aliases for MLS/NWSL team-based lookups
- Fix CanonicalSyncService to sync timezone identifier to SwiftData
- Update debug logging to use stadium timezone for display

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-21 16:04:45 -06:00

606 lines
20 KiB
Python

"""NFL scraper implementation with multi-source fallback."""
from datetime import datetime, date
from typing import Optional
from zoneinfo import ZoneInfo
from bs4 import BeautifulSoup
from .base import BaseScraper, RawGameData, ScrapeResult
from ..models.game import Game
from ..models.team import Team
from ..models.stadium import Stadium
from ..models.aliases import ManualReviewItem
from ..normalizers.canonical_id import generate_game_id
from ..normalizers.team_resolver import (
TeamResolver,
TEAM_MAPPINGS,
get_team_resolver,
)
from ..normalizers.stadium_resolver import (
StadiumResolver,
STADIUM_MAPPINGS,
get_stadium_resolver,
)
from ..utils.logging import get_logger, log_game, log_warning
# International game locations to filter out.
# These are matched two ways: as substrings of ESPN venue names (so "London"
# also catches "London Stadium") and as exact ESPN venue cities for
# neutral-site games.
INTERNATIONAL_LOCATIONS = {"London", "Mexico City", "Frankfurt", "Munich", "São Paulo"}
class NFLScraper(BaseScraper):
    """NFL schedule scraper with multi-source fallback.

    Sources (in priority order):
    1. ESPN API - Most reliable for NFL (JSON scoreboard endpoint, by week)
    2. Pro-Football-Reference - Complete historical data (HTML schedule table)
    3. CBS Sports - Backup option (not implemented)

    Source Timezones:
    - espn: UTC - ISO 8601 format with "Z" suffix
    - pro_football_reference: Eastern Time (ET) - times displayed as "8:20PM"
    - cbs: Not implemented
    """
def __init__(self, season: int, **kwargs):
    """Create an NFL scraper for one season.

    Args:
        season: Season year (e.g., 2025 for the 2025 season)
    """
    super().__init__("nfl", season, **kwargs)
    # Resolvers map raw source strings to canonical team/stadium IDs.
    self._stadium_resolver = get_stadium_resolver("nfl")
    self._team_resolver = get_team_resolver("nfl")
def _get_sources(self) -> list[str]:
"""Return source list in priority order."""
# CBS scraper not yet implemented - TODO for future
return ["espn", "pro_football_reference"]
def _get_source_url(self, source: str, **kwargs) -> str:
"""Build URL for a source."""
if source == "espn":
week = kwargs.get("week", 1)
season_type = kwargs.get("season_type", 2) # 1=preseason, 2=regular, 3=postseason
return f"https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?seasontype={season_type}&week={week}"
elif source == "pro_football_reference":
return f"https://www.pro-football-reference.com/years/{self.season}/games.htm"
elif source == "cbs":
return "https://www.cbssports.com/nfl/schedule/"
raise ValueError(f"Unknown source: {source}")
def _get_season_months(self) -> list[tuple[int, int]]:
"""Get the months to scrape for NFL season.
NFL season runs September through February.
"""
months = []
# Regular season months
for month in range(9, 13): # Sept-Dec
months.append((self.season, month))
# Playoff months
for month in range(1, 3): # Jan-Feb
months.append((self.season + 1, month))
return months
def _scrape_games_from_source(self, source: str) -> list[RawGameData]:
    """Dispatch to the scraper implementation for the given source.

    Raises:
        ValueError: If the source name is not recognized.
    """
    handlers = {
        "espn": self._scrape_espn,
        "pro_football_reference": self._scrape_pro_football_reference,
        "cbs": self._scrape_cbs,
    }
    handler = handlers.get(source)
    if handler is None:
        raise ValueError(f"Unknown source: {source}")
    return handler()
def _scrape_espn(self) -> list[RawGameData]:
    """Scrape games from the ESPN API.

    ESPN's NFL scoreboard endpoint is keyed by week number within a
    season type (1=preseason, 2=regular season, 3=postseason).  Each
    phase is fetched week by week; a failure in any single week is
    logged at debug level and skipped so one bad response does not
    abort the whole run.

    Returns:
        All raw games parsed across preseason (4 weeks), regular
        season (18 weeks), and postseason (4 rounds).
    """
    all_games: list[RawGameData] = []
    # (season_type, number of weeks, label used in log messages)
    phases = (
        (1, 4, "preseason"),
        (2, 18, "regular season"),
        (3, 4, "postseason"),
    )
    for season_type, num_weeks, label in phases:
        for week in range(1, num_weeks + 1):
            try:
                url = self._get_source_url("espn", week=week, season_type=season_type)
                data = self.session.get_json(url)
                games = self._parse_espn_response(data, url)
                all_games.extend(games)
                if season_type == 2:
                    # Per-week counts are only logged for the regular season.
                    self._logger.debug(f"Found {len(games)} games in week {week}")
            except Exception as e:
                self._logger.debug(f"ESPN {label} week {week} error: {e}")
                continue
    return all_games
def _parse_espn_response(
    self,
    data: dict,
    source_url: str,
) -> list[RawGameData]:
    """Parse an ESPN scoreboard payload into raw game records.

    Events that fail to parse are logged at debug level and skipped;
    games whose venue name contains a known international location are
    also dropped.
    """
    parsed: list[RawGameData] = []
    for event in data.get("events", []):
        try:
            game = self._parse_espn_event(event, source_url)
            if game is None:
                continue
            venue_name = game.stadium_raw
            if venue_name and any(loc in venue_name for loc in INTERNATIONAL_LOCATIONS):
                self._logger.debug(f"Skipping international game: {venue_name}")
                continue
            parsed.append(game)
        except Exception as e:
            self._logger.debug(f"Failed to parse ESPN event: {e}")
            continue
    return parsed
def _parse_espn_event(
    self,
    event: dict,
    source_url: str,
) -> Optional[RawGameData]:
    """Convert one ESPN event dict into RawGameData.

    Returns None for events that are missing required fields or that
    are neutral-site games held in a known international city.
    """
    # Kickoff time arrives as ISO 8601 with a "Z" suffix (UTC).
    raw_date = event.get("date", "")
    if not raw_date:
        return None
    try:
        game_date = datetime.fromisoformat(raw_date.replace("Z", "+00:00"))
    except ValueError:
        return None

    competitions = event.get("competitions", [])
    if not competitions:
        return None
    competition = competitions[0]

    # Neutral-site games at international cities are excluded entirely.
    if competition.get("neutralSite"):
        city = competition.get("venue", {}).get("address", {}).get("city", "")
        if city in INTERNATIONAL_LOCATIONS:
            return None

    competitors = competition.get("competitors", [])
    if len(competitors) != 2:
        return None

    home_team = away_team = None
    home_score = away_score = None
    for side in competitors:
        name = side.get("team", {}).get("displayName", "")
        score = side.get("score")
        if score:
            try:
                score = int(score)
            except (ValueError, TypeError):
                score = None
        if side.get("homeAway") == "home":
            home_team, home_score = name, score
        else:
            away_team, away_score = name, score
    if not home_team or not away_team:
        return None

    stadium = competition.get("venue", {}).get("fullName")

    # Map ESPN's status names onto our status vocabulary; anything
    # unrecognized is treated as a scheduled game.
    status_name = competition.get("status", {}).get("type", {}).get("name", "").lower()
    status_by_name = {
        "status_final": "final",
        "status_postponed": "postponed",
        "status_canceled": "cancelled",
    }
    status = status_by_name.get(status_name, "scheduled")

    return RawGameData(
        game_date=game_date,
        home_team_raw=home_team,
        away_team_raw=away_team,
        stadium_raw=stadium,
        home_score=home_score,
        away_score=away_score,
        status=status,
        source_url=source_url,
    )
def _scrape_pro_football_reference(self) -> list[RawGameData]:
    """Scrape games from Pro-Football-Reference.

    PFR publishes a single schedule page per season, so this is one
    fetch plus one parse.  Failures are logged at error level and
    re-raised to the caller.
    """
    url = self._get_source_url("pro_football_reference")
    try:
        html = self.session.get_html(url)
        return self._parse_pfr(html, url)
    except Exception as e:
        self._logger.error(f"Failed to scrape Pro-Football-Reference: {e}")
        raise
def _parse_pfr(
    self,
    html: str,
    source_url: str,
) -> list[RawGameData]:
    """Parse the Pro-Football-Reference schedule table into raw games.

    Rows that fail to parse are logged at debug level and skipped.
    """
    games: list[RawGameData] = []
    # The full-season schedule lives in the table with id="games".
    table = BeautifulSoup(html, "lxml").find("table", id="games")
    if not table:
        return games
    body = table.find("tbody")
    if not body:
        return games
    for row in body.find_all("tr"):
        # PFR repeats header rows inside tbody; skip them.
        if row.get("class") and "thead" in row.get("class", []):
            continue
        try:
            parsed = self._parse_pfr_row(row, source_url)
        except Exception as e:
            self._logger.debug(f"Failed to parse PFR row: {e}")
            continue
        if parsed:
            games.append(parsed)
    return games
def _parse_pfr_row(
    self,
    row,
    source_url: str,
) -> Optional[RawGameData]:
    """Parse a single Pro-Football-Reference schedule table row.

    Returns None for rows missing a date or either team name.

    All returned datetimes carry the America/New_York timezone
    (Pro-Football-Reference lists kickoff times in ET).  Rows without a
    parsable kickoff time default to midnight ET rather than a naive
    datetime, so aware and naive datetimes are never mixed downstream.
    """
    eastern = ZoneInfo("America/New_York")

    # Date column uses YYYY-MM-DD format.
    date_cell = row.find("td", {"data-stat": "game_date"})
    if not date_cell:
        return None
    date_text = date_cell.get_text(strip=True)
    if not date_text:
        return None
    try:
        # Fix: previously only rows with a parsable kickoff time got
        # tzinfo; now every row's datetime is ET-aware from the start.
        game_date = datetime.strptime(date_text, "%Y-%m-%d").replace(tzinfo=eastern)
    except ValueError:
        return None

    # Kickoff time column (format "8:20PM" or "1:00PM") - times are in ET.
    time_cell = row.find("td", {"data-stat": "gametime"})
    if time_cell:
        time_text = time_cell.get_text(strip=True)
        if time_text:
            try:
                # Normalize "8:20PM" (or lowercase "8:20pm") -> "8:20 PM"
                # so %I:%M %p can parse it.
                normalized = time_text.upper().replace("PM", " PM").replace("AM", " AM")
                kickoff = datetime.strptime(normalized, "%I:%M %p")
                game_date = game_date.replace(hour=kickoff.hour, minute=kickoff.minute)
            except ValueError:
                self._logger.debug(f"Could not parse time: {time_text}, using midnight")

    # Teams are given as winner/loser; the "@" marker in game_location
    # means the winner was the visiting team.
    winner_cell = row.find("td", {"data-stat": "winner"})
    loser_cell = row.find("td", {"data-stat": "loser"})
    if not winner_cell or not loser_cell:
        return None
    winner = winner_cell.get_text(strip=True)
    loser = loser_cell.get_text(strip=True)
    if not winner or not loser:
        return None
    game_location = row.find("td", {"data-stat": "game_location"})
    at_home = game_location and "@" in game_location.get_text()
    if at_home:
        home_team, away_team = loser, winner
    else:
        home_team, away_team = winner, loser

    # Scores are stored as winner/loser points; remap to home/away.
    home_score = None
    away_score = None
    pts_win_cell = row.find("td", {"data-stat": "pts_win"})
    pts_lose_cell = row.find("td", {"data-stat": "pts_lose"})
    if pts_win_cell and pts_lose_cell:
        try:
            winner_pts = int(pts_win_cell.get_text(strip=True))
            loser_pts = int(pts_lose_cell.get_text(strip=True))
        except ValueError:
            pass
        else:
            if at_home:
                home_score, away_score = loser_pts, winner_pts
            else:
                home_score, away_score = winner_pts, loser_pts

    # A populated score means the game has been played.
    status = "final" if home_score is not None else "scheduled"
    return RawGameData(
        game_date=game_date,
        home_team_raw=home_team,
        away_team_raw=away_team,
        stadium_raw=None,  # PFR doesn't always have stadium
        home_score=home_score,
        away_score=away_score,
        status=status,
        source_url=source_url,
    )
def _scrape_cbs(self) -> list[RawGameData]:
    """Scrape games from CBS Sports (placeholder; raises NotImplementedError)."""
    raise NotImplementedError("CBS scraper not implemented")
def _normalize_games(
    self,
    raw_games: list[RawGameData],
) -> tuple[list[Game], list[ManualReviewItem]]:
    """Normalize raw scraped games into canonical Game objects.

    Games whose teams cannot be resolved are dropped; manual-review
    items produced along the way are collected and returned alongside
    the successfully normalized games.
    """
    normalized: list[Game] = []
    reviews: list[ManualReviewItem] = []
    for raw in raw_games:
        game, game_reviews = self._normalize_single_game(raw)
        reviews.extend(game_reviews)
        if game is None:
            continue
        normalized.append(game)
        log_game(
            self.sport,
            game.id,
            game.home_team_id,
            game.away_team_id,
            game.game_date.strftime("%Y-%m-%d"),
            game.status,
        )
    return normalized, reviews
def _normalize_single_game(
    self,
    raw: RawGameData,
) -> tuple[Optional[Game], list[ManualReviewItem]]:
    """Normalize one raw game.

    Returns:
        A (game, review_items) pair.  The game is None if either team
        failed to resolve; review items collected up to the failure are
        still returned.
    """
    reviews: list[ManualReviewItem] = []
    game_day = raw.game_date.date()

    # Home team must resolve to a canonical ID or the game is dropped.
    home = self._team_resolver.resolve(
        raw.home_team_raw,
        check_date=game_day,
        source_url=raw.source_url,
    )
    if home.review_item:
        reviews.append(home.review_item)
    if not home.canonical_id:
        log_warning(f"Could not resolve home team: {raw.home_team_raw}")
        return None, reviews

    # Same requirement for the away team.
    away = self._team_resolver.resolve(
        raw.away_team_raw,
        check_date=game_day,
        source_url=raw.source_url,
    )
    if away.review_item:
        reviews.append(away.review_item)
    if not away.canonical_id:
        log_warning(f"Could not resolve away team: {raw.away_team_raw}")
        return None, reviews

    # Stadium resolution is best-effort; a miss never drops the game.
    stadium_id = None
    if raw.stadium_raw:
        stadium = self._stadium_resolver.resolve(
            raw.stadium_raw,
            check_date=game_day,
            source_url=raw.source_url,
        )
        if stadium.review_item:
            reviews.append(stadium.review_item)
        stadium_id = stadium.canonical_id

    # Canonical game ID is derived from team abbreviations plus the date.
    game_id = generate_game_id(
        sport=self.sport,
        season=self.season,
        away_abbrev=self._get_abbreviation(away.canonical_id),
        home_abbrev=self._get_abbreviation(home.canonical_id),
        game_date=raw.game_date,
        game_number=None,  # NFL doesn't have doubleheaders
    )
    game = Game(
        id=game_id,
        sport=self.sport,
        season=self.season,
        home_team_id=home.canonical_id,
        away_team_id=away.canonical_id,
        stadium_id=stadium_id or "",
        game_date=raw.game_date,
        game_number=None,
        home_score=raw.home_score,
        away_score=raw.away_score,
        status=raw.status,
        source_url=raw.source_url,
        raw_home_team=raw.home_team_raw,
        raw_away_team=raw.away_team_raw,
        raw_stadium=raw.stadium_raw,
    )
    return game, reviews
def _get_abbreviation(self, team_id: str) -> str:
"""Extract abbreviation from team ID."""
parts = team_id.split("_")
return parts[-1] if parts else ""
def scrape_teams(self) -> list[Team]:
    """Build Team objects for every NFL team in the hardcoded mappings."""
    # NFL conference/division structure, keyed by division name.
    divisions = {
        "AFC East": ("AFC", ["BUF", "MIA", "NE", "NYJ"]),
        "AFC North": ("AFC", ["BAL", "CIN", "CLE", "PIT"]),
        "AFC South": ("AFC", ["HOU", "IND", "JAX", "TEN"]),
        "AFC West": ("AFC", ["DEN", "KC", "LV", "LAC"]),
        "NFC East": ("NFC", ["DAL", "NYG", "PHI", "WAS"]),
        "NFC North": ("NFC", ["CHI", "DET", "GB", "MIN"]),
        "NFC South": ("NFC", ["ATL", "CAR", "NO", "TB"]),
        "NFC West": ("NFC", ["ARI", "LAR", "SF", "SEA"]),
    }
    # Reverse lookup: abbreviation -> (conference, division).
    team_divisions: dict[str, tuple[str, str]] = {
        abbrev: (conf, div)
        for div, (conf, abbrevs) in divisions.items()
        for abbrev in abbrevs
    }

    teams: list[Team] = []
    seen: set[str] = set()
    for abbrev, (team_id, full_name, city, stadium_id) in TEAM_MAPPINGS.get("nfl", {}).items():
        # Multiple aliases can map to one team; emit each team once.
        if team_id in seen:
            continue
        seen.add(team_id)
        # The nickname is the last word of the full name (e.g. "Cowboys").
        name_parts = full_name.split()
        conf, div = team_divisions.get(abbrev, (None, None))
        teams.append(
            Team(
                id=team_id,
                sport="nfl",
                city=city,
                name=name_parts[-1] if name_parts else full_name,
                full_name=full_name,
                abbreviation=abbrev,
                conference=conf,
                division=div,
                stadium_id=stadium_id,
            )
        )
    return teams
def scrape_stadiums(self) -> list[Stadium]:
    """Build Stadium objects for every NFL stadium in the hardcoded mappings."""
    stadiums: list[Stadium] = []
    for stadium_id, info in STADIUM_MAPPINGS.get("nfl", {}).items():
        stadiums.append(
            Stadium(
                id=stadium_id,
                sport="nfl",
                name=info.name,
                city=info.city,
                state=info.state,
                country=info.country,
                latitude=info.latitude,
                longitude=info.longitude,
                timezone=info.timezone,
                # NOTE(review): surface/roof are blanket defaults, not
                # per-stadium data — confirm against real venue info.
                surface="turf",  # Many NFL stadiums
                roof_type="open",  # Most outdoor
            )
        )
    return stadiums
def create_nfl_scraper(season: int) -> NFLScraper:
    """Factory: build an NFLScraper for the given season year."""
    return NFLScraper(season=season)