fix(data): add timezone handling for Sports-Reference scrapers and new stadiums

- Add ET timezone (America/New_York) to all Sports-Reference scrapers:
  - NBA: Basketball-Reference times parsed as ET
  - NFL: Pro-Football-Reference times parsed as ET
  - NHL: Hockey-Reference times parsed as ET
  - MLB: Baseball-Reference times parsed as ET
- Document source timezones in scraper docstrings
- Add 11 new stadiums to STADIUM_MAPPINGS:
  - NFL: 5 international/special-event venues (Corinthians Arena, Croke Park,
    Olympic Stadium Berlin, Santiago Bernabéu, Tom Benson Hall of Fame Stadium)
  - MLS: 4 alternate venues (Miami Freedom Park, Citi Field,
    LA Memorial Coliseum, M&T Bank Stadium)
  - NWSL: 2 alternate venues (Northwestern Medicine Field, ONE Spokane)
- Add 15 stadium aliases for MLS/NWSL team-based lookups
- Fix CanonicalSyncService to sync timezone identifier to SwiftData
- Update debug logging to use stadium timezone for display

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-21 16:04:45 -06:00
parent b339a53db3
commit 4d097883a6
11 changed files with 226 additions and 12 deletions

View File

@@ -2,6 +2,7 @@
from datetime import datetime, date, timedelta
from typing import Optional
from zoneinfo import ZoneInfo
from bs4 import BeautifulSoup
from .base import BaseScraper, RawGameData, ScrapeResult
@@ -28,9 +29,14 @@ class MLBScraper(BaseScraper):
"""MLB schedule scraper with multi-source fallback.
Sources (in priority order):
1. Baseball-Reference - Most reliable, complete historical data
2. MLB Stats API - Official MLB data
3. ESPN API - Backup option
1. MLB Stats API - Official MLB data (primary)
2. ESPN API - Backup option
3. Baseball-Reference - Complete historical data
Source Timezones:
- mlb_api: UTC - ISO 8601 format with "Z" suffix (gameDate field)
- espn: UTC - ISO 8601 format with "Z" suffix
- baseball_reference: Eastern Time (ET) - times displayed as "7:05 PM ET"
"""
def __init__(self, season: int, **kwargs):
@@ -143,7 +149,22 @@ class MLBScraper(BaseScraper):
continue
try:
game = self._parse_br_game(elem, current_date, source_url)
# Extract game time from the element if present
# Baseball-Reference may have time in a span or in the text
game_time_for_row = None
time_elem = elem.find("span", class_="game_time")
if time_elem:
time_text = time_elem.get_text(strip=True)
if time_text:
try:
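# Parse time like "7:05 PM ET" or "1:10 PM".
# Strip a trailing zone label (ET/PT/CT/MT) so strptime accepts the value.
# NOTE(review): the label is discarded, not converted - the parsed time is later
# stamped as America/New_York, so a PT/CT/MT value would be treated as ET.
# Confirm Baseball-Reference only ever emits ET times.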
time_clean = time_text.replace(" ET", "").replace(" PT", "").replace(" CT", "").replace(" MT", "").strip()
game_time_for_row = datetime.strptime(time_clean, "%I:%M %p")
except ValueError:
pass
game = self._parse_br_game(elem, current_date, source_url, game_time_for_row)
if game:
games.append(game)
except Exception as e:
@@ -157,6 +178,7 @@ class MLBScraper(BaseScraper):
elem,
game_date: datetime,
source_url: str,
game_time: Optional[datetime] = None,
) -> Optional[RawGameData]:
"""Parse a single Baseball-Reference game element."""
text = elem.get_text(" ", strip=True)
@@ -206,8 +228,17 @@ class MLBScraper(BaseScraper):
# Third link might be stadium
stadium = links[2].get_text(strip=True)
# Combine date and time if time was provided, with ET timezone (Baseball-Reference uses ET)
final_game_date = game_date
if game_time:
final_game_date = game_date.replace(
hour=game_time.hour,
minute=game_time.minute,
tzinfo=ZoneInfo("America/New_York"),
)
return RawGameData(
game_date=game_date,
game_date=final_game_date,
home_team_raw=home_team,
away_team_raw=away_team,
stadium_raw=stadium,

View File

@@ -27,7 +27,11 @@ class MLSScraper(BaseScraper):
Sources (in priority order):
1. ESPN API - Most reliable for MLS
2. FBref - Backup option
2. FBref - Backup option (not implemented)
Source Timezones:
- espn: UTC - ISO 8601 format with "Z" suffix
- fbref: Not implemented
"""
def __init__(self, season: int, **kwargs):

View File

@@ -2,6 +2,7 @@
from datetime import datetime, date, timezone
from typing import Optional
from zoneinfo import ZoneInfo
from bs4 import BeautifulSoup
import re
@@ -45,7 +46,12 @@ class NBAScraper(BaseScraper):
Sources (in priority order):
1. Basketball-Reference - Most reliable, complete historical data
2. ESPN API - Good for current/future seasons
3. CBS Sports - Backup option
3. CBS Sports - Backup option (not implemented)
Source Timezones:
- basketball_reference: Eastern Time (ET) - times displayed as "7:30p"
- espn: UTC - ISO 8601 format with "Z" suffix
- cbs: Not implemented
"""
def __init__(self, season: int, **kwargs):
@@ -196,6 +202,25 @@ class NBAScraper(BaseScraper):
self._logger.debug(f"Could not parse date: {date_text}")
return None
# Get game start time (format: "7:30p" or "10:00p") - times are in ET
time_cell = row.find("td", {"data-stat": "game_start_time"})
if time_cell:
time_text = time_cell.get_text(strip=True)
if time_text:
try:
# Parse time like "7:30p" or "10:00p"
# Normalize: "7:30p" -> "7:30 PM", "10:00p" -> "10:00 PM"
time_normalized = time_text.replace("p", " PM").replace("a", " AM")
game_time = datetime.strptime(time_normalized, "%I:%M %p")
# Combine date and time with ET timezone (Basketball-Reference uses ET)
game_date = game_date.replace(
hour=game_time.hour,
minute=game_time.minute,
tzinfo=ZoneInfo("America/New_York"),
)
except ValueError:
self._logger.debug(f"Could not parse time: {time_text}, using midnight")
# Get teams
away_cell = row.find("td", {"data-stat": "visitor_team_name"})
home_cell = row.find("td", {"data-stat": "home_team_name"})

View File

@@ -2,6 +2,7 @@
from datetime import datetime, date
from typing import Optional
from zoneinfo import ZoneInfo
from bs4 import BeautifulSoup
from .base import BaseScraper, RawGameData, ScrapeResult
@@ -33,7 +34,12 @@ class NFLScraper(BaseScraper):
Sources (in priority order):
1. ESPN API - Most reliable for NFL
2. Pro-Football-Reference - Complete historical data
3. CBS Sports - Backup option
3. CBS Sports - Backup option (not implemented)
Source Timezones:
- espn: UTC - ISO 8601 format with "Z" suffix
- pro_football_reference: Eastern Time (ET) - times displayed as "8:20PM"
- cbs: Not implemented
"""
def __init__(self, season: int, **kwargs):
@@ -323,6 +329,25 @@ class NFLScraper(BaseScraper):
except ValueError:
return None
# Get game start time (format: "8:20PM" or "1:00PM") - times are in ET
time_cell = row.find("td", {"data-stat": "gametime"})
if time_cell:
time_text = time_cell.get_text(strip=True)
if time_text:
try:
# Parse time like "8:20PM" or "1:00PM"
# Normalize: "8:20PM" -> "8:20 PM"
time_normalized = time_text.replace("PM", " PM").replace("AM", " AM")
game_time = datetime.strptime(time_normalized, "%I:%M %p")
# Combine date and time with ET timezone (Pro-Football-Reference uses ET)
game_date = game_date.replace(
hour=game_time.hour,
minute=game_time.minute,
tzinfo=ZoneInfo("America/New_York"),
)
except ValueError:
self._logger.debug(f"Could not parse time: {time_text}, using midnight")
# Get teams
winner_cell = row.find("td", {"data-stat": "winner"})
loser_cell = row.find("td", {"data-stat": "loser"})

View File

@@ -2,6 +2,7 @@
from datetime import datetime, date
from typing import Optional
from zoneinfo import ZoneInfo
from bs4 import BeautifulSoup
from .base import BaseScraper, RawGameData, ScrapeResult
@@ -40,6 +41,11 @@ class NHLScraper(BaseScraper):
1. Hockey-Reference - Most reliable for NHL
2. NHL API - Official NHL data
3. ESPN API - Backup option
Source Timezones:
- hockey_reference: Eastern Time (ET) - times displayed as "7:00p"
- nhl_api: UTC - ISO 8601 format with "Z" suffix (startTimeUTC field)
- espn: UTC - ISO 8601 format with "Z" suffix
"""
def __init__(self, season: int, **kwargs):
@@ -158,6 +164,25 @@ class NHLScraper(BaseScraper):
except ValueError:
return None
# Get game start time (format: "7:00p" or "10:30p") - times are in ET
time_cell = row.find("td", {"data-stat": "time_game"})
if time_cell:
time_text = time_cell.get_text(strip=True)
if time_text:
try:
# Parse time like "7:00p" or "10:30p"
# Normalize: "7:00p" -> "7:00 PM", "10:30p" -> "10:30 PM"
time_normalized = time_text.replace("p", " PM").replace("a", " AM")
game_time = datetime.strptime(time_normalized, "%I:%M %p")
# Combine date and time with ET timezone (Hockey-Reference uses ET)
game_date = game_date.replace(
hour=game_time.hour,
minute=game_time.minute,
tzinfo=ZoneInfo("America/New_York"),
)
except ValueError:
self._logger.debug(f"Could not parse time: {time_text}, using midnight")
# Get teams
visitor_cell = row.find("td", {"data-stat": "visitor_team_name"})
home_cell = row.find("td", {"data-stat": "home_team_name"})

View File

@@ -27,7 +27,9 @@ class NWSLScraper(BaseScraper):
Sources (in priority order):
1. ESPN API - Most reliable for NWSL
2. NWSL official (via ESPN) - Backup option
Source Timezones:
- espn: UTC - ISO 8601 format with "Z" suffix
"""
def __init__(self, season: int, **kwargs):

View File

@@ -27,7 +27,9 @@ class WNBAScraper(BaseScraper):
Sources (in priority order):
1. ESPN API - Most reliable for WNBA
2. WNBA official (via ESPN) - Backup option
Source Timezones:
- espn: UTC - ISO 8601 format with "Z" suffix
"""
def __init__(self, season: int, **kwargs):