fix(data): add timezone handling for Sports-Reference scrapers and new stadiums

- Add ET timezone (America/New_York) to all Sports-Reference scrapers:
  - NBA: Basketball-Reference times parsed as ET
  - NFL: Pro-Football-Reference times parsed as ET
  - NHL: Hockey-Reference times parsed as ET
  - MLB: Baseball-Reference times parsed as ET
- Document source timezones in scraper docstrings
- Add 11 new stadiums to STADIUM_MAPPINGS:
  - NFL: 5 special/international venues (Corinthians Arena, Croke Park,
    Olympic Stadium Berlin, Santiago Bernabéu, Tom Benson Hall of Fame Stadium)
  - MLS: 4 alternate venues (Miami Freedom Park, Citi Field,
    LA Memorial Coliseum, M&T Bank Stadium)
  - NWSL: 2 alternate venues (Northwestern Medicine Field, ONE Spokane Stadium)
- Add 14 stadium aliases for MLS/NWSL team-based lookups
- Fix CanonicalSyncService to sync timezone identifier to SwiftData
- Update debug logging to use stadium timezone for display

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-21 16:04:45 -06:00
parent b339a53db3
commit 4d097883a6
11 changed files with 226 additions and 12 deletions

View File

@@ -175,6 +175,12 @@ STADIUM_MAPPINGS: dict[str, dict[str, StadiumInfo]] = {
"stadium_nfl_raymond_james_stadium": StadiumInfo("stadium_nfl_raymond_james_stadium", "Raymond James Stadium", "Tampa", "FL", "USA", "nfl", 27.9759, -82.5033), "stadium_nfl_raymond_james_stadium": StadiumInfo("stadium_nfl_raymond_james_stadium", "Raymond James Stadium", "Tampa", "FL", "USA", "nfl", 27.9759, -82.5033),
"stadium_nfl_nissan_stadium": StadiumInfo("stadium_nfl_nissan_stadium", "Nissan Stadium", "Nashville", "TN", "USA", "nfl", 36.1665, -86.7713, "America/Chicago"), "stadium_nfl_nissan_stadium": StadiumInfo("stadium_nfl_nissan_stadium", "Nissan Stadium", "Nashville", "TN", "USA", "nfl", 36.1665, -86.7713, "America/Chicago"),
"stadium_nfl_northwest_stadium": StadiumInfo("stadium_nfl_northwest_stadium", "Northwest Stadium", "Landover", "MD", "USA", "nfl", 38.9076, -76.8645), "stadium_nfl_northwest_stadium": StadiumInfo("stadium_nfl_northwest_stadium", "Northwest Stadium", "Landover", "MD", "USA", "nfl", 38.9076, -76.8645),
# Special and international venues
"stadium_nfl_tom_benson_hall_of_fame_stadium": StadiumInfo("stadium_nfl_tom_benson_hall_of_fame_stadium", "Tom Benson Hall of Fame Stadium", "Canton", "OH", "USA", "nfl", 40.8209, -81.3985),
"stadium_nfl_corinthians_arena": StadiumInfo("stadium_nfl_corinthians_arena", "Corinthians Arena", "São Paulo", "SP", "Brazil", "nfl", -23.5453, -46.4742, "America/Sao_Paulo"),
"stadium_nfl_croke_park": StadiumInfo("stadium_nfl_croke_park", "Croke Park", "Dublin", "", "Ireland", "nfl", 53.3609, -6.2514, "Europe/Dublin"),
"stadium_nfl_olympic_stadium_berlin": StadiumInfo("stadium_nfl_olympic_stadium_berlin", "Olympic Stadium Berlin", "Berlin", "", "Germany", "nfl", 52.5147, 13.2395, "Europe/Berlin"),
"stadium_nfl_santiago_bernabeu": StadiumInfo("stadium_nfl_santiago_bernabeu", "Santiago Bernabéu", "Madrid", "", "Spain", "nfl", 40.4531, -3.6883, "Europe/Madrid"),
}, },
"nhl": { "nhl": {
"stadium_nhl_honda_center": StadiumInfo("stadium_nhl_honda_center", "Honda Center", "Anaheim", "CA", "USA", "nhl", 33.8078, -117.8765, "America/Los_Angeles"), "stadium_nhl_honda_center": StadiumInfo("stadium_nhl_honda_center", "Honda Center", "Anaheim", "CA", "USA", "nhl", 33.8078, -117.8765, "America/Los_Angeles"),
@@ -241,6 +247,11 @@ STADIUM_MAPPINGS: dict[str, dict[str, StadiumInfo]] = {
"stadium_mls_citypark": StadiumInfo("stadium_mls_citypark", "CITYPARK", "St. Louis", "MO", "USA", "mls", 38.6316, -90.2106, "America/Chicago"), "stadium_mls_citypark": StadiumInfo("stadium_mls_citypark", "CITYPARK", "St. Louis", "MO", "USA", "mls", 38.6316, -90.2106, "America/Chicago"),
"stadium_mls_bmo_field": StadiumInfo("stadium_mls_bmo_field", "BMO Field", "Toronto", "ON", "Canada", "mls", 43.6332, -79.4186, "America/Toronto"), "stadium_mls_bmo_field": StadiumInfo("stadium_mls_bmo_field", "BMO Field", "Toronto", "ON", "Canada", "mls", 43.6332, -79.4186, "America/Toronto"),
"stadium_mls_bc_place": StadiumInfo("stadium_mls_bc_place", "BC Place", "Vancouver", "BC", "Canada", "mls", 49.2768, -123.1118, "America/Vancouver"), "stadium_mls_bc_place": StadiumInfo("stadium_mls_bc_place", "BC Place", "Vancouver", "BC", "Canada", "mls", 49.2768, -123.1118, "America/Vancouver"),
# Alternate and special venues
"stadium_mls_miami_freedom_park": StadiumInfo("stadium_mls_miami_freedom_park", "Miami Freedom Park", "Miami", "FL", "USA", "mls", 25.789, -80.237),
"stadium_mls_citi_field": StadiumInfo("stadium_mls_citi_field", "Citi Field", "New York", "NY", "USA", "mls", 40.7571, -73.8458),
"stadium_mls_los_angeles_memorial_coliseum": StadiumInfo("stadium_mls_los_angeles_memorial_coliseum", "Los Angeles Memorial Coliseum", "Los Angeles", "CA", "USA", "mls", 34.0141, -118.2879, "America/Los_Angeles"),
"stadium_mls_mandt_bank_stadium": StadiumInfo("stadium_mls_mandt_bank_stadium", "M&T Bank Stadium", "Baltimore", "MD", "USA", "mls", 39.2780, -76.6227),
}, },
"wnba": { "wnba": {
"stadium_wnba_gateway_center_arena": StadiumInfo("stadium_wnba_gateway_center_arena", "Gateway Center Arena", "College Park", "GA", "USA", "wnba", 33.6510, -84.4474), "stadium_wnba_gateway_center_arena": StadiumInfo("stadium_wnba_gateway_center_arena", "Gateway Center Arena", "College Park", "GA", "USA", "wnba", 33.6510, -84.4474),
@@ -286,6 +297,9 @@ STADIUM_MAPPINGS: dict[str, dict[str, StadiumInfo]] = {
# Shared NFL/MLB venues # Shared NFL/MLB venues
"stadium_nwsl_soldier_field": StadiumInfo("stadium_nwsl_soldier_field", "Soldier Field", "Chicago", "IL", "USA", "nwsl", 41.8623, -87.6167, "America/Chicago"), "stadium_nwsl_soldier_field": StadiumInfo("stadium_nwsl_soldier_field", "Soldier Field", "Chicago", "IL", "USA", "nwsl", 41.8623, -87.6167, "America/Chicago"),
"stadium_nwsl_oracle_park": StadiumInfo("stadium_nwsl_oracle_park", "Oracle Park", "San Francisco", "CA", "USA", "nwsl", 37.7786, -122.3893, "America/Los_Angeles"), "stadium_nwsl_oracle_park": StadiumInfo("stadium_nwsl_oracle_park", "Oracle Park", "San Francisco", "CA", "USA", "nwsl", 37.7786, -122.3893, "America/Los_Angeles"),
# Additional alternate venues
"stadium_nwsl_northwestern_medicine_field": StadiumInfo("stadium_nwsl_northwestern_medicine_field", "Northwestern Medicine Field at Martin Stadium", "Evanston", "IL", "USA", "nwsl", 42.0598, -87.6743, "America/Chicago"),
"stadium_nwsl_one_spokane_stadium": StadiumInfo("stadium_nwsl_one_spokane_stadium", "ONE Spokane Stadium", "Spokane", "WA", "USA", "nwsl", 47.6588, -117.4101, "America/Los_Angeles"),
}, },
} }

View File

@@ -2,6 +2,7 @@
from datetime import datetime, date, timedelta from datetime import datetime, date, timedelta
from typing import Optional from typing import Optional
from zoneinfo import ZoneInfo
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from .base import BaseScraper, RawGameData, ScrapeResult from .base import BaseScraper, RawGameData, ScrapeResult
@@ -28,9 +29,14 @@ class MLBScraper(BaseScraper):
"""MLB schedule scraper with multi-source fallback. """MLB schedule scraper with multi-source fallback.
Sources (in priority order): Sources (in priority order):
1. Baseball-Reference - Most reliable, complete historical data 1. MLB Stats API - Official MLB data (primary)
2. MLB Stats API - Official MLB data 2. ESPN API - Backup option
3. ESPN API - Backup option 3. Baseball-Reference - Complete historical data
Source Timezones:
- mlb_api: UTC - ISO 8601 format with "Z" suffix (gameDate field)
- espn: UTC - ISO 8601 format with "Z" suffix
- baseball_reference: Eastern Time (ET) - times displayed as "7:05 PM ET"
""" """
def __init__(self, season: int, **kwargs): def __init__(self, season: int, **kwargs):
@@ -143,7 +149,22 @@ class MLBScraper(BaseScraper):
continue continue
try: try:
game = self._parse_br_game(elem, current_date, source_url) # Extract game time from the element if present
# Baseball-Reference may have time in a span or in the text
game_time_for_row = None
time_elem = elem.find("span", class_="game_time")
if time_elem:
time_text = time_elem.get_text(strip=True)
if time_text:
try:
# Parse time like "7:05 PM ET" or "1:10 PM"
# Remove timezone suffix if present
time_clean = time_text.replace(" ET", "").replace(" PT", "").replace(" CT", "").replace(" MT", "").strip()
game_time_for_row = datetime.strptime(time_clean, "%I:%M %p")
except ValueError:
pass
game = self._parse_br_game(elem, current_date, source_url, game_time_for_row)
if game: if game:
games.append(game) games.append(game)
except Exception as e: except Exception as e:
@@ -157,6 +178,7 @@ class MLBScraper(BaseScraper):
elem, elem,
game_date: datetime, game_date: datetime,
source_url: str, source_url: str,
game_time: Optional[datetime] = None,
) -> Optional[RawGameData]: ) -> Optional[RawGameData]:
"""Parse a single Baseball-Reference game element.""" """Parse a single Baseball-Reference game element."""
text = elem.get_text(" ", strip=True) text = elem.get_text(" ", strip=True)
@@ -206,8 +228,17 @@ class MLBScraper(BaseScraper):
# Third link might be stadium # Third link might be stadium
stadium = links[2].get_text(strip=True) stadium = links[2].get_text(strip=True)
# Combine date and time if time was provided, with ET timezone (Baseball-Reference uses ET)
final_game_date = game_date
if game_time:
final_game_date = game_date.replace(
hour=game_time.hour,
minute=game_time.minute,
tzinfo=ZoneInfo("America/New_York"),
)
return RawGameData( return RawGameData(
game_date=game_date, game_date=final_game_date,
home_team_raw=home_team, home_team_raw=home_team,
away_team_raw=away_team, away_team_raw=away_team,
stadium_raw=stadium, stadium_raw=stadium,

View File

@@ -27,7 +27,11 @@ class MLSScraper(BaseScraper):
Sources (in priority order): Sources (in priority order):
1. ESPN API - Most reliable for MLS 1. ESPN API - Most reliable for MLS
2. FBref - Backup option 2. FBref - Backup option (not implemented)
Source Timezones:
- espn: UTC - ISO 8601 format with "Z" suffix
- fbref: Not implemented
""" """
def __init__(self, season: int, **kwargs): def __init__(self, season: int, **kwargs):

View File

@@ -2,6 +2,7 @@
from datetime import datetime, date, timezone from datetime import datetime, date, timezone
from typing import Optional from typing import Optional
from zoneinfo import ZoneInfo
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import re import re
@@ -45,7 +46,12 @@ class NBAScraper(BaseScraper):
Sources (in priority order): Sources (in priority order):
1. Basketball-Reference - Most reliable, complete historical data 1. Basketball-Reference - Most reliable, complete historical data
2. ESPN API - Good for current/future seasons 2. ESPN API - Good for current/future seasons
3. CBS Sports - Backup option 3. CBS Sports - Backup option (not implemented)
Source Timezones:
- basketball_reference: Eastern Time (ET) - times displayed as "7:30p"
- espn: UTC - ISO 8601 format with "Z" suffix
- cbs: Not implemented
""" """
def __init__(self, season: int, **kwargs): def __init__(self, season: int, **kwargs):
@@ -196,6 +202,25 @@ class NBAScraper(BaseScraper):
self._logger.debug(f"Could not parse date: {date_text}") self._logger.debug(f"Could not parse date: {date_text}")
return None return None
# Get game start time (format: "7:30p" or "10:00p") - times are in ET
time_cell = row.find("td", {"data-stat": "game_start_time"})
if time_cell:
time_text = time_cell.get_text(strip=True)
if time_text:
try:
# Parse time like "7:30p" or "10:00p"
# Normalize: "7:30p" -> "7:30 PM", "10:00p" -> "10:00 PM"
time_normalized = time_text.replace("p", " PM").replace("a", " AM")
game_time = datetime.strptime(time_normalized, "%I:%M %p")
# Combine date and time with ET timezone (Basketball-Reference uses ET)
game_date = game_date.replace(
hour=game_time.hour,
minute=game_time.minute,
tzinfo=ZoneInfo("America/New_York"),
)
except ValueError:
self._logger.debug(f"Could not parse time: {time_text}, using midnight")
# Get teams # Get teams
away_cell = row.find("td", {"data-stat": "visitor_team_name"}) away_cell = row.find("td", {"data-stat": "visitor_team_name"})
home_cell = row.find("td", {"data-stat": "home_team_name"}) home_cell = row.find("td", {"data-stat": "home_team_name"})

View File

@@ -2,6 +2,7 @@
from datetime import datetime, date from datetime import datetime, date
from typing import Optional from typing import Optional
from zoneinfo import ZoneInfo
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from .base import BaseScraper, RawGameData, ScrapeResult from .base import BaseScraper, RawGameData, ScrapeResult
@@ -33,7 +34,12 @@ class NFLScraper(BaseScraper):
Sources (in priority order): Sources (in priority order):
1. ESPN API - Most reliable for NFL 1. ESPN API - Most reliable for NFL
2. Pro-Football-Reference - Complete historical data 2. Pro-Football-Reference - Complete historical data
3. CBS Sports - Backup option 3. CBS Sports - Backup option (not implemented)
Source Timezones:
- espn: UTC - ISO 8601 format with "Z" suffix
- pro_football_reference: Eastern Time (ET) - times displayed as "8:20PM"
- cbs: Not implemented
""" """
def __init__(self, season: int, **kwargs): def __init__(self, season: int, **kwargs):
@@ -323,6 +329,25 @@ class NFLScraper(BaseScraper):
except ValueError: except ValueError:
return None return None
# Get game start time (format: "8:20PM" or "1:00PM") - times are in ET
time_cell = row.find("td", {"data-stat": "gametime"})
if time_cell:
time_text = time_cell.get_text(strip=True)
if time_text:
try:
# Parse time like "8:20PM" or "1:00PM"
# Normalize: "8:20PM" -> "8:20 PM"
time_normalized = time_text.replace("PM", " PM").replace("AM", " AM")
game_time = datetime.strptime(time_normalized, "%I:%M %p")
# Combine date and time with ET timezone (Pro-Football-Reference uses ET)
game_date = game_date.replace(
hour=game_time.hour,
minute=game_time.minute,
tzinfo=ZoneInfo("America/New_York"),
)
except ValueError:
self._logger.debug(f"Could not parse time: {time_text}, using midnight")
# Get teams # Get teams
winner_cell = row.find("td", {"data-stat": "winner"}) winner_cell = row.find("td", {"data-stat": "winner"})
loser_cell = row.find("td", {"data-stat": "loser"}) loser_cell = row.find("td", {"data-stat": "loser"})

View File

@@ -2,6 +2,7 @@
from datetime import datetime, date from datetime import datetime, date
from typing import Optional from typing import Optional
from zoneinfo import ZoneInfo
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from .base import BaseScraper, RawGameData, ScrapeResult from .base import BaseScraper, RawGameData, ScrapeResult
@@ -40,6 +41,11 @@ class NHLScraper(BaseScraper):
1. Hockey-Reference - Most reliable for NHL 1. Hockey-Reference - Most reliable for NHL
2. NHL API - Official NHL data 2. NHL API - Official NHL data
3. ESPN API - Backup option 3. ESPN API - Backup option
Source Timezones:
- hockey_reference: Eastern Time (ET) - times displayed as "7:00p"
- nhl_api: UTC - ISO 8601 format with "Z" suffix (startTimeUTC field)
- espn: UTC - ISO 8601 format with "Z" suffix
""" """
def __init__(self, season: int, **kwargs): def __init__(self, season: int, **kwargs):
@@ -158,6 +164,25 @@ class NHLScraper(BaseScraper):
except ValueError: except ValueError:
return None return None
# Get game start time (format: "7:00p" or "10:30p") - times are in ET
time_cell = row.find("td", {"data-stat": "time_game"})
if time_cell:
time_text = time_cell.get_text(strip=True)
if time_text:
try:
# Parse time like "7:00p" or "10:30p"
# Normalize: "7:00p" -> "7:00 PM", "10:30p" -> "10:30 PM"
time_normalized = time_text.replace("p", " PM").replace("a", " AM")
game_time = datetime.strptime(time_normalized, "%I:%M %p")
# Combine date and time with ET timezone (Hockey-Reference uses ET)
game_date = game_date.replace(
hour=game_time.hour,
minute=game_time.minute,
tzinfo=ZoneInfo("America/New_York"),
)
except ValueError:
self._logger.debug(f"Could not parse time: {time_text}, using midnight")
# Get teams # Get teams
visitor_cell = row.find("td", {"data-stat": "visitor_team_name"}) visitor_cell = row.find("td", {"data-stat": "visitor_team_name"})
home_cell = row.find("td", {"data-stat": "home_team_name"}) home_cell = row.find("td", {"data-stat": "home_team_name"})

View File

@@ -27,7 +27,9 @@ class NWSLScraper(BaseScraper):
Sources (in priority order): Sources (in priority order):
1. ESPN API - Most reliable for NWSL 1. ESPN API - Most reliable for NWSL
2. NWSL official (via ESPN) - Backup option
Source Timezones:
- espn: UTC - ISO 8601 format with "Z" suffix
""" """
def __init__(self, season: int, **kwargs): def __init__(self, season: int, **kwargs):

View File

@@ -27,7 +27,9 @@ class WNBAScraper(BaseScraper):
Sources (in priority order): Sources (in priority order):
1. ESPN API - Most reliable for WNBA 1. ESPN API - Most reliable for WNBA
2. WNBA official (via ESPN) - Backup option
Source Timezones:
- espn: UTC - ISO 8601 format with "Z" suffix
""" """
def __init__(self, season: int, **kwargs): def __init__(self, season: int, **kwargs):

View File

@@ -2032,5 +2032,89 @@
"stadium_canonical_id": "stadium_wnba_purcell_pavilion", "stadium_canonical_id": "stadium_wnba_purcell_pavilion",
"valid_from": null, "valid_from": null,
"valid_until": null "valid_until": null
},
{
"alias_name": "inter miami",
"stadium_canonical_id": "stadium_mls_chase_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "inter miami cf",
"stadium_canonical_id": "stadium_mls_chase_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "miami",
"stadium_canonical_id": "stadium_mls_chase_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "mia",
"stadium_canonical_id": "stadium_mls_chase_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "nycfc",
"stadium_canonical_id": "stadium_mls_yankee_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "nyc",
"stadium_canonical_id": "stadium_mls_yankee_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "new york city fc",
"stadium_canonical_id": "stadium_mls_yankee_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "lafc",
"stadium_canonical_id": "stadium_mls_bmo_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "los angeles fc",
"stadium_canonical_id": "stadium_mls_bmo_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "dc united",
"stadium_canonical_id": "stadium_mls_audi_field",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "d.c. united",
"stadium_canonical_id": "stadium_mls_audi_field",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "chicago red stars",
"stadium_canonical_id": "stadium_nwsl_seatgeek_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "chi",
"stadium_canonical_id": "stadium_nwsl_seatgeek_stadium",
"valid_from": null,
"valid_until": null
},
{
"alias_name": "chicago",
"stadium_canonical_id": "stadium_nwsl_seatgeek_stadium",
"valid_from": null,
"valid_until": null
} }
] ]

View File

@@ -582,6 +582,7 @@ actor CanonicalSyncService {
existing.yearOpened = remote.yearOpened existing.yearOpened = remote.yearOpened
existing.imageURL = remote.imageURL?.absoluteString existing.imageURL = remote.imageURL?.absoluteString
existing.sport = remote.sport.rawValue existing.sport = remote.sport.rawValue
existing.timezoneIdentifier = remote.timeZoneIdentifier
existing.source = .cloudKit existing.source = .cloudKit
existing.lastModified = Date() existing.lastModified = Date()
@@ -607,7 +608,8 @@ actor CanonicalSyncService {
capacity: remote.capacity, capacity: remote.capacity,
yearOpened: remote.yearOpened, yearOpened: remote.yearOpened,
imageURL: remote.imageURL?.absoluteString, imageURL: remote.imageURL?.absoluteString,
sport: remote.sport.rawValue sport: remote.sport.rawValue,
timezoneIdentifier: remote.timeZoneIdentifier
) )
context.insert(canonical) context.insert(canonical)
return .applied return .applied

View File

@@ -120,7 +120,7 @@ final class ScheduleViewModel {
let nbaGames = games.filter { $0.game.sport == .nba } let nbaGames = games.filter { $0.game.sport == .nba }
print("🏀 [DEBUG] All NBA games in schedule (\(nbaGames.count) total):") print("🏀 [DEBUG] All NBA games in schedule (\(nbaGames.count) total):")
for game in nbaGames.sorted(by: { $0.game.dateTime < $1.game.dateTime }) { for game in nbaGames.sorted(by: { $0.game.dateTime < $1.game.dateTime }) {
let dateStr = game.game.dateTime.formatted(date: .abbreviated, time: .shortened) let dateStr = game.game.dateTime.gameDateTimeString(in: game.stadium.timeZone)
print("🏀 \(dateStr): \(game.awayTeam.name) @ \(game.homeTeam.name) (\(game.game.id))") print("🏀 \(dateStr): \(game.awayTeam.name) @ \(game.homeTeam.name) (\(game.game.id))")
} }