fix(data): add timezone handling for Sports-Reference scrapers and new stadiums
- Add ET timezone (America/New_York) to all Sports-Reference scrapers:
  - NBA: Basketball-Reference times parsed as ET
  - NFL: Pro-Football-Reference times parsed as ET
  - NHL: Hockey-Reference times parsed as ET
  - MLB: Baseball-Reference times parsed as ET
- Document source timezones in scraper docstrings
- Add 11 new stadiums to STADIUM_MAPPINGS:
  - NFL: 5 special and international venues (Corinthians Arena, Croke Park,
    Olympic Stadium Berlin, Santiago Bernabéu, Tom Benson Hall of Fame Stadium)
  - MLS: 4 alternate venues (Miami Freedom Park, Citi Field,
    LA Memorial Coliseum, M&T Bank Stadium)
  - NWSL: 2 alternate venues (Northwestern Medicine Field, ONE Spokane Stadium)
- Add 15 stadium aliases for MLS/NWSL team-based lookups
- Fix CanonicalSyncService to sync timezone identifier to SwiftData
- Update debug logging to use stadium timezone for display
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -175,6 +175,12 @@ STADIUM_MAPPINGS: dict[str, dict[str, StadiumInfo]] = {
|
||||
"stadium_nfl_raymond_james_stadium": StadiumInfo("stadium_nfl_raymond_james_stadium", "Raymond James Stadium", "Tampa", "FL", "USA", "nfl", 27.9759, -82.5033),
|
||||
"stadium_nfl_nissan_stadium": StadiumInfo("stadium_nfl_nissan_stadium", "Nissan Stadium", "Nashville", "TN", "USA", "nfl", 36.1665, -86.7713, "America/Chicago"),
|
||||
"stadium_nfl_northwest_stadium": StadiumInfo("stadium_nfl_northwest_stadium", "Northwest Stadium", "Landover", "MD", "USA", "nfl", 38.9076, -76.8645),
|
||||
# Special and international venues
|
||||
"stadium_nfl_tom_benson_hall_of_fame_stadium": StadiumInfo("stadium_nfl_tom_benson_hall_of_fame_stadium", "Tom Benson Hall of Fame Stadium", "Canton", "OH", "USA", "nfl", 40.8209, -81.3985),
|
||||
"stadium_nfl_corinthians_arena": StadiumInfo("stadium_nfl_corinthians_arena", "Corinthians Arena", "São Paulo", "SP", "Brazil", "nfl", -23.5453, -46.4742, "America/Sao_Paulo"),
|
||||
"stadium_nfl_croke_park": StadiumInfo("stadium_nfl_croke_park", "Croke Park", "Dublin", "", "Ireland", "nfl", 53.3609, -6.2514, "Europe/Dublin"),
|
||||
"stadium_nfl_olympic_stadium_berlin": StadiumInfo("stadium_nfl_olympic_stadium_berlin", "Olympic Stadium Berlin", "Berlin", "", "Germany", "nfl", 52.5147, 13.2395, "Europe/Berlin"),
|
||||
"stadium_nfl_santiago_bernabeu": StadiumInfo("stadium_nfl_santiago_bernabeu", "Santiago Bernabéu", "Madrid", "", "Spain", "nfl", 40.4531, -3.6883, "Europe/Madrid"),
|
||||
},
|
||||
"nhl": {
|
||||
"stadium_nhl_honda_center": StadiumInfo("stadium_nhl_honda_center", "Honda Center", "Anaheim", "CA", "USA", "nhl", 33.8078, -117.8765, "America/Los_Angeles"),
|
||||
@@ -241,6 +247,11 @@ STADIUM_MAPPINGS: dict[str, dict[str, StadiumInfo]] = {
|
||||
"stadium_mls_citypark": StadiumInfo("stadium_mls_citypark", "CITYPARK", "St. Louis", "MO", "USA", "mls", 38.6316, -90.2106, "America/Chicago"),
|
||||
"stadium_mls_bmo_field": StadiumInfo("stadium_mls_bmo_field", "BMO Field", "Toronto", "ON", "Canada", "mls", 43.6332, -79.4186, "America/Toronto"),
|
||||
"stadium_mls_bc_place": StadiumInfo("stadium_mls_bc_place", "BC Place", "Vancouver", "BC", "Canada", "mls", 49.2768, -123.1118, "America/Vancouver"),
|
||||
# Alternate and special venues
|
||||
"stadium_mls_miami_freedom_park": StadiumInfo("stadium_mls_miami_freedom_park", "Miami Freedom Park", "Miami", "FL", "USA", "mls", 25.789, -80.237),
|
||||
"stadium_mls_citi_field": StadiumInfo("stadium_mls_citi_field", "Citi Field", "New York", "NY", "USA", "mls", 40.7571, -73.8458),
|
||||
"stadium_mls_los_angeles_memorial_coliseum": StadiumInfo("stadium_mls_los_angeles_memorial_coliseum", "Los Angeles Memorial Coliseum", "Los Angeles", "CA", "USA", "mls", 34.0141, -118.2879, "America/Los_Angeles"),
|
||||
"stadium_mls_mandt_bank_stadium": StadiumInfo("stadium_mls_mandt_bank_stadium", "M&T Bank Stadium", "Baltimore", "MD", "USA", "mls", 39.2780, -76.6227),
|
||||
},
|
||||
"wnba": {
|
||||
"stadium_wnba_gateway_center_arena": StadiumInfo("stadium_wnba_gateway_center_arena", "Gateway Center Arena", "College Park", "GA", "USA", "wnba", 33.6510, -84.4474),
|
||||
@@ -286,6 +297,9 @@ STADIUM_MAPPINGS: dict[str, dict[str, StadiumInfo]] = {
|
||||
# Shared NFL/MLB venues
|
||||
"stadium_nwsl_soldier_field": StadiumInfo("stadium_nwsl_soldier_field", "Soldier Field", "Chicago", "IL", "USA", "nwsl", 41.8623, -87.6167, "America/Chicago"),
|
||||
"stadium_nwsl_oracle_park": StadiumInfo("stadium_nwsl_oracle_park", "Oracle Park", "San Francisco", "CA", "USA", "nwsl", 37.7786, -122.3893, "America/Los_Angeles"),
|
||||
# Additional alternate venues
|
||||
"stadium_nwsl_northwestern_medicine_field": StadiumInfo("stadium_nwsl_northwestern_medicine_field", "Northwestern Medicine Field at Martin Stadium", "Evanston", "IL", "USA", "nwsl", 42.0598, -87.6743, "America/Chicago"),
|
||||
"stadium_nwsl_one_spokane_stadium": StadiumInfo("stadium_nwsl_one_spokane_stadium", "ONE Spokane Stadium", "Spokane", "WA", "USA", "nwsl", 47.6588, -117.4101, "America/Los_Angeles"),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
from zoneinfo import ZoneInfo
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
@@ -28,9 +29,14 @@ class MLBScraper(BaseScraper):
|
||||
"""MLB schedule scraper with multi-source fallback.
|
||||
|
||||
Sources (in priority order):
|
||||
1. Baseball-Reference - Most reliable, complete historical data
|
||||
2. MLB Stats API - Official MLB data
|
||||
3. ESPN API - Backup option
|
||||
1. MLB Stats API - Official MLB data (primary)
|
||||
2. ESPN API - Backup option
|
||||
3. Baseball-Reference - Complete historical data
|
||||
|
||||
Source Timezones:
|
||||
- mlb_api: UTC - ISO 8601 format with "Z" suffix (gameDate field)
|
||||
- espn: UTC - ISO 8601 format with "Z" suffix
|
||||
- baseball_reference: Eastern Time (ET) - times displayed as "7:05 PM ET"
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
|
||||
@@ -143,7 +149,22 @@ class MLBScraper(BaseScraper):
|
||||
continue
|
||||
|
||||
try:
|
||||
game = self._parse_br_game(elem, current_date, source_url)
|
||||
# Extract game time from the element if present
|
||||
# Baseball-Reference may have time in a span or in the text
|
||||
game_time_for_row = None
|
||||
time_elem = elem.find("span", class_="game_time")
|
||||
if time_elem:
|
||||
time_text = time_elem.get_text(strip=True)
|
||||
if time_text:
|
||||
try:
|
||||
# Parse time like "7:05 PM ET" or "1:10 PM"
|
||||
# Remove timezone suffix if present
|
||||
time_clean = time_text.replace(" ET", "").replace(" PT", "").replace(" CT", "").replace(" MT", "").strip()
|
||||
game_time_for_row = datetime.strptime(time_clean, "%I:%M %p")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
game = self._parse_br_game(elem, current_date, source_url, game_time_for_row)
|
||||
if game:
|
||||
games.append(game)
|
||||
except Exception as e:
|
||||
@@ -157,6 +178,7 @@ class MLBScraper(BaseScraper):
|
||||
elem,
|
||||
game_date: datetime,
|
||||
source_url: str,
|
||||
game_time: Optional[datetime] = None,
|
||||
) -> Optional[RawGameData]:
|
||||
"""Parse a single Baseball-Reference game element."""
|
||||
text = elem.get_text(" ", strip=True)
|
||||
@@ -206,8 +228,17 @@ class MLBScraper(BaseScraper):
|
||||
# Third link might be stadium
|
||||
stadium = links[2].get_text(strip=True)
|
||||
|
||||
# Combine date and time if time was provided, with ET timezone (Baseball-Reference uses ET)
|
||||
final_game_date = game_date
|
||||
if game_time:
|
||||
final_game_date = game_date.replace(
|
||||
hour=game_time.hour,
|
||||
minute=game_time.minute,
|
||||
tzinfo=ZoneInfo("America/New_York"),
|
||||
)
|
||||
|
||||
return RawGameData(
|
||||
game_date=game_date,
|
||||
game_date=final_game_date,
|
||||
home_team_raw=home_team,
|
||||
away_team_raw=away_team,
|
||||
stadium_raw=stadium,
|
||||
|
||||
@@ -27,7 +27,11 @@ class MLSScraper(BaseScraper):
|
||||
|
||||
Sources (in priority order):
|
||||
1. ESPN API - Most reliable for MLS
|
||||
2. FBref - Backup option
|
||||
2. FBref - Backup option (not implemented)
|
||||
|
||||
Source Timezones:
|
||||
- espn: UTC - ISO 8601 format with "Z" suffix
|
||||
- fbref: Not implemented
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from datetime import datetime, date, timezone
|
||||
from typing import Optional
|
||||
from zoneinfo import ZoneInfo
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
|
||||
@@ -45,7 +46,12 @@ class NBAScraper(BaseScraper):
|
||||
Sources (in priority order):
|
||||
1. Basketball-Reference - Most reliable, complete historical data
|
||||
2. ESPN API - Good for current/future seasons
|
||||
3. CBS Sports - Backup option
|
||||
3. CBS Sports - Backup option (not implemented)
|
||||
|
||||
Source Timezones:
|
||||
- basketball_reference: Eastern Time (ET) - times displayed as "7:30p"
|
||||
- espn: UTC - ISO 8601 format with "Z" suffix
|
||||
- cbs: Not implemented
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
|
||||
@@ -196,6 +202,25 @@ class NBAScraper(BaseScraper):
|
||||
self._logger.debug(f"Could not parse date: {date_text}")
|
||||
return None
|
||||
|
||||
# Get game start time (format: "7:30p" or "10:00p") - times are in ET
|
||||
time_cell = row.find("td", {"data-stat": "game_start_time"})
|
||||
if time_cell:
|
||||
time_text = time_cell.get_text(strip=True)
|
||||
if time_text:
|
||||
try:
|
||||
# Parse time like "7:30p" or "10:00p"
|
||||
# Normalize: "7:30p" -> "7:30 PM", "10:00p" -> "10:00 PM"
|
||||
time_normalized = time_text.replace("p", " PM").replace("a", " AM")
|
||||
game_time = datetime.strptime(time_normalized, "%I:%M %p")
|
||||
# Combine date and time with ET timezone (Basketball-Reference uses ET)
|
||||
game_date = game_date.replace(
|
||||
hour=game_time.hour,
|
||||
minute=game_time.minute,
|
||||
tzinfo=ZoneInfo("America/New_York"),
|
||||
)
|
||||
except ValueError:
|
||||
self._logger.debug(f"Could not parse time: {time_text}, using midnight")
|
||||
|
||||
# Get teams
|
||||
away_cell = row.find("td", {"data-stat": "visitor_team_name"})
|
||||
home_cell = row.find("td", {"data-stat": "home_team_name"})
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from datetime import datetime, date
|
||||
from typing import Optional
|
||||
from zoneinfo import ZoneInfo
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
@@ -33,7 +34,12 @@ class NFLScraper(BaseScraper):
|
||||
Sources (in priority order):
|
||||
1. ESPN API - Most reliable for NFL
|
||||
2. Pro-Football-Reference - Complete historical data
|
||||
3. CBS Sports - Backup option
|
||||
3. CBS Sports - Backup option (not implemented)
|
||||
|
||||
Source Timezones:
|
||||
- espn: UTC - ISO 8601 format with "Z" suffix
|
||||
- pro_football_reference: Eastern Time (ET) - times displayed as "8:20PM"
|
||||
- cbs: Not implemented
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
|
||||
@@ -323,6 +329,25 @@ class NFLScraper(BaseScraper):
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
# Get game start time (format: "8:20PM" or "1:00PM") - times are in ET
|
||||
time_cell = row.find("td", {"data-stat": "gametime"})
|
||||
if time_cell:
|
||||
time_text = time_cell.get_text(strip=True)
|
||||
if time_text:
|
||||
try:
|
||||
# Parse time like "8:20PM" or "1:00PM"
|
||||
# Normalize: "8:20PM" -> "8:20 PM"
|
||||
time_normalized = time_text.replace("PM", " PM").replace("AM", " AM")
|
||||
game_time = datetime.strptime(time_normalized, "%I:%M %p")
|
||||
# Combine date and time with ET timezone (Pro-Football-Reference uses ET)
|
||||
game_date = game_date.replace(
|
||||
hour=game_time.hour,
|
||||
minute=game_time.minute,
|
||||
tzinfo=ZoneInfo("America/New_York"),
|
||||
)
|
||||
except ValueError:
|
||||
self._logger.debug(f"Could not parse time: {time_text}, using midnight")
|
||||
|
||||
# Get teams
|
||||
winner_cell = row.find("td", {"data-stat": "winner"})
|
||||
loser_cell = row.find("td", {"data-stat": "loser"})
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from datetime import datetime, date
|
||||
from typing import Optional
|
||||
from zoneinfo import ZoneInfo
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
@@ -40,6 +41,11 @@ class NHLScraper(BaseScraper):
|
||||
1. Hockey-Reference - Most reliable for NHL
|
||||
2. NHL API - Official NHL data
|
||||
3. ESPN API - Backup option
|
||||
|
||||
Source Timezones:
|
||||
- hockey_reference: Eastern Time (ET) - times displayed as "7:00p"
|
||||
- nhl_api: UTC - ISO 8601 format with "Z" suffix (startTimeUTC field)
|
||||
- espn: UTC - ISO 8601 format with "Z" suffix
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
|
||||
@@ -158,6 +164,25 @@ class NHLScraper(BaseScraper):
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
# Get game start time (format: "7:00p" or "10:30p") - times are in ET
|
||||
time_cell = row.find("td", {"data-stat": "time_game"})
|
||||
if time_cell:
|
||||
time_text = time_cell.get_text(strip=True)
|
||||
if time_text:
|
||||
try:
|
||||
# Parse time like "7:00p" or "10:30p"
|
||||
# Normalize: "7:00p" -> "7:00 PM", "10:30p" -> "10:30 PM"
|
||||
time_normalized = time_text.replace("p", " PM").replace("a", " AM")
|
||||
game_time = datetime.strptime(time_normalized, "%I:%M %p")
|
||||
# Combine date and time with ET timezone (Hockey-Reference uses ET)
|
||||
game_date = game_date.replace(
|
||||
hour=game_time.hour,
|
||||
minute=game_time.minute,
|
||||
tzinfo=ZoneInfo("America/New_York"),
|
||||
)
|
||||
except ValueError:
|
||||
self._logger.debug(f"Could not parse time: {time_text}, using midnight")
|
||||
|
||||
# Get teams
|
||||
visitor_cell = row.find("td", {"data-stat": "visitor_team_name"})
|
||||
home_cell = row.find("td", {"data-stat": "home_team_name"})
|
||||
|
||||
@@ -27,7 +27,9 @@ class NWSLScraper(BaseScraper):
|
||||
|
||||
Sources (in priority order):
|
||||
1. ESPN API - Most reliable for NWSL
|
||||
2. NWSL official (via ESPN) - Backup option
|
||||
|
||||
Source Timezones:
|
||||
- espn: UTC - ISO 8601 format with "Z" suffix
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
|
||||
|
||||
@@ -27,7 +27,9 @@ class WNBAScraper(BaseScraper):
|
||||
|
||||
Sources (in priority order):
|
||||
1. ESPN API - Most reliable for WNBA
|
||||
2. WNBA official (via ESPN) - Backup option
|
||||
|
||||
Source Timezones:
|
||||
- espn: UTC - ISO 8601 format with "Z" suffix
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
|
||||
|
||||
@@ -2032,5 +2032,89 @@
|
||||
"stadium_canonical_id": "stadium_wnba_purcell_pavilion",
|
||||
"valid_from": null,
|
||||
"valid_until": null
|
||||
},
|
||||
{
|
||||
"alias_name": "inter miami",
|
||||
"stadium_canonical_id": "stadium_mls_chase_stadium",
|
||||
"valid_from": null,
|
||||
"valid_until": null
|
||||
},
|
||||
{
|
||||
"alias_name": "inter miami cf",
|
||||
"stadium_canonical_id": "stadium_mls_chase_stadium",
|
||||
"valid_from": null,
|
||||
"valid_until": null
|
||||
},
|
||||
{
|
||||
"alias_name": "miami",
|
||||
"stadium_canonical_id": "stadium_mls_chase_stadium",
|
||||
"valid_from": null,
|
||||
"valid_until": null
|
||||
},
|
||||
{
|
||||
"alias_name": "mia",
|
||||
"stadium_canonical_id": "stadium_mls_chase_stadium",
|
||||
"valid_from": null,
|
||||
"valid_until": null
|
||||
},
|
||||
{
|
||||
"alias_name": "nycfc",
|
||||
"stadium_canonical_id": "stadium_mls_yankee_stadium",
|
||||
"valid_from": null,
|
||||
"valid_until": null
|
||||
},
|
||||
{
|
||||
"alias_name": "nyc",
|
||||
"stadium_canonical_id": "stadium_mls_yankee_stadium",
|
||||
"valid_from": null,
|
||||
"valid_until": null
|
||||
},
|
||||
{
|
||||
"alias_name": "new york city fc",
|
||||
"stadium_canonical_id": "stadium_mls_yankee_stadium",
|
||||
"valid_from": null,
|
||||
"valid_until": null
|
||||
},
|
||||
{
|
||||
"alias_name": "lafc",
|
||||
"stadium_canonical_id": "stadium_mls_bmo_stadium",
|
||||
"valid_from": null,
|
||||
"valid_until": null
|
||||
},
|
||||
{
|
||||
"alias_name": "los angeles fc",
|
||||
"stadium_canonical_id": "stadium_mls_bmo_stadium",
|
||||
"valid_from": null,
|
||||
"valid_until": null
|
||||
},
|
||||
{
|
||||
"alias_name": "dc united",
|
||||
"stadium_canonical_id": "stadium_mls_audi_field",
|
||||
"valid_from": null,
|
||||
"valid_until": null
|
||||
},
|
||||
{
|
||||
"alias_name": "d.c. united",
|
||||
"stadium_canonical_id": "stadium_mls_audi_field",
|
||||
"valid_from": null,
|
||||
"valid_until": null
|
||||
},
|
||||
{
|
||||
"alias_name": "chicago red stars",
|
||||
"stadium_canonical_id": "stadium_nwsl_seatgeek_stadium",
|
||||
"valid_from": null,
|
||||
"valid_until": null
|
||||
},
|
||||
{
|
||||
"alias_name": "chi",
|
||||
"stadium_canonical_id": "stadium_nwsl_seatgeek_stadium",
|
||||
"valid_from": null,
|
||||
"valid_until": null
|
||||
},
|
||||
{
|
||||
"alias_name": "chicago",
|
||||
"stadium_canonical_id": "stadium_nwsl_seatgeek_stadium",
|
||||
"valid_from": null,
|
||||
"valid_until": null
|
||||
}
|
||||
]
|
||||
Reference in New Issue
Block a user