Files
SportstimeAPI/sportstime_parser/normalizers/timezone.py
Trey t 52d445bca4 feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:56:25 -06:00

345 lines
9.6 KiB
Python

"""Timezone conversion utilities for normalizing game times to UTC."""
import re
from dataclasses import dataclass
from datetime import datetime, date, time
from typing import Optional
from zoneinfo import ZoneInfo
from dateutil import parser as dateutil_parser
from dateutil.tz import gettz, tzutc
from ..models.aliases import ReviewReason, ManualReviewItem
# Timezone abbreviations as they appear in scraped time strings, grouped by the
# IANA zone each one resolves to. Group and member order is significant: the
# resulting mapping is scanned front to back and the first abbreviation found
# in a string wins.
_ABBREVS_BY_ZONE: tuple[tuple[str, tuple[str, ...]], ...] = (
    # US timezones
    ("America/New_York", ("ET", "EST", "EDT")),
    ("America/Chicago", ("CT", "CST", "CDT")),
    ("America/Denver", ("MT", "MST", "MDT")),
    ("America/Los_Angeles", ("PT", "PST", "PDT")),
    ("America/Anchorage", ("AT", "AKST", "AKDT")),
    ("Pacific/Honolulu", ("HT", "HST")),
    # Canada
    ("America/Halifax", ("AST", "ADT")),
    ("America/St_Johns", ("NST", "NDT")),
    # Mexico
    ("America/Mexico_City", ("CDST",)),
    # UTC
    ("UTC", ("UTC", "GMT", "Z")),
)

# Flat abbreviation -> IANA zone lookup derived from the groups above.
TIMEZONE_ABBREV_MAP: dict[str, str] = {
    abbrev: zone
    for zone, abbrevs in _ABBREVS_BY_ZONE
    for abbrev in abbrevs
}
# Two-letter US state / Canadian province codes mapped to an IANA zone, used to
# infer a timezone from a location when the time string itself carries none.
# Every code maps to exactly one zone (e.g. FL uses the zone that covers most
# of Florida).
_TZ_EASTERN = "America/New_York"
_TZ_CENTRAL = "America/Chicago"
_TZ_MOUNTAIN = "America/Denver"
_TZ_PACIFIC = "America/Los_Angeles"
_TZ_ATLANTIC = "America/Halifax"

STATE_TIMEZONE_MAP: dict[str, str] = {
    # Eastern
    "CT": _TZ_EASTERN,
    "DE": _TZ_EASTERN,
    "FL": _TZ_EASTERN,  # Most of Florida
    "GA": _TZ_EASTERN,
    "MA": _TZ_EASTERN,
    "MD": _TZ_EASTERN,
    "ME": _TZ_EASTERN,
    "MI": "America/Detroit",
    "NC": _TZ_EASTERN,
    "NH": _TZ_EASTERN,
    "NJ": _TZ_EASTERN,
    "NY": _TZ_EASTERN,
    "OH": _TZ_EASTERN,
    "PA": _TZ_EASTERN,
    "RI": _TZ_EASTERN,
    "SC": _TZ_EASTERN,
    "VA": _TZ_EASTERN,
    "VT": _TZ_EASTERN,
    "WV": _TZ_EASTERN,
    "DC": _TZ_EASTERN,
    # Central (IN and KY use their own dedicated zones)
    "AL": _TZ_CENTRAL,
    "AR": _TZ_CENTRAL,
    "IA": _TZ_CENTRAL,
    "IL": _TZ_CENTRAL,
    "IN": "America/Indiana/Indianapolis",
    "KS": _TZ_CENTRAL,
    "KY": "America/Kentucky/Louisville",
    "LA": _TZ_CENTRAL,
    "MN": _TZ_CENTRAL,
    "MO": _TZ_CENTRAL,
    "MS": _TZ_CENTRAL,
    "ND": _TZ_CENTRAL,
    "NE": _TZ_CENTRAL,
    "OK": _TZ_CENTRAL,
    "SD": _TZ_CENTRAL,
    "TN": _TZ_CENTRAL,
    "TX": _TZ_CENTRAL,
    "WI": _TZ_CENTRAL,
    # Mountain
    "AZ": "America/Phoenix",  # No DST
    "CO": _TZ_MOUNTAIN,
    "ID": "America/Boise",
    "MT": _TZ_MOUNTAIN,
    "NM": _TZ_MOUNTAIN,
    "UT": _TZ_MOUNTAIN,
    "WY": _TZ_MOUNTAIN,
    # Pacific
    "CA": _TZ_PACIFIC,
    "NV": _TZ_PACIFIC,
    "OR": _TZ_PACIFIC,
    "WA": _TZ_PACIFIC,
    # Alaska/Hawaii
    "AK": "America/Anchorage",
    "HI": "Pacific/Honolulu",
    # Canada provinces
    "ON": "America/Toronto",
    "QC": "America/Montreal",
    "BC": "America/Vancouver",
    "AB": "America/Edmonton",
    "MB": "America/Winnipeg",
    "SK": "America/Regina",
    "NS": _TZ_ATLANTIC,
    "NB": "America/Moncton",
    "NL": "America/St_Johns",
    "PE": _TZ_ATLANTIC,
}
@dataclass
class TimezoneResult:
    """Outcome of normalizing a game time to UTC.

    Attributes:
        datetime_utc: The resulting moment, expressed in UTC.
        source_timezone: Name of the timezone the input was interpreted in.
        confidence: Reliability of that timezone choice: 'high', 'medium'
            or 'low'.
        warning: Human-readable note set when the timezone was uncertain;
            None when there is nothing to report.
    """

    datetime_utc: datetime
    source_timezone: str
    confidence: str
    warning: Optional[str] = None
def detect_timezone_from_string(time_str: str) -> Optional[str]:
    """Detect a timezone from an abbreviation embedded in a time string.

    The abbreviation may appear anywhere in the string as a whole word.
    Abbreviations are tried in the iteration order of TIMEZONE_ABBREV_MAP and
    the first one found wins.

    Args:
        time_str: Time string that may contain timezone info
            (e.g., '7:00 PM ET').

    Returns:
        IANA timezone string if an abbreviation is found, None otherwise
        (including for empty input).
    """
    if not time_str:
        return None

    # "AT" is also the English word "at" ("7:00 PM at Arena"), so it is only
    # accepted in upper case; every other abbreviation matches in any case.
    case_sensitive = ("AT",)

    for abbrev, tz in TIMEZONE_ABBREV_MAP.items():
        flags = 0 if abbrev in case_sensitive else re.IGNORECASE
        # Escape the key so a future abbreviation containing regex
        # metacharacters is still matched literally.
        if re.search(rf"\b{re.escape(abbrev)}\b", time_str, flags):
            return tz
    return None
def detect_timezone_from_location(
    state: Optional[str] = None,
    city: Optional[str] = None,
) -> Optional[str]:
    """Look up the timezone for a state or province code.

    Args:
        state: State/province code (e.g., 'NY', 'ON'); matched
            case-insensitively against STATE_TIMEZONE_MAP.
        city: City name. Accepted for special cases but not consulted by
            the current implementation.

    Returns:
        IANA timezone string for a known code, None when the code is
        missing or not in the table.
    """
    if not state:
        return None
    return STATE_TIMEZONE_MAP.get(state.upper())
def parse_datetime(
    date_str: str,
    time_str: Optional[str] = None,
    timezone_hint: Optional[str] = None,
    location_state: Optional[str] = None,
) -> TimezoneResult:
    """Parse a date/time string and convert it to UTC.

    The timezone is taken from the first source that yields one:
        1. An offset/zone the parser itself attached to the value
        2. A timezone abbreviation found in ``time_str``
        3. The provided timezone hint
        4. Location-based inference from ``location_state``
        5. Eastern Time, with a warning

    Args:
        date_str: Date string (e.g., '2025-10-21', 'October 21, 2025')
        time_str: Optional time string (e.g., '7:00 PM ET', '19:00')
        timezone_hint: Optional IANA timezone (or known abbreviation) to use
            if none is detected in the strings
        location_state: Optional state code for timezone inference

    Returns:
        TimezoneResult with the UTC datetime and metadata. If the input
        cannot be parsed at all, the result holds the current UTC time as a
        placeholder, ``source_timezone='unknown'``, low confidence and a
        warning describing the failure.
    """
    full_str = f"{date_str} {time_str}" if time_str else date_str
    try:
        parsed = dateutil_parser.parse(full_str, fuzzy=True)
    except (ValueError, OverflowError) as e:
        # If parsing fails, return a placeholder with low confidence
        return TimezoneResult(
            datetime_utc=datetime.now(tz=ZoneInfo("UTC")),
            source_timezone="unknown",
            confidence="low",
            warning=f"Failed to parse datetime: {e}",
        )

    utc = ZoneInfo("UTC")

    if parsed.tzinfo is not None:
        # The parser already attached an explicit zone/offset (e.g. an
        # ISO-8601 'Z' suffix or '-05:00'). Its tzinfo is not an IANA key, so
        # it must not be round-tripped through ZoneInfo: doing so used to
        # mislabel the result as a low-confidence Eastern Time guess even
        # though the conversion itself was exact.
        return TimezoneResult(
            datetime_utc=parsed.astimezone(utc),
            source_timezone=parsed.tzname() or str(parsed.tzinfo),
            confidence="high",
            warning=None,
        )

    # Naive datetime: work out which zone its wall-clock time refers to.
    confidence = "high"
    warning = None
    detected_tz = detect_timezone_from_string(time_str) if time_str else None

    # Try timezone hint
    if not detected_tz and timezone_hint:
        detected_tz = timezone_hint
        confidence = "medium"

    # Try location inference
    if not detected_tz and location_state:
        detected_tz = detect_timezone_from_location(state=location_state)
        confidence = "medium"

    # Default to Eastern Time
    if not detected_tz:
        detected_tz = "America/New_York"
        confidence = "low"
        warning = "Timezone not detected, defaulting to Eastern Time"

    try:
        tz = ZoneInfo(detected_tz)
    except (KeyError, ValueError):
        # ZoneInfoNotFoundError (a KeyError) signals an unknown key and
        # ValueError a malformed one. Treat both alike: try the name as an
        # abbreviation, otherwise fall back to Eastern.
        if detected_tz in TIMEZONE_ABBREV_MAP:
            detected_tz = TIMEZONE_ABBREV_MAP[detected_tz]
            tz = ZoneInfo(detected_tz)
        else:
            tz = ZoneInfo("America/New_York")
            confidence = "low"
            warning = f"Unknown timezone '{detected_tz}', defaulting to Eastern Time"
            detected_tz = "America/New_York"

    # Attach the zone to the naive wall-clock time, then convert to UTC.
    utc_dt = parsed.replace(tzinfo=tz).astimezone(utc)
    return TimezoneResult(
        datetime_utc=utc_dt,
        source_timezone=detected_tz,
        confidence=confidence,
        warning=warning,
    )
def convert_to_utc(
    dt: datetime,
    source_timezone: str,
) -> datetime:
    """Express a datetime in UTC.

    Args:
        dt: Datetime to convert. A naive value is interpreted as wall-clock
            time in ``source_timezone``; an aware value keeps its own zone.
        source_timezone: IANA timezone name. It is always resolved, so an
            invalid name raises even when ``dt`` is already aware.

    Returns:
        The same instant as a UTC datetime.
    """
    source_zone = ZoneInfo(source_timezone)
    localized = dt if dt.tzinfo is not None else dt.replace(tzinfo=source_zone)
    return localized.astimezone(ZoneInfo("UTC"))
def create_timezone_warning(
    raw_value: str,
    sport: str,
    game_date: Optional[date] = None,
    source_url: Optional[str] = None,
) -> ManualReviewItem:
    """Build a manual review item for a game time whose timezone is unknown.

    Args:
        raw_value: The original time string that couldn't be resolved
        sport: Sport code
        game_date: Date of the game
        source_url: URL of the source page

    Returns:
        ManualReviewItem flagged with ReviewReason.TIMEZONE_UNKNOWN
    """
    # The id is derived from the sport plus the first 20 characters of the raw
    # text, with spaces turned into underscores.
    # NOTE(review): game_date is not part of the id, so identical time strings
    # for the same sport share one id - confirm that is the intended behavior.
    slug = raw_value[:20].replace(' ', '_')
    review_id = f"tz_{sport}_{slug}"
    details = {"issue": "Could not determine timezone for game time"}
    return ManualReviewItem(
        id=review_id,
        reason=ReviewReason.TIMEZONE_UNKNOWN,
        sport=sport,
        raw_value=raw_value,
        context=details,
        source_url=source_url,
        game_date=game_date,
    )
def get_stadium_timezone(
    stadium_state: str,
    stadium_timezone: Optional[str] = None,
) -> str:
    """Resolve the timezone to use for a stadium.

    Args:
        stadium_state: State/province code
        stadium_timezone: Explicit timezone from stadium data; returned
            as-is when provided

    Returns:
        IANA timezone string: the explicit override if given, otherwise the
        zone inferred from the state, otherwise Eastern Time.
    """
    # Each fallback is only evaluated when everything before it is empty.
    return (
        stadium_timezone
        or detect_timezone_from_location(state=stadium_state)
        or "America/New_York"
    )