Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit. Includes: - Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL - Canonical ID system for teams, stadiums, and games - Fuzzy matching with manual alias support - CloudKit uploader with batch operations and deduplication - Comprehensive test suite with fixtures - WNBA abbreviation aliases for improved team resolution - Alias validation script to detect orphan references All 5 phases of data remediation plan completed: - Phase 1: Alias fixes (team/stadium alias additions) - Phase 2: NHL stadium coordinate fixes - Phase 3: Re-scrape validation - Phase 4: iOS bundle update - Phase 5: Code quality improvements (WNBA aliases) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
345 lines
9.6 KiB
Python
345 lines
9.6 KiB
Python
"""Timezone conversion utilities for normalizing game times to UTC."""
|
|
|
|
import re
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, date, time
|
|
from typing import Optional
|
|
from zoneinfo import ZoneInfo
|
|
|
|
from dateutil import parser as dateutil_parser
|
|
from dateutil.tz import gettz, tzutc
|
|
|
|
from ..models.aliases import ReviewReason, ManualReviewItem
|
|
|
|
|
|
# Timezone abbreviations, as they may appear in scraped time strings, mapped to
# IANA zone names. Entries are grouped per target zone; the resulting insertion
# order matters because detect_timezone_from_string() walks this mapping in
# order and returns the first abbreviation it finds.
TIMEZONE_ABBREV_MAP: dict[str, str] = {
    abbrev: zone
    for zone, abbrevs in (
        # US timezones
        ("America/New_York", ("ET", "EST", "EDT")),
        ("America/Chicago", ("CT", "CST", "CDT")),
        ("America/Denver", ("MT", "MST", "MDT")),
        ("America/Los_Angeles", ("PT", "PST", "PDT")),
        # NOTE(review): "AT" is frequently used for Atlantic Time elsewhere;
        # here it resolves to Alaska - confirm this is intended.
        ("America/Anchorage", ("AT", "AKST", "AKDT")),
        ("Pacific/Honolulu", ("HT", "HST")),
        # Canada
        ("America/Halifax", ("AST", "ADT")),
        ("America/St_Johns", ("NST", "NDT")),
        # Mexico
        ("America/Mexico_City", ("CDST",)),
        # UTC and its aliases
        ("UTC", ("UTC", "GMT", "Z")),
    )
    for abbrev in abbrevs
}
|
|
|
|
# Two-letter US state / Canadian province codes mapped to an IANA zone name,
# used to infer a timezone from a venue's location. Each code maps to a single
# zone even where the state or province spans more than one. Runs of codes that
# share a zone are collapsed with dict.fromkeys; insertion order is unchanged.
STATE_TIMEZONE_MAP: dict[str, str] = {
    # Eastern (FL: most of Florida)
    **dict.fromkeys(
        ("CT", "DE", "FL", "GA", "MA", "MD", "ME"),
        "America/New_York",
    ),
    "MI": "America/Detroit",
    **dict.fromkeys(
        ("NC", "NH", "NJ", "NY", "OH", "PA", "RI", "SC", "VA", "VT", "WV", "DC"),
        "America/New_York",
    ),
    # Central (IN and KY point at their own dedicated zones)
    **dict.fromkeys(("AL", "AR", "IA", "IL"), "America/Chicago"),
    "IN": "America/Indiana/Indianapolis",
    "KS": "America/Chicago",
    "KY": "America/Kentucky/Louisville",
    **dict.fromkeys(
        ("LA", "MN", "MO", "MS", "ND", "NE", "OK", "SD", "TN", "TX", "WI"),
        "America/Chicago",
    ),
    # Mountain
    "AZ": "America/Phoenix",  # No DST
    "CO": "America/Denver",
    "ID": "America/Boise",
    **dict.fromkeys(("MT", "NM", "UT", "WY"), "America/Denver"),
    # Pacific
    **dict.fromkeys(("CA", "NV", "OR", "WA"), "America/Los_Angeles"),
    # Alaska/Hawaii
    "AK": "America/Anchorage",
    "HI": "Pacific/Honolulu",
    # Canada provinces
    "ON": "America/Toronto",
    "QC": "America/Montreal",
    "BC": "America/Vancouver",
    "AB": "America/Edmonton",
    "MB": "America/Winnipeg",
    "SK": "America/Regina",
    "NS": "America/Halifax",
    "NB": "America/Moncton",
    "NL": "America/St_Johns",
    "PE": "America/Halifax",
}
|
|
|
|
|
|
@dataclass
class TimezoneResult:
    """Outcome of normalizing a game time to UTC.

    Attributes:
        datetime_utc: The resulting datetime, expressed in UTC.
        source_timezone: Name of the timezone the input was interpreted in.
        confidence: How sure the detection was: 'high', 'medium' or 'low'.
        warning: Human-readable note when the timezone was uncertain;
            None when there is nothing to report.
    """

    datetime_utc: datetime
    source_timezone: str
    confidence: str
    warning: Optional[str] = None
|
|
|
|
|
|
def detect_timezone_from_string(time_str: str) -> Optional[str]:
    """Detect timezone from a time string containing a timezone abbreviation.

    The string is split into standalone alphabetic words and each known
    abbreviation in TIMEZONE_ABBREV_MAP is looked up among them. Matching is
    case-insensitive, except for abbreviations that double as ordinary
    English words (currently "AT"): those must appear in upper case, so that
    text such as 'Game at 7:00 PM' is not mistaken for a timezone.

    When several abbreviations occur in the string, the one that comes first
    in TIMEZONE_ABBREV_MAP wins (not the one that comes first in the string).

    Args:
        time_str: Time string that may contain timezone info (e.g., '7:00 PM ET')

    Returns:
        IANA timezone string if detected, None otherwise
    """
    if not time_str:
        return None

    # Abbreviations that collide with common English words when lower-cased.
    word_like_abbrevs = {"AT"}

    # Tokenise once instead of running one regex search per abbreviation.
    # The \b anchors keep the same "whole word" rule as a \bABBR\b search:
    # letters glued to digits or underscores (e.g. "19:00Z") are not words.
    words_as_written = set(re.findall(r"\b[A-Za-z]+\b", time_str))
    words_upper = {word.upper() for word in words_as_written}

    for abbrev, tz in TIMEZONE_ABBREV_MAP.items():
        candidates = words_as_written if abbrev in word_like_abbrevs else words_upper
        if abbrev in candidates:
            return tz

    return None
|
|
|
|
|
|
def detect_timezone_from_location(
    state: Optional[str] = None,
    city: Optional[str] = None,
) -> Optional[str]:
    """Infer an IANA timezone from a state/province code.

    Args:
        state: State/province code (e.g., 'NY', 'ON'); looked up
            case-insensitively in STATE_TIMEZONE_MAP.
        city: City name. Accepted for special cases but not consulted by the
            current implementation.

    Returns:
        IANA timezone string when the code is known, None otherwise
        (including when no state is given).
    """
    if not state:
        return None

    return STATE_TIMEZONE_MAP.get(state.upper())
|
|
|
|
|
|
def parse_datetime(
    date_str: str,
    time_str: Optional[str] = None,
    timezone_hint: Optional[str] = None,
    location_state: Optional[str] = None,
) -> TimezoneResult:
    """Parse a date/time string and convert to UTC.

    Attempts to detect timezone from:
    1. Explicit timezone in the string
    2. Provided timezone hint
    3. Location-based inference
    4. Default to Eastern Time with warning

    If the parser itself yields a timezone-aware datetime (for example the
    input carries 'Z', 'UTC' or a numeric offset such as '+02:00'), that
    offset is trusted as-is: the result has 'high' confidence and
    source_timezone is 'UTC' for a zero offset or 'UTC+HH:MM' / 'UTC-HH:MM'
    otherwise.

    Args:
        date_str: Date string (e.g., '2025-10-21', 'October 21, 2025')
        time_str: Optional time string (e.g., '7:00 PM ET', '19:00')
        timezone_hint: Optional IANA timezone (or an abbreviation present in
            TIMEZONE_ABBREV_MAP) to use if none is detected in the string
        location_state: Optional state code for timezone inference

    Returns:
        TimezoneResult with UTC datetime and metadata. If the input cannot be
        parsed at all, a placeholder is returned instead of raising: the
        current UTC time, source_timezone 'unknown', confidence 'low' and a
        warning describing the parse failure.
    """
    utc = ZoneInfo("UTC")

    # Combine date and time when a separate time string was supplied.
    full_str = f"{date_str} {time_str}" if time_str else date_str

    try:
        parsed = dateutil_parser.parse(full_str, fuzzy=True)
    except (ValueError, OverflowError) as e:
        # If parsing fails, return a placeholder with low confidence
        return TimezoneResult(
            datetime_utc=datetime.now(tz=utc),
            source_timezone="unknown",
            confidence="low",
            warning=f"Failed to parse datetime: {e}",
        )

    # The parser already resolved an explicit offset: use it directly.
    # (str() of the parser's tzinfo object is not an IANA key, so it must not
    # be fed to ZoneInfo.)
    explicit_offset = parsed.utcoffset()
    if explicit_offset is not None:
        offset_minutes = round(explicit_offset.total_seconds() / 60)
        if offset_minutes == 0:
            source_label = "UTC"
        else:
            sign = "-" if offset_minutes < 0 else "+"
            hours, minutes = divmod(abs(offset_minutes), 60)
            source_label = f"UTC{sign}{hours:02d}:{minutes:02d}"
        return TimezoneResult(
            datetime_utc=parsed.astimezone(utc),
            source_timezone=source_label,
            confidence="high",
            warning=None,
        )

    # Naive datetime: work out which timezone it should be interpreted in.
    detected_tz = detect_timezone_from_string(time_str) if time_str else None
    confidence = "high"
    warning = None

    # Try timezone hint
    if not detected_tz and timezone_hint:
        detected_tz = timezone_hint
        confidence = "medium"

    # Try location inference
    if not detected_tz and location_state:
        detected_tz = detect_timezone_from_location(state=location_state)
        confidence = "medium"

    # Default to Eastern Time
    if not detected_tz:
        detected_tz = "America/New_York"
        confidence = "low"
        warning = "Timezone not detected, defaulting to Eastern Time"

    try:
        tz = ZoneInfo(detected_tz)
    except (KeyError, ValueError):
        # ZoneInfoNotFoundError is a KeyError; malformed keys (absolute or
        # non-normalized paths) raise ValueError. Either way the name is not
        # a usable IANA key, so try it as an abbreviation before giving up.
        resolved = TIMEZONE_ABBREV_MAP.get(detected_tz)
        if resolved is not None:
            detected_tz = resolved
            tz = ZoneInfo(resolved)
        else:
            tz = ZoneInfo("America/New_York")
            confidence = "low"
            warning = f"Unknown timezone '{detected_tz}', defaulting to Eastern Time"
            detected_tz = "America/New_York"

    # Attach the zone to the naive wall-clock time, then convert to UTC.
    utc_dt = parsed.replace(tzinfo=tz).astimezone(utc)

    return TimezoneResult(
        datetime_utc=utc_dt,
        source_timezone=detected_tz,
        confidence=confidence,
        warning=warning,
    )
|
|
|
|
|
|
def convert_to_utc(
    dt: datetime,
    source_timezone: str,
) -> datetime:
    """Return *dt* expressed in UTC.

    Args:
        dt: Datetime to convert. A naive value is interpreted as wall-clock
            time in source_timezone; an aware value keeps its own timezone.
        source_timezone: IANA timezone name. It is always resolved, so an
            unknown name raises even when dt is already timezone-aware.

    Returns:
        Timezone-aware datetime in UTC
    """
    source_zone = ZoneInfo(source_timezone)

    # Only naive datetimes get the source zone attached.
    localized = dt if dt.tzinfo is not None else dt.replace(tzinfo=source_zone)

    return localized.astimezone(ZoneInfo("UTC"))
|
|
|
|
|
|
def create_timezone_warning(
    raw_value: str,
    sport: str,
    game_date: Optional[date] = None,
    source_url: Optional[str] = None,
) -> ManualReviewItem:
    """Build a manual review item flagging a game time with unknown timezone.

    The item id is 'tz_<sport>_<slug>', where the slug is the first 20
    characters of raw_value with spaces replaced by underscores.

    Args:
        raw_value: The original time string that couldn't be resolved
        sport: Sport code
        game_date: Date of the game
        source_url: URL of the source page

    Returns:
        ManualReviewItem with reason ReviewReason.TIMEZONE_UNKNOWN
    """
    slug = raw_value[:20].replace(" ", "_")
    review_id = f"tz_{sport}_{slug}"
    details = {"issue": "Could not determine timezone for game time"}

    return ManualReviewItem(
        id=review_id,
        reason=ReviewReason.TIMEZONE_UNKNOWN,
        sport=sport,
        raw_value=raw_value,
        context=details,
        source_url=source_url,
        game_date=game_date,
    )
|
|
|
|
|
|
def get_stadium_timezone(
    stadium_state: str,
    stadium_timezone: Optional[str] = None,
) -> str:
    """Resolve the timezone to use for a stadium.

    Precedence: an explicit stadium_timezone, then the zone inferred from
    stadium_state, then Eastern Time as the last resort.

    Args:
        stadium_state: State/province code
        stadium_timezone: Explicit timezone override from stadium data

    Returns:
        IANA timezone string
    """
    eastern_fallback = "America/New_York"

    # `or` short-circuits, so the location lookup only runs without an override.
    resolved = stadium_timezone or detect_timezone_from_location(state=stadium_state)

    return resolved or eastern_fallback
|