feat: add Django web app, CloudKit sync, dashboard, and game_datetime_utc export
Adds the full Django application layer on top of sportstime_parser:

- core: Sport, Team, Stadium, Game models with aliases and league structure
- scraper: orchestration engine, adapter, job management, Celery tasks
- cloudkit: CloudKit sync client, sync state tracking, sync jobs
- dashboard: staff dashboard for monitoring scrapers, sync, review queue
- notifications: email reports for scrape/sync results
- Docker setup for deployment (Dockerfile, docker-compose, entrypoint)

Game exports now use game_datetime_utc (ISO 8601 UTC) instead of venue-local date+time strings, matching the canonical format used by the iOS app.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
from zoneinfo import ZoneInfo
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
@@ -28,9 +29,14 @@ class MLBScraper(BaseScraper):
|
||||
"""MLB schedule scraper with multi-source fallback.
|
||||
|
||||
Sources (in priority order):
|
||||
1. Baseball-Reference - Most reliable, complete historical data
|
||||
2. MLB Stats API - Official MLB data
|
||||
3. ESPN API - Backup option
|
||||
1. MLB Stats API - Official MLB data (primary)
|
||||
2. ESPN API - Backup option
|
||||
3. Baseball-Reference - Complete historical data
|
||||
|
||||
Source Timezones:
|
||||
- mlb_api: UTC - ISO 8601 format with "Z" suffix (gameDate field)
|
||||
- espn: UTC - ISO 8601 format with "Z" suffix
|
||||
- baseball_reference: Eastern Time (ET) - times displayed as "7:05 PM ET"
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
|
||||
@@ -143,7 +149,22 @@ class MLBScraper(BaseScraper):
|
||||
continue
|
||||
|
||||
try:
|
||||
game = self._parse_br_game(elem, current_date, source_url)
|
||||
# Extract game time from the element if present
|
||||
# Baseball-Reference may have time in a span or in the text
|
||||
game_time_for_row = None
|
||||
time_elem = elem.find("span", class_="game_time")
|
||||
if time_elem:
|
||||
time_text = time_elem.get_text(strip=True)
|
||||
if time_text:
|
||||
try:
|
||||
# Parse time like "7:05 PM ET" or "1:10 PM"
|
||||
# Remove timezone suffix if present
|
||||
time_clean = time_text.replace(" ET", "").replace(" PT", "").replace(" CT", "").replace(" MT", "").strip()
|
||||
game_time_for_row = datetime.strptime(time_clean, "%I:%M %p")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
game = self._parse_br_game(elem, current_date, source_url, game_time_for_row)
|
||||
if game:
|
||||
games.append(game)
|
||||
except Exception as e:
|
||||
@@ -157,6 +178,7 @@ class MLBScraper(BaseScraper):
|
||||
elem,
|
||||
game_date: datetime,
|
||||
source_url: str,
|
||||
game_time: Optional[datetime] = None,
|
||||
) -> Optional[RawGameData]:
|
||||
"""Parse a single Baseball-Reference game element."""
|
||||
text = elem.get_text(" ", strip=True)
|
||||
@@ -206,8 +228,17 @@ class MLBScraper(BaseScraper):
|
||||
# Third link might be stadium
|
||||
stadium = links[2].get_text(strip=True)
|
||||
|
||||
# Combine date and time if time was provided, with ET timezone (Baseball-Reference uses ET)
|
||||
final_game_date = game_date
|
||||
if game_time:
|
||||
final_game_date = game_date.replace(
|
||||
hour=game_time.hour,
|
||||
minute=game_time.minute,
|
||||
tzinfo=ZoneInfo("America/New_York"),
|
||||
)
|
||||
|
||||
return RawGameData(
|
||||
game_date=game_date,
|
||||
game_date=final_game_date,
|
||||
home_team_raw=home_team,
|
||||
away_team_raw=away_team,
|
||||
stadium_raw=stadium,
|
||||
@@ -672,6 +703,7 @@ class MLBScraper(BaseScraper):
|
||||
country=info.country,
|
||||
latitude=info.latitude,
|
||||
longitude=info.longitude,
|
||||
timezone=info.timezone,
|
||||
surface="grass", # Most MLB stadiums
|
||||
roof_type="open", # Most MLB stadiums
|
||||
)
|
||||
|
||||
@@ -27,7 +27,11 @@ class MLSScraper(BaseScraper):
|
||||
|
||||
Sources (in priority order):
|
||||
1. ESPN API - Most reliable for MLS
|
||||
2. FBref - Backup option
|
||||
2. FBref - Backup option (not implemented)
|
||||
|
||||
Source Timezones:
|
||||
- espn: UTC - ISO 8601 format with "Z" suffix
|
||||
- fbref: Not implemented
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
|
||||
@@ -387,6 +391,7 @@ class MLSScraper(BaseScraper):
|
||||
country=info.country,
|
||||
latitude=info.latitude,
|
||||
longitude=info.longitude,
|
||||
timezone=info.timezone,
|
||||
surface="grass",
|
||||
roof_type="open",
|
||||
)
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from datetime import datetime, date, timezone
|
||||
from typing import Optional
|
||||
from zoneinfo import ZoneInfo
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
|
||||
@@ -45,7 +46,12 @@ class NBAScraper(BaseScraper):
|
||||
Sources (in priority order):
|
||||
1. Basketball-Reference - Most reliable, complete historical data
|
||||
2. ESPN API - Good for current/future seasons
|
||||
3. CBS Sports - Backup option
|
||||
3. CBS Sports - Backup option (not implemented)
|
||||
|
||||
Source Timezones:
|
||||
- basketball_reference: Eastern Time (ET) - times displayed as "7:30p"
|
||||
- espn: UTC - ISO 8601 format with "Z" suffix
|
||||
- cbs: Not implemented
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
|
||||
@@ -196,6 +202,25 @@ class NBAScraper(BaseScraper):
|
||||
self._logger.debug(f"Could not parse date: {date_text}")
|
||||
return None
|
||||
|
||||
# Get game start time (format: "7:30p" or "10:00p") - times are in ET
|
||||
time_cell = row.find("td", {"data-stat": "game_start_time"})
|
||||
if time_cell:
|
||||
time_text = time_cell.get_text(strip=True)
|
||||
if time_text:
|
||||
try:
|
||||
# Parse time like "7:30p" or "10:00p"
|
||||
# Normalize: "7:30p" -> "7:30 PM", "10:00p" -> "10:00 PM"
|
||||
time_normalized = time_text.replace("p", " PM").replace("a", " AM")
|
||||
game_time = datetime.strptime(time_normalized, "%I:%M %p")
|
||||
# Combine date and time with ET timezone (Basketball-Reference uses ET)
|
||||
game_date = game_date.replace(
|
||||
hour=game_time.hour,
|
||||
minute=game_time.minute,
|
||||
tzinfo=ZoneInfo("America/New_York"),
|
||||
)
|
||||
except ValueError:
|
||||
self._logger.debug(f"Could not parse time: {time_text}, using midnight")
|
||||
|
||||
# Get teams
|
||||
away_cell = row.find("td", {"data-stat": "visitor_team_name"})
|
||||
home_cell = row.find("td", {"data-stat": "home_team_name"})
|
||||
@@ -648,6 +673,7 @@ class NBAScraper(BaseScraper):
|
||||
country=info.country,
|
||||
latitude=info.latitude,
|
||||
longitude=info.longitude,
|
||||
timezone=info.timezone,
|
||||
surface="hardwood",
|
||||
roof_type="dome",
|
||||
)
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from datetime import datetime, date
|
||||
from typing import Optional
|
||||
from zoneinfo import ZoneInfo
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
@@ -33,7 +34,12 @@ class NFLScraper(BaseScraper):
|
||||
Sources (in priority order):
|
||||
1. ESPN API - Most reliable for NFL
|
||||
2. Pro-Football-Reference - Complete historical data
|
||||
3. CBS Sports - Backup option
|
||||
3. CBS Sports - Backup option (not implemented)
|
||||
|
||||
Source Timezones:
|
||||
- espn: UTC - ISO 8601 format with "Z" suffix
|
||||
- pro_football_reference: Eastern Time (ET) - times displayed as "8:20PM"
|
||||
- cbs: Not implemented
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
|
||||
@@ -56,7 +62,7 @@ class NFLScraper(BaseScraper):
|
||||
if source == "espn":
|
||||
week = kwargs.get("week", 1)
|
||||
season_type = kwargs.get("season_type", 2) # 1=preseason, 2=regular, 3=postseason
|
||||
return f"https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?seasontype={season_type}&week={week}"
|
||||
return f"https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?season={self.season}&seasontype={season_type}&week={week}"
|
||||
|
||||
elif source == "pro_football_reference":
|
||||
return f"https://www.pro-football-reference.com/years/{self.season}/games.htm"
|
||||
@@ -323,6 +329,25 @@ class NFLScraper(BaseScraper):
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
# Get game start time (format: "8:20PM" or "1:00PM") - times are in ET
|
||||
time_cell = row.find("td", {"data-stat": "gametime"})
|
||||
if time_cell:
|
||||
time_text = time_cell.get_text(strip=True)
|
||||
if time_text:
|
||||
try:
|
||||
# Parse time like "8:20PM" or "1:00PM"
|
||||
# Normalize: "8:20PM" -> "8:20 PM"
|
||||
time_normalized = time_text.replace("PM", " PM").replace("AM", " AM")
|
||||
game_time = datetime.strptime(time_normalized, "%I:%M %p")
|
||||
# Combine date and time with ET timezone (Pro-Football-Reference uses ET)
|
||||
game_date = game_date.replace(
|
||||
hour=game_time.hour,
|
||||
minute=game_time.minute,
|
||||
tzinfo=ZoneInfo("America/New_York"),
|
||||
)
|
||||
except ValueError:
|
||||
self._logger.debug(f"Could not parse time: {time_text}, using midnight")
|
||||
|
||||
# Get teams
|
||||
winner_cell = row.find("td", {"data-stat": "winner"})
|
||||
loser_cell = row.find("td", {"data-stat": "loser"})
|
||||
@@ -566,6 +591,7 @@ class NFLScraper(BaseScraper):
|
||||
country=info.country,
|
||||
latitude=info.latitude,
|
||||
longitude=info.longitude,
|
||||
timezone=info.timezone,
|
||||
surface="turf", # Many NFL stadiums
|
||||
roof_type="open", # Most outdoor
|
||||
)
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from datetime import datetime, date
|
||||
from typing import Optional
|
||||
from zoneinfo import ZoneInfo
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
@@ -40,6 +41,11 @@ class NHLScraper(BaseScraper):
|
||||
1. Hockey-Reference - Most reliable for NHL
|
||||
2. NHL API - Official NHL data
|
||||
3. ESPN API - Backup option
|
||||
|
||||
Source Timezones:
|
||||
- hockey_reference: Eastern Time (ET) - times displayed as "7:00p"
|
||||
- nhl_api: UTC - ISO 8601 format with "Z" suffix (startTimeUTC field)
|
||||
- espn: UTC - ISO 8601 format with "Z" suffix
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
|
||||
@@ -158,6 +164,25 @@ class NHLScraper(BaseScraper):
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
# Get game start time (format: "7:00p" or "10:30p") - times are in ET
|
||||
time_cell = row.find("td", {"data-stat": "time_game"})
|
||||
if time_cell:
|
||||
time_text = time_cell.get_text(strip=True)
|
||||
if time_text:
|
||||
try:
|
||||
# Parse time like "7:00p" or "10:30p"
|
||||
# Normalize: "7:00p" -> "7:00 PM", "10:30p" -> "10:30 PM"
|
||||
time_normalized = time_text.replace("p", " PM").replace("a", " AM")
|
||||
game_time = datetime.strptime(time_normalized, "%I:%M %p")
|
||||
# Combine date and time with ET timezone (Hockey-Reference uses ET)
|
||||
game_date = game_date.replace(
|
||||
hour=game_time.hour,
|
||||
minute=game_time.minute,
|
||||
tzinfo=ZoneInfo("America/New_York"),
|
||||
)
|
||||
except ValueError:
|
||||
self._logger.debug(f"Could not parse time: {time_text}, using midnight")
|
||||
|
||||
# Get teams
|
||||
visitor_cell = row.find("td", {"data-stat": "visitor_team_name"})
|
||||
home_cell = row.find("td", {"data-stat": "home_team_name"})
|
||||
@@ -644,6 +669,7 @@ class NHLScraper(BaseScraper):
|
||||
country=info.country,
|
||||
latitude=info.latitude,
|
||||
longitude=info.longitude,
|
||||
timezone=info.timezone,
|
||||
surface="ice",
|
||||
roof_type="dome",
|
||||
)
|
||||
|
||||
@@ -27,7 +27,9 @@ class NWSLScraper(BaseScraper):
|
||||
|
||||
Sources (in priority order):
|
||||
1. ESPN API - Most reliable for NWSL
|
||||
2. NWSL official (via ESPN) - Backup option
|
||||
|
||||
Source Timezones:
|
||||
- espn: UTC - ISO 8601 format with "Z" suffix
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
|
||||
@@ -361,6 +363,7 @@ class NWSLScraper(BaseScraper):
|
||||
country=info.country,
|
||||
latitude=info.latitude,
|
||||
longitude=info.longitude,
|
||||
timezone=info.timezone,
|
||||
surface="grass",
|
||||
roof_type="open",
|
||||
)
|
||||
|
||||
@@ -27,7 +27,9 @@ class WNBAScraper(BaseScraper):
|
||||
|
||||
Sources (in priority order):
|
||||
1. ESPN API - Most reliable for WNBA
|
||||
2. WNBA official (via ESPN) - Backup option
|
||||
|
||||
Source Timezones:
|
||||
- espn: UTC - ISO 8601 format with "Z" suffix
|
||||
"""
|
||||
|
||||
def __init__(self, season: int, **kwargs):
|
||||
@@ -362,6 +364,7 @@ class WNBAScraper(BaseScraper):
|
||||
country=info.country,
|
||||
latitude=info.latitude,
|
||||
longitude=info.longitude,
|
||||
timezone=info.timezone,
|
||||
surface="hardwood",
|
||||
roof_type="dome",
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user