feat: add Django web app, CloudKit sync, dashboard, and game_datetime_utc export

Adds the full Django application layer on top of sportstime_parser:
- core: Sport, Team, Stadium, Game models with aliases and league structure
- scraper: orchestration engine, adapter, job management, Celery tasks
- cloudkit: CloudKit sync client, sync state tracking, sync jobs
- dashboard: staff dashboard for monitoring scrapers, sync, review queue
- notifications: email reports for scrape/sync results
- Docker setup for deployment (Dockerfile, docker-compose, entrypoint)

Game exports now use game_datetime_utc (ISO 8601 UTC) instead of
venue-local date+time strings, matching the canonical format used
by the iOS app.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-02-19 14:04:27 -06:00
parent 4353d5943c
commit 63acf7accb
114 changed files with 13070 additions and 887 deletions

View File

@@ -0,0 +1 @@
# Scraper engine package

496
scraper/engine/adapter.py Normal file
View File

@@ -0,0 +1,496 @@
"""
Adapter to bridge existing sportstime_parser scrapers with Django models.
"""
import hashlib
from datetime import datetime
from typing import Callable, Optional
from django.db import transaction
from django.utils import timezone
class ScraperAdapter:
    """Bridge the existing sportstime_parser scrapers to the Django models.

    One adapter instance handles a single sport/season pair. ``run()`` creates
    the matching parser scraper, scrapes everything, and upserts the scraped
    stadiums, teams and games. New (and, for games, changed) records are
    flagged as pending in ``CloudKitSyncState``; review items produced by the
    parser are stored as ``ManualReviewItem`` rows.
    """

    # Maps the parser's review reasons (plus legacy free-text values) to the
    # reason choices stored on ManualReviewItem. Anything not listed here is
    # stored as 'no_match' (see _map_reason).
    _REASON_MAP = {
        'unresolved_team': 'no_match',
        'unresolved_stadium': 'no_match',
        'low_confidence_match': 'low_confidence',
        'missing_data': 'no_match',
        'duplicate_game': 'ambiguous',
        'timezone_unknown': 'no_match',
        'geographic_filter': 'no_match',
        # Legacy mappings
        'no_match': 'no_match',
        'no match found': 'no_match',
        'low_confidence': 'low_confidence',
        'fuzzy match below threshold': 'low_confidence',
        'ambiguous': 'ambiguous',
        'new_entity': 'new_entity',
    }

    def __init__(
        self,
        sport_code: str,
        season: int,
        config,
        log_func: Optional[Callable] = None,
    ):
        """Store the scrape parameters.

        Args:
            sport_code: Sport code used to look up the Sport row and to pick
                the parser scraper (e.g. 'nba').
            season: Season passed to the parser scraper factory.
            config: Scraper configuration object; used to find the running
                ScrapeJob when review items are created.
            log_func: Callable invoked as ``log_func(level, msg, **kw)``.
                Defaults to a no-op.
        """
        self.sport_code = sport_code
        self.season = season
        self.config = config
        self.log = log_func or (lambda level, msg, **kw: None)

    def run(self) -> dict:
        """Run the scraper and persist its results.

        Returns:
            Dict of counters: games_found, games_new, games_updated,
            games_unchanged, games_errors, teams_found, stadiums_found and
            review_items.

        Raises:
            ValueError: If the sport is not in the database or no parser
                scraper exists for the sport code.
        """
        from core.models import Sport

        result = {
            'games_found': 0,
            'games_new': 0,
            'games_updated': 0,
            'games_unchanged': 0,
            'games_errors': 0,
            'teams_found': 0,
            'stadiums_found': 0,
            'review_items': 0,
        }

        # Get sport
        try:
            sport = Sport.objects.get(code=self.sport_code)
        except Sport.DoesNotExist:
            raise ValueError(f"Sport {self.sport_code} not found in database")

        self.log('info', f'Starting scraper for {sport.short_name} {self.season}', source='adapter')

        # Import and create the appropriate scraper
        scraper = self._create_scraper()

        # Run the scrape
        self.log('info', 'Scraping games...', source='adapter')
        raw_result = scraper.scrape_all()

        # Process stadiums first (teams reference stadiums via home_stadium FK)
        self.log('info', f'Processing {len(raw_result.stadiums)} stadiums...', source='adapter')
        result['stadiums_found'] = len(raw_result.stadiums)
        self._process_stadiums(sport, raw_result.stadiums)

        # Process teams
        self.log('info', f'Processing {len(raw_result.teams)} teams...', source='adapter')
        result['teams_found'] = len(raw_result.teams)
        self._process_teams(sport, raw_result.teams)

        # Process games
        self.log('info', f'Processing {len(raw_result.games)} games...', source='adapter')
        result.update(self._process_games(sport, raw_result.games))

        # Process review items
        if raw_result.review_items:
            self.log('info', f'Creating {len(raw_result.review_items)} review items...', source='adapter')
            result['review_items'] = self._process_review_items(sport, raw_result.review_items)

        self.log('info', f'Scrape complete: {result}', source='adapter')
        return result

    def _create_scraper(self):
        """Create the parser scraper for ``self.sport_code``.

        Raises:
            ValueError: If no scraper factory is registered for the sport code.
        """
        # Import from existing sportstime_parser
        from sportstime_parser.scrapers import (
            create_nba_scraper,
            create_mlb_scraper,
            create_nfl_scraper,
            create_nhl_scraper,
            create_mls_scraper,
            create_wnba_scraper,
            create_nwsl_scraper,
        )

        scrapers = {
            'nba': create_nba_scraper,
            'mlb': create_mlb_scraper,
            'nfl': create_nfl_scraper,
            'nhl': create_nhl_scraper,
            'mls': create_mls_scraper,
            'wnba': create_wnba_scraper,
            'nwsl': create_nwsl_scraper,
        }
        creator = scrapers.get(self.sport_code)
        if not creator:
            raise ValueError(f"No scraper for sport: {self.sport_code}")

        # Create scraper (config overrides handled via session/resolver settings if needed)
        return creator(season=self.season)

    def _process_teams(self, sport, teams):
        """Upsert scraped teams and flag newly created ones for CloudKit sync.

        Args:
            sport: Sport model instance the teams belong to.
            teams: Iterable of parser team objects.
        """
        from core.models import Team, Stadium, Division
        from cloudkit.models import CloudKitSyncState

        for team_data in teams:
            team_id = team_data.id

            # Find division if available: exact (case-insensitive) name first,
            # then fall back to a substring match.
            division = None
            if team_data.division:
                sport_divisions = Division.objects.filter(conference__sport=sport)
                division = sport_divisions.filter(name__iexact=team_data.division).first()
                if not division:
                    division = sport_divisions.filter(name__icontains=team_data.division).first()

            # Resolve home stadium if available
            home_stadium = None
            stadium_id = getattr(team_data, 'stadium_id', None)
            if stadium_id:
                home_stadium = Stadium.objects.filter(id=stadium_id).first()

            # Keep the team row and its sync marker in one transaction: a team
            # saved without its marker would not be "created" on the next run
            # and so would never be queued for sync.
            with transaction.atomic():
                _, created = Team.objects.update_or_create(
                    id=team_id,
                    defaults={
                        'sport': sport,
                        'division': division,
                        'city': team_data.city,
                        'name': team_data.name,
                        'full_name': team_data.full_name,
                        'abbreviation': team_data.abbreviation,
                        'home_stadium': home_stadium,
                        'primary_color': getattr(team_data, 'primary_color', '') or '',
                        'secondary_color': getattr(team_data, 'secondary_color', '') or '',
                        'logo_url': getattr(team_data, 'logo_url', '') or '',
                    },
                )
                # Mark for sync
                if created:
                    CloudKitSyncState.objects.get_or_create(
                        record_type='Team',
                        record_id=team_id,
                        defaults={'sync_status': 'pending'},
                    )

    def _process_stadiums(self, sport, stadiums):
        """Upsert scraped stadiums and flag newly created ones for CloudKit sync.

        Args:
            sport: Sport model instance the stadiums belong to.
            stadiums: Iterable of parser stadium objects.
        """
        from core.models import Stadium
        from cloudkit.models import CloudKitSyncState

        for stadium_data in stadiums:
            stadium_id = stadium_data.id

            # Same reasoning as for teams: the row and its sync marker must
            # be written together or not at all.
            with transaction.atomic():
                _, created = Stadium.objects.update_or_create(
                    id=stadium_id,
                    defaults={
                        'sport': sport,
                        'name': stadium_data.name,
                        'city': stadium_data.city,
                        'state': getattr(stadium_data, 'state', '') or '',
                        'country': getattr(stadium_data, 'country', 'USA'),
                        'latitude': getattr(stadium_data, 'latitude', None),
                        'longitude': getattr(stadium_data, 'longitude', None),
                        'capacity': getattr(stadium_data, 'capacity', None),
                        'surface': getattr(stadium_data, 'surface', '') or '',
                        'roof_type': getattr(stadium_data, 'roof_type', '') or '',
                        'opened_year': getattr(stadium_data, 'opened_year', None),
                        'timezone': getattr(stadium_data, 'timezone', '') or '',
                        'image_url': getattr(stadium_data, 'image_url', '') or '',
                    },
                )
                if created:
                    CloudKitSyncState.objects.get_or_create(
                        record_type='Stadium',
                        record_id=stadium_id,
                        defaults={'sync_status': 'pending'},
                    )

    def _resolve_team_via_db_alias(self, sport, raw_name, check_date=None):
        """Try to resolve a team name using database aliases.

        Args:
            sport: Sport model instance
            raw_name: Raw team name from scraper
            check_date: Date for alias validity check (defaults to today)

        Returns:
            Team instance if found, None otherwise
        """
        # Strip before the emptiness check so a whitespace-only name cannot
        # reach the lookups as '' and match a row with an empty field.
        name = raw_name.strip() if raw_name else ''
        if not name:
            return None

        from datetime import date
        from core.models import Team, TeamAlias

        check_date = check_date or date.today()

        # Check TeamAlias model
        aliases = TeamAlias.objects.filter(
            alias__iexact=name,
            team__sport=sport,
        ).select_related('team')
        for alias in aliases:
            if alias.is_valid_for_date(check_date):
                return alias.team

        # Fall back to an exact (case-insensitive) match on full name, then city
        team = Team.objects.filter(sport=sport, full_name__iexact=name).first()
        if team:
            return team
        team = Team.objects.filter(sport=sport, city__iexact=name).first()
        if team:
            return team
        return None

    def _resolve_stadium_via_db_alias(self, sport, raw_name, check_date=None):
        """Try to resolve a stadium name using database aliases.

        Args:
            sport: Sport model instance
            raw_name: Raw stadium name from scraper
            check_date: Date for alias validity check (defaults to today)

        Returns:
            Stadium instance if found, None otherwise
        """
        # Strip before the emptiness check (see _resolve_team_via_db_alias).
        name = raw_name.strip() if raw_name else ''
        if not name:
            return None

        from datetime import date
        from core.models import Stadium, StadiumAlias

        check_date = check_date or date.today()

        # Check StadiumAlias model
        aliases = StadiumAlias.objects.filter(
            alias__iexact=name,
            stadium__sport=sport,
        ).select_related('stadium')
        for alias in aliases:
            if alias.is_valid_for_date(check_date):
                return alias.stadium

        # Also try direct matching on stadium name
        stadium = Stadium.objects.filter(sport=sport, name__iexact=name).first()
        if stadium:
            return stadium
        return None

    def _resolve_game_team(self, sport, team_id, raw_name, check_date, side):
        """Look up one side of a game by team ID, falling back to DB aliases.

        Args:
            sport: Sport model instance
            team_id: Canonical team ID produced by the parser
            raw_name: Raw team name from the scraper (may be None/empty)
            check_date: Date for alias validity check
            side: 'home' or 'away'; only used in the log message

        Returns:
            Team instance if found, None otherwise
        """
        from core.models import Team

        try:
            return Team.objects.get(id=team_id)
        except Team.DoesNotExist:
            pass

        # Try resolving via database alias using raw name
        if not raw_name:
            return None
        team = self._resolve_team_via_db_alias(sport, raw_name, check_date)
        if team:
            self.log(
                'info',
                f'Resolved {side} team via DB alias: {raw_name} -> {team.abbreviation}',
                source='adapter',
            )
        return team

    @staticmethod
    def _game_has_changes(existing, game_defaults) -> bool:
        """Return True if any value in ``game_defaults`` differs from ``existing``.

        Fields for which ``existing`` has a ``<field>_id`` attribute are
        compared by ID (against ``value.id`` when the new value has one);
        everything else is compared by plain equality.
        """
        # NOTE(review): equality is used as-is; confirm scraped game_date values
        # are comparable with stored ones (e.g. same timezone-awareness),
        # otherwise games may always be reported as updated.
        def differs(field, new_value):
            attr = f'{field}_id' if hasattr(existing, f'{field}_id') else field
            candidate = new_value.id if hasattr(new_value, 'id') else new_value
            return getattr(existing, attr) != candidate

        return any(differs(field, value) for field, value in game_defaults.items())

    def _process_games(self, sport, games):
        """Upsert scraped games and flag new/changed ones for CloudKit sync.

        Games whose home or away team cannot be resolved are skipped and
        counted as errors. Any exception raised while handling a single game
        is logged and counted; processing continues with the next game.

        Args:
            sport: Sport model instance the games belong to.
            games: Sequence of parser game objects.

        Returns:
            Dict with games_found, games_new, games_updated, games_unchanged
            and games_errors counters.
        """
        from core.models import Game, Stadium
        from cloudkit.models import CloudKitSyncState

        result = {
            'games_found': len(games),
            'games_new': 0,
            'games_updated': 0,
            'games_unchanged': 0,
            'games_errors': 0,
        }

        for game_data in games:
            try:
                game_id = game_data.id
                check_date = (
                    game_data.game_date.date()
                    if hasattr(game_data.game_date, 'date')
                    else game_data.game_date
                )

                # Get related objects - try by ID first, then by DB alias
                home_team = self._resolve_game_team(
                    sport,
                    game_data.home_team_id,
                    getattr(game_data, 'raw_home_team', None),
                    check_date,
                    'home',
                )
                away_team = self._resolve_game_team(
                    sport,
                    game_data.away_team_id,
                    getattr(game_data, 'raw_away_team', None),
                    check_date,
                    'away',
                )

                if not home_team or not away_team:
                    missing = []
                    if not home_team:
                        missing.append(f'home={game_data.home_team_id}')
                    if not away_team:
                        missing.append(f'away={game_data.away_team_id}')
                    self.log('warning', f'Team not found for game {game_id}: {", ".join(missing)}', source='adapter')
                    result['games_errors'] += 1
                    continue

                stadium = None
                if game_data.stadium_id:
                    try:
                        stadium = Stadium.objects.get(id=game_data.stadium_id)
                    except Stadium.DoesNotExist:
                        # Try resolving via database alias using raw name
                        raw_stadium = getattr(game_data, 'raw_stadium', None)
                        if raw_stadium:
                            stadium = self._resolve_stadium_via_db_alias(sport, raw_stadium, check_date)
                            if stadium:
                                self.log('info', f'Resolved stadium via DB alias: {raw_stadium} -> {stadium.name}', source='adapter')

                # Build game dict
                game_defaults = {
                    'sport': sport,
                    'season': game_data.season,
                    'home_team': home_team,
                    'away_team': away_team,
                    'stadium': stadium,
                    'game_date': game_data.game_date,
                    'game_number': getattr(game_data, 'game_number', None),
                    'home_score': game_data.home_score,
                    'away_score': game_data.away_score,
                    'status': game_data.status,
                    'raw_home_team': getattr(game_data, 'raw_home_team', '') or '',
                    'raw_away_team': getattr(game_data, 'raw_away_team', '') or '',
                    'raw_stadium': getattr(game_data, 'raw_stadium', '') or '',
                    'source_url': getattr(game_data, 'source_url', '') or '',
                }

                # The game write and its sync marker commit together. If the
                # marker failed after the game was saved, the next run would
                # see the game as unchanged and never queue it for sync. The
                # atomic block sits inside the try so a database error rolls
                # back cleanly before the except handler runs.
                with transaction.atomic():
                    try:
                        existing = Game.objects.get(id=game_id)
                    except Game.DoesNotExist:
                        existing = None

                    if existing is None:
                        # Create new game
                        Game.objects.create(id=game_id, **game_defaults)
                        CloudKitSyncState.objects.get_or_create(
                            record_type='Game',
                            record_id=game_id,
                            defaults={'sync_status': 'pending'},
                        )
                        outcome = 'games_new'
                    elif self._game_has_changes(existing, game_defaults):
                        for field, value in game_defaults.items():
                            setattr(existing, field, value)
                        existing.save()
                        # Mark for sync (resets the status on an existing marker)
                        CloudKitSyncState.objects.update_or_create(
                            record_type='Game',
                            record_id=game_id,
                            defaults={'sync_status': 'pending'},
                        )
                        outcome = 'games_updated'
                    else:
                        outcome = 'games_unchanged'

                # Count only once the writes have gone through, so a failed
                # game is never reported as both new/updated and errored.
                result[outcome] += 1
            except Exception as e:
                # Per-game boundary: log and keep going with the next game.
                game_ref = getattr(game_data, 'id', 'unknown')
                self.log('error', f'Error processing game {game_ref}: {e}', source='adapter')
                result['games_errors'] += 1

        return result

    def _process_review_items(self, sport, review_items):
        """Create a ManualReviewItem for each review item from the parser.

        Items are attached to the most recently created running ScrapeJob for
        ``self.config`` (``job`` is None when there is no such job).

        Args:
            sport: Sport model instance.
            review_items: Iterable of parser review items.

        Returns:
            Number of review items created.
        """
        from scraper.models import ManualReviewItem, ScrapeJob

        # Get current job
        job = ScrapeJob.objects.filter(
            config=self.config,
            status='running',
        ).order_by('-created_at').first()

        count = 0
        for item in review_items:
            # Get suggested match info (parser uses suggested_matches list)
            suggested_id = ''
            confidence = 0.0
            if item.suggested_matches:
                best_match = item.suggested_matches[0]
                suggested_id = best_match.canonical_id
                confidence = best_match.confidence / 100.0  # Convert to 0-1 range

            ManualReviewItem.objects.create(
                job=job,
                # Derive item_type from reason
                item_type=self._get_item_type_from_reason(item.reason),
                sport=sport,
                raw_value=item.raw_value,
                suggested_id=suggested_id,
                confidence=confidence,
                reason=self._map_reason(item.reason),
                source_url=item.source_url or '',
                check_date=item.game_date,
                context=item.context if item.context else None,
            )
            count += 1
        return count

    @staticmethod
    def _normalize_reason(reason) -> str:
        """Return the lower-cased string form of a review reason.

        ReviewReason members contribute their ``value``; any other object is
        converted with ``str()``.
        """
        from sportstime_parser.models.aliases import ReviewReason

        raw = reason.value if isinstance(reason, ReviewReason) else reason
        return str(raw).lower()

    def _get_item_type_from_reason(self, reason) -> str:
        """Derive the item type ('team' or 'stadium') from a review reason."""
        reason_value = self._normalize_reason(reason)
        if 'team' not in reason_value and 'stadium' in reason_value:
            return 'stadium'
        # 'team' reasons, and the default for everything else
        return 'team'

    def _map_reason(self, reason) -> str:
        """Map a scraper review reason to the model's reason choice.

        Unknown reasons map to 'no_match'.
        """
        return self._REASON_MAP.get(self._normalize_reason(reason), 'no_match')

View File

@@ -0,0 +1,144 @@
"""Database-aware alias loaders for team and stadium resolution.
These loaders check the Django TeamAlias and StadiumAlias models
in addition to the hardcoded mappings, allowing aliases to be
managed via the admin interface.
"""
from datetime import date
from typing import Optional
class DatabaseTeamAliasLoader:
    """Load team aliases from the Django database.

    Checks the core.TeamAlias model for alias mappings,
    supporting date-aware lookups for historical names.
    """

    def resolve(
        self,
        value: str,
        sport_code: str,
        check_date: Optional[date] = None,
    ) -> Optional[str]:
        """Resolve an alias value to a canonical team ID.

        Args:
            value: Alias value to look up (case-insensitive)
            sport_code: Sport code to filter by
            check_date: Date to check validity (None = current date)

        Returns:
            Canonical team ID if found, None otherwise (including when
            ``value`` is None, empty or whitespace-only)
        """
        # Nothing to look up: avoid calling str methods on None and avoid
        # matching an alias row whose text is empty.
        value_lower = value.lower().strip() if value else ''
        if not value_lower:
            return None

        from core.models import TeamAlias

        if check_date is None:
            check_date = date.today()

        # Query aliases matching the value and sport
        aliases = TeamAlias.objects.filter(
            alias__iexact=value_lower,
            team__sport__code=sport_code,
        ).select_related('team')

        # Return the first alias that is valid on the requested date
        for alias in aliases:
            if alias.is_valid_for_date(check_date):
                return alias.team.id
        return None

    def get_aliases_for_team(
        self,
        team_id: str,
        check_date: Optional[date] = None,
    ) -> list:
        """Get all aliases for a team.

        Args:
            team_id: Team ID
            check_date: Date to filter by (None = all aliases)

        Returns:
            List of TeamAlias objects
        """
        from core.models import TeamAlias

        aliases = TeamAlias.objects.filter(team_id=team_id)
        if check_date:
            return [alias for alias in aliases if alias.is_valid_for_date(check_date)]
        return list(aliases)
class DatabaseStadiumAliasLoader:
    """Load stadium aliases from the Django database.

    Checks the core.StadiumAlias model for alias mappings,
    supporting date-aware lookups for naming rights changes.
    """

    def resolve(
        self,
        name: str,
        sport_code: str,
        check_date: Optional[date] = None,
    ) -> Optional[str]:
        """Resolve a stadium name to a canonical stadium ID.

        Args:
            name: Stadium name to look up (case-insensitive)
            sport_code: Sport code to filter by
            check_date: Date to check validity (None = current date)

        Returns:
            Canonical stadium ID if found, None otherwise (including when
            ``name`` is None, empty or whitespace-only)
        """
        # Nothing to look up: avoid calling str methods on None and avoid
        # matching an alias row whose text is empty.
        name_lower = name.lower().strip() if name else ''
        if not name_lower:
            return None

        from core.models import StadiumAlias

        if check_date is None:
            check_date = date.today()

        # Query aliases matching the name and sport
        aliases = StadiumAlias.objects.filter(
            alias__iexact=name_lower,
            stadium__sport__code=sport_code,
        ).select_related('stadium')

        # Return the first alias that is valid on the requested date
        for alias in aliases:
            if alias.is_valid_for_date(check_date):
                return alias.stadium.id
        return None
# Global instances: module-level singletons, left as None until the
# get_db_*_alias_loader() accessors create them on first use.
_db_team_loader: Optional[DatabaseTeamAliasLoader] = None
_db_stadium_loader: Optional[DatabaseStadiumAliasLoader] = None
def get_db_team_alias_loader() -> DatabaseTeamAliasLoader:
    """Return the shared DatabaseTeamAliasLoader, creating it on first use."""
    global _db_team_loader
    if _db_team_loader is not None:
        return _db_team_loader
    # First call: build the instance and remember it for later callers.
    _db_team_loader = DatabaseTeamAliasLoader()
    return _db_team_loader
def get_db_stadium_alias_loader() -> DatabaseStadiumAliasLoader:
    """Return the shared DatabaseStadiumAliasLoader, creating it on first use."""
    global _db_stadium_loader
    if _db_stadium_loader is not None:
        return _db_stadium_loader
    # First call: build the instance and remember it for later callers.
    _db_stadium_loader = DatabaseStadiumAliasLoader()
    return _db_stadium_loader