feat: add Django web app, CloudKit sync, dashboard, and game_datetime_utc export

Adds the full Django application layer on top of sportstime_parser:
- core: Sport, Team, Stadium, Game models with aliases and league structure
- scraper: orchestration engine, adapter, job management, Celery tasks
- cloudkit: CloudKit sync client, sync state tracking, sync jobs
- dashboard: staff dashboard for monitoring scrapers, sync, review queue
- notifications: email reports for scrape/sync results
- Docker setup for deployment (Dockerfile, docker-compose, entrypoint)

Game exports now use game_datetime_utc (ISO 8601 UTC) instead of
venue-local date+time strings, matching the canonical format used
by the iOS app.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-02-19 14:04:27 -06:00
parent 4353d5943c
commit 63acf7accb
114 changed files with 13070 additions and 887 deletions

496
scraper/engine/adapter.py Normal file
View File

@@ -0,0 +1,496 @@
"""
Adapter to bridge existing sportstime_parser scrapers with Django models.
"""
import hashlib
from datetime import datetime
from typing import Callable, Optional
from django.db import transaction
from django.utils import timezone
class ScraperAdapter:
    """
    Adapts the existing sportstime_parser scrapers to work with Django models.

    One adapter instance handles a single sport/season pair. ``run()`` executes
    the matching sportstime_parser scraper, upserts the scraped stadiums, teams
    and games into the Django models, creates ``CloudKitSyncState`` rows for
    records that need syncing, and stores manual-review items.
    """

    # Maps a normalised (lower-cased) scraper review reason to the value
    # stored on ManualReviewItem.reason. Anything not listed is 'no_match'.
    _REASON_MAP = {
        'unresolved_team': 'no_match',
        'unresolved_stadium': 'no_match',
        'low_confidence_match': 'low_confidence',
        'missing_data': 'no_match',
        'duplicate_game': 'ambiguous',
        'timezone_unknown': 'no_match',
        'geographic_filter': 'no_match',
        # Legacy mappings
        'no_match': 'no_match',
        'no match found': 'no_match',
        'low_confidence': 'low_confidence',
        'fuzzy match below threshold': 'low_confidence',
        'ambiguous': 'ambiguous',
        'new_entity': 'new_entity',
    }

    def __init__(
        self,
        sport_code: str,
        season: int,
        config,
        log_func: Optional[Callable] = None,
    ):
        """
        Args:
            sport_code: Sport code; looked up as ``Sport.code`` and used to
                pick the scraper factory (e.g. ``'nba'``).
            season: Season passed to the scraper factory.
            config: Scraper configuration object; only used here to find the
                running ``ScrapeJob`` when review items are stored.
            log_func: Optional callable invoked as
                ``log_func(level, msg, **kwargs)``. Defaults to a no-op.
        """
        self.sport_code = sport_code
        self.season = season
        self.config = config
        self.log = log_func or (lambda level, msg, **kw: None)

    def run(self) -> dict:
        """
        Run the scraper and return results.

        Returns:
            Dict of counters: ``games_found``, ``games_new``,
            ``games_updated``, ``games_unchanged``, ``games_errors``,
            ``teams_found``, ``stadiums_found`` and ``review_items``.

        Raises:
            ValueError: If no ``Sport`` row matches ``sport_code`` or no
                scraper exists for it.
        """
        from core.models import Sport

        result = {
            'games_found': 0,
            'games_new': 0,
            'games_updated': 0,
            'games_unchanged': 0,
            'games_errors': 0,
            'teams_found': 0,
            'stadiums_found': 0,
            'review_items': 0,
        }

        # Get sport
        try:
            sport = Sport.objects.get(code=self.sport_code)
        except Sport.DoesNotExist as exc:
            raise ValueError(f"Sport {self.sport_code} not found in database") from exc

        self.log('info', f'Starting scraper for {sport.short_name} {self.season}', source='adapter')

        # Import and create the appropriate scraper
        scraper = self._create_scraper()

        # Run the scrape
        self.log('info', 'Scraping games...', source='adapter')
        raw_result = scraper.scrape_all()

        # Process stadiums first (teams reference stadiums via home_stadium FK)
        self.log('info', f'Processing {len(raw_result.stadiums)} stadiums...', source='adapter')
        result['stadiums_found'] = len(raw_result.stadiums)
        self._process_stadiums(sport, raw_result.stadiums)

        # Process teams
        self.log('info', f'Processing {len(raw_result.teams)} teams...', source='adapter')
        result['teams_found'] = len(raw_result.teams)
        self._process_teams(sport, raw_result.teams)

        # Process games (overwrites the games_* counters initialised above)
        self.log('info', f'Processing {len(raw_result.games)} games...', source='adapter')
        result.update(self._process_games(sport, raw_result.games))

        # Process review items
        if raw_result.review_items:
            self.log('info', f'Creating {len(raw_result.review_items)} review items...', source='adapter')
            result['review_items'] = self._process_review_items(sport, raw_result.review_items)

        self.log('info', f'Scrape complete: {result}', source='adapter')
        return result

    def _create_scraper(self):
        """Create the appropriate scraper instance.

        Raises:
            ValueError: If ``sport_code`` has no registered scraper factory.
        """
        # Import from existing sportstime_parser
        from sportstime_parser.scrapers import (
            create_nba_scraper,
            create_mlb_scraper,
            create_nfl_scraper,
            create_nhl_scraper,
            create_mls_scraper,
            create_wnba_scraper,
            create_nwsl_scraper,
        )

        scrapers = {
            'nba': create_nba_scraper,
            'mlb': create_mlb_scraper,
            'nfl': create_nfl_scraper,
            'nhl': create_nhl_scraper,
            'mls': create_mls_scraper,
            'wnba': create_wnba_scraper,
            'nwsl': create_nwsl_scraper,
        }

        creator = scrapers.get(self.sport_code)
        if not creator:
            raise ValueError(f"No scraper for sport: {self.sport_code}")

        # Create scraper (config overrides handled via session/resolver settings if needed)
        return creator(season=self.season)

    def _process_teams(self, sport, teams):
        """Process and upsert teams.

        Each team is written with ``update_or_create`` keyed on the scraper's
        team id. A ``CloudKitSyncState`` row is only created for teams that
        did not exist before.
        """
        from core.models import Team, Stadium, Division
        from cloudkit.models import CloudKitSyncState

        for team_data in teams:
            team_id = team_data.id

            # Find division if available: exact (case-insensitive) name
            # first, then fall back to a substring match.
            division = None
            if team_data.division:
                division = Division.objects.filter(
                    conference__sport=sport,
                    name__iexact=team_data.division,
                ).first()
                if not division:
                    division = Division.objects.filter(
                        conference__sport=sport,
                        name__icontains=team_data.division,
                    ).first()

            # Resolve home stadium if available
            home_stadium = None
            stadium_id = getattr(team_data, 'stadium_id', None)
            if stadium_id:
                home_stadium = Stadium.objects.filter(id=stadium_id).first()

            _, created = Team.objects.update_or_create(
                id=team_id,
                defaults={
                    'sport': sport,
                    'division': division,
                    'city': team_data.city,
                    'name': team_data.name,
                    'full_name': team_data.full_name,
                    'abbreviation': team_data.abbreviation,
                    'home_stadium': home_stadium,
                    'primary_color': getattr(team_data, 'primary_color', '') or '',
                    'secondary_color': getattr(team_data, 'secondary_color', '') or '',
                    'logo_url': getattr(team_data, 'logo_url', '') or '',
                },
            )

            # Mark for sync
            if created:
                CloudKitSyncState.objects.get_or_create(
                    record_type='Team',
                    record_id=team_id,
                    defaults={'sync_status': 'pending'},
                )

    def _process_stadiums(self, sport, stadiums):
        """Process and upsert stadiums.

        Each stadium is written with ``update_or_create`` keyed on the
        scraper's stadium id. A ``CloudKitSyncState`` row is only created for
        stadiums that did not exist before.
        """
        from core.models import Stadium
        from cloudkit.models import CloudKitSyncState

        for stadium_data in stadiums:
            stadium_id = stadium_data.id

            _, created = Stadium.objects.update_or_create(
                id=stadium_id,
                defaults={
                    'sport': sport,
                    'name': stadium_data.name,
                    'city': stadium_data.city,
                    'state': getattr(stadium_data, 'state', '') or '',
                    # 'USA' only applies when the attribute is absent
                    'country': getattr(stadium_data, 'country', 'USA'),
                    'latitude': getattr(stadium_data, 'latitude', None),
                    'longitude': getattr(stadium_data, 'longitude', None),
                    'capacity': getattr(stadium_data, 'capacity', None),
                    'surface': getattr(stadium_data, 'surface', '') or '',
                    'roof_type': getattr(stadium_data, 'roof_type', '') or '',
                    'opened_year': getattr(stadium_data, 'opened_year', None),
                    'timezone': getattr(stadium_data, 'timezone', '') or '',
                    'image_url': getattr(stadium_data, 'image_url', '') or '',
                },
            )

            if created:
                CloudKitSyncState.objects.get_or_create(
                    record_type='Stadium',
                    record_id=stadium_id,
                    defaults={'sync_status': 'pending'},
                )

    def _resolve_team_via_db_alias(self, sport, raw_name, check_date=None):
        """Try to resolve a team name using database aliases.

        Lookup order: a ``TeamAlias`` valid on ``check_date``, then an exact
        (case-insensitive) ``full_name`` match, then an exact
        (case-insensitive) ``city`` match.

        Args:
            sport: Sport model instance
            raw_name: Raw team name from scraper
            check_date: Date for alias validity check (defaults to today)

        Returns:
            Team instance if found, None otherwise
        """
        from core.models import Team, TeamAlias
        from datetime import date

        if not raw_name:
            return None
        name = raw_name.strip()
        if not name:
            return None

        check_date = check_date or date.today()

        # Check TeamAlias model
        aliases = TeamAlias.objects.filter(
            alias__iexact=name,
            team__sport=sport,
        ).select_related('team')
        for alias in aliases:
            if alias.is_valid_for_date(check_date):
                return alias.team

        # Also try exact (case-insensitive) matching on team full_name
        team = Team.objects.filter(sport=sport, full_name__iexact=name).first()
        if team:
            return team

        # City fallback: a city can host several teams of the same sport, so
        # only accept the match when it is unambiguous instead of silently
        # attaching the game to an arbitrary team. Two rows are enough to
        # detect ambiguity.
        city_matches = list(Team.objects.filter(sport=sport, city__iexact=name)[:2])
        if len(city_matches) == 1:
            return city_matches[0]

        return None

    def _resolve_stadium_via_db_alias(self, sport, raw_name, check_date=None):
        """Try to resolve a stadium name using database aliases.

        Lookup order: a ``StadiumAlias`` valid on ``check_date``, then an
        exact (case-insensitive) stadium ``name`` match.

        Args:
            sport: Sport model instance
            raw_name: Raw stadium name from scraper
            check_date: Date for alias validity check (defaults to today)

        Returns:
            Stadium instance if found, None otherwise
        """
        from core.models import Stadium, StadiumAlias
        from datetime import date

        if not raw_name:
            return None
        name = raw_name.strip()
        if not name:
            return None

        check_date = check_date or date.today()

        # Check StadiumAlias model
        aliases = StadiumAlias.objects.filter(
            alias__iexact=name,
            stadium__sport=sport,
        ).select_related('stadium')
        for alias in aliases:
            if alias.is_valid_for_date(check_date):
                return alias.stadium

        # Also try direct matching on stadium name
        return Stadium.objects.filter(sport=sport, name__iexact=name).first()

    def _resolve_game_team(self, sport, game_data, side, check_date):
        """Resolve the ``side`` ('home' or 'away') team of a scraped game.

        Tries the scraper-supplied team id first, then the raw team name via
        the database aliases. Returns the Team or None.
        """
        from core.models import Team

        try:
            return Team.objects.get(id=getattr(game_data, f'{side}_team_id'))
        except Team.DoesNotExist:
            pass

        # Try resolving via database alias using raw name
        raw_name = getattr(game_data, f'raw_{side}_team', None)
        if not raw_name:
            return None
        team = self._resolve_team_via_db_alias(sport, raw_name, check_date)
        if team:
            self.log('info', f'Resolved {side} team via DB alias: {raw_name} -> {team.abbreviation}', source='adapter')
        return team

    def _resolve_game_stadium(self, sport, game_data, check_date):
        """Resolve the stadium of a scraped game, or None if it has none.

        Tries the scraper-supplied stadium id first, then the raw stadium
        name via the database aliases.
        """
        from core.models import Stadium

        if not game_data.stadium_id:
            return None
        try:
            return Stadium.objects.get(id=game_data.stadium_id)
        except Stadium.DoesNotExist:
            pass

        # Try resolving via database alias using raw name
        raw_stadium = getattr(game_data, 'raw_stadium', None)
        if not raw_stadium:
            return None
        stadium = self._resolve_stadium_via_db_alias(sport, raw_stadium, check_date)
        if stadium:
            self.log('info', f'Resolved stadium via DB alias: {raw_stadium} -> {stadium.name}', source='adapter')
        return stadium

    @staticmethod
    def _game_has_changed(existing, game_defaults) -> bool:
        """Return True if any value in ``game_defaults`` differs from ``existing``.

        When ``existing`` exposes ``<key>_id`` the comparison is done on that
        attribute against the new value's ``id``, so related objects are not
        fetched just to be compared.
        """
        def differs(key, value):
            attr = f'{key}_id' if hasattr(existing, f'{key}_id') else key
            expected = value.id if hasattr(value, 'id') else value
            return getattr(existing, attr) != expected

        return any(differs(key, value) for key, value in game_defaults.items())

    def _upsert_game(self, game_id, game_defaults) -> str:
        """Create or update one game and queue it for CloudKit sync.

        The game write and its ``CloudKitSyncState`` write happen in one
        atomic block, so a failure cannot leave a saved game that was never
        queued for sync, and a database error cannot poison an enclosing
        transaction for the remaining games.

        Returns:
            The counter to increment: ``'games_new'``, ``'games_updated'``
            or ``'games_unchanged'``.
        """
        from django.db import transaction
        from core.models import Game
        from cloudkit.models import CloudKitSyncState

        with transaction.atomic():
            try:
                existing = Game.objects.get(id=game_id)
            except Game.DoesNotExist:
                # Create new game
                Game.objects.create(id=game_id, **game_defaults)
                CloudKitSyncState.objects.get_or_create(
                    record_type='Game',
                    record_id=game_id,
                    defaults={'sync_status': 'pending'},
                )
                return 'games_new'

            if not self._game_has_changed(existing, game_defaults):
                return 'games_unchanged'

            for key, value in game_defaults.items():
                setattr(existing, key, value)
            existing.save()
            # Mark for sync (resets an already-synced record to pending)
            CloudKitSyncState.objects.update_or_create(
                record_type='Game',
                record_id=game_id,
                defaults={'sync_status': 'pending'},
            )
            return 'games_updated'

    def _process_games(self, sport, games):
        """Process and upsert games.

        A game whose home or away team cannot be resolved, or whose
        processing raises, is logged and counted in ``games_errors``; the
        remaining games are still processed.

        Returns:
            Dict with ``games_found``, ``games_new``, ``games_updated``,
            ``games_unchanged`` and ``games_errors`` counters.
        """
        result = {
            'games_found': len(games),
            'games_new': 0,
            'games_updated': 0,
            'games_unchanged': 0,
            'games_errors': 0,
        }

        for game_data in games:
            game_id = None  # kept for the error log if reading the id fails
            try:
                game_id = game_data.id
                game_date = game_data.game_date
                # Alias validity is checked per calendar date
                check_date = game_date.date() if hasattr(game_date, 'date') else game_date

                # Get related objects - try by ID first, then by DB alias
                home_team = self._resolve_game_team(sport, game_data, 'home', check_date)
                away_team = self._resolve_game_team(sport, game_data, 'away', check_date)

                if not home_team or not away_team:
                    missing = []
                    if not home_team:
                        missing.append(f'home={game_data.home_team_id}')
                    if not away_team:
                        missing.append(f'away={game_data.away_team_id}')
                    self.log('warning', f'Team not found for game {game_id}: {", ".join(missing)}', source='adapter')
                    result['games_errors'] += 1
                    continue

                stadium = self._resolve_game_stadium(sport, game_data, check_date)

                # Build game dict
                game_defaults = {
                    'sport': sport,
                    'season': game_data.season,
                    'home_team': home_team,
                    'away_team': away_team,
                    'stadium': stadium,
                    'game_date': game_date,
                    'game_number': getattr(game_data, 'game_number', None),
                    'home_score': game_data.home_score,
                    'away_score': game_data.away_score,
                    'status': game_data.status,
                    'raw_home_team': getattr(game_data, 'raw_home_team', '') or '',
                    'raw_away_team': getattr(game_data, 'raw_away_team', '') or '',
                    'raw_stadium': getattr(game_data, 'raw_stadium', '') or '',
                    'source_url': getattr(game_data, 'source_url', '') or '',
                }

                result[self._upsert_game(game_id, game_defaults)] += 1
            except Exception as e:
                # Best-effort import: one bad game must not abort the batch.
                self.log('error', f'Error processing game {game_id}: {e}', source='adapter')
                result['games_errors'] += 1

        return result

    def _process_review_items(self, sport, review_items):
        """Create manual review items.

        Items are attached to the most recently created running ScrapeJob
        for this adapter's config (None if there is no such job).

        Returns:
            Number of ManualReviewItem rows created.
        """
        from scraper.models import ManualReviewItem, ScrapeJob

        # Get current job
        job = ScrapeJob.objects.filter(
            config=self.config,
            status='running',
        ).order_by('-created_at').first()

        count = 0
        for item in review_items:
            # Get suggested match info (parser uses suggested_matches list);
            # the first entry is taken as the best match.
            suggested_id = ''
            confidence = 0.0
            if item.suggested_matches:
                best_match = item.suggested_matches[0]
                suggested_id = best_match.canonical_id
                confidence = best_match.confidence / 100.0  # Convert to 0-1 range

            ManualReviewItem.objects.create(
                job=job,
                # Derive item_type from reason
                item_type=self._get_item_type_from_reason(item.reason),
                sport=sport,
                raw_value=item.raw_value,
                suggested_id=suggested_id,
                confidence=confidence,
                reason=self._map_reason(item.reason),
                source_url=item.source_url or '',
                check_date=item.game_date,
                context=item.context or None,
            )
            count += 1

        return count

    @staticmethod
    def _normalize_reason(reason) -> str:
        """Return a review reason as a lower-cased string.

        Objects exposing ``.value`` (such as the parser's ReviewReason enum)
        contribute that value; anything else is converted with ``str()``.
        """
        return str(getattr(reason, 'value', reason)).lower()

    def _get_item_type_from_reason(self, reason) -> str:
        """Derive item type (team/stadium) from ReviewReason enum.

        Returns 'stadium' only when the reason mentions a stadium and not a
        team; every other reason defaults to 'team'.
        """
        reason_value = self._normalize_reason(reason)
        if 'stadium' in reason_value and 'team' not in reason_value:
            return 'stadium'
        # Default to team for other reasons
        return 'team'

    def _map_reason(self, reason) -> str:
        """Map scraper ReviewReason to model choice.

        Unknown reasons map to 'no_match'.
        """
        return self._REASON_MAP.get(self._normalize_reason(reason), 'no_match')