Adds the full Django application layer on top of sportstime_parser: - core: Sport, Team, Stadium, Game models with aliases and league structure - scraper: orchestration engine, adapter, job management, Celery tasks - cloudkit: CloudKit sync client, sync state tracking, sync jobs - dashboard: staff dashboard for monitoring scrapers, sync, review queue - notifications: email reports for scrape/sync results - Docker setup for deployment (Dockerfile, docker-compose, entrypoint) Game exports now use game_datetime_utc (ISO 8601 UTC) instead of venue-local date+time strings, matching the canonical format used by the iOS app. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
497 lines
18 KiB
Python
497 lines
18 KiB
Python
"""
|
|
Adapter to bridge existing sportstime_parser scrapers with Django models.
|
|
"""
|
|
import hashlib
|
|
from datetime import datetime
|
|
from typing import Callable, Optional
|
|
|
|
from django.db import transaction
|
|
from django.utils import timezone
|
|
|
|
|
|
class ScraperAdapter:
    """
    Adapts the existing sportstime_parser scrapers to work with Django models.

    Runs one parser (sport + season), then upserts the resulting stadiums,
    teams and games into the core app and queues new/changed rows for
    CloudKit sync. Entry point is run().
    """

    def __init__(
        self,
        sport_code: str,
        season: int,
        config,
        log_func: Optional[Callable] = None,
    ):
        # Lowercase sport code selecting the parser factory (e.g. 'nba', 'mlb').
        self.sport_code = sport_code
        # Season year the scrape targets.
        self.season = season
        # Scrape configuration object; used to locate the running ScrapeJob
        # in _process_review_items. Assumed to be a ScrapeConfig model
        # instance — TODO confirm against callers.
        self.config = config
        # Structured logger with signature log(level, msg, **extra);
        # defaults to a no-op so logging is always safe to call.
        self.log = log_func or (lambda level, msg, **kw: None)
|
def run(self) -> dict:
|
|
"""
|
|
Run the scraper and return results.
|
|
"""
|
|
from core.models import Sport, Team, Stadium, Game
|
|
from scraper.models import ManualReviewItem
|
|
from cloudkit.models import CloudKitSyncState
|
|
|
|
result = {
|
|
'games_found': 0,
|
|
'games_new': 0,
|
|
'games_updated': 0,
|
|
'games_unchanged': 0,
|
|
'games_errors': 0,
|
|
'teams_found': 0,
|
|
'stadiums_found': 0,
|
|
'review_items': 0,
|
|
}
|
|
|
|
# Get sport
|
|
try:
|
|
sport = Sport.objects.get(code=self.sport_code)
|
|
except Sport.DoesNotExist:
|
|
raise ValueError(f"Sport {self.sport_code} not found in database")
|
|
|
|
self.log('info', f'Starting scraper for {sport.short_name} {self.season}', source='adapter')
|
|
|
|
# Import and create the appropriate scraper
|
|
scraper = self._create_scraper()
|
|
|
|
# Run the scrape
|
|
self.log('info', 'Scraping games...', source='adapter')
|
|
raw_result = scraper.scrape_all()
|
|
|
|
# Process stadiums first (teams reference stadiums via home_stadium FK)
|
|
self.log('info', f'Processing {len(raw_result.stadiums)} stadiums...', source='adapter')
|
|
result['stadiums_found'] = len(raw_result.stadiums)
|
|
self._process_stadiums(sport, raw_result.stadiums)
|
|
|
|
# Process teams
|
|
self.log('info', f'Processing {len(raw_result.teams)} teams...', source='adapter')
|
|
result['teams_found'] = len(raw_result.teams)
|
|
self._process_teams(sport, raw_result.teams)
|
|
|
|
# Process games
|
|
self.log('info', f'Processing {len(raw_result.games)} games...', source='adapter')
|
|
game_result = self._process_games(sport, raw_result.games)
|
|
result.update(game_result)
|
|
|
|
# Process review items
|
|
if raw_result.review_items:
|
|
self.log('info', f'Creating {len(raw_result.review_items)} review items...', source='adapter')
|
|
result['review_items'] = self._process_review_items(sport, raw_result.review_items)
|
|
|
|
self.log('info', f'Scrape complete: {result}', source='adapter')
|
|
return result
|
|
|
|
def _create_scraper(self):
|
|
"""Create the appropriate scraper instance."""
|
|
# Import from existing sportstime_parser
|
|
from sportstime_parser.scrapers import (
|
|
create_nba_scraper,
|
|
create_mlb_scraper,
|
|
create_nfl_scraper,
|
|
create_nhl_scraper,
|
|
create_mls_scraper,
|
|
create_wnba_scraper,
|
|
create_nwsl_scraper,
|
|
)
|
|
|
|
scrapers = {
|
|
'nba': create_nba_scraper,
|
|
'mlb': create_mlb_scraper,
|
|
'nfl': create_nfl_scraper,
|
|
'nhl': create_nhl_scraper,
|
|
'mls': create_mls_scraper,
|
|
'wnba': create_wnba_scraper,
|
|
'nwsl': create_nwsl_scraper,
|
|
}
|
|
|
|
creator = scrapers.get(self.sport_code)
|
|
if not creator:
|
|
raise ValueError(f"No scraper for sport: {self.sport_code}")
|
|
|
|
# Create scraper (config overrides handled via session/resolver settings if needed)
|
|
return creator(season=self.season)
|
|
|
|
    def _process_teams(self, sport, teams):
        """Process and upsert teams.

        For each parser team record: resolve its division (exact name match,
        then partial), resolve its home stadium by id, then upsert the Team
        row keyed on the canonical team id. Newly created teams are queued
        for CloudKit sync.
        """
        from core.models import Team, Stadium, Division, Conference
        from cloudkit.models import CloudKitSyncState

        for team_data in teams:
            team_id = team_data.id

            # Find division if available
            division = None
            if team_data.division:
                division = Division.objects.filter(
                    conference__sport=sport,
                    name__iexact=team_data.division
                ).first()
                # Fallback to partial match
                if not division:
                    division = Division.objects.filter(
                        conference__sport=sport,
                        name__icontains=team_data.division
                    ).first()

            # Resolve home stadium if available. Stadiums are processed before
            # teams in run(), so the row should normally exist; .first()
            # tolerates a missing one by leaving home_stadium None.
            home_stadium = None
            stadium_id = getattr(team_data, 'stadium_id', None)
            if stadium_id:
                home_stadium = Stadium.objects.filter(id=stadium_id).first()

            # Upsert keyed on canonical id; optional cosmetic attributes
            # default to '' when the parser omits them.
            team, created = Team.objects.update_or_create(
                id=team_id,
                defaults={
                    'sport': sport,
                    'division': division,
                    'city': team_data.city,
                    'name': team_data.name,
                    'full_name': team_data.full_name,
                    'abbreviation': team_data.abbreviation,
                    'home_stadium': home_stadium,
                    'primary_color': getattr(team_data, 'primary_color', '') or '',
                    'secondary_color': getattr(team_data, 'secondary_color', '') or '',
                    'logo_url': getattr(team_data, 'logo_url', '') or '',
                }
            )

            # Mark for sync
            # NOTE(review): only newly created teams are queued for CloudKit
            # sync; field updates to existing teams are NOT re-queued, unlike
            # games (which re-queue on update) — confirm this is intentional.
            if created:
                CloudKitSyncState.objects.get_or_create(
                    record_type='Team',
                    record_id=team_id,
                    defaults={'sync_status': 'pending'}
                )
|
    def _process_stadiums(self, sport, stadiums):
        """Process and upsert stadiums.

        Upserts each parser stadium record keyed on its canonical id; newly
        created stadiums are queued for CloudKit sync.
        """
        from core.models import Stadium
        from cloudkit.models import CloudKitSyncState

        for stadium_data in stadiums:
            stadium_id = stadium_data.id

            # Upsert keyed on canonical id. Optional attributes fall back to
            # '' (strings) or None (numerics) when the parser omits them;
            # country defaults to 'USA'.
            stadium, created = Stadium.objects.update_or_create(
                id=stadium_id,
                defaults={
                    'sport': sport,
                    'name': stadium_data.name,
                    'city': stadium_data.city,
                    'state': getattr(stadium_data, 'state', '') or '',
                    'country': getattr(stadium_data, 'country', 'USA'),
                    'latitude': getattr(stadium_data, 'latitude', None),
                    'longitude': getattr(stadium_data, 'longitude', None),
                    'capacity': getattr(stadium_data, 'capacity', None),
                    'surface': getattr(stadium_data, 'surface', '') or '',
                    'roof_type': getattr(stadium_data, 'roof_type', '') or '',
                    'opened_year': getattr(stadium_data, 'opened_year', None),
                    'timezone': getattr(stadium_data, 'timezone', '') or '',
                    'image_url': getattr(stadium_data, 'image_url', '') or '',
                }
            )

            # NOTE(review): as with teams, only newly created stadiums are
            # queued for sync; updates to existing rows are not re-queued.
            if created:
                CloudKitSyncState.objects.get_or_create(
                    record_type='Stadium',
                    record_id=stadium_id,
                    defaults={'sync_status': 'pending'}
                )
|
def _resolve_team_via_db_alias(self, sport, raw_name, check_date=None):
|
|
"""Try to resolve a team name using database aliases.
|
|
|
|
Args:
|
|
sport: Sport model instance
|
|
raw_name: Raw team name from scraper
|
|
check_date: Date for alias validity check
|
|
|
|
Returns:
|
|
Team instance if found, None otherwise
|
|
"""
|
|
from core.models import Team, TeamAlias
|
|
from datetime import date
|
|
|
|
if not raw_name:
|
|
return None
|
|
|
|
check_date = check_date or date.today()
|
|
|
|
# Check TeamAlias model
|
|
aliases = TeamAlias.objects.filter(
|
|
alias__iexact=raw_name.strip(),
|
|
team__sport=sport,
|
|
).select_related('team')
|
|
|
|
for alias in aliases:
|
|
if alias.is_valid_for_date(check_date):
|
|
return alias.team
|
|
|
|
# Also try partial matching on team full_name and city
|
|
team = Team.objects.filter(
|
|
sport=sport,
|
|
full_name__iexact=raw_name.strip()
|
|
).first()
|
|
if team:
|
|
return team
|
|
|
|
team = Team.objects.filter(
|
|
sport=sport,
|
|
city__iexact=raw_name.strip()
|
|
).first()
|
|
if team:
|
|
return team
|
|
|
|
return None
|
|
|
|
def _resolve_stadium_via_db_alias(self, sport, raw_name, check_date=None):
|
|
"""Try to resolve a stadium name using database aliases.
|
|
|
|
Args:
|
|
sport: Sport model instance
|
|
raw_name: Raw stadium name from scraper
|
|
check_date: Date for alias validity check
|
|
|
|
Returns:
|
|
Stadium instance if found, None otherwise
|
|
"""
|
|
from core.models import Stadium, StadiumAlias
|
|
from datetime import date
|
|
|
|
if not raw_name:
|
|
return None
|
|
|
|
check_date = check_date or date.today()
|
|
|
|
# Check StadiumAlias model
|
|
aliases = StadiumAlias.objects.filter(
|
|
alias__iexact=raw_name.strip(),
|
|
stadium__sport=sport,
|
|
).select_related('stadium')
|
|
|
|
for alias in aliases:
|
|
if alias.is_valid_for_date(check_date):
|
|
return alias.stadium
|
|
|
|
# Also try direct matching on stadium name
|
|
stadium = Stadium.objects.filter(
|
|
sport=sport,
|
|
name__iexact=raw_name.strip()
|
|
).first()
|
|
if stadium:
|
|
return stadium
|
|
|
|
return None
|
|
|
|
def _process_games(self, sport, games):
|
|
"""Process and upsert games."""
|
|
from core.models import Game, Team, Stadium
|
|
from cloudkit.models import CloudKitSyncState
|
|
|
|
result = {
|
|
'games_found': len(games),
|
|
'games_new': 0,
|
|
'games_updated': 0,
|
|
'games_unchanged': 0,
|
|
'games_errors': 0,
|
|
}
|
|
|
|
for game_data in games:
|
|
try:
|
|
game_id = game_data.id
|
|
check_date = game_data.game_date.date() if hasattr(game_data.game_date, 'date') else game_data.game_date
|
|
|
|
# Get related objects - try by ID first, then by DB alias
|
|
home_team = None
|
|
away_team = None
|
|
|
|
try:
|
|
home_team = Team.objects.get(id=game_data.home_team_id)
|
|
except Team.DoesNotExist:
|
|
# Try resolving via database alias using raw name
|
|
raw_home = getattr(game_data, 'raw_home_team', None)
|
|
if raw_home:
|
|
home_team = self._resolve_team_via_db_alias(sport, raw_home, check_date)
|
|
if home_team:
|
|
self.log('info', f'Resolved home team via DB alias: {raw_home} -> {home_team.abbreviation}', source='adapter')
|
|
|
|
try:
|
|
away_team = Team.objects.get(id=game_data.away_team_id)
|
|
except Team.DoesNotExist:
|
|
# Try resolving via database alias using raw name
|
|
raw_away = getattr(game_data, 'raw_away_team', None)
|
|
if raw_away:
|
|
away_team = self._resolve_team_via_db_alias(sport, raw_away, check_date)
|
|
if away_team:
|
|
self.log('info', f'Resolved away team via DB alias: {raw_away} -> {away_team.abbreviation}', source='adapter')
|
|
|
|
if not home_team or not away_team:
|
|
missing = []
|
|
if not home_team:
|
|
missing.append(f'home={game_data.home_team_id}')
|
|
if not away_team:
|
|
missing.append(f'away={game_data.away_team_id}')
|
|
self.log('warning', f'Team not found for game {game_id}: {", ".join(missing)}', source='adapter')
|
|
result['games_errors'] += 1
|
|
continue
|
|
|
|
stadium = None
|
|
if game_data.stadium_id:
|
|
try:
|
|
stadium = Stadium.objects.get(id=game_data.stadium_id)
|
|
except Stadium.DoesNotExist:
|
|
# Try resolving via database alias using raw name
|
|
raw_stadium = getattr(game_data, 'raw_stadium', None)
|
|
if raw_stadium:
|
|
stadium = self._resolve_stadium_via_db_alias(sport, raw_stadium, check_date)
|
|
if stadium:
|
|
self.log('info', f'Resolved stadium via DB alias: {raw_stadium} -> {stadium.name}', source='adapter')
|
|
|
|
# Build game dict
|
|
game_defaults = {
|
|
'sport': sport,
|
|
'season': game_data.season,
|
|
'home_team': home_team,
|
|
'away_team': away_team,
|
|
'stadium': stadium,
|
|
'game_date': game_data.game_date,
|
|
'game_number': getattr(game_data, 'game_number', None),
|
|
'home_score': game_data.home_score,
|
|
'away_score': game_data.away_score,
|
|
'status': game_data.status,
|
|
'raw_home_team': getattr(game_data, 'raw_home_team', '') or '',
|
|
'raw_away_team': getattr(game_data, 'raw_away_team', '') or '',
|
|
'raw_stadium': getattr(game_data, 'raw_stadium', '') or '',
|
|
'source_url': getattr(game_data, 'source_url', '') or '',
|
|
}
|
|
|
|
# Check if game exists
|
|
try:
|
|
existing = Game.objects.get(id=game_id)
|
|
# Check if changed
|
|
changed = False
|
|
for key, value in game_defaults.items():
|
|
if getattr(existing, key if not hasattr(existing, f'{key}_id') else f'{key}_id') != (value.id if hasattr(value, 'id') else value):
|
|
changed = True
|
|
break
|
|
|
|
if changed:
|
|
for key, value in game_defaults.items():
|
|
setattr(existing, key, value)
|
|
existing.save()
|
|
result['games_updated'] += 1
|
|
|
|
# Mark for sync
|
|
CloudKitSyncState.objects.update_or_create(
|
|
record_type='Game',
|
|
record_id=game_id,
|
|
defaults={'sync_status': 'pending'}
|
|
)
|
|
else:
|
|
result['games_unchanged'] += 1
|
|
|
|
except Game.DoesNotExist:
|
|
# Create new game
|
|
Game.objects.create(id=game_id, **game_defaults)
|
|
result['games_new'] += 1
|
|
|
|
# Mark for sync
|
|
CloudKitSyncState.objects.get_or_create(
|
|
record_type='Game',
|
|
record_id=game_id,
|
|
defaults={'sync_status': 'pending'}
|
|
)
|
|
|
|
except Exception as e:
|
|
self.log('error', f'Error processing game: {e}', source='adapter')
|
|
result['games_errors'] += 1
|
|
|
|
return result
|
|
|
|
def _process_review_items(self, sport, review_items):
|
|
"""Create manual review items."""
|
|
from scraper.models import ManualReviewItem, ScrapeJob
|
|
from sportstime_parser.models.aliases import ReviewReason
|
|
|
|
# Get current job
|
|
job = ScrapeJob.objects.filter(
|
|
config=self.config,
|
|
status='running'
|
|
).order_by('-created_at').first()
|
|
|
|
count = 0
|
|
for item in review_items:
|
|
# Derive item_type from reason
|
|
item_type = self._get_item_type_from_reason(item.reason)
|
|
|
|
# Get suggested match info (parser uses suggested_matches list)
|
|
suggested_id = ''
|
|
confidence = 0.0
|
|
if item.suggested_matches:
|
|
best_match = item.suggested_matches[0]
|
|
suggested_id = best_match.canonical_id
|
|
confidence = best_match.confidence / 100.0 # Convert to 0-1 range
|
|
|
|
ManualReviewItem.objects.create(
|
|
job=job,
|
|
item_type=item_type,
|
|
sport=sport,
|
|
raw_value=item.raw_value,
|
|
suggested_id=suggested_id,
|
|
confidence=confidence,
|
|
reason=self._map_reason(item.reason),
|
|
source_url=item.source_url or '',
|
|
check_date=item.game_date,
|
|
context=item.context if item.context else None,
|
|
)
|
|
count += 1
|
|
|
|
return count
|
|
|
|
def _get_item_type_from_reason(self, reason) -> str:
|
|
"""Derive item type (team/stadium) from ReviewReason enum."""
|
|
from sportstime_parser.models.aliases import ReviewReason
|
|
|
|
# Map reason to item type
|
|
if isinstance(reason, ReviewReason):
|
|
reason_value = reason.value
|
|
else:
|
|
reason_value = str(reason).lower()
|
|
|
|
if 'team' in reason_value:
|
|
return 'team'
|
|
elif 'stadium' in reason_value:
|
|
return 'stadium'
|
|
else:
|
|
# Default to team for other reasons
|
|
return 'team'
|
|
|
|
def _map_reason(self, reason) -> str:
|
|
"""Map scraper ReviewReason to model choice."""
|
|
from sportstime_parser.models.aliases import ReviewReason
|
|
|
|
# Handle ReviewReason enum
|
|
if isinstance(reason, ReviewReason):
|
|
reason_value = reason.value
|
|
else:
|
|
reason_value = str(reason).lower()
|
|
|
|
reason_map = {
|
|
'unresolved_team': 'no_match',
|
|
'unresolved_stadium': 'no_match',
|
|
'low_confidence_match': 'low_confidence',
|
|
'missing_data': 'no_match',
|
|
'duplicate_game': 'ambiguous',
|
|
'timezone_unknown': 'no_match',
|
|
'geographic_filter': 'no_match',
|
|
# Legacy mappings
|
|
'no_match': 'no_match',
|
|
'no match found': 'no_match',
|
|
'low_confidence': 'low_confidence',
|
|
'fuzzy match below threshold': 'low_confidence',
|
|
'ambiguous': 'ambiguous',
|
|
'new_entity': 'new_entity',
|
|
}
|
|
return reason_map.get(reason_value.lower(), 'no_match')
|