""" Adapter to bridge existing sportstime_parser scrapers with Django models. """ import hashlib from datetime import datetime from typing import Callable, Optional from django.db import transaction from django.utils import timezone class ScraperAdapter: """ Adapts the existing sportstime_parser scrapers to work with Django models. """ def __init__( self, sport_code: str, season: int, config, log_func: Optional[Callable] = None, ): self.sport_code = sport_code self.season = season self.config = config self.log = log_func or (lambda level, msg, **kw: None) def run(self) -> dict: """ Run the scraper and return results. """ from core.models import Sport, Team, Stadium, Game from scraper.models import ManualReviewItem from cloudkit.models import CloudKitSyncState result = { 'games_found': 0, 'games_new': 0, 'games_updated': 0, 'games_unchanged': 0, 'games_errors': 0, 'teams_found': 0, 'stadiums_found': 0, 'review_items': 0, } # Get sport try: sport = Sport.objects.get(code=self.sport_code) except Sport.DoesNotExist: raise ValueError(f"Sport {self.sport_code} not found in database") self.log('info', f'Starting scraper for {sport.short_name} {self.season}', source='adapter') # Import and create the appropriate scraper scraper = self._create_scraper() # Run the scrape self.log('info', 'Scraping games...', source='adapter') raw_result = scraper.scrape_all() # Process stadiums first (teams reference stadiums via home_stadium FK) self.log('info', f'Processing {len(raw_result.stadiums)} stadiums...', source='adapter') result['stadiums_found'] = len(raw_result.stadiums) self._process_stadiums(sport, raw_result.stadiums) # Process teams self.log('info', f'Processing {len(raw_result.teams)} teams...', source='adapter') result['teams_found'] = len(raw_result.teams) self._process_teams(sport, raw_result.teams) # Process games self.log('info', f'Processing {len(raw_result.games)} games...', source='adapter') game_result = self._process_games(sport, raw_result.games) result.update(game_result) # Process review items if raw_result.review_items: self.log('info', f'Creating {len(raw_result.review_items)} review items...', source='adapter') result['review_items'] = self._process_review_items(sport, raw_result.review_items) self.log('info', f'Scrape complete: {result}', source='adapter') return result def _create_scraper(self): """Create the appropriate scraper instance.""" # Import from existing sportstime_parser from sportstime_parser.scrapers import ( create_nba_scraper, create_mlb_scraper, create_nfl_scraper, create_nhl_scraper, create_mls_scraper, create_wnba_scraper, create_nwsl_scraper, ) scrapers = { 'nba': create_nba_scraper, 'mlb': create_mlb_scraper, 'nfl': create_nfl_scraper, 'nhl': create_nhl_scraper, 'mls': create_mls_scraper, 'wnba': create_wnba_scraper, 'nwsl': create_nwsl_scraper, } creator = scrapers.get(self.sport_code) if not creator: raise ValueError(f"No scraper for sport: {self.sport_code}") # Create scraper (config overrides handled via session/resolver settings if needed) return creator(season=self.season) def _process_teams(self, sport, teams): """Process and upsert teams.""" from core.models import Team, Stadium, Division, Conference from cloudkit.models import CloudKitSyncState for team_data in teams: team_id = team_data.id # Find division if available division = None if team_data.division: division = Division.objects.filter( conference__sport=sport, name__iexact=team_data.division ).first() # Fallback to partial match if not division: division = Division.objects.filter( conference__sport=sport, name__icontains=team_data.division ).first() # Resolve home stadium if available home_stadium = None stadium_id = getattr(team_data, 'stadium_id', None) if stadium_id: home_stadium = Stadium.objects.filter(id=stadium_id).first() team, created = Team.objects.update_or_create( id=team_id, defaults={ 'sport': sport, 'division': division, 'city': team_data.city, 'name': team_data.name, 'full_name': team_data.full_name, 'abbreviation': team_data.abbreviation, 'home_stadium': home_stadium, 'primary_color': getattr(team_data, 'primary_color', '') or '', 'secondary_color': getattr(team_data, 'secondary_color', '') or '', 'logo_url': getattr(team_data, 'logo_url', '') or '', } ) # Mark for sync if created: CloudKitSyncState.objects.get_or_create( record_type='Team', record_id=team_id, defaults={'sync_status': 'pending'} ) def _process_stadiums(self, sport, stadiums): """Process and upsert stadiums.""" from core.models import Stadium from cloudkit.models import CloudKitSyncState for stadium_data in stadiums: stadium_id = stadium_data.id stadium, created = Stadium.objects.update_or_create( id=stadium_id, defaults={ 'sport': sport, 'name': stadium_data.name, 'city': stadium_data.city, 'state': getattr(stadium_data, 'state', '') or '', 'country': getattr(stadium_data, 'country', 'USA'), 'latitude': getattr(stadium_data, 'latitude', None), 'longitude': getattr(stadium_data, 'longitude', None), 'capacity': getattr(stadium_data, 'capacity', None), 'surface': getattr(stadium_data, 'surface', '') or '', 'roof_type': getattr(stadium_data, 'roof_type', '') or '', 'opened_year': getattr(stadium_data, 'opened_year', None), 'timezone': getattr(stadium_data, 'timezone', '') or '', 'image_url': getattr(stadium_data, 'image_url', '') or '', } ) if created: CloudKitSyncState.objects.get_or_create( record_type='Stadium', record_id=stadium_id, defaults={'sync_status': 'pending'} ) def _resolve_team_via_db_alias(self, sport, raw_name, check_date=None): """Try to resolve a team name using database aliases. Args: sport: Sport model instance raw_name: Raw team name from scraper check_date: Date for alias validity check Returns: Team instance if found, None otherwise """ from core.models import Team, TeamAlias from datetime import date if not raw_name: return None check_date = check_date or date.today() # Check TeamAlias model aliases = TeamAlias.objects.filter( alias__iexact=raw_name.strip(), team__sport=sport, ).select_related('team') for alias in aliases: if alias.is_valid_for_date(check_date): return alias.team # Also try partial matching on team full_name and city team = Team.objects.filter( sport=sport, full_name__iexact=raw_name.strip() ).first() if team: return team team = Team.objects.filter( sport=sport, city__iexact=raw_name.strip() ).first() if team: return team return None def _resolve_stadium_via_db_alias(self, sport, raw_name, check_date=None): """Try to resolve a stadium name using database aliases. Args: sport: Sport model instance raw_name: Raw stadium name from scraper check_date: Date for alias validity check Returns: Stadium instance if found, None otherwise """ from core.models import Stadium, StadiumAlias from datetime import date if not raw_name: return None check_date = check_date or date.today() # Check StadiumAlias model aliases = StadiumAlias.objects.filter( alias__iexact=raw_name.strip(), stadium__sport=sport, ).select_related('stadium') for alias in aliases: if alias.is_valid_for_date(check_date): return alias.stadium # Also try direct matching on stadium name stadium = Stadium.objects.filter( sport=sport, name__iexact=raw_name.strip() ).first() if stadium: return stadium return None def _process_games(self, sport, games): """Process and upsert games.""" from core.models import Game, Team, Stadium from cloudkit.models import CloudKitSyncState result = { 'games_found': len(games), 'games_new': 0, 'games_updated': 0, 'games_unchanged': 0, 'games_errors': 0, } for game_data in games: try: game_id = game_data.id check_date = game_data.game_date.date() if hasattr(game_data.game_date, 'date') else game_data.game_date # Get related objects - try by ID first, then by DB alias home_team = None away_team = None try: home_team = Team.objects.get(id=game_data.home_team_id) except Team.DoesNotExist: # Try resolving via database alias using raw name raw_home = getattr(game_data, 'raw_home_team', None) if raw_home: home_team = self._resolve_team_via_db_alias(sport, raw_home, check_date) if home_team: self.log('info', f'Resolved home team via DB alias: {raw_home} -> {home_team.abbreviation}', source='adapter') try: away_team = Team.objects.get(id=game_data.away_team_id) except Team.DoesNotExist: # Try resolving via database alias using raw name raw_away = getattr(game_data, 'raw_away_team', None) if raw_away: away_team = self._resolve_team_via_db_alias(sport, raw_away, check_date) if away_team: self.log('info', f'Resolved away team via DB alias: {raw_away} -> {away_team.abbreviation}', source='adapter') if not home_team or not away_team: missing = [] if not home_team: missing.append(f'home={game_data.home_team_id}') if not away_team: missing.append(f'away={game_data.away_team_id}') self.log('warning', f'Team not found for game {game_id}: {", ".join(missing)}', source='adapter') result['games_errors'] += 1 continue stadium = None if game_data.stadium_id: try: stadium = Stadium.objects.get(id=game_data.stadium_id) except Stadium.DoesNotExist: # Try resolving via database alias using raw name raw_stadium = getattr(game_data, 'raw_stadium', None) if raw_stadium: stadium = self._resolve_stadium_via_db_alias(sport, raw_stadium, check_date) if stadium: self.log('info', f'Resolved stadium via DB alias: {raw_stadium} -> {stadium.name}', source='adapter') # Build game dict game_defaults = { 'sport': sport, 'season': game_data.season, 'home_team': home_team, 'away_team': away_team, 'stadium': stadium, 'game_date': game_data.game_date, 'game_number': getattr(game_data, 'game_number', None), 'home_score': game_data.home_score, 'away_score': game_data.away_score, 'status': game_data.status, 'raw_home_team': getattr(game_data, 'raw_home_team', '') or '', 'raw_away_team': getattr(game_data, 'raw_away_team', '') or '', 'raw_stadium': getattr(game_data, 'raw_stadium', '') or '', 'source_url': getattr(game_data, 'source_url', '') or '', } # Check if game exists try: existing = Game.objects.get(id=game_id) # Check if changed changed = False for key, value in game_defaults.items(): if getattr(existing, key if not hasattr(existing, f'{key}_id') else f'{key}_id') != (value.id if hasattr(value, 'id') else value): changed = True break if changed: for key, value in game_defaults.items(): setattr(existing, key, value) existing.save() result['games_updated'] += 1 # Mark for sync CloudKitSyncState.objects.update_or_create( record_type='Game', record_id=game_id, defaults={'sync_status': 'pending'} ) else: result['games_unchanged'] += 1 except Game.DoesNotExist: # Create new game Game.objects.create(id=game_id, **game_defaults) result['games_new'] += 1 # Mark for sync CloudKitSyncState.objects.get_or_create( record_type='Game', record_id=game_id, defaults={'sync_status': 'pending'} ) except Exception as e: self.log('error', f'Error processing game: {e}', source='adapter') result['games_errors'] += 1 return result def _process_review_items(self, sport, review_items): """Create manual review items.""" from scraper.models import ManualReviewItem, ScrapeJob from sportstime_parser.models.aliases import ReviewReason # Get current job job = ScrapeJob.objects.filter( config=self.config, status='running' ).order_by('-created_at').first() count = 0 for item in review_items: # Derive item_type from reason item_type = self._get_item_type_from_reason(item.reason) # Get suggested match info (parser uses suggested_matches list) suggested_id = '' confidence = 0.0 if item.suggested_matches: best_match = item.suggested_matches[0] suggested_id = best_match.canonical_id confidence = best_match.confidence / 100.0 # Convert to 0-1 range ManualReviewItem.objects.create( job=job, item_type=item_type, sport=sport, raw_value=item.raw_value, suggested_id=suggested_id, confidence=confidence, reason=self._map_reason(item.reason), source_url=item.source_url or '', check_date=item.game_date, context=item.context if item.context else None, ) count += 1 return count def _get_item_type_from_reason(self, reason) -> str: """Derive item type (team/stadium) from ReviewReason enum.""" from sportstime_parser.models.aliases import ReviewReason # Map reason to item type if isinstance(reason, ReviewReason): reason_value = reason.value else: reason_value = str(reason).lower() if 'team' in reason_value: return 'team' elif 'stadium' in reason_value: return 'stadium' else: # Default to team for other reasons return 'team' def _map_reason(self, reason) -> str: """Map scraper ReviewReason to model choice.""" from sportstime_parser.models.aliases import ReviewReason # Handle ReviewReason enum if isinstance(reason, ReviewReason): reason_value = reason.value else: reason_value = str(reason).lower() reason_map = { 'unresolved_team': 'no_match', 'unresolved_stadium': 'no_match', 'low_confidence_match': 'low_confidence', 'missing_data': 'no_match', 'duplicate_game': 'ambiguous', 'timezone_unknown': 'no_match', 'geographic_filter': 'no_match', # Legacy mappings 'no_match': 'no_match', 'no match found': 'no_match', 'low_confidence': 'low_confidence', 'fuzzy match below threshold': 'low_confidence', 'ambiguous': 'ambiguous', 'new_entity': 'new_entity', } return reason_map.get(reason_value.lower(), 'no_match')