Initial commit: SportsTime trip planning app
- Three-scenario planning engine (A: date range, B: selected games, C: directional routes) - GeographicRouteExplorer with anchor game support for route exploration - Shared ItineraryBuilder for travel segment calculation - TravelEstimator for driving time/distance estimation - SwiftUI views for trip creation and detail display - CloudKit integration for schedule data - Python scraping scripts for sports schedules 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
590
Scripts/validate_data.py
Normal file
590
Scripts/validate_data.py
Normal file
@@ -0,0 +1,590 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Cross-Validation System for SportsTime App
|
||||
Compares scraped data from multiple sources and flags discrepancies.
|
||||
|
||||
Usage:
|
||||
python validate_data.py --data-dir ./data
|
||||
python validate_data.py --scrape-and-validate --season 2025
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, asdict, field
|
||||
from typing import Optional
|
||||
from collections import defaultdict
|
||||
|
||||
# Import scrapers from main script
|
||||
from scrape_schedules import (
|
||||
Game, Stadium,
|
||||
scrape_nba_basketball_reference,
|
||||
scrape_mlb_statsapi, scrape_mlb_baseball_reference,
|
||||
scrape_nhl_hockey_reference,
|
||||
NBA_TEAMS, MLB_TEAMS, NHL_TEAMS,
|
||||
assign_stable_ids,
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# VALIDATION DATA CLASSES
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class Discrepancy:
    """Represents a discrepancy between sources."""
    # Match key produced by generate_game_key (date + sorted team abbrevs).
    game_key: str
    field: str  # 'date', 'time', 'venue', 'teams', 'missing'
    # Human-readable names of the two data sources being compared.
    source1: str
    source2: str
    # Conflicting values as reported by each source
    # ('NOT FOUND' marks a game absent from one source).
    value1: str
    value2: str
    severity: str  # 'high', 'medium', 'low'
|
||||
|
||||
|
||||
@dataclass
class ValidationReport:
    """Summary of validation results."""
    sport: str
    season: str
    sources: list
    total_games_source1: int = 0
    total_games_source2: int = 0
    games_matched: int = 0
    games_missing_source1: int = 0
    games_missing_source2: int = 0
    discrepancies: list = field(default_factory=list)

    def to_dict(self):
        """Serialize the report, including nested discrepancies, to a plain dict."""
        payload = {
            attr: getattr(self, attr)
            for attr in (
                'sport', 'season', 'sources',
                'total_games_source1', 'total_games_source2',
                'games_matched',
                'games_missing_source1', 'games_missing_source2',
            )
        }
        payload['discrepancies'] = [asdict(d) for d in self.discrepancies]
        payload['discrepancy_summary'] = self.get_summary()
        return payload

    def get_summary(self):
        """Tally discrepancies by field name and by severity level."""
        by_field = {}
        by_severity = {}
        for d in self.discrepancies:
            by_field[d.field] = by_field.get(d.field, 0) + 1
            by_severity[d.severity] = by_severity.get(d.severity, 0) + 1
        return {
            'by_field': by_field,
            'by_severity': by_severity,
        }
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# GAME KEY GENERATION
|
||||
# =============================================================================
|
||||
|
||||
def normalize_abbrev(abbrev: str, sport: str) -> str:
    """Normalize team abbreviations across different sources."""
    canonical = abbrev.upper().strip()

    # Per-league alias tables: every known variant (including the canonical
    # form itself) maps to the canonical abbreviation. Unknown sports and
    # unknown abbreviations pass through unchanged.
    alias_tables = {
        'MLB': {
            'AZ': 'ARI', 'ARI': 'ARI',   # Arizona
            'ATH': 'OAK', 'OAK': 'OAK',  # Oakland/Athletics
            'CWS': 'CHW', 'CHW': 'CHW',  # Chicago White Sox
            'KC': 'KCR', 'KCR': 'KCR',   # Kansas City
            'SD': 'SDP', 'SDP': 'SDP',   # San Diego
            'SF': 'SFG', 'SFG': 'SFG',   # San Francisco
            'TB': 'TBR', 'TBR': 'TBR',   # Tampa Bay
            'WSH': 'WSN', 'WSN': 'WSN',  # Washington
        },
        'NBA': {
            'PHX': 'PHO', 'PHO': 'PHO',  # Phoenix
            'BKN': 'BRK', 'BRK': 'BRK',  # Brooklyn
            'CHA': 'CHO', 'CHO': 'CHO',  # Charlotte
            'NOP': 'NOP', 'NO': 'NOP',   # New Orleans
        },
        'NHL': {
            'ARI': 'UTA', 'UTA': 'UTA',  # Arizona moved to Utah
            'VGS': 'VGK', 'VGK': 'VGK',  # Vegas
        },
    }

    table = alias_tables.get(sport, {})
    return table.get(canonical, canonical)
|
||||
|
||||
|
||||
def generate_game_key(game: Game) -> str:
    """
    Generate a unique key for matching games across sources.

    Uses date + normalized team abbreviations (sorted) to match, so the
    same fixture produces the same key regardless of which source (and
    which abbreviation variant) it came from.
    """
    pair = sorted(
        normalize_abbrev(abbr, game.sport)
        for abbr in (game.home_team_abbrev, game.away_team_abbrev)
    )
    return f"{game.date}_{pair[0]}_{pair[1]}"
|
||||
|
||||
|
||||
def normalize_team_name(name: str, sport: str) -> str:
    """Normalize team name variations."""
    league_tables = {'NBA': NBA_TEAMS, 'MLB': MLB_TEAMS, 'NHL': NHL_TEAMS}
    known_teams = league_tables.get(sport, {})

    candidate = name.lower().strip()

    # Walk the league's teams in table order; the first team that matches
    # on full name, city, or substring wins.
    for abbrev, info in known_teams.items():
        full_name = info['name'].lower()
        if candidate == full_name:
            return abbrev
        if candidate == info['city'].lower():
            return abbrev
        # Substring match in either direction (nicknames, short forms).
        if candidate in full_name or full_name in candidate:
            return abbrev

    # Fallback: best-effort pseudo-abbreviation from the raw input.
    return name[:3].upper()
|
||||
|
||||
|
||||
def normalize_venue(venue: str) -> str:
    """Normalize venue name for comparison."""
    result = venue.lower().strip()

    # Ordered substring rewrites: strip filler words and generic venue
    # suffixes, then canonicalize known sponsor names. Order matters:
    # '.com' is removed before 'crypto' is expanded back to 'crypto.com',
    # so both 'crypto arena' and 'crypto.com arena' normalize identically.
    # NOTE(review): these are plain substring replaces, not word-boundary
    # aware (e.g. 'great ' loses its 'at '); harmless for matching since
    # both comparison sides go through the same pipeline.
    rewrites = (
        ('at ', ''),
        ('the ', ''),
        (' stadium', ''),
        (' arena', ''),
        (' center', ''),
        (' field', ''),
        (' park', ''),
        ('.com', ''),
        ('crypto', 'crypto.com'),
    )
    for pattern, replacement in rewrites:
        result = result.replace(pattern, replacement)

    return result.strip()
|
||||
|
||||
|
||||
def normalize_time(time_str: Optional[str]) -> Optional[str]:
    """Normalize a time string to 24-hour 'HH:MM'.

    Handles 12-hour forms ('7:05 PM', '7pm') and 24-hour / numeric forms
    ('19:05', '9:5'). Returns None for empty input; if no format matches,
    returns the lowercased, stripped original so callers can still compare
    raw values (e.g. 'tbd').
    """
    if not time_str:
        return None

    time_str = time_str.strip().lower()

    # 12-hour format with an am/pm marker. Spaces are stripped first, so a
    # single '%I:%M%p' pattern covers both '7:05pm' and '7:05 pm'.
    if 'pm' in time_str or 'am' in time_str:
        compact = time_str.replace(' ', '')
        for fmt in ('%I:%M%p', '%I%p'):
            try:
                return datetime.strptime(compact, fmt).strftime('%H:%M')
            except ValueError:
                continue

    # Already 24-hour, or bare 'H:MM' digits.
    if ':' in time_str:
        parts = time_str.split(':')
        if len(parts) >= 2:
            try:
                hour = int(parts[0])
                minute = int(parts[1][:2])
                return f"{hour:02d}:{minute:02d}"
            except ValueError:
                pass

    # Unrecognized format — hand back the cleaned input unchanged.
    return time_str
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CROSS-VALIDATION LOGIC
|
||||
# =============================================================================
|
||||
|
||||
def _missing_game_severity(date_str: str) -> str:
    """Severity for a game that appears in only one source, based on its date.

    Spring-training games (March before the ~26th) and postseason games
    (October onward) are expected to differ between sources, so they are
    downgraded to 'medium'; everything else — and any unparseable date —
    is 'high'.
    """
    try:
        game_date = datetime.strptime(date_str, '%Y-%m-%d')
    except ValueError:
        return 'high'
    if game_date.month == 3 and game_date.day < 26:  # Spring training
        return 'medium'
    if game_date.month >= 10:  # Playoffs/postseason
        return 'medium'
    return 'high'


def validate_games(
    games1: list[Game],
    games2: list[Game],
    source1_name: str,
    source2_name: str,
    sport: str,
    season: str
) -> ValidationReport:
    """
    Compare two lists of games and find discrepancies.

    Games are matched by generate_game_key (date + sorted team abbrevs).
    Matched games have their date, time, and venue fields compared; games
    present in only one source are recorded as 'missing' discrepancies
    with a date-dependent severity.
    """
    report = ValidationReport(
        sport=sport,
        season=season,
        sources=[source1_name, source2_name],
        total_games_source1=len(games1),
        total_games_source2=len(games2)
    )

    # Index games by match key (last entry wins if a source has duplicates).
    games1_by_key = {generate_game_key(g): g for g in games1}
    games2_by_key = {generate_game_key(g): g for g in games2}

    all_keys = set(games1_by_key) | set(games2_by_key)

    for key in all_keys:
        g1 = games1_by_key.get(key)
        g2 = games2_by_key.get(key)

        if g1 and g2:
            # Both sources have this game - compare fields
            report.games_matched += 1

            # Compare dates (should match by key construction, but double-check)
            if g1.date != g2.date:
                report.discrepancies.append(Discrepancy(
                    game_key=key,
                    field='date',
                    source1=source1_name,
                    source2=source2_name,
                    value1=g1.date,
                    value2=g2.date,
                    severity='high'
                ))

            # Compare times; a difference within an hour is likely a
            # timezone artifact and gets downgraded to 'low'.
            time1 = normalize_time(g1.time)
            time2 = normalize_time(g2.time)
            if time1 and time2 and time1 != time2:
                try:
                    t1 = datetime.strptime(time1, '%H:%M')
                    t2 = datetime.strptime(time2, '%H:%M')
                    diff_minutes = abs((t1 - t2).total_seconds() / 60)
                    severity = 'low' if diff_minutes <= 60 else 'medium'
                except ValueError:
                    # One side was un-normalizable (e.g. 'tbd').
                    severity = 'medium'

                report.discrepancies.append(Discrepancy(
                    game_key=key,
                    field='time',
                    source1=source1_name,
                    source2=source2_name,
                    value1=time1 or '',
                    value2=time2 or '',
                    severity=severity
                ))

            # Compare venues; only flag when neither normalized name
            # contains the other (partial matches are accepted).
            venue1 = normalize_venue(g1.venue) if g1.venue else ''
            venue2 = normalize_venue(g2.venue) if g2.venue else ''
            if venue1 and venue2 and venue1 != venue2:
                if venue1 not in venue2 and venue2 not in venue1:
                    report.discrepancies.append(Discrepancy(
                        game_key=key,
                        field='venue',
                        source1=source1_name,
                        source2=source2_name,
                        value1=g1.venue,
                        value2=g2.venue,
                        severity='low'
                    ))

        elif g1 and not g2:
            # Game only in source 1
            report.games_missing_source2 += 1
            report.discrepancies.append(Discrepancy(
                game_key=key,
                field='missing',
                source1=source1_name,
                source2=source2_name,
                value1=f"{g1.away_team} @ {g1.home_team}",
                value2='NOT FOUND',
                severity=_missing_game_severity(g1.date)
            ))

        else:
            # Game only in source 2
            report.games_missing_source1 += 1
            report.discrepancies.append(Discrepancy(
                game_key=key,
                field='missing',
                source1=source1_name,
                source2=source2_name,
                value1='NOT FOUND',
                value2=f"{g2.away_team} @ {g2.home_team}",
                severity=_missing_game_severity(g2.date)
            ))

    return report
|
||||
|
||||
|
||||
def validate_stadiums(stadiums: list[Stadium]) -> list[dict]:
    """
    Validate stadium data for completeness and accuracy.
    """
    issues = []

    def flag(stadium, problem, severity):
        # Record one validation issue in the shared result list.
        issues.append({
            'stadium': stadium.name,
            'sport': stadium.sport,
            'issue': problem,
            'severity': severity,
        })

    for s in stadiums:
        # A zero coordinate on either axis means "not populated".
        if s.latitude == 0 or s.longitude == 0:
            flag(s, 'Missing coordinates', 'high')

        if s.capacity == 0:
            flag(s, 'Missing capacity', 'low')

        # Sanity-check populated coordinates against a rough
        # North America bounding box.
        if s.latitude != 0 and not (24 < s.latitude < 55):
            flag(s, f'Latitude {s.latitude} outside expected range', 'medium')

        if s.longitude != 0 and not (-130 < s.longitude < -60):
            flag(s, f'Longitude {s.longitude} outside expected range', 'medium')

    return issues
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MULTI-SOURCE SCRAPING
|
||||
# =============================================================================
|
||||
|
||||
def scrape_nba_all_sources(season: int) -> dict:
    """Scrape NBA from all available sources."""
    # Season label like '2024-25' for a season ending in 2025.
    label = f"{season-1}-{str(season)[2:]}"
    schedule = assign_stable_ids(
        scrape_nba_basketball_reference(season), 'NBA', label
    )
    # ESPN requires JS rendering, skip for now.
    return {'basketball-reference': schedule}
|
||||
|
||||
|
||||
def scrape_mlb_all_sources(season: int) -> dict:
    """Scrape MLB from all available sources."""
    season_label = str(season)

    # The official MLB stats API keys games by gamePk - already stable.
    statsapi_games = scrape_mlb_statsapi(season)

    # Baseball-Reference rows carry no stable ID; derive one.
    reference_games = assign_stable_ids(
        scrape_mlb_baseball_reference(season), 'MLB', season_label
    )

    return {
        'statsapi.mlb.com': statsapi_games,
        'baseball-reference': reference_games,
    }
|
||||
|
||||
|
||||
def scrape_nhl_all_sources(season: int) -> dict:
    """Scrape NHL from all available sources."""
    # Season label like '2024-25' for a season ending in 2025.
    label = f"{season-1}-{str(season)[2:]}"
    schedule = assign_stable_ids(
        scrape_nhl_hockey_reference(season), 'NHL', label
    )
    # NHL API requires date iteration, skip for now.
    return {'hockey-reference': schedule}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MAIN
|
||||
# =============================================================================
|
||||
|
||||
def main():
    """CLI entry point for the validation tool.

    Two modes:
      * --scrape-and-validate: scrape fresh schedules and cross-validate
        sources against each other (MLB has two sources; NBA is a single
        source checked for internal game-count consistency only).
      * default: load previously scraped games/stadiums JSON from
        --data-dir and run completeness/quality checks.

    Always writes a JSON validation report to --output and prints a
    human-readable summary to stdout.
    """
    parser = argparse.ArgumentParser(description='Validate sports data')
    parser.add_argument('--data-dir', type=str, default='./data', help='Data directory')
    parser.add_argument('--scrape-and-validate', action='store_true', help='Scrape fresh and validate')
    parser.add_argument('--season', type=int, default=2025, help='Season year')
    parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'all'], default='all')
    parser.add_argument('--output', type=str, default='./data/validation_report.json')

    args = parser.parse_args()

    reports = []         # one ValidationReport per cross-validated sport
    stadium_issues = []  # dicts from validate_stadiums (offline mode only)

    if args.scrape_and_validate:
        print("\n" + "="*60)
        print("CROSS-VALIDATION MODE")
        print("="*60)

        # MLB has two good sources - validate
        if args.sport in ['mlb', 'all']:
            print(f"\n--- MLB {args.season} ---")
            mlb_sources = scrape_mlb_all_sources(args.season)

            source_names = list(mlb_sources.keys())
            if len(source_names) >= 2:
                games1 = mlb_sources[source_names[0]]
                games2 = mlb_sources[source_names[1]]

                # Only compare when both scrapes actually returned data.
                if games1 and games2:
                    report = validate_games(
                        games1, games2,
                        source_names[0], source_names[1],
                        'MLB', str(args.season)
                    )
                    reports.append(report)
                    print(f" Compared {report.total_games_source1} vs {report.total_games_source2} games")
                    print(f" Matched: {report.games_matched}")
                    print(f" Discrepancies: {len(report.discrepancies)}")

        # NBA (single source for now, but validate data quality)
        if args.sport in ['nba', 'all']:
            print(f"\n--- NBA {args.season} ---")
            nba_sources = scrape_nba_all_sources(args.season)
            games = nba_sources.get('basketball-reference', [])
            print(f" Got {len(games)} games from Basketball-Reference")

            # Validate internal consistency
            teams_seen = defaultdict(int)
            for g in games:
                teams_seen[g.home_team_abbrev] += 1
                teams_seen[g.away_team_abbrev] += 1

            # Each team should have ~82 games
            for team, count in teams_seen.items():
                if count < 70 or count > 95:
                    print(f" Warning: {team} has {count} games (expected ~82)")

    else:
        # Load existing data and validate
        data_dir = Path(args.data_dir)

        # Load games
        games_file = data_dir / 'games.json'
        if games_file.exists():
            with open(games_file) as f:
                games_data = json.load(f)
            print(f"\nLoaded {len(games_data)} games from {games_file}")

            # Group by sport and validate counts
            by_sport = defaultdict(list)
            for g in games_data:
                by_sport[g['sport']].append(g)

            for sport, sport_games in by_sport.items():
                print(f" {sport}: {len(sport_games)} games")

        # Load and validate stadiums
        stadiums_file = data_dir / 'stadiums.json'
        if stadiums_file.exists():
            with open(stadiums_file) as f:
                stadiums_data = json.load(f)
            stadiums = [Stadium(**s) for s in stadiums_data]
            print(f"\nLoaded {len(stadiums)} stadiums from {stadiums_file}")

            stadium_issues = validate_stadiums(stadiums)
            if stadium_issues:
                print(f"\nStadium validation issues ({len(stadium_issues)}):")
                # Show only the first 10 to keep console output readable.
                for issue in stadium_issues[:10]:
                    print(f" [{issue['severity'].upper()}] {issue['stadium']}: {issue['issue']}")
                if len(stadium_issues) > 10:
                    print(f" ... and {len(stadium_issues) - 10} more")

    # Save validation report
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    full_report = {
        'generated_at': datetime.now().isoformat(),
        'season': args.season,
        'game_validations': [r.to_dict() for r in reports],
        'stadium_issues': stadium_issues
    }

    with open(output_path, 'w') as f:
        json.dump(full_report, f, indent=2)

    print(f"\n Validation report saved to {output_path}")

    # Summary
    print("\n" + "="*60)
    print("VALIDATION SUMMARY")
    print("="*60)

    total_discrepancies = sum(len(r.discrepancies) for r in reports)
    high_severity = sum(
        1 for r in reports
        for d in r.discrepancies
        if d.severity == 'high'
    )

    print(f"Total game validation reports: {len(reports)}")
    print(f"Total discrepancies found: {total_discrepancies}")
    print(f"High severity issues: {high_severity}")
    print(f"Stadium data issues: {len(stadium_issues)}")
|
||||
|
||||
|
||||
# Run the CLI only when executed directly, not when imported by other tools.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user