- Three-scenario planning engine (A: date range, B: selected games, C: directional routes) - GeographicRouteExplorer with anchor game support for route exploration - Shared ItineraryBuilder for travel segment calculation - TravelEstimator for driving time/distance estimation - SwiftUI views for trip creation and detail display - CloudKit integration for schedule data - Python scraping scripts for sports schedules 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
591 lines
19 KiB
Python
591 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Cross-Validation System for SportsTime App
|
|
Compares scraped data from multiple sources and flags discrepancies.
|
|
|
|
Usage:
|
|
python validate_data.py --data-dir ./data
|
|
python validate_data.py --scrape-and-validate --season 2025
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from dataclasses import dataclass, asdict, field
|
|
from typing import Optional
|
|
from collections import defaultdict
|
|
|
|
# Import scrapers from main script
|
|
from scrape_schedules import (
|
|
Game, Stadium,
|
|
scrape_nba_basketball_reference,
|
|
scrape_mlb_statsapi, scrape_mlb_baseball_reference,
|
|
scrape_nhl_hockey_reference,
|
|
NBA_TEAMS, MLB_TEAMS, NHL_TEAMS,
|
|
assign_stable_ids,
|
|
)
|
|
|
|
|
|
# =============================================================================
|
|
# VALIDATION DATA CLASSES
|
|
# =============================================================================
|
|
|
|
@dataclass
class Discrepancy:
    """Represents a discrepancy between sources."""
    # Matching key from generate_game_key(): "<YYYY-MM-DD>_<TEAM1>_<TEAM2>".
    game_key: str
    field: str  # 'date', 'time', 'venue', 'teams', 'missing'
    # Labels of the two sources being compared (e.g. 'statsapi.mlb.com').
    source1: str
    source2: str
    # Conflicting values as reported by source1 / source2.
    # For field == 'missing', one side is the matchup string and the
    # other is the literal 'NOT FOUND'.
    value1: str
    value2: str
    severity: str  # 'high', 'medium', 'low'
|
|
|
|
|
|
@dataclass
class ValidationReport:
    """Summary of one cross-source validation run for a sport/season.

    Counts how many games each source reported, how many matched by key,
    how many were missing from either side, and collects the individual
    Discrepancy records found along the way.
    """
    sport: str
    season: str
    sources: list
    total_games_source1: int = 0
    total_games_source2: int = 0
    games_matched: int = 0
    games_missing_source1: int = 0
    games_missing_source2: int = 0
    discrepancies: list = field(default_factory=list)

    def get_summary(self):
        """Tally discrepancies by field name and by severity level."""
        counts_by_field = defaultdict(int)
        counts_by_severity = defaultdict(int)
        for item in self.discrepancies:
            counts_by_field[item.field] += 1
            counts_by_severity[item.severity] += 1
        return {
            'by_field': dict(counts_by_field),
            'by_severity': dict(counts_by_severity),
        }

    def to_dict(self):
        """Serialize the report for JSON output, appending the aggregate summary."""
        return {
            'sport': self.sport,
            'season': self.season,
            'sources': self.sources,
            'total_games_source1': self.total_games_source1,
            'total_games_source2': self.total_games_source2,
            'games_matched': self.games_matched,
            'games_missing_source1': self.games_missing_source1,
            'games_missing_source2': self.games_missing_source2,
            'discrepancies': [asdict(item) for item in self.discrepancies],
            'discrepancy_summary': self.get_summary(),
        }
|
|
|
|
|
|
# =============================================================================
|
|
# GAME KEY GENERATION
|
|
# =============================================================================
|
|
|
|
def normalize_abbrev(abbrev: str, sport: str) -> str:
    """Map a source-specific team abbreviation onto one canonical code.

    Different scrape sources spell the same team differently (e.g. MLB's
    'AZ' vs Baseball-Reference's 'ARI'); this picks a single spelling per
    team so game keys line up across sources. Unknown abbreviations (and
    unknown sports) pass through unchanged, uppercased and stripped.
    """
    code = abbrev.upper().strip()

    # Only non-identity mappings are listed; anything absent falls back
    # to the (already canonical or unknown) code itself.
    canonical = {
        'MLB': {
            'AZ': 'ARI',    # Arizona
            'ATH': 'OAK',   # Oakland/Athletics
            'CWS': 'CHW',   # Chicago White Sox
            'KC': 'KCR',    # Kansas City
            'SD': 'SDP',    # San Diego
            'SF': 'SFG',    # San Francisco
            'TB': 'TBR',    # Tampa Bay
            'WSH': 'WSN',   # Washington
        },
        'NBA': {
            'PHX': 'PHO',   # Phoenix
            'BKN': 'BRK',   # Brooklyn
            'CHA': 'CHO',   # Charlotte
            'NO': 'NOP',    # New Orleans
        },
        'NHL': {
            'ARI': 'UTA',   # Arizona moved to Utah
            'VGS': 'VGK',   # Vegas
        },
    }

    return canonical.get(sport, {}).get(code, code)
|
|
|
|
|
|
def generate_game_key(game: Game) -> str:
    """
    Build a source-independent key for matching the same game across sources.

    Combines the game date with both canonical team codes in sorted order,
    so home/away disagreements between sources still collide on one key.
    """
    codes = sorted(
        normalize_abbrev(code, game.sport)
        for code in (game.home_team_abbrev, game.away_team_abbrev)
    )
    return f"{game.date}_{codes[0]}_{codes[1]}"
|
|
|
|
|
|
def normalize_team_name(name: str, sport: str) -> str:
    """Resolve a free-form team name or city to a canonical abbreviation.

    Checks each known team (in table order) for an exact full-name match,
    an exact city match, then a substring match in either direction. If
    nothing matches, falls back to the first three letters uppercased.
    """
    roster = {'NBA': NBA_TEAMS, 'MLB': MLB_TEAMS, 'NHL': NHL_TEAMS}.get(sport, {})

    needle = name.lower().strip()

    for abbrev, info in roster.items():
        full_name = info['name'].lower()
        if needle == full_name or needle == info['city'].lower():
            return abbrev
        # Partial match in either direction (e.g. "Yankees" in full name).
        if needle in full_name or full_name in needle:
            return abbrev

    # Unknown team: best-effort three-letter code.
    return name[:3].upper()
|
|
|
|
|
|
def normalize_venue(venue: str) -> str:
    """Normalize a venue name for cross-source comparison.

    Lowercases, strips leading 'at '/'the ', drops a trailing generic
    word (stadium/arena/center/field/park), and collapses the Crypto.com
    Arena naming variants ('crypto arena' vs 'crypto.com arena') onto
    'crypto.com'.

    Bug fix: the previous version used str.replace, which removed the
    prefix/suffix tokens ANYWHERE in the string — e.g. 'Great American
    Ball Park' became 'gre american ball' because 'at ' matched inside
    'great'. Prefixes are now anchored to the start and suffixes to the
    end of the name.
    """
    normalized = venue.lower().strip()

    # Leading filler words, removed only at the start of the name.
    for prefix in ('at ', 'the '):
        if normalized.startswith(prefix):
            normalized = normalized[len(prefix):]

    # Generic venue-type words, removed only as a trailing word.
    for suffix in (' stadium', ' arena', ' center', ' field', ' park'):
        if normalized.endswith(suffix):
            normalized = normalized[:-len(suffix)]
            break

    # Sponsorship-name quirk: make 'crypto' and 'crypto.com' compare equal.
    normalized = normalized.replace('.com', '')
    normalized = normalized.replace('crypto', 'crypto.com')

    return normalized.strip()
|
|
|
|
|
|
def normalize_time(time_str: Optional[str]) -> Optional[str]:
    """Normalize a game-time string to 24-hour 'HH:MM'.

    Handles 12-hour forms ('7:05 PM', '7pm') and 24-hour / numeric forms
    ('19:05', '9:5'). Returns None for empty input; returns the lowercased,
    stripped original when nothing parseable is found (e.g. 'TBD'), so the
    caller's equality comparison still works on unparsed values.

    Fixes: replaced bare `except:` clauses (which also swallowed
    KeyboardInterrupt/SystemExit) with `except ValueError`, and dropped
    the dead '%I:%M %p' format — spaces are stripped before parsing, so
    it could never match.
    """
    if not time_str:
        return None

    time_str = time_str.strip().lower()

    # 12-hour format with an am/pm marker.
    if 'pm' in time_str or 'am' in time_str:
        compact = time_str.replace(' ', '')
        for fmt in ('%I:%M%p', '%I%p'):
            try:
                return datetime.strptime(compact, fmt).strftime('%H:%M')
            except ValueError:
                continue

    # Already 24-hour or just numbers.
    if ':' in time_str:
        parts = time_str.split(':')
        if len(parts) >= 2:
            try:
                hour = int(parts[0])
                minute = int(parts[1][:2])  # tolerate trailing junk after minutes
                return f"{hour:02d}:{minute:02d}"
            except ValueError:
                pass

    # Unparseable (e.g. 'tbd'): return as-is for best-effort comparison.
    return time_str
|
|
|
|
|
|
# =============================================================================
|
|
# CROSS-VALIDATION LOGIC
|
|
# =============================================================================
|
|
|
|
def _missing_game_severity(date_str: str) -> str:
    """Severity for a game present in only one source.

    Spring-training games (March before the 26th) and postseason games
    (October onward) routinely differ between sources, so they rate
    'medium'; everything else — including an unparseable date — is 'high'.
    """
    try:
        game_date = datetime.strptime(date_str, '%Y-%m-%d')
    except (TypeError, ValueError):
        return 'high'
    if game_date.month == 3 and game_date.day < 26:  # Spring training
        return 'medium'
    if game_date.month >= 10:  # Playoffs/postseason
        return 'medium'
    return 'high'


def validate_games(
    games1: list[Game],
    games2: list[Game],
    source1_name: str,
    source2_name: str,
    sport: str,
    season: str
) -> ValidationReport:
    """
    Compare two lists of games and find discrepancies.

    Games are matched by generate_game_key (date + sorted canonical team
    codes). Matched games are compared on date, start time, and venue;
    games found in only one source are recorded as 'missing' from the
    other. NOTE(review): if one source lists two games with the same key
    (e.g. a double-header on one date), the later one overwrites the
    earlier in the index — confirm double-headers are handled upstream.

    Fixes: the duplicated missing-game severity logic is factored into
    _missing_game_severity, and the three bare `except:` clauses are
    narrowed to the exceptions strptime can actually raise.
    """
    report = ValidationReport(
        sport=sport,
        season=season,
        sources=[source1_name, source2_name],
        total_games_source1=len(games1),
        total_games_source2=len(games2)
    )

    # Index both sources by the cross-source matching key.
    games1_by_key = {generate_game_key(g): g for g in games1}
    games2_by_key = {generate_game_key(g): g for g in games2}

    # Walk the union of keys so missing games on either side are seen.
    all_keys = set(games1_by_key) | set(games2_by_key)

    for key in all_keys:
        g1 = games1_by_key.get(key)
        g2 = games2_by_key.get(key)

        if g1 and g2:
            # Both sources have this game - compare fields.
            report.games_matched += 1

            # Compare dates (should match by key, but double-check).
            if g1.date != g2.date:
                report.discrepancies.append(Discrepancy(
                    game_key=key,
                    field='date',
                    source1=source1_name,
                    source2=source2_name,
                    value1=g1.date,
                    value2=g2.date,
                    severity='high'
                ))

            # Compare start times; within an hour is likely a timezone
            # artifact, so it only rates 'low'.
            time1 = normalize_time(g1.time)
            time2 = normalize_time(g2.time)
            if time1 and time2 and time1 != time2:
                try:
                    t1 = datetime.strptime(time1, '%H:%M')
                    t2 = datetime.strptime(time2, '%H:%M')
                    diff_minutes = abs((t1 - t2).total_seconds() / 60)
                    severity = 'low' if diff_minutes <= 60 else 'medium'
                except ValueError:
                    severity = 'medium'

                report.discrepancies.append(Discrepancy(
                    game_key=key,
                    field='time',
                    source1=source1_name,
                    source2=source2_name,
                    value1=time1 or '',
                    value2=time2 or '',
                    severity=severity
                ))

            # Compare venues; only flag when neither normalized name
            # contains the other (sponsorship renames etc.).
            venue1 = normalize_venue(g1.venue) if g1.venue else ''
            venue2 = normalize_venue(g2.venue) if g2.venue else ''
            if venue1 and venue2 and venue1 != venue2:
                if venue1 not in venue2 and venue2 not in venue1:
                    report.discrepancies.append(Discrepancy(
                        game_key=key,
                        field='venue',
                        source1=source1_name,
                        source2=source2_name,
                        value1=g1.venue,
                        value2=g2.venue,
                        severity='low'
                    ))

        elif g1:
            # Game only in source 1.
            report.games_missing_source2 += 1
            report.discrepancies.append(Discrepancy(
                game_key=key,
                field='missing',
                source1=source1_name,
                source2=source2_name,
                value1=f"{g1.away_team} @ {g1.home_team}",
                value2='NOT FOUND',
                severity=_missing_game_severity(g1.date)
            ))

        else:
            # Game only in source 2.
            report.games_missing_source1 += 1
            report.discrepancies.append(Discrepancy(
                game_key=key,
                field='missing',
                source1=source1_name,
                source2=source2_name,
                value1='NOT FOUND',
                value2=f"{g2.away_team} @ {g2.home_team}",
                severity=_missing_game_severity(g2.date)
            ))

    return report
|
|
|
|
|
|
def validate_stadiums(stadiums: list[Stadium]) -> list[dict]:
    """
    Validate stadium data for completeness and accuracy.

    Returns a list of issue dicts ('stadium', 'sport', 'issue',
    'severity'); an empty list means every record looks sane. Zero is
    treated as the scraper's placeholder for a missing coordinate or
    capacity, and non-zero coordinates are sanity-checked against a
    rough North-America bounding box.
    """
    issues = []

    def flag(stadium, description, severity):
        # Uniform issue-record shape shared by every check below.
        issues.append({
            'stadium': stadium.name,
            'sport': stadium.sport,
            'issue': description,
            'severity': severity,
        })

    for s in stadiums:
        if s.latitude == 0 or s.longitude == 0:
            flag(s, 'Missing coordinates', 'high')

        if s.capacity == 0:
            flag(s, 'Missing capacity', 'low')

        # Coordinate plausibility: roughly continental North America.
        if s.latitude != 0 and not (24 < s.latitude < 55):
            flag(s, f'Latitude {s.latitude} outside expected range', 'medium')

        if s.longitude != 0 and not (-130 < s.longitude < -60):
            flag(s, f'Longitude {s.longitude} outside expected range', 'medium')

    return issues
|
|
|
|
|
|
# =============================================================================
|
|
# MULTI-SOURCE SCRAPING
|
|
# =============================================================================
|
|
|
|
def scrape_nba_all_sources(season: int) -> dict:
    """Scrape NBA schedules, keyed by source label.

    Currently a single source; kept in multi-source shape so validation
    code can treat all sports uniformly.
    """
    # NBA seasons span two calendar years, e.g. 2025 -> "2024-25".
    season_label = f"{season-1}-{str(season)[2:]}"
    schedule = scrape_nba_basketball_reference(season)
    return {
        'basketball-reference': assign_stable_ids(schedule, 'NBA', season_label),
        # ESPN requires JS rendering, skip for now
    }
|
|
|
|
|
|
def scrape_mlb_all_sources(season: int) -> dict:
    """Scrape MLB schedules from both sources, keyed by source label."""
    season_label = str(season)

    # The official MLB Stats API already carries a stable gamePk per game.
    api_games = scrape_mlb_statsapi(season)

    # Baseball-Reference rows have no IDs, so synthesize stable ones.
    br_games = assign_stable_ids(
        scrape_mlb_baseball_reference(season), 'MLB', season_label
    )

    return {
        'statsapi.mlb.com': api_games,
        'baseball-reference': br_games,
    }
|
|
|
|
|
|
def scrape_nhl_all_sources(season: int) -> dict:
    """Scrape NHL schedules, keyed by source label.

    Single source for now; same dict shape as the other sports so the
    validation driver can treat them uniformly.
    """
    # NHL seasons span two calendar years, e.g. 2025 -> "2024-25".
    season_label = f"{season-1}-{str(season)[2:]}"
    schedule = scrape_nhl_hockey_reference(season)
    return {
        'hockey-reference': assign_stable_ids(schedule, 'NHL', season_label),
        # NHL API requires date iteration, skip for now
    }
|
|
|
|
|
|
# =============================================================================
|
|
# MAIN
|
|
# =============================================================================
|
|
|
|
def main():
    """CLI entry point.

    Two modes:
      --scrape-and-validate : scrape fresh schedules and cross-validate
                              sources against each other;
      default               : load previously scraped JSON from --data-dir
                              and sanity-check it (counts, stadium data).
    Always writes a JSON validation report to --output and prints a summary.
    """
    parser = argparse.ArgumentParser(description='Validate sports data')
    parser.add_argument('--data-dir', type=str, default='./data', help='Data directory')
    parser.add_argument('--scrape-and-validate', action='store_true', help='Scrape fresh and validate')
    parser.add_argument('--season', type=int, default=2025, help='Season year')
    parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'all'], default='all')
    parser.add_argument('--output', type=str, default='./data/validation_report.json')

    args = parser.parse_args()

    reports = []         # ValidationReport objects from cross-source comparison
    stadium_issues = []  # issue dicts from validate_stadiums()

    if args.scrape_and_validate:
        print("\n" + "="*60)
        print("CROSS-VALIDATION MODE")
        print("="*60)

        # MLB has two good sources - validate
        if args.sport in ['mlb', 'all']:
            print(f"\n--- MLB {args.season} ---")
            mlb_sources = scrape_mlb_all_sources(args.season)

            source_names = list(mlb_sources.keys())
            if len(source_names) >= 2:
                # Compare the first two sources against each other.
                games1 = mlb_sources[source_names[0]]
                games2 = mlb_sources[source_names[1]]

                # Only validate when both scrapes actually returned data.
                if games1 and games2:
                    report = validate_games(
                        games1, games2,
                        source_names[0], source_names[1],
                        'MLB', str(args.season)
                    )
                    reports.append(report)
                    print(f" Compared {report.total_games_source1} vs {report.total_games_source2} games")
                    print(f" Matched: {report.games_matched}")
                    print(f" Discrepancies: {len(report.discrepancies)}")

        # NBA (single source for now, but validate data quality)
        if args.sport in ['nba', 'all']:
            print(f"\n--- NBA {args.season} ---")
            nba_sources = scrape_nba_all_sources(args.season)
            games = nba_sources.get('basketball-reference', [])
            print(f" Got {len(games)} games from Basketball-Reference")

            # Validate internal consistency
            teams_seen = defaultdict(int)
            for g in games:
                teams_seen[g.home_team_abbrev] += 1
                teams_seen[g.away_team_abbrev] += 1

            # Each team should have ~82 games
            for team, count in teams_seen.items():
                if count < 70 or count > 95:
                    print(f" Warning: {team} has {count} games (expected ~82)")

    else:
        # Load existing data and validate
        data_dir = Path(args.data_dir)

        # Load games
        games_file = data_dir / 'games.json'
        if games_file.exists():
            with open(games_file) as f:
                games_data = json.load(f)
            print(f"\nLoaded {len(games_data)} games from {games_file}")

            # Group by sport and validate counts
            by_sport = defaultdict(list)
            for g in games_data:
                by_sport[g['sport']].append(g)

            for sport, sport_games in by_sport.items():
                print(f" {sport}: {len(sport_games)} games")

        # Load and validate stadiums
        stadiums_file = data_dir / 'stadiums.json'
        if stadiums_file.exists():
            with open(stadiums_file) as f:
                stadiums_data = json.load(f)
            stadiums = [Stadium(**s) for s in stadiums_data]
            print(f"\nLoaded {len(stadiums)} stadiums from {stadiums_file}")

            stadium_issues = validate_stadiums(stadiums)
            if stadium_issues:
                print(f"\nStadium validation issues ({len(stadium_issues)}):")
                # Show only the first 10 to keep console output readable.
                for issue in stadium_issues[:10]:
                    print(f" [{issue['severity'].upper()}] {issue['stadium']}: {issue['issue']}")
                if len(stadium_issues) > 10:
                    print(f" ... and {len(stadium_issues) - 10} more")

    # Save validation report
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    full_report = {
        'generated_at': datetime.now().isoformat(),
        'season': args.season,
        'game_validations': [r.to_dict() for r in reports],
        'stadium_issues': stadium_issues
    }

    with open(output_path, 'w') as f:
        json.dump(full_report, f, indent=2)

    print(f"\n Validation report saved to {output_path}")

    # Summary
    print("\n" + "="*60)
    print("VALIDATION SUMMARY")
    print("="*60)

    total_discrepancies = sum(len(r.discrepancies) for r in reports)
    high_severity = sum(
        1 for r in reports
        for d in r.discrepancies
        if d.severity == 'high'
    )

    print(f"Total game validation reports: {len(reports)}")
    print(f"Total discrepancies found: {total_discrepancies}")
    print(f"High severity issues: {high_severity}")
    print(f"Stadium data issues: {len(stadium_issues)}")


if __name__ == '__main__':
    main()
|