Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
100 lines
3.3 KiB
Python
100 lines
3.3 KiB
Python
#!/usr/bin/env python3
|
|
"""Validate alias files for orphan references and format issues.
|
|
|
|
This script checks stadium_aliases.json and team_aliases.json for:
|
|
1. Orphan references (aliases pointing to non-existent canonical IDs)
|
|
2. JSON syntax errors
|
|
3. Required field presence
|
|
|
|
Usage:
|
|
python validate_aliases.py
|
|
|
|
Returns exit code 0 on success, 1 on failure.
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add parent to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from sportstime_parser.normalizers.stadium_resolver import STADIUM_MAPPINGS
|
|
from sportstime_parser.normalizers.team_resolver import TEAM_MAPPINGS
|
|
|
|
|
|
def _check_alias_file(
    filename: str,
    kind: str,
    id_field: str,
    name_field: str,
    valid_ids: set[str],
    require_name: bool,
) -> list[str]:
    """Load one alias file and return the validation errors found in it.

    Prints a one-line confirmation when the file parses as a JSON array.

    Args:
        filename: Path of the alias file; a relative path is resolved
            against the current working directory.
        kind: Lower-case noun used in messages ("stadium" or "team").
        id_field: Key holding the canonical ID each alias points to.
        name_field: Key holding the alias text, used to label orphan errors.
        valid_ids: Canonical IDs an alias is allowed to reference.
        require_name: When True, a missing ``name_field`` is itself an error.

    Returns:
        Human-readable error messages; empty when the file is clean.
    """
    try:
        # The context manager closes the handle even if parsing fails.
        # JSON text is UTF-8, so do not depend on the locale's default codec.
        with open(filename, encoding="utf-8") as handle:
            aliases = json.load(handle)
    except FileNotFoundError:
        return [f"{filename} not found"]
    except OSError as e:
        # e.g. permission denied or the path is a directory.
        return [f"{filename}: Cannot read file - {e}"]
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        return [f"{filename}: Invalid JSON - {e}"]

    # The per-entry checks below need an array of objects; report anything
    # else as a format error instead of crashing on len() or the `in` tests.
    if not isinstance(aliases, list):
        return [
            f"{filename}: Expected a JSON array of aliases, "
            f"got {type(aliases).__name__}"
        ]

    print(f"✓ {filename}: Valid JSON ({len(aliases)} aliases)")

    label = kind.capitalize()
    errors: list[str] = []
    for alias in aliases:
        if not isinstance(alias, dict):
            errors.append(f"{label} alias is not a JSON object: {alias!r}")
            continue

        # Check required fields
        if require_name and name_field not in alias:
            errors.append(f"{label} alias missing '{name_field}': {alias}")
        if id_field not in alias:
            errors.append(f"{label} alias missing '{id_field}': {alias}")
        elif alias[id_field] not in valid_ids:
            errors.append(
                f"Orphan {kind} alias: '{alias.get(name_field, '?')}' -> "
                f"'{alias[id_field]}'"
            )
    return errors


def main() -> int:
    """Run validation checks on alias files.

    Collects the canonical stadium and team IDs from STADIUM_MAPPINGS and
    TEAM_MAPPINGS, then checks stadium_aliases.json and team_aliases.json
    (looked up in the current working directory) for unreadable or invalid
    JSON, missing required fields, and aliases that reference an unknown
    canonical ID. Progress and all errors are printed to stdout.

    Returns:
        0 when both files are valid, 1 when any error was found.
    """
    # Build valid stadium ID set (the keys of each per-sport mapping)
    valid_stadium_ids: set[str] = set()
    for sport_stadiums in STADIUM_MAPPINGS.values():
        valid_stadium_ids.update(sport_stadiums)

    # Build valid team ID set
    valid_team_ids: set[str] = set()
    for sport_teams in TEAM_MAPPINGS.values():
        # team_id is first element of each mapping value
        valid_team_ids.update(
            team_data[0] for team_data in sport_teams.values()
        )

    print(f"Valid stadium IDs: {len(valid_stadium_ids)}")
    print(f"Valid team IDs: {len(valid_team_ids)}")
    print()

    errors: list[str] = []

    # Check stadium aliases (alias_name is a required field)
    errors += _check_alias_file(
        "stadium_aliases.json",
        "stadium",
        "stadium_canonical_id",
        "alias_name",
        valid_stadium_ids,
        True,
    )

    # Check team aliases (only the canonical ID is required; alias_value
    # is used just to label orphan messages)
    errors += _check_alias_file(
        "team_aliases.json",
        "team",
        "team_canonical_id",
        "alias_value",
        valid_team_ids,
        False,
    )

    # Report results
    print()
    if errors:
        print(f"❌ Validation failed with {len(errors)} error(s):")
        for error in errors:
            print(f" - {error}")
        return 1

    print("✅ All aliases valid")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: hand main()'s integer result to the interpreter
    # as the process exit status.
    raise SystemExit(main())
|