Files
SportstimeAPI/validate_aliases.py
Trey t 52d445bca4 feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:56:25 -06:00

100 lines
3.3 KiB
Python

#!/usr/bin/env python3
"""Validate alias files for orphan references and format issues.
This script checks stadium_aliases.json and team_aliases.json for:
1. Orphan references (aliases pointing to non-existent canonical IDs)
2. JSON syntax errors
3. Required field presence
Usage:
python validate_aliases.py
Returns exit code 0 on success, 1 on failure.
"""
import json
import sys
from pathlib import Path
# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent))
from sportstime_parser.normalizers.stadium_resolver import STADIUM_MAPPINGS
from sportstime_parser.normalizers.team_resolver import TEAM_MAPPINGS
def _check_alias_file(
    filename: str,
    label: str,
    name_field: str,
    id_field: str,
    valid_ids: set[str],
    errors: list[str],
) -> None:
    """Validate one alias JSON file, appending any problems to *errors*.

    Args:
        filename: Path of the alias file, resolved against the current
            working directory.
        label: Capitalised noun used in messages ("Stadium" or "Team").
        name_field: Key holding the alias text in each entry.
        id_field: Key holding the canonical ID the alias points at.
        valid_ids: Canonical IDs that an alias is allowed to reference.
        errors: Accumulator that receives one message per problem found.
    """
    try:
        # A context manager closes the handle even when parsing fails;
        # JSON is read as UTF-8 instead of the locale's default encoding.
        with open(filename, encoding="utf-8") as handle:
            aliases = json.load(handle)
    except FileNotFoundError:
        errors.append(f"{filename} not found")
        return
    except json.JSONDecodeError as e:
        errors.append(f"{filename}: Invalid JSON - {e}")
        return

    # The entry checks below index each element by key, so anything other
    # than a list of objects is a format error rather than a crash.
    if not isinstance(aliases, list):
        errors.append(
            f"{filename}: Expected a JSON array of alias objects, "
            f"got {type(aliases).__name__}"
        )
        return

    print(f"✓ {filename}: Valid JSON ({len(aliases)} aliases)")

    for alias in aliases:
        if not isinstance(alias, dict):
            errors.append(f"{label} alias is not a JSON object: {alias!r}")
            continue

        # Check required fields
        if name_field not in alias:
            errors.append(f"{label} alias missing '{name_field}': {alias}")
        if id_field not in alias:
            errors.append(f"{label} alias missing '{id_field}': {alias}")
        elif alias[id_field] not in valid_ids:
            errors.append(
                f"Orphan {label.lower()} alias: '{alias.get(name_field, '?')}' -> "
                f"'{alias[id_field]}'"
            )


def main() -> int:
    """Run validation checks on alias files.

    Builds the sets of known canonical stadium and team IDs from
    STADIUM_MAPPINGS and TEAM_MAPPINGS, then checks stadium_aliases.json and
    team_aliases.json (looked up in the current working directory) for
    invalid JSON, missing required fields, and aliases that reference an
    unknown canonical ID.

    Returns:
        0 when no problems were found, 1 otherwise.
    """
    errors: list[str] = []

    # Build valid stadium ID set (the keys of each per-sport mapping)
    valid_stadium_ids: set[str] = set()
    for sport_stadiums in STADIUM_MAPPINGS.values():
        valid_stadium_ids.update(sport_stadiums)

    # Build valid team ID set (team_id is the first element of each value)
    valid_team_ids: set[str] = set()
    for sport_teams in TEAM_MAPPINGS.values():
        valid_team_ids.update(team_data[0] for team_data in sport_teams.values())

    print(f"Valid stadium IDs: {len(valid_stadium_ids)}")
    print(f"Valid team IDs: {len(valid_team_ids)}")
    print()

    # Check stadium aliases
    _check_alias_file(
        "stadium_aliases.json",
        "Stadium",
        "alias_name",
        "stadium_canonical_id",
        valid_stadium_ids,
        errors,
    )

    # Check team aliases
    _check_alias_file(
        "team_aliases.json",
        "Team",
        "alias_value",
        "team_canonical_id",
        valid_team_ids,
        errors,
    )

    # Report results
    print()
    if errors:
        print(f"❌ Validation failed with {len(errors)} error(s):")
        for error in errors:
            print(f" - {error}")
        return 1

    print("✅ All aliases valid")
    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())