feat(scripts): complete data pipeline remediation
Scripts changes:
- Add WNBA abbreviation aliases to team_resolver.py
- Fix NHL stadium coordinates in stadium_resolver.py
- Add validate_aliases.py script for orphan detection
- Update scrapers with improved error handling
- Add DATA_AUDIT.md and REMEDIATION_PLAN.md documentation
- Update alias JSON files with new mappings

iOS bundle updates:
- Update games_canonical.json with latest scraped data
- Update teams_canonical.json and stadiums_canonical.json
- Sync alias files with Scripts versions

All 5 remediation phases complete.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
99
Scripts/validate_aliases.py
Normal file
99
Scripts/validate_aliases.py
Normal file
@@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Validate alias files for orphan references and format issues.
|
||||
|
||||
This script checks stadium_aliases.json and team_aliases.json for:
|
||||
1. Orphan references (aliases pointing to non-existent canonical IDs)
|
||||
2. JSON syntax errors
|
||||
3. Required field presence
|
||||
|
||||
Usage:
|
||||
python validate_aliases.py
|
||||
|
||||
Returns exit code 0 on success, 1 on failure.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add this script's own directory to sys.path so the sportstime_parser imports below resolve
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from sportstime_parser.normalizers.stadium_resolver import STADIUM_MAPPINGS
|
||||
from sportstime_parser.normalizers.team_resolver import TEAM_MAPPINGS
|
||||
|
||||
|
||||
def _load_alias_file(filename: str, errors: list[str]) -> list:
    """Load an alias JSON file, recording any failure in *errors*.

    The file is opened relative to the current working directory and read
    as UTF-8. A context manager guarantees the handle is closed even when
    parsing fails.

    Args:
        filename: Name of the JSON file to read.
        errors: Accumulator that receives a message when the file is
            missing or is not valid JSON.

    Returns:
        The parsed JSON content, or an empty list when the file could not
        be loaded (so callers can iterate unconditionally).
    """
    try:
        with open(filename, encoding="utf-8") as handle:
            aliases = json.load(handle)
    except FileNotFoundError:
        errors.append(f"{filename} not found")
        return []
    except json.JSONDecodeError as e:
        errors.append(f"{filename}: Invalid JSON - {e}")
        return []

    print(f"✓ {filename}: Valid JSON ({len(aliases)} aliases)")
    return aliases


def main() -> int:
    """Run validation checks on alias files.

    Builds the sets of known stadium and team IDs from STADIUM_MAPPINGS and
    TEAM_MAPPINGS, then checks every entry of stadium_aliases.json and
    team_aliases.json (both read from the current working directory) for
    missing required fields and for canonical IDs that are not in those
    sets. Progress and the final report are printed to stdout.

    Returns:
        0 when no problems were found, 1 otherwise.
    """
    errors: list[str] = []

    # Stadium IDs are the keys of each per-sport mapping.
    valid_stadium_ids: set[str] = {
        stadium_id
        for sport_stadiums in STADIUM_MAPPINGS.values()
        for stadium_id in sport_stadiums
    }

    # Team IDs are the first element of each per-sport mapping value.
    valid_team_ids: set[str] = {
        team_data[0]
        for sport_teams in TEAM_MAPPINGS.values()
        for team_data in sport_teams.values()
    }

    print(f"Valid stadium IDs: {len(valid_stadium_ids)}")
    print(f"Valid team IDs: {len(valid_team_ids)}")
    print()

    # Check stadium aliases
    for alias in _load_alias_file("stadium_aliases.json", errors):
        if not isinstance(alias, dict):
            # Report malformed entries instead of crashing on the key tests.
            errors.append(f"Stadium alias is not a JSON object: {alias!r}")
            continue
        if "alias_name" not in alias:
            errors.append(f"Stadium alias missing 'alias_name': {alias}")
        if "stadium_canonical_id" not in alias:
            errors.append(f"Stadium alias missing 'stadium_canonical_id': {alias}")
        elif alias["stadium_canonical_id"] not in valid_stadium_ids:
            errors.append(
                f"Orphan stadium alias: '{alias.get('alias_name', '?')}' -> "
                f"'{alias['stadium_canonical_id']}'"
            )

    # Check team aliases
    for alias in _load_alias_file("team_aliases.json", errors):
        if not isinstance(alias, dict):
            # Report malformed entries instead of crashing on the key tests.
            errors.append(f"Team alias is not a JSON object: {alias!r}")
            continue
        if "team_canonical_id" not in alias:
            errors.append(f"Team alias missing 'team_canonical_id': {alias}")
        elif alias["team_canonical_id"] not in valid_team_ids:
            errors.append(
                f"Orphan team alias: '{alias.get('alias_value', '?')}' -> "
                f"'{alias['team_canonical_id']}'"
            )

    # Report results
    print()
    if errors:
        print(f"❌ Validation failed with {len(errors)} error(s):")
        for error in errors:
            print(f" - {error}")
        return 1

    print("✅ All aliases valid")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit code.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user