feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
99
validate_aliases.py
Normal file
99
validate_aliases.py
Normal file
@@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Validate alias files for orphan references and format issues.
|
||||
|
||||
This script checks stadium_aliases.json and team_aliases.json for:
|
||||
1. Orphan references (aliases pointing to non-existent canonical IDs)
|
||||
2. JSON syntax errors
|
||||
3. Required field presence
|
||||
|
||||
Usage:
|
||||
python validate_aliases.py
|
||||
|
||||
Returns exit code 0 on success, 1 on failure.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from sportstime_parser.normalizers.stadium_resolver import STADIUM_MAPPINGS
|
||||
from sportstime_parser.normalizers.team_resolver import TEAM_MAPPINGS
|
||||
|
||||
|
||||
def _check_alias_file(
    filename: str,
    kind: str,
    name_field: str,
    require_name: bool,
    valid_ids: set[str],
    errors: list[str],
) -> None:
    """Validate one alias JSON file, appending any problems to *errors*.

    Args:
        filename: Path of the alias file, opened relative to the current
            working directory.
        kind: Lower-case alias kind ("stadium" or "team"); it determines the
            canonical-ID key (``f"{kind}_canonical_id"``) and message wording.
        name_field: Key holding the human-readable alias, used in messages.
        require_name: Whether a missing *name_field* is reported as an error.
        valid_ids: Canonical IDs an alias is allowed to point at.
        errors: Accumulator that error messages are appended to (mutated).
    """
    id_field = f"{kind}_canonical_id"
    label = kind.capitalize()

    # Keep the try body down to the I/O and parsing that can actually raise,
    # and use a context manager so the file handle is always closed.
    try:
        with open(filename, encoding="utf-8") as handle:
            entries = json.load(handle)
    except FileNotFoundError:
        errors.append(f"{filename} not found")
        return
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        errors.append(f"{filename}: Invalid JSON - {exc}")
        return

    # Anything other than an array would make the per-entry checks below
    # produce misleading messages, so report it once and stop.
    if not isinstance(entries, list):
        errors.append(
            f"{filename}: expected a JSON array of alias objects, "
            f"got {type(entries).__name__}"
        )
        return

    print(f"✓ {filename}: Valid JSON ({len(entries)} aliases)")

    for entry in entries:
        # A non-object entry cannot carry the required keys; report it
        # instead of crashing (or substring-matching on a string entry).
        if not isinstance(entry, dict):
            errors.append(f"{label} alias is not a JSON object: {entry!r}")
            continue

        if require_name and name_field not in entry:
            errors.append(f"{label} alias missing '{name_field}': {entry}")

        if id_field not in entry:
            errors.append(f"{label} alias missing '{id_field}': {entry}")
            continue

        canonical_id = entry[id_field]
        # The isinstance guard keeps unhashable values (e.g. a list) from
        # raising TypeError in the set membership test.
        if not isinstance(canonical_id, str) or canonical_id not in valid_ids:
            errors.append(
                f"Orphan {kind} alias: '{entry.get(name_field, '?')}' -> "
                f"'{canonical_id}'"
            )


def main() -> int:
    """Run validation checks on alias files.

    Collects the canonical stadium and team IDs from ``STADIUM_MAPPINGS`` and
    ``TEAM_MAPPINGS``, then checks ``stadium_aliases.json`` and
    ``team_aliases.json`` (both read from the current working directory) for
    invalid JSON, missing required fields, and aliases that point at unknown
    canonical IDs. Progress and results are printed to stdout.

    Returns:
        0 if every check passed, 1 if any error was found.
    """
    errors: list[str] = []

    # Stadium IDs are the keys of each inner mapping in STADIUM_MAPPINGS.
    valid_stadium_ids: set[str] = {
        stadium_id
        for sport_stadiums in STADIUM_MAPPINGS.values()
        for stadium_id in sport_stadiums
    }

    # Each TEAM_MAPPINGS entry is a sequence whose first element is the team ID.
    valid_team_ids: set[str] = {
        team_data[0]
        for sport_teams in TEAM_MAPPINGS.values()
        for team_data in sport_teams.values()
    }

    print(f"Valid stadium IDs: {len(valid_stadium_ids)}")
    print(f"Valid team IDs: {len(valid_team_ids)}")
    print()

    # Stadium aliases must carry both a name and a canonical ID.
    _check_alias_file(
        "stadium_aliases.json",
        "stadium",
        "alias_name",
        True,
        valid_stadium_ids,
        errors,
    )

    # Team aliases are only required to carry a canonical ID; 'alias_value'
    # is used for display when present.
    _check_alias_file(
        "team_aliases.json",
        "team",
        "alias_value",
        False,
        valid_team_ids,
        errors,
    )

    # Report results
    print()
    if errors:
        print(f"❌ Validation failed with {len(errors)} error(s):")
        for error in errors:
            print(f" - {error}")
        return 1

    print("✅ All aliases valid")
    return 0
|
||||
|
||||
|
||||
# Script entry point: hand main()'s status code to the interpreter as the
# process exit code (0 on success, 1 on validation failure).
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user