feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
59
sportstime_parser/config.py
Normal file
59
sportstime_parser/config.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""Configuration constants for sportstime-parser."""
|
||||
|
||||
import os
from pathlib import Path
|
||||
|
||||
# Filesystem layout, resolved relative to the location of this module
PACKAGE_DIR = Path(__file__).parent  # directory that contains this file
SCRIPTS_DIR = PACKAGE_DIR.parent  # one level above the package directory
OUTPUT_DIR = SCRIPTS_DIR.joinpath("output")
STATE_DIR = SCRIPTS_DIR.joinpath(".parser_state")

# Alias and league-structure JSON files (existing in Scripts/)
TEAM_ALIASES_FILE = SCRIPTS_DIR.joinpath("team_aliases.json")
STADIUM_ALIASES_FILE = SCRIPTS_DIR.joinpath("stadium_aliases.json")
LEAGUE_STRUCTURE_FILE = SCRIPTS_DIR.joinpath("league_structure.json")
|
||||
|
||||
# Lowercase codes of the sports this package supports
SUPPORTED_SPORTS: list[str] = ["nba", "mlb", "nfl", "nhl", "mls", "wnba", "nwsl"]

# Default season, identified by the year in which the season starts
# (e.g., 2025 stands for the 2025-26 season)
DEFAULT_SEASON: int = 2025
|
||||
|
||||
# CloudKit configuration
#
# The environment, key ID, and private-key path may each be overridden by an
# environment variable of the same name, so that switching the target
# environment or rotating the key does not require editing source. An unset
# or empty variable falls back to the default shown here, which is the value
# that was previously hard-coded.
CLOUDKIT_CONTAINER_ID: str = "iCloud.com.sportstime.app"
CLOUDKIT_ENVIRONMENT: str = os.environ.get("CLOUDKIT_ENVIRONMENT") or "development"
# NOTE(review): 200 presumably mirrors CloudKit's per-request operation cap — confirm.
CLOUDKIT_BATCH_SIZE: int = 200
CLOUDKIT_KEY_ID: str = (
    os.environ.get("CLOUDKIT_KEY_ID")
    or "152be0715e0276e31aaea5cbfe79dc872f298861a55c70fae14e5fe3e026cff9"
)
# Path() accepts either the override string or the default Path unchanged.
CLOUDKIT_PRIVATE_KEY_PATH: Path = Path(
    os.environ.get("CLOUDKIT_PRIVATE_KEY_PATH") or SCRIPTS_DIR / "eckey.pem"
)
|
||||
|
||||
# Rate limiting

# Seconds to wait between requests
DEFAULT_REQUEST_DELAY: float = 1.0

MAX_RETRIES: int = 3

# Multiplier used for exponential backoff
BACKOFF_FACTOR: float = 2.0

# Initial backoff, in seconds
INITIAL_BACKOFF: float = 1.0
|
||||
|
||||
# Approximate number of games expected per sport, used for validation.
# Updated 2026-01-20 based on 2025-26 season data.
#
# Where the count follows directly from "teams × games per team / 2" the
# arithmetic is spelled out; the remaining entries are literal totals.
EXPECTED_GAME_COUNTS: dict[str, int] = {
    "nba": 30 * 82 // 2,  # 1230
    "mlb": 30 * 162 // 2,  # 2430, regular season only
    "nfl": 32 * 17 // 2,  # 272, regular season only
    "nhl": 32 * 82 // 2,  # 1312
    "mls": 540,  # 30 teams × varies (updated for 2025 expansion)
    "wnba": 13 * 44 // 2,  # 286 (updated for 2025 expansion)
    "nwsl": 188,  # 14→16 teams × varies (updated for 2025 expansion)
}
|
||||
|
||||
# Lowest score, on a 0-100 scale, accepted for a fuzzy match
FUZZY_MATCH_THRESHOLD: int = 80

# Geographic filter: only games in the countries listed here are included
ALLOWED_COUNTRIES: set[str] = {
    "USA",
    "US",
    "United States",
    "Canada",
    "Mexico",
}
|
||||
Reference in New Issue
Block a user