feat(scripts): rewrite parser as modular Python CLI
Replace monolithic scraping scripts with sportstime_parser package:

- Multi-source scrapers with automatic fallback for 7 sports
- Canonical ID generation for games, teams, and stadiums
- Fuzzy matching with configurable thresholds for name resolution
- CloudKit Web Services uploader with JWT auth, diff-based updates
- Resumable uploads with checkpoint state persistence
- Validation reports with manual review items and suggested matches
- Comprehensive test suite (249 tests)

CLI: sportstime-parser scrape|validate|upload|status|retry|clear

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
56
Scripts/sportstime_parser/config.py
Normal file
56
Scripts/sportstime_parser/config.py
Normal file
@@ -0,0 +1,56 @@
"""Configuration constants for sportstime-parser."""

from pathlib import Path

# Package paths. All of them are derived from the location of this module,
# so moving the package directory moves every path below with it.
PACKAGE_DIR: Path = Path(__file__).parent  # directory holding this module
SCRIPTS_DIR: Path = PACKAGE_DIR.parent  # one level above the package
OUTPUT_DIR: Path = SCRIPTS_DIR / "output"
STATE_DIR: Path = SCRIPTS_DIR / ".parser_state"

# Alias files (existing in Scripts/). These are plain paths; nothing is
# opened or checked for existence at import time.
TEAM_ALIASES_FILE: Path = SCRIPTS_DIR / "team_aliases.json"
STADIUM_ALIASES_FILE: Path = SCRIPTS_DIR / "stadium_aliases.json"
LEAGUE_STRUCTURE_FILE: Path = SCRIPTS_DIR / "league_structure.json"

# Supported sports, as lowercase league codes.
SUPPORTED_SPORTS: list[str] = [
    "nba",
    "mlb",
    "nfl",
    "nhl",
    "mls",
    "wnba",
    "nwsl",
]

# Default season (start year of the season, e.g., 2025 for 2025-26)
DEFAULT_SEASON: int = 2025

# CloudKit configuration
CLOUDKIT_CONTAINER_ID: str = "iCloud.com.sportstime.app"
CLOUDKIT_ENVIRONMENT: str = "development"
# NOTE(review): 200 appears to match the per-request operation cap of the
# CloudKit Web Services records/modify endpoint; confirm before raising it.
CLOUDKIT_BATCH_SIZE: int = 200

# Rate limiting
DEFAULT_REQUEST_DELAY: float = 1.0  # seconds between requests
MAX_RETRIES: int = 3
BACKOFF_FACTOR: float = 2.0  # exponential backoff multiplier
INITIAL_BACKOFF: float = 1.0  # initial backoff in seconds

# Expected game counts per sport (approximate, for validation).
# Each figure is teams × games-per-team / 2, because every game is shared
# by two teams.
# NOTE(review): the "mls" and "wnba" entries were corrected so the value
# agrees with the team count stated in its comment. The per-team game counts
# used (34 and 44) are taken from the 2025 league schedules - confirm.
EXPECTED_GAME_COUNTS: dict[str, int] = {
    "nba": 1230,  # 30 teams × 82 games / 2
    "mlb": 2430,  # 30 teams × 162 games / 2
    "nfl": 272,  # 32 teams × 17 games / 2
    "nhl": 1312,  # 32 teams × 82 games / 2
    "mls": 510,  # 30 teams × 34 games / 2 (was 493 = 29 teams × 34 / 2)
    "wnba": 286,  # 13 teams × 44 games / 2 (was 220; 13 × 40 / 2 is 260)
    "nwsl": 182,  # 14 teams × 26 games / 2
}

# Minimum match score for fuzzy matching (0-100)
FUZZY_MATCH_THRESHOLD: int = 80

# Geographic filter (only include games in these countries).
# The United States is listed under three spellings; Canada and Mexico
# under one each.
ALLOWED_COUNTRIES: set[str] = {"USA", "US", "United States", "Canada", "Mexico"}
Reference in New Issue
Block a user