feat(scripts): rewrite parser as modular Python CLI

Replace monolithic scraping scripts with sportstime_parser package:

- Multi-source scrapers with automatic fallback for 7 sports
- Canonical ID generation for games, teams, and stadiums
- Fuzzy matching with configurable thresholds for name resolution
- CloudKit Web Services uploader with JWT auth, diff-based updates
- Resumable uploads with checkpoint state persistence
- Validation reports with manual review items and suggested matches
- Comprehensive test suite (249 tests)

CLI: sportstime-parser scrape|validate|upload|status|retry|clear

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-10 21:06:12 -06:00
parent 284a10d9e1
commit eeaf900e5a
109 changed files with 18415 additions and 266211 deletions

View File

@@ -0,0 +1,58 @@
"""Utility modules for sportstime-parser."""
from .logging import (
get_console,
get_logger,
is_verbose,
log_error,
log_failure,
log_game,
log_stadium,
log_success,
log_team,
log_warning,
set_verbose,
)
from .http import (
RateLimitedSession,
get_session,
fetch_url,
fetch_json,
fetch_html,
)
from .progress import (
create_progress,
create_spinner_progress,
progress_bar,
track_progress,
ProgressTracker,
ScrapeProgress,
)
__all__ = [
# Logging
"get_console",
"get_logger",
"is_verbose",
"log_error",
"log_failure",
"log_game",
"log_stadium",
"log_success",
"log_team",
"log_warning",
"set_verbose",
# HTTP
"RateLimitedSession",
"get_session",
"fetch_url",
"fetch_json",
"fetch_html",
# Progress
"create_progress",
"create_spinner_progress",
"progress_bar",
"track_progress",
"ProgressTracker",
"ScrapeProgress",
]

View File

@@ -0,0 +1,276 @@
"""HTTP utilities with rate limiting and exponential backoff."""
import random
import time
from typing import Optional
from urllib.parse import urlparse
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from ..config import (
DEFAULT_REQUEST_DELAY,
MAX_RETRIES,
BACKOFF_FACTOR,
INITIAL_BACKOFF,
)
from .logging import get_logger, log_warning
# User agents for rotation to avoid blocks.
# NOTE(review): desktop Chrome/Safari/Firefox strings pinned to late-2023
# browser versions; refresh periodically so they remain plausible.
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
]
class RateLimitedSession:
    """HTTP session with rate limiting and exponential backoff.

    Features:
    - Configurable delay between requests
    - Automatic 429 detection with exponential backoff
    - User-agent rotation
    - Connection pooling
    - Automatic retries for transient errors

    Note:
        Two retry layers are active: urllib3's ``Retry`` mounted on the
        underlying session transparently retries connection errors and
        500/502/503/504 responses, while the manual loop in :meth:`get`
        handles 429 backoff.  Pacing uses one session-wide timestamp
        (``last_request_time``), so the minimum delay applies across all
        domains; ``_domain_delays`` only lengthens the wait for domains
        that have previously answered 429.
    """

    def __init__(
        self,
        delay: float = DEFAULT_REQUEST_DELAY,
        max_retries: int = MAX_RETRIES,
        backoff_factor: float = BACKOFF_FACTOR,
        initial_backoff: float = INITIAL_BACKOFF,
    ):
        """Initialize the rate-limited session.

        Args:
            delay: Minimum delay between requests in seconds
            max_retries: Maximum number of retry attempts
            backoff_factor: Multiplier for exponential backoff
            initial_backoff: Initial backoff duration in seconds
        """
        self.delay = delay
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor
        self.initial_backoff = initial_backoff
        # Start time of the most recent request (shared across all domains).
        self.last_request_time: float = 0.0
        # Per-domain extra delays, grown by _handle_429 and capped at 60s.
        self._domain_delays: dict[str, float] = {}
        # Create session with retry adapter
        self.session = requests.Session()
        # Configure automatic retries for connection errors
        retry_strategy = Retry(
            total=max_retries,
            backoff_factor=0.5,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["GET", "HEAD"],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy, pool_maxsize=10)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)
        self._logger = get_logger()

    def _get_user_agent(self) -> str:
        """Get a random user agent from the rotation pool."""
        return random.choice(USER_AGENTS)

    def _get_domain(self, url: str) -> str:
        """Extract domain (netloc) from URL."""
        parsed = urlparse(url)
        return parsed.netloc

    def _wait_for_rate_limit(self, url: str) -> None:
        """Sleep just long enough to honor the effective per-request delay."""
        domain = self._get_domain(url)
        # Get domain-specific delay (if 429 was received)
        domain_delay = self._domain_delays.get(domain, 0.0)
        effective_delay = max(self.delay, domain_delay)
        elapsed = time.time() - self.last_request_time
        if elapsed < effective_delay:
            sleep_time = effective_delay - elapsed
            self._logger.debug(f"Rate limiting: sleeping {sleep_time:.2f}s")
            time.sleep(sleep_time)

    def _handle_429(self, url: str, attempt: int) -> float:
        """Handle 429 Too Many Requests with exponential backoff.

        Also raises the domain's standing delay so subsequent requests to
        the same host are paced more conservatively.

        Returns the backoff duration in seconds.
        """
        domain = self._get_domain(url)
        backoff = self.initial_backoff * (self.backoff_factor ** attempt)
        # Add jitter to prevent thundering herd
        backoff += random.uniform(0, 1)
        # Update domain-specific delay
        self._domain_delays[domain] = min(backoff * 2, 60.0)  # Cap at 60s
        log_warning(f"Rate limited (429) for {domain}, backing off {backoff:.1f}s")
        return backoff

    def get(
        self,
        url: str,
        headers: Optional[dict] = None,
        params: Optional[dict] = None,
        timeout: float = 30.0,
    ) -> requests.Response:
        """Make a rate-limited GET request with automatic retries.

        Args:
            url: URL to fetch
            headers: Additional headers to include (merged over the rotated
                User-Agent header)
            params: Query parameters
            timeout: Request timeout in seconds

        Returns:
            Response object

        Raises:
            requests.RequestException: If all retries fail
        """
        # Prepare headers with user agent
        request_headers = {"User-Agent": self._get_user_agent()}
        if headers:
            request_headers.update(headers)
        last_exception: Optional[Exception] = None
        for attempt in range(self.max_retries + 1):
            try:
                # Wait for rate limit
                self._wait_for_rate_limit(url)
                # Make request; pacing is measured from request start.
                self.last_request_time = time.time()
                response = self.session.get(
                    url,
                    headers=request_headers,
                    params=params,
                    timeout=timeout,
                )
                # Handle 429
                if response.status_code == 429:
                    if attempt < self.max_retries:
                        backoff = self._handle_429(url, attempt)
                        time.sleep(backoff)
                        continue
                    else:
                        # Out of retries: raise_for_status() turns the 429
                        # into an HTTPError, re-raised by the except below.
                        response.raise_for_status()
                # Return successful response (non-429 statuses, including
                # other 4xx, are returned as-is for the caller to inspect).
                return response
            except requests.RequestException as e:
                last_exception = e
                if attempt < self.max_retries:
                    backoff = self.initial_backoff * (self.backoff_factor ** attempt)
                    self._logger.warning(
                        f"Request failed (attempt {attempt + 1}): {e}, retrying in {backoff:.1f}s"
                    )
                    time.sleep(backoff)
                else:
                    raise
        # Should not reach here, but just in case
        if last_exception:
            raise last_exception
        raise requests.RequestException("Max retries exceeded")

    def get_json(
        self,
        url: str,
        headers: Optional[dict] = None,
        params: Optional[dict] = None,
        timeout: float = 30.0,
    ) -> dict:
        """Make a rate-limited GET request and parse JSON response.

        Args:
            url: URL to fetch
            headers: Additional headers to include
            params: Query parameters
            timeout: Request timeout in seconds

        Returns:
            Parsed JSON as dictionary

        Raises:
            requests.RequestException: If request fails (raise_for_status is
                applied here, so 4xx/5xx responses raise)
            ValueError: If response is not valid JSON
        """
        response = self.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()

    def get_html(
        self,
        url: str,
        headers: Optional[dict] = None,
        params: Optional[dict] = None,
        timeout: float = 30.0,
    ) -> str:
        """Make a rate-limited GET request and return HTML text.

        Args:
            url: URL to fetch
            headers: Additional headers to include
            params: Query parameters
            timeout: Request timeout in seconds

        Returns:
            HTML text content

        Raises:
            requests.RequestException: If request fails (4xx/5xx raise)
        """
        response = self.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        return response.text

    def reset_domain_delays(self) -> None:
        """Reset domain-specific delays (e.g., after a long pause)."""
        self._domain_delays.clear()

    def close(self) -> None:
        """Close the session and release resources."""
        self.session.close()
# Process-wide session, built lazily on first use by get_session().
_global_session: Optional[RateLimitedSession] = None


def get_session() -> RateLimitedSession:
    """Return the shared rate-limited session, creating it if needed."""
    global _global_session
    session = _global_session
    if session is None:
        session = RateLimitedSession()
        _global_session = session
    return session
def fetch_url(url: str, **kwargs) -> requests.Response:
    """Fetch *url* through the shared rate-limited session."""
    session = get_session()
    return session.get(url, **kwargs)
def fetch_json(url: str, **kwargs) -> dict:
    """Fetch *url* through the shared session and decode its JSON body."""
    session = get_session()
    return session.get_json(url, **kwargs)
def fetch_html(url: str, **kwargs) -> str:
    """Fetch *url* through the shared session and return its text body."""
    session = get_session()
    return session.get_html(url, **kwargs)

View File

@@ -0,0 +1,149 @@
"""Logging infrastructure for sportstime-parser."""
import logging
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
from rich.console import Console
from rich.logging import RichHandler
from ..config import SCRIPTS_DIR
# Module-level state, all lazily initialized.
_logger: Optional[logging.Logger] = None  # built on first get_logger() call
_verbose: bool = False  # toggled via set_verbose(); read via is_verbose()
_console: Optional[Console] = None  # shared Rich console from get_console()
def get_console() -> Console:
    """Return the shared Rich console, constructing it on first use."""
    global _console
    console = _console
    if console is None:
        console = Console()
        _console = console
    return console
def set_verbose(verbose: bool) -> None:
    """Toggle application-wide verbose mode.

    Verbose lowers the logger threshold to DEBUG; otherwise it is INFO.
    """
    global _verbose
    _verbose = verbose
    get_logger().setLevel(logging.DEBUG if verbose else logging.INFO)
def is_verbose() -> bool:
    """Check if verbose mode is enabled (see set_verbose)."""
    return _verbose
def get_logger() -> logging.Logger:
    """Get or create the application logger.

    The first call has side effects: it creates ``SCRIPTS_DIR/logs`` if
    missing and opens a fresh timestamped log file there.  Output goes to
    both the Rich console and that file; both handlers accept DEBUG, so
    filtering is done by the logger's own level (INFO by default, switched
    by set_verbose).
    """
    global _logger
    if _logger is not None:
        return _logger
    _logger = logging.getLogger("sportstime_parser")
    _logger.setLevel(logging.INFO)
    # Prevent propagation to root logger
    _logger.propagate = False
    # Clear any existing handlers (avoids duplicates on repeated setup)
    _logger.handlers.clear()
    # Console handler with Rich formatting
    console_handler = RichHandler(
        console=get_console(),
        show_time=True,
        show_path=False,
        rich_tracebacks=True,
        tracebacks_show_locals=True,
        markup=True,
    )
    console_handler.setLevel(logging.DEBUG)
    console_format = logging.Formatter("%(message)s")
    console_handler.setFormatter(console_format)
    _logger.addHandler(console_handler)
    # File handler for persistent logs; one file per process run
    log_dir = SCRIPTS_DIR / "logs"
    log_dir.mkdir(exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = log_dir / f"parser_{timestamp}.log"
    file_handler = logging.FileHandler(log_file, encoding="utf-8")
    file_handler.setLevel(logging.DEBUG)
    file_format = logging.Formatter(
        "%(asctime)s | %(levelname)-8s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    file_handler.setFormatter(file_format)
    _logger.addHandler(file_handler)
    return _logger
def log_game(
    sport: str,
    game_id: str,
    home: str,
    away: str,
    date: str,
    status: str = "parsed",
) -> None:
    """Emit a per-game debug line; does nothing unless verbose mode is on."""
    if is_verbose():
        get_logger().debug(
            f"[{sport.upper()}] {game_id}: {away} @ {home} ({date}) - {status}"
        )
def log_team(sport: str, team_id: str, name: str, status: str = "resolved") -> None:
    """Emit a per-team debug line; does nothing unless verbose mode is on."""
    if is_verbose():
        get_logger().debug(f"[{sport.upper()}] Team: {name} -> {team_id} ({status})")
def log_stadium(sport: str, stadium_id: str, name: str, status: str = "resolved") -> None:
    """Emit a per-stadium debug line; does nothing unless verbose mode is on."""
    if is_verbose():
        get_logger().debug(f"[{sport.upper()}] Stadium: {name} -> {stadium_id} ({status})")
def log_error(message: str, exc_info: bool = False) -> None:
    """Log *message* at ERROR level, optionally with the active traceback."""
    get_logger().error(message, exc_info=exc_info)
def log_warning(message: str) -> None:
    """Log *message* at WARNING level."""
    get_logger().warning(message)
def log_success(message: str) -> None:
    """Log *message* at INFO level prefixed with a green check mark."""
    get_logger().info(f"[green]✓[/green] {message}")
def log_failure(message: str) -> None:
    """Log *message* at INFO level prefixed with a red cross mark."""
    get_logger().info(f"[red]✗[/red] {message}")

View File

@@ -0,0 +1,360 @@
"""Progress utilities using Rich for visual feedback."""
from contextlib import contextmanager
from typing import Generator, Iterable, Optional, TypeVar
from rich.progress import (
Progress,
SpinnerColumn,
TextColumn,
BarColumn,
TaskProgressColumn,
TimeElapsedColumn,
TimeRemainingColumn,
MofNCompleteColumn,
)
from rich.console import Console
from .logging import get_console
T = TypeVar("T")
def create_progress() -> Progress:
    """Build the standard determinate progress display used across the CLI."""
    columns = (
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description}"),
        BarColumn(bar_width=40),
        TaskProgressColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TimeRemainingColumn(),
    )
    # Non-transient: the finished bar stays visible as a record of the run.
    return Progress(*columns, console=get_console(), transient=False)
def create_spinner_progress() -> Progress:
    """Build a spinner-only display for tasks with an unknown total."""
    columns = (
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description}"),
        TimeElapsedColumn(),
    )
    # Transient: the spinner disappears once the task completes.
    return Progress(*columns, console=get_console(), transient=True)
@contextmanager
def progress_bar(
    description: str,
    total: Optional[int] = None,
) -> Generator[tuple[Progress, int], None, None]:
    """Context manager yielding a live (Progress, task_id) pair.

    Args:
        description: Task description to display
        total: Total number of items; ``None`` selects the indeterminate
            spinner display instead of the full bar

    Yields:
        Tuple of (Progress instance, task_id)

    Example:
        with progress_bar("Scraping games", total=100) as (progress, task):
            for item in items:
                process(item)
                progress.advance(task)
    """
    factory = create_spinner_progress if total is None else create_progress
    progress = factory()
    with progress:
        yield progress, progress.add_task(description, total=total)
def track_progress(
    iterable: Iterable[T],
    description: str,
    total: Optional[int] = None,
) -> Generator[T, None, None]:
    """Yield items from *iterable* while rendering a progress display.

    Args:
        iterable: Items to iterate over
        description: Task description to display
        total: Total number of items (auto-detected when the iterable is
            sized; ``None`` after detection means an indeterminate spinner)

    Yields:
        Items from the iterable

    Example:
        for game in track_progress(games, "Processing games"):
            process(game)
    """
    # Sized iterables give us a real total without the caller passing one.
    if total is None:
        try:
            total = len(iterable)  # type: ignore
        except TypeError:
            pass
    # progress_bar picks spinner vs. full bar based on whether total is known.
    with progress_bar(description, total) as (progress, task_id):
        for item in iterable:
            yield item
            progress.advance(task_id)
class ProgressTracker:
    """Track progress across multiple phases with nested tasks.

    Example:
        tracker = ProgressTracker()
        tracker.start("Scraping NBA")
        with tracker.task("Fetching schedule", total=12) as advance:
            for month in months:
                fetch(month)
                advance()
        with tracker.task("Parsing games", total=1230) as advance:
            for game in games:
                parse(game)
                advance()
        tracker.finish("Completed NBA scrape")
    """

    def __init__(self):
        """Initialize the progress tracker."""
        self._console = get_console()
        # Active progress display and task, set only while inside task().
        self._current_progress: Optional[Progress] = None
        self._current_task: Optional[int] = None

    def start(self, message: str) -> None:
        """Start a new tracking session with a message."""
        self._console.print(f"\n[bold cyan]>>> {message}[/bold cyan]")

    def finish(self, message: str) -> None:
        """Finish the tracking session with a message."""
        self._console.print(f"[bold green]<<< {message}[/bold green]\n")

    @contextmanager
    def task(
        self,
        description: str,
        total: Optional[int] = None,
    ) -> Generator[callable, None, None]:
        """Context manager for a tracked task.

        Args:
            description: Task description
            total: Total items (None for indeterminate)

        Yields:
            Callable to advance the progress

        Example:
            with tracker.task("Processing", total=100) as advance:
                for item in items:
                    process(item)
                    advance()
        """
        with progress_bar(description, total) as (progress, task_id):
            self._current_progress = progress
            self._current_task = task_id

            def advance(amount: int = 1) -> None:
                progress.advance(task_id, advance=amount)

            try:
                yield advance
            finally:
                # BUGFIX: reset even when the body raises, otherwise log()
                # keeps routing output through a finished progress display.
                self._current_progress = None
                self._current_task = None

    def log(self, message: str) -> None:
        """Log a message (will be displayed above progress bar if active)."""
        if self._current_progress:
            self._current_progress.console.print(f"  {message}")
        else:
            self._console.print(f"  {message}")
class ScrapeProgress:
    """Specialized progress tracker for scraping operations.

    Tracks counts of games, teams, stadiums scraped and provides
    formatted status updates.
    """

    def __init__(self, sport: str, season: int):
        """Initialize scrape progress for a sport.

        Args:
            sport: Sport code (e.g., 'nba')
            season: Season start year
        """
        self.sport = sport
        self.season = season
        self.games_count = 0
        self.teams_count = 0
        self.stadiums_count = 0
        self.errors_count = 0
        self._tracker = ProgressTracker()

    def start(self) -> None:
        """Start the scraping session."""
        header = f"Scraping {self.sport.upper()} {self.season}-{self.season + 1}"
        self._tracker.start(header)

    def finish(self) -> None:
        """Finish the scraping session with summary."""
        summary = (
            f"Scraped {self.games_count} games, "
            f"{self.teams_count} teams, "
            f"{self.stadiums_count} stadiums"
        )
        if self.errors_count > 0:
            summary = summary + f" ({self.errors_count} errors)"
        self._tracker.finish(summary)

    @contextmanager
    def _counting_task(
        self,
        description: str,
        counter: str,
        total: Optional[int],
    ) -> Generator[callable, None, None]:
        """Run a tracked task whose advance callback also bumps ``self.<counter>``."""
        with self._tracker.task(description, total=total) as step:

            def advance_and_count(amount: int = 1) -> None:
                setattr(self, counter, getattr(self, counter) + amount)
                step(amount)

            yield advance_and_count

    @contextmanager
    def scraping_schedule(
        self,
        total_months: Optional[int] = None,
    ) -> Generator[callable, None, None]:
        """Track schedule scraping progress."""
        description = f"Fetching {self.sport.upper()} schedule"
        with self._tracker.task(description, total=total_months) as advance:
            yield advance

    @contextmanager
    def parsing_games(
        self,
        total_games: Optional[int] = None,
    ) -> Generator[callable, None, None]:
        """Track game parsing progress."""
        with self._counting_task("Parsing games", "games_count", total_games) as advance:
            yield advance

    @contextmanager
    def resolving_teams(
        self,
        total_teams: Optional[int] = None,
    ) -> Generator[callable, None, None]:
        """Track team resolution progress."""
        with self._counting_task("Resolving teams", "teams_count", total_teams) as advance:
            yield advance

    @contextmanager
    def resolving_stadiums(
        self,
        total_stadiums: Optional[int] = None,
    ) -> Generator[callable, None, None]:
        """Track stadium resolution progress."""
        with self._counting_task(
            "Resolving stadiums", "stadiums_count", total_stadiums
        ) as advance:
            yield advance

    def log_error(self, message: str) -> None:
        """Log an error during scraping."""
        self.errors_count += 1
        self._tracker.log(f"[red]Error: {message}[/red]")

    def log_warning(self, message: str) -> None:
        """Log a warning during scraping."""
        self._tracker.log(f"[yellow]Warning: {message}[/yellow]")

    def log_info(self, message: str) -> None:
        """Log an info message during scraping."""
        self._tracker.log(message)
class SimpleProgressBar:
    """Simple progress bar wrapper for batch operations.

    Example:
        with create_progress_bar(total=100, description="Uploading") as progress:
            for item in items:
                upload(item)
                progress.advance()
    """

    def __init__(self, progress: Progress, task_id: int):
        # Hold the live display and the single task this wrapper drives.
        self._bar = progress
        self._task = task_id

    def advance(self, amount: int = 1) -> None:
        """Advance the progress bar by *amount* steps."""
        self._bar.advance(self._task, advance=amount)

    def update(self, completed: int) -> None:
        """Set the progress to a specific completed value."""
        self._bar.update(self._task, completed=completed)
@contextmanager
def create_progress_bar(
    total: int,
    description: str = "Progress",
) -> Generator[SimpleProgressBar, None, None]:
    """Create a simple progress bar for batch operations.

    Args:
        total: Total number of items
        description: Task description

    Yields:
        SimpleProgressBar with advance() and update() methods

    Example:
        with create_progress_bar(total=100, description="Uploading") as progress:
            for item in items:
                upload(item)
                progress.advance()
    """
    display = create_progress()
    with display:
        handle = display.add_task(description, total=total)
        yield SimpleProgressBar(display, handle)