Replace monolithic scraping scripts with sportstime_parser package: - Multi-source scrapers with automatic fallback for 7 sports - Canonical ID generation for games, teams, and stadiums - Fuzzy matching with configurable thresholds for name resolution - CloudKit Web Services uploader with JWT auth, diff-based updates - Resumable uploads with checkpoint state persistence - Validation reports with manual review items and suggested matches - Comprehensive test suite (249 tests) CLI: sportstime-parser scrape|validate|upload|status|retry|clear Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
361 lines
10 KiB
Python
361 lines
10 KiB
Python
"""Progress utilities using Rich for visual feedback."""
|
|
|
|
from contextlib import contextmanager
|
|
from typing import Generator, Iterable, Optional, TypeVar
|
|
|
|
from rich.progress import (
|
|
Progress,
|
|
SpinnerColumn,
|
|
TextColumn,
|
|
BarColumn,
|
|
TaskProgressColumn,
|
|
TimeElapsedColumn,
|
|
TimeRemainingColumn,
|
|
MofNCompleteColumn,
|
|
)
|
|
from rich.console import Console
|
|
|
|
from .logging import get_console
|
|
|
|
T = TypeVar("T")
|
|
|
|
|
|
def create_progress() -> Progress:
    """Build the standard determinate progress display.

    The display is assembled from these columns, left to right: spinner,
    bold-blue task description, a 40-wide bar, task progress, "M of N"
    completed count, elapsed time and remaining time.

    Returns:
        A ``Progress`` bound to the console from ``get_console()`` and
        created with ``transient=False``.
    """
    columns = (
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description}"),
        BarColumn(bar_width=40),
        TaskProgressColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TimeRemainingColumn(),
    )
    return Progress(*columns, console=get_console(), transient=False)
|
|
|
|
|
|
def create_spinner_progress() -> Progress:
    """Build a spinner-only progress display for indeterminate tasks.

    Only a spinner, the bold-blue task description and the elapsed time are
    shown; there is no bar because the total amount of work is unknown.

    Returns:
        A ``Progress`` bound to the console from ``get_console()`` and
        created with ``transient=True``.
    """
    columns = (
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description}"),
        TimeElapsedColumn(),
    )
    return Progress(*columns, console=get_console(), transient=True)
|
|
|
|
|
|
@contextmanager
def progress_bar(
    description: str,
    total: Optional[int] = None,
) -> Generator[tuple[Progress, int], None, None]:
    """Run a block of code under a single-task progress display.

    A spinner-only display is used when ``total`` is ``None``; otherwise the
    full bar from ``create_progress()`` is used.

    Args:
        description: Task description to display.
        total: Total number of items (``None`` for indeterminate).

    Yields:
        Tuple of ``(Progress instance, task_id)``.

    Example:
        with progress_bar("Scraping games", total=100) as (progress, task):
            for item in items:
                process(item)
                progress.advance(task)
    """
    # Pick the factory first, then build exactly one display.
    make_display = create_spinner_progress if total is None else create_progress
    display = make_display()

    with display:
        handle = display.add_task(description, total=total)
        yield display, handle
|
|
|
|
|
|
def track_progress(
    iterable: Iterable[T],
    description: str,
    total: Optional[int] = None,
) -> Generator[T, None, None]:
    """Yield every item of ``iterable`` while displaying progress.

    Args:
        iterable: Items to iterate over.
        description: Task description to display.
        total: Total number of items. When omitted, ``len(iterable)`` is
            tried; if the iterable has no length a spinner is shown instead
            of a bar.

    Yields:
        Items from the iterable, unchanged and in order. Progress is
        advanced by one after the consumer asks for the next item.

    Example:
        for game in track_progress(games, "Processing games"):
            process(game)
    """
    if total is None:
        # Best effort: generators and other unsized iterables raise
        # TypeError here, in which case the total simply stays unknown.
        try:
            total = len(iterable)  # type: ignore
        except TypeError:
            pass

    if total is not None:
        # Determinate: full progress bar.
        display = create_progress()
        with display:
            handle = display.add_task(description, total=total)
            for element in iterable:
                yield element
                display.advance(handle)
        return

    # Indeterminate: spinner only.
    display = create_spinner_progress()
    with display:
        handle = display.add_task(description, total=None)
        for element in iterable:
            yield element
            display.update(handle, advance=1)
|
|
|
|
|
|
class ProgressTracker:
    """Track progress across multiple phases, one task at a time.

    The tracker prints a banner when a session starts and finishes, and in
    between runs individual tasks under a progress display. While a task is
    active, ``log()`` prints through that display's console.

    Example:
        tracker = ProgressTracker()
        tracker.start("Scraping NBA")

        with tracker.task("Fetching schedule", total=12) as advance:
            for month in months:
                fetch(month)
                advance()

        with tracker.task("Parsing games", total=1230) as advance:
            for game in games:
                parse(game)
                advance()

        tracker.finish("Completed NBA scrape")
    """

    def __init__(self):
        """Initialize the progress tracker with no active task."""
        self._console = get_console()
        # Display and task id of the task currently running, if any.
        self._current_progress: Optional[Progress] = None
        self._current_task: Optional[int] = None

    def start(self, message: str) -> None:
        """Start a new tracking session with a message."""
        self._console.print(f"\n[bold cyan]>>> {message}[/bold cyan]")

    def finish(self, message: str) -> None:
        """Finish the tracking session with a message."""
        self._console.print(f"[bold green]<<< {message}[/bold green]\n")

    @contextmanager
    def task(
        self,
        description: str,
        total: Optional[int] = None,
    ) -> Generator[callable, None, None]:
        """Context manager for a tracked task.

        While the block runs, this task is recorded as the tracker's current
        one. On exit -- including when the block raises -- whatever was
        current before (``None`` when nothing was) is restored, so the
        tracker never keeps pointing at a display that has already stopped.

        NOTE(review): whether two progress displays may be live at the same
        time depends on the Rich version in use -- confirm before nesting
        ``task()`` calls.

        Args:
            description: Task description.
            total: Total items (``None`` for indeterminate).

        Yields:
            Callable ``advance(amount=1)`` that advances the progress.

        Example:
            with tracker.task("Processing", total=100) as advance:
                for item in items:
                    process(item)
                    advance()
        """
        with progress_bar(description, total) as (progress, task_id):
            previous = (self._current_progress, self._current_task)
            self._current_progress = progress
            self._current_task = task_id

            def advance(amount: int = 1) -> None:
                progress.advance(task_id, advance=amount)

            try:
                yield advance
            finally:
                # An exception raised in the caller's block is re-raised at
                # the yield above; the finally guarantees the state reset
                # happens on that path too.
                self._current_progress, self._current_task = previous

    def log(self, message: str) -> None:
        """Log a message through the active task's console, if any.

        Args:
            message: Text to print; it is passed to the console's
                ``print`` with a leading space.
        """
        if self._current_progress is not None:
            self._current_progress.console.print(f" {message}")
        else:
            self._console.print(f" {message}")
|
|
|
|
|
|
class ScrapeProgress:
    """Progress reporting for one sport/season scrape.

    Keeps running counts of games, teams, stadiums and errors, and delegates
    all display work to an internal ``ProgressTracker``.
    """

    def __init__(self, sport: str, season: int):
        """Set up counters and the underlying tracker.

        Args:
            sport: Sport code (e.g., 'nba').
            season: Season start year.
        """
        self.sport = sport
        self.season = season
        self.games_count = 0
        self.teams_count = 0
        self.stadiums_count = 0
        self.errors_count = 0
        self._tracker = ProgressTracker()

    def start(self) -> None:
        """Announce the start of the scraping session."""
        label = f"{self.sport.upper()} {self.season}-{self.season + 1}"
        self._tracker.start(f"Scraping {label}")

    def finish(self) -> None:
        """Announce the end of the session with a summary of the counts."""
        parts = [
            f"Scraped {self.games_count} games",
            f"{self.teams_count} teams",
            f"{self.stadiums_count} stadiums",
        ]
        summary = ", ".join(parts)
        if self.errors_count > 0:
            # Errors are only mentioned when at least one was logged.
            summary = f"{summary} ({self.errors_count} errors)"
        self._tracker.finish(summary)

    @contextmanager
    def _counted_task(
        self,
        description: str,
        total: Optional[int],
        counter_name: str,
    ) -> Generator[callable, None, None]:
        """Run a tracker task whose advance also bumps one of the counters.

        Args:
            description: Task description shown by the tracker.
            total: Total items (``None`` for indeterminate).
            counter_name: Name of the ``*_count`` attribute to increase.

        Yields:
            Callable ``advance_and_count(amount=1)``.
        """
        with self._tracker.task(description, total=total) as advance:

            def advance_and_count(amount: int = 1) -> None:
                # Count first, then move the bar.
                setattr(self, counter_name, getattr(self, counter_name) + amount)
                advance(amount)

            yield advance_and_count

    @contextmanager
    def scraping_schedule(
        self,
        total_months: Optional[int] = None,
    ) -> Generator[callable, None, None]:
        """Track schedule fetching; the yielded callable only advances."""
        description = f"Fetching {self.sport.upper()} schedule"
        with self._tracker.task(description, total=total_months) as advance:
            yield advance

    @contextmanager
    def parsing_games(
        self,
        total_games: Optional[int] = None,
    ) -> Generator[callable, None, None]:
        """Track game parsing; each advance also adds to ``games_count``."""
        with self._counted_task("Parsing games", total_games, "games_count") as step:
            yield step

    @contextmanager
    def resolving_teams(
        self,
        total_teams: Optional[int] = None,
    ) -> Generator[callable, None, None]:
        """Track team resolution; each advance also adds to ``teams_count``."""
        with self._counted_task("Resolving teams", total_teams, "teams_count") as step:
            yield step

    @contextmanager
    def resolving_stadiums(
        self,
        total_stadiums: Optional[int] = None,
    ) -> Generator[callable, None, None]:
        """Track stadium resolution; each advance also adds to ``stadiums_count``."""
        with self._counted_task(
            "Resolving stadiums", total_stadiums, "stadiums_count"
        ) as step:
            yield step

    def log_error(self, message: str) -> None:
        """Count an error and print it in red."""
        self.errors_count += 1
        self._tracker.log(f"[red]Error: {message}[/red]")

    def log_warning(self, message: str) -> None:
        """Print a warning in yellow (not counted)."""
        self._tracker.log(f"[yellow]Warning: {message}[/yellow]")

    def log_info(self, message: str) -> None:
        """Print an informational message as-is."""
        self._tracker.log(message)
|
|
|
|
|
|
class SimpleProgressBar:
    """Handle for one task on a progress display, used for batch operations.

    Callers get ``advance()`` and ``update()`` without having to carry the
    display and the task id around themselves.

    Example:
        with create_progress_bar(total=100, description="Uploading") as progress:
            for item in items:
                upload(item)
                progress.advance()
    """

    def __init__(self, progress: Progress, task_id: int):
        """Bind the wrapper to a display and one of its tasks.

        Args:
            progress: Display that owns the task.
            task_id: Identifier of the task on that display.
        """
        self._progress, self._task_id = progress, task_id

    def advance(self, amount: int = 1) -> None:
        """Move the bar forward by ``amount`` steps (one by default)."""
        display, handle = self._progress, self._task_id
        display.advance(handle, advance=amount)

    def update(self, completed: int) -> None:
        """Set the completed count to the absolute value ``completed``."""
        display, handle = self._progress, self._task_id
        display.update(handle, completed=completed)
|
|
|
|
|
|
@contextmanager
def create_progress_bar(
    total: int,
    description: str = "Progress",
) -> Generator[SimpleProgressBar, None, None]:
    """Run a block of code under a simple, determinate progress bar.

    Args:
        total: Total number of items.
        description: Task description.

    Yields:
        SimpleProgressBar with ``advance()`` and ``update()`` methods, bound
        to a single task on a display from ``create_progress()``.

    Example:
        with create_progress_bar(total=100, description="Uploading") as progress:
            for item in items:
                upload(item)
                progress.advance()
    """
    display = create_progress()
    with display:
        # The task is registered only once the display is active.
        yield SimpleProgressBar(display, display.add_task(description, total=total))
|