feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit. Includes: - Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL - Canonical ID system for teams, stadiums, and games - Fuzzy matching with manual alias support - CloudKit uploader with batch operations and deduplication - Comprehensive test suite with fixtures - WNBA abbreviation aliases for improved team resolution - Alias validation script to detect orphan references All 5 phases of data remediation plan completed: - Phase 1: Alias fixes (team/stadium alias additions) - Phase 2: NHL stadium coordinate fixes - Phase 3: Re-scrape validation - Phase 4: iOS bundle update - Phase 5: Code quality improvements (WNBA aliases) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
360
sportstime_parser/utils/progress.py
Normal file
360
sportstime_parser/utils/progress.py
Normal file
@@ -0,0 +1,360 @@
|
||||
"""Progress utilities using Rich for visual feedback."""
|
||||
|
||||
from contextlib import contextmanager
from typing import Callable, Generator, Iterable, Optional, TypeVar

from rich.console import Console
from rich.progress import (
    BarColumn,
    MofNCompleteColumn,
    Progress,
    SpinnerColumn,
    TaskProgressColumn,
    TextColumn,
    TimeElapsedColumn,
    TimeRemainingColumn,
)

from .logging import get_console
|
||||
|
||||
# Item type carried through track_progress unchanged (Iterable[T] in, T out).
T = TypeVar("T")
|
||||
|
||||
|
||||
def create_progress() -> Progress:
    """Build the standard determinate progress display.

    Returns:
        A non-transient Progress bound to the console from get_console().
        Its columns, in order: SpinnerColumn, a bold-blue task description,
        a 40-wide BarColumn, TaskProgressColumn, MofNCompleteColumn,
        TimeElapsedColumn and TimeRemainingColumn.
    """
    columns = (
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description}"),
        BarColumn(bar_width=40),
        TaskProgressColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TimeRemainingColumn(),
    )
    return Progress(*columns, console=get_console(), transient=False)
|
||||
|
||||
|
||||
def create_spinner_progress() -> Progress:
    """Build a spinner-only progress display for work of unknown length.

    Returns:
        A transient Progress bound to the console from get_console(), made
        of a SpinnerColumn, a bold-blue task description and a
        TimeElapsedColumn (no bar, since there is no total to measure
        against).
    """
    columns = (
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description}"),
        TimeElapsedColumn(),
    )
    return Progress(*columns, console=get_console(), transient=True)
|
||||
|
||||
|
||||
@contextmanager
def progress_bar(
    description: str,
    total: Optional[int] = None,
) -> Generator[tuple[Progress, int], None, None]:
    """Run a single-task progress display for the duration of a ``with`` block.

    Args:
        description: Label shown next to the task.
        total: Number of expected steps. ``None`` selects the spinner-only
            display from create_spinner_progress(); any other value selects
            the full bar from create_progress().

    Yields:
        A ``(progress, task_id)`` pair; call ``progress.advance(task_id)``
        to move the task forward.

    Example:
        with progress_bar("Scraping games", total=100) as (progress, task):
            for item in items:
                process(item)
                progress.advance(task)
    """
    # Pick the factory first so the display is created exactly once.
    make_display = create_spinner_progress if total is None else create_progress

    # Progress.__enter__ starts the display and returns the same object.
    with make_display() as progress:
        task_id = progress.add_task(description, total=total)
        yield progress, task_id
|
||||
|
||||
|
||||
def track_progress(
    iterable: Iterable[T],
    description: str,
    total: Optional[int] = None,
) -> Generator[T, None, None]:
    """Yield every item of *iterable* while driving a progress display.

    Args:
        iterable: Source of items; consumed lazily.
        description: Label shown next to the task.
        total: Expected item count. When omitted, ``len(iterable)`` is used
            if the iterable supports it; otherwise a spinner-only display
            is shown.

    Yields:
        The items of *iterable*, in order. The task is advanced by one
        after the consumer has finished with each item (i.e. when the
        generator is resumed).

    Example:
        for game in track_progress(games, "Processing games"):
            process(game)
    """
    if total is None:
        try:
            total = len(iterable)  # type: ignore
        except TypeError:
            # Unsized iterable (e.g. a generator): stay indeterminate.
            total = None

    indeterminate = total is None
    display = create_spinner_progress() if indeterminate else create_progress()

    with display:
        task_id = display.add_task(description, total=total)
        for element in iterable:
            yield element
            if indeterminate:
                display.update(task_id, advance=1)
            else:
                display.advance(task_id)
|
||||
|
||||
|
||||
class ProgressTracker:
    """Track progress across multiple phases, one task at a time.

    Example:
        tracker = ProgressTracker()
        tracker.start("Scraping NBA")

        with tracker.task("Fetching schedule", total=12) as advance:
            for month in months:
                fetch(month)
                advance()

        with tracker.task("Parsing games", total=1230) as advance:
            for game in games:
                parse(game)
                advance()

        tracker.finish("Completed NBA scrape")
    """

    def __init__(self) -> None:
        """Initialize the tracker with the console from get_console()."""
        self._console = get_console()
        # Set only while a task() block is running; log() routes through
        # the active display's console when present.
        self._current_progress: Optional[Progress] = None
        self._current_task: Optional[int] = None

    def start(self, message: str) -> None:
        """Print a session-opening banner containing *message*."""
        self._console.print(f"\n[bold cyan]>>> {message}[/bold cyan]")

    def finish(self, message: str) -> None:
        """Print a session-closing banner containing *message*."""
        self._console.print(f"[bold green]<<< {message}[/bold green]\n")

    @contextmanager
    def task(
        self,
        description: str,
        total: Optional[int] = None,
    ) -> Generator[Callable[..., None], None, None]:
        """Context manager for a tracked task.

        Args:
            description: Task description
            total: Total items (None for indeterminate)

        Yields:
            ``advance(amount=1)``: a callable that moves the task forward
            by *amount* steps.

        Example:
            with tracker.task("Processing", total=100) as advance:
                for item in items:
                    process(item)
                    advance()
        """
        with progress_bar(description, total) as (progress, task_id):
            self._current_progress = progress
            self._current_task = task_id

            def advance(amount: int = 1) -> None:
                progress.advance(task_id, advance=amount)

            try:
                yield advance
            finally:
                # Always clear the active-task state, even when the body
                # raises; otherwise later log() calls would keep printing
                # through a display that has already been stopped.
                self._current_progress = None
                self._current_task = None

    def log(self, message: str) -> None:
        """Print *message*, via the active task's console when one is running."""
        if self._current_progress is not None:
            self._current_progress.console.print(f" {message}")
        else:
            self._console.print(f" {message}")
|
||||
|
||||
|
||||
class ScrapeProgress:
    """Specialized progress tracker for scraping operations.

    Tracks counts of games, teams, stadiums scraped and provides
    formatted status updates.
    """

    def __init__(self, sport: str, season: int) -> None:
        """Initialize scrape progress for a sport.

        Args:
            sport: Sport code (e.g., 'nba')
            season: Season start year
        """
        self.sport = sport
        self.season = season
        self.games_count = 0
        self.teams_count = 0
        self.stadiums_count = 0
        self.errors_count = 0
        self._tracker = ProgressTracker()

    def start(self) -> None:
        """Start the scraping session."""
        self._tracker.start(
            f"Scraping {self.sport.upper()} {self.season}-{self.season + 1}"
        )

    def finish(self) -> None:
        """Finish the scraping session with a summary of the counters.

        The error count is appended only when at least one error was logged.
        """
        summary = (
            f"Scraped {self.games_count} games, "
            f"{self.teams_count} teams, "
            f"{self.stadiums_count} stadiums"
        )
        if self.errors_count > 0:
            summary += f" ({self.errors_count} errors)"
        self._tracker.finish(summary)

    @contextmanager
    def _counted_task(
        self,
        description: str,
        total: Optional[int],
        counter_attr: str,
    ) -> Generator[Callable[..., None], None, None]:
        """Run a tracked task whose advance callable also bumps a counter.

        Args:
            description: Task description passed to the tracker.
            total: Total items (None for indeterminate).
            counter_attr: Name of the integer attribute on ``self`` to
                increment by the advanced amount.

        Yields:
            ``advance(amount=1)``: adds *amount* to the counter, then
            advances the underlying task by the same amount.
        """
        with self._tracker.task(description, total=total) as advance:

            def advance_and_count(amount: int = 1) -> None:
                setattr(self, counter_attr, getattr(self, counter_attr) + amount)
                advance(amount)

            yield advance_and_count

    @contextmanager
    def scraping_schedule(
        self,
        total_months: Optional[int] = None,
    ) -> Generator[Callable[..., None], None, None]:
        """Track schedule scraping progress (no counter is updated)."""
        with self._tracker.task(
            f"Fetching {self.sport.upper()} schedule",
            total=total_months,
        ) as advance:
            yield advance

    @contextmanager
    def parsing_games(
        self,
        total_games: Optional[int] = None,
    ) -> Generator[Callable[..., None], None, None]:
        """Track game parsing progress; each advance adds to games_count."""
        with self._counted_task(
            "Parsing games", total_games, "games_count"
        ) as advance:
            yield advance

    @contextmanager
    def resolving_teams(
        self,
        total_teams: Optional[int] = None,
    ) -> Generator[Callable[..., None], None, None]:
        """Track team resolution progress; each advance adds to teams_count."""
        with self._counted_task(
            "Resolving teams", total_teams, "teams_count"
        ) as advance:
            yield advance

    @contextmanager
    def resolving_stadiums(
        self,
        total_stadiums: Optional[int] = None,
    ) -> Generator[Callable[..., None], None, None]:
        """Track stadium resolution progress; each advance adds to stadiums_count."""
        with self._counted_task(
            "Resolving stadiums", total_stadiums, "stadiums_count"
        ) as advance:
            yield advance

    def log_error(self, message: str) -> None:
        """Log an error during scraping and increment errors_count."""
        self.errors_count += 1
        self._tracker.log(f"[red]Error: {message}[/red]")

    def log_warning(self, message: str) -> None:
        """Log a warning during scraping."""
        self._tracker.log(f"[yellow]Warning: {message}[/yellow]")

    def log_info(self, message: str) -> None:
        """Log an info message during scraping."""
        self._tracker.log(message)
|
||||
|
||||
|
||||
class SimpleProgressBar:
    """Handle pairing a Progress display with one of its task ids.

    Lets batch code move a single task forward without carrying the
    (progress, task_id) pair around.

    Example:
        with create_progress_bar(total=100, description="Uploading") as progress:
            for item in items:
                upload(item)
                progress.advance()
    """

    def __init__(self, progress: Progress, task_id: int):
        """Remember the display and the id of the task this handle drives."""
        self._progress, self._task_id = progress, task_id

    def advance(self, amount: int = 1) -> None:
        """Move the task forward by *amount* steps (default 1)."""
        display = self._progress
        display.advance(self._task_id, advance=amount)

    def update(self, completed: int) -> None:
        """Set the task's completed count to exactly *completed*."""
        display = self._progress
        display.update(self._task_id, completed=completed)
|
||||
|
||||
|
||||
@contextmanager
def create_progress_bar(
    total: int,
    description: str = "Progress",
) -> Generator[SimpleProgressBar, None, None]:
    """Show a determinate progress bar for the duration of a ``with`` block.

    Args:
        total: Total number of items
        description: Task description

    Yields:
        A SimpleProgressBar whose advance() and update() methods drive the
        single task registered on the display.

    Example:
        with create_progress_bar(total=100, description="Uploading") as progress:
            for item in items:
                upload(item)
                progress.advance()
    """
    # Progress.__enter__ starts the display and returns the same object.
    with create_progress() as display:
        handle = SimpleProgressBar(
            display,
            display.add_task(description, total=total),
        )
        yield handle
|
||||
Reference in New Issue
Block a user