feat(scripts): rewrite parser as modular Python CLI
Replace monolithic scraping scripts with sportstime_parser package: - Multi-source scrapers with automatic fallback for 7 sports - Canonical ID generation for games, teams, and stadiums - Fuzzy matching with configurable thresholds for name resolution - CloudKit Web Services uploader with JWT auth, diff-based updates - Resumable uploads with checkpoint state persistence - Validation reports with manual review items and suggested matches - Comprehensive test suite (249 tests) CLI: sportstime-parser scrape|validate|upload|status|retry|clear Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
58
Scripts/sportstime_parser/utils/__init__.py
Normal file
58
Scripts/sportstime_parser/utils/__init__.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""Utility modules for sportstime-parser."""
|
||||
|
||||
from .logging import (
|
||||
get_console,
|
||||
get_logger,
|
||||
is_verbose,
|
||||
log_error,
|
||||
log_failure,
|
||||
log_game,
|
||||
log_stadium,
|
||||
log_success,
|
||||
log_team,
|
||||
log_warning,
|
||||
set_verbose,
|
||||
)
|
||||
from .http import (
|
||||
RateLimitedSession,
|
||||
get_session,
|
||||
fetch_url,
|
||||
fetch_json,
|
||||
fetch_html,
|
||||
)
|
||||
from .progress import (
|
||||
create_progress,
|
||||
create_spinner_progress,
|
||||
progress_bar,
|
||||
track_progress,
|
||||
ProgressTracker,
|
||||
ScrapeProgress,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Logging
|
||||
"get_console",
|
||||
"get_logger",
|
||||
"is_verbose",
|
||||
"log_error",
|
||||
"log_failure",
|
||||
"log_game",
|
||||
"log_stadium",
|
||||
"log_success",
|
||||
"log_team",
|
||||
"log_warning",
|
||||
"set_verbose",
|
||||
# HTTP
|
||||
"RateLimitedSession",
|
||||
"get_session",
|
||||
"fetch_url",
|
||||
"fetch_json",
|
||||
"fetch_html",
|
||||
# Progress
|
||||
"create_progress",
|
||||
"create_spinner_progress",
|
||||
"progress_bar",
|
||||
"track_progress",
|
||||
"ProgressTracker",
|
||||
"ScrapeProgress",
|
||||
]
|
||||
276
Scripts/sportstime_parser/utils/http.py
Normal file
276
Scripts/sportstime_parser/utils/http.py
Normal file
@@ -0,0 +1,276 @@
|
||||
"""HTTP utilities with rate limiting and exponential backoff."""
|
||||
|
||||
import random
|
||||
import time
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from ..config import (
|
||||
DEFAULT_REQUEST_DELAY,
|
||||
MAX_RETRIES,
|
||||
BACKOFF_FACTOR,
|
||||
INITIAL_BACKOFF,
|
||||
)
|
||||
from .logging import get_logger, log_warning
|
||||
|
||||
|
||||
# Pool of realistic desktop browser user agents; a random one is attached to
# every request to reduce the chance of being blocked by scrape-hostile hosts.
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
]
|
||||
|
||||
|
||||
class RateLimitedSession:
    """HTTP session with rate limiting and exponential backoff.

    Features:
    - Configurable delay between requests
    - Automatic 429 detection with exponential backoff
    - Per-domain delay escalation after a 429 response
    - User-agent rotation
    - Connection pooling
    - Automatic retries for transient errors
    """

    def __init__(
        self,
        delay: float = DEFAULT_REQUEST_DELAY,
        max_retries: int = MAX_RETRIES,
        backoff_factor: float = BACKOFF_FACTOR,
        initial_backoff: float = INITIAL_BACKOFF,
    ):
        """Initialize the rate-limited session.

        Args:
            delay: Minimum delay between requests in seconds
            max_retries: Maximum number of retry attempts
            backoff_factor: Multiplier for exponential backoff
            initial_backoff: Initial backoff duration in seconds
        """
        self.delay = delay
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor
        self.initial_backoff = initial_backoff
        # Wall-clock timestamp of the most recent request (shared across
        # domains); 0.0 means "no request made yet".
        self.last_request_time: float = 0.0
        # Extra per-domain delays, populated only after a 429 from that host.
        self._domain_delays: dict[str, float] = {}

        # Create session with retry adapter
        self.session = requests.Session()

        # Configure automatic transport-level retries for connection errors
        # and transient 5xx responses. Note 429 is deliberately NOT in
        # status_forcelist: it is handled manually in get() so we can apply
        # our own exponential backoff and domain-delay bookkeeping.
        retry_strategy = Retry(
            total=max_retries,
            backoff_factor=0.5,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["GET", "HEAD"],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy, pool_maxsize=10)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

        self._logger = get_logger()

    def _get_user_agent(self) -> str:
        """Get a random user agent from the rotation pool."""
        return random.choice(USER_AGENTS)

    def _get_domain(self, url: str) -> str:
        """Extract the network location (host[:port]) from a URL."""
        parsed = urlparse(url)
        return parsed.netloc

    def _wait_for_rate_limit(self, url: str) -> None:
        """Sleep as needed so the next request respects the rate limit.

        Uses the larger of the global delay and any escalated delay recorded
        for this URL's domain after a previous 429.
        """
        domain = self._get_domain(url)

        # Get domain-specific delay (if 429 was received)
        domain_delay = self._domain_delays.get(domain, 0.0)
        effective_delay = max(self.delay, domain_delay)

        elapsed = time.time() - self.last_request_time
        if elapsed < effective_delay:
            sleep_time = effective_delay - elapsed
            self._logger.debug(f"Rate limiting: sleeping {sleep_time:.2f}s")
            time.sleep(sleep_time)

    def _handle_429(self, url: str, attempt: int) -> float:
        """Handle 429 Too Many Requests with exponential backoff.

        Also escalates the persistent per-domain delay so subsequent requests
        to the same host are spaced out further.

        Returns the backoff duration in seconds.
        """
        domain = self._get_domain(url)
        backoff = self.initial_backoff * (self.backoff_factor ** attempt)

        # Add jitter to prevent thundering herd
        backoff += random.uniform(0, 1)

        # Update domain-specific delay
        self._domain_delays[domain] = min(backoff * 2, 60.0)  # Cap at 60s

        log_warning(f"Rate limited (429) for {domain}, backing off {backoff:.1f}s")

        return backoff

    def get(
        self,
        url: str,
        headers: Optional[dict] = None,
        params: Optional[dict] = None,
        timeout: float = 30.0,
    ) -> requests.Response:
        """Make a rate-limited GET request with automatic retries.

        Args:
            url: URL to fetch
            headers: Additional headers to include
            params: Query parameters
            timeout: Request timeout in seconds

        Returns:
            Response object

        Raises:
            requests.RequestException: If all retries fail
        """
        # Prepare headers with user agent; caller-supplied headers win on
        # key collisions (including User-Agent).
        request_headers = {"User-Agent": self._get_user_agent()}
        if headers:
            request_headers.update(headers)

        last_exception: Optional[Exception] = None

        # max_retries + 1 total attempts (the first attempt plus retries).
        for attempt in range(self.max_retries + 1):
            try:
                # Wait for rate limit
                self._wait_for_rate_limit(url)

                # Stamp the send time before the request so the next
                # rate-limit wait is measured from request start.
                self.last_request_time = time.time()
                response = self.session.get(
                    url,
                    headers=request_headers,
                    params=params,
                    timeout=timeout,
                )

                # Handle 429 manually: back off and retry; on the final
                # attempt raise_for_status() raises HTTPError, which the
                # except clause below re-raises.
                if response.status_code == 429:
                    if attempt < self.max_retries:
                        backoff = self._handle_429(url, attempt)
                        time.sleep(backoff)
                        continue
                    else:
                        response.raise_for_status()

                # Return successful response
                return response

            except requests.RequestException as e:
                last_exception = e
                if attempt < self.max_retries:
                    backoff = self.initial_backoff * (self.backoff_factor ** attempt)
                    self._logger.warning(
                        f"Request failed (attempt {attempt + 1}): {e}, retrying in {backoff:.1f}s"
                    )
                    time.sleep(backoff)
                else:
                    raise

        # Should not reach here, but just in case
        if last_exception:
            raise last_exception

        raise requests.RequestException("Max retries exceeded")

    def get_json(
        self,
        url: str,
        headers: Optional[dict] = None,
        params: Optional[dict] = None,
        timeout: float = 30.0,
    ) -> dict:
        """Make a rate-limited GET request and parse JSON response.

        Args:
            url: URL to fetch
            headers: Additional headers to include
            params: Query parameters
            timeout: Request timeout in seconds

        Returns:
            Parsed JSON as dictionary

        Raises:
            requests.RequestException: If request fails
            ValueError: If response is not valid JSON
        """
        response = self.get(url, headers=headers, params=params, timeout=timeout)
        # get() retries transient failures; any remaining non-2xx here
        # (e.g. 404) should surface as an HTTPError.
        response.raise_for_status()
        return response.json()

    def get_html(
        self,
        url: str,
        headers: Optional[dict] = None,
        params: Optional[dict] = None,
        timeout: float = 30.0,
    ) -> str:
        """Make a rate-limited GET request and return HTML text.

        Args:
            url: URL to fetch
            headers: Additional headers to include
            params: Query parameters
            timeout: Request timeout in seconds

        Returns:
            HTML text content

        Raises:
            requests.RequestException: If request fails
        """
        response = self.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        return response.text

    def reset_domain_delays(self) -> None:
        """Reset domain-specific delays (e.g., after a long pause)."""
        self._domain_delays.clear()

    def close(self) -> None:
        """Close the session and release pooled connections."""
        self.session.close()
|
||||
|
||||
|
||||
# Lazily-constructed singleton shared by the fetch_* convenience functions.
_global_session: Optional[RateLimitedSession] = None


def get_session() -> RateLimitedSession:
    """Return the process-wide RateLimitedSession, creating it on first use."""
    global _global_session
    if _global_session is None:
        _global_session = RateLimitedSession()
    return _global_session
|
||||
|
||||
|
||||
def fetch_url(url: str, **kwargs) -> requests.Response:
    """Fetch *url* through the shared rate-limited session."""
    session = get_session()
    return session.get(url, **kwargs)
|
||||
|
||||
|
||||
def fetch_json(url: str, **kwargs) -> dict:
    """Fetch *url* through the shared session and return the parsed JSON body."""
    session = get_session()
    return session.get_json(url, **kwargs)
|
||||
|
||||
|
||||
def fetch_html(url: str, **kwargs) -> str:
    """Fetch *url* through the shared session and return the HTML text."""
    session = get_session()
    return session.get_html(url, **kwargs)
|
||||
149
Scripts/sportstime_parser/utils/logging.py
Normal file
149
Scripts/sportstime_parser/utils/logging.py
Normal file
@@ -0,0 +1,149 @@
|
||||
"""Logging infrastructure for sportstime-parser."""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from rich.console import Console
|
||||
from rich.logging import RichHandler
|
||||
|
||||
from ..config import SCRIPTS_DIR
|
||||
|
||||
# Module-level singletons: logger, verbosity flag, and shared Rich console.
_logger: Optional[logging.Logger] = None
_verbose: bool = False
_console: Optional[Console] = None


def get_console() -> Console:
    """Return the shared Rich console, creating it on first use."""
    global _console
    if _console is None:
        _console = Console()
    return _console
|
||||
|
||||
|
||||
def set_verbose(verbose: bool) -> None:
    """Toggle verbose mode globally and adjust the logger level to match."""
    global _verbose
    _verbose = verbose
    get_logger().setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||
|
||||
|
||||
def is_verbose() -> bool:
    """Report whether verbose (debug-level) output is currently enabled."""
    return _verbose
|
||||
|
||||
|
||||
def get_logger() -> logging.Logger:
    """Get or create the application logger.

    Created once and cached at module level. Two handlers are attached:
    - a Rich console handler (effective level controlled via set_verbose)
    - a timestamped DEBUG-level file under SCRIPTS_DIR/logs

    Returns:
        The shared "sportstime_parser" logger.
    """
    global _logger

    if _logger is not None:
        return _logger

    _logger = logging.getLogger("sportstime_parser")
    _logger.setLevel(logging.INFO)

    # Prevent propagation to the root logger so messages are not duplicated
    # by any handlers configured elsewhere.
    _logger.propagate = False

    # Clear any existing handlers (e.g. from a previous configuration).
    _logger.handlers.clear()

    # Console handler with Rich formatting; the handler accepts DEBUG and the
    # logger's own level (set_verbose) decides what actually gets through.
    console_handler = RichHandler(
        console=get_console(),
        show_time=True,
        show_path=False,
        rich_tracebacks=True,
        tracebacks_show_locals=True,
        markup=True,
    )
    console_handler.setLevel(logging.DEBUG)
    console_format = logging.Formatter("%(message)s")
    console_handler.setFormatter(console_format)
    _logger.addHandler(console_handler)

    # File handler for persistent logs.
    log_dir = SCRIPTS_DIR / "logs"
    # parents=True: create missing ancestor directories too, instead of
    # raising FileNotFoundError when SCRIPTS_DIR does not exist yet.
    log_dir.mkdir(parents=True, exist_ok=True)

    # One log file per process run, named by start timestamp.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = log_dir / f"parser_{timestamp}.log"

    file_handler = logging.FileHandler(log_file, encoding="utf-8")
    file_handler.setLevel(logging.DEBUG)
    file_format = logging.Formatter(
        "%(asctime)s | %(levelname)-8s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    file_handler.setFormatter(file_format)
    _logger.addHandler(file_handler)

    return _logger
|
||||
|
||||
|
||||
def log_game(
    sport: str,
    game_id: str,
    home: str,
    away: str,
    date: str,
    status: str = "parsed",
) -> None:
    """Emit a debug line for a processed game; no-op unless verbose."""
    if is_verbose():
        get_logger().debug(
            f"[{sport.upper()}] {game_id}: {away} @ {home} ({date}) - {status}"
        )
|
||||
|
||||
|
||||
def log_team(sport: str, team_id: str, name: str, status: str = "resolved") -> None:
    """Emit a debug line for a resolved team; no-op unless verbose."""
    if is_verbose():
        get_logger().debug(f"[{sport.upper()}] Team: {name} -> {team_id} ({status})")
|
||||
|
||||
|
||||
def log_stadium(sport: str, stadium_id: str, name: str, status: str = "resolved") -> None:
    """Emit a debug line for a resolved stadium; no-op unless verbose."""
    if is_verbose():
        get_logger().debug(f"[{sport.upper()}] Stadium: {name} -> {stadium_id} ({status})")
|
||||
|
||||
|
||||
def log_error(message: str, exc_info: bool = False) -> None:
    """Log an error, optionally attaching the active exception traceback."""
    get_logger().error(message, exc_info=exc_info)
|
||||
|
||||
|
||||
def log_warning(message: str) -> None:
    """Log a warning message through the shared logger."""
    get_logger().warning(message)
|
||||
|
||||
|
||||
def log_success(message: str) -> None:
    """Log a success message, rendered with a green check mark."""
    get_logger().info(f"[green]✓[/green] {message}")
|
||||
|
||||
|
||||
def log_failure(message: str) -> None:
    """Log a failure message, rendered with a red cross mark."""
    get_logger().info(f"[red]✗[/red] {message}")
|
||||
360
Scripts/sportstime_parser/utils/progress.py
Normal file
360
Scripts/sportstime_parser/utils/progress.py
Normal file
@@ -0,0 +1,360 @@
|
||||
"""Progress utilities using Rich for visual feedback."""
|
||||
|
||||
from contextlib import contextmanager
|
||||
from typing import Generator, Iterable, Optional, TypeVar
|
||||
|
||||
from rich.progress import (
|
||||
Progress,
|
||||
SpinnerColumn,
|
||||
TextColumn,
|
||||
BarColumn,
|
||||
TaskProgressColumn,
|
||||
TimeElapsedColumn,
|
||||
TimeRemainingColumn,
|
||||
MofNCompleteColumn,
|
||||
)
|
||||
from rich.console import Console
|
||||
|
||||
from .logging import get_console
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
def create_progress() -> Progress:
    """Build the standard multi-column Rich progress display."""
    columns = (
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description}"),
        BarColumn(bar_width=40),
        TaskProgressColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TimeRemainingColumn(),
    )
    # transient=False keeps the finished bar on screen as a record.
    return Progress(*columns, console=get_console(), transient=False)
|
||||
|
||||
|
||||
def create_spinner_progress() -> Progress:
    """Build a minimal spinner-only display for indeterminate tasks."""
    columns = (
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description}"),
        TimeElapsedColumn(),
    )
    # transient=True removes the spinner once the task completes.
    return Progress(*columns, console=get_console(), transient=True)
|
||||
|
||||
|
||||
@contextmanager
def progress_bar(
    description: str,
    total: Optional[int] = None,
) -> Generator[tuple[Progress, int], None, None]:
    """Context manager yielding a live progress display and its task id.

    Args:
        description: Task description to display
        total: Total number of items (None for indeterminate)

    Yields:
        Tuple of (Progress instance, task_id)

    Example:
        with progress_bar("Scraping games", total=100) as (progress, task):
            for item in items:
                process(item)
                progress.advance(task)
    """
    # Indeterminate work gets a spinner; sized work gets the full bar.
    progress = create_spinner_progress() if total is None else create_progress()
    with progress:
        task_id = progress.add_task(description, total=total)
        yield progress, task_id
|
||||
|
||||
|
||||
def track_progress(
    iterable: Iterable[T],
    description: str,
    total: Optional[int] = None,
) -> Generator[T, None, None]:
    """Yield items from *iterable* while rendering a progress bar.

    Args:
        iterable: Items to iterate over
        description: Task description to display
        total: Total number of items (auto-detected if iterable has len)

    Yields:
        Items from the iterable

    Example:
        for game in track_progress(games, "Processing games"):
            process(game)
    """
    # Auto-detect the length when the caller did not supply one.
    if total is None:
        try:
            total = len(iterable)  # type: ignore
        except TypeError:
            pass  # unsized iterable: fall back to an indeterminate spinner

    display = create_spinner_progress() if total is None else create_progress()
    with display:
        task_id = display.add_task(description, total=total)
        for item in iterable:
            yield item
            display.advance(task_id)
|
||||
|
||||
|
||||
class ProgressTracker:
    """Track progress across multiple phases with nested tasks.

    Example:
        tracker = ProgressTracker()
        tracker.start("Scraping NBA")

        with tracker.task("Fetching schedule", total=12) as advance:
            for month in months:
                fetch(month)
                advance()

        with tracker.task("Parsing games", total=1230) as advance:
            for game in games:
                parse(game)
                advance()

        tracker.finish("Completed NBA scrape")
    """

    def __init__(self):
        """Initialize the progress tracker."""
        self._console = get_console()
        # References to the currently active progress/task so log() can print
        # above a live bar instead of corrupting it; None when no task runs.
        self._current_progress: Optional[Progress] = None
        self._current_task: Optional[int] = None

    def start(self, message: str) -> None:
        """Start a new tracking session with a message."""
        self._console.print(f"\n[bold cyan]>>> {message}[/bold cyan]")

    def finish(self, message: str) -> None:
        """Finish the tracking session with a message."""
        self._console.print(f"[bold green]<<< {message}[/bold green]\n")

    @contextmanager
    def task(
        self,
        description: str,
        total: Optional[int] = None,
    ) -> Generator[callable, None, None]:
        """Context manager for a tracked task.

        Args:
            description: Task description
            total: Total items (None for indeterminate)

        Yields:
            Callable to advance the progress

        Example:
            with tracker.task("Processing", total=100) as advance:
                for item in items:
                    process(item)
                    advance()
        """
        with progress_bar(description, total) as (progress, task_id):
            self._current_progress = progress
            self._current_task = task_id

            def advance(amount: int = 1) -> None:
                progress.advance(task_id, advance=amount)

            try:
                yield advance
            finally:
                # Clear the active-task references even if the caller's block
                # raised; otherwise log() would keep printing to a progress
                # display that has already been torn down.
                self._current_progress = None
                self._current_task = None

    def log(self, message: str) -> None:
        """Log a message (displayed above the progress bar if one is active)."""
        if self._current_progress:
            self._current_progress.console.print(f"  {message}")
        else:
            self._console.print(f"  {message}")
|
||||
|
||||
|
||||
class ScrapeProgress:
    """Specialized progress tracker for scraping operations.

    Tracks counts of games, teams, stadiums scraped and provides
    formatted status updates.
    """

    def __init__(self, sport: str, season: int):
        """Initialize scrape progress for a sport.

        Args:
            sport: Sport code (e.g., 'nba')
            season: Season start year
        """
        self.sport = sport
        self.season = season
        # Running totals, incremented by the counting task context managers.
        self.games_count = 0
        self.teams_count = 0
        self.stadiums_count = 0
        self.errors_count = 0
        self._tracker = ProgressTracker()

    def start(self) -> None:
        """Start the scraping session banner."""
        self._tracker.start(
            f"Scraping {self.sport.upper()} {self.season}-{self.season + 1}"
        )

    def finish(self) -> None:
        """Finish the scraping session with a summary of the totals."""
        summary = (
            f"Scraped {self.games_count} games, "
            f"{self.teams_count} teams, "
            f"{self.stadiums_count} stadiums"
        )
        if self.errors_count > 0:
            summary += f" ({self.errors_count} errors)"
        self._tracker.finish(summary)

    @contextmanager
    def _counting_task(
        self,
        description: str,
        total: Optional[int],
        counter_attr: str,
    ) -> Generator[callable, None, None]:
        """Run a tracked task whose advance callable also bumps a counter.

        Shared implementation behind parsing_games / resolving_teams /
        resolving_stadiums, which differ only in label and counter.

        Args:
            description: Task description shown on the progress bar
            total: Total items (None for indeterminate)
            counter_attr: Name of the instance counter attribute to increment
        """
        with self._tracker.task(description, total=total) as advance:

            def advance_and_count(amount: int = 1) -> None:
                setattr(self, counter_attr, getattr(self, counter_attr) + amount)
                advance(amount)

            yield advance_and_count

    @contextmanager
    def scraping_schedule(
        self,
        total_months: Optional[int] = None,
    ) -> Generator[callable, None, None]:
        """Track schedule scraping progress (does not bump any counter)."""
        with self._tracker.task(
            f"Fetching {self.sport.upper()} schedule",
            total=total_months,
        ) as advance:
            yield advance

    @contextmanager
    def parsing_games(
        self,
        total_games: Optional[int] = None,
    ) -> Generator[callable, None, None]:
        """Track game parsing progress; each advance bumps games_count."""
        with self._counting_task("Parsing games", total_games, "games_count") as advance:
            yield advance

    @contextmanager
    def resolving_teams(
        self,
        total_teams: Optional[int] = None,
    ) -> Generator[callable, None, None]:
        """Track team resolution progress; each advance bumps teams_count."""
        with self._counting_task("Resolving teams", total_teams, "teams_count") as advance:
            yield advance

    @contextmanager
    def resolving_stadiums(
        self,
        total_stadiums: Optional[int] = None,
    ) -> Generator[callable, None, None]:
        """Track stadium resolution progress; each advance bumps stadiums_count."""
        with self._counting_task(
            "Resolving stadiums", total_stadiums, "stadiums_count"
        ) as advance:
            yield advance

    def log_error(self, message: str) -> None:
        """Log an error during scraping and increment errors_count."""
        self.errors_count += 1
        self._tracker.log(f"[red]Error: {message}[/red]")

    def log_warning(self, message: str) -> None:
        """Log a warning during scraping."""
        self._tracker.log(f"[yellow]Warning: {message}[/yellow]")

    def log_info(self, message: str) -> None:
        """Log an info message during scraping."""
        self._tracker.log(message)
|
||||
|
||||
|
||||
class SimpleProgressBar:
    """Thin wrapper binding a Progress instance to a single task id.

    Example:
        with create_progress_bar(total=100, description="Uploading") as progress:
            for item in items:
                upload(item)
                progress.advance()
    """

    def __init__(self, progress: Progress, task_id: int):
        self._progress = progress
        self._task_id = task_id

    def advance(self, amount: int = 1) -> None:
        """Move the bar forward by *amount* steps."""
        self._progress.advance(self._task_id, advance=amount)

    def update(self, completed: int) -> None:
        """Jump the bar to an absolute completed count."""
        self._progress.update(self._task_id, completed=completed)
|
||||
|
||||
|
||||
@contextmanager
def create_progress_bar(
    total: int,
    description: str = "Progress",
) -> Generator[SimpleProgressBar, None, None]:
    """Create a simple progress bar for batch operations.

    Args:
        total: Total number of items
        description: Task description

    Yields:
        SimpleProgressBar with advance() and update() methods

    Example:
        with create_progress_bar(total=100, description="Uploading") as progress:
            for item in items:
                upload(item)
                progress.advance()
    """
    display = create_progress()
    with display:
        yield SimpleProgressBar(display, display.add_task(description, total=total))
|
||||
Reference in New Issue
Block a user