feat(scripts): rewrite parser as modular Python CLI
Replace the monolithic scraping scripts with the sportstime_parser package:

- Multi-source scrapers with automatic fallback for 7 sports
- Canonical ID generation for games, teams, and stadiums
- Fuzzy matching with configurable thresholds for name resolution
- CloudKit Web Services uploader with JWT auth and diff-based updates
- Resumable uploads with checkpoint state persistence
- Validation reports with manual review items and suggested matches
- Comprehensive test suite (249 tests)

CLI: sportstime-parser scrape|validate|upload|status|retry|clear

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
276
Scripts/sportstime_parser/utils/http.py
Normal file
276
Scripts/sportstime_parser/utils/http.py
Normal file
@@ -0,0 +1,276 @@
|
||||
"""HTTP utilities with rate limiting and exponential backoff."""
|
||||
|
||||
import random
|
||||
import time
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from ..config import (
|
||||
DEFAULT_REQUEST_DELAY,
|
||||
MAX_RETRIES,
|
||||
BACKOFF_FACTOR,
|
||||
INITIAL_BACKOFF,
|
||||
)
|
||||
from .logging import get_logger, log_warning
|
||||
|
||||
|
||||
# User agents for rotation to avoid blocks
|
||||
USER_AGENTS = [
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
|
||||
]
|
||||
|
||||
|
||||
class RateLimitedSession:
|
||||
"""HTTP session with rate limiting and exponential backoff.
|
||||
|
||||
Features:
|
||||
- Configurable delay between requests
|
||||
- Automatic 429 detection with exponential backoff
|
||||
- User-agent rotation
|
||||
- Connection pooling
|
||||
- Automatic retries for transient errors
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
delay: float = DEFAULT_REQUEST_DELAY,
|
||||
max_retries: int = MAX_RETRIES,
|
||||
backoff_factor: float = BACKOFF_FACTOR,
|
||||
initial_backoff: float = INITIAL_BACKOFF,
|
||||
):
|
||||
"""Initialize the rate-limited session.
|
||||
|
||||
Args:
|
||||
delay: Minimum delay between requests in seconds
|
||||
max_retries: Maximum number of retry attempts
|
||||
backoff_factor: Multiplier for exponential backoff
|
||||
initial_backoff: Initial backoff duration in seconds
|
||||
"""
|
||||
self.delay = delay
|
||||
self.max_retries = max_retries
|
||||
self.backoff_factor = backoff_factor
|
||||
self.initial_backoff = initial_backoff
|
||||
self.last_request_time: float = 0.0
|
||||
self._domain_delays: dict[str, float] = {}
|
||||
|
||||
# Create session with retry adapter
|
||||
self.session = requests.Session()
|
||||
|
||||
# Configure automatic retries for connection errors
|
||||
retry_strategy = Retry(
|
||||
total=max_retries,
|
||||
backoff_factor=0.5,
|
||||
status_forcelist=[500, 502, 503, 504],
|
||||
allowed_methods=["GET", "HEAD"],
|
||||
)
|
||||
adapter = HTTPAdapter(max_retries=retry_strategy, pool_maxsize=10)
|
||||
self.session.mount("http://", adapter)
|
||||
self.session.mount("https://", adapter)
|
||||
|
||||
self._logger = get_logger()
|
||||
|
||||
def _get_user_agent(self) -> str:
|
||||
"""Get a random user agent."""
|
||||
return random.choice(USER_AGENTS)
|
||||
|
||||
def _get_domain(self, url: str) -> str:
|
||||
"""Extract domain from URL."""
|
||||
parsed = urlparse(url)
|
||||
return parsed.netloc
|
||||
|
||||
def _wait_for_rate_limit(self, url: str) -> None:
|
||||
"""Wait to respect rate limiting."""
|
||||
domain = self._get_domain(url)
|
||||
|
||||
# Get domain-specific delay (if 429 was received)
|
||||
domain_delay = self._domain_delays.get(domain, 0.0)
|
||||
effective_delay = max(self.delay, domain_delay)
|
||||
|
||||
elapsed = time.time() - self.last_request_time
|
||||
if elapsed < effective_delay:
|
||||
sleep_time = effective_delay - elapsed
|
||||
self._logger.debug(f"Rate limiting: sleeping {sleep_time:.2f}s")
|
||||
time.sleep(sleep_time)
|
||||
|
||||
def _handle_429(self, url: str, attempt: int) -> float:
|
||||
"""Handle 429 Too Many Requests with exponential backoff.
|
||||
|
||||
Returns the backoff duration in seconds.
|
||||
"""
|
||||
domain = self._get_domain(url)
|
||||
backoff = self.initial_backoff * (self.backoff_factor ** attempt)
|
||||
|
||||
# Add jitter to prevent thundering herd
|
||||
backoff += random.uniform(0, 1)
|
||||
|
||||
# Update domain-specific delay
|
||||
self._domain_delays[domain] = min(backoff * 2, 60.0) # Cap at 60s
|
||||
|
||||
log_warning(f"Rate limited (429) for {domain}, backing off {backoff:.1f}s")
|
||||
|
||||
return backoff
|
||||
|
||||
def get(
|
||||
self,
|
||||
url: str,
|
||||
headers: Optional[dict] = None,
|
||||
params: Optional[dict] = None,
|
||||
timeout: float = 30.0,
|
||||
) -> requests.Response:
|
||||
"""Make a rate-limited GET request with automatic retries.
|
||||
|
||||
Args:
|
||||
url: URL to fetch
|
||||
headers: Additional headers to include
|
||||
params: Query parameters
|
||||
timeout: Request timeout in seconds
|
||||
|
||||
Returns:
|
||||
Response object
|
||||
|
||||
Raises:
|
||||
requests.RequestException: If all retries fail
|
||||
"""
|
||||
# Prepare headers with user agent
|
||||
request_headers = {"User-Agent": self._get_user_agent()}
|
||||
if headers:
|
||||
request_headers.update(headers)
|
||||
|
||||
last_exception: Optional[Exception] = None
|
||||
|
||||
for attempt in range(self.max_retries + 1):
|
||||
try:
|
||||
# Wait for rate limit
|
||||
self._wait_for_rate_limit(url)
|
||||
|
||||
# Make request
|
||||
self.last_request_time = time.time()
|
||||
response = self.session.get(
|
||||
url,
|
||||
headers=request_headers,
|
||||
params=params,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
# Handle 429
|
||||
if response.status_code == 429:
|
||||
if attempt < self.max_retries:
|
||||
backoff = self._handle_429(url, attempt)
|
||||
time.sleep(backoff)
|
||||
continue
|
||||
else:
|
||||
response.raise_for_status()
|
||||
|
||||
# Return successful response
|
||||
return response
|
||||
|
||||
except requests.RequestException as e:
|
||||
last_exception = e
|
||||
if attempt < self.max_retries:
|
||||
backoff = self.initial_backoff * (self.backoff_factor ** attempt)
|
||||
self._logger.warning(
|
||||
f"Request failed (attempt {attempt + 1}): {e}, retrying in {backoff:.1f}s"
|
||||
)
|
||||
time.sleep(backoff)
|
||||
else:
|
||||
raise
|
||||
|
||||
# Should not reach here, but just in case
|
||||
if last_exception:
|
||||
raise last_exception
|
||||
|
||||
raise requests.RequestException("Max retries exceeded")
|
||||
|
||||
def get_json(
|
||||
self,
|
||||
url: str,
|
||||
headers: Optional[dict] = None,
|
||||
params: Optional[dict] = None,
|
||||
timeout: float = 30.0,
|
||||
) -> dict:
|
||||
"""Make a rate-limited GET request and parse JSON response.
|
||||
|
||||
Args:
|
||||
url: URL to fetch
|
||||
headers: Additional headers to include
|
||||
params: Query parameters
|
||||
timeout: Request timeout in seconds
|
||||
|
||||
Returns:
|
||||
Parsed JSON as dictionary
|
||||
|
||||
Raises:
|
||||
requests.RequestException: If request fails
|
||||
ValueError: If response is not valid JSON
|
||||
"""
|
||||
response = self.get(url, headers=headers, params=params, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
def get_html(
|
||||
self,
|
||||
url: str,
|
||||
headers: Optional[dict] = None,
|
||||
params: Optional[dict] = None,
|
||||
timeout: float = 30.0,
|
||||
) -> str:
|
||||
"""Make a rate-limited GET request and return HTML text.
|
||||
|
||||
Args:
|
||||
url: URL to fetch
|
||||
headers: Additional headers to include
|
||||
params: Query parameters
|
||||
timeout: Request timeout in seconds
|
||||
|
||||
Returns:
|
||||
HTML text content
|
||||
|
||||
Raises:
|
||||
requests.RequestException: If request fails
|
||||
"""
|
||||
response = self.get(url, headers=headers, params=params, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
return response.text
|
||||
|
||||
def reset_domain_delays(self) -> None:
|
||||
"""Reset domain-specific delays (e.g., after a long pause)."""
|
||||
self._domain_delays.clear()
|
||||
|
||||
def close(self) -> None:
|
||||
"""Close the session and release resources."""
|
||||
self.session.close()
|
||||
|
||||
|
||||
# Global session instance (lazy initialized)
|
||||
_global_session: Optional[RateLimitedSession] = None
|
||||
|
||||
|
||||
def get_session() -> RateLimitedSession:
|
||||
"""Get the global rate-limited session instance."""
|
||||
global _global_session
|
||||
if _global_session is None:
|
||||
_global_session = RateLimitedSession()
|
||||
return _global_session
|
||||
|
||||
|
||||
def fetch_url(url: str, **kwargs) -> requests.Response:
|
||||
"""Convenience function to fetch a URL with rate limiting."""
|
||||
return get_session().get(url, **kwargs)
|
||||
|
||||
|
||||
def fetch_json(url: str, **kwargs) -> dict:
|
||||
"""Convenience function to fetch JSON with rate limiting."""
|
||||
return get_session().get_json(url, **kwargs)
|
||||
|
||||
|
||||
def fetch_html(url: str, **kwargs) -> str:
|
||||
"""Convenience function to fetch HTML with rate limiting."""
|
||||
return get_session().get_html(url, **kwargs)
|
||||
Reference in New Issue
Block a user