feat(scripts): rewrite parser as modular Python CLI
Replace the monolithic scraping scripts with the sportstime_parser package:

- Multi-source scrapers with automatic fallback for 7 sports
- Canonical ID generation for games, teams, and stadiums
- Fuzzy matching with configurable thresholds for name resolution
- CloudKit Web Services uploader with JWT auth and diff-based updates
- Resumable uploads with checkpoint state persistence
- Validation reports with manual review items and suggested matches
- Comprehensive test suite (249 tests)

CLI: sportstime-parser scrape|validate|upload|status|retry|clear

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
276
Scripts/sportstime_parser/utils/http.py
Normal file
276
Scripts/sportstime_parser/utils/http.py
Normal file
@@ -0,0 +1,276 @@
|
||||
"""HTTP utilities with rate limiting and exponential backoff."""
|
||||
|
||||
import random
|
||||
import time
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from ..config import (
|
||||
DEFAULT_REQUEST_DELAY,
|
||||
MAX_RETRIES,
|
||||
BACKOFF_FACTOR,
|
||||
INITIAL_BACKOFF,
|
||||
)
|
||||
from .logging import get_logger, log_warning
|
||||
|
||||
|
||||
# User agents for rotation to avoid blocks
|
||||
USER_AGENTS = [
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
|
||||
]
|
||||
|
||||
|
||||
class RateLimitedSession:
|
||||
"""HTTP session with rate limiting and exponential backoff.
|
||||
|
||||
Features:
|
||||
- Configurable delay between requests
|
||||
- Automatic 429 detection with exponential backoff
|
||||
- User-agent rotation
|
||||
- Connection pooling
|
||||
- Automatic retries for transient errors
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
delay: float = DEFAULT_REQUEST_DELAY,
|
||||
max_retries: int = MAX_RETRIES,
|
||||
backoff_factor: float = BACKOFF_FACTOR,
|
||||
initial_backoff: float = INITIAL_BACKOFF,
|
||||
):
|
||||
"""Initialize the rate-limited session.
|
||||
|
||||
Args:
|
||||
delay: Minimum delay between requests in seconds
|
||||
max_retries: Maximum number of retry attempts
|
||||
backoff_factor: Multiplier for exponential backoff
|
||||
initial_backoff: Initial backoff duration in seconds
|
||||
"""
|
||||
self.delay = delay
|
||||
self.max_retries = max_retries
|
||||
self.backoff_factor = backoff_factor
|
||||
self.initial_backoff = initial_backoff
|
||||
self.last_request_time: float = 0.0
|
||||
self._domain_delays: dict[str, float] = {}
|
||||
|
||||
# Create session with retry adapter
|
||||
self.session = requests.Session()
|
||||
|
||||
# Configure automatic retries for connection errors
|
||||
retry_strategy = Retry(
|
||||
total=max_retries,
|
||||
backoff_factor=0.5,
|
||||
status_forcelist=[500, 502, 503, 504],
|
||||
allowed_methods=["GET", "HEAD"],
|
||||
)
|
||||
adapter = HTTPAdapter(max_retries=retry_strategy, pool_maxsize=10)
|
||||
self.session.mount("http://", adapter)
|
||||
self.session.mount("https://", adapter)
|
||||
|
||||
self._logger = get_logger()
|
||||
|
||||
def _get_user_agent(self) -> str:
|
||||
"""Get a random user agent."""
|
||||
return random.choice(USER_AGENTS)
|
||||
|
||||
def _get_domain(self, url: str) -> str:
|
||||
"""Extract domain from URL."""
|
||||
parsed = urlparse(url)
|
||||
return parsed.netloc
|
||||
|
||||
def _wait_for_rate_limit(self, url: str) -> None:
|
||||
"""Wait to respect rate limiting."""
|
||||
domain = self._get_domain(url)
|
||||
|
||||
# Get domain-specific delay (if 429 was received)
|
||||
domain_delay = self._domain_delays.get(domain, 0.0)
|
||||
effective_delay = max(self.delay, domain_delay)
|
||||
|
||||
elapsed = time.time() - self.last_request_time
|
||||
if elapsed < effective_delay:
|
||||
sleep_time = effective_delay - elapsed
|
||||
self._logger.debug(f"Rate limiting: sleeping {sleep_time:.2f}s")
|
||||
time.sleep(sleep_time)
|
||||
|
||||
def _handle_429(self, url: str, attempt: int) -> float:
|
||||
"""Handle 429 Too Many Requests with exponential backoff.
|
||||
|
||||
Returns the backoff duration in seconds.
|
||||
"""
|
||||
domain = self._get_domain(url)
|
||||
backoff = self.initial_backoff * (self.backoff_factor ** attempt)
|
||||
|
||||
# Add jitter to prevent thundering herd
|
||||
backoff += random.uniform(0, 1)
|
||||
|
||||
# Update domain-specific delay
|
||||
self._domain_delays[domain] = min(backoff * 2, 60.0) # Cap at 60s
|
||||
|
||||
log_warning(f"Rate limited (429) for {domain}, backing off {backoff:.1f}s")
|
||||
|
||||
return backoff
|
||||
|
||||
def get(
|
||||
self,
|
||||
url: str,
|
||||
headers: Optional[dict] = None,
|
||||
params: Optional[dict] = None,
|
||||
timeout: float = 30.0,
|
||||
) -> requests.Response:
|
||||
"""Make a rate-limited GET request with automatic retries.
|
||||
|
||||
Args:
|
||||
url: URL to fetch
|
||||
headers: Additional headers to include
|
||||
params: Query parameters
|
||||
timeout: Request timeout in seconds
|
||||
|
||||
Returns:
|
||||
Response object
|
||||
|
||||
Raises:
|
||||
requests.RequestException: If all retries fail
|
||||
"""
|
||||
# Prepare headers with user agent
|
||||
request_headers = {"User-Agent": self._get_user_agent()}
|
||||
if headers:
|
||||
request_headers.update(headers)
|
||||
|
||||
last_exception: Optional[Exception] = None
|
||||
|
||||
for attempt in range(self.max_retries + 1):
|
||||
try:
|
||||
# Wait for rate limit
|
||||
self._wait_for_rate_limit(url)
|
||||
|
||||
# Make request
|
||||
self.last_request_time = time.time()
|
||||
response = self.session.get(
|
||||
url,
|
||||
headers=request_headers,
|
||||
params=params,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
# Handle 429
|
||||
if response.status_code == 429:
|
||||
if attempt < self.max_retries:
|
||||
backoff = self._handle_429(url, attempt)
|
||||
time.sleep(backoff)
|
||||
continue
|
||||
else:
|
||||
response.raise_for_status()
|
||||
|
||||
# Return successful response
|
||||
return response
|
||||
|
||||
except requests.RequestException as e:
|
||||
last_exception = e
|
||||
if attempt < self.max_retries:
|
||||
backoff = self.initial_backoff * (self.backoff_factor ** attempt)
|
||||
self._logger.warning(
|
||||
f"Request failed (attempt {attempt + 1}): {e}, retrying in {backoff:.1f}s"
|
||||
)
|
||||
time.sleep(backoff)
|
||||
else:
|
||||
raise
|
||||
|
||||
# Should not reach here, but just in case
|
||||
if last_exception:
|
||||
raise last_exception
|
||||
|
||||
raise requests.RequestException("Max retries exceeded")
|
||||
|
||||
def get_json(
|
||||
self,
|
||||
url: str,
|
||||
headers: Optional[dict] = None,
|
||||
params: Optional[dict] = None,
|
||||
timeout: float = 30.0,
|
||||
) -> dict:
|
||||
"""Make a rate-limited GET request and parse JSON response.
|
||||
|
||||
Args:
|
||||
url: URL to fetch
|
||||
headers: Additional headers to include
|
||||
params: Query parameters
|
||||
timeout: Request timeout in seconds
|
||||
|
||||
Returns:
|
||||
Parsed JSON as dictionary
|
||||
|
||||
Raises:
|
||||
requests.RequestException: If request fails
|
||||
ValueError: If response is not valid JSON
|
||||
"""
|
||||
response = self.get(url, headers=headers, params=params, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
def get_html(
|
||||
self,
|
||||
url: str,
|
||||
headers: Optional[dict] = None,
|
||||
params: Optional[dict] = None,
|
||||
timeout: float = 30.0,
|
||||
) -> str:
|
||||
"""Make a rate-limited GET request and return HTML text.
|
||||
|
||||
Args:
|
||||
url: URL to fetch
|
||||
headers: Additional headers to include
|
||||
params: Query parameters
|
||||
timeout: Request timeout in seconds
|
||||
|
||||
Returns:
|
||||
HTML text content
|
||||
|
||||
Raises:
|
||||
requests.RequestException: If request fails
|
||||
"""
|
||||
response = self.get(url, headers=headers, params=params, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
return response.text
|
||||
|
||||
def reset_domain_delays(self) -> None:
|
||||
"""Reset domain-specific delays (e.g., after a long pause)."""
|
||||
self._domain_delays.clear()
|
||||
|
||||
def close(self) -> None:
|
||||
"""Close the session and release resources."""
|
||||
self.session.close()
|
||||
|
||||
|
||||
# Global session instance (lazy initialized)
|
||||
_global_session: Optional[RateLimitedSession] = None
|
||||
|
||||
|
||||
def get_session() -> RateLimitedSession:
|
||||
"""Get the global rate-limited session instance."""
|
||||
global _global_session
|
||||
if _global_session is None:
|
||||
_global_session = RateLimitedSession()
|
||||
return _global_session
|
||||
|
||||
|
||||
def fetch_url(url: str, **kwargs) -> requests.Response:
|
||||
"""Convenience function to fetch a URL with rate limiting."""
|
||||
return get_session().get(url, **kwargs)
|
||||
|
||||
|
||||
def fetch_json(url: str, **kwargs) -> dict:
|
||||
"""Convenience function to fetch JSON with rate limiting."""
|
||||
return get_session().get_json(url, **kwargs)
|
||||
|
||||
|
||||
def fetch_html(url: str, **kwargs) -> str:
|
||||
"""Convenience function to fetch HTML with rate limiting."""
|
||||
return get_session().get_html(url, **kwargs)
|
||||
Reference in New Issue
Block a user