feat(scripts): add sportstime-parser data pipeline

Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:56:25 -06:00
parent ac78042a7e
commit 52d445bca4
76 changed files with 25065 additions and 0 deletions
--- a/sportstime_parser/normalizers/fuzzy.py
+++ b/sportstime_parser/normalizers/fuzzy.py
@@ -0,0 +1,272 @@
+"""Fuzzy string matching utilities for team and stadium name resolution."""
+
+from dataclasses import dataclass
+from typing import Optional
+
+from rapidfuzz import fuzz, process
+from rapidfuzz.utils import default_process
+
+from ..config import FUZZY_MATCH_THRESHOLD
+from ..models.aliases import FuzzyMatch
+
+
+@dataclass
+class MatchCandidate:
+    """A single entity (team or stadium) that a query can be matched against.
+
+    Attributes:
+        canonical_id: The canonical ID reported when this candidate matches
+        name: The display name; it is also used as a matchable string
+        aliases: List of alternative names matched in addition to ``name``
+    """
+
+    canonical_id: str
+    name: str
+    aliases: list[str]  # no default: pass [] when a candidate has no aliases
+
+
+def normalize_for_matching(s: str) -> str:
+    """Normalize a string for fuzzy matching.
+
+    - Convert to lowercase
+    - Collapse every run of whitespace (spaces, tabs, newlines) to one space
+      and trim both ends
+    - Remove common prefixes/suffixes
+
+    Each prefix and suffix is tested once, in the order listed below, so more
+    than one can be removed from the same string ("the team x" -> "x").
+
+    Args:
+        s: String to normalize
+
+    Returns:
+        Normalized string; may be empty if nothing but a prefix/suffix remains
+    """
+    # split() with no argument splits on any whitespace run and drops empty
+    # pieces, so re-joining both trims the ends and collapses inner runs.
+    # Doing this first guarantees the "word + single space" prefix/suffix
+    # patterns below match even when the input had doubled or tab spacing.
+    result = " ".join(s.lower().split())
+
+    # Remove common prefixes
+    for prefix in ("the ", "team ", "stadium "):
+        result = result.removeprefix(prefix)
+
+    # Remove common suffixes
+    for suffix in (" stadium", " arena", " center", " field", " park"):
+        result = result.removesuffix(suffix)
+
+    # No final strip needed: after collapsing, removing "word " / " word"
+    # can never expose leading or trailing whitespace.
+    return result
+
+
+def fuzzy_match_team(
+    query: str,
+    candidates: list[MatchCandidate],
+    threshold: int = FUZZY_MATCH_THRESHOLD,
+    top_n: int = 3,
+) -> list[FuzzyMatch]:
+    """Rank candidate teams by fuzzy similarity to a team name.
+
+    The query, every candidate name and every alias are passed through
+    ``normalize_for_matching`` and compared with a blend of three scorers:
+
+    1. Token set ratio, weight 0.5 (handles word order differences)
+    2. Partial ratio, weight 0.3 (handles substring matches)
+    3. Standard ratio, weight 0.2 (overall similarity)
+
+    The blend is truncated to an int. Each canonical ID keeps the best score
+    found across its name and aliases.
+
+    Args:
+        query: Team name to match
+        candidates: List of candidate teams to match against
+        threshold: Minimum score to consider a match (0-100)
+        top_n: Maximum number of matches to return
+
+    Returns:
+        List of FuzzyMatch objects sorted by confidence (descending)
+    """
+    needle = normalize_for_matching(query)
+
+    # canonical_id -> (best_score, display name). Insertion order follows the
+    # order of ``candidates``, which decides the order of equal-score results.
+    best: dict[str, tuple[int, str]] = {}
+
+    for candidate in candidates:
+        # Primary name first, then the aliases in their listed order.
+        for text in [candidate.name, *candidate.aliases]:
+            haystack = normalize_for_matching(text)
+
+            # Weighted average favoring token_set_ratio for team names
+            blended = int(
+                0.5 * fuzz.token_set_ratio(needle, haystack)
+                + 0.3 * fuzz.partial_ratio(needle, haystack)
+                + 0.2 * fuzz.ratio(needle, haystack)
+            )
+
+            # Strictly-greater comparison: on a tie the earlier entry wins.
+            previous = best.get(candidate.canonical_id)
+            if previous is None or blended > previous[0]:
+                best[candidate.canonical_id] = (blended, candidate.name)
+
+    # Drop everything below the threshold, then order by confidence.
+    # sorted() is stable, so ties keep their insertion order.
+    ranked = sorted(
+        (
+            FuzzyMatch(canonical_id=cid, canonical_name=label, confidence=value)
+            for cid, (value, label) in best.items()
+            if value >= threshold
+        ),
+        key=lambda found: found.confidence,
+        reverse=True,
+    )
+
+    return ranked[:top_n]
+
+
+def fuzzy_match_stadium(
+    query: str,
+    candidates: list[MatchCandidate],
+    threshold: int = FUZZY_MATCH_THRESHOLD,
+    top_n: int = 3,
+) -> list[FuzzyMatch]:
+    """Rank candidate stadiums by fuzzy similarity to a stadium name.
+
+    The query, every candidate name and every alias are passed through
+    ``normalize_for_matching`` and compared with a blend of three scorers:
+
+    1. Token sort ratio, weight 0.4 (handles "X Stadium" vs "Stadium X")
+    2. Partial ratio, weight 0.4 (handles naming rights changes)
+    3. Standard ratio, weight 0.2
+
+    The blend is truncated to an int. Each canonical ID keeps the best score
+    found across its name and aliases.
+
+    Args:
+        query: Stadium name to match
+        candidates: List of candidate stadiums to match against
+        threshold: Minimum score to consider a match (0-100)
+        top_n: Maximum number of matches to return
+
+    Returns:
+        List of FuzzyMatch objects sorted by confidence (descending)
+    """
+    target = normalize_for_matching(query)
+
+    def blended_score(text: str) -> int:
+        # Weighted average of the three scorers, truncated toward zero.
+        normalized = normalize_for_matching(text)
+        return int(
+            0.4 * fuzz.token_sort_ratio(target, normalized)
+            + 0.4 * fuzz.partial_ratio(target, normalized)
+            + 0.2 * fuzz.ratio(target, normalized)
+        )
+
+    # canonical_id -> (best_score, display name)
+    top_scores: dict[str, tuple[int, str]] = {}
+
+    for candidate in candidates:
+        # Name is scored before the aliases; a later entry only replaces an
+        # earlier one when it scores strictly higher.
+        for label in (candidate.name, *candidate.aliases):
+            value = blended_score(label)
+            held = top_scores.get(candidate.canonical_id)
+            if held is None or value > held[0]:
+                top_scores[candidate.canonical_id] = (value, candidate.name)
+
+    # Filter and sort
+    hits = [
+        FuzzyMatch(canonical_id=cid, canonical_name=display, confidence=value)
+        for cid, (value, display) in top_scores.items()
+        if value >= threshold
+    ]
+    hits.sort(key=lambda hit: hit.confidence, reverse=True)
+
+    return hits[:top_n]
+
+
+def exact_match(
+    query: str,
+    candidates: list[MatchCandidate],
+    case_sensitive: bool = False,
+) -> Optional[str]:
+    """Return the canonical ID of the first candidate that equals the query.
+
+    Surrounding whitespace is ignored on both sides of the comparison. Unless
+    ``case_sensitive`` is set, both sides are lowercased first. Candidates are
+    checked in order; within a candidate the name is checked before aliases.
+
+    Args:
+        query: String to match
+        candidates: List of candidates to match against
+        case_sensitive: Whether to use case-sensitive matching
+
+    Returns:
+        Canonical ID if exact match found, None otherwise
+    """
+
+    def comparable(text: str) -> str:
+        # One normalizer for the query, names and aliases keeps them aligned.
+        folded = text if case_sensitive else text.lower()
+        return folded.strip()
+
+    wanted = comparable(query)
+
+    for candidate in candidates:
+        labels = (candidate.name, *candidate.aliases)
+        if any(comparable(label) == wanted for label in labels):
+            return candidate.canonical_id
+
+    return None
+
+
+def best_match(
+    query: str,
+    candidates: list[MatchCandidate],
+    threshold: int = FUZZY_MATCH_THRESHOLD,
+) -> Optional[FuzzyMatch]:
+    """Find the best match for a query string.
+
+    First tries a case-insensitive exact match (reported with confidence
+    100), then falls back to fuzzy matching.
+
+    Args:
+        query: String to match
+        candidates: List of candidates
+        threshold: Minimum fuzzy match score
+
+    Returns:
+        Best FuzzyMatch or None if no match above threshold
+    """
+    # Try exact match first.
+    exact_id = exact_match(query, candidates)
+
+    # Compare against None explicitly: exact_match returns Optional[str], and
+    # a truthiness test would discard a legitimate empty-string canonical ID.
+    if exact_id is not None:
+        # exact_match only returns the ID, so look up the display name.
+        for candidate in candidates:
+            if candidate.canonical_id == exact_id:
+                return FuzzyMatch(
+                    canonical_id=exact_id,
+                    canonical_name=candidate.name,
+                    confidence=100,
+                )
+
+    # Fall back to fuzzy matching.
+    # Use team matching by default (works for both)
+    matches = fuzzy_match_team(query, candidates, threshold=threshold, top_n=1)
+
+    return matches[0] if matches else None
+
+
+def calculate_similarity(s1: str, s2: str) -> int:
+    """Calculate similarity between two strings.
+
+    Both strings are passed through ``normalize_for_matching`` and compared
+    with rapidfuzz's token set ratio.
+
+    Args:
+        s1: First string
+        s2: Second string
+
+    Returns:
+        Similarity score 0-100, truncated to an int
+    """
+    s1_norm = normalize_for_matching(s1)
+    s2_norm = normalize_for_matching(s2)
+
+    # rapidfuzz scorers return floats, while this function is declared to
+    # return int. Truncating (rather than rounding) matches the other scorers
+    # in this module and never changes the outcome of ``score >= threshold``
+    # for an integer threshold.
+    return int(fuzz.token_set_ratio(s1_norm, s2_norm))