Files
Trey t 52d445bca4 feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:56:25 -06:00

273 lines
8.0 KiB
Python

"""Fuzzy string matching utilities for team and stadium name resolution."""
from dataclasses import dataclass
from typing import Optional
from rapidfuzz import fuzz, process
from rapidfuzz.utils import default_process
from ..config import FUZZY_MATCH_THRESHOLD
from ..models.aliases import FuzzyMatch
@dataclass
class MatchCandidate:
    """One resolvable entity that a query string can be matched against.

    The matching helpers in this module compare a query with ``name`` and
    with every entry of ``aliases``; a hit on any of them reports
    ``canonical_id``.

    Attributes:
        canonical_id: Canonical identifier returned when this candidate matches
        name: Primary display name of the candidate
        aliases: Alternative names that are matched in addition to ``name``
    """

    canonical_id: str
    name: str
    aliases: list[str]
def normalize_for_matching(s: str) -> str:
    """Normalize a string for fuzzy matching.

    - Convert to lowercase
    - Collapse every run of whitespace to a single space and trim the ends
    - Remove common prefixes/suffixes

    Args:
        s: String to normalize

    Returns:
        Normalized string
    """
    # split() with no argument breaks on any whitespace run (spaces, tabs,
    # newlines) and drops leading/trailing whitespace, so re-joining with a
    # single space both collapses and trims. Doing this first guarantees the
    # affix checks below see single-space separators ("the  lakers" would
    # otherwise not start with "the ").
    result = " ".join(s.lower().split())

    # Each prefix is tried once, in this order, against the progressively
    # stripped string - e.g. "the team x" loses both "the " and "team ".
    for prefix in ("the ", "team ", "stadium "):
        result = result.removeprefix(prefix)

    # Same single ordered pass for suffixes.
    for suffix in (" stadium", " arena", " center", " field", " park"):
        result = result.removesuffix(suffix)

    return result.strip()
def fuzzy_match_team(
    query: str,
    candidates: list[MatchCandidate],
    threshold: int = FUZZY_MATCH_THRESHOLD,
    top_n: int = 3,
) -> list[FuzzyMatch]:
    """Rank candidate teams by fuzzy similarity to a team name.

    Every candidate is compared through its primary name and each of its
    aliases (all normalized with ``normalize_for_matching``). The score of
    one comparison blends three rapidfuzz metrics:

    - 50% token set ratio (handles word order differences)
    - 30% partial ratio (handles substring matches)
    - 20% standard ratio (overall similarity)

    The blend is truncated to an integer, and each canonical ID keeps the
    highest score seen across its name and aliases.

    Args:
        query: Team name to match
        candidates: List of candidate teams to match against
        threshold: Minimum score to consider a match (0-100)
        top_n: Maximum number of matches to return

    Returns:
        List of FuzzyMatch objects sorted by confidence (descending)
    """
    needle = normalize_for_matching(query)

    # canonical_id -> (best blended score so far, display name)
    best_by_id: dict[str, tuple[int, str]] = {}

    for team in candidates:
        # Primary name first, then aliases, in their listed order.
        for label in (team.name, *team.aliases):
            haystack = normalize_for_matching(label)
            blended = int(
                0.5 * fuzz.token_set_ratio(needle, haystack)
                + 0.3 * fuzz.partial_ratio(needle, haystack)
                + 0.2 * fuzz.ratio(needle, haystack)
            )

            previous = best_by_id.get(team.canonical_id)
            # Strictly-greater keeps the first label that reached the top score.
            if previous is None or blended > previous[0]:
                best_by_id[team.canonical_id] = (blended, team.name)

    # Stable sort: equal confidences stay in first-seen canonical ID order.
    ranked = sorted(
        (
            FuzzyMatch(canonical_id=cid, canonical_name=display, confidence=points)
            for cid, (points, display) in best_by_id.items()
            if points >= threshold
        ),
        key=lambda hit: hit.confidence,
        reverse=True,
    )
    return ranked[:top_n]
def fuzzy_match_stadium(
    query: str,
    candidates: list[MatchCandidate],
    threshold: int = FUZZY_MATCH_THRESHOLD,
    top_n: int = 3,
) -> list[FuzzyMatch]:
    """Rank candidate stadiums by fuzzy similarity to a stadium name.

    The query and every candidate name/alias are normalized with
    ``normalize_for_matching`` and compared with a blend weighted for
    stadium names:

    - 40% token sort ratio (handles "X Stadium" vs "Stadium X")
    - 40% partial ratio (handles naming rights changes)
    - 20% standard ratio

    The blend is truncated to an integer; a canonical ID is represented by
    the best score among its name and aliases.

    Args:
        query: Stadium name to match
        candidates: List of candidate stadiums to match against
        threshold: Minimum score to consider a match (0-100)
        top_n: Maximum number of matches to return

    Returns:
        List of FuzzyMatch objects sorted by confidence (descending)
    """
    target = normalize_for_matching(query)

    def blended_score(label: str) -> int:
        """Score one candidate label against the normalized query."""
        text = normalize_for_matching(label)
        return int(
            0.4 * fuzz.token_sort_ratio(target, text)
            + 0.4 * fuzz.partial_ratio(target, text)
            + 0.2 * fuzz.ratio(target, text)
        )

    # canonical_id -> (highest score so far, display name)
    top_scores: dict[str, tuple[int, str]] = {}
    for stadium in candidates:
        for label in [stadium.name, *stadium.aliases]:
            points = blended_score(label)
            known = top_scores.get(stadium.canonical_id)
            # Only a strictly better score replaces the stored entry.
            if known is None or points > known[0]:
                top_scores[stadium.canonical_id] = (points, stadium.name)

    hits = [
        FuzzyMatch(canonical_id=cid, canonical_name=display, confidence=points)
        for cid, (points, display) in top_scores.items()
        if points >= threshold
    ]
    # sorted() is stable, so ties keep first-seen canonical ID order.
    return sorted(hits, key=lambda hit: hit.confidence, reverse=True)[:top_n]
def exact_match(
    query: str,
    candidates: list[MatchCandidate],
    case_sensitive: bool = False,
) -> Optional[str]:
    """Look up a string by exact comparison with candidate names and aliases.

    Surrounding whitespace is ignored on both sides of the comparison, and
    unless ``case_sensitive`` is set both sides are lowercased first.
    Candidates are checked in order (name before aliases) and the first hit
    wins.

    Args:
        query: String to match
        candidates: List of candidates to match against
        case_sensitive: Whether to use case-sensitive matching

    Returns:
        Canonical ID if exact match found, None otherwise
    """

    def comparable(text: str) -> str:
        """Apply the case/whitespace folding used for both query and labels."""
        return text.strip() if case_sensitive else text.lower().strip()

    wanted = comparable(query)

    for entry in candidates:
        # `or` short-circuits, so aliases are only consulted when the
        # primary name does not match.
        if comparable(entry.name) == wanted or any(
            comparable(alias) == wanted for alias in entry.aliases
        ):
            return entry.canonical_id

    return None
def best_match(
    query: str,
    candidates: list[MatchCandidate],
    threshold: int = FUZZY_MATCH_THRESHOLD,
) -> Optional[FuzzyMatch]:
    """Find the best match for a query string.

    First tries exact match, then falls back to fuzzy matching.

    Args:
        query: String to match
        candidates: List of candidates
        threshold: Minimum fuzzy match score (not applied to exact matches,
            which are always reported with confidence 100)

    Returns:
        Best FuzzyMatch or None if no match above threshold
    """
    # Try exact match first
    exact = exact_match(query, candidates)
    # exact_match signals "no match" with None; compare against None
    # explicitly so that a falsy canonical ID (e.g. "") still counts as a hit
    # instead of silently falling through to fuzzy matching.
    if exact is not None:
        # Find the name for this ID (first candidate carrying it)
        for c in candidates:
            if c.canonical_id == exact:
                return FuzzyMatch(
                    canonical_id=exact,
                    canonical_name=c.name,
                    confidence=100,
                )

    # Fall back to fuzzy matching
    # Use team matching by default (works for both)
    matches = fuzzy_match_team(query, candidates, threshold=threshold, top_n=1)
    return matches[0] if matches else None
def calculate_similarity(s1: str, s2: str) -> int:
    """Calculate similarity between two strings.

    Both strings are normalized with ``normalize_for_matching`` and compared
    with rapidfuzz's token set ratio.

    Args:
        s1: First string
        s2: Second string

    Returns:
        Similarity score 0-100, truncated to an integer
    """
    s1_norm = normalize_for_matching(s1)
    s2_norm = normalize_for_matching(s2)
    # rapidfuzz returns a float; truncate so the value matches the declared
    # int return type and the int() convention used by the other scorers in
    # this module. Comparisons against integer thresholds are unaffected.
    return int(fuzz.token_set_ratio(s1_norm, s2_norm))