Files
Sportstime/Scripts/sportstime_parser/normalizers/fuzzy.py
Trey t eeaf900e5a feat(scripts): rewrite parser as modular Python CLI
Replace monolithic scraping scripts with sportstime_parser package:

- Multi-source scrapers with automatic fallback for 7 sports
- Canonical ID generation for games, teams, and stadiums
- Fuzzy matching with configurable thresholds for name resolution
- CloudKit Web Services uploader with JWT auth, diff-based updates
- Resumable uploads with checkpoint state persistence
- Validation reports with manual review items and suggested matches
- Comprehensive test suite (249 tests)

CLI: sportstime-parser scrape|validate|upload|status|retry|clear

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 21:06:12 -06:00

273 lines
8.0 KiB
Python

"""Fuzzy string matching utilities for team and stadium name resolution."""
from dataclasses import dataclass
from typing import Optional
from rapidfuzz import fuzz, process
from rapidfuzz.utils import default_process
from ..config import FUZZY_MATCH_THRESHOLD
from ..models.aliases import FuzzyMatch
@dataclass
class MatchCandidate:
    """One resolvable entity that a query string can be matched against.

    Attributes:
        canonical_id: Identifier reported when this candidate matches.
        name: Primary display name; also used as a match target.
        aliases: Additional names compared alongside ``name``.
    """

    canonical_id: str
    name: str
    aliases: list[str]
def normalize_for_matching(s: str) -> str:
    """Normalize a string for fuzzy matching.

    Steps, in order:

    1. Lowercase the input.
    2. Collapse every run of whitespace to a single space and drop
       leading/trailing whitespace.
    3. Strip the prefixes ``"the "``, ``"team "``, ``"stadium "`` (each
       checked once, in that order).
    4. Strip the suffixes ``" stadium"``, ``" arena"``, ``" center"``,
       ``" field"``, ``" park"`` (each checked once, in that order).

    Args:
        s: String to normalize.

    Returns:
        The normalized string (may be empty).
    """
    # Collapse whitespace first so that inputs such as "The  Yankees" or
    # "Fenway\tPark" still line up with the single-space affixes below.
    result = " ".join(s.lower().split())

    # Each affix is tried exactly once, in sequence, so "the team x"
    # loses both "the " and "team ".
    for prefix in ("the ", "team ", "stadium "):
        result = result.removeprefix(prefix)

    for suffix in (" stadium", " arena", " center", " field", " park"):
        result = result.removesuffix(suffix)

    return result
def fuzzy_match_team(
    query: str,
    candidates: list[MatchCandidate],
    threshold: int = FUZZY_MATCH_THRESHOLD,
    top_n: int = 3,
) -> list[FuzzyMatch]:
    """Rank candidate teams by similarity to ``query``.

    The query and every candidate name/alias are passed through
    ``normalize_for_matching``. Each pair is scored as a weighted blend of
    three rapidfuzz scorers, truncated to an integer:

    * 0.5 x ``fuzz.token_set_ratio``
    * 0.3 x ``fuzz.partial_ratio``
    * 0.2 x ``fuzz.ratio``

    For each canonical ID only the highest-scoring name or alias is kept.

    Args:
        query: Team name to match.
        candidates: Candidate teams to match against.
        threshold: Minimum blended score for a candidate to be returned.
        top_n: Maximum number of matches to return.

    Returns:
        Up to ``top_n`` FuzzyMatch objects with score >= ``threshold``,
        ordered by confidence descending (ties keep first-seen order).
    """
    needle = normalize_for_matching(query)

    # canonical_id -> (best blended score so far, display name that earned it)
    best_by_id: dict[str, tuple[int, str]] = {}
    for candidate in candidates:
        # The primary name is scored first, followed by each alias.
        for label in [candidate.name, *candidate.aliases]:
            haystack = normalize_for_matching(label)
            blended = int(
                0.5 * fuzz.token_set_ratio(needle, haystack)
                + 0.3 * fuzz.partial_ratio(needle, haystack)
                + 0.2 * fuzz.ratio(needle, haystack)
            )
            current = best_by_id.get(candidate.canonical_id)
            # Strictly-greater comparison: on a tie the earlier entry wins.
            if current is None or blended > current[0]:
                best_by_id[candidate.canonical_id] = (blended, candidate.name)

    qualifying = [
        FuzzyMatch(canonical_id=cid, canonical_name=display, confidence=points)
        for cid, (points, display) in best_by_id.items()
        if points >= threshold
    ]
    ranked = sorted(qualifying, key=lambda match: match.confidence, reverse=True)
    return ranked[:top_n]
def fuzzy_match_stadium(
    query: str,
    candidates: list[MatchCandidate],
    threshold: int = FUZZY_MATCH_THRESHOLD,
    top_n: int = 3,
) -> list[FuzzyMatch]:
    """Rank candidate stadiums by similarity to ``query``.

    The query and every candidate name/alias are passed through
    ``normalize_for_matching``. Each pair is scored as a weighted blend of
    three rapidfuzz scorers, truncated to an integer:

    * 0.4 x ``fuzz.token_sort_ratio``
    * 0.4 x ``fuzz.partial_ratio``
    * 0.2 x ``fuzz.ratio``

    For each canonical ID only the highest-scoring name or alias is kept.

    Args:
        query: Stadium name to match.
        candidates: Candidate stadiums to match against.
        threshold: Minimum blended score for a candidate to be returned.
        top_n: Maximum number of matches to return.

    Returns:
        Up to ``top_n`` FuzzyMatch objects with score >= ``threshold``,
        ordered by confidence descending (ties keep first-seen order).
    """
    needle = normalize_for_matching(query)

    # canonical_id -> (best blended score so far, display name that earned it)
    best_by_id: dict[str, tuple[int, str]] = {}
    for candidate in candidates:
        cid = candidate.canonical_id
        # The primary name is scored first, followed by each alias.
        for label in [candidate.name, *candidate.aliases]:
            haystack = normalize_for_matching(label)
            sort_part = fuzz.token_sort_ratio(needle, haystack)
            partial_part = fuzz.partial_ratio(needle, haystack)
            plain_part = fuzz.ratio(needle, haystack)
            blended = int(0.4 * sort_part + 0.4 * partial_part + 0.2 * plain_part)

            # Skip unless this beats what is already recorded for the ID;
            # on a tie the earlier entry is kept.
            if cid in best_by_id and blended <= best_by_id[cid][0]:
                continue
            best_by_id[cid] = (blended, candidate.name)

    qualifying = [
        FuzzyMatch(canonical_id=key, canonical_name=display, confidence=points)
        for key, (points, display) in best_by_id.items()
        if points >= threshold
    ]
    ranked = sorted(qualifying, key=lambda match: match.confidence, reverse=True)
    return ranked[:top_n]
def exact_match(
    query: str,
    candidates: list[MatchCandidate],
    case_sensitive: bool = False,
) -> Optional[str]:
    """Return the canonical ID of the first candidate that equals ``query``.

    Comparison ignores leading/trailing whitespace on both sides and, unless
    ``case_sensitive`` is set, compares lowercased text. For each candidate
    the primary name is checked before its aliases.

    Args:
        query: String to match.
        candidates: Candidates to match against, checked in order.
        case_sensitive: Whether to compare without lowercasing.

    Returns:
        Canonical ID of the first matching candidate, or None if none match.
    """

    def comparable(text: str) -> str:
        # Lowercase (when requested) before stripping, then trim the ends.
        return text.strip() if case_sensitive else text.lower().strip()

    target = comparable(query)
    for candidate in candidates:
        # ``or`` short-circuits, so aliases are only inspected when the
        # primary name did not match.
        if comparable(candidate.name) == target or any(
            comparable(alias) == target for alias in candidate.aliases
        ):
            return candidate.canonical_id
    return None
def best_match(
    query: str,
    candidates: list[MatchCandidate],
    threshold: int = FUZZY_MATCH_THRESHOLD,
) -> Optional[FuzzyMatch]:
    """Resolve ``query`` to a single candidate, preferring exact matches.

    ``exact_match`` is tried first (case-insensitive); a hit is reported with
    confidence 100. Otherwise the top result of ``fuzzy_match_team`` is
    returned, if any candidate reaches ``threshold``.

    Args:
        query: String to match.
        candidates: Candidates to match against.
        threshold: Minimum fuzzy match score.

    Returns:
        The best FuzzyMatch, or None if nothing matches at or above
        ``threshold``.
    """
    matched_id = exact_match(query, candidates)
    if matched_id:
        # exact_match only returns the ID; look up the first candidate
        # carrying it to obtain the display name.
        owner = next((c for c in candidates if c.canonical_id == matched_id), None)
        if owner is not None:
            return FuzzyMatch(
                canonical_id=matched_id,
                canonical_name=owner.name,
                confidence=100,
            )

    # No exact hit: the team scorer is used as the general-purpose fallback.
    ranked = fuzzy_match_team(query, candidates, threshold=threshold, top_n=1)
    if not ranked:
        return None
    return ranked[0]
def calculate_similarity(s1: str, s2: str) -> int:
    """Calculate similarity between two strings.

    Both strings are passed through ``normalize_for_matching`` and compared
    with rapidfuzz's ``fuzz.token_set_ratio``.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        Similarity score as an integer in the range 0-100.
    """
    score = fuzz.token_set_ratio(
        normalize_for_matching(s1),
        normalize_for_matching(s2),
    )
    # rapidfuzz scorers return floats; truncate to int so the declared
    # return type holds, matching how fuzzy_match_team/fuzzy_match_stadium
    # convert their scores.
    return int(score)