"""Fuzzy string matching utilities for team and stadium name resolution.""" from dataclasses import dataclass from typing import Optional from rapidfuzz import fuzz, process from rapidfuzz.utils import default_process from ..config import FUZZY_MATCH_THRESHOLD from ..models.aliases import FuzzyMatch @dataclass class MatchCandidate: """A candidate for fuzzy matching. Attributes: canonical_id: The canonical ID of this candidate name: The display name for this candidate aliases: List of alternative names to match against """ canonical_id: str name: str aliases: list[str] def normalize_for_matching(s: str) -> str: """Normalize a string for fuzzy matching. - Convert to lowercase - Remove common prefixes/suffixes - Collapse whitespace Args: s: String to normalize Returns: Normalized string """ result = s.lower().strip() # Remove common prefixes prefixes = ["the ", "team ", "stadium "] for prefix in prefixes: if result.startswith(prefix): result = result[len(prefix) :] # Remove common suffixes suffixes = [" stadium", " arena", " center", " field", " park"] for suffix in suffixes: if result.endswith(suffix): result = result[: -len(suffix)] return result.strip() def fuzzy_match_team( query: str, candidates: list[MatchCandidate], threshold: int = FUZZY_MATCH_THRESHOLD, top_n: int = 3, ) -> list[FuzzyMatch]: """Find fuzzy matches for a team name. Uses multiple matching strategies: 1. Token set ratio (handles word order differences) 2. Partial ratio (handles substring matches) 3. Standard ratio (overall similarity) Args: query: Team name to match candidates: List of candidate teams to match against threshold: Minimum score to consider a match (0-100) top_n: Maximum number of matches to return Returns: List of FuzzyMatch objects sorted by confidence (descending) """ query_norm = normalize_for_matching(query) # Build list of all matchable strings with their canonical IDs match_strings: list[tuple[str, str, str]] = [] # (string, canonical_id, name) for candidate in candidates: # Add primary name match_strings.append( (normalize_for_matching(candidate.name), candidate.canonical_id, candidate.name) ) # Add aliases for alias in candidate.aliases: match_strings.append( (normalize_for_matching(alias), candidate.canonical_id, candidate.name) ) # Score all candidates scored: dict[str, tuple[int, str]] = {} # canonical_id -> (best_score, name) for match_str, canonical_id, name in match_strings: # Use multiple scoring methods token_score = fuzz.token_set_ratio(query_norm, match_str) partial_score = fuzz.partial_ratio(query_norm, match_str) ratio_score = fuzz.ratio(query_norm, match_str) # Weighted average favoring token_set_ratio for team names score = int(0.5 * token_score + 0.3 * partial_score + 0.2 * ratio_score) # Keep best score for each canonical ID if canonical_id not in scored or score > scored[canonical_id][0]: scored[canonical_id] = (score, name) # Filter by threshold and sort matches = [ FuzzyMatch(canonical_id=cid, canonical_name=name, confidence=score) for cid, (score, name) in scored.items() if score >= threshold ] # Sort by confidence descending matches.sort(key=lambda m: m.confidence, reverse=True) return matches[:top_n] def fuzzy_match_stadium( query: str, candidates: list[MatchCandidate], threshold: int = FUZZY_MATCH_THRESHOLD, top_n: int = 3, ) -> list[FuzzyMatch]: """Find fuzzy matches for a stadium name. Uses matching strategies optimized for stadium names: 1. Token sort ratio (handles "X Stadium" vs "Stadium X") 2. Partial ratio (handles naming rights changes) 3. Standard ratio Args: query: Stadium name to match candidates: List of candidate stadiums to match against threshold: Minimum score to consider a match (0-100) top_n: Maximum number of matches to return Returns: List of FuzzyMatch objects sorted by confidence (descending) """ query_norm = normalize_for_matching(query) # Build list of all matchable strings match_strings: list[tuple[str, str, str]] = [] for candidate in candidates: match_strings.append( (normalize_for_matching(candidate.name), candidate.canonical_id, candidate.name) ) for alias in candidate.aliases: match_strings.append( (normalize_for_matching(alias), candidate.canonical_id, candidate.name) ) # Score all candidates scored: dict[str, tuple[int, str]] = {} for match_str, canonical_id, name in match_strings: # Use scoring methods suited for stadium names token_sort_score = fuzz.token_sort_ratio(query_norm, match_str) partial_score = fuzz.partial_ratio(query_norm, match_str) ratio_score = fuzz.ratio(query_norm, match_str) # Weighted average score = int(0.4 * token_sort_score + 0.4 * partial_score + 0.2 * ratio_score) if canonical_id not in scored or score > scored[canonical_id][0]: scored[canonical_id] = (score, name) # Filter and sort matches = [ FuzzyMatch(canonical_id=cid, canonical_name=name, confidence=score) for cid, (score, name) in scored.items() if score >= threshold ] matches.sort(key=lambda m: m.confidence, reverse=True) return matches[:top_n] def exact_match( query: str, candidates: list[MatchCandidate], case_sensitive: bool = False, ) -> Optional[str]: """Find an exact match for a string. Args: query: String to match candidates: List of candidates to match against case_sensitive: Whether to use case-sensitive matching Returns: Canonical ID if exact match found, None otherwise """ if case_sensitive: query_norm = query.strip() else: query_norm = query.lower().strip() for candidate in candidates: # Check primary name name = candidate.name if case_sensitive else candidate.name.lower() if query_norm == name.strip(): return candidate.canonical_id # Check aliases for alias in candidate.aliases: alias_norm = alias if case_sensitive else alias.lower() if query_norm == alias_norm.strip(): return candidate.canonical_id return None def best_match( query: str, candidates: list[MatchCandidate], threshold: int = FUZZY_MATCH_THRESHOLD, ) -> Optional[FuzzyMatch]: """Find the best match for a query string. First tries exact match, then falls back to fuzzy matching. Args: query: String to match candidates: List of candidates threshold: Minimum fuzzy match score Returns: Best FuzzyMatch or None if no match above threshold """ # Try exact match first exact = exact_match(query, candidates) if exact: # Find the name for this ID for c in candidates: if c.canonical_id == exact: return FuzzyMatch( canonical_id=exact, canonical_name=c.name, confidence=100, ) # Fall back to fuzzy matching # Use team matching by default (works for both) matches = fuzzy_match_team(query, candidates, threshold=threshold, top_n=1) return matches[0] if matches else None def calculate_similarity(s1: str, s2: str) -> int: """Calculate similarity between two strings. Args: s1: First string s2: Second string Returns: Similarity score 0-100 """ s1_norm = normalize_for_matching(s1) s2_norm = normalize_for_matching(s2) return fuzz.token_set_ratio(s1_norm, s2_norm)