# SportstimeAPI/sportstime_parser/normalizers/fuzzy.py

"""Fuzzy string matching utilities for team and stadium name resolution."""

from dataclasses import dataclass
from typing import Optional

from rapidfuzz import fuzz, process
from rapidfuzz.utils import default_process

from ..config import FUZZY_MATCH_THRESHOLD
from ..models.aliases import FuzzyMatch


@dataclass
class MatchCandidate:
    """A single entity that a query string can be resolved to.

    The matching functions in this module compare a query against ``name``
    and against every entry in ``aliases``, and report a hit using
    ``canonical_id``.

    Attributes:
        canonical_id: The canonical ID reported when this candidate matches
        name: The display name; also used as a matchable string
        aliases: List of alternative names to match against
    """

    canonical_id: str
    name: str
    aliases: list[str]


def normalize_for_matching(s: str) -> str:
    """Normalize a string for fuzzy matching.

    - Convert to lowercase
    - Collapse every run of whitespace into one space and trim both ends
    - Remove common prefixes ("the ", "team ", "stadium ")
    - Remove common suffixes (" stadium", " arena", " center", " field",
      " park")

    Each prefix and each suffix is checked exactly once, in the order
    listed, so not every stacked combination of affixes is fully stripped.

    Args:
        s: String to normalize

    Returns:
        Normalized string
    """
    # split() with no argument splits on any whitespace run (spaces, tabs,
    # newlines) and drops leading/trailing whitespace, so joining with a
    # single space both collapses and trims in one step.
    result = " ".join(s.lower().split())

    # Remove common prefixes
    for prefix in ("the ", "team ", "stadium "):
        result = result.removeprefix(prefix)

    # Remove common suffixes
    for suffix in (" stadium", " arena", " center", " field", " park"):
        result = result.removesuffix(suffix)

    return result.strip()


def fuzzy_match_team(
    query: str,
    candidates: list[MatchCandidate],
    threshold: int = FUZZY_MATCH_THRESHOLD,
    top_n: int = 3,
) -> list[FuzzyMatch]:
    """Rank candidate teams by how closely they resemble a team name.

    The query, each candidate name and each alias are normalized, then
    compared with three rapidfuzz scorers blended into one score:

    - 50% token set ratio (handles word order differences)
    - 30% partial ratio (handles substring matches)
    - 20% standard ratio (overall similarity)

    The blend is truncated to an integer. A candidate's score is the best
    one achieved by its name or any of its aliases.

    Args:
        query: Team name to match
        candidates: List of candidate teams to match against
        threshold: Minimum score to consider a match (0-100)
        top_n: Maximum number of matches to return

    Returns:
        List of FuzzyMatch objects sorted by confidence (descending)
    """
    needle = normalize_for_matching(query)

    # canonical_id -> (best blended score so far, display name)
    best_by_id: dict[str, tuple[int, str]] = {}

    for candidate in candidates:
        # The primary name is scored first, then each alias in order.
        for label in [candidate.name, *candidate.aliases]:
            haystack = normalize_for_matching(label)
            blended = int(
                0.5 * fuzz.token_set_ratio(needle, haystack)
                + 0.3 * fuzz.partial_ratio(needle, haystack)
                + 0.2 * fuzz.ratio(needle, haystack)
            )

            known = best_by_id.get(candidate.canonical_id)
            if known is None or blended > known[0]:
                best_by_id[candidate.canonical_id] = (blended, candidate.name)

    # Drop anything under the threshold, then order best-first.
    ranked = sorted(
        (
            FuzzyMatch(canonical_id=cid, canonical_name=display, confidence=points)
            for cid, (points, display) in best_by_id.items()
            if points >= threshold
        ),
        key=lambda hit: hit.confidence,
        reverse=True,
    )

    return ranked[:top_n]


def fuzzy_match_stadium(
    query: str,
    candidates: list[MatchCandidate],
    threshold: int = FUZZY_MATCH_THRESHOLD,
    top_n: int = 3,
) -> list[FuzzyMatch]:
    """Rank candidate stadiums by how closely they resemble a stadium name.

    The query, each candidate name and each alias are normalized, then
    compared with three rapidfuzz scorers blended into one score:

    - 40% token sort ratio (handles "X Stadium" vs "Stadium X")
    - 40% partial ratio (handles naming rights changes)
    - 20% standard ratio

    The blend is truncated to an integer. A candidate's score is the best
    one achieved by its name or any of its aliases.

    Args:
        query: Stadium name to match
        candidates: List of candidate stadiums to match against
        threshold: Minimum score to consider a match (0-100)
        top_n: Maximum number of matches to return

    Returns:
        List of FuzzyMatch objects sorted by confidence (descending)
    """
    target = normalize_for_matching(query)

    # canonical_id -> (highest score seen, display name)
    top_scores: dict[str, tuple[int, str]] = {}

    for venue in candidates:
        venue_id = venue.canonical_id
        # Primary name first, followed by the aliases in their given order.
        for spelling in [venue.name, *venue.aliases]:
            text = normalize_for_matching(spelling)

            sort_part = 0.4 * fuzz.token_sort_ratio(target, text)
            partial_part = 0.4 * fuzz.partial_ratio(target, text)
            ratio_part = 0.2 * fuzz.ratio(target, text)
            total = int(sort_part + partial_part + ratio_part)

            if venue_id in top_scores and total <= top_scores[venue_id][0]:
                continue
            top_scores[venue_id] = (total, venue.name)

    # Keep only scores at or above the threshold, best first.
    hits = []
    for venue_id, (total, display_name) in top_scores.items():
        if total >= threshold:
            hits.append(
                FuzzyMatch(
                    canonical_id=venue_id,
                    canonical_name=display_name,
                    confidence=total,
                )
            )
    hits.sort(key=lambda hit: hit.confidence, reverse=True)

    return hits[:top_n]


def exact_match(
    query: str,
    candidates: list[MatchCandidate],
    case_sensitive: bool = False,
) -> Optional[str]:
    """Look up the first candidate whose name or alias equals the query.

    Surrounding whitespace is ignored on both sides of the comparison.
    Letter case is ignored as well unless ``case_sensitive`` is set.
    Candidates are checked in order, and within a candidate the primary
    name is checked before its aliases.

    Args:
        query: String to match
        candidates: List of candidates to match against
        case_sensitive: Whether to use case-sensitive matching

    Returns:
        Canonical ID if exact match found, None otherwise
    """

    def _comparable(text: str) -> str:
        # Apply the same case handling and trimming to every compared string.
        folded = text if case_sensitive else text.lower()
        return folded.strip()

    wanted = _comparable(query)

    for candidate in candidates:
        labels = (candidate.name, *candidate.aliases)
        if any(_comparable(label) == wanted for label in labels):
            return candidate.canonical_id

    return None


def best_match(
    query: str,
    candidates: list[MatchCandidate],
    threshold: int = FUZZY_MATCH_THRESHOLD,
) -> Optional[FuzzyMatch]:
    """Find the best match for a query string.

    First tries exact match, then falls back to fuzzy matching. An exact
    match is reported with confidence 100 and is not subject to
    ``threshold``.

    Args:
        query: String to match
        candidates: List of candidates
        threshold: Minimum fuzzy match score

    Returns:
        Best FuzzyMatch or None if no match above threshold
    """
    # Try exact match first
    exact_id = exact_match(query, candidates)

    # Compare against None explicitly: exact_match signals "not found" with
    # None, and a falsy canonical ID such as "" is still a genuine hit.
    if exact_id is not None:
        # Find the display name for this ID (first candidate carrying it)
        for candidate in candidates:
            if candidate.canonical_id == exact_id:
                return FuzzyMatch(
                    canonical_id=exact_id,
                    canonical_name=candidate.name,
                    confidence=100,
                )

    # Fall back to fuzzy matching
    # Use team matching by default (works for both)
    matches = fuzzy_match_team(query, candidates, threshold=threshold, top_n=1)

    return matches[0] if matches else None


def calculate_similarity(s1: str, s2: str) -> int:
    """Calculate similarity between two strings.

    Both strings are normalized with ``normalize_for_matching`` and then
    compared using rapidfuzz's token set ratio.

    Args:
        s1: First string
        s2: Second string

    Returns:
        Similarity score 0-100, truncated to an integer
    """
    s1_norm = normalize_for_matching(s1)
    s2_norm = normalize_for_matching(s2)

    # rapidfuzz scorers return floats; truncate with int() so the value
    # matches the declared return type, as the fuzzy_match_* functions in
    # this module do for their scores.
    return int(fuzz.token_set_ratio(s1_norm, s2_norm))