feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
272
sportstime_parser/normalizers/fuzzy.py
Normal file
272
sportstime_parser/normalizers/fuzzy.py
Normal file
@@ -0,0 +1,272 @@
|
||||
"""Fuzzy string matching utilities for team and stadium name resolution."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
from rapidfuzz import fuzz, process
|
||||
from rapidfuzz.utils import default_process
|
||||
|
||||
from ..config import FUZZY_MATCH_THRESHOLD
|
||||
from ..models.aliases import FuzzyMatch
|
||||
|
||||
|
||||
@dataclass
class MatchCandidate:
    """One entity (team or stadium) that a query string can be resolved to.

    The matchers in this module compare a query against ``name`` and against
    every entry of ``aliases``; whichever string matches, the result is
    reported under ``canonical_id``.
    """

    # Stable identifier returned to callers when this candidate matches.
    canonical_id: str
    # Primary display name; also reported as the matched canonical name.
    name: str
    # Additional spellings/abbreviations compared alongside ``name``.
    aliases: list[str]
|
||||
|
||||
|
||||
def normalize_for_matching(s: str) -> str:
    """Normalize a string for fuzzy matching.

    Steps, in order:

    1. Lowercase the text and collapse every run of whitespace (spaces, tabs,
       newlines) to a single space, dropping leading/trailing whitespace.
    2. Strip the leading words "the", "team" and "stadium" (checked in that
       order, each at most once).
    3. Strip the trailing words "stadium", "arena", "center", "field" and
       "park" (checked in that order, each at most once).

    Args:
        s: String to normalize.

    Returns:
        The normalized string. It is empty only when ``s`` contains no
        non-whitespace characters.
    """
    # Collapse whitespace *before* the prefix/suffix passes. Otherwise an
    # input such as "the  stadium" leaves " stadium" behind after the prefix
    # is removed, which the suffix pass then erases entirely.
    result = " ".join(s.lower().split())

    # Each prefix carries its trailing space so only whole leading words are
    # removed ("theater" is left alone).
    for prefix in ("the ", "team ", "stadium "):
        result = result.removeprefix(prefix)

    # Each suffix carries its leading space so only whole trailing words are
    # removed, and a bare "stadium" is never reduced to an empty string.
    for suffix in (" stadium", " arena", " center", " field", " park"):
        result = result.removesuffix(suffix)

    # Whitespace is already single-spaced, so neither pass can leave a
    # dangling space behind; no final strip is needed.
    return result
|
||||
|
||||
|
||||
def fuzzy_match_team(
    query: str,
    candidates: list[MatchCandidate],
    threshold: int = FUZZY_MATCH_THRESHOLD,
    top_n: int = 3,
) -> list[FuzzyMatch]:
    """Rank candidate teams by fuzzy similarity to ``query``.

    The query, each candidate's name and each of its aliases are passed
    through ``normalize_for_matching``. Every (query, candidate string) pair
    gets a blended score, truncated to an int:

        0.5 * token_set_ratio + 0.3 * partial_ratio + 0.2 * ratio

    A canonical ID keeps the highest blended score seen across its name and
    aliases.

    Args:
        query: Team name to resolve.
        candidates: Teams to compare against.
        threshold: Minimum blended score (0-100) for a candidate to be kept.
        top_n: Maximum number of matches to return.

    Returns:
        Up to ``top_n`` FuzzyMatch objects, highest confidence first. Ties
        keep the order in which their canonical IDs were first encountered.
    """
    needle = normalize_for_matching(query)

    # canonical_id -> (best blended score so far, display name)
    best_by_id: dict[str, tuple[int, str]] = {}

    for entry in candidates:
        # The primary name is scored first, then each alias in order.
        for label in (entry.name, *entry.aliases):
            haystack = normalize_for_matching(label)
            blended = int(
                0.5 * fuzz.token_set_ratio(needle, haystack)
                + 0.3 * fuzz.partial_ratio(needle, haystack)
                + 0.2 * fuzz.ratio(needle, haystack)
            )
            previous = best_by_id.get(entry.canonical_id)
            # Strictly greater: on a tie the earlier entry is retained.
            if previous is None or blended > previous[0]:
                best_by_id[entry.canonical_id] = (blended, entry.name)

    qualifying = [
        FuzzyMatch(canonical_id=cid, canonical_name=display, confidence=points)
        for cid, (points, display) in best_by_id.items()
        if points >= threshold
    ]

    # sorted() is stable, so equal confidences keep their insertion order.
    ranked = sorted(qualifying, key=lambda match: match.confidence, reverse=True)
    return ranked[:top_n]
|
||||
|
||||
|
||||
def fuzzy_match_stadium(
    query: str,
    candidates: list[MatchCandidate],
    threshold: int = FUZZY_MATCH_THRESHOLD,
    top_n: int = 3,
) -> list[FuzzyMatch]:
    """Rank candidate stadiums by fuzzy similarity to ``query``.

    The query, each candidate's name and each of its aliases are passed
    through ``normalize_for_matching``. Every (query, candidate string) pair
    gets a blended score, truncated to an int:

        0.4 * token_sort_ratio + 0.4 * partial_ratio + 0.2 * ratio

    A canonical ID keeps the highest blended score seen across its name and
    aliases.

    Args:
        query: Stadium name to resolve.
        candidates: Stadiums to compare against.
        threshold: Minimum blended score (0-100) for a candidate to be kept.
        top_n: Maximum number of matches to return.

    Returns:
        Up to ``top_n`` FuzzyMatch objects, highest confidence first. Ties
        keep the order in which their canonical IDs were first encountered.
    """
    needle = normalize_for_matching(query)

    # canonical_id -> (best blended score so far, display name)
    best_by_id: dict[str, tuple[int, str]] = {}

    for entry in candidates:
        # The primary name is scored first, then each alias in order.
        for label in (entry.name, *entry.aliases):
            haystack = normalize_for_matching(label)
            blended = int(
                0.4 * fuzz.token_sort_ratio(needle, haystack)
                + 0.4 * fuzz.partial_ratio(needle, haystack)
                + 0.2 * fuzz.ratio(needle, haystack)
            )
            previous = best_by_id.get(entry.canonical_id)
            # Strictly greater: on a tie the earlier entry is retained.
            if previous is None or blended > previous[0]:
                best_by_id[entry.canonical_id] = (blended, entry.name)

    qualifying = [
        FuzzyMatch(canonical_id=cid, canonical_name=display, confidence=points)
        for cid, (points, display) in best_by_id.items()
        if points >= threshold
    ]

    # sorted() is stable, so equal confidences keep their insertion order.
    ranked = sorted(qualifying, key=lambda match: match.confidence, reverse=True)
    return ranked[:top_n]
|
||||
|
||||
|
||||
def exact_match(
|
||||
query: str,
|
||||
candidates: list[MatchCandidate],
|
||||
case_sensitive: bool = False,
|
||||
) -> Optional[str]:
|
||||
"""Find an exact match for a string.
|
||||
|
||||
Args:
|
||||
query: String to match
|
||||
candidates: List of candidates to match against
|
||||
case_sensitive: Whether to use case-sensitive matching
|
||||
|
||||
Returns:
|
||||
Canonical ID if exact match found, None otherwise
|
||||
"""
|
||||
if case_sensitive:
|
||||
query_norm = query.strip()
|
||||
else:
|
||||
query_norm = query.lower().strip()
|
||||
|
||||
for candidate in candidates:
|
||||
# Check primary name
|
||||
name = candidate.name if case_sensitive else candidate.name.lower()
|
||||
if query_norm == name.strip():
|
||||
return candidate.canonical_id
|
||||
|
||||
# Check aliases
|
||||
for alias in candidate.aliases:
|
||||
alias_norm = alias if case_sensitive else alias.lower()
|
||||
if query_norm == alias_norm.strip():
|
||||
return candidate.canonical_id
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def best_match(
    query: str,
    candidates: list[MatchCandidate],
    threshold: int = FUZZY_MATCH_THRESHOLD,
) -> Optional[FuzzyMatch]:
    """Find the single best match for a query string.

    An exact match (case-insensitive, via ``exact_match``) is tried first and
    reported with confidence 100. Otherwise the top result of
    ``fuzzy_match_team`` is returned; the team scorer is used for every
    candidate list, including stadiums.

    Args:
        query: String to match.
        candidates: Candidates to match against.
        threshold: Minimum fuzzy score for the fallback match.

    Returns:
        The best FuzzyMatch, or None if there is neither an exact match nor a
        fuzzy match at or above ``threshold``.
    """
    exact_id = exact_match(query, candidates)

    # Compare against None explicitly: exact_match returns Optional[str], and
    # a plain truth test would discard a valid hit whose canonical ID is "".
    if exact_id is not None:
        # exact_match only returns the ID, so look up the display name from
        # the first candidate carrying that ID.
        for candidate in candidates:
            if candidate.canonical_id == exact_id:
                return FuzzyMatch(
                    canonical_id=exact_id,
                    canonical_name=candidate.name,
                    confidence=100,
                )

    # Fall back to fuzzy matching, keeping only the top-ranked result.
    matches = fuzzy_match_team(query, candidates, threshold=threshold, top_n=1)
    return matches[0] if matches else None
|
||||
|
||||
|
||||
def calculate_similarity(s1: str, s2: str) -> int:
    """Calculate the similarity between two strings.

    Both strings are passed through ``normalize_for_matching`` and compared
    with ``fuzz.token_set_ratio``.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        Similarity score as an integer from 0 to 100.
    """
    s1_norm = normalize_for_matching(s1)
    s2_norm = normalize_for_matching(s2)

    # rapidfuzz documents its scorers as returning floats; truncate so the
    # result honours the declared ``int`` return type, the same way the
    # blended scores elsewhere in this module are produced. Truncation does
    # not change the outcome of ``score >= integer_threshold`` checks.
    return int(fuzz.token_set_ratio(s1_norm, s2_norm))
|
||||
Reference in New Issue
Block a user