feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan are complete:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
194
sportstime_parser/tests/test_fuzzy.py
Normal file
194
sportstime_parser/tests/test_fuzzy.py
Normal file
@@ -0,0 +1,194 @@
|
||||
"""Tests for fuzzy string matching utilities."""
|
||||
|
||||
import pytest
|
||||
|
||||
from sportstime_parser.normalizers.fuzzy import (
|
||||
normalize_for_matching,
|
||||
fuzzy_match_team,
|
||||
fuzzy_match_stadium,
|
||||
exact_match,
|
||||
best_match,
|
||||
calculate_similarity,
|
||||
MatchCandidate,
|
||||
)
|
||||
|
||||
|
||||
class TestNormalizeForMatching:
    """Checks for the ``normalize_for_matching`` helper."""

    def test_basic_normalization(self):
        """Mixed-case and space-padded names come back lowercased and trimmed."""
        cases = [
            ("Los Angeles Lakers", "los angeles lakers"),
            (" Boston Celtics ", "boston celtics"),
        ]
        for raw, expected in cases:
            assert normalize_for_matching(raw) == expected

    def test_removes_common_prefixes(self):
        """A leading "The" or "Team" is dropped from the normalized form."""
        cases = [
            ("The Boston Celtics", "boston celtics"),
            ("Team Lakers", "lakers"),
        ]
        for raw, expected in cases:
            assert normalize_for_matching(raw) == expected

    def test_removes_stadium_suffixes(self):
        """Trailing venue words (Park, Arena, Field, Center) are dropped."""
        cases = [
            ("Fenway Park", "fenway"),
            ("Madison Square Garden Arena", "madison square garden"),
            ("Wrigley Field", "wrigley"),
            ("TD Garden Center", "td garden"),
        ]
        for raw, expected in cases:
            assert normalize_for_matching(raw) == expected
|
||||
|
||||
|
||||
class TestExactMatch:
    """Checks for the ``exact_match`` lookup."""

    def test_exact_match_primary_name(self):
        """A candidate's full name resolves to that candidate's ID."""
        lakers = MatchCandidate("nba_lal", "Los Angeles Lakers", ["Lakers", "LAL"])
        celtics = MatchCandidate("nba_bos", "Boston Celtics", ["Celtics", "BOS"])
        pool = [lakers, celtics]

        assert exact_match("Los Angeles Lakers", pool) == "nba_lal"
        assert exact_match("Boston Celtics", pool) == "nba_bos"

    def test_exact_match_alias(self):
        """Each listed alias resolves to the owning candidate's ID."""
        lakers = MatchCandidate("nba_lal", "Los Angeles Lakers", ["Lakers", "LAL"])
        pool = [lakers]

        assert exact_match("Lakers", pool) == "nba_lal"
        assert exact_match("LAL", pool) == "nba_lal"

    def test_case_insensitive(self):
        """Letter case of the query does not affect the lookup."""
        lakers = MatchCandidate("nba_lal", "Los Angeles Lakers", ["Lakers"])
        pool = [lakers]

        # Lowercased primary name and uppercased alias must both resolve.
        assert exact_match("los angeles lakers", pool) == "nba_lal"
        assert exact_match("LAKERS", pool) == "nba_lal"

    def test_no_match(self):
        """A name absent from every candidate yields ``None``."""
        lakers = MatchCandidate("nba_lal", "Los Angeles Lakers", ["Lakers"])
        pool = [lakers]

        assert exact_match("New York Knicks", pool) is None
|
||||
|
||||
|
||||
class TestFuzzyMatchTeam:
    """Checks for the ``fuzzy_match_team`` scorer."""

    def test_close_match(self):
        """A near-identical team name ranks the right team first."""
        lakers = MatchCandidate("nba_lal", "Los Angeles Lakers", ["Lakers", "LA Lakers"])
        clippers = MatchCandidate("nba_lac", "Los Angeles Clippers", ["Clippers", "LA Clippers"])
        pool = [lakers, clippers]

        hits = fuzzy_match_team("LA Lakers", pool, threshold=70)

        assert len(hits) > 0
        assert hits[0].canonical_id == "nba_lal"

    def test_partial_name_match(self):
        """The nickname alone is enough to find the team."""
        celtics = MatchCandidate("nba_bos", "Boston Celtics", ["Celtics", "BOS"])
        pool = [celtics]

        hits = fuzzy_match_team("Celtics", pool, threshold=80)

        assert len(hits) > 0
        assert hits[0].canonical_id == "nba_bos"

    def test_threshold_filtering(self):
        """Candidates scoring under the threshold are excluded."""
        celtics = MatchCandidate("nba_bos", "Boston Celtics", [])
        pool = [celtics]

        # An unrelated query must not clear a strict threshold.
        hits = fuzzy_match_team("xyz123", pool, threshold=90)

        assert len(hits) == 0

    def test_returns_top_n(self):
        """No more than ``top_n`` results are returned."""
        pool = [
            MatchCandidate(team_id, team_name, [])
            for team_id, team_name in (
                ("nba_lal", "Los Angeles Lakers"),
                ("nba_lac", "Los Angeles Clippers"),
                ("mlb_lad", "Los Angeles Dodgers"),
            )
        ]

        hits = fuzzy_match_team("Los Angeles", pool, threshold=50, top_n=2)

        assert len(hits) <= 2
|
||||
|
||||
|
||||
class TestFuzzyMatchStadium:
    """Tests for fuzzy_match_stadium function."""

    def test_stadium_match(self):
        """Test that a stadium name with extra words ranks the right venue first."""
        candidates = [
            MatchCandidate("fenway", "Fenway Park", ["Fenway"]),
            MatchCandidate("td_garden", "TD Garden", ["Boston Garden"]),
        ]

        matches = fuzzy_match_stadium("Fenway Park Boston", candidates, threshold=70)

        assert len(matches) > 0
        assert matches[0].canonical_id == "fenway"

    def test_naming_rights_change(self):
        """Test that a former stadium name matches through the alias list."""
        candidates = [
            MatchCandidate(
                "chase_center",
                "Chase Center",
                ["Oracle Arena", "Oakland Coliseum Arena"],
            ),
        ]

        # The query equals an alias, not the primary name.
        matches = fuzzy_match_stadium("Oracle Arena", candidates, threshold=70)

        assert len(matches) > 0
        # Pin the resolved ID as well, as test_stadium_match does; with a single
        # candidate, any returned match must point at it.
        assert matches[0].canonical_id == "chase_center"
|
||||
|
||||
|
||||
class TestBestMatch:
    """Tests for best_match function."""

    def test_prefers_exact_match(self):
        """Test that an exact alias hit is returned with full confidence."""
        candidates = [
            MatchCandidate("nba_lal", "Los Angeles Lakers", ["Lakers"]),
            MatchCandidate("nba_bos", "Boston Celtics", ["Celtics"]),
        ]

        result = best_match("Lakers", candidates)

        assert result is not None
        assert result.canonical_id == "nba_lal"
        # A confidence of 100 marks the result as an exact match.
        assert result.confidence == 100

    def test_falls_back_to_fuzzy(self):
        """Test fallback to fuzzy matching when no exact match exists."""
        candidates = [
            MatchCandidate("nba_lal", "Los Angeles Lakers", ["Lakers"]),
        ]

        result = best_match("LA Laker", candidates, threshold=70)

        assert result is not None
        # Pin the resolved ID as well, as test_prefers_exact_match does; with a
        # single candidate, any returned match must point at it.
        assert result.canonical_id == "nba_lal"
        # Anything below 100 means the result came from the fuzzy path.
        assert result.confidence < 100

    def test_no_match_below_threshold(self):
        """Test that None is returned when nothing reaches the threshold."""
        candidates = [
            MatchCandidate("nba_lal", "Los Angeles Lakers", []),
        ]

        result = best_match("xyz123", candidates, threshold=90)

        assert result is None
|
||||
|
||||
|
||||
class TestCalculateSimilarity:
    """Checks for the ``calculate_similarity`` score."""

    def test_identical_strings(self):
        """Comparing a string with itself scores exactly 100."""
        name = "Boston Celtics"
        assert calculate_similarity(name, name) == 100

    def test_similar_strings(self):
        """The same words in a different order still score at least 90."""
        assert calculate_similarity("Boston Celtics", "Celtics Boston") >= 90

    def test_different_strings(self):
        """Two unrelated team names score under 50."""
        assert calculate_similarity("Boston Celtics", "Los Angeles Lakers") < 50

    def test_empty_string(self):
        """An empty first argument scores 0."""
        assert calculate_similarity("", "Boston Celtics") == 0
|
||||
Reference in New Issue
Block a user