Files
Trey t 52d445bca4 feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading
sports schedule data to CloudKit. Includes:

- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:56:25 -06:00

263 lines
8.6 KiB
Python

"""Alias and manual review data models for sportstime-parser."""
from dataclasses import dataclass, field
from datetime import date, datetime
from enum import Enum
from typing import Optional
import json
class AliasType(Enum):
    """Kinds of team alias.

    Each member's value is the lowercase string form of its name; that
    string is what ``AliasType(<value>)`` accepts and what ``.value``
    returns when an alias is serialized.
    """

    NAME = "name"
    ABBREVIATION = "abbreviation"
    CITY = "city"
class ReviewReason(Enum):
    """Why an item was flagged for manual review.

    Each member's value is the snake_case string form of its name; that
    string is what ``ReviewReason(<value>)`` accepts and what ``.value``
    returns when a review item is serialized.
    """

    UNRESOLVED_TEAM = "unresolved_team"
    UNRESOLVED_STADIUM = "unresolved_stadium"
    LOW_CONFIDENCE_MATCH = "low_confidence_match"
    MISSING_DATA = "missing_data"
    DUPLICATE_GAME = "duplicate_game"
    TIMEZONE_UNKNOWN = "timezone_unknown"
    GEOGRAPHIC_FILTER = "geographic_filter"
@dataclass
class TeamAlias:
    """An alternate identifier for a team, optionally limited to a date range.

    Attributes:
        id: Unique alias ID.
        team_canonical_id: Canonical team ID that this alias resolves to.
        alias_type: Kind of alias (name, abbreviation, city).
        alias_value: The text to match against.
        valid_from: First date the alias applies (None = no lower bound).
        valid_until: Last date the alias applies (None = no upper bound).
    """

    id: str
    team_canonical_id: str
    alias_type: AliasType
    alias_value: str
    valid_from: Optional[date] = None
    valid_until: Optional[date] = None

    def is_valid_on(self, check_date: date) -> bool:
        """Return True when ``check_date`` lies inside the validity window.

        Both bounds are inclusive, and an unset bound never excludes a date.
        """
        not_yet_started = self.valid_from and check_date < self.valid_from
        if not_yet_started:
            return False
        already_ended = self.valid_until and check_date > self.valid_until
        return not already_ended

    def to_dict(self) -> dict:
        """Build a JSON-serializable dict; dates become ISO strings or None."""
        payload = {
            "id": self.id,
            "team_canonical_id": self.team_canonical_id,
            "alias_type": self.alias_type.value,
            "alias_value": self.alias_value,
        }
        # Both bounds serialize the same way, so handle them in one pass.
        for bound_name in ("valid_from", "valid_until"):
            bound = getattr(self, bound_name)
            payload[bound_name] = bound.isoformat() if bound else None
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> "TeamAlias":
        """Build a TeamAlias from a dict shaped like ``to_dict`` output.

        Missing or falsy ``valid_from`` / ``valid_until`` entries become None.

        Raises:
            KeyError: If ``id``, ``team_canonical_id``, ``alias_type`` or
                ``alias_value`` is absent.
            ValueError: If a date string is not ISO formatted or
                ``alias_type`` is not a known AliasType value.
        """
        bounds = {}
        for bound_name in ("valid_from", "valid_until"):
            raw = data.get(bound_name)
            bounds[bound_name] = date.fromisoformat(raw) if raw else None
        return cls(
            id=data["id"],
            team_canonical_id=data["team_canonical_id"],
            alias_type=AliasType(data["alias_type"]),
            alias_value=data["alias_value"],
            **bounds,
        )
@dataclass
class StadiumAlias:
    """An alternate name for a stadium, optionally limited to a date range.

    Attributes:
        alias_name: The alias name to match against (lowercase).
        stadium_canonical_id: Canonical stadium ID that this alias resolves to.
        valid_from: First date the alias applies (None = no lower bound).
        valid_until: Last date the alias applies (None = no upper bound).
    """

    alias_name: str
    stadium_canonical_id: str
    valid_from: Optional[date] = None
    valid_until: Optional[date] = None

    def is_valid_on(self, check_date: date) -> bool:
        """Return True when ``check_date`` lies inside the validity window.

        Both bounds are inclusive, and an unset bound never excludes a date.
        """
        not_yet_started = self.valid_from and check_date < self.valid_from
        if not_yet_started:
            return False
        already_ended = self.valid_until and check_date > self.valid_until
        return not already_ended

    def to_dict(self) -> dict:
        """Build a JSON-serializable dict; dates become ISO strings or None."""
        payload = {
            "alias_name": self.alias_name,
            "stadium_canonical_id": self.stadium_canonical_id,
        }
        # Both bounds serialize the same way, so handle them in one pass.
        for bound_name in ("valid_from", "valid_until"):
            bound = getattr(self, bound_name)
            payload[bound_name] = bound.isoformat() if bound else None
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> "StadiumAlias":
        """Build a StadiumAlias from a dict shaped like ``to_dict`` output.

        Missing or falsy ``valid_from`` / ``valid_until`` entries become None.

        Raises:
            KeyError: If ``alias_name`` or ``stadium_canonical_id`` is absent.
            ValueError: If a date string is not ISO formatted.
        """
        bounds = {}
        for bound_name in ("valid_from", "valid_until"):
            raw = data.get(bound_name)
            bounds[bound_name] = date.fromisoformat(raw) if raw else None
        return cls(
            alias_name=data["alias_name"],
            stadium_canonical_id=data["stadium_canonical_id"],
            **bounds,
        )
@dataclass
class FuzzyMatch:
    """A fuzzy match suggestion with a confidence score.

    Attributes:
        canonical_id: Canonical ID of the suggested entity.
        canonical_name: Canonical name of the suggested entity.
        confidence: Match confidence as an integer from 0 to 100.
    """

    canonical_id: str
    canonical_name: str
    confidence: int  # 0-100

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return {
            "canonical_id": self.canonical_id,
            "canonical_name": self.canonical_name,
            "confidence": self.confidence,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "FuzzyMatch":
        """Create a FuzzyMatch from a dictionary shaped like ``to_dict`` output.

        Provided so this model round-trips through JSON the same way the
        other models in this module do.

        Raises:
            KeyError: If ``canonical_id``, ``canonical_name`` or
                ``confidence`` is absent.
        """
        return cls(
            canonical_id=data["canonical_id"],
            canonical_name=data["canonical_name"],
            confidence=data["confidence"],
        )
@dataclass
class ManualReviewItem:
    """Represents an item requiring manual review.

    Attributes:
        id: Unique review item ID
        reason: Why this item needs review
        sport: Sport code
        raw_value: The original unresolved value
        context: Additional context about the issue
        source_url: URL of the source page
        suggested_matches: List of potential matches with confidence scores
        game_date: Date of the game (if applicable)
        created_at: When this review item was created
    """

    id: str
    reason: ReviewReason
    sport: str
    raw_value: str
    context: dict = field(default_factory=dict)
    source_url: Optional[str] = None
    suggested_matches: list[FuzzyMatch] = field(default_factory=list)
    game_date: Optional[date] = None
    # datetime.now() is called without a tz, so the default is a naive
    # local timestamp.
    created_at: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization.

        Enum, date and datetime fields are flattened to strings; each
        suggested match is serialized through its own ``to_dict``.
        """
        return {
            "id": self.id,
            "reason": self.reason.value,
            "sport": self.sport,
            "raw_value": self.raw_value,
            "context": self.context,
            "source_url": self.source_url,
            "suggested_matches": [m.to_dict() for m in self.suggested_matches],
            "game_date": self.game_date.isoformat() if self.game_date else None,
            "created_at": self.created_at.isoformat(),
        }

    @classmethod
    def from_dict(cls, data: dict) -> "ManualReviewItem":
        """Create a ManualReviewItem from a dictionary.

        Optional keys (``context``, ``source_url``, ``suggested_matches``,
        ``game_date``, ``created_at``) may be absent or null. A missing or
        falsy ``created_at`` falls back to the current time.

        Raises:
            KeyError: If ``id``, ``reason``, ``sport`` or ``raw_value`` is
                absent, or a suggested match lacks one of its fields.
            ValueError: If ``reason`` is not a known ReviewReason value or a
                date/datetime string is not ISO formatted.
        """
        raw_game_date = data.get("game_date")
        game_date = date.fromisoformat(raw_game_date) if raw_game_date else None

        raw_created_at = data.get("created_at")
        created_at = (
            datetime.fromisoformat(raw_created_at)
            if raw_created_at
            else datetime.now()
        )

        # Use ``or`` rather than a .get() default: the default only applies
        # when the key is absent, so an explicit null would otherwise reach
        # the loop as None and raise TypeError.
        suggested_matches = [
            FuzzyMatch(
                canonical_id=match_data["canonical_id"],
                canonical_name=match_data["canonical_name"],
                confidence=match_data["confidence"],
            )
            for match_data in data.get("suggested_matches") or []
        ]

        return cls(
            id=data["id"],
            reason=ReviewReason(data["reason"]),
            sport=data["sport"],
            raw_value=data["raw_value"],
            # Same null-tolerance as above so ``context`` is always a dict.
            context=data.get("context") or {},
            source_url=data.get("source_url"),
            suggested_matches=suggested_matches,
            game_date=game_date,
            created_at=created_at,
        )

    def to_markdown(self) -> str:
        """Generate markdown representation for validation report.

        The output is a level-3 heading followed by the sport, then the
        game date, context, suggested matches and source link when each is
        present, and finally a horizontal rule.
        """
        heading = self.reason.value.replace("_", " ").title()
        lines = [
            f"### {heading}: {self.raw_value}",
            "",
            f"**Sport**: {self.sport.upper()}",
        ]

        if self.game_date:
            lines.append(f"**Game Date**: {self.game_date.isoformat()}")

        if self.context:
            lines += ["", "**Context**:"]
            lines += [f"- {key}: {value}" for key, value in self.context.items()]

        if self.suggested_matches:
            lines += ["", "**Suggested Matches**:"]
            for i, match in enumerate(self.suggested_matches, 1):
                # Flag high-confidence candidates for the reviewer.
                marker = " <- likely correct" if match.confidence >= 90 else ""
                lines.append(
                    f"{i}. `{match.canonical_id}` ({match.confidence}%){marker}"
                )

        if self.source_url:
            lines += ["", f"**Source**: [{self.source_url}]({self.source_url})"]

        # Close the section with a horizontal rule and a trailing newline.
        lines += ["", "---", ""]
        return "\n".join(lines)