feat(scripts): add sportstime-parser data pipeline

Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:56:25 -06:00
parent ac78042a7e
commit 52d445bca4
76 changed files with 25065 additions and 0 deletions
--- a/sportstime_parser/models/aliases.py
+++ b/sportstime_parser/models/aliases.py
@@ -0,0 +1,262 @@
+"""Alias and manual review data models for sportstime-parser."""
+
+from dataclasses import dataclass, field
+from datetime import date, datetime
+from enum import Enum
+from typing import Optional
+import json
+
+
+class AliasType(Enum):
+    """Type of team alias; ``.value`` is the string TeamAlias.to_dict() emits."""
+    NAME = "name"
+    ABBREVIATION = "abbreviation"
+    CITY = "city"
+
+
+class ReviewReason(Enum):
+    """Reason an item requires manual review; ``.value`` is the serialized form."""
+    UNRESOLVED_TEAM = "unresolved_team"
+    UNRESOLVED_STADIUM = "unresolved_stadium"
+    LOW_CONFIDENCE_MATCH = "low_confidence_match"
+    MISSING_DATA = "missing_data"
+    DUPLICATE_GAME = "duplicate_game"
+    TIMEZONE_UNKNOWN = "timezone_unknown"
+    GEOGRAPHIC_FILTER = "geographic_filter"
+
+
+@dataclass
+class TeamAlias:
+    """An alternate identifier for a team, optionally limited to a date range.
+
+    Attributes:
+        id: Unique alias ID
+        team_canonical_id: Canonical ID of the team this alias resolves to
+        alias_type: Kind of alias (name, abbreviation, city)
+        alias_value: The text to match against
+        valid_from: First date the alias applies (None = no start bound)
+        valid_until: Last date the alias applies (None = no end bound)
+    """
+
+    id: str
+    team_canonical_id: str
+    alias_type: AliasType
+    alias_value: str
+    valid_from: Optional[date] = None
+    valid_until: Optional[date] = None
+
+    def is_valid_on(self, check_date: date) -> bool:
+        """Report whether the alias applies on check_date.
+
+        Both bounds are inclusive; a bound left as None is not enforced.
+        """
+        before_start = self.valid_from and check_date < self.valid_from
+        if before_start:
+            return False
+        after_end = self.valid_until and check_date > self.valid_until
+        return not after_end
+
+    def to_dict(self) -> dict:
+        """Build a JSON-serializable dict; dates become ISO strings or None."""
+        start, end = self.valid_from, self.valid_until
+        return {
+            "id": self.id,
+            "team_canonical_id": self.team_canonical_id,
+            "alias_type": self.alias_type.value,
+            "alias_value": self.alias_value,
+            "valid_from": start.isoformat() if start else None,
+            "valid_until": end.isoformat() if end else None,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "TeamAlias":
+        """Rebuild a TeamAlias from the dict layout produced by to_dict()."""
+        # Parse the optional ISO date strings first; absent or empty means None.
+        window = {
+            key: date.fromisoformat(data[key]) if data.get(key) else None
+            for key in ("valid_from", "valid_until")
+        }
+        return cls(
+            id=data["id"],
+            team_canonical_id=data["team_canonical_id"],
+            alias_type=AliasType(data["alias_type"]),
+            alias_value=data["alias_value"],
+            **window,
+        )
+
+
+@dataclass
+class StadiumAlias:
+    """An alternate name for a stadium, optionally limited to a date range.
+
+    Attributes:
+        alias_name: The alias name to match against (lowercase)
+        stadium_canonical_id: Canonical ID of the stadium the alias resolves to
+        valid_from: First date the alias applies (None = no start bound)
+        valid_until: Last date the alias applies (None = no end bound)
+    """
+
+    alias_name: str
+    stadium_canonical_id: str
+    valid_from: Optional[date] = None
+    valid_until: Optional[date] = None
+
+    def is_valid_on(self, check_date: date) -> bool:
+        """Report whether the alias applies on check_date.
+
+        Bounds are inclusive; a None bound is treated as open-ended.
+        """
+        too_early = self.valid_from and check_date < self.valid_from
+        return not (too_early or (self.valid_until and check_date > self.valid_until))
+
+    def to_dict(self) -> dict:
+        """Build a JSON-serializable dict; dates become ISO strings or None."""
+        def iso(day: Optional[date]) -> Optional[str]:
+            return day.isoformat() if day else None
+
+        return {
+            "alias_name": self.alias_name,
+            "stadium_canonical_id": self.stadium_canonical_id,
+            "valid_from": iso(self.valid_from),
+            "valid_until": iso(self.valid_until),
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "StadiumAlias":
+        """Rebuild a StadiumAlias from a dict; missing or empty dates become None."""
+        def parse(key: str) -> Optional[date]:
+            raw = data.get(key)
+            return date.fromisoformat(raw) if raw else None
+
+        starts, ends = parse("valid_from"), parse("valid_until")
+        return cls(
+            alias_name=data["alias_name"],
+            stadium_canonical_id=data["stadium_canonical_id"],
+            valid_from=starts,
+            valid_until=ends,
+        )
+
+
+@dataclass
+class FuzzyMatch:
+    """A candidate canonical entity proposed by fuzzy matching.
+
+    ``confidence`` is an integer score on a 0-100 scale.
+    """
+
+    canonical_id: str
+    canonical_name: str
+    confidence: int  # 0-100
+
+    def to_dict(self) -> dict:
+        """Return the three fields as a JSON-serializable dictionary."""
+        keys = ("canonical_id", "canonical_name", "confidence")
+        return {key: getattr(self, key) for key in keys}
+
+
+@dataclass
+class ManualReviewItem:
+    """Represents an item requiring manual review.
+
+    Attributes:
+        id: Unique review item ID
+        reason: Why this item needs review
+        sport: Sport code
+        raw_value: The original unresolved value
+        context: Additional context about the issue
+        source_url: URL of the source page
+        suggested_matches: List of potential matches with confidence scores
+        game_date: Date of the game (if applicable)
+        created_at: When this review item was created (defaults to now)
+    """
+
+    id: str
+    reason: ReviewReason
+    sport: str
+    raw_value: str
+    context: dict = field(default_factory=dict)
+    source_url: Optional[str] = None
+    suggested_matches: list[FuzzyMatch] = field(default_factory=list)
+    game_date: Optional[date] = None
+    created_at: datetime = field(default_factory=datetime.now)
+
+    def to_dict(self) -> dict:
+        """Convert to a JSON-serializable dict (enum/date fields become strings)."""
+        return {
+            "id": self.id,
+            "reason": self.reason.value,
+            "sport": self.sport,
+            "raw_value": self.raw_value,
+            "context": self.context,
+            "source_url": self.source_url,
+            "suggested_matches": [m.to_dict() for m in self.suggested_matches],
+            "game_date": self.game_date.isoformat() if self.game_date else None,
+            "created_at": self.created_at.isoformat(),
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "ManualReviewItem":
+        """Create a ManualReviewItem from a dictionary.
+
+        Optional keys may be absent or null: a null ``context`` or
+        ``suggested_matches`` is treated as empty, and a missing
+        ``created_at`` defaults to the current time.
+        """
+        game_date = None
+        if data.get("game_date"):
+            game_date = date.fromisoformat(data["game_date"])
+
+        created_at = datetime.now()
+        if data.get("created_at"):
+            created_at = datetime.fromisoformat(data["created_at"])
+
+        # ``or []`` covers an explicit null as well as a missing key.
+        suggested_matches = [
+            FuzzyMatch(m["canonical_id"], m["canonical_name"], m["confidence"])
+            for m in data.get("suggested_matches") or []
+        ]
+
+        return cls(
+            id=data["id"],
+            reason=ReviewReason(data["reason"]),
+            sport=data["sport"],
+            raw_value=data["raw_value"],
+            context=data.get("context") or {},
+            source_url=data.get("source_url"),
+            suggested_matches=suggested_matches,
+            game_date=game_date,
+            created_at=created_at,
+        )
+
+    def to_markdown(self) -> str:
+        """Generate markdown representation for validation report.
+
+        Optional sections appear only for fields that are set; ends with ---.
+        """
+        heading = self.reason.value.replace("_", " ").title()
+        lines = [
+            f"### {heading}: {self.raw_value}",
+            "",
+            f"**Sport**: {self.sport.upper()}",
+        ]
+
+        if self.game_date:
+            lines.append(f"**Game Date**: {self.game_date.isoformat()}")
+
+        if self.context:
+            lines += ["", "**Context**:"]
+            lines += [f"- {key}: {value}" for key, value in self.context.items()]
+
+        if self.suggested_matches:
+            lines += ["", "**Suggested Matches**:"]
+            for i, match in enumerate(self.suggested_matches, 1):
+                marker = " <- likely correct" if match.confidence >= 90 else ""
+                lines.append(
+                    f"{i}. `{match.canonical_id}` ({match.confidence}%){marker}"
+                )
+
+        if self.source_url:
+            lines += ["", f"**Source**: [{self.source_url}]({self.source_url})"]
+
+        lines += ["", "---", ""]
+        return "\n".join(lines)