feat(scripts): add sportstime-parser data pipeline
Complete Python package for scraping, normalizing, and uploading sports schedule data to CloudKit.

Includes:
- Multi-source scrapers for NBA, MLB, NFL, NHL, MLS, WNBA, NWSL
- Canonical ID system for teams, stadiums, and games
- Fuzzy matching with manual alias support
- CloudKit uploader with batch operations and deduplication
- Comprehensive test suite with fixtures
- WNBA abbreviation aliases for improved team resolution
- Alias validation script to detect orphan references

All 5 phases of the data remediation plan are completed:
- Phase 1: Alias fixes (team/stadium alias additions)
- Phase 2: NHL stadium coordinate fixes
- Phase 3: Re-scrape validation
- Phase 4: iOS bundle update
- Phase 5: Code quality improvements (WNBA aliases)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
262
sportstime_parser/models/aliases.py
Normal file
262
sportstime_parser/models/aliases.py
Normal file
@@ -0,0 +1,262 @@
|
||||
"""Alias and manual review data models for sportstime-parser."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import date, datetime
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
import json
|
||||
|
||||
|
||||
class AliasType(Enum):
    """Kind of identifier a team alias represents.

    Each member's value is the plain string form of the alias type
    (the same strings listed in the ``TeamAlias`` docstring: name,
    abbreviation, city).
    """

    NAME = "name"
    ABBREVIATION = "abbreviation"
    CITY = "city"
|
||||
|
||||
|
||||
class ReviewReason(Enum):
    """Reason an item was flagged for manual review.

    Each member's value is a snake_case string; it is the form written
    out when a review item is serialized.
    """

    UNRESOLVED_TEAM = "unresolved_team"
    UNRESOLVED_STADIUM = "unresolved_stadium"
    LOW_CONFIDENCE_MATCH = "low_confidence_match"
    MISSING_DATA = "missing_data"
    DUPLICATE_GAME = "duplicate_game"
    TIMEZONE_UNKNOWN = "timezone_unknown"
    GEOGRAPHIC_FILTER = "geographic_filter"
|
||||
|
||||
|
||||
@dataclass
class TeamAlias:
    """Represents a team alias with optional date validity.

    Attributes:
        id: Unique alias ID
        team_canonical_id: The canonical team ID this alias resolves to
        alias_type: Type of alias (name, abbreviation, city)
        alias_value: The alias value to match against
        valid_from: Start date of alias validity (None = always valid)
        valid_until: End date of alias validity (None = still valid)
    """

    id: str
    team_canonical_id: str
    alias_type: AliasType
    alias_value: str
    valid_from: Optional[date] = None
    valid_until: Optional[date] = None

    def is_valid_on(self, check_date: date) -> bool:
        """Check if this alias is valid on the given date.

        Both bounds are inclusive; a bound of ``None`` is treated as
        unbounded on that side.

        Args:
            check_date: The date to test against the validity window.

        Returns:
            True if ``check_date`` falls inside the window, else False.
        """
        if self.valid_from is not None and check_date < self.valid_from:
            return False
        if self.valid_until is not None and check_date > self.valid_until:
            return False
        return True

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization.

        Dates are emitted as ISO-8601 strings (or ``None``) and the alias
        type as its enum value.
        """
        return {
            "id": self.id,
            "team_canonical_id": self.team_canonical_id,
            "alias_type": self.alias_type.value,
            "alias_value": self.alias_value,
            "valid_from": self.valid_from.isoformat() if self.valid_from is not None else None,
            "valid_until": self.valid_until.isoformat() if self.valid_until is not None else None,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "TeamAlias":
        """Create a TeamAlias from a dictionary.

        Args:
            data: Mapping with the keys produced by ``to_dict``. The
                ``valid_from`` / ``valid_until`` keys may be absent,
                ``None`` or empty, all of which mean "no bound".

        Raises:
            KeyError: If a required key is missing.
            ValueError: If ``alias_type`` is not a known value or a date
                string is not ISO formatted.
        """
        raw_from = data.get("valid_from")
        raw_until = data.get("valid_until")

        return cls(
            id=data["id"],
            team_canonical_id=data["team_canonical_id"],
            alias_type=AliasType(data["alias_type"]),
            alias_value=data["alias_value"],
            # Missing, None and "" all mean "no bound on this side".
            valid_from=date.fromisoformat(raw_from) if raw_from else None,
            valid_until=date.fromisoformat(raw_until) if raw_until else None,
        )
|
||||
|
||||
|
||||
@dataclass
class StadiumAlias:
    """Represents a stadium alias with optional date validity.

    Attributes:
        alias_name: The alias name to match against (lowercase).
            NOTE(review): this class does not lowercase the value itself;
            callers are presumably expected to supply it normalized.
        stadium_canonical_id: The canonical stadium ID this alias resolves to
        valid_from: Start date of alias validity (None = always valid)
        valid_until: End date of alias validity (None = still valid)
    """

    alias_name: str
    stadium_canonical_id: str
    valid_from: Optional[date] = None
    valid_until: Optional[date] = None

    def is_valid_on(self, check_date: date) -> bool:
        """Check if this alias is valid on the given date.

        Both bounds are inclusive; a bound of ``None`` is treated as
        unbounded on that side.

        Args:
            check_date: The date to test against the validity window.

        Returns:
            True if ``check_date`` falls inside the window, else False.
        """
        if self.valid_from is not None and check_date < self.valid_from:
            return False
        if self.valid_until is not None and check_date > self.valid_until:
            return False
        return True

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization.

        Dates are emitted as ISO-8601 strings, or ``None`` when unset.
        """
        return {
            "alias_name": self.alias_name,
            "stadium_canonical_id": self.stadium_canonical_id,
            "valid_from": self.valid_from.isoformat() if self.valid_from is not None else None,
            "valid_until": self.valid_until.isoformat() if self.valid_until is not None else None,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "StadiumAlias":
        """Create a StadiumAlias from a dictionary.

        Args:
            data: Mapping with the keys produced by ``to_dict``. The
                ``valid_from`` / ``valid_until`` keys may be absent,
                ``None`` or empty, all of which mean "no bound".

        Raises:
            KeyError: If ``alias_name`` or ``stadium_canonical_id`` is
                missing.
            ValueError: If a date string is not ISO formatted.
        """
        raw_from = data.get("valid_from")
        raw_until = data.get("valid_until")

        return cls(
            alias_name=data["alias_name"],
            stadium_canonical_id=data["stadium_canonical_id"],
            # Missing, None and "" all mean "no bound on this side".
            valid_from=date.fromisoformat(raw_from) if raw_from else None,
            valid_until=date.fromisoformat(raw_until) if raw_until else None,
        )
|
||||
|
||||
|
||||
@dataclass
class FuzzyMatch:
    """Represents a fuzzy match suggestion with confidence score.

    Attributes:
        canonical_id: Canonical ID of the suggested entity
        canonical_name: Canonical name of the suggested entity
        confidence: Match confidence, expected in the range 0-100
            (not validated by this class)
    """

    canonical_id: str
    canonical_name: str
    confidence: int  # 0-100

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return {
            "canonical_id": self.canonical_id,
            "canonical_name": self.canonical_name,
            "confidence": self.confidence,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "FuzzyMatch":
        """Create a FuzzyMatch from a dictionary.

        Inverse of ``to_dict``.

        Raises:
            KeyError: If any of the three keys is missing.
        """
        return cls(
            canonical_id=data["canonical_id"],
            canonical_name=data["canonical_name"],
            confidence=data["confidence"],
        )
|
||||
|
||||
|
||||
@dataclass
class ManualReviewItem:
    """Represents an item requiring manual review.

    Attributes:
        id: Unique review item ID
        reason: Why this item needs review
        sport: Sport code
        raw_value: The original unresolved value
        context: Additional context about the issue
        source_url: URL of the source page
        suggested_matches: List of potential matches with confidence scores
        game_date: Date of the game (if applicable)
        created_at: When this review item was created; defaults to
            ``datetime.now()``, i.e. a naive local timestamp
    """

    id: str
    reason: ReviewReason
    sport: str
    raw_value: str
    context: dict = field(default_factory=dict)
    source_url: Optional[str] = None
    suggested_matches: list[FuzzyMatch] = field(default_factory=list)
    game_date: Optional[date] = None
    created_at: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization.

        Enum, date and datetime fields are emitted as strings; each
        suggested match is emitted via its own ``to_dict``.
        """
        return {
            "id": self.id,
            "reason": self.reason.value,
            "sport": self.sport,
            "raw_value": self.raw_value,
            "context": self.context,
            "source_url": self.source_url,
            "suggested_matches": [m.to_dict() for m in self.suggested_matches],
            "game_date": self.game_date.isoformat() if self.game_date is not None else None,
            "created_at": self.created_at.isoformat(),
        }

    @classmethod
    def from_dict(cls, data: dict) -> "ManualReviewItem":
        """Create a ManualReviewItem from a dictionary.

        Args:
            data: Mapping with the keys produced by ``to_dict``. Only
                ``id``, ``reason``, ``sport`` and ``raw_value`` are
                required. Optional keys may be absent or ``None``.

        Raises:
            KeyError: If a required key is missing.
            ValueError: If ``reason`` is not a known value or a
                date/datetime string is not ISO formatted.
        """
        raw_game_date = data.get("game_date")
        game_date = date.fromisoformat(raw_game_date) if raw_game_date else None

        # A record without a timestamp is stamped with the load time.
        raw_created_at = data.get("created_at")
        created_at = (
            datetime.fromisoformat(raw_created_at) if raw_created_at else datetime.now()
        )

        # ``or []`` / ``or {}`` also cover an explicit JSON null, which a
        # plain ``.get(key, default)`` would pass through as None.
        suggested_matches = [
            FuzzyMatch(
                canonical_id=match_data["canonical_id"],
                canonical_name=match_data["canonical_name"],
                confidence=match_data["confidence"],
            )
            for match_data in data.get("suggested_matches") or []
        ]

        return cls(
            id=data["id"],
            reason=ReviewReason(data["reason"]),
            sport=data["sport"],
            raw_value=data["raw_value"],
            context=data.get("context") or {},
            source_url=data.get("source_url"),
            suggested_matches=suggested_matches,
            game_date=game_date,
            created_at=created_at,
        )

    def to_markdown(self) -> str:
        """Generate markdown representation for validation report.

        The output is a level-3 heading followed by optional game date,
        context, suggested-match and source sections, terminated by a
        horizontal rule. Empty or unset sections are omitted.
        """
        lines = [
            f"### {self.reason.value.replace('_', ' ').title()}: {self.raw_value}",
            "",
            f"**Sport**: {self.sport.upper()}",
        ]

        if self.game_date:
            lines.append(f"**Game Date**: {self.game_date.isoformat()}")

        if self.context:
            lines.append("")
            lines.append("**Context**:")
            for key, value in self.context.items():
                lines.append(f"- {key}: {value}")

        if self.suggested_matches:
            lines.append("")
            lines.append("**Suggested Matches**:")
            for i, match in enumerate(self.suggested_matches, 1):
                # Flag high-confidence suggestions so reviewers can triage quickly.
                marker = " <- likely correct" if match.confidence >= 90 else ""
                lines.append(
                    f"{i}. `{match.canonical_id}` ({match.confidence}%){marker}"
                )

        if self.source_url:
            lines.append("")
            lines.append(f"**Source**: [{self.source_url}]({self.source_url})")

        lines.append("")
        lines.append("---")
        lines.append("")

        return "\n".join(lines)
|
||||
Reference in New Issue
Block a user