Replace monolithic scraping scripts with sportstime_parser package: - Multi-source scrapers with automatic fallback for 7 sports - Canonical ID generation for games, teams, and stadiums - Fuzzy matching with configurable thresholds for name resolution - CloudKit Web Services uploader with JWT auth, diff-based updates - Resumable uploads with checkpoint state persistence - Validation reports with manual review items and suggested matches - Comprehensive test suite (249 tests) CLI: sportstime-parser scrape|validate|upload|status|retry|clear Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
263 lines
8.6 KiB
Python
263 lines
8.6 KiB
Python
"""Alias and manual review data models for sportstime-parser."""
|
|
|
|
from dataclasses import dataclass, field
|
|
from datetime import date, datetime
|
|
from enum import Enum
|
|
from typing import Optional
|
|
import json
|
|
|
|
|
|
class AliasType(Enum):
    """Kinds of alias a team can be known by.

    Each member's value is the plain string form of the alias kind.
    """

    NAME = "name"
    ABBREVIATION = "abbreviation"
    CITY = "city"
|
|
|
|
|
|
class ReviewReason(Enum):
    """Why an item was flagged for manual review.

    Each member's value is the snake_case string form of the reason.
    """

    # Name-resolution problems
    UNRESOLVED_TEAM = "unresolved_team"
    UNRESOLVED_STADIUM = "unresolved_stadium"
    LOW_CONFIDENCE_MATCH = "low_confidence_match"

    # Data-quality problems
    MISSING_DATA = "missing_data"
    DUPLICATE_GAME = "duplicate_game"
    TIMEZONE_UNKNOWN = "timezone_unknown"
    GEOGRAPHIC_FILTER = "geographic_filter"
|
|
|
|
|
|
@dataclass
class TeamAlias:
    """Alternate identifier for a team, optionally limited to a date range.

    Attributes:
        id: Unique alias ID
        team_canonical_id: Canonical ID of the team this alias points to
        alias_type: Kind of alias (name, abbreviation, city)
        alias_value: Alias text to match against
        valid_from: First day the alias applies (None = no lower bound)
        valid_until: Last day the alias applies (None = no upper bound)
    """

    id: str
    team_canonical_id: str
    alias_type: AliasType
    alias_value: str
    valid_from: Optional[date] = None
    valid_until: Optional[date] = None

    def is_valid_on(self, check_date: date) -> bool:
        """Return True when ``check_date`` lies inside the validity window.

        Both bounds are inclusive; an unset bound never excludes a date.
        """
        starts, ends = self.valid_from, self.valid_until
        before_window = bool(starts) and check_date < starts
        if before_window:
            return False
        after_window = bool(ends) and check_date > ends
        return not after_window

    def to_dict(self) -> dict:
        """Build a JSON-serializable dictionary of this alias.

        The alias type is written as its enum value and the dates as ISO
        strings (or None when unset).
        """
        payload = {
            "id": self.id,
            "team_canonical_id": self.team_canonical_id,
            "alias_type": self.alias_type.value,
            "alias_value": self.alias_value,
        }
        for bound_name in ("valid_from", "valid_until"):
            bound = getattr(self, bound_name)
            payload[bound_name] = bound.isoformat() if bound else None
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> "TeamAlias":
        """Build a TeamAlias from a dictionary shaped like ``to_dict`` output.

        Missing or empty date entries become None.
        """
        # Dates are parsed before the required keys are read.
        bounds = []
        for bound_name in ("valid_from", "valid_until"):
            raw = data.get(bound_name)
            bounds.append(date.fromisoformat(raw) if raw else None)
        window_start, window_end = bounds

        return cls(
            id=data["id"],
            team_canonical_id=data["team_canonical_id"],
            alias_type=AliasType(data["alias_type"]),
            alias_value=data["alias_value"],
            valid_from=window_start,
            valid_until=window_end,
        )
|
|
|
|
|
|
@dataclass
class StadiumAlias:
    """Alternate stadium name, optionally limited to a date range.

    Attributes:
        alias_name: Alias text to match against (lowercase)
        stadium_canonical_id: Canonical ID of the stadium this alias points to
        valid_from: First day the alias applies (None = no lower bound)
        valid_until: Last day the alias applies (None = no upper bound)
    """

    alias_name: str
    stadium_canonical_id: str
    valid_from: Optional[date] = None
    valid_until: Optional[date] = None

    def is_valid_on(self, check_date: date) -> bool:
        """Return True when ``check_date`` lies inside the validity window.

        Both bounds are inclusive; an unset bound never excludes a date.
        """
        starts, ends = self.valid_from, self.valid_until
        before_window = bool(starts) and check_date < starts
        if before_window:
            return False
        after_window = bool(ends) and check_date > ends
        return not after_window

    def to_dict(self) -> dict:
        """Build a JSON-serializable dictionary of this alias.

        Dates are written as ISO strings, or None when unset.
        """
        payload = {
            "alias_name": self.alias_name,
            "stadium_canonical_id": self.stadium_canonical_id,
        }
        for bound_name in ("valid_from", "valid_until"):
            bound = getattr(self, bound_name)
            payload[bound_name] = bound.isoformat() if bound else None
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> "StadiumAlias":
        """Build a StadiumAlias from a dictionary shaped like ``to_dict`` output.

        Missing or empty date entries become None.
        """
        # Dates are parsed before the required keys are read.
        bounds = []
        for bound_name in ("valid_from", "valid_until"):
            raw = data.get(bound_name)
            bounds.append(date.fromisoformat(raw) if raw else None)
        window_start, window_end = bounds

        return cls(
            alias_name=data["alias_name"],
            stadium_canonical_id=data["stadium_canonical_id"],
            valid_from=window_start,
            valid_until=window_end,
        )
|
|
|
|
|
|
@dataclass
class FuzzyMatch:
    """Represents a fuzzy match suggestion with confidence score.

    Attributes:
        canonical_id: Canonical ID of the suggested match
        canonical_name: Canonical name of the suggested match
        confidence: Match confidence score (0-100)
    """

    canonical_id: str
    canonical_name: str
    confidence: int  # 0-100

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return {
            "canonical_id": self.canonical_id,
            "canonical_name": self.canonical_name,
            "confidence": self.confidence,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "FuzzyMatch":
        """Create a FuzzyMatch from a dictionary shaped like ``to_dict`` output.

        Provided so this model round-trips the same way as the other
        serializable models in this module.

        Raises:
            KeyError: If ``canonical_id``, ``canonical_name`` or
                ``confidence`` is missing from ``data``.
        """
        return cls(
            canonical_id=data["canonical_id"],
            canonical_name=data["canonical_name"],
            confidence=data["confidence"],
        )
|
|
|
|
|
|
@dataclass
class ManualReviewItem:
    """Represents an item requiring manual review.

    Attributes:
        id: Unique review item ID
        reason: Why this item needs review
        sport: Sport code
        raw_value: The original unresolved value
        context: Additional context about the issue
        source_url: URL of the source page
        suggested_matches: List of potential matches with confidence scores
        game_date: Date of the game (if applicable)
        created_at: When this review item was created (defaults to
            ``datetime.now()``, i.e. a naive timestamp)
    """

    id: str
    reason: ReviewReason
    sport: str
    raw_value: str
    context: dict = field(default_factory=dict)
    source_url: Optional[str] = None
    suggested_matches: list[FuzzyMatch] = field(default_factory=list)
    game_date: Optional[date] = None
    created_at: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization.

        The reason is written as its enum value, dates and timestamps as ISO
        strings, and each suggested match through its own ``to_dict``.
        """
        return {
            "id": self.id,
            "reason": self.reason.value,
            "sport": self.sport,
            "raw_value": self.raw_value,
            "context": self.context,
            "source_url": self.source_url,
            "suggested_matches": [m.to_dict() for m in self.suggested_matches],
            "game_date": self.game_date.isoformat() if self.game_date else None,
            "created_at": self.created_at.isoformat(),
        }

    @classmethod
    def from_dict(cls, data: dict) -> "ManualReviewItem":
        """Create a ManualReviewItem from a dictionary.

        ``id``, ``reason``, ``sport`` and ``raw_value`` are required. Every
        other key may be missing or null: ``context`` and
        ``suggested_matches`` then become empty, ``source_url`` and
        ``game_date`` become None, and ``created_at`` becomes the current
        time.

        Raises:
            KeyError: If a required key (or a field of a suggested match)
                is missing.
            ValueError: If ``reason`` is not a known ReviewReason value or a
                date string is not valid ISO format.
        """
        raw_game_date = data.get("game_date")
        game_date = date.fromisoformat(raw_game_date) if raw_game_date else None

        raw_created_at = data.get("created_at")
        if raw_created_at:
            created_at = datetime.fromisoformat(raw_created_at)
        else:
            created_at = datetime.now()

        # dict.get's default only applies to a *missing* key; ``or []`` also
        # covers a key that is present with an explicit null.
        suggested_matches = [
            FuzzyMatch(
                canonical_id=match_data["canonical_id"],
                canonical_name=match_data["canonical_name"],
                confidence=match_data["confidence"],
            )
            for match_data in data.get("suggested_matches") or []
        ]

        # Shallow-copy so later edits to the item's context do not write
        # through to the caller's input dictionary (and null becomes {}).
        context = dict(data.get("context") or {})

        return cls(
            id=data["id"],
            reason=ReviewReason(data["reason"]),
            sport=data["sport"],
            raw_value=data["raw_value"],
            context=context,
            source_url=data.get("source_url"),
            suggested_matches=suggested_matches,
            game_date=game_date,
            created_at=created_at,
        )

    def to_markdown(self) -> str:
        """Generate markdown representation for validation report.

        Layout: a level-3 heading with the title-cased reason and the raw
        value, the upper-cased sport, then (each only when present) the game
        date, a bullet list of context entries, a numbered list of suggested
        matches, and a source link. The section ends with a horizontal rule
        and a trailing blank line.
        """
        heading = self.reason.value.replace('_', ' ').title()
        lines = [
            f"### {heading}: {self.raw_value}",
            "",
            f"**Sport**: {self.sport.upper()}",
        ]

        if self.game_date:
            lines.append(f"**Game Date**: {self.game_date.isoformat()}")

        if self.context:
            lines += ["", "**Context**:"]
            lines.extend(f"- {key}: {value}" for key, value in self.context.items())

        if self.suggested_matches:
            lines += ["", "**Suggested Matches**:"]
            for i, match in enumerate(self.suggested_matches, 1):
                # Matches scoring 90 or above are flagged for the reviewer.
                marker = " <- likely correct" if match.confidence >= 90 else ""
                lines.append(
                    f"{i}. `{match.canonical_id}` ({match.confidence}%){marker}"
                )

        if self.source_url:
            lines += ["", f"**Source**: [{self.source_url}]({self.source_url})"]

        lines += ["", "---", ""]

        return "\n".join(lines)
|