feat(scripts): rewrite parser as modular Python CLI
Replace monolithic scraping scripts with the sportstime_parser package:
- Multi-source scrapers with automatic fallback for 7 sports
- Canonical ID generation for games, teams, and stadiums
- Fuzzy matching with configurable thresholds for name resolution
- CloudKit Web Services uploader with JWT auth, diff-based updates
- Resumable uploads with checkpoint state persistence
- Validation reports with manual review items and suggested matches
- Comprehensive test suite (249 tests)

CLI: sportstime-parser scrape|validate|upload|status|retry|clear

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
425
Scripts/sportstime_parser/uploaders/diff.py
Normal file
425
Scripts/sportstime_parser/uploaders/diff.py
Normal file
@@ -0,0 +1,425 @@
|
||||
"""Record differ for CloudKit uploads.
|
||||
|
||||
This module compares local records with CloudKit records to determine
|
||||
what needs to be created, updated, or deleted.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any, Optional
|
||||
|
||||
from ..models.game import Game
|
||||
from ..models.team import Team
|
||||
from ..models.stadium import Stadium
|
||||
from .cloudkit import CloudKitRecord, RecordType
|
||||
|
||||
|
||||
class DiffAction(str, Enum):
    """Outcome assigned to a record after comparing local and remote state.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value (e.g. ``DiffAction.CREATE == "create"``).
    """

    # Present locally, no remote record with the same name.
    CREATE = "create"
    # Present on both sides, at least one compared field differs.
    UPDATE = "update"
    # Present remotely, no local record with the same name.
    DELETE = "delete"
    # Present on both sides, all compared fields match.
    UNCHANGED = "unchanged"
|
||||
|
||||
|
||||
@dataclass
class RecordDiff:
    """One record's comparison outcome between the local and remote sides.

    Attributes:
        record_name: Canonical record ID shared by both sides.
        record_type: CloudKit record type the record belongs to.
        action: What should happen to the record (create, update,
            delete, or unchanged).
        local_record: The locally built CloudKitRecord; left as None
            for deletes, where only the remote side exists.
        remote_record: The remote record dictionary; left as None for
            creates, where only the local side exists.
        changed_fields: Names of the compared fields whose values
            differ; only populated for updates.
        record_change_tag: The remote record's ``recordChangeTag``
            value, when a remote record exists.
    """

    # Required identity and outcome.
    record_name: str
    record_type: RecordType
    action: DiffAction

    # Optional payloads; which ones are set depends on ``action``.
    local_record: Optional[CloudKitRecord] = None
    remote_record: Optional[dict] = None
    changed_fields: list[str] = field(default_factory=list)
    record_change_tag: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class DiffResult:
    """Bucketed outcome of diffing a set of local records against remote.

    Attributes:
        creates: Diffs for records that exist only locally.
        updates: Diffs for records whose compared fields differ.
        deletes: Diffs for records that exist only remotely.
        unchanged: Diffs for records whose compared fields all match.
    """

    creates: list[RecordDiff] = field(default_factory=list)
    updates: list[RecordDiff] = field(default_factory=list)
    deletes: list[RecordDiff] = field(default_factory=list)
    unchanged: list[RecordDiff] = field(default_factory=list)

    @property
    def create_count(self) -> int:
        """Number of records to create."""
        return len(self.creates)

    @property
    def update_count(self) -> int:
        """Number of records to update."""
        return len(self.updates)

    @property
    def delete_count(self) -> int:
        """Number of records to delete."""
        return len(self.deletes)

    @property
    def unchanged_count(self) -> int:
        """Number of records that need no action."""
        return len(self.unchanged)

    @property
    def total_changes(self) -> int:
        """Combined number of creates, updates and deletes."""
        return sum((self.create_count, self.update_count, self.delete_count))

    def get_records_to_upload(self) -> list[CloudKitRecord]:
        """Collect the local records behind every create and update.

        Creates come first, followed by updates. Each update's local
        record has its ``record_change_tag`` attribute overwritten in
        place with the tag stored on the diff before it is returned.
        Diffs whose ``local_record`` is falsy are skipped.

        Returns:
            A new list of CloudKitRecord objects to send.
        """
        new_records = [
            entry.local_record for entry in self.creates if entry.local_record
        ]

        changed_records = []
        for entry in self.updates:
            if not entry.local_record:
                continue
            # Carry the remote change tag along with the updated record.
            entry.local_record.record_change_tag = entry.record_change_tag
            changed_records.append(entry.local_record)

        return new_records + changed_records
|
||||
|
||||
|
||||
class RecordDiffer:
    """Diff local model objects against records fetched from CloudKit.

    Local objects are first converted to CloudKitRecord instances, then
    matched to remote record dictionaries by record name. Only the field
    names listed in the ``*_FIELDS`` constants are compared.
    """

    # Field names compared for each record type.
    GAME_FIELDS = [
        "sport",
        "season",
        "home_team_id",
        "away_team_id",
        "stadium_id",
        "game_date",
        "game_number",
        "home_score",
        "away_score",
        "status",
    ]

    TEAM_FIELDS = [
        "sport",
        "city",
        "name",
        "full_name",
        "abbreviation",
        "conference",
        "division",
        "primary_color",
        "secondary_color",
        "logo_url",
        "stadium_id",
    ]

    STADIUM_FIELDS = [
        "sport",
        "name",
        "city",
        "state",
        "country",
        "latitude",
        "longitude",
        "capacity",
        "surface",
        "roof_type",
        "opened_year",
        "image_url",
        "timezone",
    ]

    def diff_games(
        self,
        local_games: list[Game],
        remote_records: list[dict],
    ) -> DiffResult:
        """Diff local games against remote CloudKit game records.

        Args:
            local_games: Local Game objects.
            remote_records: Remote record dictionaries.

        Returns:
            DiffResult holding creates, updates, deletes and unchanged.
        """
        converted = list(map(self._game_to_record, local_games))
        return self._diff_records(
            converted, remote_records, RecordType.GAME, self.GAME_FIELDS
        )

    def diff_teams(
        self,
        local_teams: list[Team],
        remote_records: list[dict],
    ) -> DiffResult:
        """Diff local teams against remote CloudKit team records.

        Args:
            local_teams: Local Team objects.
            remote_records: Remote record dictionaries.

        Returns:
            DiffResult holding creates, updates, deletes and unchanged.
        """
        converted = list(map(self._team_to_record, local_teams))
        return self._diff_records(
            converted, remote_records, RecordType.TEAM, self.TEAM_FIELDS
        )

    def diff_stadiums(
        self,
        local_stadiums: list[Stadium],
        remote_records: list[dict],
    ) -> DiffResult:
        """Diff local stadiums against remote CloudKit stadium records.

        Args:
            local_stadiums: Local Stadium objects.
            remote_records: Remote record dictionaries.

        Returns:
            DiffResult holding creates, updates, deletes and unchanged.
        """
        converted = list(map(self._stadium_to_record, local_stadiums))
        return self._diff_records(
            converted, remote_records, RecordType.STADIUM, self.STADIUM_FIELDS
        )

    def _diff_records(
        self,
        local_records: list[CloudKitRecord],
        remote_records: list[dict],
        record_type: RecordType,
        compare_fields: list[str],
    ) -> DiffResult:
        """Sort every record into create / update / unchanged / delete.

        Records are matched by name: ``record_name`` on the local side and
        the ``"recordName"`` key on the remote side. Remote records with a
        missing or empty name are ignored. If several remote records share
        a name, the last one is used.

        Args:
            local_records: Local CloudKitRecord objects.
            remote_records: Remote record dictionaries.
            record_type: Record type stamped on every produced diff.
            compare_fields: Field names that take part in the comparison.

        Returns:
            DiffResult with the categorized differences.
        """
        outcome = DiffResult()

        # Name-keyed lookups for both sides.
        remote_index: dict[str, dict] = {
            remote_key: entry
            for entry in remote_records
            if (remote_key := entry.get("recordName"))
        }
        local_names = {candidate.record_name for candidate in local_records}

        # Local side: anything without a remote twin is a create; otherwise
        # the compared fields decide between update and unchanged.
        for candidate in local_records:
            name = candidate.record_name
            twin = remote_index.get(name)

            if twin is None:
                outcome.creates.append(
                    RecordDiff(
                        record_name=name,
                        record_type=record_type,
                        action=DiffAction.CREATE,
                        local_record=candidate,
                    )
                )
                continue

            differing = self._compare_fields(
                candidate.fields, twin.get("fields", {}), compare_fields
            )
            shared = dict(
                record_name=name,
                record_type=record_type,
                local_record=candidate,
                remote_record=twin,
                record_change_tag=twin.get("recordChangeTag"),
            )
            if differing:
                outcome.updates.append(
                    RecordDiff(
                        action=DiffAction.UPDATE,
                        changed_fields=differing,
                        **shared,
                    )
                )
            else:
                outcome.unchanged.append(
                    RecordDiff(action=DiffAction.UNCHANGED, **shared)
                )

        # Remote side: names unknown locally become deletes.
        outcome.deletes.extend(
            RecordDiff(
                record_name=orphan_name,
                record_type=record_type,
                action=DiffAction.DELETE,
                remote_record=orphan,
                record_change_tag=orphan.get("recordChangeTag"),
            )
            for orphan_name, orphan in remote_index.items()
            if orphan_name not in local_names
        )

        return outcome

    def _compare_fields(
        self,
        local_fields: dict[str, Any],
        remote_fields: dict[str, dict],
        compare_fields: list[str],
    ) -> list[str]:
        """List the compared fields whose normalized values differ.

        A field that is absent (or falsy) on the remote side is treated as
        having the value None.

        Args:
            local_fields: Local field values keyed by field name.
            remote_fields: Remote fields keyed by name; each entry is read
                through its ``"value"`` and ``"type"`` keys.
            compare_fields: Field names to check, in order.

        Returns:
            Names of the fields that differ, in ``compare_fields`` order.
        """

        def differs(key: str) -> bool:
            local_raw = local_fields.get(key)
            remote_entry = remote_fields.get(key, {})
            remote_raw = remote_entry.get("value") if remote_entry else None
            # Bring both sides to a common representation before comparing.
            return self._normalize_value(
                local_raw
            ) != self._normalize_remote_value(remote_raw, remote_entry)

        return [key for key in compare_fields if differs(key)]

    def _normalize_value(self, value: Any) -> Any:
        """Bring a local value into its comparison form.

        Datetimes become integer milliseconds since the epoch, floats are
        rounded to 6 decimal places, and everything else (None included)
        is returned unchanged.
        """
        if isinstance(value, datetime):
            return int(value.timestamp() * 1000)
        if isinstance(value, float):
            return round(value, 6)
        return value

    def _normalize_remote_value(self, value: Any, field_data: dict) -> Any:
        """Bring a remote field value into its comparison form.

        Args:
            value: The remote field's raw value.
            field_data: The remote field entry; its ``"type"`` key selects
                the normalization. Not consulted when ``value`` is None.

        Returns:
            None for a None value; DOUBLE values rounded to 6 decimal
            places; dict-shaped LOCATION values as a rounded
            ``(latitude, longitude)`` tuple; anything else unchanged.
        """
        if value is None:
            return None

        kind = field_data.get("type", "")

        if kind == "DOUBLE":
            return round(value, 6)

        if kind == "LOCATION" and isinstance(value, dict):
            latitude = round(value.get("latitude", 0), 6)
            longitude = round(value.get("longitude", 0), 6)
            return (latitude, longitude)

        # TIMESTAMP values are passed through as-is (the original code
        # treats them as already being in milliseconds), as is every
        # other type.
        return value

    def _game_to_record(self, game: Game) -> CloudKitRecord:
        """Build the CloudKitRecord for a Game, named by ``game.id``."""
        payload = dict(
            sport=game.sport,
            season=game.season,
            home_team_id=game.home_team_id,
            away_team_id=game.away_team_id,
            stadium_id=game.stadium_id,
            game_date=game.game_date,
            game_number=game.game_number,
            home_score=game.home_score,
            away_score=game.away_score,
            status=game.status,
        )
        return CloudKitRecord(
            record_name=game.id,
            record_type=RecordType.GAME,
            fields=payload,
        )

    def _team_to_record(self, team: Team) -> CloudKitRecord:
        """Build the CloudKitRecord for a Team, named by ``team.id``."""
        payload = dict(
            sport=team.sport,
            city=team.city,
            name=team.name,
            full_name=team.full_name,
            abbreviation=team.abbreviation,
            conference=team.conference,
            division=team.division,
            primary_color=team.primary_color,
            secondary_color=team.secondary_color,
            logo_url=team.logo_url,
            stadium_id=team.stadium_id,
        )
        return CloudKitRecord(
            record_name=team.id,
            record_type=RecordType.TEAM,
            fields=payload,
        )

    def _stadium_to_record(self, stadium: Stadium) -> CloudKitRecord:
        """Build the CloudKitRecord for a Stadium, named by ``stadium.id``."""
        payload = dict(
            sport=stadium.sport,
            name=stadium.name,
            city=stadium.city,
            state=stadium.state,
            country=stadium.country,
            latitude=stadium.latitude,
            longitude=stadium.longitude,
            capacity=stadium.capacity,
            surface=stadium.surface,
            roof_type=stadium.roof_type,
            opened_year=stadium.opened_year,
            image_url=stadium.image_url,
            timezone=stadium.timezone,
        )
        return CloudKitRecord(
            record_name=stadium.id,
            record_type=RecordType.STADIUM,
            fields=payload,
        )
|
||||
|
||||
|
||||
def game_to_cloudkit_record(game: Game) -> CloudKitRecord:
    """Build the CloudKitRecord for a single Game.

    Module-level shortcut for callers that need the conversion without
    running a diff; it delegates to a throwaway RecordDiffer.
    """
    return RecordDiffer()._game_to_record(game)
|
||||
|
||||
|
||||
def team_to_cloudkit_record(team: Team) -> CloudKitRecord:
    """Build the CloudKitRecord for a single Team.

    Module-level shortcut for callers that need the conversion without
    running a diff; it delegates to a throwaway RecordDiffer.
    """
    return RecordDiffer()._team_to_record(team)
|
||||
|
||||
|
||||
def stadium_to_cloudkit_record(stadium: Stadium) -> CloudKitRecord:
    """Build the CloudKitRecord for a single Stadium.

    Module-level shortcut for callers that need the conversion without
    running a diff; it delegates to a throwaway RecordDiffer.
    """
    return RecordDiffer()._stadium_to_record(stadium)
|
||||
Reference in New Issue
Block a user