feat: add Django web app, CloudKit sync, dashboard, and game_datetime_utc export
Adds the full Django application layer on top of sportstime_parser: - core: Sport, Team, Stadium, Game models with aliases and league structure - scraper: orchestration engine, adapter, job management, Celery tasks - cloudkit: CloudKit sync client, sync state tracking, sync jobs - dashboard: staff dashboard for monitoring scrapers, sync, review queue - notifications: email reports for scrape/sync results - Docker setup for deployment (Dockerfile, docker-compose, entrypoint) Game exports now use game_datetime_utc (ISO 8601 UTC) instead of venue-local date+time strings, matching the canonical format used by the iOS app. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
10
scraper/models/__init__.py
Normal file
10
scraper/models/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
"""Public interface of the ``scraper.models`` package.

Re-exports the scraper's Django models so callers can write
``from scraper.models import ScrapeJob`` without knowing which
submodule defines each model.
"""

from .config import ScraperConfig
from .job import ScrapeJob, ScrapeJobLog
from .review import ManualReviewItem

# Explicit public API: limits `from scraper.models import *` and
# documentation tools to the models themselves.
__all__ = [
    'ScraperConfig',
    'ScrapeJob',
    'ScrapeJobLog',
    'ManualReviewItem',
]
102
scraper/models/config.py
Normal file
102
scraper/models/config.py
Normal file
@@ -0,0 +1,102 @@
|
||||
from django.db import models
|
||||
from django.conf import settings
|
||||
from simple_history.models import HistoricalRecords
|
||||
|
||||
|
||||
class ScraperConfig(models.Model):
    """
    Configuration for a sport's scraper.

    One row per (sport, season) pair — enforced by ``unique_together``.
    Holds the ordered list of sources to scrape, rate-limiting and
    fuzzy-matching knobs, and a denormalised summary of the last run
    (``last_run*`` fields). Changes are audited via django-simple-history.
    """
    sport = models.ForeignKey(
        'core.Sport',
        on_delete=models.CASCADE,
        related_name='scraper_configs'
    )
    season = models.PositiveSmallIntegerField(
        help_text='Season to scrape (e.g., 2025 for 2025-26 season)'
    )
    is_enabled = models.BooleanField(
        default=True,
        help_text='Whether this scraper is enabled for scheduling'
    )

    # Source configuration
    sources = models.JSONField(
        default=list,
        help_text='Ordered list of sources to try (e.g., ["basketball_reference", "espn"])'
    )
    primary_source = models.CharField(
        max_length=100,
        blank=True,
        help_text='Primary source for this scraper'
    )

    # Rate limiting.
    # NOTE(review): these defaults are read from settings at import time and
    # get baked into migrations; changing the settings later will not affect
    # existing rows — confirm that is intended.
    request_delay = models.FloatField(
        default=settings.SCRAPER_REQUEST_DELAY,
        help_text='Seconds between requests'
    )
    max_retries = models.PositiveSmallIntegerField(
        default=settings.SCRAPER_MAX_RETRIES,
        help_text='Maximum retry attempts'
    )

    # Fuzzy matching
    fuzzy_threshold = models.PositiveSmallIntegerField(
        default=settings.SCRAPER_FUZZY_THRESHOLD,
        help_text='Minimum fuzzy match confidence (0-100)'
    )

    # Scheduling — denormalised summary of the most recent run, maintained
    # by the scraping engine (not derived from ScrapeJob rows at read time).
    last_run = models.DateTimeField(
        null=True,
        blank=True,
        help_text='Last successful run timestamp'
    )
    last_run_status = models.CharField(
        max_length=20,
        blank=True,
        help_text='Status of last run'
    )
    last_run_games = models.PositiveIntegerField(
        default=0,
        help_text='Games found in last run'
    )

    # Notes
    notes = models.TextField(
        blank=True,
        help_text='Configuration notes'
    )

    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    # Audit trail
    history = HistoricalRecords()

    class Meta:
        ordering = ['sport', 'season']
        # One config per sport per season.
        unique_together = ['sport', 'season']
        verbose_name = 'Scraper Configuration'
        verbose_name_plural = 'Scraper Configurations'

    def __str__(self):
        # e.g. "NBA 2025-26" — season formatting is delegated to Sport.
        return f"{self.sport.short_name} {self.sport.get_season_display(self.season)}"

    def get_sources_list(self):
        """Return sources as list, using defaults if empty."""
        if self.sources:
            return self.sources
        # Default sources per sport
        defaults = {
            'nba': ['basketball_reference', 'espn'],
            'mlb': ['baseball_reference', 'mlb_api', 'espn'],
            'nfl': ['espn', 'pro_football_reference'],
            'nhl': ['hockey_reference', 'nhl_api', 'espn'],
            'mls': ['espn'],
            'wnba': ['espn'],
            'nwsl': ['espn'],
        }
        # ESPN covers every supported sport, so it is the fallback source.
        return defaults.get(self.sport.code, ['espn'])
||||
159
scraper/models/job.py
Normal file
159
scraper/models/job.py
Normal file
@@ -0,0 +1,159 @@
|
||||
from django.db import models
|
||||
from simple_history.models import HistoricalRecords
|
||||
|
||||
|
||||
class ScrapeJob(models.Model):
    """
    Record of a scraping job execution.

    Created when a scrape is triggered (manually, on a schedule, or via
    the API) and updated as the run progresses. The ``games_*`` /
    ``teams_found`` / ``stadiums_found`` counters summarise the outcome;
    per-event detail lives in related ScrapeJobLog rows (``self.logs``).
    """
    STATUS_CHOICES = [
        ('pending', 'Pending'),
        ('running', 'Running'),
        ('completed', 'Completed'),
        ('failed', 'Failed'),
        ('cancelled', 'Cancelled'),
    ]

    config = models.ForeignKey(
        'scraper.ScraperConfig',
        on_delete=models.CASCADE,
        related_name='jobs'
    )
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='pending'
    )
    triggered_by = models.CharField(
        max_length=50,
        default='manual',
        help_text='How the job was triggered (manual, scheduled, api)'
    )

    # Timing
    started_at = models.DateTimeField(null=True, blank=True)
    finished_at = models.DateTimeField(null=True, blank=True)

    # Results
    games_found = models.PositiveIntegerField(default=0)
    games_new = models.PositiveIntegerField(default=0)
    games_updated = models.PositiveIntegerField(default=0)
    games_unchanged = models.PositiveIntegerField(default=0)
    games_errors = models.PositiveIntegerField(default=0)

    teams_found = models.PositiveIntegerField(default=0)
    stadiums_found = models.PositiveIntegerField(default=0)
    review_items_created = models.PositiveIntegerField(default=0)

    # Error tracking
    error_message = models.TextField(blank=True)
    error_traceback = models.TextField(blank=True)

    # Celery task ID for tracking
    celery_task_id = models.CharField(
        max_length=255,
        blank=True,
        help_text='Celery task ID for this job'
    )

    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        # Newest jobs first (dashboard/admin list order).
        ordering = ['-created_at']
        verbose_name = 'Scrape Job'
        verbose_name_plural = 'Scrape Jobs'
        indexes = [
            models.Index(fields=['config', 'status']),
            models.Index(fields=['status', 'created_at']),
        ]

    def __str__(self):
        return f"{self.config} - {self.created_at.strftime('%Y-%m-%d %H:%M')}"

    @property
    def duration(self):
        """Return job duration as timedelta, or None if the job has not
        both started and finished."""
        if self.started_at and self.finished_at:
            return self.finished_at - self.started_at
        return None

    @property
    def duration_display(self):
        """Return duration formatted as '3m 12s' / '45s', or '-' if unknown."""
        duration = self.duration
        # Compare against None rather than truthiness: a zero-length
        # timedelta is falsy, so a job whose start and finish timestamps
        # coincide would otherwise incorrectly display '-' instead of '0s'.
        if duration is not None:
            total_seconds = int(duration.total_seconds())
            minutes, seconds = divmod(total_seconds, 60)
            if minutes > 0:
                return f"{minutes}m {seconds}s"
            return f"{seconds}s"
        return '-'

    @property
    def sport(self):
        # Convenience passthrough to the owning config's sport.
        return self.config.sport

    @property
    def season(self):
        # Convenience passthrough to the owning config's season.
        return self.config.season

    def get_summary(self):
        """Return summary dict for notifications."""
        return {
            'sport': self.config.sport.short_name,
            'season': self.config.sport.get_season_display(self.config.season),
            'status': self.status,
            'duration': self.duration_display,
            'games_found': self.games_found,
            'games_new': self.games_new,
            'games_updated': self.games_updated,
            'games_unchanged': self.games_unchanged,
            'games_errors': self.games_errors,
            'review_items': self.review_items_created,
            'error_message': self.error_message,
        }
|
||||
|
||||
class ScrapeJobLog(models.Model):
    """
    Log entries for a scrape job.

    Append-only rows, one per logged event, attached to a ScrapeJob via
    ``job.logs``. ``extra_data`` carries optional structured context.
    """
    LEVEL_CHOICES = [
        ('debug', 'Debug'),
        ('info', 'Info'),
        ('warning', 'Warning'),
        ('error', 'Error'),
    ]

    job = models.ForeignKey(
        ScrapeJob,
        on_delete=models.CASCADE,
        related_name='logs'
    )
    level = models.CharField(
        max_length=10,
        choices=LEVEL_CHOICES,
        default='info'
    )
    message = models.TextField()
    source = models.CharField(
        max_length=100,
        blank=True,
        help_text='Source/component that generated this log'
    )
    extra_data = models.JSONField(
        null=True,
        blank=True,
        help_text='Additional structured data'
    )
    created_at = models.DateTimeField(auto_now_add=True)

    class Meta:
        # Chronological order so a job's log reads top-to-bottom.
        ordering = ['created_at']
        verbose_name = 'Scrape Job Log'
        verbose_name_plural = 'Scrape Job Logs'

    def __str__(self):
        # Truncate the message to keep admin list displays readable.
        return f"[{self.level.upper()}] {self.message[:50]}"
|
||||
192
scraper/models/review.py
Normal file
192
scraper/models/review.py
Normal file
@@ -0,0 +1,192 @@
|
||||
from django.conf import settings
from django.db import models

from simple_history.models import HistoricalRecords
||||
|
||||
|
||||
class ManualReviewItem(models.Model):
    """
    Items that require manual review before resolution.

    Created by the scraper when an entity name (team or stadium) cannot be
    confidently matched to a canonical record. A staff member resolves the
    item via :meth:`resolve` (optionally creating an alias for future
    automatic matching) or dismisses it via :meth:`ignore`. Changes are
    audited via django-simple-history.
    """
    ITEM_TYPE_CHOICES = [
        ('team', 'Team'),
        ('stadium', 'Stadium'),
    ]

    STATUS_CHOICES = [
        ('pending', 'Pending Review'),
        ('resolved', 'Resolved'),
        ('ignored', 'Ignored'),
        ('new_entity', 'Created New Entity'),
    ]

    REASON_CHOICES = [
        ('no_match', 'No Match Found'),
        ('low_confidence', 'Low Confidence Match'),
        ('ambiguous', 'Ambiguous Match'),
        ('new_entity', 'Potentially New Entity'),
    ]

    job = models.ForeignKey(
        'scraper.ScrapeJob',
        on_delete=models.CASCADE,
        related_name='review_items',
        null=True,
        blank=True,
        help_text='Job that created this review item'
    )
    item_type = models.CharField(
        max_length=20,
        choices=ITEM_TYPE_CHOICES
    )
    sport = models.ForeignKey(
        'core.Sport',
        on_delete=models.CASCADE,
        related_name='review_items'
    )

    # Raw value from scraping
    raw_value = models.CharField(
        max_length=300,
        help_text='Original scraped value'
    )

    # Suggested resolution
    suggested_id = models.CharField(
        max_length=100,
        blank=True,
        help_text='Suggested canonical ID (if any match found)'
    )
    confidence = models.FloatField(
        default=0.0,
        help_text='Match confidence (0.0 - 1.0)'
    )
    reason = models.CharField(
        max_length=20,
        choices=REASON_CHOICES,
        help_text='Why manual review is needed'
    )

    # Context
    source_url = models.URLField(
        blank=True,
        help_text='URL where this value was found'
    )
    check_date = models.DateField(
        null=True,
        blank=True,
        help_text='Date context for alias resolution'
    )
    context = models.JSONField(
        null=True,
        blank=True,
        help_text='Additional context (e.g., game info)'
    )

    # Resolution
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='pending'
    )
    resolved_to = models.CharField(
        max_length=100,
        blank=True,
        help_text='Final resolved canonical ID'
    )
    # Reference the swappable user model rather than hard-coding
    # 'auth.User', per Django's "referencing the User model" guidance —
    # a hard-coded reference breaks if AUTH_USER_MODEL is customised.
    resolved_by = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='resolved_review_items'
    )
    resolved_at = models.DateTimeField(null=True, blank=True)
    resolution_notes = models.TextField(
        blank=True,
        help_text='Notes about the resolution'
    )
    create_alias = models.BooleanField(
        default=False,
        help_text='Whether to create an alias from this resolution'
    )

    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    # Audit trail
    history = HistoricalRecords()

    class Meta:
        ordering = ['-created_at']
        verbose_name = 'Manual Review Item'
        verbose_name_plural = 'Manual Review Items'
        indexes = [
            models.Index(fields=['status', 'item_type']),
            models.Index(fields=['sport', 'status']),
            models.Index(fields=['raw_value']),
        ]

    def __str__(self):
        return f"{self.item_type}: {self.raw_value} ({self.get_status_display()})"

    @property
    def confidence_display(self):
        """Return confidence as percentage string."""
        return f"{self.confidence * 100:.0f}%"

    def resolve(self, canonical_id, user=None, notes='', create_alias=False):
        """Resolve this review item.

        Marks the item resolved to *canonical_id*, records who/when/why,
        saves, and optionally creates a matching alias so future scrapes
        resolve ``raw_value`` automatically.
        """
        from django.utils import timezone
        self.status = 'resolved'
        self.resolved_to = canonical_id
        self.resolved_by = user
        self.resolved_at = timezone.now()
        self.resolution_notes = notes
        self.create_alias = create_alias
        self.save()

        # Optionally create alias
        if create_alias and canonical_id:
            self._create_alias(canonical_id)

    def _create_alias(self, canonical_id):
        """Create an alias from this resolution (best-effort).

        If *canonical_id* does not exist, the alias is silently skipped —
        the review item itself is already saved as resolved.
        """
        # Imported locally: core.models imports from this app's package,
        # so a module-level import would be circular.
        from core.models import TeamAlias, StadiumAlias, Team, Stadium

        if self.item_type == 'team':
            try:
                team = Team.objects.get(id=canonical_id)
                # get_or_create: re-resolving the same raw value must not
                # raise on the alias's uniqueness constraint.
                TeamAlias.objects.get_or_create(
                    team=team,
                    alias=self.raw_value,
                    defaults={
                        'alias_type': 'historical',
                        'source': 'manual_review',
                        'notes': f'Created from review item #{self.id}',
                    }
                )
            except Team.DoesNotExist:
                pass
        elif self.item_type == 'stadium':
            try:
                stadium = Stadium.objects.get(id=canonical_id)
                StadiumAlias.objects.get_or_create(
                    stadium=stadium,
                    alias=self.raw_value,
                    defaults={
                        'alias_type': 'former',
                        'source': 'manual_review',
                        'notes': f'Created from review item #{self.id}',
                    }
                )
            except Stadium.DoesNotExist:
                pass

    def ignore(self, user=None, notes=''):
        """Mark this review item as ignored."""
        from django.utils import timezone
        self.status = 'ignored'
        self.resolved_by = user
        self.resolved_at = timezone.now()
        self.resolution_notes = notes
        self.save()
||||
Reference in New Issue
Block a user