feat: add Django web app, CloudKit sync, dashboard, and game_datetime_utc export

Adds the full Django application layer on top of sportstime_parser:
- core: Sport, Team, Stadium, Game models with aliases and league structure
- scraper: orchestration engine, adapter, job management, Celery tasks
- cloudkit: CloudKit sync client, sync state tracking, sync jobs
- dashboard: staff dashboard for monitoring scrapers, sync, review queue
- notifications: email reports for scrape/sync results
- Docker setup for deployment (Dockerfile, docker-compose, entrypoint)

Game exports now use game_datetime_utc (ISO 8601 UTC) instead of
venue-local date+time strings, matching the canonical format used
by the iOS app.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-02-19 14:04:27 -06:00
parent 4353d5943c
commit 63acf7accb
114 changed files with 13070 additions and 887 deletions

199
scraper/models.py Normal file
View File

@@ -0,0 +1,199 @@
"""
Scraper models for tracking scraping jobs and manual reviews.
"""
from django.conf import settings
from django.db import models
from simple_history.models import HistoricalRecords
class ScraperConfig(models.Model):
    """
    Per-season scraping configuration for a single sport.

    One row pairs a sport with a season year and controls whether, where,
    and how often the schedule scraper runs for that pairing.
    """

    # Lazy string reference avoids importing the core app's models here.
    sport = models.ForeignKey(
        'core.Sport',
        on_delete=models.CASCADE,
        related_name='scraper_configs',
    )
    season = models.PositiveSmallIntegerField(
        help_text='Season year (start year for split seasons)',
    )
    is_active = models.BooleanField(
        default=True,
        help_text='Whether this config is actively scraping',
    )
    schedule_url = models.URLField(
        blank=True,
        help_text='Base URL for schedule scraping',
    )
    scrape_interval_hours = models.PositiveSmallIntegerField(
        default=24,
        help_text='How often to run the scraper (hours)',
    )
    last_scrape_at = models.DateTimeField(
        null=True,
        blank=True,
        help_text='When the last scrape completed',
    )
    next_scrape_at = models.DateTimeField(
        null=True,
        blank=True,
        help_text='When the next scrape is scheduled',
    )

    # Bookkeeping timestamps maintained automatically by Django.
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    # Full change history via django-simple-history.
    history = HistoricalRecords()

    class Meta:
        ordering = ['-season', 'sport']
        unique_together = ['sport', 'season']
        verbose_name = 'Scraper Config'
        verbose_name_plural = 'Scraper Configs'

    def __str__(self):
        """Human-readable label: sport short name plus season display."""
        season_label = self.sport.get_season_display(self.season)
        return f"{self.sport.short_name} {season_label}"
class ScrapeJob(models.Model):
    """
    Record of one execution of the scraper for a ScraperConfig.

    Tracks lifecycle status, start/end timing, per-run game counters,
    and captured error/log output.
    """

    STATUS_CHOICES = [
        ('pending', 'Pending'),
        ('running', 'Running'),
        ('completed', 'Completed'),
        ('failed', 'Failed'),
        ('cancelled', 'Cancelled'),
    ]

    # Deleting a config removes its job history as well.
    config = models.ForeignKey(
        ScraperConfig,
        on_delete=models.CASCADE,
        related_name='jobs',
    )
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='pending',
    )
    started_at = models.DateTimeField(null=True, blank=True)
    completed_at = models.DateTimeField(null=True, blank=True)

    # Counters summarizing what the run produced.
    games_found = models.PositiveIntegerField(default=0)
    games_created = models.PositiveIntegerField(default=0)
    games_updated = models.PositiveIntegerField(default=0)
    errors = models.TextField(blank=True)
    log_output = models.TextField(blank=True)

    # Bookkeeping timestamps maintained automatically by Django.
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['-created_at']
        verbose_name = 'Scrape Job'
        verbose_name_plural = 'Scrape Jobs'

    def __str__(self):
        """Label combining config, status, and creation time."""
        when = self.created_at.strftime('%Y-%m-%d %H:%M')
        return f"{self.config} - {self.status} ({when})"

    @property
    def duration(self):
        """Elapsed seconds between start and completion, or None if
        either timestamp is unset."""
        if self.started_at is None or self.completed_at is None:
            return None
        return (self.completed_at - self.started_at).total_seconds()
class ManualReviewItem(models.Model):
    """
    An item flagged during scraping for human review.

    Covers fuzzy team/stadium matches, conflicting game data, and newly
    discovered aliases. Staff resolve items via the dashboard, recording
    the outcome, who resolved it, and when.
    """

    STATUS_CHOICES = [
        ('pending', 'Pending Review'),
        ('approved', 'Approved'),
        ('rejected', 'Rejected'),
        ('resolved', 'Resolved'),
    ]
    ITEM_TYPE_CHOICES = [
        ('team', 'Team Match'),
        ('stadium', 'Stadium Match'),
        ('game', 'Game Conflict'),
        ('alias', 'New Alias'),
    ]

    # Lazy string reference avoids importing the core app's models here.
    sport = models.ForeignKey(
        'core.Sport',
        on_delete=models.CASCADE,
        related_name='review_items'
    )
    # The scrape job that produced this item; kept (nulled) if the job
    # record is deleted so the review queue survives job cleanup.
    job = models.ForeignKey(
        ScrapeJob,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='review_items'
    )
    item_type = models.CharField(
        max_length=20,
        choices=ITEM_TYPE_CHOICES
    )
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='pending'
    )
    raw_value = models.CharField(
        max_length=500,
        help_text='The raw scraped value'
    )
    matched_value = models.CharField(
        max_length=500,
        blank=True,
        help_text='The matched canonical value (if any)'
    )
    confidence = models.PositiveSmallIntegerField(
        default=0,
        help_text='Match confidence score (0-100)'
    )
    context = models.JSONField(
        default=dict,
        blank=True,
        help_text='Additional context (game date, opposing team, etc)'
    )
    resolution_notes = models.TextField(
        blank=True,
        help_text='Notes about the resolution'
    )
    resolved_at = models.DateTimeField(
        null=True,
        blank=True
    )
    # Reference the configured user model instead of hard-coding
    # 'auth.User', so projects that swap in a custom AUTH_USER_MODEL
    # still work (per Django's "Referencing the User model" guidance).
    resolved_by = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='resolved_reviews'
    )

    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['-confidence', '-created_at']
        verbose_name = 'Manual Review Item'
        verbose_name_plural = 'Manual Review Items'

    def __str__(self):
        return f"{self.item_type}: {self.raw_value} ({self.confidence}%)"