""" Scraper models for tracking scraping jobs and manual reviews. """ from django.db import models from simple_history.models import HistoricalRecords class ScraperConfig(models.Model): """ Configuration for a sport scraper per season. """ sport = models.ForeignKey( 'core.Sport', on_delete=models.CASCADE, related_name='scraper_configs' ) season = models.PositiveSmallIntegerField( help_text='Season year (start year for split seasons)' ) is_active = models.BooleanField( default=True, help_text='Whether this config is actively scraping' ) schedule_url = models.URLField( blank=True, help_text='Base URL for schedule scraping' ) scrape_interval_hours = models.PositiveSmallIntegerField( default=24, help_text='How often to run the scraper (hours)' ) last_scrape_at = models.DateTimeField( null=True, blank=True, help_text='When the last scrape completed' ) next_scrape_at = models.DateTimeField( null=True, blank=True, help_text='When the next scrape is scheduled' ) # Metadata created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) # Audit trail history = HistoricalRecords() class Meta: ordering = ['-season', 'sport'] unique_together = ['sport', 'season'] verbose_name = 'Scraper Config' verbose_name_plural = 'Scraper Configs' def __str__(self): return f"{self.sport.short_name} {self.sport.get_season_display(self.season)}" class ScrapeJob(models.Model): """ Record of a scraping job execution. """ STATUS_CHOICES = [ ('pending', 'Pending'), ('running', 'Running'), ('completed', 'Completed'), ('failed', 'Failed'), ('cancelled', 'Cancelled'), ] config = models.ForeignKey( ScraperConfig, on_delete=models.CASCADE, related_name='jobs' ) status = models.CharField( max_length=20, choices=STATUS_CHOICES, default='pending' ) started_at = models.DateTimeField( null=True, blank=True ) completed_at = models.DateTimeField( null=True, blank=True ) games_found = models.PositiveIntegerField(default=0) games_created = models.PositiveIntegerField(default=0) games_updated = models.PositiveIntegerField(default=0) errors = models.TextField(blank=True) log_output = models.TextField(blank=True) # Metadata created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) class Meta: ordering = ['-created_at'] verbose_name = 'Scrape Job' verbose_name_plural = 'Scrape Jobs' def __str__(self): return f"{self.config} - {self.status} ({self.created_at.strftime('%Y-%m-%d %H:%M')})" @property def duration(self): """Return job duration in seconds.""" if self.started_at and self.completed_at: return (self.completed_at - self.started_at).total_seconds() return None class ManualReviewItem(models.Model): """ Items flagged for manual review (fuzzy matches, conflicts, etc). """ STATUS_CHOICES = [ ('pending', 'Pending Review'), ('approved', 'Approved'), ('rejected', 'Rejected'), ('resolved', 'Resolved'), ] ITEM_TYPE_CHOICES = [ ('team', 'Team Match'), ('stadium', 'Stadium Match'), ('game', 'Game Conflict'), ('alias', 'New Alias'), ] sport = models.ForeignKey( 'core.Sport', on_delete=models.CASCADE, related_name='review_items' ) job = models.ForeignKey( ScrapeJob, on_delete=models.SET_NULL, null=True, blank=True, related_name='review_items' ) item_type = models.CharField( max_length=20, choices=ITEM_TYPE_CHOICES ) status = models.CharField( max_length=20, choices=STATUS_CHOICES, default='pending' ) raw_value = models.CharField( max_length=500, help_text='The raw scraped value' ) matched_value = models.CharField( max_length=500, blank=True, help_text='The matched canonical value (if any)' ) confidence = models.PositiveSmallIntegerField( default=0, help_text='Match confidence score (0-100)' ) context = models.JSONField( default=dict, blank=True, help_text='Additional context (game date, opposing team, etc)' ) resolution_notes = models.TextField( blank=True, help_text='Notes about the resolution' ) resolved_at = models.DateTimeField( null=True, blank=True ) resolved_by = models.ForeignKey( 'auth.User', on_delete=models.SET_NULL, null=True, blank=True, related_name='resolved_reviews' ) # Metadata created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) class Meta: ordering = ['-confidence', '-created_at'] verbose_name = 'Manual Review Item' verbose_name_plural = 'Manual Review Items' def __str__(self): return f"{self.item_type}: {self.raw_value} ({self.confidence}%)"