Adds the full Django application layer on top of sportstime_parser:
- core: Sport, Team, Stadium, Game models with aliases and league structure
- scraper: orchestration engine, adapter, job management, Celery tasks
- cloudkit: CloudKit sync client, sync state tracking, sync jobs
- dashboard: staff dashboard for monitoring scrapers, sync, review queue
- notifications: email reports for scrape/sync results
- Docker setup for deployment (Dockerfile, docker-compose, entrypoint)

Game exports now use game_datetime_utc (ISO 8601 UTC) instead of
venue-local date+time strings, matching the canonical format used by the
iOS app.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
200 lines
5.3 KiB
Python
200 lines
5.3 KiB
Python
"""
|
|
Scraper models for tracking scraping jobs and manual reviews.
|
|
"""
|
|
from django.conf import settings
from django.db import models
from simple_history.models import HistoricalRecords
|
|
|
|
|
|
class ScraperConfig(models.Model):
    """
    Per-season scraper settings for one sport.

    A (sport, season) pair may appear only once. Rows are listed with the
    most recent season first, then by sport.
    """

    sport = models.ForeignKey(
        'core.Sport',
        related_name='scraper_configs',
        on_delete=models.CASCADE,
    )
    season = models.PositiveSmallIntegerField(
        help_text='Season year (start year for split seasons)',
    )
    is_active = models.BooleanField(
        help_text='Whether this config is actively scraping',
        default=True,
    )
    schedule_url = models.URLField(
        help_text='Base URL for schedule scraping',
        blank=True,
    )
    scrape_interval_hours = models.PositiveSmallIntegerField(
        help_text='How often to run the scraper (hours)',
        default=24,
    )
    last_scrape_at = models.DateTimeField(
        help_text='When the last scrape completed',
        blank=True,
        null=True,
    )
    next_scrape_at = models.DateTimeField(
        help_text='When the next scrape is scheduled',
        blank=True,
        null=True,
    )

    # Row bookkeeping: set on insert / refreshed on every save.
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    # Change history, provided by django-simple-history.
    history = HistoricalRecords()

    class Meta:
        verbose_name = 'Scraper Config'
        verbose_name_plural = 'Scraper Configs'
        unique_together = ['sport', 'season']
        ordering = ['-season', 'sport']

    def __str__(self):
        """Return the sport's short name followed by its season label."""
        sport = self.sport
        short_name = sport.short_name
        season_label = sport.get_season_display(self.season)
        return f"{short_name} {season_label}"
|
|
|
|
|
|
class ScrapeJob(models.Model):
    """
    Record of a scraping job execution.

    Each job belongs to one ScraperConfig and stores its lifecycle status,
    start/finish timestamps, game counters, and captured error/log text.
    """

    STATUS_CHOICES = [
        ('pending', 'Pending'),
        ('running', 'Running'),
        ('completed', 'Completed'),
        ('failed', 'Failed'),
        ('cancelled', 'Cancelled'),
    ]

    # Deleting a config cascades to its jobs.
    config = models.ForeignKey(
        ScraperConfig,
        on_delete=models.CASCADE,
        related_name='jobs'
    )
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='pending'
    )
    # Both timestamps stay NULL until they are explicitly set.
    started_at = models.DateTimeField(
        null=True,
        blank=True
    )
    completed_at = models.DateTimeField(
        null=True,
        blank=True
    )
    games_found = models.PositiveIntegerField(default=0)
    games_created = models.PositiveIntegerField(default=0)
    games_updated = models.PositiveIntegerField(default=0)
    errors = models.TextField(blank=True)
    log_output = models.TextField(blank=True)

    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['-created_at']
        verbose_name = 'Scrape Job'
        verbose_name_plural = 'Scrape Jobs'

    def __str__(self):
        """
        Return '<config> - <status> (<creation time>)'.

        ``created_at`` is filled in by ``auto_now_add`` only when the row is
        first saved, so an instance that has not been saved yet has no
        timestamp. A placeholder is used in that case instead of raising
        ``AttributeError`` on ``None.strftime``.
        """
        if self.created_at is None:
            created_label = 'unsaved'
        else:
            created_label = self.created_at.strftime('%Y-%m-%d %H:%M')
        return f"{self.config} - {self.status} ({created_label})"

    @property
    def duration(self):
        """
        Return job duration in seconds.

        Returns None unless both ``started_at`` and ``completed_at`` are set.
        """
        if self.started_at and self.completed_at:
            return (self.completed_at - self.started_at).total_seconds()
        return None
|
|
|
|
|
|
class ManualReviewItem(models.Model):
    """
    Items flagged for manual review (fuzzy matches, conflicts, etc).

    Each item records the raw scraped value, the candidate canonical match
    (if any), a confidence score, and who resolved it and when. Items are
    listed with the highest confidence first, then newest first.
    """

    STATUS_CHOICES = [
        ('pending', 'Pending Review'),
        ('approved', 'Approved'),
        ('rejected', 'Rejected'),
        ('resolved', 'Resolved'),
    ]

    ITEM_TYPE_CHOICES = [
        ('team', 'Team Match'),
        ('stadium', 'Stadium Match'),
        ('game', 'Game Conflict'),
        ('alias', 'New Alias'),
    ]

    sport = models.ForeignKey(
        'core.Sport',
        on_delete=models.CASCADE,
        related_name='review_items'
    )
    # SET_NULL: the review item outlives the job that produced it.
    job = models.ForeignKey(
        ScrapeJob,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='review_items'
    )
    item_type = models.CharField(
        max_length=20,
        choices=ITEM_TYPE_CHOICES
    )
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='pending'
    )
    raw_value = models.CharField(
        max_length=500,
        help_text='The raw scraped value'
    )
    matched_value = models.CharField(
        max_length=500,
        blank=True,
        help_text='The matched canonical value (if any)'
    )
    # NOTE(review): the 0-100 range is stated in help_text only; no validator
    # on this field enforces it.
    confidence = models.PositiveSmallIntegerField(
        default=0,
        help_text='Match confidence score (0-100)'
    )
    context = models.JSONField(
        default=dict,
        blank=True,
        help_text='Additional context (game date, opposing team, etc)'
    )
    resolution_notes = models.TextField(
        blank=True,
        help_text='Notes about the resolution'
    )
    resolved_at = models.DateTimeField(
        null=True,
        blank=True
    )
    # Reference the user model through the AUTH_USER_MODEL setting rather
    # than hard-coding 'auth.User', so the relation still resolves if the
    # project swaps in a custom user model. With default settings this is
    # the same model as before.
    resolved_by = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='resolved_reviews'
    )

    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['-confidence', '-created_at']
        verbose_name = 'Manual Review Item'
        verbose_name_plural = 'Manual Review Items'

    def __str__(self):
        """Return '<item_type>: <raw_value> (<confidence>%)'."""
        return f"{self.item_type}: {self.raw_value} ({self.confidence}%)"
|