feat: add Django web app, CloudKit sync, dashboard, and game_datetime_utc export
Adds the full Django application layer on top of sportstime_parser:

- core: Sport, Team, Stadium, Game models with aliases and league structure
- scraper: orchestration engine, adapter, job management, Celery tasks
- cloudkit: CloudKit sync client, sync state tracking, sync jobs
- dashboard: staff dashboard for monitoring scrapers, sync, review queue
- notifications: email reports for scrape/sync results
- Docker setup for deployment (Dockerfile, docker-compose, entrypoint)

Game exports now use game_datetime_utc (ISO 8601 UTC) instead of venue-local date+time strings, matching the canonical format used by the iOS app.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
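For context on the export change, a minimal sketch of the conversion a game_datetime_utc export implies: turning a naive venue-local datetime into an ISO 8601 UTC string. The helper name and the venue-timezone parameter are assumptions for illustration, not code from this commit.

# Hypothetical sketch of the game_datetime_utc format; the helper name
# and the venue_tz parameter are assumptions, not this commit's code.
from datetime import datetime
from zoneinfo import ZoneInfo

def to_game_datetime_utc(local_dt: datetime, venue_tz: str) -> str:
    """Convert a naive venue-local datetime to an ISO 8601 UTC string."""
    aware = local_dt.replace(tzinfo=ZoneInfo(venue_tz))
    return aware.astimezone(ZoneInfo("UTC")).isoformat().replace("+00:00", "Z")

# e.g. to_game_datetime_utc(datetime(2025, 4, 1, 19, 5), "America/New_York")
# -> "2025-04-01T23:05:00Z"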
scraper/tasks.py (new file, 182 lines added)
@@ -0,0 +1,182 @@
import logging
import traceback
from datetime import datetime

from celery import shared_task
from django.utils import timezone

logger = logging.getLogger('scraper')


@shared_task(bind=True, max_retries=3)
def run_scraper_task(self, config_id: int, triggered_by: str = 'manual'):
    """
    Run a scraper job for the given configuration.
    """
    from scraper.models import ScraperConfig, ScrapeJob, ScrapeJobLog
    from notifications.tasks import send_scrape_notification

    # Get configuration
    try:
        config = ScraperConfig.objects.select_related('sport').get(id=config_id)
    except ScraperConfig.DoesNotExist:
        logger.error(f"ScraperConfig {config_id} not found")
        return {'error': 'Configuration not found'}

    # Create job record
    job = ScrapeJob.objects.create(
        config=config,
        status='running',
        triggered_by=triggered_by,
        started_at=timezone.now(),
        celery_task_id=self.request.id,
    )

    def log(level, message, source='', extra_data=None):
        ScrapeJobLog.objects.create(
            job=job,
            level=level,
            message=message,
            source=source,
            extra_data=extra_data,
        )
        getattr(logger, level)(f"[{config.sport.code}] {message}")

    try:
        log('info', f'Starting scraper for {config.sport.short_name} {config.season}')

        # Import and run the appropriate scraper
        result = run_sport_scraper(config, log)

        # Update job with results
        job.status = 'completed'
        job.finished_at = timezone.now()
        job.games_found = result.get('games_found', 0)
        job.games_new = result.get('games_new', 0)
        job.games_updated = result.get('games_updated', 0)
        job.games_unchanged = result.get('games_unchanged', 0)
        job.games_errors = result.get('games_errors', 0)
        job.teams_found = result.get('teams_found', 0)
        job.stadiums_found = result.get('stadiums_found', 0)
        job.review_items_created = result.get('review_items', 0)
        job.save()

        # Update config
        config.last_run = timezone.now()
        config.last_run_status = 'completed'
        config.last_run_games = result.get('games_found', 0)
        config.save()

        log('info', f'Scraper completed: {job.games_found} games, {job.games_new} new, {job.review_items_created} reviews')

        # Send notification
        send_scrape_notification.delay(job.id)

        return {
            'job_id': job.id,
            'status': 'completed',
            'games_found': job.games_found,
            'games_new': job.games_new,
            'review_items': job.review_items_created,
        }

    except Exception as e:
        error_msg = str(e)
        error_tb = traceback.format_exc()

        job.status = 'failed'
        job.finished_at = timezone.now()
        job.error_message = error_msg
        job.error_traceback = error_tb
        job.save()

        config.last_run = timezone.now()
        config.last_run_status = 'failed'
        config.save()

        log('error', f'Scraper failed: {error_msg}', extra_data={'traceback': error_tb})

        # Send failure notification
        send_scrape_notification.delay(job.id)

        # Retry if applicable
        if self.request.retries < self.max_retries:
            raise self.retry(exc=e, countdown=60 * (self.request.retries + 1))

        return {
            'job_id': job.id,
            'status': 'failed',
            'error': error_msg,
        }


def run_sport_scraper(config, log_func):
    """
    Run the appropriate scraper for the sport.
    Returns dict with results.
    """
    from core.models import Game, Team, Stadium
    from scraper.models import ManualReviewItem

    sport_code = config.sport.code
    season = config.season

    log_func('info', f'Loading scraper for {sport_code}', source='engine')

    # Import the scraper engine from sportstime_parser
    # This adapts the existing scrapers to work with Django models
    from scraper.engine.adapter import ScraperAdapter

    adapter = ScraperAdapter(
        sport_code=sport_code,
        season=season,
        config=config,
        log_func=log_func,
    )

    # Run the scraper
    result = adapter.run()

    return result


@shared_task
def run_all_enabled_scrapers():
    """
    Run all enabled scraper configurations.
    Called by celery-beat on schedule.
    """
    from scraper.models import ScraperConfig

    configs = ScraperConfig.objects.filter(is_enabled=True)
    for config in configs:
        run_scraper_task.delay(config.id, triggered_by='scheduled')

    return {'configs_queued': configs.count()}


@shared_task
def cleanup_old_jobs(days: int = 30):
    """
    Clean up old scrape job records.
    """
    from scraper.models import ScrapeJob, ScrapeJobLog
    from django.utils import timezone
    from datetime import timedelta

    cutoff = timezone.now() - timedelta(days=days)

    # Delete old logs first (foreign key)
    logs_deleted, _ = ScrapeJobLog.objects.filter(
        job__created_at__lt=cutoff
    ).delete()

    # Then delete old jobs
    jobs_deleted, _ = ScrapeJob.objects.filter(
        created_at__lt=cutoff
    ).delete()

    return {
        'jobs_deleted': jobs_deleted,
        'logs_deleted': logs_deleted,
    }
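The run_all_enabled_scrapers and cleanup_old_jobs tasks are meant to be driven by celery-beat. A minimal sketch of the beat schedule that would wire them up, assuming the standard django-celery settings layout; the entry names and cron times are illustrative, not taken from this commit:

# Hypothetical celery-beat wiring (e.g. in Django settings); the cron
# times and entry names are assumptions, not this commit's configuration.
from celery.schedules import crontab

CELERY_BEAT_SCHEDULE = {
    'run-all-enabled-scrapers': {
        'task': 'scraper.tasks.run_all_enabled_scrapers',
        'schedule': crontab(minute=0, hour='*/6'),  # every six hours
    },
    'cleanup-old-jobs': {
        'task': 'scraper.tasks.cleanup_old_jobs',
        'schedule': crontab(minute=0, hour=5),  # daily at 05:00
        'kwargs': {'days': 30},
    },
}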