Files
SportstimeAPI/scraper/tasks.py
Trey t 63acf7accb feat: add Django web app, CloudKit sync, dashboard, and game_datetime_utc export
Adds the full Django application layer on top of sportstime_parser:
- core: Sport, Team, Stadium, Game models with aliases and league structure
- scraper: orchestration engine, adapter, job management, Celery tasks
- cloudkit: CloudKit sync client, sync state tracking, sync jobs
- dashboard: staff dashboard for monitoring scrapers, sync, review queue
- notifications: email reports for scrape/sync results
- Docker setup for deployment (Dockerfile, docker-compose, entrypoint)

Game exports now use game_datetime_utc (ISO 8601 UTC) instead of
venue-local date+time strings, matching the canonical format used
by the iOS app.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 14:04:27 -06:00

183 lines
5.2 KiB
Python

import logging
import traceback
from datetime import datetime
from celery import shared_task
from django.utils import timezone
logger = logging.getLogger('scraper')
@shared_task(bind=True, max_retries=3)
def run_scraper_task(self, config_id: int, triggered_by: str = 'manual'):
    """
    Execute one scrape run for the given ScraperConfig.

    Creates a ScrapeJob row for this attempt, runs the sport-specific
    scraper, records result counters on both the job and the config,
    and queues an email notification. On failure the task retries up
    to ``max_retries`` times with a linearly growing countdown; each
    retry creates a fresh ScrapeJob row.
    """
    from scraper.models import ScraperConfig, ScrapeJob, ScrapeJobLog
    from notifications.tasks import send_scrape_notification

    # Resolve the configuration up front; nothing to do if it was deleted.
    try:
        config = ScraperConfig.objects.select_related('sport').get(id=config_id)
    except ScraperConfig.DoesNotExist:
        logger.error(f"ScraperConfig {config_id} not found")
        return {'error': 'Configuration not found'}

    # One ScrapeJob per attempt, tagged with the Celery task id so the
    # dashboard can correlate it with the worker.
    job = ScrapeJob.objects.create(
        config=config,
        status='running',
        triggered_by=triggered_by,
        started_at=timezone.now(),
        celery_task_id=self.request.id,
    )

    def log(level, message, source='', extra_data=None):
        # Persist the line for the dashboard, and mirror it to the
        # process logger at the same severity.
        ScrapeJobLog.objects.create(
            job=job,
            level=level,
            message=message,
            source=source,
            extra_data=extra_data,
        )
        getattr(logger, level)(f"[{config.sport.code}] {message}")

    try:
        log('info', f'Starting scraper for {config.sport.short_name} {config.season}')

        result = run_sport_scraper(config, log)

        # Copy the scraper's counters onto the job record, field by field.
        counter_fields = (
            ('games_found', 'games_found'),
            ('games_new', 'games_new'),
            ('games_updated', 'games_updated'),
            ('games_unchanged', 'games_unchanged'),
            ('games_errors', 'games_errors'),
            ('teams_found', 'teams_found'),
            ('stadiums_found', 'stadiums_found'),
            ('review_items_created', 'review_items'),
        )
        job.status = 'completed'
        job.finished_at = timezone.now()
        for attr, key in counter_fields:
            setattr(job, attr, result.get(key, 0))
        job.save()

        # Mirror the outcome onto the config for quick dashboard display.
        config.last_run = timezone.now()
        config.last_run_status = 'completed'
        config.last_run_games = result.get('games_found', 0)
        config.save()

        log('info', f'Scraper completed: {job.games_found} games, {job.games_new} new, {job.review_items_created} reviews')

        send_scrape_notification.delay(job.id)

        return {
            'job_id': job.id,
            'status': 'completed',
            'games_found': job.games_found,
            'games_new': job.games_new,
            'review_items': job.review_items_created,
        }

    except Exception as exc:
        failure_text = str(exc)
        failure_tb = traceback.format_exc()

        # Record the failure on both the job and the config before
        # deciding whether to retry.
        job.status = 'failed'
        job.finished_at = timezone.now()
        job.error_message = failure_text
        job.error_traceback = failure_tb
        job.save()

        config.last_run = timezone.now()
        config.last_run_status = 'failed'
        config.save()

        log('error', f'Scraper failed: {failure_text}', extra_data={'traceback': failure_tb})

        # Notify even on failure so operators see broken scrapers.
        send_scrape_notification.delay(job.id)

        # Retry with backoff: 60s, 120s, 180s. self.retry raises, so the
        # return below only runs once retries are exhausted.
        if self.request.retries < self.max_retries:
            raise self.retry(exc=exc, countdown=60 * (self.request.retries + 1))

        return {
            'job_id': job.id,
            'status': 'failed',
            'error': failure_text,
        }
def run_sport_scraper(config, log_func):
    """
    Run the appropriate scraper for the sport.

    Parameters
    ----------
    config : ScraperConfig
        Configuration row; ``config.sport.code`` selects which scraper
        the adapter loads, ``config.season`` selects the season.
    log_func : callable
        ``log(level, message, source='', extra_data=None)`` sink used
        to record progress.

    Returns
    -------
    dict
        Result counters (games_found, games_new, ...) produced by
        ``ScraperAdapter.run()``.
    """
    sport_code = config.sport.code
    season = config.season

    log_func('info', f'Loading scraper for {sport_code}', source='engine')

    # The adapter bridges the existing sportstime_parser scrapers to the
    # Django models. Imported lazily (after the log line) so an import
    # failure is still recorded against the job.
    # NOTE(review): the previous version also imported Game, Team,
    # Stadium and ManualReviewItem here, but never used them — removed.
    from scraper.engine.adapter import ScraperAdapter

    adapter = ScraperAdapter(
        sport_code=sport_code,
        season=season,
        config=config,
        log_func=log_func,
    )

    return adapter.run()
@shared_task
def run_all_enabled_scrapers():
    """
    Queue a scrape task for every enabled configuration.

    Invoked by celery-beat on its schedule; each config gets its own
    ``run_scraper_task`` so failures are isolated per sport.
    """
    from scraper.models import ScraperConfig

    enabled = ScraperConfig.objects.filter(is_enabled=True)
    for cfg in enabled:
        run_scraper_task.delay(cfg.id, triggered_by='scheduled')

    return {'configs_queued': enabled.count()}
@shared_task
def cleanup_old_jobs(days: int = 30):
    """
    Delete scrape job records older than *days* days.

    Parameters
    ----------
    days : int
        Age threshold; rows with ``created_at`` before now - days are
        removed. Defaults to 30.

    Returns
    -------
    dict
        ``{'jobs_deleted': int, 'logs_deleted': int}`` counts from the
        two delete() calls.
    """
    from datetime import timedelta
    from scraper.models import ScrapeJob, ScrapeJobLog

    # ``timezone`` is already imported at module level; the previous
    # version redundantly re-imported it here.
    cutoff = timezone.now() - timedelta(days=days)

    # Delete dependent log rows first (FK to ScrapeJob), then the jobs.
    logs_deleted, _ = ScrapeJobLog.objects.filter(
        job__created_at__lt=cutoff
    ).delete()
    jobs_deleted, _ = ScrapeJob.objects.filter(
        created_at__lt=cutoff
    ).delete()

    return {
        'jobs_deleted': jobs_deleted,
        'logs_deleted': logs_deleted,
    }