Adds the full Django application layer on top of sportstime_parser: - core: Sport, Team, Stadium, Game models with aliases and league structure - scraper: orchestration engine, adapter, job management, Celery tasks - cloudkit: CloudKit sync client, sync state tracking, sync jobs - dashboard: staff dashboard for monitoring scrapers, sync, review queue - notifications: email reports for scrape/sync results - Docker setup for deployment (Dockerfile, docker-compose, entrypoint) Game exports now use game_datetime_utc (ISO 8601 UTC) instead of venue-local date+time strings, matching the canonical format used by the iOS app. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
183 lines
5.2 KiB
Python
import logging
|
|
import traceback
|
|
from datetime import datetime
|
|
|
|
from celery import shared_task
|
|
from django.utils import timezone
|
|
|
|
logger = logging.getLogger('scraper')
|
|
|
|
|
|
@shared_task(bind=True, max_retries=3)
def run_scraper_task(self, config_id: int, triggered_by: str = 'manual'):
    """
    Run a scraper job for the given configuration.

    Creates a ScrapeJob row up front, executes the sport-specific scraper via
    run_sport_scraper, records result counters on the job and its config, and
    sends an email notification. On failure the job/config are marked failed,
    a notification is sent, and the task is retried with a linearly growing
    countdown (60s, 120s, 180s) up to max_retries.

    Args:
        config_id: primary key of the ScraperConfig to run.
        triggered_by: provenance tag stored on the job ('manual' or
            'scheduled').

    Returns:
        dict summarizing the run ('job_id', 'status', counters), or
        {'error': ...} when the config does not exist. When a retry is
        scheduled, self.retry raises and nothing is returned.
    """
    # Imported lazily — presumably to avoid circular imports at Django app
    # load time (this module is imported by celery config). TODO confirm.
    from scraper.models import ScraperConfig, ScrapeJob, ScrapeJobLog
    from notifications.tasks import send_scrape_notification

    # Get configuration
    try:
        config = ScraperConfig.objects.select_related('sport').get(id=config_id)
    except ScraperConfig.DoesNotExist:
        logger.error(f"ScraperConfig {config_id} not found")
        return {'error': 'Configuration not found'}

    # Create job record before running, so a crash mid-scrape still leaves a
    # 'running'/'failed' row to inspect. Each retry attempt creates a new job.
    job = ScrapeJob.objects.create(
        config=config,
        status='running',
        triggered_by=triggered_by,
        started_at=timezone.now(),
        celery_task_id=self.request.id,
    )

    def log(level, message, source='', extra_data=None):
        # Persist the entry on the job AND mirror it to the python logger.
        # `level` must be a valid logging method name ('info', 'error', ...).
        ScrapeJobLog.objects.create(
            job=job,
            level=level,
            message=message,
            source=source,
            extra_data=extra_data,
        )
        getattr(logger, level)(f"[{config.sport.code}] {message}")

    try:
        log('info', f'Starting scraper for {config.sport.short_name} {config.season}')

        # Import and run the appropriate scraper
        result = run_sport_scraper(config, log)

        # Update job with results
        job.status = 'completed'
        job.finished_at = timezone.now()
        job.games_found = result.get('games_found', 0)
        job.games_new = result.get('games_new', 0)
        job.games_updated = result.get('games_updated', 0)
        job.games_unchanged = result.get('games_unchanged', 0)
        job.games_errors = result.get('games_errors', 0)
        job.teams_found = result.get('teams_found', 0)
        job.stadiums_found = result.get('stadiums_found', 0)
        job.review_items_created = result.get('review_items', 0)
        job.save()

        # Update config with last-run bookkeeping shown in the dashboard.
        config.last_run = timezone.now()
        config.last_run_status = 'completed'
        config.last_run_games = result.get('games_found', 0)
        config.save()

        log('info', f'Scraper completed: {job.games_found} games, {job.games_new} new, {job.review_items_created} reviews')

        # Send notification
        send_scrape_notification.delay(job.id)

        return {
            'job_id': job.id,
            'status': 'completed',
            'games_found': job.games_found,
            'games_new': job.games_new,
            'review_items': job.review_items_created,
        }

    except Exception as e:
        error_msg = str(e)
        error_tb = traceback.format_exc()

        job.status = 'failed'
        job.finished_at = timezone.now()
        job.error_message = error_msg
        job.error_traceback = error_tb
        job.save()

        config.last_run = timezone.now()
        config.last_run_status = 'failed'
        config.save()

        log('error', f'Scraper failed: {error_msg}', extra_data={'traceback': error_tb})

        # Send failure notification.
        # NOTE(review): this fires on EVERY failed attempt, including ones
        # that are about to be retried below — confirm that per-attempt
        # failure emails are intended.
        send_scrape_notification.delay(job.id)

        # Retry if applicable — self.retry raises Retry, so the return below
        # is only reached once retries are exhausted.
        if self.request.retries < self.max_retries:
            raise self.retry(exc=e, countdown=60 * (self.request.retries + 1))

        return {
            'job_id': job.id,
            'status': 'failed',
            'error': error_msg,
        }
|
|
|
|
|
|
def run_sport_scraper(config, log_func):
    """
    Run the appropriate scraper for the sport referenced by *config*.

    Args:
        config: ScraperConfig instance; ``config.sport.code`` selects the
            scraper and ``config.season`` selects the season to fetch.
        log_func: callable ``(level, message, source='', extra_data=None)``
            used for persistent job logging (see run_scraper_task.log).

    Returns:
        dict of result counters produced by the adapter (keys such as
        'games_found', 'games_new', 'review_items' — see run_scraper_task).
    """
    sport_code = config.sport.code
    season = config.season

    log_func('info', f'Loading scraper for {sport_code}', source='engine')

    # The adapter bridges the existing sportstime_parser scrapers to the
    # Django models; imported lazily — presumably so Django apps are fully
    # loaded before the engine pulls in its dependencies. TODO confirm.
    from scraper.engine.adapter import ScraperAdapter

    adapter = ScraperAdapter(
        sport_code=sport_code,
        season=season,
        config=config,
        log_func=log_func,
    )

    # Run the scraper and hand the counters back to the caller.
    return adapter.run()
|
|
|
|
|
|
@shared_task
def run_all_enabled_scrapers():
    """
    Queue a scrape for every enabled scraper configuration.

    Invoked by celery-beat on its schedule; each enabled configuration is
    dispatched as its own independent run_scraper_task.
    """
    from scraper.models import ScraperConfig

    queued = 0
    for cfg in ScraperConfig.objects.filter(is_enabled=True):
        run_scraper_task.delay(cfg.id, triggered_by='scheduled')
        queued += 1

    return {'configs_queued': queued}
|
|
|
|
|
|
@shared_task
def cleanup_old_jobs(days: int = 30):
    """
    Delete scrape job records (and their logs) older than *days* days.

    Args:
        days: retention window; rows with ``created_at`` before
            ``now - days`` are removed.

    Returns:
        dict with 'jobs_deleted' and 'logs_deleted' counts (number of rows
        removed from each table, as reported by QuerySet.delete).
    """
    from datetime import timedelta

    from scraper.models import ScrapeJob, ScrapeJobLog

    # `timezone` comes from the module-level django.utils import; the
    # redundant function-local re-import was removed.
    cutoff = timezone.now() - timedelta(days=days)

    # Delete logs explicitly before their parent jobs so the cleanup does
    # not depend on the FK's on_delete behavior, and so log/job counts can
    # be reported separately.
    logs_deleted, _ = ScrapeJobLog.objects.filter(
        job__created_at__lt=cutoff
    ).delete()

    # Then delete old jobs
    jobs_deleted, _ = ScrapeJob.objects.filter(
        created_at__lt=cutoff
    ).delete()

    return {
        'jobs_deleted': jobs_deleted,
        'logs_deleted': logs_deleted,
    }