import logging
import traceback

from celery import shared_task
from django.utils import timezone

logger = logging.getLogger('scraper')


@shared_task(bind=True, max_retries=3)
def run_scraper_task(self, config_id: int, triggered_by: str = 'manual'):
    """
    Run a scraper job for the given configuration.
    """
    from scraper.models import ScraperConfig, ScrapeJob, ScrapeJobLog
    from notifications.tasks import send_scrape_notification

    # Get configuration
    try:
        config = ScraperConfig.objects.select_related('sport').get(id=config_id)
    except ScraperConfig.DoesNotExist:
        logger.error(f"ScraperConfig {config_id} not found")
        return {'error': 'Configuration not found'}

    # Create job record
    job = ScrapeJob.objects.create(
        config=config,
        status='running',
        triggered_by=triggered_by,
        started_at=timezone.now(),
        celery_task_id=self.request.id,
    )

    def log(level, message, source='', extra_data=None):
        # Persist the log line on the job and mirror it to the module logger.
        ScrapeJobLog.objects.create(
            job=job,
            level=level,
            message=message,
            source=source,
            extra_data=extra_data,
        )
        getattr(logger, level)(f"[{config.sport.code}] {message}")

    try:
        log('info', f'Starting scraper for {config.sport.short_name} {config.season}')

        # Import and run the appropriate scraper
        result = run_sport_scraper(config, log)

        # Update job with results
        job.status = 'completed'
        job.finished_at = timezone.now()
        job.games_found = result.get('games_found', 0)
        job.games_new = result.get('games_new', 0)
        job.games_updated = result.get('games_updated', 0)
        job.games_unchanged = result.get('games_unchanged', 0)
        job.games_errors = result.get('games_errors', 0)
        job.teams_found = result.get('teams_found', 0)
        job.stadiums_found = result.get('stadiums_found', 0)
        job.review_items_created = result.get('review_items', 0)
        job.save()

        # Update config
        config.last_run = timezone.now()
        config.last_run_status = 'completed'
        config.last_run_games = result.get('games_found', 0)
        config.save()

        log('info', f'Scraper completed: {job.games_found} games, '
                    f'{job.games_new} new, {job.review_items_created} reviews')

        # Send notification
        send_scrape_notification.delay(job.id)

        return {
            'job_id': job.id,
            'status': 'completed',
            'games_found': job.games_found,
            'games_new': job.games_new,
            'review_items': job.review_items_created,
        }

    except Exception as e:
        error_msg = str(e)
        error_tb = traceback.format_exc()

        job.status = 'failed'
        job.finished_at = timezone.now()
        job.error_message = error_msg
        job.error_traceback = error_tb
        job.save()

        config.last_run = timezone.now()
        config.last_run_status = 'failed'
        config.save()

        log('error', f'Scraper failed: {error_msg}', extra_data={'traceback': error_tb})

        # Send failure notification
        send_scrape_notification.delay(job.id)

        # Retry with linear backoff (60s, 120s, 180s) while attempts remain
        if self.request.retries < self.max_retries:
            raise self.retry(exc=e, countdown=60 * (self.request.retries + 1))

        return {
            'job_id': job.id,
            'status': 'failed',
            'error': error_msg,
        }
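
# For reference, a minimal sketch of the result dict contract between
# run_sport_scraper() (below) and run_scraper_task() (above). Every key is
# optional -- the task falls back to 0 via .get() -- and the values here are
# purely illustrative, not taken from any real run:
#
#     {
#         'games_found': 120,
#         'games_new': 4,
#         'games_updated': 7,
#         'games_unchanged': 109,
#         'games_errors': 0,
#         'teams_found': 30,
#         'stadiums_found': 28,
#         'review_items': 2,
#     }
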
""" from core.models import Game, Team, Stadium from scraper.models import ManualReviewItem sport_code = config.sport.code season = config.season log_func('info', f'Loading scraper for {sport_code}', source='engine') # Import the scraper engine from sportstime_parser # This adapts the existing scrapers to work with Django models from scraper.engine.adapter import ScraperAdapter adapter = ScraperAdapter( sport_code=sport_code, season=season, config=config, log_func=log_func, ) # Run the scraper result = adapter.run() return result @shared_task def run_all_enabled_scrapers(): """ Run all enabled scraper configurations. Called by celery-beat on schedule. """ from scraper.models import ScraperConfig configs = ScraperConfig.objects.filter(is_enabled=True) for config in configs: run_scraper_task.delay(config.id, triggered_by='scheduled') return {'configs_queued': configs.count()} @shared_task def cleanup_old_jobs(days: int = 30): """ Clean up old scrape job records. """ from scraper.models import ScrapeJob, ScrapeJobLog from django.utils import timezone from datetime import timedelta cutoff = timezone.now() - timedelta(days=days) # Delete old logs first (foreign key) logs_deleted, _ = ScrapeJobLog.objects.filter( job__created_at__lt=cutoff ).delete() # Then delete old jobs jobs_deleted, _ = ScrapeJob.objects.filter( created_at__lt=cutoff ).delete() return { 'jobs_deleted': jobs_deleted, 'logs_deleted': logs_deleted, }