feat: add Django web app, CloudKit sync, dashboard, and game_datetime_utc export

Adds the full Django application layer on top of sportstime_parser:
- core: Sport, Team, Stadium, Game models with aliases and league structure
- scraper: orchestration engine, adapter, job management, Celery tasks
- cloudkit: CloudKit sync client, sync state tracking, sync jobs
- dashboard: staff dashboard for monitoring scrapers, sync, review queue
- notifications: email reports for scrape/sync results
- Docker setup for deployment (Dockerfile, docker-compose, entrypoint)

Game exports now use game_datetime_utc (ISO 8601 UTC) instead of
venue-local date+time strings, matching the canonical format used
by the iOS app.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-02-19 14:04:27 -06:00
parent 4353d5943c
commit 63acf7accb
114 changed files with 13070 additions and 887 deletions

1
scraper/__init__.py Normal file
View File

@@ -0,0 +1 @@
# NOTE(review): `default_app_config` has been deprecated since Django 3.2 and is
# ignored entirely on Django >= 4.1 (app configs are auto-discovered). The
# project's migrations were generated on Django 5.1, so this line is presumably
# dead weight kept for older-Django compatibility — confirm and remove.
default_app_config = 'scraper.apps.ScraperConfig'

139
scraper/admin.py Normal file
View File

@@ -0,0 +1,139 @@
"""
Admin configuration for scraper models.
"""
from django.contrib import admin
from django.utils.html import format_html
from import_export.admin import ImportExportMixin, ImportExportModelAdmin
from simple_history.admin import SimpleHistoryAdmin
from .models import ScraperConfig, ScrapeJob, ManualReviewItem
from .resources import ScraperConfigResource, ScrapeJobResource, ManualReviewItemResource
@admin.register(ScraperConfig)
class ScraperConfigAdmin(ImportExportMixin, SimpleHistoryAdmin):
    """Admin for scraper configurations, with CSV import/export and history."""

    resource_class = ScraperConfigResource
    list_display = (
        '__str__',
        'sport',
        'season',
        'is_active',
        'last_scrape_at',
        'next_scrape_at',
        'scrape_interval_hours',
    )
    list_filter = ('sport', 'is_active', 'season')
    search_fields = ('sport__name', 'sport__short_name')
    ordering = ('-season', 'sport')
    # Timestamps are maintained automatically and must not be edited.
    readonly_fields = ('created_at', 'updated_at')
@admin.register(ScrapeJob)
class ScrapeJobAdmin(ImportExportModelAdmin):
    """Admin for scrape jobs with a colored status pill and readable duration."""

    resource_class = ScrapeJobResource
    list_display = (
        '__str__',
        'status_badge',
        'games_found',
        'games_created',
        'games_updated',
        'duration_display',
        'created_at',
    )
    list_filter = ('status', 'config__sport', ('created_at', admin.DateFieldListFilter))
    search_fields = ('config__sport__name', 'errors')
    ordering = ('-created_at',)
    readonly_fields = ('created_at', 'updated_at', 'duration_display')

    @admin.display(description='Status')
    def status_badge(self, obj):
        """Render the job status as a colored pill."""
        palette = {
            'pending': '#ffc107',
            'running': '#17a2b8',
            'completed': '#28a745',
            'failed': '#dc3545',
            'cancelled': '#6c757d',
        }
        return format_html(
            '<span style="background-color: {}; color: white; padding: 3px 8px; '
            'border-radius: 3px; font-size: 11px;">{}</span>',
            palette.get(obj.status, '#6c757d'),
            obj.get_status_display(),
        )

    @admin.display(description='Duration')
    def duration_display(self, obj):
        """Format the job duration as seconds, minutes, or hours."""
        secs = obj.duration
        if secs is None:
            return '-'
        if secs < 60:
            return f"{secs:.1f}s"
        if secs < 3600:
            return f"{secs/60:.1f}m"
        return f"{secs/3600:.1f}h"
@admin.register(ManualReviewItem)
class ManualReviewItemAdmin(ImportExportModelAdmin):
    """Admin for manual review items, with bulk approve/reject actions."""

    resource_class = ManualReviewItemResource
    list_display = (
        'raw_value',
        'item_type',
        'sport',
        'status_badge',
        'confidence_bar',
        'matched_value',
        'created_at',
    )
    list_filter = ('status', 'item_type', 'sport')
    search_fields = ('raw_value', 'matched_value')
    ordering = ('-confidence', '-created_at')
    readonly_fields = ('created_at', 'updated_at', 'resolved_at', 'resolved_by')
    actions = ('approve_items', 'reject_items')

    @admin.display(description='Status')
    def status_badge(self, obj):
        """Render the item status as a colored pill."""
        palette = {
            'pending': '#ffc107',
            'approved': '#28a745',
            'rejected': '#dc3545',
            'resolved': '#17a2b8',
        }
        return format_html(
            '<span style="background-color: {}; color: white; padding: 3px 8px; '
            'border-radius: 3px; font-size: 11px;">{}</span>',
            palette.get(obj.status, '#6c757d'),
            obj.get_status_display(),
        )

    @admin.display(description='Confidence')
    def confidence_bar(self, obj):
        """Render confidence as a colored progress bar.

        NOTE(review): treats ``confidence`` as a 0-100 percentage; other parts
        of the project store confidence on a 0-1 scale — confirm which scale
        this model uses.
        """
        if obj.confidence >= 85:
            bar_color = '#28a745'
        elif obj.confidence >= 70:
            bar_color = '#ffc107'
        else:
            bar_color = '#dc3545'
        return format_html(
            '<div style="width: 100px; background: #ddd; border-radius: 3px;">'
            '<div style="width: {}%; background: {}; height: 16px; border-radius: 3px; '
            'text-align: center; color: white; font-size: 11px; line-height: 16px;">'
            '{}%</div></div>',
            obj.confidence, bar_color, obj.confidence
        )

    def _bulk_resolve(self, request, queryset, new_status, verb):
        """Shared implementation for the approve/reject bulk actions."""
        from django.utils import timezone
        updated = queryset.update(
            status=new_status,
            resolved_at=timezone.now(),
            resolved_by=request.user
        )
        self.message_user(request, f'{updated} items {verb}.')

    @admin.action(description='Approve selected items')
    def approve_items(self, request, queryset):
        self._bulk_resolve(request, queryset, 'approved', 'approved')

    @admin.action(description='Reject selected items')
    def reject_items(self, request, queryset):
        self._bulk_resolve(request, queryset, 'rejected', 'rejected')

View File

@@ -0,0 +1,3 @@
from .config_admin import ScraperConfigAdmin
from .job_admin import ScrapeJobAdmin
from .review_admin import ManualReviewItemAdmin

View File

@@ -0,0 +1,110 @@
from django.contrib import admin
from django.utils.html import format_html
from django.urls import reverse
from simple_history.admin import SimpleHistoryAdmin
from scraper.models import ScraperConfig
@admin.register(ScraperConfig)
class ScraperConfigAdmin(SimpleHistoryAdmin):
    """Admin for per-sport scraper configuration.

    Provides run/enable/disable bulk actions and read-only last-run status.
    Uses the modern ``@admin.display`` decorator (consistent with the other
    admin modules) instead of legacy ``short_description`` assignments.
    """

    list_display = [
        'sport',
        'season_display',
        'is_enabled',
        'primary_source',
        'last_run_display',
        'last_run_status_badge',
        'last_run_games',
        'job_count',
    ]
    list_filter = ['sport', 'is_enabled', 'last_run_status']
    search_fields = ['sport__name', 'sport__short_name']
    ordering = ['-season', 'sport']
    # Last-run bookkeeping is written by the scraper engine, not by admins.
    readonly_fields = [
        'created_at',
        'updated_at',
        'last_run',
        'last_run_status',
        'last_run_games',
    ]
    fieldsets = [
        (None, {
            'fields': ['sport', 'season', 'is_enabled']
        }),
        ('Source Configuration', {
            'fields': ['sources', 'primary_source']
        }),
        ('Rate Limiting', {
            'fields': ['request_delay', 'max_retries']
        }),
        ('Matching', {
            'fields': ['fuzzy_threshold']
        }),
        ('Last Run', {
            'fields': ['last_run', 'last_run_status', 'last_run_games'],
            'classes': ['collapse']
        }),
        ('Notes', {
            'fields': ['notes'],
            'classes': ['collapse']
        }),
        ('Metadata', {
            'fields': ['created_at', 'updated_at'],
            'classes': ['collapse']
        }),
    ]
    actions = ['run_scraper', 'enable_scrapers', 'disable_scrapers']

    @admin.display(description='Season')
    def season_display(self, obj):
        """Human-readable season label from the sport's own formatting."""
        return obj.sport.get_season_display(obj.season)

    @admin.display(description='Last Run')
    def last_run_display(self, obj):
        """Last run timestamp formatted to the minute, or '-' if never run."""
        if obj.last_run:
            return obj.last_run.strftime('%Y-%m-%d %H:%M')
        return '-'

    @admin.display(description='Status')
    def last_run_status_badge(self, obj):
        """Colorized, upper-cased status of the last run ('-' if never run)."""
        if not obj.last_run_status:
            return '-'
        colors = {
            'completed': 'green',
            'failed': 'red',
            'running': 'orange',
        }
        color = colors.get(obj.last_run_status, 'gray')
        return format_html(
            '<span style="color: {}; font-weight: bold;">{}</span>',
            color,
            obj.last_run_status.upper()
        )

    @admin.display(description='Jobs')
    def job_count(self, obj):
        """Link to this config's jobs in the ScrapeJob changelist."""
        count = obj.jobs.count()
        if count > 0:
            url = reverse('admin:scraper_scrapejob_changelist') + f'?config__id__exact={obj.id}'
            return format_html('<a href="{}">{} jobs</a>', url, count)
        return '0 jobs'

    @admin.action(description='Run scraper for selected configurations')
    def run_scraper(self, request, queryset):
        """Queue one Celery scrape task per selected configuration."""
        from scraper.tasks import run_scraper_task
        for config in queryset:
            run_scraper_task.delay(config.id)
        self.message_user(request, f'Started {queryset.count()} scraper jobs.')

    @admin.action(description='Enable selected scrapers')
    def enable_scrapers(self, request, queryset):
        updated = queryset.update(is_enabled=True)
        self.message_user(request, f'{updated} scrapers enabled.')

    @admin.action(description='Disable selected scrapers')
    def disable_scrapers(self, request, queryset):
        updated = queryset.update(is_enabled=False)
        self.message_user(request, f'{updated} scrapers disabled.')

154
scraper/admin/job_admin.py Normal file
View File

@@ -0,0 +1,154 @@
from django.contrib import admin
from django.utils.html import format_html
from django.urls import reverse
from scraper.models import ScrapeJob, ScrapeJobLog
class ScrapeJobLogInline(admin.TabularInline):
    """Read-only inline listing of log entries attached to a scrape job."""

    model = ScrapeJobLog
    extra = 0
    fields = ('created_at', 'level', 'source', 'message')
    readonly_fields = ('created_at', 'level', 'source', 'message')
    ordering = ('created_at',)
    can_delete = False

    def has_add_permission(self, request, obj=None):
        # Log rows are produced by the scraper engine, never entered by hand.
        return False
@admin.register(ScrapeJob)
class ScrapeJobAdmin(admin.ModelAdmin):
    """Read-only admin for scrape jobs, with cancel/retry bulk actions.

    Jobs cannot be added or edited through the admin; they are created and
    updated by the scraper engine. Uses ``@admin.display`` decorators
    (consistent with the other admin modules) instead of legacy
    ``short_description`` assignments.
    """

    list_display = [
        'id',
        'config',
        'status_badge',
        'triggered_by',
        'started_at',
        'duration_display',
        'games_summary',
        'review_items_link',
    ]
    list_filter = ['status', 'config__sport', 'triggered_by', 'config__season']
    search_fields = ['config__sport__name', 'celery_task_id']
    date_hierarchy = 'created_at'
    ordering = ['-created_at']
    readonly_fields = [
        'id',
        'config',
        'status',
        'triggered_by',
        'started_at',
        'finished_at',
        'duration_display',
        'games_found',
        'games_new',
        'games_updated',
        'games_unchanged',
        'games_errors',
        'teams_found',
        'stadiums_found',
        'review_items_created',
        'error_message',
        'error_traceback',
        'celery_task_id',
        'created_at',
        'updated_at',
    ]
    inlines = [ScrapeJobLogInline]
    fieldsets = [
        (None, {
            'fields': ['id', 'config', 'status', 'triggered_by', 'celery_task_id']
        }),
        ('Timing', {
            'fields': ['started_at', 'finished_at', 'duration_display']
        }),
        ('Results - Games', {
            'fields': [
                'games_found',
                'games_new',
                'games_updated',
                'games_unchanged',
                'games_errors',
            ]
        }),
        ('Results - Other', {
            'fields': ['teams_found', 'stadiums_found', 'review_items_created']
        }),
        ('Errors', {
            'fields': ['error_message', 'error_traceback'],
            'classes': ['collapse']
        }),
        ('Metadata', {
            'fields': ['created_at', 'updated_at'],
            'classes': ['collapse']
        }),
    ]
    actions = ['cancel_jobs', 'retry_jobs']

    def has_add_permission(self, request):
        # Jobs are created by the scraper engine, never by hand.
        return False

    def has_change_permission(self, request, obj=None):
        # Jobs are immutable once recorded; admins may only view them.
        return False

    @admin.display(description='Status')
    def status_badge(self, obj):
        """Render the job status as a colored pill."""
        colors = {
            'pending': '#999',
            'running': '#f0ad4e',
            'completed': '#5cb85c',
            'failed': '#d9534f',
            'cancelled': '#777',
        }
        color = colors.get(obj.status, '#999')
        return format_html(
            '<span style="background-color: {}; color: white; padding: 3px 8px; '
            'border-radius: 3px; font-size: 11px;">{}</span>',
            color,
            obj.status.upper()
        )

    @admin.display(description='Games')
    def games_summary(self, obj):
        """One-line game counts; full breakdown appears in the title tooltip."""
        if obj.games_found == 0:
            return '-'
        return format_html(
            '<span title="New: {}, Updated: {}, Unchanged: {}, Errors: {}">'
            '{} found ({} new, {} upd)</span>',
            obj.games_new, obj.games_updated, obj.games_unchanged, obj.games_errors,
            obj.games_found, obj.games_new, obj.games_updated
        )

    @admin.display(description='Review')
    def review_items_link(self, obj):
        """Link to the manual review items created by this job, if any."""
        if obj.review_items_created > 0:
            url = reverse('admin:scraper_manualreviewitem_changelist') + f'?job__id__exact={obj.id}'
            return format_html(
                '<a href="{}">{} items</a>',
                url, obj.review_items_created
            )
        return '-'

    @admin.action(description='Cancel selected jobs')
    def cancel_jobs(self, request, queryset):
        """Revoke the Celery task (if any) and mark pending/running jobs cancelled."""
        from celery.result import AsyncResult
        cancelled = 0
        for job in queryset.filter(status__in=['pending', 'running']):
            if job.celery_task_id:
                AsyncResult(job.celery_task_id).revoke(terminate=True)
            job.status = 'cancelled'
            job.save()
            cancelled += 1
        self.message_user(request, f'{cancelled} jobs cancelled.')

    @admin.action(description='Retry failed jobs')
    def retry_jobs(self, request, queryset):
        """Queue a fresh scraper run for each failed job's configuration."""
        from scraper.tasks import run_scraper_task
        retried = 0
        for job in queryset.filter(status='failed'):
            run_scraper_task.delay(job.config.id)
            retried += 1
        self.message_user(request, f'{retried} jobs requeued.')

View File

@@ -0,0 +1,157 @@
from django.contrib import admin
from django.utils.html import format_html
from django.utils import timezone
from simple_history.admin import SimpleHistoryAdmin
from scraper.models import ManualReviewItem
@admin.register(ManualReviewItem)
class ManualReviewItemAdmin(SimpleHistoryAdmin):
    """Admin for manual review items produced by scrape jobs.

    Staff resolve unmatched team/stadium names here: accept the suggested
    match (optionally creating an alias) or ignore the item. Uses
    ``@admin.display`` decorators (consistent with the other admin modules)
    instead of legacy ``short_description`` assignments.
    """

    list_display = [
        'raw_value',
        'item_type',
        'sport',
        'status_badge',
        'suggested_match',
        'confidence_badge',
        'reason',
        'created_at',
    ]
    list_filter = ['status', 'item_type', 'sport', 'reason']
    search_fields = ['raw_value', 'suggested_id', 'resolved_to']
    ordering = ['-created_at']
    # Everything the scraper recorded about the item is immutable; only the
    # resolution fields remain editable.
    readonly_fields = [
        'job',
        'item_type',
        'sport',
        'raw_value',
        'suggested_id',
        'confidence',
        'reason',
        'source_url',
        'check_date',
        'context',
        'resolved_by',
        'resolved_at',
        'created_at',
        'updated_at',
    ]
    autocomplete_fields = []
    fieldsets = [
        (None, {
            'fields': ['job', 'item_type', 'sport', 'raw_value']
        }),
        ('Suggested Match', {
            'fields': ['suggested_id', 'confidence', 'reason']
        }),
        ('Context', {
            'fields': ['source_url', 'check_date', 'context'],
            'classes': ['collapse']
        }),
        ('Resolution', {
            'fields': [
                'status',
                'resolved_to',
                'create_alias',
                'resolution_notes',
                'resolved_by',
                'resolved_at',
            ]
        }),
        ('Metadata', {
            'fields': ['created_at', 'updated_at'],
            'classes': ['collapse']
        }),
    ]
    actions = [
        'accept_suggested',
        'mark_ignored',
        'accept_and_create_alias',
    ]

    @admin.display(description='Status')
    def status_badge(self, obj):
        """Render the item status as a colored pill."""
        colors = {
            'pending': '#f0ad4e',
            'resolved': '#5cb85c',
            'ignored': '#999',
            'new_entity': '#5bc0de',
        }
        color = colors.get(obj.status, '#999')
        return format_html(
            '<span style="background-color: {}; color: white; padding: 3px 8px; '
            'border-radius: 3px; font-size: 11px;">{}</span>',
            color,
            obj.get_status_display().upper()
        )

    @admin.display(description='Suggested')
    def suggested_match(self, obj):
        """Show the suggested canonical ID in a code tag, or '-' if none."""
        if obj.suggested_id:
            return format_html(
                '<code style="background: #f5f5f5; padding: 2px 5px;">{}</code>',
                obj.suggested_id
            )
        return '-'

    @admin.display(description='Conf.')
    def confidence_badge(self, obj):
        """Colorized confidence percentage (confidence is stored on a 0-1 scale)."""
        if obj.confidence == 0:
            return '-'
        pct = obj.confidence * 100
        if pct >= 85:
            color = '#5cb85c'
        elif pct >= 70:
            color = '#f0ad4e'
        else:
            color = '#d9534f'
        return format_html(
            '<span style="color: {}; font-weight: bold;">{:.0f}%</span>',
            color, pct
        )

    @admin.action(description='Accept suggested match')
    def accept_suggested(self, request, queryset):
        """Resolve each pending item with a suggestion to its suggested ID."""
        resolved = 0
        for item in queryset.filter(status='pending', suggested_id__isnull=False):
            item.resolve(
                canonical_id=item.suggested_id,
                user=request.user,
                notes='Accepted suggested match via admin action'
            )
            resolved += 1
        self.message_user(request, f'{resolved} items resolved.')

    @admin.action(description='Accept suggested and create alias')
    def accept_and_create_alias(self, request, queryset):
        """Like accept_suggested, but also record the raw value as an alias."""
        resolved = 0
        for item in queryset.filter(status='pending', suggested_id__isnull=False):
            item.resolve(
                canonical_id=item.suggested_id,
                user=request.user,
                notes='Accepted and created alias via admin action',
                create_alias=True
            )
            resolved += 1
        self.message_user(request, f'{resolved} items resolved with aliases created.')

    @admin.action(description='Mark as ignored')
    def mark_ignored(self, request, queryset):
        """Mark each pending item as ignored without resolving it."""
        ignored = 0
        for item in queryset.filter(status='pending'):
            item.ignore(
                user=request.user,
                notes='Ignored via admin action'
            )
            ignored += 1
        self.message_user(request, f'{ignored} items ignored.')

    def save_model(self, request, obj, form, change):
        # Auto-set resolved_by and resolved_at when status changes to resolved
        if change and obj.status in ['resolved', 'ignored'] and not obj.resolved_by:
            obj.resolved_by = request.user
            obj.resolved_at = timezone.now()
        super().save_model(request, obj, form, change)

7
scraper/apps.py Normal file
View File

@@ -0,0 +1,7 @@
from django.apps import AppConfig
class ScraperConfig(AppConfig):
    """App configuration for the ``scraper`` app (jobs, configs, review queue)."""

    # 64-bit implicit primary keys for all models in this app.
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'scraper'
    verbose_name = 'Scraper Management'

View File

@@ -0,0 +1 @@
# Scraper engine package

496
scraper/engine/adapter.py Normal file
View File

@@ -0,0 +1,496 @@
"""
Adapter to bridge existing sportstime_parser scrapers with Django models.
"""
import hashlib
from datetime import datetime
from typing import Callable, Optional
from django.db import transaction
from django.utils import timezone
class ScraperAdapter:
    """
    Adapts the existing sportstime_parser scrapers to work with Django models.

    One adapter instance runs a single scrape for a (sport, season) pair:
    it invokes the legacy scraper, upserts stadiums, teams, and games into
    the Django ORM, marks new/changed records for CloudKit sync, and queues
    unresolved names as manual review items.
    """

    def __init__(
        self,
        sport_code: str,
        season: int,
        config,
        log_func: Optional[Callable] = None,
    ):
        # config is a scraper.ScraperConfig instance; it is only used to locate
        # the currently running ScrapeJob when creating review items.
        self.sport_code = sport_code
        self.season = season
        self.config = config
        # log_func signature: (level, msg, **kw). Defaults to a no-op logger.
        self.log = log_func or (lambda level, msg, **kw: None)

    def run(self) -> dict:
        """
        Run the scraper and return results.

        Returns:
            Dict of counters: games_found/new/updated/unchanged/errors,
            teams_found, stadiums_found, review_items.

        Raises:
            ValueError: if ``self.sport_code`` is not present in the database.
        """
        from core.models import Sport, Team, Stadium, Game
        from scraper.models import ManualReviewItem
        from cloudkit.models import CloudKitSyncState
        result = {
            'games_found': 0,
            'games_new': 0,
            'games_updated': 0,
            'games_unchanged': 0,
            'games_errors': 0,
            'teams_found': 0,
            'stadiums_found': 0,
            'review_items': 0,
        }
        # Get sport
        try:
            sport = Sport.objects.get(code=self.sport_code)
        except Sport.DoesNotExist:
            raise ValueError(f"Sport {self.sport_code} not found in database")
        self.log('info', f'Starting scraper for {sport.short_name} {self.season}', source='adapter')
        # Import and create the appropriate scraper
        scraper = self._create_scraper()
        # Run the scrape
        self.log('info', 'Scraping games...', source='adapter')
        raw_result = scraper.scrape_all()
        # Process stadiums first (teams reference stadiums via home_stadium FK)
        self.log('info', f'Processing {len(raw_result.stadiums)} stadiums...', source='adapter')
        result['stadiums_found'] = len(raw_result.stadiums)
        self._process_stadiums(sport, raw_result.stadiums)
        # Process teams
        self.log('info', f'Processing {len(raw_result.teams)} teams...', source='adapter')
        result['teams_found'] = len(raw_result.teams)
        self._process_teams(sport, raw_result.teams)
        # Process games
        self.log('info', f'Processing {len(raw_result.games)} games...', source='adapter')
        game_result = self._process_games(sport, raw_result.games)
        result.update(game_result)
        # Process review items
        if raw_result.review_items:
            self.log('info', f'Creating {len(raw_result.review_items)} review items...', source='adapter')
            result['review_items'] = self._process_review_items(sport, raw_result.review_items)
        self.log('info', f'Scrape complete: {result}', source='adapter')
        return result

    def _create_scraper(self):
        """Create the appropriate scraper instance for ``self.sport_code``.

        Raises:
            ValueError: if no scraper factory exists for the sport code.
        """
        # Import from existing sportstime_parser
        from sportstime_parser.scrapers import (
            create_nba_scraper,
            create_mlb_scraper,
            create_nfl_scraper,
            create_nhl_scraper,
            create_mls_scraper,
            create_wnba_scraper,
            create_nwsl_scraper,
        )
        scrapers = {
            'nba': create_nba_scraper,
            'mlb': create_mlb_scraper,
            'nfl': create_nfl_scraper,
            'nhl': create_nhl_scraper,
            'mls': create_mls_scraper,
            'wnba': create_wnba_scraper,
            'nwsl': create_nwsl_scraper,
        }
        creator = scrapers.get(self.sport_code)
        if not creator:
            raise ValueError(f"No scraper for sport: {self.sport_code}")
        # Create scraper (config overrides handled via session/resolver settings if needed)
        return creator(season=self.season)

    def _process_teams(self, sport, teams):
        """Process and upsert teams.

        Division is matched by exact name first, then substring fallback.
        Home stadium is looked up by ID only (stadiums are upserted before
        teams in ``run``). Newly created teams are queued for CloudKit sync;
        updates to existing teams are NOT re-queued here.
        """
        from core.models import Team, Stadium, Division, Conference
        from cloudkit.models import CloudKitSyncState
        for team_data in teams:
            team_id = team_data.id
            # Find division if available
            division = None
            if team_data.division:
                division = Division.objects.filter(
                    conference__sport=sport,
                    name__iexact=team_data.division
                ).first()
                # Fallback to partial match
                if not division:
                    division = Division.objects.filter(
                        conference__sport=sport,
                        name__icontains=team_data.division
                    ).first()
            # Resolve home stadium if available
            home_stadium = None
            stadium_id = getattr(team_data, 'stadium_id', None)
            if stadium_id:
                home_stadium = Stadium.objects.filter(id=stadium_id).first()
            team, created = Team.objects.update_or_create(
                id=team_id,
                defaults={
                    'sport': sport,
                    'division': division,
                    'city': team_data.city,
                    'name': team_data.name,
                    'full_name': team_data.full_name,
                    'abbreviation': team_data.abbreviation,
                    'home_stadium': home_stadium,
                    # Optional attributes may be absent or None on the parser
                    # dataclass; coerce to '' for non-null CharFields.
                    'primary_color': getattr(team_data, 'primary_color', '') or '',
                    'secondary_color': getattr(team_data, 'secondary_color', '') or '',
                    'logo_url': getattr(team_data, 'logo_url', '') or '',
                }
            )
            # Mark for sync
            if created:
                CloudKitSyncState.objects.get_or_create(
                    record_type='Team',
                    record_id=team_id,
                    defaults={'sync_status': 'pending'}
                )

    def _process_stadiums(self, sport, stadiums):
        """Process and upsert stadiums.

        Newly created stadiums are queued for CloudKit sync; updates to
        existing stadiums are NOT re-queued here.
        """
        from core.models import Stadium
        from cloudkit.models import CloudKitSyncState
        for stadium_data in stadiums:
            stadium_id = stadium_data.id
            stadium, created = Stadium.objects.update_or_create(
                id=stadium_id,
                defaults={
                    'sport': sport,
                    'name': stadium_data.name,
                    'city': stadium_data.city,
                    # Optional attributes may be absent or None on the parser
                    # dataclass; coerce string fields to '' where non-null.
                    'state': getattr(stadium_data, 'state', '') or '',
                    'country': getattr(stadium_data, 'country', 'USA'),
                    'latitude': getattr(stadium_data, 'latitude', None),
                    'longitude': getattr(stadium_data, 'longitude', None),
                    'capacity': getattr(stadium_data, 'capacity', None),
                    'surface': getattr(stadium_data, 'surface', '') or '',
                    'roof_type': getattr(stadium_data, 'roof_type', '') or '',
                    'opened_year': getattr(stadium_data, 'opened_year', None),
                    'timezone': getattr(stadium_data, 'timezone', '') or '',
                    'image_url': getattr(stadium_data, 'image_url', '') or '',
                }
            )
            if created:
                CloudKitSyncState.objects.get_or_create(
                    record_type='Stadium',
                    record_id=stadium_id,
                    defaults={'sync_status': 'pending'}
                )

    def _resolve_team_via_db_alias(self, sport, raw_name, check_date=None):
        """Try to resolve a team name using database aliases.

        Lookup order: date-valid TeamAlias rows, then exact (case-insensitive)
        match on full_name, then on city.

        Args:
            sport: Sport model instance
            raw_name: Raw team name from scraper
            check_date: Date for alias validity check (defaults to today)

        Returns:
            Team instance if found, None otherwise
        """
        from core.models import Team, TeamAlias
        from datetime import date
        if not raw_name:
            return None
        check_date = check_date or date.today()
        # Check TeamAlias model
        aliases = TeamAlias.objects.filter(
            alias__iexact=raw_name.strip(),
            team__sport=sport,
        ).select_related('team')
        for alias in aliases:
            if alias.is_valid_for_date(check_date):
                return alias.team
        # Also try partial matching on team full_name and city
        team = Team.objects.filter(
            sport=sport,
            full_name__iexact=raw_name.strip()
        ).first()
        if team:
            return team
        team = Team.objects.filter(
            sport=sport,
            city__iexact=raw_name.strip()
        ).first()
        if team:
            return team
        return None

    def _resolve_stadium_via_db_alias(self, sport, raw_name, check_date=None):
        """Try to resolve a stadium name using database aliases.

        Lookup order: date-valid StadiumAlias rows, then exact
        (case-insensitive) match on the stadium name.

        Args:
            sport: Sport model instance
            raw_name: Raw stadium name from scraper
            check_date: Date for alias validity check (defaults to today)

        Returns:
            Stadium instance if found, None otherwise
        """
        from core.models import Stadium, StadiumAlias
        from datetime import date
        if not raw_name:
            return None
        check_date = check_date or date.today()
        # Check StadiumAlias model
        aliases = StadiumAlias.objects.filter(
            alias__iexact=raw_name.strip(),
            stadium__sport=sport,
        ).select_related('stadium')
        for alias in aliases:
            if alias.is_valid_for_date(check_date):
                return alias.stadium
        # Also try direct matching on stadium name
        stadium = Stadium.objects.filter(
            sport=sport,
            name__iexact=raw_name.strip()
        ).first()
        if stadium:
            return stadium
        return None

    def _process_games(self, sport, games):
        """Process and upsert games.

        Teams/stadium are resolved by ID first, then by DB alias on the raw
        scraped name. Games missing either team are counted as errors and
        skipped. Existing games are only saved (and re-queued for CloudKit
        sync) when at least one field actually changed.

        Returns:
            Dict of counters: games_found/new/updated/unchanged/errors.
        """
        from core.models import Game, Team, Stadium
        from cloudkit.models import CloudKitSyncState
        result = {
            'games_found': len(games),
            'games_new': 0,
            'games_updated': 0,
            'games_unchanged': 0,
            'games_errors': 0,
        }
        for game_data in games:
            try:
                game_id = game_data.id
                # game_date may be a datetime or a date; aliases are checked
                # against the date portion.
                check_date = game_data.game_date.date() if hasattr(game_data.game_date, 'date') else game_data.game_date
                # Get related objects - try by ID first, then by DB alias
                home_team = None
                away_team = None
                try:
                    home_team = Team.objects.get(id=game_data.home_team_id)
                except Team.DoesNotExist:
                    # Try resolving via database alias using raw name
                    raw_home = getattr(game_data, 'raw_home_team', None)
                    if raw_home:
                        home_team = self._resolve_team_via_db_alias(sport, raw_home, check_date)
                        if home_team:
                            self.log('info', f'Resolved home team via DB alias: {raw_home} -> {home_team.abbreviation}', source='adapter')
                try:
                    away_team = Team.objects.get(id=game_data.away_team_id)
                except Team.DoesNotExist:
                    # Try resolving via database alias using raw name
                    raw_away = getattr(game_data, 'raw_away_team', None)
                    if raw_away:
                        away_team = self._resolve_team_via_db_alias(sport, raw_away, check_date)
                        if away_team:
                            self.log('info', f'Resolved away team via DB alias: {raw_away} -> {away_team.abbreviation}', source='adapter')
                if not home_team or not away_team:
                    missing = []
                    if not home_team:
                        missing.append(f'home={game_data.home_team_id}')
                    if not away_team:
                        missing.append(f'away={game_data.away_team_id}')
                    self.log('warning', f'Team not found for game {game_id}: {", ".join(missing)}', source='adapter')
                    result['games_errors'] += 1
                    continue
                # Stadium is optional; games without one are still stored.
                stadium = None
                if game_data.stadium_id:
                    try:
                        stadium = Stadium.objects.get(id=game_data.stadium_id)
                    except Stadium.DoesNotExist:
                        # Try resolving via database alias using raw name
                        raw_stadium = getattr(game_data, 'raw_stadium', None)
                        if raw_stadium:
                            stadium = self._resolve_stadium_via_db_alias(sport, raw_stadium, check_date)
                            if stadium:
                                self.log('info', f'Resolved stadium via DB alias: {raw_stadium} -> {stadium.name}', source='adapter')
                # Build game dict
                game_defaults = {
                    'sport': sport,
                    'season': game_data.season,
                    'home_team': home_team,
                    'away_team': away_team,
                    'stadium': stadium,
                    'game_date': game_data.game_date,
                    'game_number': getattr(game_data, 'game_number', None),
                    'home_score': game_data.home_score,
                    'away_score': game_data.away_score,
                    'status': game_data.status,
                    'raw_home_team': getattr(game_data, 'raw_home_team', '') or '',
                    'raw_away_team': getattr(game_data, 'raw_away_team', '') or '',
                    'raw_stadium': getattr(game_data, 'raw_stadium', '') or '',
                    'source_url': getattr(game_data, 'source_url', '') or '',
                }
                # Check if game exists
                try:
                    existing = Game.objects.get(id=game_id)
                    # Check if changed
                    changed = False
                    for key, value in game_defaults.items():
                        # FK fields are compared via their `<field>_id` column
                        # against the new value's `.id`, avoiding extra queries.
                        # NOTE(review): relies on every FK value exposing `.id`
                        # and non-FK values not having an `.id` attribute.
                        if getattr(existing, key if not hasattr(existing, f'{key}_id') else f'{key}_id') != (value.id if hasattr(value, 'id') else value):
                            changed = True
                            break
                    if changed:
                        for key, value in game_defaults.items():
                            setattr(existing, key, value)
                        existing.save()
                        result['games_updated'] += 1
                        # Mark for sync
                        CloudKitSyncState.objects.update_or_create(
                            record_type='Game',
                            record_id=game_id,
                            defaults={'sync_status': 'pending'}
                        )
                    else:
                        result['games_unchanged'] += 1
                except Game.DoesNotExist:
                    # Create new game
                    Game.objects.create(id=game_id, **game_defaults)
                    result['games_new'] += 1
                    # Mark for sync
                    CloudKitSyncState.objects.get_or_create(
                        record_type='Game',
                        record_id=game_id,
                        defaults={'sync_status': 'pending'}
                    )
            except Exception as e:
                # Best-effort: one bad game must not abort the whole batch.
                self.log('error', f'Error processing game: {e}', source='adapter')
                result['games_errors'] += 1
        return result

    def _process_review_items(self, sport, review_items):
        """Create manual review items.

        Items are attached to the most recent running ScrapeJob for this
        config (None if none is running). Parser confidence is on a 0-100
        scale and is stored on the model as 0-1.

        Returns:
            Number of ManualReviewItem rows created.
        """
        from scraper.models import ManualReviewItem, ScrapeJob
        from sportstime_parser.models.aliases import ReviewReason
        # Get current job
        job = ScrapeJob.objects.filter(
            config=self.config,
            status='running'
        ).order_by('-created_at').first()
        count = 0
        for item in review_items:
            # Derive item_type from reason
            item_type = self._get_item_type_from_reason(item.reason)
            # Get suggested match info (parser uses suggested_matches list)
            suggested_id = ''
            confidence = 0.0
            if item.suggested_matches:
                best_match = item.suggested_matches[0]
                suggested_id = best_match.canonical_id
                confidence = best_match.confidence / 100.0  # Convert to 0-1 range
            ManualReviewItem.objects.create(
                job=job,
                item_type=item_type,
                sport=sport,
                raw_value=item.raw_value,
                suggested_id=suggested_id,
                confidence=confidence,
                reason=self._map_reason(item.reason),
                source_url=item.source_url or '',
                check_date=item.game_date,
                context=item.context if item.context else None,
            )
            count += 1
        return count

    def _get_item_type_from_reason(self, reason) -> str:
        """Derive item type ('team' or 'stadium') from a ReviewReason enum or string."""
        from sportstime_parser.models.aliases import ReviewReason
        # Map reason to item type
        if isinstance(reason, ReviewReason):
            reason_value = reason.value
        else:
            reason_value = str(reason).lower()
        if 'team' in reason_value:
            return 'team'
        elif 'stadium' in reason_value:
            return 'stadium'
        else:
            # Default to team for other reasons
            return 'team'

    def _map_reason(self, reason) -> str:
        """Map scraper ReviewReason to model choice.

        Unknown reasons fall back to 'no_match'.
        """
        from sportstime_parser.models.aliases import ReviewReason
        # Handle ReviewReason enum
        if isinstance(reason, ReviewReason):
            reason_value = reason.value
        else:
            reason_value = str(reason).lower()
        reason_map = {
            'unresolved_team': 'no_match',
            'unresolved_stadium': 'no_match',
            'low_confidence_match': 'low_confidence',
            'missing_data': 'no_match',
            'duplicate_game': 'ambiguous',
            'timezone_unknown': 'no_match',
            'geographic_filter': 'no_match',
            # Legacy mappings
            'no_match': 'no_match',
            'no match found': 'no_match',
            'low_confidence': 'low_confidence',
            'fuzzy match below threshold': 'low_confidence',
            'ambiguous': 'ambiguous',
            'new_entity': 'new_entity',
        }
        return reason_map.get(reason_value.lower(), 'no_match')

View File

@@ -0,0 +1,144 @@
"""Database-aware alias loaders for team and stadium resolution.
These loaders check the Django TeamAlias and StadiumAlias models
in addition to the hardcoded mappings, allowing aliases to be
managed via the admin interface.
"""
from datetime import date
from typing import Optional
class DatabaseTeamAliasLoader:
    """Load team aliases from the Django database.

    Checks the core.TeamAlias model for alias mappings,
    supporting date-aware lookups for historical names.
    """

    def resolve(
        self,
        value: str,
        sport_code: str,
        check_date: Optional[date] = None,
    ) -> Optional[str]:
        """Resolve an alias value to a canonical team ID.

        Args:
            value: Alias value to look up (case-insensitive)
            sport_code: Sport code to filter by
            check_date: Date to check validity (None = current date)

        Returns:
            Canonical team ID if found, None otherwise
        """
        # Fix: dropped the unused `from django.db.models import Q` import.
        from core.models import TeamAlias
        if check_date is None:
            check_date = date.today()
        value_lower = value.lower().strip()
        # Query aliases matching the value and sport
        aliases = TeamAlias.objects.filter(
            alias__iexact=value_lower,
            team__sport__code=sport_code,
        ).select_related('team')
        # Return the first alias that is valid on the given date.
        for alias in aliases:
            if alias.is_valid_for_date(check_date):
                return alias.team.id
        return None

    def get_aliases_for_team(
        self,
        team_id: str,
        check_date: Optional[date] = None,
    ) -> list:
        """Get all aliases for a team.

        Args:
            team_id: Team ID
            check_date: Date to filter by (None = all aliases)

        Returns:
            List of TeamAlias objects
        """
        from core.models import TeamAlias
        aliases = TeamAlias.objects.filter(team_id=team_id)
        if check_date:
            result = []
            for alias in aliases:
                if alias.is_valid_for_date(check_date):
                    result.append(alias)
            return result
        return list(aliases)
class DatabaseStadiumAliasLoader:
    """Load stadium aliases from the Django database.

    Checks the core.StadiumAlias model for alias mappings,
    supporting date-aware lookups for naming rights changes.
    """

    def resolve(
        self,
        name: str,
        sport_code: str,
        check_date: Optional[date] = None,
    ) -> Optional[str]:
        """Resolve a stadium name to a canonical stadium ID.

        Args:
            name: Stadium name to look up (case-insensitive)
            sport_code: Sport code to filter by
            check_date: Date to check validity (None = current date)

        Returns:
            Canonical stadium ID if found, None otherwise
        """
        from core.models import StadiumAlias
        effective_date = check_date if check_date is not None else date.today()
        # Candidate aliases for this sport, matched case-insensitively.
        candidates = StadiumAlias.objects.filter(
            alias__iexact=name.lower().strip(),
            stadium__sport__code=sport_code,
        ).select_related('stadium')
        for candidate in candidates:
            if candidate.is_valid_for_date(effective_date):
                return candidate.stadium.id
        return None
# Global instances
_db_team_loader: Optional[DatabaseTeamAliasLoader] = None
_db_stadium_loader: Optional[DatabaseStadiumAliasLoader] = None
def get_db_team_alias_loader() -> DatabaseTeamAliasLoader:
"""Get the database team alias loader."""
global _db_team_loader
if _db_team_loader is None:
_db_team_loader = DatabaseTeamAliasLoader()
return _db_team_loader
def get_db_stadium_alias_loader() -> DatabaseStadiumAliasLoader:
    """Return the shared database stadium alias loader, creating it on first use."""
    global _db_stadium_loader
    loader = _db_stadium_loader
    if loader is None:
        loader = DatabaseStadiumAliasLoader()
        _db_stadium_loader = loader
    return loader

View File

@@ -0,0 +1,201 @@
# Generated by Django 5.1.15 on 2026-01-26 08:59
import django.db.models.deletion
import simple_history.models
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial schema for the scraper app (auto-generated by makemigrations).

    Creates ScrapeJob, ScraperConfig, ScrapeJobLog and ManualReviewItem plus
    the django-simple-history shadow tables (Historical*) that audit edits to
    configs and review items. Do not hand-edit field definitions here; they
    must stay in sync with the model state Django recorded.
    """
    initial = True
    dependencies = [
        ('core', '0001_initial'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]
    operations = [
        # One row per scraper run. The FK to ScraperConfig is attached via
        # AddField at the end because ScraperConfig is created later in this
        # operations list.
        migrations.CreateModel(
            name='ScrapeJob',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('status', models.CharField(choices=[('pending', 'Pending'), ('running', 'Running'), ('completed', 'Completed'), ('failed', 'Failed'), ('cancelled', 'Cancelled')], default='pending', max_length=20)),
                ('triggered_by', models.CharField(default='manual', help_text='How the job was triggered (manual, scheduled, api)', max_length=50)),
                ('started_at', models.DateTimeField(blank=True, null=True)),
                ('finished_at', models.DateTimeField(blank=True, null=True)),
                ('games_found', models.PositiveIntegerField(default=0)),
                ('games_new', models.PositiveIntegerField(default=0)),
                ('games_updated', models.PositiveIntegerField(default=0)),
                ('games_unchanged', models.PositiveIntegerField(default=0)),
                ('games_errors', models.PositiveIntegerField(default=0)),
                ('teams_found', models.PositiveIntegerField(default=0)),
                ('stadiums_found', models.PositiveIntegerField(default=0)),
                ('review_items_created', models.PositiveIntegerField(default=0)),
                ('error_message', models.TextField(blank=True)),
                ('error_traceback', models.TextField(blank=True)),
                ('celery_task_id', models.CharField(blank=True, help_text='Celery task ID for this job', max_length=255)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
            ],
            options={
                'verbose_name': 'Scrape Job',
                'verbose_name_plural': 'Scrape Jobs',
                'ordering': ['-created_at'],
            },
        ),
        # simple_history shadow table for ScraperConfig: FKs use
        # DO_NOTHING/db_constraint=False so history rows outlive their targets.
        migrations.CreateModel(
            name='HistoricalScraperConfig',
            fields=[
                ('id', models.BigIntegerField(auto_created=True, blank=True, db_index=True, verbose_name='ID')),
                ('season', models.PositiveSmallIntegerField(help_text='Season to scrape (e.g., 2025 for 2025-26 season)')),
                ('is_enabled', models.BooleanField(default=True, help_text='Whether this scraper is enabled for scheduling')),
                ('sources', models.JSONField(default=list, help_text='Ordered list of sources to try (e.g., ["basketball_reference", "espn"])')),
                ('primary_source', models.CharField(blank=True, help_text='Primary source for this scraper', max_length=100)),
                ('request_delay', models.FloatField(default=3.0, help_text='Seconds between requests')),
                ('max_retries', models.PositiveSmallIntegerField(default=3, help_text='Maximum retry attempts')),
                ('fuzzy_threshold', models.PositiveSmallIntegerField(default=85, help_text='Minimum fuzzy match confidence (0-100)')),
                ('last_run', models.DateTimeField(blank=True, help_text='Last successful run timestamp', null=True)),
                ('last_run_status', models.CharField(blank=True, help_text='Status of last run', max_length=20)),
                ('last_run_games', models.PositiveIntegerField(default=0, help_text='Games found in last run')),
                ('notes', models.TextField(blank=True, help_text='Configuration notes')),
                ('created_at', models.DateTimeField(blank=True, editable=False)),
                ('updated_at', models.DateTimeField(blank=True, editable=False)),
                ('history_id', models.AutoField(primary_key=True, serialize=False)),
                ('history_date', models.DateTimeField(db_index=True)),
                ('history_change_reason', models.CharField(max_length=100, null=True)),
                ('history_type', models.CharField(choices=[('+', 'Created'), ('~', 'Changed'), ('-', 'Deleted')], max_length=1)),
                ('history_user', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='+', to=settings.AUTH_USER_MODEL)),
                ('sport', models.ForeignKey(blank=True, db_constraint=False, null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to='core.sport')),
            ],
            options={
                'verbose_name': 'historical Scraper Configuration',
                'verbose_name_plural': 'historical Scraper Configurations',
                'ordering': ('-history_date', '-history_id'),
                'get_latest_by': ('history_date', 'history_id'),
            },
            bases=(simple_history.models.HistoricalChanges, models.Model),
        ),
        # simple_history shadow table for ManualReviewItem.
        migrations.CreateModel(
            name='HistoricalManualReviewItem',
            fields=[
                ('id', models.BigIntegerField(auto_created=True, blank=True, db_index=True, verbose_name='ID')),
                ('item_type', models.CharField(choices=[('team', 'Team'), ('stadium', 'Stadium')], max_length=20)),
                ('raw_value', models.CharField(help_text='Original scraped value', max_length=300)),
                ('suggested_id', models.CharField(blank=True, help_text='Suggested canonical ID (if any match found)', max_length=100)),
                ('confidence', models.FloatField(default=0.0, help_text='Match confidence (0.0 - 1.0)')),
                ('reason', models.CharField(choices=[('no_match', 'No Match Found'), ('low_confidence', 'Low Confidence Match'), ('ambiguous', 'Ambiguous Match'), ('new_entity', 'Potentially New Entity')], help_text='Why manual review is needed', max_length=20)),
                ('source_url', models.URLField(blank=True, help_text='URL where this value was found')),
                ('check_date', models.DateField(blank=True, help_text='Date context for alias resolution', null=True)),
                ('context', models.JSONField(blank=True, help_text='Additional context (e.g., game info)', null=True)),
                ('status', models.CharField(choices=[('pending', 'Pending Review'), ('resolved', 'Resolved'), ('ignored', 'Ignored'), ('new_entity', 'Created New Entity')], default='pending', max_length=20)),
                ('resolved_to', models.CharField(blank=True, help_text='Final resolved canonical ID', max_length=100)),
                ('resolved_at', models.DateTimeField(blank=True, null=True)),
                ('resolution_notes', models.TextField(blank=True, help_text='Notes about the resolution')),
                ('create_alias', models.BooleanField(default=False, help_text='Whether to create an alias from this resolution')),
                ('created_at', models.DateTimeField(blank=True, editable=False)),
                ('updated_at', models.DateTimeField(blank=True, editable=False)),
                ('history_id', models.AutoField(primary_key=True, serialize=False)),
                ('history_date', models.DateTimeField(db_index=True)),
                ('history_change_reason', models.CharField(max_length=100, null=True)),
                ('history_type', models.CharField(choices=[('+', 'Created'), ('~', 'Changed'), ('-', 'Deleted')], max_length=1)),
                ('history_user', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='+', to=settings.AUTH_USER_MODEL)),
                ('resolved_by', models.ForeignKey(blank=True, db_constraint=False, null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to=settings.AUTH_USER_MODEL)),
                ('sport', models.ForeignKey(blank=True, db_constraint=False, null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to='core.sport')),
                ('job', models.ForeignKey(blank=True, db_constraint=False, help_text='Job that created this review item', null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to='scraper.scrapejob')),
            ],
            options={
                'verbose_name': 'historical Manual Review Item',
                'verbose_name_plural': 'historical Manual Review Items',
                'ordering': ('-history_date', '-history_id'),
                'get_latest_by': ('history_date', 'history_id'),
            },
            bases=(simple_history.models.HistoricalChanges, models.Model),
        ),
        # Per-job log lines, cascade-deleted with their job.
        migrations.CreateModel(
            name='ScrapeJobLog',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('level', models.CharField(choices=[('debug', 'Debug'), ('info', 'Info'), ('warning', 'Warning'), ('error', 'Error')], default='info', max_length=10)),
                ('message', models.TextField()),
                ('source', models.CharField(blank=True, help_text='Source/component that generated this log', max_length=100)),
                ('extra_data', models.JSONField(blank=True, help_text='Additional structured data', null=True)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('job', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='logs', to='scraper.scrapejob')),
            ],
            options={
                'verbose_name': 'Scrape Job Log',
                'verbose_name_plural': 'Scrape Job Logs',
                'ordering': ['created_at'],
            },
        ),
        # Live (non-historical) per-sport/per-season scraper configuration.
        migrations.CreateModel(
            name='ScraperConfig',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('season', models.PositiveSmallIntegerField(help_text='Season to scrape (e.g., 2025 for 2025-26 season)')),
                ('is_enabled', models.BooleanField(default=True, help_text='Whether this scraper is enabled for scheduling')),
                ('sources', models.JSONField(default=list, help_text='Ordered list of sources to try (e.g., ["basketball_reference", "espn"])')),
                ('primary_source', models.CharField(blank=True, help_text='Primary source for this scraper', max_length=100)),
                ('request_delay', models.FloatField(default=3.0, help_text='Seconds between requests')),
                ('max_retries', models.PositiveSmallIntegerField(default=3, help_text='Maximum retry attempts')),
                ('fuzzy_threshold', models.PositiveSmallIntegerField(default=85, help_text='Minimum fuzzy match confidence (0-100)')),
                ('last_run', models.DateTimeField(blank=True, help_text='Last successful run timestamp', null=True)),
                ('last_run_status', models.CharField(blank=True, help_text='Status of last run', max_length=20)),
                ('last_run_games', models.PositiveIntegerField(default=0, help_text='Games found in last run')),
                ('notes', models.TextField(blank=True, help_text='Configuration notes')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('sport', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='scraper_configs', to='core.sport')),
            ],
            options={
                'verbose_name': 'Scraper Configuration',
                'verbose_name_plural': 'Scraper Configurations',
                'ordering': ['sport', 'season'],
                'unique_together': {('sport', 'season')},
            },
        ),
        migrations.AddField(
            model_name='scrapejob',
            name='config',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='jobs', to='scraper.scraperconfig'),
        ),
        # Live (non-historical) manual review queue entries.
        migrations.CreateModel(
            name='ManualReviewItem',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('item_type', models.CharField(choices=[('team', 'Team'), ('stadium', 'Stadium')], max_length=20)),
                ('raw_value', models.CharField(help_text='Original scraped value', max_length=300)),
                ('suggested_id', models.CharField(blank=True, help_text='Suggested canonical ID (if any match found)', max_length=100)),
                ('confidence', models.FloatField(default=0.0, help_text='Match confidence (0.0 - 1.0)')),
                ('reason', models.CharField(choices=[('no_match', 'No Match Found'), ('low_confidence', 'Low Confidence Match'), ('ambiguous', 'Ambiguous Match'), ('new_entity', 'Potentially New Entity')], help_text='Why manual review is needed', max_length=20)),
                ('source_url', models.URLField(blank=True, help_text='URL where this value was found')),
                ('check_date', models.DateField(blank=True, help_text='Date context for alias resolution', null=True)),
                ('context', models.JSONField(blank=True, help_text='Additional context (e.g., game info)', null=True)),
                ('status', models.CharField(choices=[('pending', 'Pending Review'), ('resolved', 'Resolved'), ('ignored', 'Ignored'), ('new_entity', 'Created New Entity')], default='pending', max_length=20)),
                ('resolved_to', models.CharField(blank=True, help_text='Final resolved canonical ID', max_length=100)),
                ('resolved_at', models.DateTimeField(blank=True, null=True)),
                ('resolution_notes', models.TextField(blank=True, help_text='Notes about the resolution')),
                ('create_alias', models.BooleanField(default=False, help_text='Whether to create an alias from this resolution')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('resolved_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='resolved_review_items', to=settings.AUTH_USER_MODEL)),
                ('sport', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='review_items', to='core.sport')),
                ('job', models.ForeignKey(blank=True, help_text='Job that created this review item', null=True, on_delete=django.db.models.deletion.CASCADE, related_name='review_items', to='scraper.scrapejob')),
            ],
            options={
                'verbose_name': 'Manual Review Item',
                'verbose_name_plural': 'Manual Review Items',
                'ordering': ['-created_at'],
                'indexes': [models.Index(fields=['status', 'item_type'], name='scraper_man_status_5d06e2_idx'), models.Index(fields=['sport', 'status'], name='scraper_man_sport_i_7af37b_idx'), models.Index(fields=['raw_value'], name='scraper_man_raw_val_abdd0a_idx')],
            },
        ),
        migrations.AddIndex(
            model_name='scrapejob',
            index=models.Index(fields=['config', 'status'], name='scraper_scr_config__4c4058_idx'),
        ),
        migrations.AddIndex(
            model_name='scrapejob',
            index=models.Index(fields=['status', 'created_at'], name='scraper_scr_status_f3978d_idx'),
        ),
    ]

View File

199
scraper/models.py Normal file
View File

@@ -0,0 +1,199 @@
"""
Scraper models for tracking scraping jobs and manual reviews.
"""
from django.conf import settings
from django.db import models

from simple_history.models import HistoricalRecords
class ScraperConfig(models.Model):
    """
    Configuration for a sport scraper per season.

    NOTE(review): this monolithic ``scraper/models.py`` coexists in the same
    commit with a ``scraper/models/`` package that defines a different
    ``ScraperConfig`` (``is_enabled``, ``sources``, ``request_delay``). Python
    cannot import both a module and a package of the same name — confirm which
    variant is canonical and remove the other.
    """
    sport = models.ForeignKey(
        'core.Sport',
        on_delete=models.CASCADE,
        related_name='scraper_configs'
    )
    season = models.PositiveSmallIntegerField(
        help_text='Season year (start year for split seasons)'
    )
    is_active = models.BooleanField(
        default=True,
        help_text='Whether this config is actively scraping'
    )
    schedule_url = models.URLField(
        blank=True,
        help_text='Base URL for schedule scraping'
    )
    # Scheduling cadence and bookkeeping for the periodic runner.
    scrape_interval_hours = models.PositiveSmallIntegerField(
        default=24,
        help_text='How often to run the scraper (hours)'
    )
    last_scrape_at = models.DateTimeField(
        null=True,
        blank=True,
        help_text='When the last scrape completed'
    )
    next_scrape_at = models.DateTimeField(
        null=True,
        blank=True,
        help_text='When the next scrape is scheduled'
    )
    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)
    # Audit trail (django-simple-history shadow table)
    history = HistoricalRecords()
    class Meta:
        ordering = ['-season', 'sport']
        unique_together = ['sport', 'season']
        verbose_name = 'Scraper Config'
        verbose_name_plural = 'Scraper Configs'
    def __str__(self):
        # e.g. "NBA 2025-26"; season formatting is delegated to the Sport model.
        return f"{self.sport.short_name} {self.sport.get_season_display(self.season)}"
class ScrapeJob(models.Model):
    """
    Record of a scraping job execution.

    NOTE(review): a second ``ScrapeJob`` with a different field set
    (``finished_at``/``error_message`` instead of ``completed_at``/``errors``)
    exists in ``scraper/models/job.py`` in this commit — confirm which variant
    is canonical.
    """
    STATUS_CHOICES = [
        ('pending', 'Pending'),
        ('running', 'Running'),
        ('completed', 'Completed'),
        ('failed', 'Failed'),
        ('cancelled', 'Cancelled'),
    ]
    config = models.ForeignKey(
        ScraperConfig,
        on_delete=models.CASCADE,
        related_name='jobs'
    )
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='pending'
    )
    started_at = models.DateTimeField(
        null=True,
        blank=True
    )
    completed_at = models.DateTimeField(
        null=True,
        blank=True
    )
    # Result counters filled in as the run progresses.
    games_found = models.PositiveIntegerField(default=0)
    games_created = models.PositiveIntegerField(default=0)
    games_updated = models.PositiveIntegerField(default=0)
    errors = models.TextField(blank=True)
    log_output = models.TextField(blank=True)
    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)
    class Meta:
        ordering = ['-created_at']
        verbose_name = 'Scrape Job'
        verbose_name_plural = 'Scrape Jobs'
    def __str__(self):
        return f"{self.config} - {self.status} ({self.created_at.strftime('%Y-%m-%d %H:%M')})"
    @property
    def duration(self):
        """Return job duration in seconds, or None if the job has not
        both started and completed."""
        if self.started_at and self.completed_at:
            return (self.completed_at - self.started_at).total_seconds()
        return None
class ManualReviewItem(models.Model):
    """
    Items flagged for manual review (fuzzy matches, conflicts, etc).

    Created during scraping when an entity cannot be resolved confidently;
    a staff member later approves/rejects/resolves each item.
    """
    STATUS_CHOICES = [
        ('pending', 'Pending Review'),
        ('approved', 'Approved'),
        ('rejected', 'Rejected'),
        ('resolved', 'Resolved'),
    ]
    ITEM_TYPE_CHOICES = [
        ('team', 'Team Match'),
        ('stadium', 'Stadium Match'),
        ('game', 'Game Conflict'),
        ('alias', 'New Alias'),
    ]
    sport = models.ForeignKey(
        'core.Sport',
        on_delete=models.CASCADE,
        related_name='review_items'
    )
    job = models.ForeignKey(
        ScrapeJob,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='review_items'
    )
    item_type = models.CharField(
        max_length=20,
        choices=ITEM_TYPE_CHOICES
    )
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='pending'
    )
    raw_value = models.CharField(
        max_length=500,
        help_text='The raw scraped value'
    )
    matched_value = models.CharField(
        max_length=500,
        blank=True,
        help_text='The matched canonical value (if any)'
    )
    confidence = models.PositiveSmallIntegerField(
        default=0,
        help_text='Match confidence score (0-100)'
    )
    context = models.JSONField(
        default=dict,
        blank=True,
        help_text='Additional context (game date, opposing team, etc)'
    )
    resolution_notes = models.TextField(
        blank=True,
        help_text='Notes about the resolution'
    )
    resolved_at = models.DateTimeField(
        null=True,
        blank=True
    )
    # Use the swappable user reference instead of hard-coded 'auth.User' so a
    # custom AUTH_USER_MODEL keeps working (the project's migrations already
    # declare a swappable dependency on settings.AUTH_USER_MODEL).
    resolved_by = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='resolved_reviews'
    )
    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)
    class Meta:
        # Highest-confidence pending matches first, then newest.
        ordering = ['-confidence', '-created_at']
        verbose_name = 'Manual Review Item'
        verbose_name_plural = 'Manual Review Items'
    def __str__(self):
        return f"{self.item_type}: {self.raw_value} ({self.confidence}%)"

View File

@@ -0,0 +1,10 @@
# Public model API for the ``scraper`` models package: re-export the
# concrete models so callers can use ``from scraper.models import X``.
from .config import ScraperConfig
from .job import ScrapeJob, ScrapeJobLog
from .review import ManualReviewItem
__all__ = [
    'ScraperConfig',
    'ScrapeJob',
    'ScrapeJobLog',
    'ManualReviewItem',
]

102
scraper/models/config.py Normal file
View File

@@ -0,0 +1,102 @@
from django.db import models
from django.conf import settings
from simple_history.models import HistoricalRecords
class ScraperConfig(models.Model):
    """
    Configuration for a sport's scraper.

    One row per (sport, season); governs which sources are tried, rate
    limiting, fuzzy-match thresholds, and records last-run bookkeeping.
    """
    sport = models.ForeignKey(
        'core.Sport',
        on_delete=models.CASCADE,
        related_name='scraper_configs'
    )
    season = models.PositiveSmallIntegerField(
        help_text='Season to scrape (e.g., 2025 for 2025-26 season)'
    )
    is_enabled = models.BooleanField(
        default=True,
        help_text='Whether this scraper is enabled for scheduling'
    )
    # Source configuration
    sources = models.JSONField(
        default=list,
        help_text='Ordered list of sources to try (e.g., ["basketball_reference", "espn"])'
    )
    primary_source = models.CharField(
        max_length=100,
        blank=True,
        help_text='Primary source for this scraper'
    )
    # Rate limiting
    # NOTE: defaults are read from Django settings at import time, so the
    # migration snapshot pins whatever values were configured when it was made.
    request_delay = models.FloatField(
        default=settings.SCRAPER_REQUEST_DELAY,
        help_text='Seconds between requests'
    )
    max_retries = models.PositiveSmallIntegerField(
        default=settings.SCRAPER_MAX_RETRIES,
        help_text='Maximum retry attempts'
    )
    # Fuzzy matching
    fuzzy_threshold = models.PositiveSmallIntegerField(
        default=settings.SCRAPER_FUZZY_THRESHOLD,
        help_text='Minimum fuzzy match confidence (0-100)'
    )
    # Scheduling
    last_run = models.DateTimeField(
        null=True,
        blank=True,
        help_text='Last successful run timestamp'
    )
    last_run_status = models.CharField(
        max_length=20,
        blank=True,
        help_text='Status of last run'
    )
    last_run_games = models.PositiveIntegerField(
        default=0,
        help_text='Games found in last run'
    )
    # Notes
    notes = models.TextField(
        blank=True,
        help_text='Configuration notes'
    )
    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)
    # Audit trail (django-simple-history shadow table)
    history = HistoricalRecords()
    class Meta:
        ordering = ['sport', 'season']
        unique_together = ['sport', 'season']
        verbose_name = 'Scraper Configuration'
        verbose_name_plural = 'Scraper Configurations'
    def __str__(self):
        return f"{self.sport.short_name} {self.sport.get_season_display(self.season)}"
    def get_sources_list(self):
        """Return sources as list, using defaults if empty.

        Falls back to a hard-coded per-sport source ordering when the
        ``sources`` JSON field is empty; unknown sports default to ESPN.
        """
        if self.sources:
            return self.sources
        # Default sources per sport
        defaults = {
            'nba': ['basketball_reference', 'espn'],
            'mlb': ['baseball_reference', 'mlb_api', 'espn'],
            'nfl': ['espn', 'pro_football_reference'],
            'nhl': ['hockey_reference', 'nhl_api', 'espn'],
            'mls': ['espn'],
            'wnba': ['espn'],
            'nwsl': ['espn'],
        }
        return defaults.get(self.sport.code, ['espn'])

159
scraper/models/job.py Normal file
View File

@@ -0,0 +1,159 @@
from django.db import models
from simple_history.models import HistoricalRecords
class ScrapeJob(models.Model):
    """
    Record of a scraping job execution.

    One row per scraper run; the Celery task fills in the counters and
    timing as the run completes. Per-line output lives in ScrapeJobLog.
    """
    STATUS_CHOICES = [
        ('pending', 'Pending'),
        ('running', 'Running'),
        ('completed', 'Completed'),
        ('failed', 'Failed'),
        ('cancelled', 'Cancelled'),
    ]
    config = models.ForeignKey(
        'scraper.ScraperConfig',
        on_delete=models.CASCADE,
        related_name='jobs'
    )
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='pending'
    )
    triggered_by = models.CharField(
        max_length=50,
        default='manual',
        help_text='How the job was triggered (manual, scheduled, api)'
    )
    # Timing
    started_at = models.DateTimeField(null=True, blank=True)
    finished_at = models.DateTimeField(null=True, blank=True)
    # Results
    games_found = models.PositiveIntegerField(default=0)
    games_new = models.PositiveIntegerField(default=0)
    games_updated = models.PositiveIntegerField(default=0)
    games_unchanged = models.PositiveIntegerField(default=0)
    games_errors = models.PositiveIntegerField(default=0)
    teams_found = models.PositiveIntegerField(default=0)
    stadiums_found = models.PositiveIntegerField(default=0)
    review_items_created = models.PositiveIntegerField(default=0)
    # Error tracking
    error_message = models.TextField(blank=True)
    error_traceback = models.TextField(blank=True)
    # Celery task ID for tracking
    celery_task_id = models.CharField(
        max_length=255,
        blank=True,
        help_text='Celery task ID for this job'
    )
    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)
    class Meta:
        ordering = ['-created_at']
        verbose_name = 'Scrape Job'
        verbose_name_plural = 'Scrape Jobs'
        indexes = [
            models.Index(fields=['config', 'status']),
            models.Index(fields=['status', 'created_at']),
        ]
    def __str__(self):
        return f"{self.config} - {self.created_at.strftime('%Y-%m-%d %H:%M')}"
    @property
    def duration(self):
        """Return job duration as timedelta or None."""
        if self.started_at and self.finished_at:
            return self.finished_at - self.started_at
        return None
    @property
    def duration_display(self):
        """Return formatted duration string (e.g. "2m 5s"), or '-' when
        the job has not both started and finished."""
        duration = self.duration
        if duration:
            total_seconds = int(duration.total_seconds())
            minutes, seconds = divmod(total_seconds, 60)
            if minutes > 0:
                return f"{minutes}m {seconds}s"
            return f"{seconds}s"
        return '-'
    @property
    def sport(self):
        """Convenience accessor for the config's sport."""
        return self.config.sport
    @property
    def season(self):
        """Convenience accessor for the config's season."""
        return self.config.season
    def get_summary(self):
        """Return summary dict for notifications."""
        return {
            'sport': self.config.sport.short_name,
            'season': self.config.sport.get_season_display(self.config.season),
            'status': self.status,
            'duration': self.duration_display,
            'games_found': self.games_found,
            'games_new': self.games_new,
            'games_updated': self.games_updated,
            'games_unchanged': self.games_unchanged,
            'games_errors': self.games_errors,
            'review_items': self.review_items_created,
            'error_message': self.error_message,
        }
class ScrapeJobLog(models.Model):
    """
    Log entries for a scrape job.

    Cascade-deleted with the parent job; ordered chronologically.
    """
    LEVEL_CHOICES = [
        ('debug', 'Debug'),
        ('info', 'Info'),
        ('warning', 'Warning'),
        ('error', 'Error'),
    ]
    job = models.ForeignKey(
        ScrapeJob,
        on_delete=models.CASCADE,
        related_name='logs'
    )
    level = models.CharField(
        max_length=10,
        choices=LEVEL_CHOICES,
        default='info'
    )
    message = models.TextField()
    source = models.CharField(
        max_length=100,
        blank=True,
        help_text='Source/component that generated this log'
    )
    extra_data = models.JSONField(
        null=True,
        blank=True,
        help_text='Additional structured data'
    )
    created_at = models.DateTimeField(auto_now_add=True)
    class Meta:
        ordering = ['created_at']
        verbose_name = 'Scrape Job Log'
        verbose_name_plural = 'Scrape Job Logs'
    def __str__(self):
        # Truncate long messages for admin/list displays.
        return f"[{self.level.upper()}] {self.message[:50]}"

192
scraper/models/review.py Normal file
View File

@@ -0,0 +1,192 @@
from django.conf import settings
from django.db import models

from simple_history.models import HistoricalRecords
class ManualReviewItem(models.Model):
    """
    Items that require manual review before resolution.

    Created by scrape jobs when a team/stadium name cannot be resolved
    confidently; staff resolve or ignore each item, optionally creating
    an alias so future scrapes resolve automatically.
    """
    ITEM_TYPE_CHOICES = [
        ('team', 'Team'),
        ('stadium', 'Stadium'),
    ]
    STATUS_CHOICES = [
        ('pending', 'Pending Review'),
        ('resolved', 'Resolved'),
        ('ignored', 'Ignored'),
        ('new_entity', 'Created New Entity'),
    ]
    REASON_CHOICES = [
        ('no_match', 'No Match Found'),
        ('low_confidence', 'Low Confidence Match'),
        ('ambiguous', 'Ambiguous Match'),
        ('new_entity', 'Potentially New Entity'),
    ]
    job = models.ForeignKey(
        'scraper.ScrapeJob',
        on_delete=models.CASCADE,
        related_name='review_items',
        null=True,
        blank=True,
        help_text='Job that created this review item'
    )
    item_type = models.CharField(
        max_length=20,
        choices=ITEM_TYPE_CHOICES
    )
    sport = models.ForeignKey(
        'core.Sport',
        on_delete=models.CASCADE,
        related_name='review_items'
    )
    # Raw value from scraping
    raw_value = models.CharField(
        max_length=300,
        help_text='Original scraped value'
    )
    # Suggested resolution
    suggested_id = models.CharField(
        max_length=100,
        blank=True,
        help_text='Suggested canonical ID (if any match found)'
    )
    confidence = models.FloatField(
        default=0.0,
        help_text='Match confidence (0.0 - 1.0)'
    )
    reason = models.CharField(
        max_length=20,
        choices=REASON_CHOICES,
        help_text='Why manual review is needed'
    )
    # Context
    source_url = models.URLField(
        blank=True,
        help_text='URL where this value was found'
    )
    check_date = models.DateField(
        null=True,
        blank=True,
        help_text='Date context for alias resolution'
    )
    context = models.JSONField(
        null=True,
        blank=True,
        help_text='Additional context (e.g., game info)'
    )
    # Resolution
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='pending'
    )
    resolved_to = models.CharField(
        max_length=100,
        blank=True,
        help_text='Final resolved canonical ID'
    )
    # Use the swappable user reference instead of hard-coded 'auth.User' for
    # consistency with this app's migration, which declares a swappable
    # dependency and targets settings.AUTH_USER_MODEL for this FK.
    resolved_by = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='resolved_review_items'
    )
    resolved_at = models.DateTimeField(null=True, blank=True)
    resolution_notes = models.TextField(
        blank=True,
        help_text='Notes about the resolution'
    )
    create_alias = models.BooleanField(
        default=False,
        help_text='Whether to create an alias from this resolution'
    )
    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)
    # Audit trail (django-simple-history shadow table)
    history = HistoricalRecords()
    class Meta:
        ordering = ['-created_at']
        verbose_name = 'Manual Review Item'
        verbose_name_plural = 'Manual Review Items'
        indexes = [
            models.Index(fields=['status', 'item_type']),
            models.Index(fields=['sport', 'status']),
            models.Index(fields=['raw_value']),
        ]
    def __str__(self):
        return f"{self.item_type}: {self.raw_value} ({self.get_status_display()})"
    @property
    def confidence_display(self):
        """Return confidence as percentage string."""
        return f"{self.confidence * 100:.0f}%"
    def resolve(self, canonical_id, user=None, notes='', create_alias=False):
        """Resolve this review item.

        Args:
            canonical_id: The canonical Team/Stadium ID this item maps to.
            user: The reviewing user (recorded as resolved_by).
            notes: Free-form resolution notes.
            create_alias: If True, also persist a TeamAlias/StadiumAlias so
                future scrapes resolve this raw value automatically.
        """
        from django.utils import timezone
        self.status = 'resolved'
        self.resolved_to = canonical_id
        self.resolved_by = user
        self.resolved_at = timezone.now()
        self.resolution_notes = notes
        self.create_alias = create_alias
        self.save()
        # Optionally create alias
        if create_alias and canonical_id:
            self._create_alias(canonical_id)
    def _create_alias(self, canonical_id):
        """Create an alias from this resolution.

        Silently skips when the target entity no longer exists (the review
        item stays resolved either way); get_or_create keeps this idempotent.
        """
        from core.models import TeamAlias, StadiumAlias, Team, Stadium
        if self.item_type == 'team':
            try:
                team = Team.objects.get(id=canonical_id)
                TeamAlias.objects.get_or_create(
                    team=team,
                    alias=self.raw_value,
                    defaults={
                        'alias_type': 'historical',
                        'source': 'manual_review',
                        'notes': f'Created from review item #{self.id}',
                    }
                )
            except Team.DoesNotExist:
                pass
        elif self.item_type == 'stadium':
            try:
                stadium = Stadium.objects.get(id=canonical_id)
                StadiumAlias.objects.get_or_create(
                    stadium=stadium,
                    alias=self.raw_value,
                    defaults={
                        'alias_type': 'former',
                        'source': 'manual_review',
                        'notes': f'Created from review item #{self.id}',
                    }
                )
            except Stadium.DoesNotExist:
                pass
    def ignore(self, user=None, notes=''):
        """Mark this review item as ignored."""
        from django.utils import timezone
        self.status = 'ignored'
        self.resolved_by = user
        self.resolved_at = timezone.now()
        self.resolution_notes = notes
        self.save()

55
scraper/resources.py Normal file
View File

@@ -0,0 +1,55 @@
"""Import/Export resources for scraper models."""
from import_export import resources, fields
from import_export.widgets import ForeignKeyWidget
from core.models import Sport
from .models import ScraperConfig, ScrapeJob, ManualReviewItem
class ScraperConfigResource(resources.ModelResource):
    """Import/export resource for ScraperConfig, keyed by (sport, season).

    NOTE(review): ``fields`` lists both ``is_active`` and ``is_enabled``, but
    the two ScraperConfig variants in this commit each define only one of
    them (plus ``scrape_interval_hours`` only on the monolithic variant) —
    export will fail for whichever field the active model lacks. Confirm the
    canonical model and prune this list.
    """
    # Serialize the sport FK by its code rather than its PK.
    sport = fields.Field(
        column_name='sport',
        attribute='sport',
        widget=ForeignKeyWidget(Sport, 'code')
    )
    class Meta:
        model = ScraperConfig
        import_id_fields = ['sport', 'season']
        fields = [
            'sport', 'season', 'is_active', 'is_enabled',
            'scrape_interval_hours', 'primary_source',
        ]
        # Refers to the Meta attribute defined just above (the list), not the
        # import_export.fields module — export order mirrors the field list.
        export_order = fields
class ScrapeJobResource(resources.ModelResource):
    """Read-oriented export resource for ScrapeJob with flattened config info.

    NOTE(review): ``errors`` exists only on the monolithic models.py ScrapeJob;
    the models/ package variant calls it ``error_message`` — confirm which
    model is canonical before relying on this export.
    """
    # Flatten config FK traversals into plain export columns.
    sport = fields.Field(attribute='config__sport__code', readonly=True)
    season = fields.Field(attribute='config__season', readonly=True)
    class Meta:
        model = ScrapeJob
        fields = [
            'id', 'sport', 'season', 'status',
            'games_found', 'games_new', 'games_updated', 'games_unchanged',
            'started_at', 'finished_at', 'errors', 'created_at',
        ]
        # The Meta-local field list above, reused as the export column order.
        export_order = fields
class ManualReviewItemResource(resources.ModelResource):
    """Import/export resource for ManualReviewItem, keyed by primary key.

    NOTE(review): ``matched_value`` exists only on the monolithic models.py
    variant; the models/ package variant uses ``suggested_id``/``resolved_to``
    instead — confirm the canonical model and adjust this field list.
    """
    # Serialize the sport FK by its code rather than its PK.
    sport = fields.Field(
        column_name='sport',
        attribute='sport',
        widget=ForeignKeyWidget(Sport, 'code')
    )
    class Meta:
        model = ManualReviewItem
        import_id_fields = ['id']
        fields = [
            'id', 'sport', 'item_type', 'raw_value', 'matched_value',
            'status', 'confidence', 'reason', 'source_url',
            'check_date', 'created_at',
        ]
        # The Meta-local field list above, reused as the export column order.
        export_order = fields

182
scraper/tasks.py Normal file
View File

@@ -0,0 +1,182 @@
import logging
import traceback
from datetime import datetime
from celery import shared_task
from django.utils import timezone
logger = logging.getLogger('scraper')
@shared_task(bind=True, max_retries=3)
def run_scraper_task(self, config_id: int, triggered_by: str = 'manual'):
    """
    Run a scraper job for the given configuration.

    Creates a ScrapeJob row, delegates to run_sport_scraper(), records the
    outcome on both the job and its config, and queues an email notification.
    On failure the Celery task retries up to max_retries with a linear
    backoff (60s, 120s, 180s); each retry attempt creates a fresh ScrapeJob
    row and sends its own failure notification.

    Args:
        config_id: Primary key of the ScraperConfig to run.
        triggered_by: How the job was triggered (manual, scheduled, api).

    Returns:
        Dict summarizing the run (job_id/status/counters), or an error dict
        when the config does not exist.
    """
    from scraper.models import ScraperConfig, ScrapeJob, ScrapeJobLog
    from notifications.tasks import send_scrape_notification
    # Get configuration
    try:
        config = ScraperConfig.objects.select_related('sport').get(id=config_id)
    except ScraperConfig.DoesNotExist:
        logger.error(f"ScraperConfig {config_id} not found")
        return {'error': 'Configuration not found'}
    # Create job record
    job = ScrapeJob.objects.create(
        config=config,
        status='running',
        triggered_by=triggered_by,
        started_at=timezone.now(),
        celery_task_id=self.request.id,
    )
    def log(level, message, source='', extra_data=None):
        # Persist to the job's log table AND mirror to the Python logger
        # (level names match logger method names: debug/info/warning/error).
        ScrapeJobLog.objects.create(
            job=job,
            level=level,
            message=message,
            source=source,
            extra_data=extra_data,
        )
        getattr(logger, level)(f"[{config.sport.code}] {message}")
    try:
        log('info', f'Starting scraper for {config.sport.short_name} {config.season}')
        # Import and run the appropriate scraper
        result = run_sport_scraper(config, log)
        # Update job with results
        job.status = 'completed'
        job.finished_at = timezone.now()
        job.games_found = result.get('games_found', 0)
        job.games_new = result.get('games_new', 0)
        job.games_updated = result.get('games_updated', 0)
        job.games_unchanged = result.get('games_unchanged', 0)
        job.games_errors = result.get('games_errors', 0)
        job.teams_found = result.get('teams_found', 0)
        job.stadiums_found = result.get('stadiums_found', 0)
        job.review_items_created = result.get('review_items', 0)
        job.save()
        # Update config
        config.last_run = timezone.now()
        config.last_run_status = 'completed'
        config.last_run_games = result.get('games_found', 0)
        config.save()
        log('info', f'Scraper completed: {job.games_found} games, {job.games_new} new, {job.review_items_created} reviews')
        # Send notification
        send_scrape_notification.delay(job.id)
        return {
            'job_id': job.id,
            'status': 'completed',
            'games_found': job.games_found,
            'games_new': job.games_new,
            'review_items': job.review_items_created,
        }
    except Exception as e:
        error_msg = str(e)
        error_tb = traceback.format_exc()
        job.status = 'failed'
        job.finished_at = timezone.now()
        job.error_message = error_msg
        job.error_traceback = error_tb
        job.save()
        config.last_run = timezone.now()
        config.last_run_status = 'failed'
        config.save()
        log('error', f'Scraper failed: {error_msg}', extra_data={'traceback': error_tb})
        # Send failure notification
        # NOTE(review): this fires before the retry below, so every retry
        # attempt emails a failure report — confirm that is intended.
        send_scrape_notification.delay(job.id)
        # Retry if applicable
        if self.request.retries < self.max_retries:
            raise self.retry(exc=e, countdown=60 * (self.request.retries + 1))
        return {
            'job_id': job.id,
            'status': 'failed',
            'error': error_msg,
        }
def run_sport_scraper(config, log_func):
    """
    Run the appropriate scraper for ``config``'s sport and season.

    Thin dispatch layer: all real work happens in
    ``scraper.engine.adapter.ScraperAdapter``, which adapts the
    standalone sportstime_parser scrapers to the Django models.

    Args:
        config: ScraperConfig instance (``.sport.code`` and ``.season``
            are read here; the adapter receives the full object).
        log_func: callable(level, message, source='', extra_data=None)
            used to persist progress lines against the current job.

    Returns:
        dict of result counters (games_found, games_new, ...) as
        produced by ``ScraperAdapter.run()``.
    """
    # NOTE: the original imported Game/Team/Stadium and
    # ManualReviewItem here but never used them; dead imports removed.
    sport_code = config.sport.code
    season = config.season

    log_func('info', f'Loading scraper for {sport_code}', source='engine')

    # Imported lazily: the adapter pulls in the sportstime_parser
    # scraper stack, which is heavy and not needed at module import.
    from scraper.engine.adapter import ScraperAdapter

    adapter = ScraperAdapter(
        sport_code=sport_code,
        season=season,
        config=config,
        log_func=log_func,
    )

    # Run the scraper and hand the counters straight back to the caller.
    return adapter.run()
@shared_task
def run_all_enabled_scrapers():
    """
    Queue a scrape task for every enabled ScraperConfig.

    Called by celery-beat on schedule. The queryset is evaluated
    exactly once (ids only), so the reported count always matches the
    number of tasks actually dispatched — the original iterated the
    queryset and then issued a separate ``.count()`` query, which
    could race with concurrent config changes and disagree.

    Returns:
        dict with 'configs_queued': number of tasks dispatched.
    """
    from scraper.models import ScraperConfig

    # Fetch only the primary keys; delay() needs nothing else.
    config_ids = list(
        ScraperConfig.objects.filter(is_enabled=True).values_list('id', flat=True)
    )
    for config_id in config_ids:
        run_scraper_task.delay(config_id, triggered_by='scheduled')

    return {'configs_queued': len(config_ids)}
@shared_task
def cleanup_old_jobs(days: int = 30):
    """
    Delete ScrapeJob and ScrapeJobLog records older than ``days`` days.

    Logs are removed explicitly before their parent jobs so the two
    reported counts stay meaningful; deleting only the jobs would
    cascade into the logs and fold both into one opaque total.

    Args:
        days: retention window — rows created before now - days are
            deleted. Defaults to 30.

    Returns:
        dict with 'jobs_deleted' and 'logs_deleted' counts.
    """
    from datetime import timedelta

    from scraper.models import ScrapeJob, ScrapeJobLog

    # `timezone` is already imported at module level; the original
    # re-imported it locally, which was redundant.
    cutoff = timezone.now() - timedelta(days=days)

    # Delete old logs first (child rows of ScrapeJob).
    logs_deleted, _ = ScrapeJobLog.objects.filter(
        job__created_at__lt=cutoff
    ).delete()

    # Then delete the now log-less jobs.
    jobs_deleted, _ = ScrapeJob.objects.filter(
        created_at__lt=cutoff
    ).delete()

    return {
        'jobs_deleted': jobs_deleted,
        'logs_deleted': logs_deleted,
    }