feat: add Django web app, CloudKit sync, dashboard, and game_datetime_utc export
Adds the full Django application layer on top of sportstime_parser: - core: Sport, Team, Stadium, Game models with aliases and league structure - scraper: orchestration engine, adapter, job management, Celery tasks - cloudkit: CloudKit sync client, sync state tracking, sync jobs - dashboard: staff dashboard for monitoring scrapers, sync, review queue - notifications: email reports for scrape/sync results - Docker setup for deployment (Dockerfile, docker-compose, entrypoint) Game exports now use game_datetime_utc (ISO 8601 UTC) instead of venue-local date+time strings, matching the canonical format used by the iOS app. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
1
scraper/__init__.py
Normal file
1
scraper/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Legacy AppConfig pointer used by Django < 3.2 app discovery.
# NOTE(review): default_app_config is deprecated since Django 3.2 and was
# removed in Django 4.1 -- scraper/apps.py:ScraperConfig is auto-discovered
# there. Confirm the project's Django version before keeping this line.
default_app_config = 'scraper.apps.ScraperConfig'
|
||||
139
scraper/admin.py
Normal file
139
scraper/admin.py
Normal file
@@ -0,0 +1,139 @@
|
||||
"""
|
||||
Admin configuration for scraper models.
|
||||
"""
|
||||
from django.contrib import admin
|
||||
from django.utils.html import format_html
|
||||
from import_export.admin import ImportExportMixin, ImportExportModelAdmin
|
||||
from simple_history.admin import SimpleHistoryAdmin
|
||||
|
||||
from .models import ScraperConfig, ScrapeJob, ManualReviewItem
|
||||
from .resources import ScraperConfigResource, ScrapeJobResource, ManualReviewItemResource
|
||||
|
||||
|
||||
@admin.register(ScraperConfig)
class ScraperConfigAdmin(ImportExportMixin, SimpleHistoryAdmin):
    """Admin for scraper configurations with import/export and history.

    NOTE(review): both scraper/admin.py and the scraper/admin/ package
    register ScraperConfig; Python imports the package in preference to
    this module, so only one registration can ever be active -- confirm
    which is intended and remove the other.
    """

    resource_class = ScraperConfigResource
    list_display = (
        '__str__',
        'sport',
        'season',
        'is_active',
        'last_scrape_at',
        'next_scrape_at',
        'scrape_interval_hours',
    )
    list_filter = ('sport', 'is_active', 'season')
    search_fields = ('sport__name', 'sport__short_name')
    ordering = ('-season', 'sport')
    # Timestamps are system-managed; never editable in the admin.
    readonly_fields = ('created_at', 'updated_at')
|
||||
|
||||
|
||||
@admin.register(ScrapeJob)
class ScrapeJobAdmin(ImportExportModelAdmin):
    """Admin for scrape job runs: colored status badge, game counters and
    a human-readable duration column.

    NOTE(review): both scraper/admin.py and the scraper/admin/ package
    register ScrapeJob; the package shadows this module on import --
    confirm which registration is intended.
    """

    resource_class = ScrapeJobResource
    list_display = (
        '__str__',
        'status_badge',
        'games_found',
        'games_created',
        'games_updated',
        'duration_display',
        'created_at',
    )
    list_filter = ('status', 'config__sport', ('created_at', admin.DateFieldListFilter))
    search_fields = ('config__sport__name', 'errors')
    ordering = ('-created_at',)
    readonly_fields = ('created_at', 'updated_at', 'duration_display')

    # Badge background color per job status; fallback gray for unknown values.
    _STATUS_COLORS = {
        'pending': '#ffc107',
        'running': '#17a2b8',
        'completed': '#28a745',
        'failed': '#dc3545',
        'cancelled': '#6c757d',
    }

    @admin.display(description='Status')
    def status_badge(self, obj):
        """Render the job status as a colored pill."""
        return format_html(
            '<span style="background-color: {}; color: white; padding: 3px 8px; '
            'border-radius: 3px; font-size: 11px;">{}</span>',
            self._STATUS_COLORS.get(obj.status, '#6c757d'),
            obj.get_status_display(),
        )

    @admin.display(description='Duration')
    def duration_display(self, obj):
        """Format obj.duration (seconds) as s / m / h, or '-' when unset."""
        secs = obj.duration
        if secs is None:
            return '-'
        if secs < 60:
            return f"{secs:.1f}s"
        if secs < 3600:
            return f"{secs/60:.1f}m"
        return f"{secs/3600:.1f}h"
|
||||
|
||||
|
||||
@admin.register(ManualReviewItem)
class ManualReviewItemAdmin(ImportExportModelAdmin):
    """Admin for manual review items with status/confidence visuals and
    bulk approve/reject actions.

    NOTE(review): both scraper/admin.py and the scraper/admin/ package
    register ManualReviewItem; the package shadows this module on import.
    This version also references fields (matched_value, approved/rejected
    statuses) that differ from scraper/admin/review_admin.py -- confirm
    which schema is current and delete the stale module.
    """

    resource_class = ManualReviewItemResource
    list_display = [
        'raw_value',
        'item_type',
        'sport',
        'status_badge',
        'confidence_bar',
        'matched_value',
        'created_at',
    ]
    list_filter = ['status', 'item_type', 'sport']
    search_fields = ['raw_value', 'matched_value']
    # Highest-confidence, newest items first.
    ordering = ['-confidence', '-created_at']
    readonly_fields = ['created_at', 'updated_at', 'resolved_at', 'resolved_by']
    actions = ['approve_items', 'reject_items']

    @admin.display(description='Status')
    def status_badge(self, obj):
        """Render the review status as a colored pill."""
        colors = {
            'pending': '#ffc107',
            'approved': '#28a745',
            'rejected': '#dc3545',
            'resolved': '#17a2b8',
        }
        # Unknown statuses fall back to neutral gray.
        color = colors.get(obj.status, '#6c757d')
        return format_html(
            '<span style="background-color: {}; color: white; padding: 3px 8px; '
            'border-radius: 3px; font-size: 11px;">{}</span>',
            color, obj.get_status_display()
        )

    @admin.display(description='Confidence')
    def confidence_bar(self, obj):
        """Render confidence as a colored horizontal bar.

        NOTE(review): thresholds here (>= 85 / >= 70) assume a 0-100
        percent scale, but scraper/engine/adapter.py stores
        confidence / 100.0 (0-1 scale) and scraper/admin/review_admin.py
        multiplies by 100 before comparing. On a 0-1 field these
        thresholds never trigger and the bar width is at most 1% --
        confirm the model field's scale.
        """
        color = '#28a745' if obj.confidence >= 85 else '#ffc107' if obj.confidence >= 70 else '#dc3545'
        return format_html(
            '<div style="width: 100px; background: #ddd; border-radius: 3px;">'
            '<div style="width: {}%; background: {}; height: 16px; border-radius: 3px; '
            'text-align: center; color: white; font-size: 11px; line-height: 16px;">'
            '{}%</div></div>',
            obj.confidence, color, obj.confidence
        )

    @admin.action(description='Approve selected items')
    def approve_items(self, request, queryset):
        """Bulk-approve items, stamping resolver and timestamp.

        NOTE(review): queryset.update() bypasses model save() and any
        per-item resolution hooks (compare review_admin.py, which calls
        item.resolve()) -- confirm this shortcut is intended.
        """
        from django.utils import timezone
        updated = queryset.update(
            status='approved',
            resolved_at=timezone.now(),
            resolved_by=request.user
        )
        self.message_user(request, f'{updated} items approved.')

    @admin.action(description='Reject selected items')
    def reject_items(self, request, queryset):
        """Bulk-reject items, stamping resolver and timestamp.

        Same bulk-update caveat as approve_items.
        """
        from django.utils import timezone
        updated = queryset.update(
            status='rejected',
            resolved_at=timezone.now(),
            resolved_by=request.user
        )
        self.message_user(request, f'{updated} items rejected.')
|
||||
3
scraper/admin/__init__.py
Normal file
3
scraper/admin/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .config_admin import ScraperConfigAdmin
|
||||
from .job_admin import ScrapeJobAdmin
|
||||
from .review_admin import ManualReviewItemAdmin
|
||||
110
scraper/admin/config_admin.py
Normal file
110
scraper/admin/config_admin.py
Normal file
@@ -0,0 +1,110 @@
|
||||
from django.contrib import admin
|
||||
from django.utils.html import format_html
|
||||
from django.urls import reverse
|
||||
from simple_history.admin import SimpleHistoryAdmin
|
||||
|
||||
from scraper.models import ScraperConfig
|
||||
|
||||
|
||||
@admin.register(ScraperConfig)
class ScraperConfigAdmin(SimpleHistoryAdmin):
    """Admin for per-sport/season scraper configurations.

    Shows last-run state inline, links to the related jobs, and offers
    bulk actions to run / enable / disable scrapers. Change history is
    provided by SimpleHistoryAdmin.
    """

    list_display = [
        'sport',
        'season_display',
        'is_enabled',
        'primary_source',
        'last_run_display',
        'last_run_status_badge',
        'last_run_games',
        'job_count',
    ]
    list_filter = ['sport', 'is_enabled', 'last_run_status']
    search_fields = ['sport__name', 'sport__short_name']
    ordering = ['-season', 'sport']
    # Last-run fields are written by the scraper engine, not by staff.
    readonly_fields = [
        'created_at',
        'updated_at',
        'last_run',
        'last_run_status',
        'last_run_games',
    ]

    fieldsets = [
        (None, {
            'fields': ['sport', 'season', 'is_enabled']
        }),
        ('Source Configuration', {
            'fields': ['sources', 'primary_source']
        }),
        ('Rate Limiting', {
            'fields': ['request_delay', 'max_retries']
        }),
        ('Matching', {
            'fields': ['fuzzy_threshold']
        }),
        ('Last Run', {
            'fields': ['last_run', 'last_run_status', 'last_run_games'],
            'classes': ['collapse']
        }),
        ('Notes', {
            'fields': ['notes'],
            'classes': ['collapse']
        }),
        ('Metadata', {
            'fields': ['created_at', 'updated_at'],
            'classes': ['collapse']
        }),
    ]

    actions = ['run_scraper', 'enable_scrapers', 'disable_scrapers']

    def season_display(self, obj):
        # Delegates season formatting to the Sport model (league-specific
        # season labels, e.g. cross-year seasons).
        return obj.sport.get_season_display(obj.season)
    season_display.short_description = 'Season'

    def last_run_display(self, obj):
        # Compact timestamp for the changelist; '-' when never run.
        if obj.last_run:
            return obj.last_run.strftime('%Y-%m-%d %H:%M')
        return '-'
    last_run_display.short_description = 'Last Run'

    def last_run_status_badge(self, obj):
        """Render the last-run status as bold colored text."""
        if not obj.last_run_status:
            return '-'
        colors = {
            'completed': 'green',
            'failed': 'red',
            'running': 'orange',
        }
        # Unknown statuses render gray rather than erroring.
        color = colors.get(obj.last_run_status, 'gray')
        return format_html(
            '<span style="color: {}; font-weight: bold;">{}</span>',
            color,
            obj.last_run_status.upper()
        )
    last_run_status_badge.short_description = 'Status'

    def job_count(self, obj):
        """Link to the ScrapeJob changelist filtered to this config."""
        count = obj.jobs.count()
        if count > 0:
            url = reverse('admin:scraper_scrapejob_changelist') + f'?config__id__exact={obj.id}'
            return format_html('<a href="{}">{} jobs</a>', url, count)
        return '0 jobs'
    job_count.short_description = 'Jobs'

    @admin.action(description='Run scraper for selected configurations')
    def run_scraper(self, request, queryset):
        """Queue a Celery scrape task for each selected configuration."""
        # Imported lazily so the admin loads without a Celery broker.
        from scraper.tasks import run_scraper_task
        for config in queryset:
            run_scraper_task.delay(config.id)
        self.message_user(request, f'Started {queryset.count()} scraper jobs.')

    @admin.action(description='Enable selected scrapers')
    def enable_scrapers(self, request, queryset):
        updated = queryset.update(is_enabled=True)
        self.message_user(request, f'{updated} scrapers enabled.')

    @admin.action(description='Disable selected scrapers')
    def disable_scrapers(self, request, queryset):
        updated = queryset.update(is_enabled=False)
        self.message_user(request, f'{updated} scrapers disabled.')
|
||||
154
scraper/admin/job_admin.py
Normal file
154
scraper/admin/job_admin.py
Normal file
@@ -0,0 +1,154 @@
|
||||
from django.contrib import admin
|
||||
from django.utils.html import format_html
|
||||
from django.urls import reverse
|
||||
|
||||
from scraper.models import ScrapeJob, ScrapeJobLog
|
||||
|
||||
|
||||
class ScrapeJobLogInline(admin.TabularInline):
    """Read-only inline listing log lines of a ScrapeJob, oldest first."""

    model = ScrapeJobLog
    extra = 0
    fields = ['created_at', 'level', 'source', 'message']
    readonly_fields = ['created_at', 'level', 'source', 'message']
    ordering = ['created_at']
    can_delete = False

    def has_add_permission(self, request, obj=None):
        # Log rows are written by the scraper engine only; never via admin.
        return False
|
||||
|
||||
|
||||
@admin.register(ScrapeJob)
class ScrapeJobAdmin(admin.ModelAdmin):
    """Read-only admin for scrape job runs.

    Jobs are created by the scraper engine; the admin only inspects them
    (add/change permissions are disabled below) and offers cancel/retry
    bulk actions.
    """

    list_display = [
        'id',
        'config',
        'status_badge',
        'triggered_by',
        'started_at',
        'duration_display',
        'games_summary',
        'review_items_link',
    ]
    list_filter = ['status', 'config__sport', 'triggered_by', 'config__season']
    search_fields = ['config__sport__name', 'celery_task_id']
    date_hierarchy = 'created_at'
    ordering = ['-created_at']
    # Every field is engine-written; the whole record is immutable here.
    readonly_fields = [
        'id',
        'config',
        'status',
        'triggered_by',
        'started_at',
        'finished_at',
        'duration_display',
        'games_found',
        'games_new',
        'games_updated',
        'games_unchanged',
        'games_errors',
        'teams_found',
        'stadiums_found',
        'review_items_created',
        'error_message',
        'error_traceback',
        'celery_task_id',
        'created_at',
        'updated_at',
    ]
    inlines = [ScrapeJobLogInline]

    fieldsets = [
        (None, {
            'fields': ['id', 'config', 'status', 'triggered_by', 'celery_task_id']
        }),
        ('Timing', {
            'fields': ['started_at', 'finished_at', 'duration_display']
        }),
        ('Results - Games', {
            'fields': [
                'games_found',
                'games_new',
                'games_updated',
                'games_unchanged',
                'games_errors',
            ]
        }),
        ('Results - Other', {
            'fields': ['teams_found', 'stadiums_found', 'review_items_created']
        }),
        ('Errors', {
            'fields': ['error_message', 'error_traceback'],
            'classes': ['collapse']
        }),
        ('Metadata', {
            'fields': ['created_at', 'updated_at'],
            'classes': ['collapse']
        }),
    ]

    actions = ['cancel_jobs', 'retry_jobs']

    def has_add_permission(self, request):
        # Jobs are only ever created by the scraper engine.
        return False

    def has_change_permission(self, request, obj=None):
        # View-only: blocks the change form's save path entirely.
        return False

    def status_badge(self, obj):
        """Render the job status as a colored pill."""
        colors = {
            'pending': '#999',
            'running': '#f0ad4e',
            'completed': '#5cb85c',
            'failed': '#d9534f',
            'cancelled': '#777',
        }
        color = colors.get(obj.status, '#999')
        return format_html(
            '<span style="background-color: {}; color: white; padding: 3px 8px; '
            'border-radius: 3px; font-size: 11px;">{}</span>',
            color,
            obj.status.upper()
        )
    status_badge.short_description = 'Status'

    def games_summary(self, obj):
        """One-line game counts with the full breakdown in a hover title."""
        if obj.games_found == 0:
            return '-'
        return format_html(
            '<span title="New: {}, Updated: {}, Unchanged: {}, Errors: {}">'
            '{} found ({} new, {} upd)</span>',
            obj.games_new, obj.games_updated, obj.games_unchanged, obj.games_errors,
            obj.games_found, obj.games_new, obj.games_updated
        )
    games_summary.short_description = 'Games'

    def review_items_link(self, obj):
        """Link to the review-item changelist filtered to this job."""
        if obj.review_items_created > 0:
            url = reverse('admin:scraper_manualreviewitem_changelist') + f'?job__id__exact={obj.id}'
            return format_html(
                '<a href="{}">{} items</a>',
                url, obj.review_items_created
            )
        return '-'
    review_items_link.short_description = 'Review'

    @admin.action(description='Cancel selected jobs')
    def cancel_jobs(self, request, queryset):
        """Revoke the Celery task (if any) and mark each job cancelled.

        Only pending/running jobs are touched; jobs without a stored
        task id are still marked cancelled locally.
        """
        from celery.result import AsyncResult
        cancelled = 0
        for job in queryset.filter(status__in=['pending', 'running']):
            if job.celery_task_id:
                # terminate=True also kills an already-executing task.
                AsyncResult(job.celery_task_id).revoke(terminate=True)
            job.status = 'cancelled'
            job.save()
            cancelled += 1
        self.message_user(request, f'{cancelled} jobs cancelled.')

    @admin.action(description='Retry failed jobs')
    def retry_jobs(self, request, queryset):
        """Queue a fresh scrape for each failed job's configuration.

        Note: this creates a new job for the config rather than re-running
        the failed job record itself.
        """
        from scraper.tasks import run_scraper_task
        retried = 0
        for job in queryset.filter(status='failed'):
            run_scraper_task.delay(job.config.id)
            retried += 1
        self.message_user(request, f'{retried} jobs requeued.')
|
||||
157
scraper/admin/review_admin.py
Normal file
157
scraper/admin/review_admin.py
Normal file
@@ -0,0 +1,157 @@
|
||||
from django.contrib import admin
|
||||
from django.utils.html import format_html
|
||||
from django.utils import timezone
|
||||
from simple_history.admin import SimpleHistoryAdmin
|
||||
|
||||
from scraper.models import ManualReviewItem
|
||||
|
||||
|
||||
@admin.register(ManualReviewItem)
class ManualReviewItemAdmin(SimpleHistoryAdmin):
    """Admin for manual review items produced by scrape runs.

    Scraped values that could not be matched automatically land here.
    Staff resolve an item to its suggested canonical id (optionally
    creating an alias for future runs) or mark it ignored. All
    scraper-written fields are read-only; only the resolution fields
    are editable.
    """

    list_display = [
        'raw_value',
        'item_type',
        'sport',
        'status_badge',
        'suggested_match',
        'confidence_badge',
        'reason',
        'created_at',
    ]
    list_filter = ['status', 'item_type', 'sport', 'reason']
    search_fields = ['raw_value', 'suggested_id', 'resolved_to']
    ordering = ['-created_at']
    # Scraper-produced and audit fields are immutable in the admin.
    readonly_fields = [
        'job',
        'item_type',
        'sport',
        'raw_value',
        'suggested_id',
        'confidence',
        'reason',
        'source_url',
        'check_date',
        'context',
        'resolved_by',
        'resolved_at',
        'created_at',
        'updated_at',
    ]
    autocomplete_fields = []

    fieldsets = [
        (None, {
            'fields': ['job', 'item_type', 'sport', 'raw_value']
        }),
        ('Suggested Match', {
            'fields': ['suggested_id', 'confidence', 'reason']
        }),
        ('Context', {
            'fields': ['source_url', 'check_date', 'context'],
            'classes': ['collapse']
        }),
        ('Resolution', {
            'fields': [
                'status',
                'resolved_to',
                'create_alias',
                'resolution_notes',
                'resolved_by',
                'resolved_at',
            ]
        }),
        ('Metadata', {
            'fields': ['created_at', 'updated_at'],
            'classes': ['collapse']
        }),
    ]

    actions = [
        'accept_suggested',
        'mark_ignored',
        'accept_and_create_alias',
    ]

    def status_badge(self, obj):
        """Render the review status as a colored pill."""
        colors = {
            'pending': '#f0ad4e',
            'resolved': '#5cb85c',
            'ignored': '#999',
            'new_entity': '#5bc0de',
        }
        color = colors.get(obj.status, '#999')
        return format_html(
            '<span style="background-color: {}; color: white; padding: 3px 8px; '
            'border-radius: 3px; font-size: 11px;">{}</span>',
            color,
            obj.get_status_display().upper()
        )
    status_badge.short_description = 'Status'

    def suggested_match(self, obj):
        """Show the suggested canonical id in a code chip, or '-'."""
        if obj.suggested_id:
            return format_html(
                '<code style="background: #f5f5f5; padding: 2px 5px;">{}</code>',
                obj.suggested_id
            )
        return '-'
    suggested_match.short_description = 'Suggested'

    def confidence_badge(self, obj):
        """Render confidence (stored on a 0-1 scale) as a colored percent."""
        if obj.confidence == 0:
            return '-'
        pct = obj.confidence * 100
        if pct >= 85:
            color = '#5cb85c'
        elif pct >= 70:
            color = '#f0ad4e'
        else:
            color = '#d9534f'
        return format_html(
            '<span style="color: {}; font-weight: bold;">{:.0f}%</span>',
            color, pct
        )
    confidence_badge.short_description = 'Conf.'

    def _pending_with_suggestion(self, queryset):
        """Pending items that carry a usable suggested id.

        The adapter stores '' (not NULL) when no match was suggested, so
        filtering on suggested_id__isnull alone would let an action
        "resolve" an item to an empty canonical id. Exclude both.
        """
        return queryset.filter(
            status='pending', suggested_id__isnull=False
        ).exclude(suggested_id='')

    @admin.action(description='Accept suggested match')
    def accept_suggested(self, request, queryset):
        """Resolve pending items to their suggested canonical id."""
        resolved = 0
        for item in self._pending_with_suggestion(queryset):
            item.resolve(
                canonical_id=item.suggested_id,
                user=request.user,
                notes='Accepted suggested match via admin action'
            )
            resolved += 1
        self.message_user(request, f'{resolved} items resolved.')

    @admin.action(description='Accept suggested and create alias')
    def accept_and_create_alias(self, request, queryset):
        """Resolve pending items and register the raw value as an alias."""
        resolved = 0
        for item in self._pending_with_suggestion(queryset):
            item.resolve(
                canonical_id=item.suggested_id,
                user=request.user,
                notes='Accepted and created alias via admin action',
                create_alias=True
            )
            resolved += 1
        self.message_user(request, f'{resolved} items resolved with aliases created.')

    @admin.action(description='Mark as ignored')
    def mark_ignored(self, request, queryset):
        """Mark pending items as ignored without resolving them."""
        ignored = 0
        for item in queryset.filter(status='pending'):
            item.ignore(
                user=request.user,
                notes='Ignored via admin action'
            )
            ignored += 1
        self.message_user(request, f'{ignored} items ignored.')

    def save_model(self, request, obj, form, change):
        """Stamp resolver/timestamp when a manual edit closes an item."""
        # Auto-set resolved_by and resolved_at when status changes to resolved
        if change and obj.status in ['resolved', 'ignored'] and not obj.resolved_by:
            obj.resolved_by = request.user
            obj.resolved_at = timezone.now()
        super().save_model(request, obj, form, change)
|
||||
7
scraper/apps.py
Normal file
7
scraper/apps.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class ScraperConfig(AppConfig):
    """App configuration for the scraper management app."""

    name = 'scraper'
    verbose_name = 'Scraper Management'
    default_auto_field = 'django.db.models.BigAutoField'
|
||||
1
scraper/engine/__init__.py
Normal file
1
scraper/engine/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Scraper engine package
|
||||
496
scraper/engine/adapter.py
Normal file
496
scraper/engine/adapter.py
Normal file
@@ -0,0 +1,496 @@
|
||||
"""
|
||||
Adapter to bridge existing sportstime_parser scrapers with Django models.
|
||||
"""
|
||||
import hashlib
|
||||
from datetime import datetime
|
||||
from typing import Callable, Optional
|
||||
|
||||
from django.db import transaction
|
||||
from django.utils import timezone
|
||||
|
||||
|
||||
class ScraperAdapter:
|
||||
"""
|
||||
Adapts the existing sportstime_parser scrapers to work with Django models.
|
||||
"""
|
||||
|
||||
    def __init__(
        self,
        sport_code: str,
        season: int,
        config,
        log_func: Optional[Callable] = None,
    ):
        """Set up an adapter for one sport/season scrape run.

        Args:
            sport_code: Lowercase sport key, e.g. 'nba' (see _create_scraper
                for the supported set).
            season: Season year to scrape.
            config: ScraperConfig instance driving this run; used to locate
                the active ScrapeJob when creating review items.
            log_func: Optional callable(level, msg, **kwargs) for job
                logging.
        """
        self.sport_code = sport_code
        self.season = season
        self.config = config
        # Default to a no-op logger so call sites never need a None check
        # and the adapter can be run standalone.
        self.log = log_func or (lambda level, msg, **kw: None)
|
||||
|
||||
    def run(self) -> dict:
        """
        Run the scraper and persist results into the Django models.

        Order matters: stadiums first (teams reference them via the
        home_stadium FK), then teams, then games, then review items.

        Returns:
            dict with counters: games_found/new/updated/unchanged/errors,
            teams_found, stadiums_found, review_items.

        Raises:
            ValueError: if self.sport_code has no Sport row.
        """
        # NOTE(review): Team, Game, ManualReviewItem and CloudKitSyncState
        # appear unused in this method (the helpers import their own);
        # confirm before pruning.
        from core.models import Sport, Team, Stadium, Game
        from scraper.models import ManualReviewItem
        from cloudkit.models import CloudKitSyncState

        result = {
            'games_found': 0,
            'games_new': 0,
            'games_updated': 0,
            'games_unchanged': 0,
            'games_errors': 0,
            'teams_found': 0,
            'stadiums_found': 0,
            'review_items': 0,
        }

        # Get sport
        try:
            sport = Sport.objects.get(code=self.sport_code)
        except Sport.DoesNotExist:
            raise ValueError(f"Sport {self.sport_code} not found in database")

        self.log('info', f'Starting scraper for {sport.short_name} {self.season}', source='adapter')

        # Import and create the appropriate scraper
        scraper = self._create_scraper()

        # Run the scrape
        self.log('info', 'Scraping games...', source='adapter')
        raw_result = scraper.scrape_all()

        # Process stadiums first (teams reference stadiums via home_stadium FK)
        self.log('info', f'Processing {len(raw_result.stadiums)} stadiums...', source='adapter')
        result['stadiums_found'] = len(raw_result.stadiums)
        self._process_stadiums(sport, raw_result.stadiums)

        # Process teams
        self.log('info', f'Processing {len(raw_result.teams)} teams...', source='adapter')
        result['teams_found'] = len(raw_result.teams)
        self._process_teams(sport, raw_result.teams)

        # Process games
        self.log('info', f'Processing {len(raw_result.games)} games...', source='adapter')
        game_result = self._process_games(sport, raw_result.games)
        # game_result overwrites the games_* zeros seeded above.
        result.update(game_result)

        # Process review items
        if raw_result.review_items:
            self.log('info', f'Creating {len(raw_result.review_items)} review items...', source='adapter')
            result['review_items'] = self._process_review_items(sport, raw_result.review_items)

        self.log('info', f'Scrape complete: {result}', source='adapter')
        return result
|
||||
|
||||
def _create_scraper(self):
|
||||
"""Create the appropriate scraper instance."""
|
||||
# Import from existing sportstime_parser
|
||||
from sportstime_parser.scrapers import (
|
||||
create_nba_scraper,
|
||||
create_mlb_scraper,
|
||||
create_nfl_scraper,
|
||||
create_nhl_scraper,
|
||||
create_mls_scraper,
|
||||
create_wnba_scraper,
|
||||
create_nwsl_scraper,
|
||||
)
|
||||
|
||||
scrapers = {
|
||||
'nba': create_nba_scraper,
|
||||
'mlb': create_mlb_scraper,
|
||||
'nfl': create_nfl_scraper,
|
||||
'nhl': create_nhl_scraper,
|
||||
'mls': create_mls_scraper,
|
||||
'wnba': create_wnba_scraper,
|
||||
'nwsl': create_nwsl_scraper,
|
||||
}
|
||||
|
||||
creator = scrapers.get(self.sport_code)
|
||||
if not creator:
|
||||
raise ValueError(f"No scraper for sport: {self.sport_code}")
|
||||
|
||||
# Create scraper (config overrides handled via session/resolver settings if needed)
|
||||
return creator(season=self.season)
|
||||
|
||||
    def _process_teams(self, sport, teams):
        """Upsert scraped teams and queue newly created ones for sync.

        NOTE(review): Conference is imported but unused here. Also, only
        *created* teams get a CloudKitSyncState row -- updates to existing
        teams are not re-queued for sync, unlike games in _process_games;
        confirm that is intended.
        """
        from core.models import Team, Stadium, Division, Conference
        from cloudkit.models import CloudKitSyncState

        for team_data in teams:
            team_id = team_data.id

            # Find division if available
            division = None
            if team_data.division:
                # Exact (case-insensitive) name match first...
                division = Division.objects.filter(
                    conference__sport=sport,
                    name__iexact=team_data.division
                ).first()
                # Fallback to partial match
                if not division:
                    division = Division.objects.filter(
                        conference__sport=sport,
                        name__icontains=team_data.division
                    ).first()

            # Resolve home stadium if available
            home_stadium = None
            stadium_id = getattr(team_data, 'stadium_id', None)
            if stadium_id:
                # .first() keeps a missing stadium from aborting the team.
                home_stadium = Stadium.objects.filter(id=stadium_id).first()

            # Optional cosmetic fields use getattr + '' so parser payloads
            # without them (or with None) still store empty strings.
            team, created = Team.objects.update_or_create(
                id=team_id,
                defaults={
                    'sport': sport,
                    'division': division,
                    'city': team_data.city,
                    'name': team_data.name,
                    'full_name': team_data.full_name,
                    'abbreviation': team_data.abbreviation,
                    'home_stadium': home_stadium,
                    'primary_color': getattr(team_data, 'primary_color', '') or '',
                    'secondary_color': getattr(team_data, 'secondary_color', '') or '',
                    'logo_url': getattr(team_data, 'logo_url', '') or '',
                }
            )

            # Mark for sync
            if created:
                CloudKitSyncState.objects.get_or_create(
                    record_type='Team',
                    record_id=team_id,
                    defaults={'sync_status': 'pending'}
                )
|
||||
|
||||
    def _process_stadiums(self, sport, stadiums):
        """Upsert scraped stadiums and queue newly created ones for sync.

        NOTE(review): as with teams, only *created* stadiums get a
        CloudKitSyncState row; updates are not re-queued for sync.
        """
        from core.models import Stadium
        from cloudkit.models import CloudKitSyncState

        for stadium_data in stadiums:
            stadium_id = stadium_data.id

            # Optional fields use getattr so parser payloads without them
            # still upsert; string fields coalesce None to ''.
            stadium, created = Stadium.objects.update_or_create(
                id=stadium_id,
                defaults={
                    'sport': sport,
                    'name': stadium_data.name,
                    'city': stadium_data.city,
                    'state': getattr(stadium_data, 'state', '') or '',
                    'country': getattr(stadium_data, 'country', 'USA'),
                    'latitude': getattr(stadium_data, 'latitude', None),
                    'longitude': getattr(stadium_data, 'longitude', None),
                    'capacity': getattr(stadium_data, 'capacity', None),
                    'surface': getattr(stadium_data, 'surface', '') or '',
                    'roof_type': getattr(stadium_data, 'roof_type', '') or '',
                    'opened_year': getattr(stadium_data, 'opened_year', None),
                    'timezone': getattr(stadium_data, 'timezone', '') or '',
                    'image_url': getattr(stadium_data, 'image_url', '') or '',
                }
            )

            if created:
                CloudKitSyncState.objects.get_or_create(
                    record_type='Stadium',
                    record_id=stadium_id,
                    defaults={'sync_status': 'pending'}
                )
|
||||
|
||||
def _resolve_team_via_db_alias(self, sport, raw_name, check_date=None):
|
||||
"""Try to resolve a team name using database aliases.
|
||||
|
||||
Args:
|
||||
sport: Sport model instance
|
||||
raw_name: Raw team name from scraper
|
||||
check_date: Date for alias validity check
|
||||
|
||||
Returns:
|
||||
Team instance if found, None otherwise
|
||||
"""
|
||||
from core.models import Team, TeamAlias
|
||||
from datetime import date
|
||||
|
||||
if not raw_name:
|
||||
return None
|
||||
|
||||
check_date = check_date or date.today()
|
||||
|
||||
# Check TeamAlias model
|
||||
aliases = TeamAlias.objects.filter(
|
||||
alias__iexact=raw_name.strip(),
|
||||
team__sport=sport,
|
||||
).select_related('team')
|
||||
|
||||
for alias in aliases:
|
||||
if alias.is_valid_for_date(check_date):
|
||||
return alias.team
|
||||
|
||||
# Also try partial matching on team full_name and city
|
||||
team = Team.objects.filter(
|
||||
sport=sport,
|
||||
full_name__iexact=raw_name.strip()
|
||||
).first()
|
||||
if team:
|
||||
return team
|
||||
|
||||
team = Team.objects.filter(
|
||||
sport=sport,
|
||||
city__iexact=raw_name.strip()
|
||||
).first()
|
||||
if team:
|
||||
return team
|
||||
|
||||
return None
|
||||
|
||||
def _resolve_stadium_via_db_alias(self, sport, raw_name, check_date=None):
|
||||
"""Try to resolve a stadium name using database aliases.
|
||||
|
||||
Args:
|
||||
sport: Sport model instance
|
||||
raw_name: Raw stadium name from scraper
|
||||
check_date: Date for alias validity check
|
||||
|
||||
Returns:
|
||||
Stadium instance if found, None otherwise
|
||||
"""
|
||||
from core.models import Stadium, StadiumAlias
|
||||
from datetime import date
|
||||
|
||||
if not raw_name:
|
||||
return None
|
||||
|
||||
check_date = check_date or date.today()
|
||||
|
||||
# Check StadiumAlias model
|
||||
aliases = StadiumAlias.objects.filter(
|
||||
alias__iexact=raw_name.strip(),
|
||||
stadium__sport=sport,
|
||||
).select_related('stadium')
|
||||
|
||||
for alias in aliases:
|
||||
if alias.is_valid_for_date(check_date):
|
||||
return alias.stadium
|
||||
|
||||
# Also try direct matching on stadium name
|
||||
stadium = Stadium.objects.filter(
|
||||
sport=sport,
|
||||
name__iexact=raw_name.strip()
|
||||
).first()
|
||||
if stadium:
|
||||
return stadium
|
||||
|
||||
return None
|
||||
|
||||
    def _process_games(self, sport, games):
        """Upsert scraped games, resolving teams/stadium and queuing sync.

        Per-game failures are logged and counted as games_errors rather
        than aborting the whole run.

        Returns:
            dict with games_found/new/updated/unchanged/errors counters.
        """
        from core.models import Game, Team, Stadium
        from cloudkit.models import CloudKitSyncState

        result = {
            'games_found': len(games),
            'games_new': 0,
            'games_updated': 0,
            'games_unchanged': 0,
            'games_errors': 0,
        }

        for game_data in games:
            try:
                game_id = game_data.id
                # game_date may be a datetime or a date; aliases are
                # validated against the date portion.
                check_date = game_data.game_date.date() if hasattr(game_data.game_date, 'date') else game_data.game_date

                # Get related objects - try by ID first, then by DB alias
                home_team = None
                away_team = None

                try:
                    home_team = Team.objects.get(id=game_data.home_team_id)
                except Team.DoesNotExist:
                    # Try resolving via database alias using raw name
                    raw_home = getattr(game_data, 'raw_home_team', None)
                    if raw_home:
                        home_team = self._resolve_team_via_db_alias(sport, raw_home, check_date)
                        if home_team:
                            self.log('info', f'Resolved home team via DB alias: {raw_home} -> {home_team.abbreviation}', source='adapter')

                try:
                    away_team = Team.objects.get(id=game_data.away_team_id)
                except Team.DoesNotExist:
                    # Try resolving via database alias using raw name
                    raw_away = getattr(game_data, 'raw_away_team', None)
                    if raw_away:
                        away_team = self._resolve_team_via_db_alias(sport, raw_away, check_date)
                        if away_team:
                            self.log('info', f'Resolved away team via DB alias: {raw_away} -> {away_team.abbreviation}', source='adapter')

                # A game without both teams cannot be stored; count it as
                # an error and move on.
                if not home_team or not away_team:
                    missing = []
                    if not home_team:
                        missing.append(f'home={game_data.home_team_id}')
                    if not away_team:
                        missing.append(f'away={game_data.away_team_id}')
                    self.log('warning', f'Team not found for game {game_id}: {", ".join(missing)}', source='adapter')
                    result['games_errors'] += 1
                    continue

                # Stadium is optional: an unresolved stadium leaves the FK
                # NULL rather than erroring the game.
                stadium = None
                if game_data.stadium_id:
                    try:
                        stadium = Stadium.objects.get(id=game_data.stadium_id)
                    except Stadium.DoesNotExist:
                        # Try resolving via database alias using raw name
                        raw_stadium = getattr(game_data, 'raw_stadium', None)
                        if raw_stadium:
                            stadium = self._resolve_stadium_via_db_alias(sport, raw_stadium, check_date)
                            if stadium:
                                self.log('info', f'Resolved stadium via DB alias: {raw_stadium} -> {stadium.name}', source='adapter')

                # Build game dict
                game_defaults = {
                    'sport': sport,
                    'season': game_data.season,
                    'home_team': home_team,
                    'away_team': away_team,
                    'stadium': stadium,
                    'game_date': game_data.game_date,
                    'game_number': getattr(game_data, 'game_number', None),
                    'home_score': game_data.home_score,
                    'away_score': game_data.away_score,
                    'status': game_data.status,
                    'raw_home_team': getattr(game_data, 'raw_home_team', '') or '',
                    'raw_away_team': getattr(game_data, 'raw_away_team', '') or '',
                    'raw_stadium': getattr(game_data, 'raw_stadium', '') or '',
                    'source_url': getattr(game_data, 'source_url', '') or '',
                }

                # Check if game exists
                try:
                    existing = Game.objects.get(id=game_id)
                    # Change detection: for FK values compare the stored
                    # <field>_id against value.id (avoids fetching the
                    # related row); plain values compare directly. None FK
                    # values fall through to the plain comparison since
                    # None has no .id.
                    changed = False
                    for key, value in game_defaults.items():
                        if getattr(existing, key if not hasattr(existing, f'{key}_id') else f'{key}_id') != (value.id if hasattr(value, 'id') else value):
                            changed = True
                            break

                    if changed:
                        for key, value in game_defaults.items():
                            setattr(existing, key, value)
                        existing.save()
                        result['games_updated'] += 1

                        # Re-queue for sync even if a prior sync completed.
                        CloudKitSyncState.objects.update_or_create(
                            record_type='Game',
                            record_id=game_id,
                            defaults={'sync_status': 'pending'}
                        )
                    else:
                        result['games_unchanged'] += 1

                except Game.DoesNotExist:
                    # Create new game
                    Game.objects.create(id=game_id, **game_defaults)
                    result['games_new'] += 1

                    # Mark for sync (get_or_create: a pre-existing state row
                    # is left untouched for brand-new games).
                    CloudKitSyncState.objects.get_or_create(
                        record_type='Game',
                        record_id=game_id,
                        defaults={'sync_status': 'pending'}
                    )

            except Exception as e:
                # Best-effort per game: log and keep processing the rest.
                self.log('error', f'Error processing game: {e}', source='adapter')
                result['games_errors'] += 1

        return result
|
||||
|
||||
def _process_review_items(self, sport, review_items):
    """Persist parser review items as ManualReviewItem rows.

    Args:
        sport: ``core.Sport`` instance the items belong to.
        review_items: Iterable of parser review items; each exposes
            ``reason``, ``raw_value``, ``suggested_matches``,
            ``source_url``, ``game_date`` and ``context`` attributes.

    Returns:
        int: Number of ManualReviewItem rows created.
    """
    # NOTE: the previous version also imported ReviewReason here but
    # never used it; the unused import has been removed.
    from scraper.models import ManualReviewItem, ScrapeJob

    # Attach items to the currently running job for this config, if any.
    # NOTE(review): `job` may be None when no job is in 'running' state —
    # the FK is nullable, so create() still succeeds; confirm intended.
    job = ScrapeJob.objects.filter(
        config=self.config,
        status='running'
    ).order_by('-created_at').first()

    count = 0
    for item in review_items:
        # Derive item_type (team/stadium) from the review reason.
        item_type = self._get_item_type_from_reason(item.reason)

        # Parser supplies an ordered suggested_matches list; take the best.
        suggested_id = ''
        confidence = 0.0
        if item.suggested_matches:
            best_match = item.suggested_matches[0]
            suggested_id = best_match.canonical_id
            # Parser confidence is 0-100; the model stores 0.0-1.0.
            confidence = best_match.confidence / 100.0

        ManualReviewItem.objects.create(
            job=job,
            item_type=item_type,
            sport=sport,
            raw_value=item.raw_value,
            suggested_id=suggested_id,
            confidence=confidence,
            reason=self._map_reason(item.reason),
            source_url=item.source_url or '',
            check_date=item.game_date,
            context=item.context if item.context else None,
        )
        count += 1

    return count
|
||||
|
||||
def _get_item_type_from_reason(self, reason) -> str:
    """Derive the review item type ('team' or 'stadium') from a reason.

    Accepts either a ReviewReason enum member (its ``.value`` is used)
    or a plain string; matching is case-insensitive in both cases.

    Args:
        reason: ReviewReason enum member or string describing why the
            item needs review.

    Returns:
        str: 'team' or 'stadium'; 'team' for any other reason.
    """
    # Duck-type instead of importing ReviewReason just for isinstance():
    # enum members expose .value, plain strings fall through unchanged.
    # Normalizing with .lower() in BOTH cases fixes a subtle asymmetry in
    # the previous version, which lower-cased only the string branch.
    reason_value = str(getattr(reason, 'value', reason)).lower()

    if 'team' in reason_value:
        return 'team'
    if 'stadium' in reason_value:
        return 'stadium'
    # Default to team for reasons that name neither entity type
    # (e.g. duplicate_game, timezone_unknown).
    return 'team'
|
||||
|
||||
def _map_reason(self, reason) -> str:
    """Map a parser ReviewReason to a ManualReviewItem.reason choice.

    Args:
        reason: ReviewReason enum member or string.

    Returns:
        str: One of 'no_match', 'low_confidence', 'ambiguous' or
        'new_entity'; unknown reasons fall back to 'no_match'.
    """
    # Duck-type instead of importing ReviewReason just for isinstance():
    # enum members expose .value, plain strings fall through unchanged.
    # The lookup below lower-cases, matching the original behavior for
    # both branches.
    reason_value = str(getattr(reason, 'value', reason))

    reason_map = {
        'unresolved_team': 'no_match',
        'unresolved_stadium': 'no_match',
        'low_confidence_match': 'low_confidence',
        'missing_data': 'no_match',
        'duplicate_game': 'ambiguous',
        'timezone_unknown': 'no_match',
        'geographic_filter': 'no_match',
        # Legacy mappings
        'no_match': 'no_match',
        'no match found': 'no_match',
        'low_confidence': 'low_confidence',
        'fuzzy match below threshold': 'low_confidence',
        'ambiguous': 'ambiguous',
        'new_entity': 'new_entity',
    }
    return reason_map.get(reason_value.lower(), 'no_match')
|
||||
144
scraper/engine/db_alias_loader.py
Normal file
144
scraper/engine/db_alias_loader.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""Database-aware alias loaders for team and stadium resolution.
|
||||
|
||||
These loaders check the Django TeamAlias and StadiumAlias models
|
||||
in addition to the hardcoded mappings, allowing aliases to be
|
||||
managed via the admin interface.
|
||||
"""
|
||||
|
||||
from datetime import date
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class DatabaseTeamAliasLoader:
    """Load team aliases from the Django database.

    Checks the core.TeamAlias model for alias mappings,
    supporting date-aware lookups for historical names.
    """

    def resolve(
        self,
        value: str,
        sport_code: str,
        check_date: Optional[date] = None,
    ) -> Optional[str]:
        """Resolve an alias value to a canonical team ID.

        Args:
            value: Alias value to look up (case-insensitive)
            sport_code: Sport code to filter by
            check_date: Date to check validity (None = current date)

        Returns:
            Canonical team ID if found, None otherwise
        """
        # NOTE: the previous version also imported Q here but never
        # used it; the unused import has been removed.
        from core.models import TeamAlias

        if check_date is None:
            check_date = date.today()

        value_lower = value.lower().strip()

        # Case-insensitive match within the sport; select_related pulls
        # the team row in the same query instead of one query per alias.
        aliases = TeamAlias.objects.filter(
            alias__iexact=value_lower,
            team__sport__code=sport_code,
        ).select_related('team')

        # First alias valid on check_date wins.
        for alias in aliases:
            if alias.is_valid_for_date(check_date):
                return alias.team.id

        return None

    def get_aliases_for_team(
        self,
        team_id: str,
        check_date: Optional[date] = None,
    ) -> list:
        """Get all aliases for a team.

        Args:
            team_id: Team ID
            check_date: Date to filter by (None = all aliases)

        Returns:
            List of TeamAlias objects
        """
        from core.models import TeamAlias

        aliases = TeamAlias.objects.filter(team_id=team_id)

        if check_date:
            # Only aliases valid on the given date.
            return [alias for alias in aliases if alias.is_valid_for_date(check_date)]

        return list(aliases)
|
||||
|
||||
|
||||
class DatabaseStadiumAliasLoader:
    """Load stadium aliases from the Django database.

    Checks the core.StadiumAlias model for alias mappings,
    supporting date-aware lookups for naming rights changes.
    """

    def resolve(
        self,
        name: str,
        sport_code: str,
        check_date: Optional[date] = None,
    ) -> Optional[str]:
        """Resolve a stadium name to a canonical stadium ID.

        Args:
            name: Stadium name to look up (case-insensitive)
            sport_code: Sport code to filter by
            check_date: Date to check validity (None = current date)

        Returns:
            Canonical stadium ID if found, None otherwise
        """
        from core.models import StadiumAlias

        effective_date = date.today() if check_date is None else check_date
        needle = name.lower().strip()

        # Case-insensitive match within the sport; select_related pulls
        # the stadium row in the same query.
        candidates = StadiumAlias.objects.filter(
            alias__iexact=needle,
            stadium__sport__code=sport_code,
        ).select_related('stadium')

        # Return the ID from the first alias valid on the effective date.
        return next(
            (
                candidate.stadium.id
                for candidate in candidates
                if candidate.is_valid_for_date(effective_date)
            ),
            None,
        )
|
||||
|
||||
|
||||
# Module-level singletons, created lazily on first use.
_db_team_loader: Optional[DatabaseTeamAliasLoader] = None
_db_stadium_loader: Optional[DatabaseStadiumAliasLoader] = None


def get_db_team_alias_loader() -> DatabaseTeamAliasLoader:
    """Return the process-wide database team alias loader, creating it lazily."""
    global _db_team_loader
    # Loader instances are always truthy, so `or` is a safe lazy-init.
    _db_team_loader = _db_team_loader or DatabaseTeamAliasLoader()
    return _db_team_loader


def get_db_stadium_alias_loader() -> DatabaseStadiumAliasLoader:
    """Return the process-wide database stadium alias loader, creating it lazily."""
    global _db_stadium_loader
    _db_stadium_loader = _db_stadium_loader or DatabaseStadiumAliasLoader()
    return _db_stadium_loader
|
||||
201
scraper/migrations/0001_initial.py
Normal file
201
scraper/migrations/0001_initial.py
Normal file
@@ -0,0 +1,201 @@
|
||||
# Generated by Django 5.1.15 on 2026-01-26 08:59
|
||||
|
||||
import django.db.models.deletion
|
||||
import simple_history.models
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Auto-generated initial schema for the scraper app: ScraperConfig,
    # ScrapeJob, ScrapeJobLog, ManualReviewItem, plus django-simple-history
    # "Historical*" shadow tables for config and review items.
    # NOTE(review): the field set here (is_enabled/sources/fuzzy_threshold)
    # matches scraper/models/config.py, NOT the flat scraper/models.py
    # (is_active/schedule_url) also present in this commit — confirm which
    # models module is canonical before running migrations.

    initial = True

    dependencies = [
        # Depends on core models (Sport FK targets) and the swappable user model.
        ('core', '0001_initial'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.CreateModel(
            name='ScrapeJob',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('status', models.CharField(choices=[('pending', 'Pending'), ('running', 'Running'), ('completed', 'Completed'), ('failed', 'Failed'), ('cancelled', 'Cancelled')], default='pending', max_length=20)),
                ('triggered_by', models.CharField(default='manual', help_text='How the job was triggered (manual, scheduled, api)', max_length=50)),
                ('started_at', models.DateTimeField(blank=True, null=True)),
                ('finished_at', models.DateTimeField(blank=True, null=True)),
                ('games_found', models.PositiveIntegerField(default=0)),
                ('games_new', models.PositiveIntegerField(default=0)),
                ('games_updated', models.PositiveIntegerField(default=0)),
                ('games_unchanged', models.PositiveIntegerField(default=0)),
                ('games_errors', models.PositiveIntegerField(default=0)),
                ('teams_found', models.PositiveIntegerField(default=0)),
                ('stadiums_found', models.PositiveIntegerField(default=0)),
                ('review_items_created', models.PositiveIntegerField(default=0)),
                ('error_message', models.TextField(blank=True)),
                ('error_traceback', models.TextField(blank=True)),
                ('celery_task_id', models.CharField(blank=True, help_text='Celery task ID for this job', max_length=255)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
            ],
            options={
                'verbose_name': 'Scrape Job',
                'verbose_name_plural': 'Scrape Jobs',
                'ordering': ['-created_at'],
            },
        ),
        # simple_history shadow table for ScraperConfig; defaults are the
        # literal values baked in at makemigrations time (3.0 / 3 / 85).
        migrations.CreateModel(
            name='HistoricalScraperConfig',
            fields=[
                ('id', models.BigIntegerField(auto_created=True, blank=True, db_index=True, verbose_name='ID')),
                ('season', models.PositiveSmallIntegerField(help_text='Season to scrape (e.g., 2025 for 2025-26 season)')),
                ('is_enabled', models.BooleanField(default=True, help_text='Whether this scraper is enabled for scheduling')),
                ('sources', models.JSONField(default=list, help_text='Ordered list of sources to try (e.g., ["basketball_reference", "espn"])')),
                ('primary_source', models.CharField(blank=True, help_text='Primary source for this scraper', max_length=100)),
                ('request_delay', models.FloatField(default=3.0, help_text='Seconds between requests')),
                ('max_retries', models.PositiveSmallIntegerField(default=3, help_text='Maximum retry attempts')),
                ('fuzzy_threshold', models.PositiveSmallIntegerField(default=85, help_text='Minimum fuzzy match confidence (0-100)')),
                ('last_run', models.DateTimeField(blank=True, help_text='Last successful run timestamp', null=True)),
                ('last_run_status', models.CharField(blank=True, help_text='Status of last run', max_length=20)),
                ('last_run_games', models.PositiveIntegerField(default=0, help_text='Games found in last run')),
                ('notes', models.TextField(blank=True, help_text='Configuration notes')),
                ('created_at', models.DateTimeField(blank=True, editable=False)),
                ('updated_at', models.DateTimeField(blank=True, editable=False)),
                ('history_id', models.AutoField(primary_key=True, serialize=False)),
                ('history_date', models.DateTimeField(db_index=True)),
                ('history_change_reason', models.CharField(max_length=100, null=True)),
                ('history_type', models.CharField(choices=[('+', 'Created'), ('~', 'Changed'), ('-', 'Deleted')], max_length=1)),
                ('history_user', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='+', to=settings.AUTH_USER_MODEL)),
                ('sport', models.ForeignKey(blank=True, db_constraint=False, null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to='core.sport')),
            ],
            options={
                'verbose_name': 'historical Scraper Configuration',
                'verbose_name_plural': 'historical Scraper Configurations',
                'ordering': ('-history_date', '-history_id'),
                'get_latest_by': ('history_date', 'history_id'),
            },
            bases=(simple_history.models.HistoricalChanges, models.Model),
        ),
        # simple_history shadow table for ManualReviewItem.
        migrations.CreateModel(
            name='HistoricalManualReviewItem',
            fields=[
                ('id', models.BigIntegerField(auto_created=True, blank=True, db_index=True, verbose_name='ID')),
                ('item_type', models.CharField(choices=[('team', 'Team'), ('stadium', 'Stadium')], max_length=20)),
                ('raw_value', models.CharField(help_text='Original scraped value', max_length=300)),
                ('suggested_id', models.CharField(blank=True, help_text='Suggested canonical ID (if any match found)', max_length=100)),
                ('confidence', models.FloatField(default=0.0, help_text='Match confidence (0.0 - 1.0)')),
                ('reason', models.CharField(choices=[('no_match', 'No Match Found'), ('low_confidence', 'Low Confidence Match'), ('ambiguous', 'Ambiguous Match'), ('new_entity', 'Potentially New Entity')], help_text='Why manual review is needed', max_length=20)),
                ('source_url', models.URLField(blank=True, help_text='URL where this value was found')),
                ('check_date', models.DateField(blank=True, help_text='Date context for alias resolution', null=True)),
                ('context', models.JSONField(blank=True, help_text='Additional context (e.g., game info)', null=True)),
                ('status', models.CharField(choices=[('pending', 'Pending Review'), ('resolved', 'Resolved'), ('ignored', 'Ignored'), ('new_entity', 'Created New Entity')], default='pending', max_length=20)),
                ('resolved_to', models.CharField(blank=True, help_text='Final resolved canonical ID', max_length=100)),
                ('resolved_at', models.DateTimeField(blank=True, null=True)),
                ('resolution_notes', models.TextField(blank=True, help_text='Notes about the resolution')),
                ('create_alias', models.BooleanField(default=False, help_text='Whether to create an alias from this resolution')),
                ('created_at', models.DateTimeField(blank=True, editable=False)),
                ('updated_at', models.DateTimeField(blank=True, editable=False)),
                ('history_id', models.AutoField(primary_key=True, serialize=False)),
                ('history_date', models.DateTimeField(db_index=True)),
                ('history_change_reason', models.CharField(max_length=100, null=True)),
                ('history_type', models.CharField(choices=[('+', 'Created'), ('~', 'Changed'), ('-', 'Deleted')], max_length=1)),
                ('history_user', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='+', to=settings.AUTH_USER_MODEL)),
                ('resolved_by', models.ForeignKey(blank=True, db_constraint=False, null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to=settings.AUTH_USER_MODEL)),
                ('sport', models.ForeignKey(blank=True, db_constraint=False, null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to='core.sport')),
                ('job', models.ForeignKey(blank=True, db_constraint=False, help_text='Job that created this review item', null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to='scraper.scrapejob')),
            ],
            options={
                'verbose_name': 'historical Manual Review Item',
                'verbose_name_plural': 'historical Manual Review Items',
                'ordering': ('-history_date', '-history_id'),
                'get_latest_by': ('history_date', 'history_id'),
            },
            bases=(simple_history.models.HistoricalChanges, models.Model),
        ),
        migrations.CreateModel(
            name='ScrapeJobLog',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('level', models.CharField(choices=[('debug', 'Debug'), ('info', 'Info'), ('warning', 'Warning'), ('error', 'Error')], default='info', max_length=10)),
                ('message', models.TextField()),
                ('source', models.CharField(blank=True, help_text='Source/component that generated this log', max_length=100)),
                ('extra_data', models.JSONField(blank=True, help_text='Additional structured data', null=True)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('job', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='logs', to='scraper.scrapejob')),
            ],
            options={
                'verbose_name': 'Scrape Job Log',
                'verbose_name_plural': 'Scrape Job Logs',
                'ordering': ['created_at'],
            },
        ),
        migrations.CreateModel(
            name='ScraperConfig',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('season', models.PositiveSmallIntegerField(help_text='Season to scrape (e.g., 2025 for 2025-26 season)')),
                ('is_enabled', models.BooleanField(default=True, help_text='Whether this scraper is enabled for scheduling')),
                ('sources', models.JSONField(default=list, help_text='Ordered list of sources to try (e.g., ["basketball_reference", "espn"])')),
                ('primary_source', models.CharField(blank=True, help_text='Primary source for this scraper', max_length=100)),
                ('request_delay', models.FloatField(default=3.0, help_text='Seconds between requests')),
                ('max_retries', models.PositiveSmallIntegerField(default=3, help_text='Maximum retry attempts')),
                ('fuzzy_threshold', models.PositiveSmallIntegerField(default=85, help_text='Minimum fuzzy match confidence (0-100)')),
                ('last_run', models.DateTimeField(blank=True, help_text='Last successful run timestamp', null=True)),
                ('last_run_status', models.CharField(blank=True, help_text='Status of last run', max_length=20)),
                ('last_run_games', models.PositiveIntegerField(default=0, help_text='Games found in last run')),
                ('notes', models.TextField(blank=True, help_text='Configuration notes')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('sport', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='scraper_configs', to='core.sport')),
            ],
            options={
                'verbose_name': 'Scraper Configuration',
                'verbose_name_plural': 'Scraper Configurations',
                'ordering': ['sport', 'season'],
                'unique_together': {('sport', 'season')},
            },
        ),
        # config FK added after ScraperConfig exists to break the creation cycle.
        migrations.AddField(
            model_name='scrapejob',
            name='config',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='jobs', to='scraper.scraperconfig'),
        ),
        migrations.CreateModel(
            name='ManualReviewItem',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('item_type', models.CharField(choices=[('team', 'Team'), ('stadium', 'Stadium')], max_length=20)),
                ('raw_value', models.CharField(help_text='Original scraped value', max_length=300)),
                ('suggested_id', models.CharField(blank=True, help_text='Suggested canonical ID (if any match found)', max_length=100)),
                ('confidence', models.FloatField(default=0.0, help_text='Match confidence (0.0 - 1.0)')),
                ('reason', models.CharField(choices=[('no_match', 'No Match Found'), ('low_confidence', 'Low Confidence Match'), ('ambiguous', 'Ambiguous Match'), ('new_entity', 'Potentially New Entity')], help_text='Why manual review is needed', max_length=20)),
                ('source_url', models.URLField(blank=True, help_text='URL where this value was found')),
                ('check_date', models.DateField(blank=True, help_text='Date context for alias resolution', null=True)),
                ('context', models.JSONField(blank=True, help_text='Additional context (e.g., game info)', null=True)),
                ('status', models.CharField(choices=[('pending', 'Pending Review'), ('resolved', 'Resolved'), ('ignored', 'Ignored'), ('new_entity', 'Created New Entity')], default='pending', max_length=20)),
                ('resolved_to', models.CharField(blank=True, help_text='Final resolved canonical ID', max_length=100)),
                ('resolved_at', models.DateTimeField(blank=True, null=True)),
                ('resolution_notes', models.TextField(blank=True, help_text='Notes about the resolution')),
                ('create_alias', models.BooleanField(default=False, help_text='Whether to create an alias from this resolution')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('resolved_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='resolved_review_items', to=settings.AUTH_USER_MODEL)),
                ('sport', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='review_items', to='core.sport')),
                ('job', models.ForeignKey(blank=True, help_text='Job that created this review item', null=True, on_delete=django.db.models.deletion.CASCADE, related_name='review_items', to='scraper.scrapejob')),
            ],
            options={
                'verbose_name': 'Manual Review Item',
                'verbose_name_plural': 'Manual Review Items',
                'ordering': ['-created_at'],
                'indexes': [models.Index(fields=['status', 'item_type'], name='scraper_man_status_5d06e2_idx'), models.Index(fields=['sport', 'status'], name='scraper_man_sport_i_7af37b_idx'), models.Index(fields=['raw_value'], name='scraper_man_raw_val_abdd0a_idx')],
            },
        ),
        migrations.AddIndex(
            model_name='scrapejob',
            index=models.Index(fields=['config', 'status'], name='scraper_scr_config__4c4058_idx'),
        ),
        migrations.AddIndex(
            model_name='scrapejob',
            index=models.Index(fields=['status', 'created_at'], name='scraper_scr_status_f3978d_idx'),
        ),
    ]
|
||||
0
scraper/migrations/__init__.py
Normal file
0
scraper/migrations/__init__.py
Normal file
199
scraper/models.py
Normal file
199
scraper/models.py
Normal file
@@ -0,0 +1,199 @@
|
||||
"""
|
||||
Scraper models for tracking scraping jobs and manual reviews.
|
||||
"""
|
||||
from django.conf import settings
from django.db import models

from simple_history.models import HistoricalRecords
|
||||
|
||||
|
||||
class ScraperConfig(models.Model):
    """
    Configuration for a sport scraper per season.

    NOTE(review): this flat scraper/models.py coexists in the commit with
    a scraper/models/ package (config.py/job.py/review.py) defining a
    different field set; the package takes import precedence, so these
    definitions appear shadowed — confirm which module is canonical.
    """
    # Sport this configuration scrapes.
    sport = models.ForeignKey(
        'core.Sport',
        on_delete=models.CASCADE,
        related_name='scraper_configs'
    )
    # Season year (start year for split seasons such as 2025-26).
    season = models.PositiveSmallIntegerField(
        help_text='Season year (start year for split seasons)'
    )
    # Toggle for the scheduler; inactive configs are not scraped.
    is_active = models.BooleanField(
        default=True,
        help_text='Whether this config is actively scraping'
    )
    # Optional override for the schedule page to scrape.
    schedule_url = models.URLField(
        blank=True,
        help_text='Base URL for schedule scraping'
    )
    # Scrape cadence in hours; drives next_scrape_at.
    scrape_interval_hours = models.PositiveSmallIntegerField(
        default=24,
        help_text='How often to run the scraper (hours)'
    )
    last_scrape_at = models.DateTimeField(
        null=True,
        blank=True,
        help_text='When the last scrape completed'
    )
    next_scrape_at = models.DateTimeField(
        null=True,
        blank=True,
        help_text='When the next scrape is scheduled'
    )

    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    # Audit trail (django-simple-history change log)
    history = HistoricalRecords()

    class Meta:
        ordering = ['-season', 'sport']
        # One config per sport per season.
        unique_together = ['sport', 'season']
        verbose_name = 'Scraper Config'
        verbose_name_plural = 'Scraper Configs'

    def __str__(self):
        # NOTE(review): relies on Sport.get_season_display(season) — not a
        # Django choices display helper; confirm it exists on core.Sport.
        return f"{self.sport.short_name} {self.sport.get_season_display(self.season)}"
|
||||
|
||||
|
||||
class ScrapeJob(models.Model):
    """
    Record of a scraping job execution.

    NOTE(review): field names here (completed_at, games_created, errors,
    log_output) differ from the migration and scraper/models/job.py
    (finished_at, games_new, error_message, ...) — confirm which version
    is canonical.
    """
    # Lifecycle states for a job run.
    STATUS_CHOICES = [
        ('pending', 'Pending'),
        ('running', 'Running'),
        ('completed', 'Completed'),
        ('failed', 'Failed'),
        ('cancelled', 'Cancelled'),
    ]

    # Config this run belongs to; deleting the config deletes its jobs.
    config = models.ForeignKey(
        ScraperConfig,
        on_delete=models.CASCADE,
        related_name='jobs'
    )
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='pending'
    )
    # Run timing (both unset until the job actually starts/finishes).
    started_at = models.DateTimeField(
        null=True,
        blank=True
    )
    completed_at = models.DateTimeField(
        null=True,
        blank=True
    )
    # Result counters.
    games_found = models.PositiveIntegerField(default=0)
    games_created = models.PositiveIntegerField(default=0)
    games_updated = models.PositiveIntegerField(default=0)
    # Free-form error text and captured log output for the run.
    errors = models.TextField(blank=True)
    log_output = models.TextField(blank=True)

    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['-created_at']
        verbose_name = 'Scrape Job'
        verbose_name_plural = 'Scrape Jobs'

    def __str__(self):
        return f"{self.config} - {self.status} ({self.created_at.strftime('%Y-%m-%d %H:%M')})"

    @property
    def duration(self):
        """Return job duration in seconds, or None if the job has not both started and completed."""
        if self.started_at and self.completed_at:
            return (self.completed_at - self.started_at).total_seconds()
        return None
|
||||
|
||||
|
||||
class ManualReviewItem(models.Model):
    """
    Items flagged for manual review (fuzzy matches, conflicts, etc).
    """
    # Review workflow states.
    STATUS_CHOICES = [
        ('pending', 'Pending Review'),
        ('approved', 'Approved'),
        ('rejected', 'Rejected'),
        ('resolved', 'Resolved'),
    ]

    # What kind of entity/issue is under review.
    ITEM_TYPE_CHOICES = [
        ('team', 'Team Match'),
        ('stadium', 'Stadium Match'),
        ('game', 'Game Conflict'),
        ('alias', 'New Alias'),
    ]

    sport = models.ForeignKey(
        'core.Sport',
        on_delete=models.CASCADE,
        related_name='review_items'
    )
    # Job that produced this item; kept (nulled) if the job is deleted.
    job = models.ForeignKey(
        ScrapeJob,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='review_items'
    )
    item_type = models.CharField(
        max_length=20,
        choices=ITEM_TYPE_CHOICES
    )
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='pending'
    )
    raw_value = models.CharField(
        max_length=500,
        help_text='The raw scraped value'
    )
    matched_value = models.CharField(
        max_length=500,
        blank=True,
        help_text='The matched canonical value (if any)'
    )
    confidence = models.PositiveSmallIntegerField(
        default=0,
        help_text='Match confidence score (0-100)'
    )
    context = models.JSONField(
        default=dict,
        blank=True,
        help_text='Additional context (game date, opposing team, etc)'
    )
    resolution_notes = models.TextField(
        blank=True,
        help_text='Notes about the resolution'
    )
    resolved_at = models.DateTimeField(
        null=True,
        blank=True
    )
    # FIX: reference the swappable user model instead of hard-coding
    # 'auth.User', which breaks projects with a custom AUTH_USER_MODEL
    # (Django docs: "Referencing the User model").
    resolved_by = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='resolved_reviews'
    )

    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        # Highest-confidence (easiest to approve) items first.
        ordering = ['-confidence', '-created_at']
        verbose_name = 'Manual Review Item'
        verbose_name_plural = 'Manual Review Items'

    def __str__(self):
        return f"{self.item_type}: {self.raw_value} ({self.confidence}%)"
|
||||
10
scraper/models/__init__.py
Normal file
10
scraper/models/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
from .config import ScraperConfig
from .job import ScrapeJob, ScrapeJobLog
from .review import ManualReviewItem

# Public API of the scraper.models package.
__all__ = [
    'ScraperConfig',
    'ScrapeJob',
    'ScrapeJobLog',
    'ManualReviewItem',
]
|
||||
102
scraper/models/config.py
Normal file
102
scraper/models/config.py
Normal file
@@ -0,0 +1,102 @@
|
||||
from django.db import models
|
||||
from django.conf import settings
|
||||
from simple_history.models import HistoricalRecords
|
||||
|
||||
|
||||
class ScraperConfig(models.Model):
    """
    Configuration for a sport's scraper.
    """
    sport = models.ForeignKey(
        'core.Sport',
        on_delete=models.CASCADE,
        related_name='scraper_configs'
    )
    season = models.PositiveSmallIntegerField(
        help_text='Season to scrape (e.g., 2025 for 2025-26 season)'
    )
    is_enabled = models.BooleanField(
        default=True,
        help_text='Whether this scraper is enabled for scheduling'
    )

    # Source configuration
    sources = models.JSONField(
        default=list,
        help_text='Ordered list of sources to try (e.g., ["basketball_reference", "espn"])'
    )
    primary_source = models.CharField(
        max_length=100,
        blank=True,
        help_text='Primary source for this scraper'
    )

    # Rate limiting.
    # NOTE(review): settings-based defaults are read at class definition
    # time and were baked into the initial migration as literals (3.0/3/85);
    # changing the settings later silently diverges from migration state.
    request_delay = models.FloatField(
        default=settings.SCRAPER_REQUEST_DELAY,
        help_text='Seconds between requests'
    )
    max_retries = models.PositiveSmallIntegerField(
        default=settings.SCRAPER_MAX_RETRIES,
        help_text='Maximum retry attempts'
    )

    # Fuzzy matching
    fuzzy_threshold = models.PositiveSmallIntegerField(
        default=settings.SCRAPER_FUZZY_THRESHOLD,
        help_text='Minimum fuzzy match confidence (0-100)'
    )

    # Scheduling
    last_run = models.DateTimeField(
        null=True,
        blank=True,
        help_text='Last successful run timestamp'
    )
    last_run_status = models.CharField(
        max_length=20,
        blank=True,
        help_text='Status of last run'
    )
    last_run_games = models.PositiveIntegerField(
        default=0,
        help_text='Games found in last run'
    )

    # Notes
    notes = models.TextField(
        blank=True,
        help_text='Configuration notes'
    )

    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    # Audit trail (django-simple-history change log)
    history = HistoricalRecords()

    class Meta:
        ordering = ['sport', 'season']
        # One config per sport per season.
        unique_together = ['sport', 'season']
        verbose_name = 'Scraper Configuration'
        verbose_name_plural = 'Scraper Configurations'

    def __str__(self):
        # NOTE(review): relies on Sport.get_season_display(season) — not a
        # Django choices display helper; confirm it exists on core.Sport.
        return f"{self.sport.short_name} {self.sport.get_season_display(self.season)}"

    def get_sources_list(self):
        """Return sources as list, using defaults if empty."""
        if self.sources:
            return self.sources
        # Default sources per sport; 'espn' is the universal fallback.
        defaults = {
            'nba': ['basketball_reference', 'espn'],
            'mlb': ['baseball_reference', 'mlb_api', 'espn'],
            'nfl': ['espn', 'pro_football_reference'],
            'nhl': ['hockey_reference', 'nhl_api', 'espn'],
            'mls': ['espn'],
            'wnba': ['espn'],
            'nwsl': ['espn'],
        }
        return defaults.get(self.sport.code, ['espn'])
|
||||
159
scraper/models/job.py
Normal file
159
scraper/models/job.py
Normal file
@@ -0,0 +1,159 @@
|
||||
from django.db import models
|
||||
from simple_history.models import HistoricalRecords
|
||||
|
||||
|
||||
class ScrapeJob(models.Model):
    """
    Record of a scraping job execution.

    One row per run of a ScraperConfig; created by run_scraper_task, which
    fills in the timing, counters, and error fields as the job proceeds.
    """
    STATUS_CHOICES = [
        ('pending', 'Pending'),
        ('running', 'Running'),
        ('completed', 'Completed'),
        ('failed', 'Failed'),
        ('cancelled', 'Cancelled'),
    ]

    # Configuration this run belongs to; deleting the config removes its jobs.
    config = models.ForeignKey(
        'scraper.ScraperConfig',
        on_delete=models.CASCADE,
        related_name='jobs'
    )
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='pending'
    )
    triggered_by = models.CharField(
        max_length=50,
        default='manual',
        help_text='How the job was triggered (manual, scheduled, api)'
    )

    # Timing — both nullable: a pending job has neither, a running job only
    # started_at.
    started_at = models.DateTimeField(null=True, blank=True)
    finished_at = models.DateTimeField(null=True, blank=True)

    # Results — per-run counters written by the scraper engine.
    games_found = models.PositiveIntegerField(default=0)
    games_new = models.PositiveIntegerField(default=0)
    games_updated = models.PositiveIntegerField(default=0)
    games_unchanged = models.PositiveIntegerField(default=0)
    games_errors = models.PositiveIntegerField(default=0)

    teams_found = models.PositiveIntegerField(default=0)
    stadiums_found = models.PositiveIntegerField(default=0)
    review_items_created = models.PositiveIntegerField(default=0)

    # Error tracking — populated only when status == 'failed'.
    error_message = models.TextField(blank=True)
    error_traceback = models.TextField(blank=True)

    # Celery task ID for tracking
    celery_task_id = models.CharField(
        max_length=255,
        blank=True,
        help_text='Celery task ID for this job'
    )

    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['-created_at']
        verbose_name = 'Scrape Job'
        verbose_name_plural = 'Scrape Jobs'
        indexes = [
            models.Index(fields=['config', 'status']),
            models.Index(fields=['status', 'created_at']),
        ]

    def __str__(self):
        return f"{self.config} - {self.created_at.strftime('%Y-%m-%d %H:%M')}"

    @property
    def duration(self):
        """Return job duration as timedelta or None."""
        # None until both timestamps are set (i.e. the job has finished).
        if self.started_at and self.finished_at:
            return self.finished_at - self.started_at
        return None

    @property
    def duration_display(self):
        """Return formatted duration string."""
        # '-' for jobs that have not finished (duration is None).
        duration = self.duration
        if duration:
            total_seconds = int(duration.total_seconds())
            minutes, seconds = divmod(total_seconds, 60)
            if minutes > 0:
                return f"{minutes}m {seconds}s"
            return f"{seconds}s"
        return '-'

    @property
    def sport(self):
        # Convenience passthrough to the parent config's sport.
        return self.config.sport

    @property
    def season(self):
        # Convenience passthrough to the parent config's season.
        return self.config.season

    def get_summary(self):
        """Return summary dict for notifications."""
        return {
            'sport': self.config.sport.short_name,
            'season': self.config.sport.get_season_display(self.config.season),
            'status': self.status,
            'duration': self.duration_display,
            'games_found': self.games_found,
            'games_new': self.games_new,
            'games_updated': self.games_updated,
            'games_unchanged': self.games_unchanged,
            'games_errors': self.games_errors,
            'review_items': self.review_items_created,
            'error_message': self.error_message,
        }
|
||||
|
||||
|
||||
class ScrapeJobLog(models.Model):
    """
    Log entries for a scrape job.

    Written by run_scraper_task's internal ``log`` helper; each row mirrors a
    line that was also sent to the module logger.
    """
    LEVEL_CHOICES = [
        ('debug', 'Debug'),
        ('info', 'Info'),
        ('warning', 'Warning'),
        ('error', 'Error'),
    ]

    # Owning job; logs are deleted with the job (and explicitly first by
    # cleanup_old_jobs).
    job = models.ForeignKey(
        ScrapeJob,
        on_delete=models.CASCADE,
        related_name='logs'
    )
    level = models.CharField(
        max_length=10,
        choices=LEVEL_CHOICES,
        default='info'
    )
    message = models.TextField()
    source = models.CharField(
        max_length=100,
        blank=True,
        help_text='Source/component that generated this log'
    )
    extra_data = models.JSONField(
        null=True,
        blank=True,
        help_text='Additional structured data'
    )
    created_at = models.DateTimeField(auto_now_add=True)

    class Meta:
        # Chronological order so a job's logs read top-to-bottom.
        ordering = ['created_at']
        verbose_name = 'Scrape Job Log'
        verbose_name_plural = 'Scrape Job Logs'

    def __str__(self):
        return f"[{self.level.upper()}] {self.message[:50]}"
|
||||
192
scraper/models/review.py
Normal file
192
scraper/models/review.py
Normal file
@@ -0,0 +1,192 @@
|
||||
from django.db import models
|
||||
from simple_history.models import HistoricalRecords
|
||||
|
||||
|
||||
class ManualReviewItem(models.Model):
    """
    Items that require manual review before resolution.

    Created by the scraper when a scraped team/stadium name cannot be
    confidently matched to a canonical entity; staff resolve or ignore them,
    optionally creating an alias so future scrapes match automatically.
    """
    ITEM_TYPE_CHOICES = [
        ('team', 'Team'),
        ('stadium', 'Stadium'),
    ]

    STATUS_CHOICES = [
        ('pending', 'Pending Review'),
        ('resolved', 'Resolved'),
        ('ignored', 'Ignored'),
        ('new_entity', 'Created New Entity'),
    ]

    REASON_CHOICES = [
        ('no_match', 'No Match Found'),
        ('low_confidence', 'Low Confidence Match'),
        ('ambiguous', 'Ambiguous Match'),
        ('new_entity', 'Potentially New Entity'),
    ]

    # Originating job; nullable so items can outlive job cleanup or be
    # created outside a job.
    job = models.ForeignKey(
        'scraper.ScrapeJob',
        on_delete=models.CASCADE,
        related_name='review_items',
        null=True,
        blank=True,
        help_text='Job that created this review item'
    )
    item_type = models.CharField(
        max_length=20,
        choices=ITEM_TYPE_CHOICES
    )
    sport = models.ForeignKey(
        'core.Sport',
        on_delete=models.CASCADE,
        related_name='review_items'
    )

    # Raw value from scraping
    raw_value = models.CharField(
        max_length=300,
        help_text='Original scraped value'
    )

    # Suggested resolution (best fuzzy match, if any)
    suggested_id = models.CharField(
        max_length=100,
        blank=True,
        help_text='Suggested canonical ID (if any match found)'
    )
    # NOTE: stored as 0.0-1.0 here, unlike ScraperConfig.fuzzy_threshold
    # which is 0-100; confidence_display converts to a percentage.
    confidence = models.FloatField(
        default=0.0,
        help_text='Match confidence (0.0 - 1.0)'
    )
    reason = models.CharField(
        max_length=20,
        choices=REASON_CHOICES,
        help_text='Why manual review is needed'
    )

    # Context
    source_url = models.URLField(
        blank=True,
        help_text='URL where this value was found'
    )
    check_date = models.DateField(
        null=True,
        blank=True,
        help_text='Date context for alias resolution'
    )
    context = models.JSONField(
        null=True,
        blank=True,
        help_text='Additional context (e.g., game info)'
    )

    # Resolution
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='pending'
    )
    resolved_to = models.CharField(
        max_length=100,
        blank=True,
        help_text='Final resolved canonical ID'
    )
    # NOTE(review): hard-codes 'auth.User'; if the project ever defines a
    # custom AUTH_USER_MODEL this FK breaks — consider
    # settings.AUTH_USER_MODEL. Confirm before changing (migration impact).
    resolved_by = models.ForeignKey(
        'auth.User',
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='resolved_review_items'
    )
    resolved_at = models.DateTimeField(null=True, blank=True)
    resolution_notes = models.TextField(
        blank=True,
        help_text='Notes about the resolution'
    )
    create_alias = models.BooleanField(
        default=False,
        help_text='Whether to create an alias from this resolution'
    )

    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    # Audit trail (django-simple-history)
    history = HistoricalRecords()

    class Meta:
        ordering = ['-created_at']
        verbose_name = 'Manual Review Item'
        verbose_name_plural = 'Manual Review Items'
        indexes = [
            models.Index(fields=['status', 'item_type']),
            models.Index(fields=['sport', 'status']),
            models.Index(fields=['raw_value']),
        ]

    def __str__(self):
        return f"{self.item_type}: {self.raw_value} ({self.get_status_display()})"

    @property
    def confidence_display(self):
        """Return confidence as percentage string."""
        return f"{self.confidence * 100:.0f}%"

    def resolve(self, canonical_id, user=None, notes='', create_alias=False):
        """Resolve this review item.

        Marks the item resolved to *canonical_id*, stamps who/when, saves,
        and optionally records an alias so the raw value matches next time.
        """
        from django.utils import timezone
        self.status = 'resolved'
        self.resolved_to = canonical_id
        self.resolved_by = user
        self.resolved_at = timezone.now()
        self.resolution_notes = notes
        self.create_alias = create_alias
        self.save()

        # Optionally create alias
        if create_alias and canonical_id:
            self._create_alias(canonical_id)

    def _create_alias(self, canonical_id):
        """Create an alias from this resolution.

        Silently skips when *canonical_id* does not match an existing
        Team/Stadium (deliberate best-effort; the resolution itself stands).
        """
        from core.models import TeamAlias, StadiumAlias, Team, Stadium

        if self.item_type == 'team':
            try:
                team = Team.objects.get(id=canonical_id)
                # get_or_create: re-resolving the same value is a no-op.
                TeamAlias.objects.get_or_create(
                    team=team,
                    alias=self.raw_value,
                    defaults={
                        'alias_type': 'historical',
                        'source': 'manual_review',
                        'notes': f'Created from review item #{self.id}',
                    }
                )
            except Team.DoesNotExist:
                pass
        elif self.item_type == 'stadium':
            try:
                stadium = Stadium.objects.get(id=canonical_id)
                StadiumAlias.objects.get_or_create(
                    stadium=stadium,
                    alias=self.raw_value,
                    defaults={
                        'alias_type': 'former',
                        'source': 'manual_review',
                        'notes': f'Created from review item #{self.id}',
                    }
                )
            except Stadium.DoesNotExist:
                pass

    def ignore(self, user=None, notes=''):
        """Mark this review item as ignored."""
        from django.utils import timezone
        self.status = 'ignored'
        self.resolved_by = user
        self.resolved_at = timezone.now()
        self.resolution_notes = notes
        self.save()
|
||||
55
scraper/resources.py
Normal file
55
scraper/resources.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""Import/Export resources for scraper models."""
|
||||
from import_export import resources, fields
|
||||
from import_export.widgets import ForeignKeyWidget
|
||||
|
||||
from core.models import Sport
|
||||
from .models import ScraperConfig, ScrapeJob, ManualReviewItem
|
||||
|
||||
|
||||
class ScraperConfigResource(resources.ModelResource):
    """Import/export resource for ScraperConfig, keyed by sport+season."""

    # Serialize the sport FK as its short code (e.g. 'nba') instead of a PK.
    sport = fields.Field(
        column_name='sport',
        attribute='sport',
        widget=ForeignKeyWidget(Sport, 'code')
    )

    class Meta:
        model = ScraperConfig
        # Natural key: matches the model's unique_together on sport/season,
        # so re-imports update rather than duplicate.
        import_id_fields = ['sport', 'season']
        # NOTE(review): 'primary_source' is not visible in this module —
        # confirm it exists on ScraperConfig.
        fields = [
            'sport', 'season', 'is_active', 'is_enabled',
            'scrape_interval_hours', 'primary_source',
        ]
        export_order = fields
|
||||
|
||||
|
||||
class ScrapeJobResource(resources.ModelResource):
    """Export resource for ScrapeJob runs.

    Flattens the config FK into readonly sport/season columns so exports
    are self-describing without a join.
    """

    sport = fields.Field(attribute='config__sport__code', readonly=True)
    season = fields.Field(attribute='config__season', readonly=True)

    class Meta:
        model = ScrapeJob
        fields = [
            'id', 'sport', 'season', 'status',
            'games_found', 'games_new', 'games_updated', 'games_unchanged',
            # Fixed: the model has no 'errors' field — the per-game error
            # counter on ScrapeJob is named 'games_errors'.
            'started_at', 'finished_at', 'games_errors', 'created_at',
        ]
        export_order = fields
|
||||
|
||||
|
||||
class ManualReviewItemResource(resources.ModelResource):
    """Import/export resource for ManualReviewItem rows."""

    # Serialize the sport FK as its short code (e.g. 'nba') instead of a PK.
    sport = fields.Field(
        column_name='sport',
        attribute='sport',
        widget=ForeignKeyWidget(Sport, 'code')
    )

    class Meta:
        model = ManualReviewItem
        import_id_fields = ['id']
        fields = [
            # Fixed: the model has no 'matched_value' field — the suggested
            # canonical match is stored in 'suggested_id'.
            'id', 'sport', 'item_type', 'raw_value', 'suggested_id',
            'status', 'confidence', 'reason', 'source_url',
            'check_date', 'created_at',
        ]
        export_order = fields
|
||||
182
scraper/tasks.py
Normal file
182
scraper/tasks.py
Normal file
@@ -0,0 +1,182 @@
|
||||
import logging
|
||||
import traceback
|
||||
from datetime import datetime
|
||||
|
||||
from celery import shared_task
|
||||
from django.utils import timezone
|
||||
|
||||
logger = logging.getLogger('scraper')
|
||||
|
||||
|
||||
@shared_task(bind=True, max_retries=3)
def run_scraper_task(self, config_id: int, triggered_by: str = 'manual'):
    """
    Run a scraper job for the given configuration.

    Creates a ScrapeJob row, runs the sport's scraper via
    run_sport_scraper(), logs progress to ScrapeJobLog, then writes the
    outcome back to both the job and its ScraperConfig and queues an email
    notification.  On failure the task retries up to max_retries (3) with a
    linearly growing countdown; each retry creates a fresh ScrapeJob row.

    Args:
        config_id: primary key of the ScraperConfig to run.
        triggered_by: how the run was started ('manual', 'scheduled', 'api').

    Returns:
        dict with job_id/status/counters, or {'error': ...} when the
        configuration does not exist.
    """
    # Lazy imports: Celery may import this module before Django's app
    # registry is ready.
    from scraper.models import ScraperConfig, ScrapeJob, ScrapeJobLog
    from notifications.tasks import send_scrape_notification

    # Get configuration
    try:
        config = ScraperConfig.objects.select_related('sport').get(id=config_id)
    except ScraperConfig.DoesNotExist:
        logger.error(f"ScraperConfig {config_id} not found")
        return {'error': 'Configuration not found'}

    # Create job record up front so the run is visible while in progress.
    job = ScrapeJob.objects.create(
        config=config,
        status='running',
        triggered_by=triggered_by,
        started_at=timezone.now(),
        celery_task_id=self.request.id,
    )

    def log(level, message, source='', extra_data=None):
        # Persist the entry on the job AND mirror it to the module logger.
        ScrapeJobLog.objects.create(
            job=job,
            level=level,
            message=message,
            source=source,
            extra_data=extra_data,
        )
        # level must be one of 'debug'/'info'/'warning'/'error' so that
        # getattr resolves to a real logger method.
        getattr(logger, level)(f"[{config.sport.code}] {message}")

    try:
        log('info', f'Starting scraper for {config.sport.short_name} {config.season}')

        # Import and run the appropriate scraper
        result = run_sport_scraper(config, log)

        # Update job with results
        job.status = 'completed'
        job.finished_at = timezone.now()
        job.games_found = result.get('games_found', 0)
        job.games_new = result.get('games_new', 0)
        job.games_updated = result.get('games_updated', 0)
        job.games_unchanged = result.get('games_unchanged', 0)
        job.games_errors = result.get('games_errors', 0)
        job.teams_found = result.get('teams_found', 0)
        job.stadiums_found = result.get('stadiums_found', 0)
        job.review_items_created = result.get('review_items', 0)
        job.save()

        # Update config with last-run bookkeeping (read by the dashboard).
        config.last_run = timezone.now()
        config.last_run_status = 'completed'
        config.last_run_games = result.get('games_found', 0)
        config.save()

        log('info', f'Scraper completed: {job.games_found} games, {job.games_new} new, {job.review_items_created} reviews')

        # Send notification
        send_scrape_notification.delay(job.id)

        return {
            'job_id': job.id,
            'status': 'completed',
            'games_found': job.games_found,
            'games_new': job.games_new,
            'review_items': job.review_items_created,
        }

    except Exception as e:
        error_msg = str(e)
        error_tb = traceback.format_exc()

        # Record the failure on the job before anything else can go wrong.
        job.status = 'failed'
        job.finished_at = timezone.now()
        job.error_message = error_msg
        job.error_traceback = error_tb
        job.save()

        config.last_run = timezone.now()
        config.last_run_status = 'failed'
        config.save()

        log('error', f'Scraper failed: {error_msg}', extra_data={'traceback': error_tb})

        # Send failure notification
        send_scrape_notification.delay(job.id)

        # Retry if applicable — countdown grows 60s, 120s, 180s per attempt.
        if self.request.retries < self.max_retries:
            raise self.retry(exc=e, countdown=60 * (self.request.retries + 1))

        return {
            'job_id': job.id,
            'status': 'failed',
            'error': error_msg,
        }
|
||||
|
||||
|
||||
def run_sport_scraper(config, log_func):
    """
    Run the appropriate scraper for the sport.

    Bridges the standalone sportstime_parser scrapers to the Django models
    via ScraperAdapter.

    Args:
        config: ScraperConfig instance supplying sport, season, and options.
        log_func: callable(level, message, source='', extra_data=None) used
            for per-job logging.

    Returns:
        dict of result counters (games_found, games_new, games_updated,
        games_unchanged, games_errors, teams_found, stadiums_found,
        review_items) as produced by ScraperAdapter.run().
    """
    # Fixed: removed unused imports (core.models Game/Team/Stadium and
    # ManualReviewItem) — nothing in this function referenced them.
    sport_code = config.sport.code
    season = config.season

    log_func('info', f'Loading scraper for {sport_code}', source='engine')

    # Import the scraper engine from sportstime_parser.
    # Imported lazily to avoid app-loading order issues.
    from scraper.engine.adapter import ScraperAdapter

    adapter = ScraperAdapter(
        sport_code=sport_code,
        season=season,
        config=config,
        log_func=log_func,
    )

    # Run the scraper
    result = adapter.run()

    return result
|
||||
|
||||
|
||||
@shared_task
def run_all_enabled_scrapers():
    """
    Queue a scrape for every enabled scraper configuration.

    Invoked by celery-beat on its schedule; each configuration is dispatched
    as its own run_scraper_task so a failure in one sport cannot block the
    others.

    Returns:
        dict with the number of configurations queued.
    """
    from scraper.models import ScraperConfig

    enabled = ScraperConfig.objects.filter(is_enabled=True)
    for cfg in enabled:
        run_scraper_task.delay(cfg.id, triggered_by='scheduled')

    return {'configs_queued': enabled.count()}
|
||||
|
||||
|
||||
@shared_task
def cleanup_old_jobs(days: int = 30):
    """
    Delete scrape job records older than *days* days.

    Log rows are removed before their parent jobs so the job delete does
    not have to cascade through them.

    Returns:
        dict with 'jobs_deleted' and 'logs_deleted' row counts.
    """
    from datetime import timedelta

    from django.utils import timezone

    from scraper.models import ScrapeJob, ScrapeJobLog

    threshold = timezone.now() - timedelta(days=days)

    # Delete old logs first (foreign key to ScrapeJob).
    logs_removed, _ = ScrapeJobLog.objects.filter(
        job__created_at__lt=threshold
    ).delete()

    # Then delete old jobs
    jobs_removed, _ = ScrapeJob.objects.filter(
        created_at__lt=threshold
    ).delete()

    return {
        'jobs_deleted': jobs_removed,
        'logs_deleted': logs_removed,
    }
|
||||
Reference in New Issue
Block a user