After Phase 1 refactoring moved scraper functions to sport-specific modules (nba.py, mlb.py, etc.), these pipeline scripts still imported from scrape_schedules.py. - run_pipeline.py: import from core.py and sport modules - validate_data.py: import from core.py and sport modules - run_canonicalization_pipeline.py: import from core.py and sport modules Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
539 lines
20 KiB
Python
Executable File
539 lines
20 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
SportsTime Data Pipeline
|
|
========================
|
|
Master script that orchestrates all data fetching, validation, and reporting.
|
|
|
|
Usage:
|
|
python run_pipeline.py # Full pipeline with defaults
|
|
python run_pipeline.py --season 2026 # Specify season
|
|
python run_pipeline.py --sport nba # Single sport only
|
|
python run_pipeline.py --skip-scrape # Validate existing data only
|
|
python run_pipeline.py --verbose # Detailed output
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
from enum import Enum
|
|
|
|
# Import from core module
|
|
from core import (
|
|
Game, Stadium, ScraperSource, scrape_with_fallback,
|
|
assign_stable_ids, export_to_json,
|
|
)
|
|
|
|
# Import from sport modules
|
|
from nba import scrape_nba_basketball_reference, scrape_nba_espn, scrape_nba_cbssports
|
|
from mlb import scrape_mlb_statsapi, scrape_mlb_baseball_reference, scrape_mlb_espn
|
|
from nhl import scrape_nhl_hockey_reference, scrape_nhl_espn, scrape_nhl_api
|
|
from nfl import scrape_nfl_espn, scrape_nfl_pro_football_reference, scrape_nfl_cbssports
|
|
|
|
# Import secondary sports from scrape_schedules (stubs)
|
|
from scrape_schedules import (
|
|
# WNBA sources
|
|
scrape_wnba_espn, scrape_wnba_basketball_reference, scrape_wnba_cbssports,
|
|
# MLS sources
|
|
scrape_mls_espn, scrape_mls_fbref, scrape_mls_mlssoccer,
|
|
# NWSL sources
|
|
scrape_nwsl_espn, scrape_nwsl_fbref, scrape_nwsl_nwslsoccer,
|
|
# CBB sources
|
|
scrape_cbb_espn, scrape_cbb_sports_reference, scrape_cbb_cbssports,
|
|
# Utilities
|
|
generate_stadiums_from_teams,
|
|
)
|
|
from validate_data import (
|
|
validate_games,
|
|
validate_stadiums,
|
|
scrape_mlb_all_sources,
|
|
scrape_nba_all_sources,
|
|
scrape_nhl_all_sources,
|
|
ValidationReport,
|
|
)
|
|
|
|
|
|
class Severity(Enum):
    """Issue severity levels for pipeline reporting.

    NOTE(review): the reporting code below compares plain strings
    ('high'/'medium'/'low') rather than these members — this enum appears
    to be unreferenced in this file; confirm before removing.
    """
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
|
|
|
|
|
@dataclass
class PipelineResult:
    """Aggregate outcome of one run_pipeline() invocation."""
    success: bool              # True only when zero high-severity issues were found
    games_scraped: int         # total games across all processed sports
    stadiums_scraped: int      # total stadiums generated or loaded
    games_by_sport: dict       # sport name (e.g. 'NBA') -> game count
    validation_reports: list   # ValidationReport objects from cross-validation
    stadium_issues: list       # dicts with 'severity', 'stadium', 'issue' keys
    high_severity_count: int   # game discrepancies + stadium issues marked 'high'
    medium_severity_count: int
    low_severity_count: int    # includes any issue whose severity is not high/medium
    output_dir: Path           # where games.json / stadiums.json / pipeline_report.json live
    duration_seconds: float    # wall-clock time of the whole run
|
|
|
|
|
|
def print_header(text: str):
    """Print *text* as a banner framed by 70-character '=' rules."""
    rule = "=" * 70
    # One write, identical bytes to the original four print() calls:
    # blank line, rule, indented text, rule.
    print(f"\n{rule}\n {text}\n{rule}")
|
|
|
|
|
|
def print_section(text: str):
    """Print a dashed section header preceded by a blank line."""
    # Single call producing the same output as print() + print(f"--- ... ---").
    print(f"\n--- {text} ---")
|
|
|
|
|
|
def print_severity(severity: str, message: str):
    """Print *message* prefixed with an icon for the given severity level.

    Unknown severities fall back to a plain white-circle marker.
    """
    label = {
        'high': '🔴 HIGH',
        'medium': '🟡 MEDIUM',
        'low': '🟢 LOW',
    }.get(severity, '⚪')
    print(f" {label} {message}")
|
|
|
|
|
|
def run_pipeline(
    season: int = 2025,
    sport: str = 'all',
    output_dir: Path = Path('./data'),
    skip_scrape: bool = False,
    validate: bool = True,
    verbose: bool = False,
) -> PipelineResult:
    """
    Run the complete data pipeline.

    Three phases:
      1. Scrape stadiums and per-sport schedules using multi-source
         fallback (or, with ``skip_scrape``, reload previously exported
         JSON from ``output_dir``), then export games.json/stadiums.json.
      2. Cross-validate MLB between two sources, validate stadiums, and
         sanity-check per-team game counts for NBA/NHL/NFL.
      3. Tally discrepancies by severity, print a report, and write
         ``pipeline_report.json``.

    Args:
        season: Season year, e.g. 2025.
        sport: Lower-case sport key ('nba', 'mlb', 'nhl', 'nfl', 'wnba',
            'mls', 'nwsl', 'cbb') or 'all'.
        output_dir: Destination directory; created if missing.
        skip_scrape: Load existing games.json/stadiums.json instead of
            scraping.
        validate: Run the Phase 2 validation steps.
        verbose: Print medium-severity issues and raise the cap on
            high-severity issues shown.

    Returns:
        PipelineResult; ``success`` is True only when no high-severity
        issues were found.
    """
    start_time = datetime.now()

    all_games = []
    all_stadiums = []
    games_by_sport = {}
    validation_reports = []
    stadium_issues = []

    output_dir.mkdir(parents=True, exist_ok=True)

    # =========================================================================
    # PHASE 1: SCRAPE DATA
    # =========================================================================

    if not skip_scrape:
        print_header("PHASE 1: SCRAPING DATA")

        # Scrape stadiums
        print_section("Stadiums")
        all_stadiums = generate_stadiums_from_teams()
        print(f" Generated {len(all_stadiums)} stadiums from team data")

        # Scrape by sport with multi-source fallback; each sport declares
        # its sources in priority order with a minimum-games threshold.
        if sport in ['nba', 'all']:
            print_section(f"NBA {season}")
            nba_sources = [
                ScraperSource('Basketball-Reference', scrape_nba_basketball_reference, priority=1, min_games=500),
                ScraperSource('ESPN', scrape_nba_espn, priority=2, min_games=500),
                ScraperSource('CBS Sports', scrape_nba_cbssports, priority=3, min_games=100),
            ]
            nba_games = scrape_with_fallback('NBA', season, nba_sources)
            # NBA seasons span two calendar years, labelled e.g. "2024-25".
            nba_season = f"{season-1}-{str(season)[2:]}"
            nba_games = assign_stable_ids(nba_games, 'NBA', nba_season)
            all_games.extend(nba_games)
            games_by_sport['NBA'] = len(nba_games)

        if sport in ['mlb', 'all']:
            print_section(f"MLB {season}")
            mlb_sources = [
                ScraperSource('MLB Stats API', scrape_mlb_statsapi, priority=1, min_games=1000),
                ScraperSource('Baseball-Reference', scrape_mlb_baseball_reference, priority=2, min_games=500),
                ScraperSource('ESPN', scrape_mlb_espn, priority=3, min_games=500),
            ]
            mlb_games = scrape_with_fallback('MLB', season, mlb_sources)
            # MLB seasons fit one calendar year, so the label is just the year.
            mlb_games = assign_stable_ids(mlb_games, 'MLB', str(season))
            all_games.extend(mlb_games)
            games_by_sport['MLB'] = len(mlb_games)

        if sport in ['nhl', 'all']:
            print_section(f"NHL {season}")
            nhl_sources = [
                ScraperSource('Hockey-Reference', scrape_nhl_hockey_reference, priority=1, min_games=500),
                ScraperSource('ESPN', scrape_nhl_espn, priority=2, min_games=500),
                ScraperSource('NHL API', scrape_nhl_api, priority=3, min_games=100),
            ]
            nhl_games = scrape_with_fallback('NHL', season, nhl_sources)
            # Cross-year label, same convention as NBA.
            nhl_season = f"{season-1}-{str(season)[2:]}"
            nhl_games = assign_stable_ids(nhl_games, 'NHL', nhl_season)
            all_games.extend(nhl_games)
            games_by_sport['NHL'] = len(nhl_games)

        if sport in ['nfl', 'all']:
            print_section(f"NFL {season}")
            nfl_sources = [
                ScraperSource('ESPN', scrape_nfl_espn, priority=1, min_games=200),
                ScraperSource('Pro-Football-Reference', scrape_nfl_pro_football_reference, priority=2, min_games=200),
                ScraperSource('CBS Sports', scrape_nfl_cbssports, priority=3, min_games=100),
            ]
            nfl_games = scrape_with_fallback('NFL', season, nfl_sources)
            # NOTE(review): NFL uses the cross-year label like NBA/NHL here,
            # but NFL seasons are conventionally labelled by a single year —
            # confirm this is intentional for stable-ID generation.
            nfl_season = f"{season-1}-{str(season)[2:]}"
            nfl_games = assign_stable_ids(nfl_games, 'NFL', nfl_season)
            all_games.extend(nfl_games)
            games_by_sport['NFL'] = len(nfl_games)

        if sport in ['wnba', 'all']:
            print_section(f"WNBA {season}")
            wnba_sources = [
                ScraperSource('ESPN', scrape_wnba_espn, priority=1, min_games=100),
                ScraperSource('Basketball-Reference', scrape_wnba_basketball_reference, priority=2, min_games=100),
                ScraperSource('CBS Sports', scrape_wnba_cbssports, priority=3, min_games=50),
            ]
            wnba_games = scrape_with_fallback('WNBA', season, wnba_sources)
            wnba_games = assign_stable_ids(wnba_games, 'WNBA', str(season))
            all_games.extend(wnba_games)
            games_by_sport['WNBA'] = len(wnba_games)

        if sport in ['mls', 'all']:
            print_section(f"MLS {season}")
            mls_sources = [
                ScraperSource('ESPN', scrape_mls_espn, priority=1, min_games=200),
                ScraperSource('FBref', scrape_mls_fbref, priority=2, min_games=100),
                ScraperSource('MLSSoccer.com', scrape_mls_mlssoccer, priority=3, min_games=100),
            ]
            mls_games = scrape_with_fallback('MLS', season, mls_sources)
            mls_games = assign_stable_ids(mls_games, 'MLS', str(season))
            all_games.extend(mls_games)
            games_by_sport['MLS'] = len(mls_games)

        if sport in ['nwsl', 'all']:
            print_section(f"NWSL {season}")
            nwsl_sources = [
                ScraperSource('ESPN', scrape_nwsl_espn, priority=1, min_games=100),
                ScraperSource('FBref', scrape_nwsl_fbref, priority=2, min_games=50),
                ScraperSource('NWSL.com', scrape_nwsl_nwslsoccer, priority=3, min_games=50),
            ]
            nwsl_games = scrape_with_fallback('NWSL', season, nwsl_sources)
            nwsl_games = assign_stable_ids(nwsl_games, 'NWSL', str(season))
            all_games.extend(nwsl_games)
            games_by_sport['NWSL'] = len(nwsl_games)

        if sport in ['cbb', 'all']:
            print_section(f"CBB {season}")
            cbb_sources = [
                ScraperSource('ESPN', scrape_cbb_espn, priority=1, min_games=1000),
                ScraperSource('Sports-Reference', scrape_cbb_sports_reference, priority=2, min_games=500),
                ScraperSource('CBS Sports', scrape_cbb_cbssports, priority=3, min_games=300),
            ]
            cbb_games = scrape_with_fallback('CBB', season, cbb_sources)
            # College basketball also spans two calendar years.
            cbb_season = f"{season-1}-{str(season)[2:]}"
            cbb_games = assign_stable_ids(cbb_games, 'CBB', cbb_season)
            all_games.extend(cbb_games)
            games_by_sport['CBB'] = len(cbb_games)

        # Export data
        print_section("Exporting Data")
        export_to_json(all_games, all_stadiums, output_dir)
        print(f" Exported to {output_dir}")

    else:
        # Load existing data previously written by export_to_json; missing
        # files are silently skipped (the lists simply stay empty).
        print_header("LOADING EXISTING DATA")

        games_file = output_dir / 'games.json'
        stadiums_file = output_dir / 'stadiums.json'

        if games_file.exists():
            with open(games_file) as f:
                games_data = json.load(f)
            # Rehydrate dataclasses straight from the exported JSON dicts —
            # assumes the JSON keys match Game's field names exactly.
            all_games = [Game(**g) for g in games_data]
            for g in all_games:
                games_by_sport[g.sport] = games_by_sport.get(g.sport, 0) + 1
            print(f" Loaded {len(all_games)} games")

        if stadiums_file.exists():
            with open(stadiums_file) as f:
                stadiums_data = json.load(f)
            all_stadiums = [Stadium(**s) for s in stadiums_data]
            print(f" Loaded {len(all_stadiums)} stadiums")

    # =========================================================================
    # PHASE 2: VALIDATE DATA
    # =========================================================================

    if validate:
        print_header("PHASE 2: CROSS-VALIDATION")

        # MLB validation (has two good sources)
        if sport in ['mlb', 'all']:
            print_section("MLB Cross-Validation")
            try:
                # NOTE: re-binds `mlb_sources` — a list of ScraperSource in
                # Phase 1, now a dict of source name -> scraped games.
                mlb_sources = scrape_mlb_all_sources(season)
                source_names = list(mlb_sources.keys())

                # Compare only the first two sources that returned data.
                if len(source_names) >= 2:
                    games1 = mlb_sources[source_names[0]]
                    games2 = mlb_sources[source_names[1]]

                    if games1 and games2:
                        report = validate_games(
                            games1, games2,
                            source_names[0], source_names[1],
                            'MLB', str(season)
                        )
                        validation_reports.append(report)

                        print(f" Sources: {source_names[0]} vs {source_names[1]}")
                        print(f" Games compared: {report.total_games_source1} vs {report.total_games_source2}")
                        print(f" Matched: {report.games_matched}")
                        print(f" Discrepancies: {len(report.discrepancies)}")
            except Exception as e:
                # Best-effort: a validation failure should not kill the run.
                print(f" Error during MLB validation: {e}")

        # Stadium validation
        print_section("Stadium Validation")
        stadium_issues = validate_stadiums(all_stadiums)
        print(f" Issues found: {len(stadium_issues)}")

        # Data quality checks
        print_section("Data Quality Checks")

        # Check game counts per team. Each team appears once per game it
        # plays (home or away); regular seasons are ~82 games (NBA/NHL)
        # and ~17 games (NFL), so counts outside a loose band are flagged.
        if sport in ['nba', 'all']:
            nba_games = [g for g in all_games if g.sport == 'NBA']
            team_counts = {}
            for g in nba_games:
                team_counts[g.home_team_abbrev] = team_counts.get(g.home_team_abbrev, 0) + 1
                team_counts[g.away_team_abbrev] = team_counts.get(g.away_team_abbrev, 0) + 1

            for team, count in sorted(team_counts.items()):
                if count < 75 or count > 90:
                    print(f" NBA: {team} has {count} games (expected ~82)")

        if sport in ['nhl', 'all']:
            nhl_games = [g for g in all_games if g.sport == 'NHL']
            team_counts = {}
            for g in nhl_games:
                team_counts[g.home_team_abbrev] = team_counts.get(g.home_team_abbrev, 0) + 1
                team_counts[g.away_team_abbrev] = team_counts.get(g.away_team_abbrev, 0) + 1

            for team, count in sorted(team_counts.items()):
                if count < 75 or count > 90:
                    print(f" NHL: {team} has {count} games (expected ~82)")

        if sport in ['nfl', 'all']:
            nfl_games = [g for g in all_games if g.sport == 'NFL']
            team_counts = {}
            for g in nfl_games:
                team_counts[g.home_team_abbrev] = team_counts.get(g.home_team_abbrev, 0) + 1
                team_counts[g.away_team_abbrev] = team_counts.get(g.away_team_abbrev, 0) + 1

            for team, count in sorted(team_counts.items()):
                if count < 15 or count > 20:
                    print(f" NFL: {team} has {count} games (expected ~17)")

    # =========================================================================
    # PHASE 3: GENERATE REPORT
    # =========================================================================

    print_header("PHASE 3: DISCREPANCY REPORT")

    # Count by severity. Severities are plain strings here; anything that is
    # not 'high' or 'medium' is bucketed as low.
    high_count = 0
    medium_count = 0
    low_count = 0

    # Game discrepancies
    for report in validation_reports:
        for d in report.discrepancies:
            if d.severity == 'high':
                high_count += 1
            elif d.severity == 'medium':
                medium_count += 1
            else:
                low_count += 1

    # Stadium issues
    for issue in stadium_issues:
        if issue['severity'] == 'high':
            high_count += 1
        elif issue['severity'] == 'medium':
            medium_count += 1
        else:
            low_count += 1

    # Print summary
    print()
    print(f" 🔴 HIGH severity: {high_count}")
    print(f" 🟡 MEDIUM severity: {medium_count}")
    print(f" 🟢 LOW severity: {low_count}")
    print()

    # Print high severity issues (always)
    if high_count > 0:
        print_section("HIGH Severity Issues (Requires Attention)")

        shown = 0
        # NOTE(review): verbose raises the cap to 50 rather than removing
        # it, but the truncation hint says --verbose shows all — confirm.
        max_show = 10 if not verbose else 50

        for report in validation_reports:
            for d in report.discrepancies:
                if d.severity == 'high' and shown < max_show:
                    print_severity('high', f"[{report.sport}] {d.field}: {d.game_key}")
                    if verbose:
                        print(f" {d.source1}: {d.value1}")
                        print(f" {d.source2}: {d.value2}")
                    shown += 1

        for issue in stadium_issues:
            if issue['severity'] == 'high' and shown < max_show:
                print_severity('high', f"[Stadium] {issue['stadium']}: {issue['issue']}")
                shown += 1

        if high_count > max_show:
            print(f" ... and {high_count - max_show} more (use --verbose to see all)")

    # Print medium severity if verbose
    if medium_count > 0 and verbose:
        print_section("MEDIUM Severity Issues")

        for report in validation_reports:
            for d in report.discrepancies:
                if d.severity == 'medium':
                    print_severity('medium', f"[{report.sport}] {d.field}: {d.game_key}")

        for issue in stadium_issues:
            if issue['severity'] == 'medium':
                print_severity('medium', f"[Stadium] {issue['stadium']}: {issue['issue']}")

    # Save full report (written even when scraping/validation were skipped)
    report_path = output_dir / 'pipeline_report.json'
    full_report = {
        'generated_at': datetime.now().isoformat(),
        'season': season,
        'sport': sport,
        'summary': {
            'games_scraped': len(all_games),
            'stadiums_scraped': len(all_stadiums),
            'games_by_sport': games_by_sport,
            'high_severity': high_count,
            'medium_severity': medium_count,
            'low_severity': low_count,
        },
        'game_validations': [r.to_dict() for r in validation_reports],
        'stadium_issues': stadium_issues,
    }

    with open(report_path, 'w') as f:
        json.dump(full_report, f, indent=2)

    # =========================================================================
    # FINAL SUMMARY
    # =========================================================================

    duration = (datetime.now() - start_time).total_seconds()

    print_header("PIPELINE COMPLETE")
    print()
    print(f" Duration: {duration:.1f} seconds")
    print(f" Games: {len(all_games):,}")
    print(f" Stadiums: {len(all_stadiums)}")
    print(f" Output: {output_dir.absolute()}")
    print()

    for sport_name, count in sorted(games_by_sport.items()):
        print(f" {sport_name}: {count:,} games")

    print()
    print(f" Reports saved to:")
    print(f" - {output_dir / 'games.json'}")
    print(f" - {output_dir / 'stadiums.json'}")
    print(f" - {output_dir / 'pipeline_report.json'}")
    print()

    # Status indicator
    if high_count > 0:
        print(" ⚠️ STATUS: Review required - high severity issues found")
    elif medium_count > 0:
        print(" ✓ STATUS: Complete with warnings")
    else:
        print(" ✅ STATUS: All checks passed")

    print()

    return PipelineResult(
        success=high_count == 0,
        games_scraped=len(all_games),
        stadiums_scraped=len(all_stadiums),
        games_by_sport=games_by_sport,
        validation_reports=validation_reports,
        stadium_issues=stadium_issues,
        high_severity_count=high_count,
        medium_severity_count=medium_count,
        low_severity_count=low_count,
        output_dir=output_dir,
        duration_seconds=duration,
    )
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, run the pipeline, and exit with
    status 1 when any high-severity issues were found (0 otherwise)."""
    parser = argparse.ArgumentParser(
        description='SportsTime Data Pipeline - Fetch, validate, and report on sports data',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python run_pipeline.py                 # Full pipeline
  python run_pipeline.py --season 2026   # Different season
  python run_pipeline.py --sport mlb     # MLB only
  python run_pipeline.py --skip-scrape   # Validate existing data
  python run_pipeline.py --verbose       # Show all issues
"""
    )

    parser.add_argument(
        '--season', type=int, default=2025,
        help='Season year (default: 2025)'
    )
    parser.add_argument(
        '--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], default='all',
        help='Sport to process (default: all)'
    )
    parser.add_argument(
        '--output', type=str, default='./data',
        help='Output directory (default: ./data)'
    )
    parser.add_argument(
        '--skip-scrape', action='store_true',
        help='Skip scraping, validate existing data only'
    )
    parser.add_argument(
        '--no-validate', action='store_true',
        help='Skip validation step'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Verbose output with all issues'
    )

    args = parser.parse_args()

    # The CLI exposes --no-validate, but run_pipeline takes the positive
    # flag, hence the inversion here.
    result = run_pipeline(
        season=args.season,
        sport=args.sport,
        output_dir=Path(args.output),
        skip_scrape=args.skip_scrape,
        validate=not args.no_validate,
        verbose=args.verbose,
    )

    # Exit with error code if high severity issues
    sys.exit(0 if result.success else 1)


if __name__ == '__main__':
    main()
|