Remove CFB/NASCAR/PGA and streamline to 8 supported sports

- Remove College Football, NASCAR, and PGA from scraper and app
- Clean all data files (stadiums, games, pipeline reports)
- Update Sport.swift enum and all UI components
- Add sportstime.py CLI tool for pipeline management
- Add DATA_SCRAPING.md documentation
- Add WNBA/MLS/NWSL implementation documentation
- Scraper now supports: NBA, MLB, NHL, NFL, WNBA, MLS, NWSL, CBB

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-09 23:22:13 -06:00
parent f5e509a9ae
commit 8790d2ad73
35 changed files with 117819 additions and 65871 deletions

View File

@@ -23,10 +23,24 @@ from enum import Enum
# Import our modules
from scrape_schedules import (
Game, Stadium,
scrape_nba_basketball_reference,
scrape_mlb_statsapi, scrape_mlb_baseball_reference,
scrape_nhl_hockey_reference,
Game, Stadium, ScraperSource, scrape_with_fallback,
# NBA sources
scrape_nba_basketball_reference, scrape_nba_espn, scrape_nba_cbssports,
# MLB sources
scrape_mlb_statsapi, scrape_mlb_baseball_reference, scrape_mlb_espn,
# NHL sources
scrape_nhl_hockey_reference, scrape_nhl_espn, scrape_nhl_api,
# NFL sources
scrape_nfl_espn, scrape_nfl_pro_football_reference, scrape_nfl_cbssports,
# WNBA sources
scrape_wnba_espn, scrape_wnba_basketball_reference, scrape_wnba_cbssports,
# MLS sources
scrape_mls_espn, scrape_mls_fbref, scrape_mls_mlssoccer,
# NWSL sources
scrape_nwsl_espn, scrape_nwsl_fbref, scrape_nwsl_nwslsoccer,
# CBB sources
scrape_cbb_espn, scrape_cbb_sports_reference, scrape_cbb_cbssports,
# Utilities
generate_stadiums_from_teams,
export_to_json,
assign_stable_ids,
@@ -119,10 +133,15 @@ def run_pipeline(
all_stadiums = generate_stadiums_from_teams()
print(f" Generated {len(all_stadiums)} stadiums from team data")
# Scrape by sport
# Scrape by sport with multi-source fallback
if sport in ['nba', 'all']:
print_section(f"NBA {season}")
nba_games = scrape_nba_basketball_reference(season)
nba_sources = [
ScraperSource('Basketball-Reference', scrape_nba_basketball_reference, priority=1, min_games=500),
ScraperSource('ESPN', scrape_nba_espn, priority=2, min_games=500),
ScraperSource('CBS Sports', scrape_nba_cbssports, priority=3, min_games=100),
]
nba_games = scrape_with_fallback('NBA', season, nba_sources)
nba_season = f"{season-1}-{str(season)[2:]}"
nba_games = assign_stable_ids(nba_games, 'NBA', nba_season)
all_games.extend(nba_games)
@@ -130,19 +149,91 @@ def run_pipeline(
if sport in ['mlb', 'all']:
print_section(f"MLB {season}")
mlb_games = scrape_mlb_statsapi(season)
# MLB API uses official gamePk - already stable
mlb_sources = [
ScraperSource('MLB Stats API', scrape_mlb_statsapi, priority=1, min_games=1000),
ScraperSource('Baseball-Reference', scrape_mlb_baseball_reference, priority=2, min_games=500),
ScraperSource('ESPN', scrape_mlb_espn, priority=3, min_games=500),
]
mlb_games = scrape_with_fallback('MLB', season, mlb_sources)
mlb_games = assign_stable_ids(mlb_games, 'MLB', str(season))
all_games.extend(mlb_games)
games_by_sport['MLB'] = len(mlb_games)
if sport in ['nhl', 'all']:
print_section(f"NHL {season}")
nhl_games = scrape_nhl_hockey_reference(season)
nhl_sources = [
ScraperSource('Hockey-Reference', scrape_nhl_hockey_reference, priority=1, min_games=500),
ScraperSource('ESPN', scrape_nhl_espn, priority=2, min_games=500),
ScraperSource('NHL API', scrape_nhl_api, priority=3, min_games=100),
]
nhl_games = scrape_with_fallback('NHL', season, nhl_sources)
nhl_season = f"{season-1}-{str(season)[2:]}"
nhl_games = assign_stable_ids(nhl_games, 'NHL', nhl_season)
all_games.extend(nhl_games)
games_by_sport['NHL'] = len(nhl_games)
if sport in ['nfl', 'all']:
print_section(f"NFL {season}")
nfl_sources = [
ScraperSource('ESPN', scrape_nfl_espn, priority=1, min_games=200),
ScraperSource('Pro-Football-Reference', scrape_nfl_pro_football_reference, priority=2, min_games=200),
ScraperSource('CBS Sports', scrape_nfl_cbssports, priority=3, min_games=100),
]
nfl_games = scrape_with_fallback('NFL', season, nfl_sources)
nfl_season = f"{season-1}-{str(season)[2:]}"
nfl_games = assign_stable_ids(nfl_games, 'NFL', nfl_season)
all_games.extend(nfl_games)
games_by_sport['NFL'] = len(nfl_games)
if sport in ['wnba', 'all']:
print_section(f"WNBA {season}")
wnba_sources = [
ScraperSource('ESPN', scrape_wnba_espn, priority=1, min_games=100),
ScraperSource('Basketball-Reference', scrape_wnba_basketball_reference, priority=2, min_games=100),
ScraperSource('CBS Sports', scrape_wnba_cbssports, priority=3, min_games=50),
]
wnba_games = scrape_with_fallback('WNBA', season, wnba_sources)
wnba_games = assign_stable_ids(wnba_games, 'WNBA', str(season))
all_games.extend(wnba_games)
games_by_sport['WNBA'] = len(wnba_games)
if sport in ['mls', 'all']:
print_section(f"MLS {season}")
mls_sources = [
ScraperSource('ESPN', scrape_mls_espn, priority=1, min_games=200),
ScraperSource('FBref', scrape_mls_fbref, priority=2, min_games=100),
ScraperSource('MLSSoccer.com', scrape_mls_mlssoccer, priority=3, min_games=100),
]
mls_games = scrape_with_fallback('MLS', season, mls_sources)
mls_games = assign_stable_ids(mls_games, 'MLS', str(season))
all_games.extend(mls_games)
games_by_sport['MLS'] = len(mls_games)
if sport in ['nwsl', 'all']:
print_section(f"NWSL {season}")
nwsl_sources = [
ScraperSource('ESPN', scrape_nwsl_espn, priority=1, min_games=100),
ScraperSource('FBref', scrape_nwsl_fbref, priority=2, min_games=50),
ScraperSource('NWSL.com', scrape_nwsl_nwslsoccer, priority=3, min_games=50),
]
nwsl_games = scrape_with_fallback('NWSL', season, nwsl_sources)
nwsl_games = assign_stable_ids(nwsl_games, 'NWSL', str(season))
all_games.extend(nwsl_games)
games_by_sport['NWSL'] = len(nwsl_games)
if sport in ['cbb', 'all']:
print_section(f"CBB {season}")
cbb_sources = [
ScraperSource('ESPN', scrape_cbb_espn, priority=1, min_games=1000),
ScraperSource('Sports-Reference', scrape_cbb_sports_reference, priority=2, min_games=500),
ScraperSource('CBS Sports', scrape_cbb_cbssports, priority=3, min_games=300),
]
cbb_games = scrape_with_fallback('CBB', season, cbb_sources)
cbb_season = f"{season-1}-{str(season)[2:]}"
cbb_games = assign_stable_ids(cbb_games, 'CBB', cbb_season)
all_games.extend(cbb_games)
games_by_sport['CBB'] = len(cbb_games)
# Export data
print_section("Exporting Data")
export_to_json(all_games, all_stadiums, output_dir)
@@ -233,6 +324,17 @@ def run_pipeline(
if count < 75 or count > 90:
print(f" NHL: {team} has {count} games (expected ~82)")
if sport in ['nfl', 'all']:
nfl_games = [g for g in all_games if g.sport == 'NFL']
team_counts = {}
for g in nfl_games:
team_counts[g.home_team_abbrev] = team_counts.get(g.home_team_abbrev, 0) + 1
team_counts[g.away_team_abbrev] = team_counts.get(g.away_team_abbrev, 0) + 1
for team, count in sorted(team_counts.items()):
if count < 15 or count > 20:
print(f" NFL: {team} has {count} games (expected ~17)")
# =========================================================================
# PHASE 3: GENERATE REPORT
# =========================================================================
@@ -396,7 +498,7 @@ Examples:
help='Season year (default: 2025)'
)
parser.add_argument(
'--sport', choices=['nba', 'mlb', 'nhl', 'all'], default='all',
'--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], default='all',
help='Sport to process (default: all)'
)
parser.add_argument(