Remove CFB/NASCAR/PGA and streamline to 8 supported sports

- Remove College Football, NASCAR, and PGA from scraper and app
- Clean all data files (stadiums, games, pipeline reports)
- Update Sport.swift enum and all UI components
- Add sportstime.py CLI tool for pipeline management
- Add DATA_SCRAPING.md documentation
- Add WNBA/MLS/NWSL implementation documentation
- Scraper now supports: NBA, MLB, NHL, NFL, WNBA, MLS, NWSL, CBB

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Author: Trey t
Date: 2026-01-09 23:22:13 -06:00
Parent: f5e509a9ae
Commit: 8790d2ad73
35 changed files with 117819 additions and 65871 deletions

View File

@@ -31,9 +31,24 @@ from dataclasses import dataclass, asdict
# Import pipeline components
from scrape_schedules import (
scrape_nba_basketball_reference,
scrape_mlb_statsapi,
scrape_nhl_hockey_reference,
ScraperSource, scrape_with_fallback,
# NBA sources
scrape_nba_basketball_reference, scrape_nba_espn, scrape_nba_cbssports,
# MLB sources
scrape_mlb_statsapi, scrape_mlb_baseball_reference, scrape_mlb_espn,
# NHL sources
scrape_nhl_hockey_reference, scrape_nhl_espn, scrape_nhl_api,
# NFL sources
scrape_nfl_espn, scrape_nfl_pro_football_reference, scrape_nfl_cbssports,
# WNBA sources
scrape_wnba_espn, scrape_wnba_basketball_reference, scrape_wnba_cbssports,
# MLS sources
scrape_mls_espn, scrape_mls_fbref, scrape_mls_mlssoccer,
# NWSL sources
scrape_nwsl_espn, scrape_nwsl_fbref, scrape_nwsl_nwslsoccer,
# CBB sources
scrape_cbb_espn, scrape_cbb_sports_reference, scrape_cbb_cbssports,
# Utilities
generate_stadiums_from_teams,
assign_stable_ids,
export_to_json,
@@ -114,28 +129,90 @@ def run_pipeline(
all_stadiums = generate_stadiums_from_teams()
print(f" Generated {len(all_stadiums)} stadiums from team data")
# Scrape NBA
# Scrape all sports with multi-source fallback
print_section(f"NBA {season}")
nba_games = scrape_nba_basketball_reference(season)
nba_sources = [
ScraperSource('Basketball-Reference', scrape_nba_basketball_reference, priority=1, min_games=500),
ScraperSource('ESPN', scrape_nba_espn, priority=2, min_games=500),
ScraperSource('CBS Sports', scrape_nba_cbssports, priority=3, min_games=100),
]
nba_games = scrape_with_fallback('NBA', season, nba_sources)
nba_season = f"{season-1}-{str(season)[2:]}"
nba_games = assign_stable_ids(nba_games, 'NBA', nba_season)
all_games.extend(nba_games)
print(f" Scraped {len(nba_games)} NBA games")
# Scrape MLB
print_section(f"MLB {season}")
mlb_games = scrape_mlb_statsapi(season)
mlb_sources = [
ScraperSource('MLB Stats API', scrape_mlb_statsapi, priority=1, min_games=1000),
ScraperSource('Baseball-Reference', scrape_mlb_baseball_reference, priority=2, min_games=500),
ScraperSource('ESPN', scrape_mlb_espn, priority=3, min_games=500),
]
mlb_games = scrape_with_fallback('MLB', season, mlb_sources)
mlb_games = assign_stable_ids(mlb_games, 'MLB', str(season))
all_games.extend(mlb_games)
print(f" Scraped {len(mlb_games)} MLB games")
# Scrape NHL
print_section(f"NHL {season}")
nhl_games = scrape_nhl_hockey_reference(season)
nhl_sources = [
ScraperSource('Hockey-Reference', scrape_nhl_hockey_reference, priority=1, min_games=500),
ScraperSource('ESPN', scrape_nhl_espn, priority=2, min_games=500),
ScraperSource('NHL API', scrape_nhl_api, priority=3, min_games=100),
]
nhl_games = scrape_with_fallback('NHL', season, nhl_sources)
nhl_season = f"{season-1}-{str(season)[2:]}"
nhl_games = assign_stable_ids(nhl_games, 'NHL', nhl_season)
all_games.extend(nhl_games)
print(f" Scraped {len(nhl_games)} NHL games")
print_section(f"NFL {season}")
nfl_sources = [
ScraperSource('ESPN', scrape_nfl_espn, priority=1, min_games=200),
ScraperSource('Pro-Football-Reference', scrape_nfl_pro_football_reference, priority=2, min_games=200),
ScraperSource('CBS Sports', scrape_nfl_cbssports, priority=3, min_games=100),
]
nfl_games = scrape_with_fallback('NFL', season, nfl_sources)
nfl_season = f"{season-1}-{str(season)[2:]}"
nfl_games = assign_stable_ids(nfl_games, 'NFL', nfl_season)
all_games.extend(nfl_games)
print_section(f"WNBA {season}")
wnba_sources = [
ScraperSource('ESPN', scrape_wnba_espn, priority=1, min_games=100),
ScraperSource('Basketball-Reference', scrape_wnba_basketball_reference, priority=2, min_games=100),
ScraperSource('CBS Sports', scrape_wnba_cbssports, priority=3, min_games=50),
]
wnba_games = scrape_with_fallback('WNBA', season, wnba_sources)
wnba_games = assign_stable_ids(wnba_games, 'WNBA', str(season))
all_games.extend(wnba_games)
print_section(f"MLS {season}")
mls_sources = [
ScraperSource('ESPN', scrape_mls_espn, priority=1, min_games=200),
ScraperSource('FBref', scrape_mls_fbref, priority=2, min_games=100),
ScraperSource('MLSSoccer.com', scrape_mls_mlssoccer, priority=3, min_games=100),
]
mls_games = scrape_with_fallback('MLS', season, mls_sources)
mls_games = assign_stable_ids(mls_games, 'MLS', str(season))
all_games.extend(mls_games)
print_section(f"NWSL {season}")
nwsl_sources = [
ScraperSource('ESPN', scrape_nwsl_espn, priority=1, min_games=100),
ScraperSource('FBref', scrape_nwsl_fbref, priority=2, min_games=50),
ScraperSource('NWSL.com', scrape_nwsl_nwslsoccer, priority=3, min_games=50),
]
nwsl_games = scrape_with_fallback('NWSL', season, nwsl_sources)
nwsl_games = assign_stable_ids(nwsl_games, 'NWSL', str(season))
all_games.extend(nwsl_games)
print_section(f"CBB {season}")
cbb_sources = [
ScraperSource('ESPN', scrape_cbb_espn, priority=1, min_games=1000),
ScraperSource('Sports-Reference', scrape_cbb_sports_reference, priority=2, min_games=500),
ScraperSource('CBS Sports', scrape_cbb_cbssports, priority=3, min_games=300),
]
cbb_games = scrape_with_fallback('CBB', season, cbb_sources)
cbb_season = f"{season-1}-{str(season)[2:]}"
cbb_games = assign_stable_ids(cbb_games, 'CBB', cbb_season)
all_games.extend(cbb_games)
# Export raw data
print_section("Exporting Raw Data")
@@ -148,16 +225,36 @@ def run_pipeline(
else:
print_header("LOADING EXISTING RAW DATA")
games_file = output_dir / 'games.json'
stadiums_file = output_dir / 'stadiums.json'
# Try loading from new structure first (games/*.json)
games_dir = output_dir / 'games'
raw_games = []
with open(games_file) as f:
raw_games = json.load(f)
print(f" Loaded {len(raw_games)} raw games")
if games_dir.exists() and any(games_dir.glob('*.json')):
print_section("Loading from games/ directory")
for games_file in sorted(games_dir.glob('*.json')):
with open(games_file) as f:
file_games = json.load(f)
raw_games.extend(file_games)
print(f" Loaded {len(file_games):,} games from {games_file.name}")
else:
# Fallback to legacy games.json
print_section("Loading from legacy games.json")
games_file = output_dir / 'games.json'
with open(games_file) as f:
raw_games = json.load(f)
with open(stadiums_file) as f:
raw_stadiums = json.load(f)
print(f" Loaded {len(raw_stadiums)} raw stadiums")
print(f" Total: {len(raw_games):,} raw games")
# Try loading stadiums from canonical/ first, then legacy
canonical_dir = output_dir / 'canonical'
if (canonical_dir / 'stadiums.json').exists():
with open(canonical_dir / 'stadiums.json') as f:
raw_stadiums = json.load(f)
print(f" Loaded {len(raw_stadiums)} raw stadiums from canonical/stadiums.json")
else:
with open(output_dir / 'stadiums.json') as f:
raw_stadiums = json.load(f)
print(f" Loaded {len(raw_stadiums)} raw stadiums from stadiums.json")
# =========================================================================
# STAGE 2: CANONICALIZE STADIUMS
@@ -242,13 +339,32 @@ def run_pipeline(
for issue, count in by_issue.items():
print(f" - {issue}: {count}")
# Export
games_canonical_path = output_dir / 'games_canonical.json'
# Export games to new structure: canonical/games/{sport}_{season}.json
canonical_games_dir = output_dir / 'canonical' / 'games'
canonical_games_dir.mkdir(parents=True, exist_ok=True)
# Group games by sport and season
games_by_sport_season = {}
for game in canonical_games_list:
sport = game.sport.lower()
season = game.season
key = f"{sport}_{season}"
if key not in games_by_sport_season:
games_by_sport_season[key] = []
games_by_sport_season[key].append(game)
# Export each sport/season file
for key, sport_games in sorted(games_by_sport_season.items()):
filepath = canonical_games_dir / f"{key}.json"
with open(filepath, 'w') as f:
json.dump([asdict(g) for g in sport_games], f, indent=2)
print(f" Exported {len(sport_games):,} games to canonical/games/{key}.json")
# Also export combined games_canonical.json for backward compatibility
games_canonical_path = output_dir / 'games_canonical.json'
with open(games_canonical_path, 'w') as f:
json.dump([asdict(g) for g in canonical_games_list], f, indent=2)
print(f" Exported to {games_canonical_path}")
print(f" Exported combined to {games_canonical_path}")
# =========================================================================
# STAGE 5: VALIDATE
@@ -320,7 +436,8 @@ def run_pipeline(
print(f" - {output_dir / 'stadiums_canonical.json'}")
print(f" - {output_dir / 'stadium_aliases.json'}")
print(f" - {output_dir / 'teams_canonical.json'}")
print(f" - {output_dir / 'games_canonical.json'}")
print(f" - {output_dir / 'games_canonical.json'} (combined)")
print(f" - {output_dir / 'canonical' / 'games' / '*.json'} (by sport/season)")
print(f" - {output_dir / 'canonicalization_validation.json'}")
print()