Remove CFB/NASCAR/PGA and streamline to 8 supported sports
- Remove College Football, NASCAR, and PGA from scraper and app
- Clean all data files (stadiums, games, pipeline reports)
- Update Sport.swift enum and all UI components
- Add sportstime.py CLI tool for pipeline management
- Add DATA_SCRAPING.md documentation
- Add WNBA/MLS/NWSL implementation documentation
- Scraper now supports: NBA, MLB, NHL, NFL, WNBA, MLS, NWSL, CBB

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -23,10 +23,24 @@ from enum import Enum
|
||||
|
||||
# Import our modules
|
||||
from scrape_schedules import (
|
||||
Game, Stadium,
|
||||
scrape_nba_basketball_reference,
|
||||
scrape_mlb_statsapi, scrape_mlb_baseball_reference,
|
||||
scrape_nhl_hockey_reference,
|
||||
Game, Stadium, ScraperSource, scrape_with_fallback,
|
||||
# NBA sources
|
||||
scrape_nba_basketball_reference, scrape_nba_espn, scrape_nba_cbssports,
|
||||
# MLB sources
|
||||
scrape_mlb_statsapi, scrape_mlb_baseball_reference, scrape_mlb_espn,
|
||||
# NHL sources
|
||||
scrape_nhl_hockey_reference, scrape_nhl_espn, scrape_nhl_api,
|
||||
# NFL sources
|
||||
scrape_nfl_espn, scrape_nfl_pro_football_reference, scrape_nfl_cbssports,
|
||||
# WNBA sources
|
||||
scrape_wnba_espn, scrape_wnba_basketball_reference, scrape_wnba_cbssports,
|
||||
# MLS sources
|
||||
scrape_mls_espn, scrape_mls_fbref, scrape_mls_mlssoccer,
|
||||
# NWSL sources
|
||||
scrape_nwsl_espn, scrape_nwsl_fbref, scrape_nwsl_nwslsoccer,
|
||||
# CBB sources
|
||||
scrape_cbb_espn, scrape_cbb_sports_reference, scrape_cbb_cbssports,
|
||||
# Utilities
|
||||
generate_stadiums_from_teams,
|
||||
export_to_json,
|
||||
assign_stable_ids,
|
||||
@@ -119,10 +133,15 @@ def run_pipeline(
|
||||
all_stadiums = generate_stadiums_from_teams()
|
||||
print(f" Generated {len(all_stadiums)} stadiums from team data")
|
||||
|
||||
# Scrape by sport
|
||||
# Scrape by sport with multi-source fallback
|
||||
if sport in ['nba', 'all']:
|
||||
print_section(f"NBA {season}")
|
||||
nba_games = scrape_nba_basketball_reference(season)
|
||||
nba_sources = [
|
||||
ScraperSource('Basketball-Reference', scrape_nba_basketball_reference, priority=1, min_games=500),
|
||||
ScraperSource('ESPN', scrape_nba_espn, priority=2, min_games=500),
|
||||
ScraperSource('CBS Sports', scrape_nba_cbssports, priority=3, min_games=100),
|
||||
]
|
||||
nba_games = scrape_with_fallback('NBA', season, nba_sources)
|
||||
nba_season = f"{season-1}-{str(season)[2:]}"
|
||||
nba_games = assign_stable_ids(nba_games, 'NBA', nba_season)
|
||||
all_games.extend(nba_games)
|
||||
@@ -130,19 +149,91 @@ def run_pipeline(
|
||||
|
||||
if sport in ['mlb', 'all']:
|
||||
print_section(f"MLB {season}")
|
||||
mlb_games = scrape_mlb_statsapi(season)
|
||||
# MLB API uses official gamePk - already stable
|
||||
mlb_sources = [
|
||||
ScraperSource('MLB Stats API', scrape_mlb_statsapi, priority=1, min_games=1000),
|
||||
ScraperSource('Baseball-Reference', scrape_mlb_baseball_reference, priority=2, min_games=500),
|
||||
ScraperSource('ESPN', scrape_mlb_espn, priority=3, min_games=500),
|
||||
]
|
||||
mlb_games = scrape_with_fallback('MLB', season, mlb_sources)
|
||||
mlb_games = assign_stable_ids(mlb_games, 'MLB', str(season))
|
||||
all_games.extend(mlb_games)
|
||||
games_by_sport['MLB'] = len(mlb_games)
|
||||
|
||||
if sport in ['nhl', 'all']:
|
||||
print_section(f"NHL {season}")
|
||||
nhl_games = scrape_nhl_hockey_reference(season)
|
||||
nhl_sources = [
|
||||
ScraperSource('Hockey-Reference', scrape_nhl_hockey_reference, priority=1, min_games=500),
|
||||
ScraperSource('ESPN', scrape_nhl_espn, priority=2, min_games=500),
|
||||
ScraperSource('NHL API', scrape_nhl_api, priority=3, min_games=100),
|
||||
]
|
||||
nhl_games = scrape_with_fallback('NHL', season, nhl_sources)
|
||||
nhl_season = f"{season-1}-{str(season)[2:]}"
|
||||
nhl_games = assign_stable_ids(nhl_games, 'NHL', nhl_season)
|
||||
all_games.extend(nhl_games)
|
||||
games_by_sport['NHL'] = len(nhl_games)
|
||||
|
||||
if sport in ['nfl', 'all']:
|
||||
print_section(f"NFL {season}")
|
||||
nfl_sources = [
|
||||
ScraperSource('ESPN', scrape_nfl_espn, priority=1, min_games=200),
|
||||
ScraperSource('Pro-Football-Reference', scrape_nfl_pro_football_reference, priority=2, min_games=200),
|
||||
ScraperSource('CBS Sports', scrape_nfl_cbssports, priority=3, min_games=100),
|
||||
]
|
||||
nfl_games = scrape_with_fallback('NFL', season, nfl_sources)
|
||||
nfl_season = f"{season-1}-{str(season)[2:]}"
|
||||
nfl_games = assign_stable_ids(nfl_games, 'NFL', nfl_season)
|
||||
all_games.extend(nfl_games)
|
||||
games_by_sport['NFL'] = len(nfl_games)
|
||||
|
||||
if sport in ['wnba', 'all']:
|
||||
print_section(f"WNBA {season}")
|
||||
wnba_sources = [
|
||||
ScraperSource('ESPN', scrape_wnba_espn, priority=1, min_games=100),
|
||||
ScraperSource('Basketball-Reference', scrape_wnba_basketball_reference, priority=2, min_games=100),
|
||||
ScraperSource('CBS Sports', scrape_wnba_cbssports, priority=3, min_games=50),
|
||||
]
|
||||
wnba_games = scrape_with_fallback('WNBA', season, wnba_sources)
|
||||
wnba_games = assign_stable_ids(wnba_games, 'WNBA', str(season))
|
||||
all_games.extend(wnba_games)
|
||||
games_by_sport['WNBA'] = len(wnba_games)
|
||||
|
||||
if sport in ['mls', 'all']:
|
||||
print_section(f"MLS {season}")
|
||||
mls_sources = [
|
||||
ScraperSource('ESPN', scrape_mls_espn, priority=1, min_games=200),
|
||||
ScraperSource('FBref', scrape_mls_fbref, priority=2, min_games=100),
|
||||
ScraperSource('MLSSoccer.com', scrape_mls_mlssoccer, priority=3, min_games=100),
|
||||
]
|
||||
mls_games = scrape_with_fallback('MLS', season, mls_sources)
|
||||
mls_games = assign_stable_ids(mls_games, 'MLS', str(season))
|
||||
all_games.extend(mls_games)
|
||||
games_by_sport['MLS'] = len(mls_games)
|
||||
|
||||
if sport in ['nwsl', 'all']:
|
||||
print_section(f"NWSL {season}")
|
||||
nwsl_sources = [
|
||||
ScraperSource('ESPN', scrape_nwsl_espn, priority=1, min_games=100),
|
||||
ScraperSource('FBref', scrape_nwsl_fbref, priority=2, min_games=50),
|
||||
ScraperSource('NWSL.com', scrape_nwsl_nwslsoccer, priority=3, min_games=50),
|
||||
]
|
||||
nwsl_games = scrape_with_fallback('NWSL', season, nwsl_sources)
|
||||
nwsl_games = assign_stable_ids(nwsl_games, 'NWSL', str(season))
|
||||
all_games.extend(nwsl_games)
|
||||
games_by_sport['NWSL'] = len(nwsl_games)
|
||||
|
||||
if sport in ['cbb', 'all']:
|
||||
print_section(f"CBB {season}")
|
||||
cbb_sources = [
|
||||
ScraperSource('ESPN', scrape_cbb_espn, priority=1, min_games=1000),
|
||||
ScraperSource('Sports-Reference', scrape_cbb_sports_reference, priority=2, min_games=500),
|
||||
ScraperSource('CBS Sports', scrape_cbb_cbssports, priority=3, min_games=300),
|
||||
]
|
||||
cbb_games = scrape_with_fallback('CBB', season, cbb_sources)
|
||||
cbb_season = f"{season-1}-{str(season)[2:]}"
|
||||
cbb_games = assign_stable_ids(cbb_games, 'CBB', cbb_season)
|
||||
all_games.extend(cbb_games)
|
||||
games_by_sport['CBB'] = len(cbb_games)
|
||||
|
||||
# Export data
|
||||
print_section("Exporting Data")
|
||||
export_to_json(all_games, all_stadiums, output_dir)
|
||||
@@ -233,6 +324,17 @@ def run_pipeline(
|
||||
if count < 75 or count > 90:
|
||||
print(f" NHL: {team} has {count} games (expected ~82)")
|
||||
|
||||
if sport in ['nfl', 'all']:
|
||||
nfl_games = [g for g in all_games if g.sport == 'NFL']
|
||||
team_counts = {}
|
||||
for g in nfl_games:
|
||||
team_counts[g.home_team_abbrev] = team_counts.get(g.home_team_abbrev, 0) + 1
|
||||
team_counts[g.away_team_abbrev] = team_counts.get(g.away_team_abbrev, 0) + 1
|
||||
|
||||
for team, count in sorted(team_counts.items()):
|
||||
if count < 15 or count > 20:
|
||||
print(f" NFL: {team} has {count} games (expected ~17)")
|
||||
|
||||
# =========================================================================
|
||||
# PHASE 3: GENERATE REPORT
|
||||
# =========================================================================
|
||||
@@ -396,7 +498,7 @@ Examples:
|
||||
help='Season year (default: 2025)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--sport', choices=['nba', 'mlb', 'nhl', 'all'], default='all',
|
||||
'--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], default='all',
|
||||
help='Sport to process (default: all)'
|
||||
)
|
||||
parser.add_argument(
|
||||
|
||||
Reference in New Issue
Block a user