remove cbb

This commit is contained in:
Trey t
2026-01-10 11:16:15 -06:00
parent ca9fa535f1
commit 9ef4b1a770
5 changed files with 11 additions and 77 deletions

View File

@@ -42,7 +42,7 @@ from scrape_schedules import (
# NWSL sources # NWSL sources
scrape_nwsl_espn, scrape_nwsl_fbref, scrape_nwsl_nwslsoccer, scrape_nwsl_espn, scrape_nwsl_fbref, scrape_nwsl_nwslsoccer,
# Utilities # Utilities
generate_stadiums_from_teams, scrape_all_stadiums,
) )
from validate_data import ( from validate_data import (
validate_games, validate_games,
@@ -129,7 +129,7 @@ def run_pipeline(
# Scrape stadiums # Scrape stadiums
print_section("Stadiums") print_section("Stadiums")
all_stadiums = generate_stadiums_from_teams() all_stadiums = scrape_all_stadiums()
print(f" Generated {len(all_stadiums)} stadiums from team data") print(f" Generated {len(all_stadiums)} stadiums from team data")
# Scrape by sport with multi-source fallback # Scrape by sport with multi-source fallback

View File

@@ -12,8 +12,6 @@ This script coordinates scraping across sport-specific modules:
- wnba.py: WNBA stadiums - wnba.py: WNBA stadiums
- nwsl.py: NWSL stadiums - nwsl.py: NWSL stadiums
CBB (College Basketball) remains inline pending extraction (350+ D1 teams).
Usage: Usage:
python scrape_schedules.py --sport nba --season 2026 python scrape_schedules.py --sport nba --season 2026
python scrape_schedules.py --sport all --season 2026 python scrape_schedules.py --sport all --season 2026
@@ -93,7 +91,6 @@ from nwsl import (
# ============================================================================= # =============================================================================
# NON-CORE SPORT SCRAPERS # NON-CORE SPORT SCRAPERS
# NOTE: MLS, WNBA, NWSL stadiums are now imported from their respective modules # NOTE: MLS, WNBA, NWSL stadiums are now imported from their respective modules
# TODO: Extract CBB to separate module (350+ D1 teams requires separate scoped phase)
# ============================================================================= # =============================================================================
def _scrape_espn_schedule(sport: str, league: str, season: int, date_range: tuple[str, str]) -> list[Game]: def _scrape_espn_schedule(sport: str, league: str, season: int, date_range: tuple[str, str]) -> list[Game]:
@@ -106,7 +103,6 @@ def _scrape_espn_schedule(sport: str, league: str, season: int, date_range: tupl
'wnba': 'WNBA', 'wnba': 'WNBA',
'usa.1': 'MLS', 'usa.1': 'MLS',
'usa.nwsl': 'NWSL', 'usa.nwsl': 'NWSL',
'mens-college-basketball': 'CBB'
}.get(league, league.upper()) }.get(league, league.upper())
print(f"Fetching {sport_upper} {season} from ESPN API...") print(f"Fetching {sport_upper} {season} from ESPN API...")
@@ -213,13 +209,6 @@ def scrape_nwsl_espn(season: int) -> list[Game]:
return _scrape_espn_schedule('soccer', 'usa.nwsl', season, (start, end)) return _scrape_espn_schedule('soccer', 'usa.nwsl', season, (start, end))
def scrape_cbb_espn(season: int) -> list[Game]:
"""Fetch College Basketball schedule from ESPN API (D1 only)."""
start = f"{season-1}1101"
end = f"{season}0415"
return _scrape_espn_schedule('basketball', 'mens-college-basketball', season, (start, end))
def scrape_wnba_basketball_reference(season: int) -> list[Game]: def scrape_wnba_basketball_reference(season: int) -> list[Game]:
"""Scrape WNBA schedule from Basketball-Reference.""" """Scrape WNBA schedule from Basketball-Reference."""
games = [] games = []
@@ -339,42 +328,6 @@ def scrape_nwsl_nwslsoccer(season: int) -> list[Game]:
return games return games
def scrape_cbb_sports_reference(season: int) -> list[Game]:
"""Scrape College Basketball schedule from Sports-Reference."""
games = []
print(f"Scraping CBB {season} from Sports-Reference...")
# Placeholder - Sports-Reference scraping would go here
print(f" Found {len(games)} games from Sports-Reference")
return games
def scrape_cbb_cbssports(season: int) -> list[Game]:
"""Fetch College Basketball schedule from CBS Sports."""
games = []
print(f"Fetching CBB {season} from CBS Sports...")
# Placeholder - CBS Sports scraping would go here
print(f" Found {len(games)} games from CBS Sports")
return games
# =============================================================================
# NON-CORE STADIUM SCRAPERS
# NOTE: scrape_mls_stadiums() is now imported from mls.py
# NOTE: scrape_wnba_stadiums() is now imported from wnba.py
# NOTE: scrape_nwsl_stadiums() is now imported from nwsl.py
# TODO: Extract CBB to separate module (350+ D1 teams requires separate scoped phase)
# =============================================================================
def scrape_cbb_stadiums() -> list[Stadium]:
"""Fetch College Basketball arena data."""
print("\nCBB STADIUMS")
print("-" * 40)
stadiums = []
# Would include CBB arena data here
print(f" Found {len(stadiums)} CBB arenas")
return stadiums
# ============================================================================= # =============================================================================
# LEGACY STADIUM FUNCTIONS # LEGACY STADIUM FUNCTIONS
# ============================================================================= # =============================================================================
@@ -407,7 +360,6 @@ def scrape_all_stadiums() -> list[Stadium]:
all_stadiums.extend(scrape_mls_stadiums()) all_stadiums.extend(scrape_mls_stadiums())
all_stadiums.extend(scrape_wnba_stadiums()) all_stadiums.extend(scrape_wnba_stadiums())
all_stadiums.extend(scrape_nwsl_stadiums()) all_stadiums.extend(scrape_nwsl_stadiums())
all_stadiums.extend(scrape_cbb_stadiums())
return all_stadiums return all_stadiums
@@ -444,7 +396,7 @@ def get_team_abbrev(team_name: str, sport: str) -> str:
def main(): def main():
parser = argparse.ArgumentParser(description='Scrape sports schedules') parser = argparse.ArgumentParser(description='Scrape sports schedules')
parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], default='all') parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'all'], default='all')
parser.add_argument('--season', type=int, default=2026, help='Season year (ending year)') parser.add_argument('--season', type=int, default=2026, help='Season year (ending year)')
parser.add_argument('--stadiums-only', action='store_true', help='Only scrape stadium data (legacy method)') parser.add_argument('--stadiums-only', action='store_true', help='Only scrape stadium data (legacy method)')
parser.add_argument('--stadiums-update', action='store_true', help='Scrape ALL stadium data for all 8 sports (comprehensive)') parser.add_argument('--stadiums-update', action='store_true', help='Scrape ALL stadium data for all 8 sports (comprehensive)')
@@ -550,20 +502,6 @@ def main():
nwsl_games = assign_stable_ids(nwsl_games, 'NWSL', str(args.season)) nwsl_games = assign_stable_ids(nwsl_games, 'NWSL', str(args.season))
all_games.extend(nwsl_games) all_games.extend(nwsl_games)
if args.sport in ['cbb', 'all']:
print("\n" + "="*60)
print(f"SCRAPING CBB {args.season}")
print("="*60)
cbb_sources = [
ScraperSource('ESPN', scrape_cbb_espn, priority=1, min_games=1000),
ScraperSource('Sports-Reference', scrape_cbb_sports_reference, priority=2, min_games=500),
ScraperSource('CBS Sports', scrape_cbb_cbssports, priority=3, min_games=300),
]
cbb_games = scrape_with_fallback('CBB', args.season, cbb_sources)
cbb_season = f"{args.season-1}-{str(args.season)[2:]}"
cbb_games = assign_stable_ids(cbb_games, 'CBB', cbb_season)
all_games.extend(cbb_games)
# Export # Export
print("\n" + "="*60) print("\n" + "="*60)
print("EXPORTING DATA") print("EXPORTING DATA")

View File

@@ -123,11 +123,11 @@ def get_season_and_sport(mode: str):
print(f"\n{Colors.BOLD}Select sport:{Colors.RESET}") print(f"\n{Colors.BOLD}Select sport:{Colors.RESET}")
print(f" {Colors.GREEN}[1]{Colors.RESET} All Sports") print(f" {Colors.GREEN}[1]{Colors.RESET} All Sports")
print(f" {Colors.GREEN}[2]{Colors.RESET} MLB {Colors.GREEN}[3]{Colors.RESET} NBA {Colors.GREEN}[4]{Colors.RESET} NHL {Colors.GREEN}[5]{Colors.RESET} NFL") print(f" {Colors.GREEN}[2]{Colors.RESET} MLB {Colors.GREEN}[3]{Colors.RESET} NBA {Colors.GREEN}[4]{Colors.RESET} NHL {Colors.GREEN}[5]{Colors.RESET} NFL")
print(f" {Colors.GREEN}[6]{Colors.RESET} MLS {Colors.GREEN}[7]{Colors.RESET} WNBA {Colors.GREEN}[8]{Colors.RESET} NWSL {Colors.GREEN}[9]{Colors.RESET} CBB") print(f" {Colors.GREEN}[6]{Colors.RESET} MLS {Colors.GREEN}[7]{Colors.RESET} WNBA {Colors.GREEN}[8]{Colors.RESET} NWSL")
sport_map = { sport_map = {
'1': 'all', '2': 'mlb', '3': 'nba', '4': 'nhl', '5': 'nfl', '1': 'all', '2': 'mlb', '3': 'nba', '4': 'nhl', '5': 'nfl',
'6': 'mls', '7': 'wnba', '8': 'nwsl', '9': 'cbb' '6': 'mls', '7': 'wnba', '8': 'nwsl'
} }
sport_choice = input(f"{Colors.CYAN}Enter choice [1]:{Colors.RESET} ").strip() sport_choice = input(f"{Colors.CYAN}Enter choice [1]:{Colors.RESET} ").strip()
@@ -156,7 +156,6 @@ def scrape_submenu():
('6', 'mls', 'MLS - Major League Soccer'), ('6', 'mls', 'MLS - Major League Soccer'),
('7', 'wnba', 'WNBA - Women\'s National Basketball Association'), ('7', 'wnba', 'WNBA - Women\'s National Basketball Association'),
('8', 'nwsl', 'NWSL - National Women\'s Soccer League'), ('8', 'nwsl', 'NWSL - National Women\'s Soccer League'),
('9', 'cbb', 'CBB - College Basketball'),
('b', 'back', 'Back to main menu'), ('b', 'back', 'Back to main menu'),
] ]
@@ -235,7 +234,6 @@ def pipeline_submenu():
('6', 'mls', 'MLS only'), ('6', 'mls', 'MLS only'),
('7', 'wnba', 'WNBA only'), ('7', 'wnba', 'WNBA only'),
('8', 'nwsl', 'NWSL only'), ('8', 'nwsl', 'NWSL only'),
('9', 'cbb', 'CBB only'),
('b', 'back', 'Back to main menu'), ('b', 'back', 'Back to main menu'),
] ]
@@ -624,7 +622,7 @@ Examples:
) )
scrape_parser.add_argument( scrape_parser.add_argument(
'--sport', '--sport',
choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'all'],
default='all', default='all',
help='Sport to scrape (default: all)' help='Sport to scrape (default: all)'
) )
@@ -653,7 +651,7 @@ Examples:
stadiums_update_parser = subparsers.add_parser( stadiums_update_parser = subparsers.add_parser(
'stadiums-update', 'stadiums-update',
help='Scrape ALL stadium data for all 8 sports', help='Scrape ALL stadium data for all 8 sports',
description='Comprehensive stadium scraping for NBA, MLB, NHL, NFL, WNBA, MLS, NWSL, and CBB' description='Comprehensive stadium scraping for NBA, MLB, NHL, NFL, WNBA, MLS, and NWSL'
) )
stadiums_update_parser.add_argument( stadiums_update_parser.add_argument(
'--output', '--output',
@@ -893,7 +891,7 @@ Examples:
) )
pipeline_parser.add_argument( pipeline_parser.add_argument(
'--sport', '--sport',
choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'all'],
default='all', default='all',
help='Sport to process (default: all)' help='Sport to process (default: all)'
) )
@@ -934,7 +932,7 @@ Examples:
) )
full_pipeline_parser.add_argument( full_pipeline_parser.add_argument(
'--sport', '--sport',
choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'all'],
default='all', default='all',
help='Sport to process (default: all)' help='Sport to process (default: all)'
) )

View File

@@ -91,8 +91,6 @@ EXPECTED_GAMES = {
'max': 30, 'max': 30,
'description': 'NWSL regular season (26 games)' 'description': 'NWSL regular season (26 games)'
}, },
# Note: CBB doesn't have fixed game counts per "team"
# CBB teams vary widely (30+ games)
} }

View File

@@ -27,7 +27,7 @@ from nfl import scrape_nfl_espn, NFL_TEAMS
# Import secondary sports from scrape_schedules (stubs) # Import secondary sports from scrape_schedules (stubs)
from scrape_schedules import ( from scrape_schedules import (
scrape_wnba_espn, scrape_mls_espn, scrape_nwsl_espn, scrape_cbb_espn, scrape_wnba_espn, scrape_mls_espn, scrape_nwsl_espn,
WNBA_TEAMS, MLS_TEAMS, NWSL_TEAMS, WNBA_TEAMS, MLS_TEAMS, NWSL_TEAMS,
) )
@@ -474,7 +474,7 @@ def main():
parser.add_argument('--data-dir', type=str, default='./data', help='Data directory') parser.add_argument('--data-dir', type=str, default='./data', help='Data directory')
parser.add_argument('--scrape-and-validate', action='store_true', help='Scrape fresh and validate') parser.add_argument('--scrape-and-validate', action='store_true', help='Scrape fresh and validate')
parser.add_argument('--season', type=int, default=2025, help='Season year') parser.add_argument('--season', type=int, default=2025, help='Season year')
parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], default='all') parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'all'], default='all')
parser.add_argument('--output', type=str, default='./data/validation_report.json') parser.add_argument('--output', type=str, default='./data/validation_report.json')
args = parser.parse_args() args = parser.parse_args()