diff --git a/Scripts/run_pipeline.py b/Scripts/run_pipeline.py
index c34fceb..734f660 100755
--- a/Scripts/run_pipeline.py
+++ b/Scripts/run_pipeline.py
@@ -42,7 +42,7 @@ from scrape_schedules import (
     # NWSL sources
     scrape_nwsl_espn, scrape_nwsl_fbref, scrape_nwsl_nwslsoccer,
     # Utilities
-    generate_stadiums_from_teams,
+    scrape_all_stadiums,
 )
 from validate_data import (
     validate_games,
@@ -129,7 +129,7 @@ def run_pipeline(
 
     # Scrape stadiums
     print_section("Stadiums")
-    all_stadiums = generate_stadiums_from_teams()
+    all_stadiums = scrape_all_stadiums()
     print(f" Generated {len(all_stadiums)} stadiums from team data")
 
     # Scrape by sport with multi-source fallback
diff --git a/Scripts/scrape_schedules.py b/Scripts/scrape_schedules.py
index 35f325a..bc455f2 100644
--- a/Scripts/scrape_schedules.py
+++ b/Scripts/scrape_schedules.py
@@ -12,8 +12,6 @@ This script coordinates scraping across sport-specific modules:
 - wnba.py: WNBA stadiums
 - nwsl.py: NWSL stadiums
 
-CBB (College Basketball) remains inline pending extraction (350+ D1 teams).
-
 Usage:
     python scrape_schedules.py --sport nba --season 2026
     python scrape_schedules.py --sport all --season 2026
@@ -93,7 +91,6 @@ from nwsl import (
 # =============================================================================
 # NON-CORE SPORT SCRAPERS
 # NOTE: MLS, WNBA, NWSL stadiums are now imported from their respective modules
-# TODO: Extract CBB to separate module (350+ D1 teams requires separate scoped phase)
 # =============================================================================
 
 def _scrape_espn_schedule(sport: str, league: str, season: int, date_range: tuple[str, str]) -> list[Game]:
@@ -106,7 +103,6 @@ def _scrape_espn_schedule(sport: str, league: str, season: int, date_range: tupl
         'wnba': 'WNBA',
         'usa.1': 'MLS',
         'usa.nwsl': 'NWSL',
-        'mens-college-basketball': 'CBB'
     }.get(league, league.upper())
 
     print(f"Fetching {sport_upper} {season} from ESPN API...")
@@ -213,13 +209,6 @@ def scrape_nwsl_espn(season: int) -> list[Game]:
     return _scrape_espn_schedule('soccer', 'usa.nwsl', season, (start, end))
 
 
-def scrape_cbb_espn(season: int) -> list[Game]:
-    """Fetch College Basketball schedule from ESPN API (D1 only)."""
-    start = f"{season-1}1101"
-    end = f"{season}0415"
-    return _scrape_espn_schedule('basketball', 'mens-college-basketball', season, (start, end))
-
-
 def scrape_wnba_basketball_reference(season: int) -> list[Game]:
     """Scrape WNBA schedule from Basketball-Reference."""
     games = []
@@ -339,42 +328,6 @@ def scrape_nwsl_nwslsoccer(season: int) -> list[Game]:
     return games
 
 
-def scrape_cbb_sports_reference(season: int) -> list[Game]:
-    """Scrape College Basketball schedule from Sports-Reference."""
-    games = []
-    print(f"Scraping CBB {season} from Sports-Reference...")
-    # Placeholder - Sports-Reference scraping would go here
-    print(f" Found {len(games)} games from Sports-Reference")
-    return games
-
-
-def scrape_cbb_cbssports(season: int) -> list[Game]:
-    """Fetch College Basketball schedule from CBS Sports."""
-    games = []
-    print(f"Fetching CBB {season} from CBS Sports...")
-    # Placeholder - CBS Sports scraping would go here
-    print(f" Found {len(games)} games from CBS Sports")
-    return games
-
-
-# =============================================================================
-# NON-CORE STADIUM SCRAPERS
-# NOTE: scrape_mls_stadiums() is now imported from mls.py
-# NOTE: scrape_wnba_stadiums() is now imported from wnba.py
-# NOTE: scrape_nwsl_stadiums() is now imported from nwsl.py
-# TODO: Extract CBB to separate module (350+ D1 teams requires separate scoped phase)
-# =============================================================================
-
-def scrape_cbb_stadiums() -> list[Stadium]:
-    """Fetch College Basketball arena data."""
-    print("\nCBB STADIUMS")
-    print("-" * 40)
-    stadiums = []
-    # Would include CBB arena data here
-    print(f" Found {len(stadiums)} CBB arenas")
-    return stadiums
-
-
 # =============================================================================
 # LEGACY STADIUM FUNCTIONS
 # =============================================================================
@@ -407,7 +360,6 @@ def scrape_all_stadiums() -> list[Stadium]:
     all_stadiums.extend(scrape_mls_stadiums())
     all_stadiums.extend(scrape_wnba_stadiums())
     all_stadiums.extend(scrape_nwsl_stadiums())
-    all_stadiums.extend(scrape_cbb_stadiums())
 
     return all_stadiums
 
@@ -444,7 +396,7 @@ def get_team_abbrev(team_name: str, sport: str) -> str:
 
 def main():
     parser = argparse.ArgumentParser(description='Scrape sports schedules')
-    parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], default='all')
+    parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'all'], default='all')
     parser.add_argument('--season', type=int, default=2026, help='Season year (ending year)')
    parser.add_argument('--stadiums-only', action='store_true', help='Only scrape stadium data (legacy method)')
     parser.add_argument('--stadiums-update', action='store_true', help='Scrape ALL stadium data for all 8 sports (comprehensive)')
@@ -550,20 +502,6 @@ def main():
         nwsl_games = assign_stable_ids(nwsl_games, 'NWSL', str(args.season))
         all_games.extend(nwsl_games)
 
-    if args.sport in ['cbb', 'all']:
-        print("\n" + "="*60)
-        print(f"SCRAPING CBB {args.season}")
-        print("="*60)
-        cbb_sources = [
-            ScraperSource('ESPN', scrape_cbb_espn, priority=1, min_games=1000),
-            ScraperSource('Sports-Reference', scrape_cbb_sports_reference, priority=2, min_games=500),
-            ScraperSource('CBS Sports', scrape_cbb_cbssports, priority=3, min_games=300),
-        ]
-        cbb_games = scrape_with_fallback('CBB', args.season, cbb_sources)
-        cbb_season = f"{args.season-1}-{str(args.season)[2:]}"
-        cbb_games = assign_stable_ids(cbb_games, 'CBB', cbb_season)
-        all_games.extend(cbb_games)
-
     # Export
     print("\n" + "="*60)
     print("EXPORTING DATA")
diff --git a/Scripts/sportstime.py b/Scripts/sportstime.py
index 53a11d4..2071754 100755
--- a/Scripts/sportstime.py
+++ b/Scripts/sportstime.py
@@ -123,11 +123,11 @@ def get_season_and_sport(mode: str):
     print(f"\n{Colors.BOLD}Select sport:{Colors.RESET}")
     print(f" {Colors.GREEN}[1]{Colors.RESET} All Sports")
     print(f" {Colors.GREEN}[2]{Colors.RESET} MLB {Colors.GREEN}[3]{Colors.RESET} NBA {Colors.GREEN}[4]{Colors.RESET} NHL {Colors.GREEN}[5]{Colors.RESET} NFL")
-    print(f" {Colors.GREEN}[6]{Colors.RESET} MLS {Colors.GREEN}[7]{Colors.RESET} WNBA {Colors.GREEN}[8]{Colors.RESET} NWSL {Colors.GREEN}[9]{Colors.RESET} CBB")
+    print(f" {Colors.GREEN}[6]{Colors.RESET} MLS {Colors.GREEN}[7]{Colors.RESET} WNBA {Colors.GREEN}[8]{Colors.RESET} NWSL")
 
     sport_map = {
         '1': 'all', '2': 'mlb', '3': 'nba', '4': 'nhl', '5': 'nfl',
-        '6': 'mls', '7': 'wnba', '8': 'nwsl', '9': 'cbb'
+        '6': 'mls', '7': 'wnba', '8': 'nwsl'
     }
     sport_choice = input(f"{Colors.CYAN}Enter choice [1]:{Colors.RESET} ").strip()
 
@@ -156,7 +156,6 @@ def scrape_submenu():
         ('6', 'mls', 'MLS - Major League Soccer'),
         ('7', 'wnba', 'WNBA - Women\'s National Basketball Association'),
         ('8', 'nwsl', 'NWSL - National Women\'s Soccer League'),
-        ('9', 'cbb', 'CBB - College Basketball'),
         ('b', 'back', 'Back to main menu'),
     ]
 
@@ -235,7 +234,6 @@ def pipeline_submenu():
         ('6', 'mls', 'MLS only'),
         ('7', 'wnba', 'WNBA only'),
         ('8', 'nwsl', 'NWSL only'),
-        ('9', 'cbb', 'CBB only'),
         ('b', 'back', 'Back to main menu'),
     ]
 
@@ -624,7 +622,7 @@ Examples:
     )
     scrape_parser.add_argument(
         '--sport',
-        choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'],
+        choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'all'],
         default='all',
         help='Sport to scrape (default: all)'
     )
@@ -653,7 +651,7 @@
     stadiums_update_parser = subparsers.add_parser(
         'stadiums-update',
         help='Scrape ALL stadium data for all 8 sports',
-        description='Comprehensive stadium scraping for NBA, MLB, NHL, NFL, WNBA, MLS, NWSL, and CBB'
+        description='Comprehensive stadium scraping for NBA, MLB, NHL, NFL, WNBA, MLS, and NWSL'
     )
     stadiums_update_parser.add_argument(
         '--output',
@@ -893,7 +891,7 @@ Examples:
     )
     pipeline_parser.add_argument(
         '--sport',
-        choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'],
+        choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'all'],
         default='all',
         help='Sport to process (default: all)'
     )
@@ -934,7 +932,7 @@ Examples:
     )
     full_pipeline_parser.add_argument(
         '--sport',
-        choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'],
+        choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'all'],
         default='all',
         help='Sport to process (default: all)'
     )
diff --git a/Scripts/validate_canonical.py b/Scripts/validate_canonical.py
index bb76630..6b4fe68 100644
--- a/Scripts/validate_canonical.py
+++ b/Scripts/validate_canonical.py
@@ -91,8 +91,6 @@ EXPECTED_GAMES = {
         'max': 30,
         'description': 'NWSL regular season (26 games)'
     },
-    # Note: CBB doesn't have fixed game counts per "team"
-    # CBB teams vary widely (30+ games)
 }
 
 
diff --git a/Scripts/validate_data.py b/Scripts/validate_data.py
index adb4514..1b2032f 100644
--- a/Scripts/validate_data.py
+++ b/Scripts/validate_data.py
@@ -27,7 +27,7 @@ from nfl import scrape_nfl_espn, NFL_TEAMS
 
 # Import secondary sports from scrape_schedules (stubs)
 from scrape_schedules import (
-    scrape_wnba_espn, scrape_mls_espn, scrape_nwsl_espn, scrape_cbb_espn,
+    scrape_wnba_espn, scrape_mls_espn, scrape_nwsl_espn,
     WNBA_TEAMS, MLS_TEAMS, NWSL_TEAMS,
 )
 
@@ -474,7 +474,7 @@ def main():
     parser.add_argument('--data-dir', type=str, default='./data', help='Data directory')
     parser.add_argument('--scrape-and-validate', action='store_true', help='Scrape fresh and validate')
     parser.add_argument('--season', type=int, default=2025, help='Season year')
-    parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], default='all')
+    parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'all'], default='all')
     parser.add_argument('--output', type=str, default='./data/validation_report.json')
 
     args = parser.parse_args()