Remove CFB/NASCAR/PGA and streamline to 8 supported sports

- Remove College Football, NASCAR, and PGA from scraper and app
- Clean all data files (stadiums, games, pipeline reports)
- Update Sport.swift enum and all UI components
- Add sportstime.py CLI tool for pipeline management
- Add DATA_SCRAPING.md documentation
- Add WNBA/MLS/NWSL implementation documentation
- Scraper now supports: NBA, MLB, NHL, NFL, WNBA, MLS, NWSL, CBB

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Author: Trey t
Date: 2026-01-09 23:22:13 -06:00
Parent: f5e509a9ae
Commit: 8790d2ad73
35 changed files with 117819 additions and 65871 deletions

View File

@@ -31,9 +31,24 @@ from dataclasses import dataclass, asdict
# Import pipeline components
from scrape_schedules import (
scrape_nba_basketball_reference,
scrape_mlb_statsapi,
scrape_nhl_hockey_reference,
ScraperSource, scrape_with_fallback,
# NBA sources
scrape_nba_basketball_reference, scrape_nba_espn, scrape_nba_cbssports,
# MLB sources
scrape_mlb_statsapi, scrape_mlb_baseball_reference, scrape_mlb_espn,
# NHL sources
scrape_nhl_hockey_reference, scrape_nhl_espn, scrape_nhl_api,
# NFL sources
scrape_nfl_espn, scrape_nfl_pro_football_reference, scrape_nfl_cbssports,
# WNBA sources
scrape_wnba_espn, scrape_wnba_basketball_reference, scrape_wnba_cbssports,
# MLS sources
scrape_mls_espn, scrape_mls_fbref, scrape_mls_mlssoccer,
# NWSL sources
scrape_nwsl_espn, scrape_nwsl_fbref, scrape_nwsl_nwslsoccer,
# CBB sources
scrape_cbb_espn, scrape_cbb_sports_reference, scrape_cbb_cbssports,
# Utilities
generate_stadiums_from_teams,
assign_stable_ids,
export_to_json,
@@ -114,28 +129,90 @@ def run_pipeline(
all_stadiums = generate_stadiums_from_teams()
print(f" Generated {len(all_stadiums)} stadiums from team data")
# Scrape NBA
# Scrape all sports with multi-source fallback
print_section(f"NBA {season}")
nba_games = scrape_nba_basketball_reference(season)
nba_sources = [
ScraperSource('Basketball-Reference', scrape_nba_basketball_reference, priority=1, min_games=500),
ScraperSource('ESPN', scrape_nba_espn, priority=2, min_games=500),
ScraperSource('CBS Sports', scrape_nba_cbssports, priority=3, min_games=100),
]
nba_games = scrape_with_fallback('NBA', season, nba_sources)
nba_season = f"{season-1}-{str(season)[2:]}"
nba_games = assign_stable_ids(nba_games, 'NBA', nba_season)
all_games.extend(nba_games)
print(f" Scraped {len(nba_games)} NBA games")
# Scrape MLB
print_section(f"MLB {season}")
mlb_games = scrape_mlb_statsapi(season)
mlb_sources = [
ScraperSource('MLB Stats API', scrape_mlb_statsapi, priority=1, min_games=1000),
ScraperSource('Baseball-Reference', scrape_mlb_baseball_reference, priority=2, min_games=500),
ScraperSource('ESPN', scrape_mlb_espn, priority=3, min_games=500),
]
mlb_games = scrape_with_fallback('MLB', season, mlb_sources)
mlb_games = assign_stable_ids(mlb_games, 'MLB', str(season))
all_games.extend(mlb_games)
print(f" Scraped {len(mlb_games)} MLB games")
# Scrape NHL
print_section(f"NHL {season}")
nhl_games = scrape_nhl_hockey_reference(season)
nhl_sources = [
ScraperSource('Hockey-Reference', scrape_nhl_hockey_reference, priority=1, min_games=500),
ScraperSource('ESPN', scrape_nhl_espn, priority=2, min_games=500),
ScraperSource('NHL API', scrape_nhl_api, priority=3, min_games=100),
]
nhl_games = scrape_with_fallback('NHL', season, nhl_sources)
nhl_season = f"{season-1}-{str(season)[2:]}"
nhl_games = assign_stable_ids(nhl_games, 'NHL', nhl_season)
all_games.extend(nhl_games)
print(f" Scraped {len(nhl_games)} NHL games")
print_section(f"NFL {season}")
nfl_sources = [
ScraperSource('ESPN', scrape_nfl_espn, priority=1, min_games=200),
ScraperSource('Pro-Football-Reference', scrape_nfl_pro_football_reference, priority=2, min_games=200),
ScraperSource('CBS Sports', scrape_nfl_cbssports, priority=3, min_games=100),
]
nfl_games = scrape_with_fallback('NFL', season, nfl_sources)
nfl_season = f"{season-1}-{str(season)[2:]}"
nfl_games = assign_stable_ids(nfl_games, 'NFL', nfl_season)
all_games.extend(nfl_games)
print_section(f"WNBA {season}")
wnba_sources = [
ScraperSource('ESPN', scrape_wnba_espn, priority=1, min_games=100),
ScraperSource('Basketball-Reference', scrape_wnba_basketball_reference, priority=2, min_games=100),
ScraperSource('CBS Sports', scrape_wnba_cbssports, priority=3, min_games=50),
]
wnba_games = scrape_with_fallback('WNBA', season, wnba_sources)
wnba_games = assign_stable_ids(wnba_games, 'WNBA', str(season))
all_games.extend(wnba_games)
print_section(f"MLS {season}")
mls_sources = [
ScraperSource('ESPN', scrape_mls_espn, priority=1, min_games=200),
ScraperSource('FBref', scrape_mls_fbref, priority=2, min_games=100),
ScraperSource('MLSSoccer.com', scrape_mls_mlssoccer, priority=3, min_games=100),
]
mls_games = scrape_with_fallback('MLS', season, mls_sources)
mls_games = assign_stable_ids(mls_games, 'MLS', str(season))
all_games.extend(mls_games)
print_section(f"NWSL {season}")
nwsl_sources = [
ScraperSource('ESPN', scrape_nwsl_espn, priority=1, min_games=100),
ScraperSource('FBref', scrape_nwsl_fbref, priority=2, min_games=50),
ScraperSource('NWSL.com', scrape_nwsl_nwslsoccer, priority=3, min_games=50),
]
nwsl_games = scrape_with_fallback('NWSL', season, nwsl_sources)
nwsl_games = assign_stable_ids(nwsl_games, 'NWSL', str(season))
all_games.extend(nwsl_games)
print_section(f"CBB {season}")
cbb_sources = [
ScraperSource('ESPN', scrape_cbb_espn, priority=1, min_games=1000),
ScraperSource('Sports-Reference', scrape_cbb_sports_reference, priority=2, min_games=500),
ScraperSource('CBS Sports', scrape_cbb_cbssports, priority=3, min_games=300),
]
cbb_games = scrape_with_fallback('CBB', season, cbb_sources)
cbb_season = f"{season-1}-{str(season)[2:]}"
cbb_games = assign_stable_ids(cbb_games, 'CBB', cbb_season)
all_games.extend(cbb_games)
# Export raw data
print_section("Exporting Raw Data")
@@ -148,16 +225,36 @@ def run_pipeline(
else:
print_header("LOADING EXISTING RAW DATA")
games_file = output_dir / 'games.json'
stadiums_file = output_dir / 'stadiums.json'
# Try loading from new structure first (games/*.json)
games_dir = output_dir / 'games'
raw_games = []
with open(games_file) as f:
raw_games = json.load(f)
print(f" Loaded {len(raw_games)} raw games")
if games_dir.exists() and any(games_dir.glob('*.json')):
print_section("Loading from games/ directory")
for games_file in sorted(games_dir.glob('*.json')):
with open(games_file) as f:
file_games = json.load(f)
raw_games.extend(file_games)
print(f" Loaded {len(file_games):,} games from {games_file.name}")
else:
# Fallback to legacy games.json
print_section("Loading from legacy games.json")
games_file = output_dir / 'games.json'
with open(games_file) as f:
raw_games = json.load(f)
with open(stadiums_file) as f:
raw_stadiums = json.load(f)
print(f" Loaded {len(raw_stadiums)} raw stadiums")
print(f" Total: {len(raw_games):,} raw games")
# Try loading stadiums from canonical/ first, then legacy
canonical_dir = output_dir / 'canonical'
if (canonical_dir / 'stadiums.json').exists():
with open(canonical_dir / 'stadiums.json') as f:
raw_stadiums = json.load(f)
print(f" Loaded {len(raw_stadiums)} raw stadiums from canonical/stadiums.json")
else:
with open(output_dir / 'stadiums.json') as f:
raw_stadiums = json.load(f)
print(f" Loaded {len(raw_stadiums)} raw stadiums from stadiums.json")
# =========================================================================
# STAGE 2: CANONICALIZE STADIUMS
@@ -242,13 +339,32 @@ def run_pipeline(
for issue, count in by_issue.items():
print(f" - {issue}: {count}")
# Export
games_canonical_path = output_dir / 'games_canonical.json'
# Export games to new structure: canonical/games/{sport}_{season}.json
canonical_games_dir = output_dir / 'canonical' / 'games'
canonical_games_dir.mkdir(parents=True, exist_ok=True)
# Group games by sport and season
games_by_sport_season = {}
for game in canonical_games_list:
sport = game.sport.lower()
season = game.season
key = f"{sport}_{season}"
if key not in games_by_sport_season:
games_by_sport_season[key] = []
games_by_sport_season[key].append(game)
# Export each sport/season file
for key, sport_games in sorted(games_by_sport_season.items()):
filepath = canonical_games_dir / f"{key}.json"
with open(filepath, 'w') as f:
json.dump([asdict(g) for g in sport_games], f, indent=2)
print(f" Exported {len(sport_games):,} games to canonical/games/{key}.json")
# Also export combined games_canonical.json for backward compatibility
games_canonical_path = output_dir / 'games_canonical.json'
with open(games_canonical_path, 'w') as f:
json.dump([asdict(g) for g in canonical_games_list], f, indent=2)
print(f" Exported to {games_canonical_path}")
print(f" Exported combined to {games_canonical_path}")
# =========================================================================
# STAGE 5: VALIDATE
@@ -320,7 +436,8 @@ def run_pipeline(
print(f" - {output_dir / 'stadiums_canonical.json'}")
print(f" - {output_dir / 'stadium_aliases.json'}")
print(f" - {output_dir / 'teams_canonical.json'}")
print(f" - {output_dir / 'games_canonical.json'}")
print(f" - {output_dir / 'games_canonical.json'} (combined)")
print(f" - {output_dir / 'canonical' / 'games' / '*.json'} (by sport/season)")
print(f" - {output_dir / 'canonicalization_validation.json'}")
print()