Remove CFB/NASCAR/PGA and streamline to 8 supported sports
- Remove College Football, NASCAR, and PGA from scraper and app
- Clean all data files (stadiums, games, pipeline reports)
- Update Sport.swift enum and all UI components
- Add sportstime.py CLI tool for pipeline management
- Add DATA_SCRAPING.md documentation
- Add WNBA/MLS/NWSL implementation documentation
- Scraper now supports: NBA, MLB, NHL, NFL, WNBA, MLS, NWSL, CBB

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -31,9 +31,24 @@ from dataclasses import dataclass, asdict
|
||||
|
||||
# Import pipeline components
|
||||
from scrape_schedules import (
|
||||
scrape_nba_basketball_reference,
|
||||
scrape_mlb_statsapi,
|
||||
scrape_nhl_hockey_reference,
|
||||
ScraperSource, scrape_with_fallback,
|
||||
# NBA sources
|
||||
scrape_nba_basketball_reference, scrape_nba_espn, scrape_nba_cbssports,
|
||||
# MLB sources
|
||||
scrape_mlb_statsapi, scrape_mlb_baseball_reference, scrape_mlb_espn,
|
||||
# NHL sources
|
||||
scrape_nhl_hockey_reference, scrape_nhl_espn, scrape_nhl_api,
|
||||
# NFL sources
|
||||
scrape_nfl_espn, scrape_nfl_pro_football_reference, scrape_nfl_cbssports,
|
||||
# WNBA sources
|
||||
scrape_wnba_espn, scrape_wnba_basketball_reference, scrape_wnba_cbssports,
|
||||
# MLS sources
|
||||
scrape_mls_espn, scrape_mls_fbref, scrape_mls_mlssoccer,
|
||||
# NWSL sources
|
||||
scrape_nwsl_espn, scrape_nwsl_fbref, scrape_nwsl_nwslsoccer,
|
||||
# CBB sources
|
||||
scrape_cbb_espn, scrape_cbb_sports_reference, scrape_cbb_cbssports,
|
||||
# Utilities
|
||||
generate_stadiums_from_teams,
|
||||
assign_stable_ids,
|
||||
export_to_json,
|
||||
@@ -114,28 +129,90 @@ def run_pipeline(
|
||||
all_stadiums = generate_stadiums_from_teams()
|
||||
print(f" Generated {len(all_stadiums)} stadiums from team data")
|
||||
|
||||
# Scrape NBA
|
||||
# Scrape all sports with multi-source fallback
|
||||
print_section(f"NBA {season}")
|
||||
nba_games = scrape_nba_basketball_reference(season)
|
||||
nba_sources = [
|
||||
ScraperSource('Basketball-Reference', scrape_nba_basketball_reference, priority=1, min_games=500),
|
||||
ScraperSource('ESPN', scrape_nba_espn, priority=2, min_games=500),
|
||||
ScraperSource('CBS Sports', scrape_nba_cbssports, priority=3, min_games=100),
|
||||
]
|
||||
nba_games = scrape_with_fallback('NBA', season, nba_sources)
|
||||
nba_season = f"{season-1}-{str(season)[2:]}"
|
||||
nba_games = assign_stable_ids(nba_games, 'NBA', nba_season)
|
||||
all_games.extend(nba_games)
|
||||
print(f" Scraped {len(nba_games)} NBA games")
|
||||
|
||||
# Scrape MLB
|
||||
print_section(f"MLB {season}")
|
||||
mlb_games = scrape_mlb_statsapi(season)
|
||||
mlb_sources = [
|
||||
ScraperSource('MLB Stats API', scrape_mlb_statsapi, priority=1, min_games=1000),
|
||||
ScraperSource('Baseball-Reference', scrape_mlb_baseball_reference, priority=2, min_games=500),
|
||||
ScraperSource('ESPN', scrape_mlb_espn, priority=3, min_games=500),
|
||||
]
|
||||
mlb_games = scrape_with_fallback('MLB', season, mlb_sources)
|
||||
mlb_games = assign_stable_ids(mlb_games, 'MLB', str(season))
|
||||
all_games.extend(mlb_games)
|
||||
print(f" Scraped {len(mlb_games)} MLB games")
|
||||
|
||||
# Scrape NHL
|
||||
print_section(f"NHL {season}")
|
||||
nhl_games = scrape_nhl_hockey_reference(season)
|
||||
nhl_sources = [
|
||||
ScraperSource('Hockey-Reference', scrape_nhl_hockey_reference, priority=1, min_games=500),
|
||||
ScraperSource('ESPN', scrape_nhl_espn, priority=2, min_games=500),
|
||||
ScraperSource('NHL API', scrape_nhl_api, priority=3, min_games=100),
|
||||
]
|
||||
nhl_games = scrape_with_fallback('NHL', season, nhl_sources)
|
||||
nhl_season = f"{season-1}-{str(season)[2:]}"
|
||||
nhl_games = assign_stable_ids(nhl_games, 'NHL', nhl_season)
|
||||
all_games.extend(nhl_games)
|
||||
print(f" Scraped {len(nhl_games)} NHL games")
|
||||
|
||||
print_section(f"NFL {season}")
|
||||
nfl_sources = [
|
||||
ScraperSource('ESPN', scrape_nfl_espn, priority=1, min_games=200),
|
||||
ScraperSource('Pro-Football-Reference', scrape_nfl_pro_football_reference, priority=2, min_games=200),
|
||||
ScraperSource('CBS Sports', scrape_nfl_cbssports, priority=3, min_games=100),
|
||||
]
|
||||
nfl_games = scrape_with_fallback('NFL', season, nfl_sources)
|
||||
nfl_season = f"{season-1}-{str(season)[2:]}"
|
||||
nfl_games = assign_stable_ids(nfl_games, 'NFL', nfl_season)
|
||||
all_games.extend(nfl_games)
|
||||
|
||||
print_section(f"WNBA {season}")
|
||||
wnba_sources = [
|
||||
ScraperSource('ESPN', scrape_wnba_espn, priority=1, min_games=100),
|
||||
ScraperSource('Basketball-Reference', scrape_wnba_basketball_reference, priority=2, min_games=100),
|
||||
ScraperSource('CBS Sports', scrape_wnba_cbssports, priority=3, min_games=50),
|
||||
]
|
||||
wnba_games = scrape_with_fallback('WNBA', season, wnba_sources)
|
||||
wnba_games = assign_stable_ids(wnba_games, 'WNBA', str(season))
|
||||
all_games.extend(wnba_games)
|
||||
|
||||
print_section(f"MLS {season}")
|
||||
mls_sources = [
|
||||
ScraperSource('ESPN', scrape_mls_espn, priority=1, min_games=200),
|
||||
ScraperSource('FBref', scrape_mls_fbref, priority=2, min_games=100),
|
||||
ScraperSource('MLSSoccer.com', scrape_mls_mlssoccer, priority=3, min_games=100),
|
||||
]
|
||||
mls_games = scrape_with_fallback('MLS', season, mls_sources)
|
||||
mls_games = assign_stable_ids(mls_games, 'MLS', str(season))
|
||||
all_games.extend(mls_games)
|
||||
|
||||
print_section(f"NWSL {season}")
|
||||
nwsl_sources = [
|
||||
ScraperSource('ESPN', scrape_nwsl_espn, priority=1, min_games=100),
|
||||
ScraperSource('FBref', scrape_nwsl_fbref, priority=2, min_games=50),
|
||||
ScraperSource('NWSL.com', scrape_nwsl_nwslsoccer, priority=3, min_games=50),
|
||||
]
|
||||
nwsl_games = scrape_with_fallback('NWSL', season, nwsl_sources)
|
||||
nwsl_games = assign_stable_ids(nwsl_games, 'NWSL', str(season))
|
||||
all_games.extend(nwsl_games)
|
||||
|
||||
print_section(f"CBB {season}")
|
||||
cbb_sources = [
|
||||
ScraperSource('ESPN', scrape_cbb_espn, priority=1, min_games=1000),
|
||||
ScraperSource('Sports-Reference', scrape_cbb_sports_reference, priority=2, min_games=500),
|
||||
ScraperSource('CBS Sports', scrape_cbb_cbssports, priority=3, min_games=300),
|
||||
]
|
||||
cbb_games = scrape_with_fallback('CBB', season, cbb_sources)
|
||||
cbb_season = f"{season-1}-{str(season)[2:]}"
|
||||
cbb_games = assign_stable_ids(cbb_games, 'CBB', cbb_season)
|
||||
all_games.extend(cbb_games)
|
||||
|
||||
# Export raw data
|
||||
print_section("Exporting Raw Data")
|
||||
@@ -148,16 +225,36 @@ def run_pipeline(
|
||||
else:
|
||||
print_header("LOADING EXISTING RAW DATA")
|
||||
|
||||
games_file = output_dir / 'games.json'
|
||||
stadiums_file = output_dir / 'stadiums.json'
|
||||
# Try loading from new structure first (games/*.json)
|
||||
games_dir = output_dir / 'games'
|
||||
raw_games = []
|
||||
|
||||
with open(games_file) as f:
|
||||
raw_games = json.load(f)
|
||||
print(f" Loaded {len(raw_games)} raw games")
|
||||
if games_dir.exists() and any(games_dir.glob('*.json')):
|
||||
print_section("Loading from games/ directory")
|
||||
for games_file in sorted(games_dir.glob('*.json')):
|
||||
with open(games_file) as f:
|
||||
file_games = json.load(f)
|
||||
raw_games.extend(file_games)
|
||||
print(f" Loaded {len(file_games):,} games from {games_file.name}")
|
||||
else:
|
||||
# Fallback to legacy games.json
|
||||
print_section("Loading from legacy games.json")
|
||||
games_file = output_dir / 'games.json'
|
||||
with open(games_file) as f:
|
||||
raw_games = json.load(f)
|
||||
|
||||
with open(stadiums_file) as f:
|
||||
raw_stadiums = json.load(f)
|
||||
print(f" Loaded {len(raw_stadiums)} raw stadiums")
|
||||
print(f" Total: {len(raw_games):,} raw games")
|
||||
|
||||
# Try loading stadiums from canonical/ first, then legacy
|
||||
canonical_dir = output_dir / 'canonical'
|
||||
if (canonical_dir / 'stadiums.json').exists():
|
||||
with open(canonical_dir / 'stadiums.json') as f:
|
||||
raw_stadiums = json.load(f)
|
||||
print(f" Loaded {len(raw_stadiums)} raw stadiums from canonical/stadiums.json")
|
||||
else:
|
||||
with open(output_dir / 'stadiums.json') as f:
|
||||
raw_stadiums = json.load(f)
|
||||
print(f" Loaded {len(raw_stadiums)} raw stadiums from stadiums.json")
|
||||
|
||||
# =========================================================================
|
||||
# STAGE 2: CANONICALIZE STADIUMS
|
||||
@@ -242,13 +339,32 @@ def run_pipeline(
|
||||
for issue, count in by_issue.items():
|
||||
print(f" - {issue}: {count}")
|
||||
|
||||
# Export
|
||||
games_canonical_path = output_dir / 'games_canonical.json'
|
||||
# Export games to new structure: canonical/games/{sport}_{season}.json
|
||||
canonical_games_dir = output_dir / 'canonical' / 'games'
|
||||
canonical_games_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Group games by sport and season
|
||||
games_by_sport_season = {}
|
||||
for game in canonical_games_list:
|
||||
sport = game.sport.lower()
|
||||
season = game.season
|
||||
key = f"{sport}_{season}"
|
||||
if key not in games_by_sport_season:
|
||||
games_by_sport_season[key] = []
|
||||
games_by_sport_season[key].append(game)
|
||||
|
||||
# Export each sport/season file
|
||||
for key, sport_games in sorted(games_by_sport_season.items()):
|
||||
filepath = canonical_games_dir / f"{key}.json"
|
||||
with open(filepath, 'w') as f:
|
||||
json.dump([asdict(g) for g in sport_games], f, indent=2)
|
||||
print(f" Exported {len(sport_games):,} games to canonical/games/{key}.json")
|
||||
|
||||
# Also export combined games_canonical.json for backward compatibility
|
||||
games_canonical_path = output_dir / 'games_canonical.json'
|
||||
with open(games_canonical_path, 'w') as f:
|
||||
json.dump([asdict(g) for g in canonical_games_list], f, indent=2)
|
||||
|
||||
print(f" Exported to {games_canonical_path}")
|
||||
print(f" Exported combined to {games_canonical_path}")
|
||||
|
||||
# =========================================================================
|
||||
# STAGE 5: VALIDATE
|
||||
@@ -320,7 +436,8 @@ def run_pipeline(
|
||||
print(f" - {output_dir / 'stadiums_canonical.json'}")
|
||||
print(f" - {output_dir / 'stadium_aliases.json'}")
|
||||
print(f" - {output_dir / 'teams_canonical.json'}")
|
||||
print(f" - {output_dir / 'games_canonical.json'}")
|
||||
print(f" - {output_dir / 'games_canonical.json'} (combined)")
|
||||
print(f" - {output_dir / 'canonical' / 'games' / '*.json'} (by sport/season)")
|
||||
print(f" - {output_dir / 'canonicalization_validation.json'}")
|
||||
print()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user