- Import MLS_TEAMS, get_mls_team_abbrev, scrape_mls_stadiums from mls.py
- Remove inline MLS_TEAMS dict (now imported from module)
- Remove inline MLS stadium scraper functions (now in mls.py)
- Update TODO comments to reflect MLS extraction complete

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
631 lines
22 KiB
Python
631 lines
22 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Sports Schedule Scraper Orchestrator
|
|
|
|
This script coordinates scraping across sport-specific modules:
|
|
- core.py: Shared utilities, data classes, fallback system
|
|
- mlb.py: MLB scrapers
|
|
- nba.py: NBA scrapers
|
|
- nhl.py: NHL scrapers
|
|
- nfl.py: NFL scrapers
|
|
|
|
Non-core sports (WNBA, NWSL, CBB) remain inline pending extraction; MLS has been extracted to mls.py.
|
|
|
|
Usage:
|
|
python scrape_schedules.py --sport nba --season 2026
|
|
python scrape_schedules.py --sport all --season 2026
|
|
python scrape_schedules.py --stadiums-only
|
|
"""
|
|
|
|
import argparse
|
|
import csv
|
|
import json
|
|
import time
|
|
from collections import defaultdict
|
|
from dataclasses import asdict
|
|
from datetime import datetime
|
|
from io import StringIO
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import requests
|
|
|
|
# Import from core module
|
|
from core import (
|
|
Game,
|
|
Stadium,
|
|
ScraperSource,
|
|
StadiumScraperSource,
|
|
fetch_page,
|
|
scrape_with_fallback,
|
|
scrape_stadiums_with_fallback,
|
|
assign_stable_ids,
|
|
export_to_json,
|
|
)
|
|
|
|
# Import from sport modules (core 4 sports)
|
|
from mlb import (
|
|
scrape_mlb_games,
|
|
scrape_mlb_stadiums,
|
|
MLB_TEAMS,
|
|
)
|
|
from nba import (
|
|
scrape_nba_games,
|
|
scrape_nba_stadiums,
|
|
get_nba_season_string,
|
|
NBA_TEAMS,
|
|
)
|
|
from nhl import (
|
|
scrape_nhl_games,
|
|
scrape_nhl_stadiums,
|
|
get_nhl_season_string,
|
|
NHL_TEAMS,
|
|
)
|
|
from nfl import (
|
|
scrape_nfl_games,
|
|
scrape_nfl_stadiums,
|
|
get_nfl_season_string,
|
|
NFL_TEAMS,
|
|
)
|
|
from mls import (
|
|
MLS_TEAMS,
|
|
get_mls_team_abbrev,
|
|
scrape_mls_stadiums,
|
|
MLS_STADIUM_SOURCES,
|
|
)
|
|
|
|
|
|
# =============================================================================
|
|
# NON-CORE SPORT TEAM MAPPINGS
|
|
# TODO: Extract to separate modules (wnba.py, nwsl.py, cbb.py)
|
|
# NOTE: MLS_TEAMS is now imported from mls.py
|
|
# =============================================================================
|
|
|
|
WNBA_TEAMS = {
|
|
'ATL': {'name': 'Atlanta Dream', 'city': 'Atlanta', 'arena': 'Gateway Center Arena'},
|
|
'CHI': {'name': 'Chicago Sky', 'city': 'Chicago', 'arena': 'Wintrust Arena'},
|
|
'CON': {'name': 'Connecticut Sun', 'city': 'Uncasville', 'arena': 'Mohegan Sun Arena'},
|
|
'DAL': {'name': 'Dallas Wings', 'city': 'Arlington', 'arena': 'College Park Center'},
|
|
'GSV': {'name': 'Golden State Valkyries', 'city': 'San Francisco', 'arena': 'Chase Center'},
|
|
'IND': {'name': 'Indiana Fever', 'city': 'Indianapolis', 'arena': 'Gainbridge Fieldhouse'},
|
|
'LVA': {'name': 'Las Vegas Aces', 'city': 'Las Vegas', 'arena': 'Michelob Ultra Arena'},
|
|
'LA': {'name': 'Los Angeles Sparks', 'city': 'Los Angeles', 'arena': 'Crypto.com Arena'},
|
|
'MIN': {'name': 'Minnesota Lynx', 'city': 'Minneapolis', 'arena': 'Target Center'},
|
|
'NY': {'name': 'New York Liberty', 'city': 'Brooklyn', 'arena': 'Barclays Center'},
|
|
'PHO': {'name': 'Phoenix Mercury', 'city': 'Phoenix', 'arena': 'Footprint Center'},
|
|
'SEA': {'name': 'Seattle Storm', 'city': 'Seattle', 'arena': 'Climate Pledge Arena'},
|
|
'WAS': {'name': 'Washington Mystics', 'city': 'Washington', 'arena': 'Entertainment & Sports Arena'},
|
|
}
|
|
|
|
NWSL_TEAMS = {
|
|
'LA': {'name': 'Angel City FC', 'city': 'Los Angeles', 'stadium': 'BMO Stadium'},
|
|
'SJ': {'name': 'Bay FC', 'city': 'San Jose', 'stadium': 'PayPal Park'},
|
|
'CHI': {'name': 'Chicago Red Stars', 'city': 'Bridgeview', 'stadium': 'SeatGeek Stadium'},
|
|
'HOU': {'name': 'Houston Dash', 'city': 'Houston', 'stadium': 'Shell Energy Stadium'},
|
|
'KC': {'name': 'Kansas City Current', 'city': 'Kansas City', 'stadium': 'CPKC Stadium'},
|
|
'NJ': {'name': 'NJ/NY Gotham FC', 'city': 'Harrison', 'stadium': 'Red Bull Arena'},
|
|
'NC': {'name': 'North Carolina Courage', 'city': 'Cary', 'stadium': 'WakeMed Soccer Park'},
|
|
'ORL': {'name': 'Orlando Pride', 'city': 'Orlando', 'stadium': 'Inter&Co Stadium'},
|
|
'POR': {'name': 'Portland Thorns FC', 'city': 'Portland', 'stadium': 'Providence Park'},
|
|
'SEA': {'name': 'Seattle Reign FC', 'city': 'Seattle', 'stadium': 'Lumen Field'},
|
|
'SD': {'name': 'San Diego Wave FC', 'city': 'San Diego', 'stadium': 'Snapdragon Stadium'},
|
|
'UTA': {'name': 'Utah Royals FC', 'city': 'Sandy', 'stadium': 'America First Field'},
|
|
'WAS': {'name': 'Washington Spirit', 'city': 'Washington', 'stadium': 'Audi Field'},
|
|
}
|
|
|
|
|
|
# =============================================================================
|
|
# NON-CORE SPORT SCRAPERS
|
|
# TODO: Extract to separate modules (wnba.py, nwsl.py, cbb.py) — MLS extraction complete
|
|
# =============================================================================
|
|
|
|
def _scrape_espn_schedule(sport: str, league: str, season: int, date_range: tuple[str, str]) -> list[Game]:
|
|
"""
|
|
Fetch schedule from ESPN API.
|
|
Shared helper for non-core sports that use ESPN API.
|
|
"""
|
|
games = []
|
|
sport_upper = {
|
|
'wnba': 'WNBA',
|
|
'usa.1': 'MLS',
|
|
'usa.nwsl': 'NWSL',
|
|
'mens-college-basketball': 'CBB'
|
|
}.get(league, league.upper())
|
|
|
|
print(f"Fetching {sport_upper} {season} from ESPN API...")
|
|
|
|
url = f"https://site.api.espn.com/apis/site/v2/sports/{sport}/{league}/scoreboard"
|
|
params = {
|
|
'dates': f"{date_range[0]}-{date_range[1]}",
|
|
'limit': 1000
|
|
}
|
|
|
|
headers = {
|
|
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
|
|
}
|
|
|
|
try:
|
|
response = requests.get(url, params=params, headers=headers, timeout=30)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
|
|
events = data.get('events', [])
|
|
|
|
for event in events:
|
|
try:
|
|
date_str = event.get('date', '')[:10]
|
|
time_str = event.get('date', '')[11:16] if len(event.get('date', '')) > 11 else None
|
|
|
|
competitions = event.get('competitions', [{}])
|
|
if not competitions:
|
|
continue
|
|
|
|
comp = competitions[0]
|
|
competitors = comp.get('competitors', [])
|
|
|
|
if len(competitors) < 2:
|
|
continue
|
|
|
|
home_team = None
|
|
away_team = None
|
|
home_abbrev = None
|
|
away_abbrev = None
|
|
|
|
for team in competitors:
|
|
team_data = team.get('team', {})
|
|
team_name = team_data.get('displayName', team_data.get('name', ''))
|
|
team_abbrev = team_data.get('abbreviation', '')
|
|
|
|
if team.get('homeAway') == 'home':
|
|
home_team = team_name
|
|
home_abbrev = team_abbrev
|
|
else:
|
|
away_team = team_name
|
|
away_abbrev = team_abbrev
|
|
|
|
if not home_team or not away_team:
|
|
continue
|
|
|
|
venue = comp.get('venue', {}).get('fullName', '')
|
|
game_id = f"{sport_upper.lower()}_{date_str}_{away_abbrev}_{home_abbrev}".lower()
|
|
|
|
game = Game(
|
|
id=game_id,
|
|
sport=sport_upper,
|
|
season=str(season),
|
|
date=date_str,
|
|
time=time_str,
|
|
home_team=home_team,
|
|
away_team=away_team,
|
|
home_team_abbrev=home_abbrev or get_team_abbrev(home_team, sport_upper),
|
|
away_team_abbrev=away_abbrev or get_team_abbrev(away_team, sport_upper),
|
|
venue=venue,
|
|
source='espn.com'
|
|
)
|
|
games.append(game)
|
|
|
|
except Exception:
|
|
continue
|
|
|
|
print(f" Found {len(games)} games from ESPN")
|
|
|
|
except Exception as e:
|
|
print(f"Error fetching ESPN {sport_upper}: {e}")
|
|
|
|
return games
|
|
|
|
|
|
def scrape_wnba_espn(season: int) -> list[Game]:
|
|
"""Fetch WNBA schedule from ESPN API."""
|
|
start = f"{season}0501"
|
|
end = f"{season}1031"
|
|
return _scrape_espn_schedule('basketball', 'wnba', season, (start, end))
|
|
|
|
|
|
def scrape_mls_espn(season: int) -> list[Game]:
|
|
"""Fetch MLS schedule from ESPN API."""
|
|
start = f"{season}0201"
|
|
end = f"{season}1231"
|
|
return _scrape_espn_schedule('soccer', 'usa.1', season, (start, end))
|
|
|
|
|
|
def scrape_nwsl_espn(season: int) -> list[Game]:
|
|
"""Fetch NWSL schedule from ESPN API."""
|
|
start = f"{season}0301"
|
|
end = f"{season}1130"
|
|
return _scrape_espn_schedule('soccer', 'usa.nwsl', season, (start, end))
|
|
|
|
|
|
def scrape_cbb_espn(season: int) -> list[Game]:
|
|
"""Fetch College Basketball schedule from ESPN API (D1 only)."""
|
|
start = f"{season-1}1101"
|
|
end = f"{season}0415"
|
|
return _scrape_espn_schedule('basketball', 'mens-college-basketball', season, (start, end))
|
|
|
|
|
|
def scrape_wnba_basketball_reference(season: int) -> list[Game]:
|
|
"""Scrape WNBA schedule from Basketball-Reference."""
|
|
games = []
|
|
url = f"https://www.basketball-reference.com/wnba/years/{season}_games.html"
|
|
|
|
print(f"Scraping WNBA {season} from Basketball-Reference...")
|
|
soup = fetch_page(url, 'basketball-reference.com')
|
|
|
|
if not soup:
|
|
return games
|
|
|
|
table = soup.find('table', {'id': 'schedule'})
|
|
if not table:
|
|
return games
|
|
|
|
tbody = table.find('tbody')
|
|
if not tbody:
|
|
return games
|
|
|
|
for row in tbody.find_all('tr'):
|
|
if row.get('class') and 'thead' in row.get('class'):
|
|
continue
|
|
|
|
try:
|
|
date_cell = row.find('th', {'data-stat': 'date_game'})
|
|
if not date_cell:
|
|
continue
|
|
date_link = date_cell.find('a')
|
|
date_str = date_link.text if date_link else date_cell.text
|
|
|
|
visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
|
|
home_cell = row.find('td', {'data-stat': 'home_team_name'})
|
|
|
|
if not visitor_cell or not home_cell:
|
|
continue
|
|
|
|
visitor_link = visitor_cell.find('a')
|
|
home_link = home_cell.find('a')
|
|
|
|
away_team = visitor_link.text if visitor_link else visitor_cell.text
|
|
home_team = home_link.text if home_link else home_cell.text
|
|
|
|
try:
|
|
parsed_date = datetime.strptime(date_str.strip(), '%a, %b %d, %Y')
|
|
date_formatted = parsed_date.strftime('%Y-%m-%d')
|
|
except:
|
|
continue
|
|
|
|
away_abbrev = get_team_abbrev(away_team, 'WNBA')
|
|
home_abbrev = get_team_abbrev(home_team, 'WNBA')
|
|
game_id = f"wnba_{date_formatted}_{away_abbrev}_{home_abbrev}".lower().replace(' ', '')
|
|
|
|
game = Game(
|
|
id=game_id,
|
|
sport='WNBA',
|
|
season=str(season),
|
|
date=date_formatted,
|
|
time=None,
|
|
home_team=home_team,
|
|
away_team=away_team,
|
|
home_team_abbrev=home_abbrev,
|
|
away_team_abbrev=away_abbrev,
|
|
venue='',
|
|
source='basketball-reference.com'
|
|
)
|
|
games.append(game)
|
|
|
|
except Exception:
|
|
continue
|
|
|
|
print(f" Found {len(games)} games from Basketball-Reference")
|
|
return games
|
|
|
|
|
|
def scrape_wnba_cbssports(season: int) -> list[Game]:
|
|
"""Fetch WNBA schedule from CBS Sports."""
|
|
games = []
|
|
print(f"Fetching WNBA {season} from CBS Sports...")
|
|
# Placeholder - CBS Sports scraping would go here
|
|
print(f" Found {len(games)} games from CBS Sports")
|
|
return games
|
|
|
|
|
|
def scrape_mls_fbref(season: int) -> list[Game]:
|
|
"""Scrape MLS schedule from FBref."""
|
|
games = []
|
|
print(f"Scraping MLS {season} from FBref...")
|
|
# Placeholder - FBref scraping would go here
|
|
print(f" Found {len(games)} games from FBref")
|
|
return games
|
|
|
|
|
|
def scrape_mls_mlssoccer(season: int) -> list[Game]:
|
|
"""Scrape MLS schedule from MLSSoccer.com."""
|
|
games = []
|
|
print(f"Scraping MLS {season} from MLSSoccer.com...")
|
|
# Placeholder - MLSSoccer.com scraping would go here
|
|
print(f" Found {len(games)} games from MLSSoccer.com")
|
|
return games
|
|
|
|
|
|
def scrape_nwsl_fbref(season: int) -> list[Game]:
|
|
"""Scrape NWSL schedule from FBref."""
|
|
games = []
|
|
print(f"Scraping NWSL {season} from FBref...")
|
|
# Placeholder - FBref scraping would go here
|
|
print(f" Found {len(games)} games from FBref")
|
|
return games
|
|
|
|
|
|
def scrape_nwsl_nwslsoccer(season: int) -> list[Game]:
|
|
"""Scrape NWSL schedule from NWSL.com."""
|
|
games = []
|
|
print(f"Scraping NWSL {season} from NWSL.com...")
|
|
# Placeholder - NWSL.com scraping would go here
|
|
print(f" Found {len(games)} games from NWSL.com")
|
|
return games
|
|
|
|
|
|
def scrape_cbb_sports_reference(season: int) -> list[Game]:
|
|
"""Scrape College Basketball schedule from Sports-Reference."""
|
|
games = []
|
|
print(f"Scraping CBB {season} from Sports-Reference...")
|
|
# Placeholder - Sports-Reference scraping would go here
|
|
print(f" Found {len(games)} games from Sports-Reference")
|
|
return games
|
|
|
|
|
|
def scrape_cbb_cbssports(season: int) -> list[Game]:
|
|
"""Fetch College Basketball schedule from CBS Sports."""
|
|
games = []
|
|
print(f"Fetching CBB {season} from CBS Sports...")
|
|
# Placeholder - CBS Sports scraping would go here
|
|
print(f" Found {len(games)} games from CBS Sports")
|
|
return games
|
|
|
|
|
|
# =============================================================================
|
|
# NON-CORE STADIUM SCRAPERS
|
|
# TODO: Extract to separate modules (wnba.py, nwsl.py, cbb.py)
|
|
# NOTE: scrape_mls_stadiums() is now imported from mls.py
|
|
# =============================================================================
|
|
|
|
def scrape_wnba_stadiums() -> list[Stadium]:
|
|
"""Fetch WNBA arena data (hardcoded)."""
|
|
print("\nWNBA STADIUMS")
|
|
print("-" * 40)
|
|
stadiums = []
|
|
# Would include WNBA arena data here
|
|
print(f" Found {len(stadiums)} WNBA arenas")
|
|
return stadiums
|
|
|
|
|
|
def scrape_nwsl_stadiums() -> list[Stadium]:
|
|
"""Fetch NWSL stadium data (hardcoded)."""
|
|
print("\nNWSL STADIUMS")
|
|
print("-" * 40)
|
|
stadiums = []
|
|
# Would include NWSL stadium data here
|
|
print(f" Found {len(stadiums)} NWSL stadiums")
|
|
return stadiums
|
|
|
|
|
|
def scrape_cbb_stadiums() -> list[Stadium]:
|
|
"""Fetch College Basketball arena data."""
|
|
print("\nCBB STADIUMS")
|
|
print("-" * 40)
|
|
stadiums = []
|
|
# Would include CBB arena data here
|
|
print(f" Found {len(stadiums)} CBB arenas")
|
|
return stadiums
|
|
|
|
|
|
# =============================================================================
|
|
# LEGACY STADIUM FUNCTIONS
|
|
# =============================================================================
|
|
|
|
def scrape_stadiums_hifld() -> list[Stadium]:
|
|
"""Legacy: Scrape from HIFLD open data."""
|
|
# Placeholder for legacy HIFLD scraping
|
|
return []
|
|
|
|
|
|
def generate_stadiums_from_teams() -> list[Stadium]:
|
|
"""Generate stadium entries from team data with hardcoded coordinates."""
|
|
stadiums = []
|
|
# This function would generate stadiums from all team dictionaries
|
|
# Keeping as placeholder since sport modules have their own stadium scrapers
|
|
return stadiums
|
|
|
|
|
|
def scrape_all_stadiums() -> list[Stadium]:
|
|
"""Comprehensive stadium scraping for all sports."""
|
|
all_stadiums = []
|
|
|
|
# Core sports (from modules)
|
|
all_stadiums.extend(scrape_mlb_stadiums())
|
|
all_stadiums.extend(scrape_nba_stadiums())
|
|
all_stadiums.extend(scrape_nhl_stadiums())
|
|
all_stadiums.extend(scrape_nfl_stadiums())
|
|
|
|
# Non-core sports
|
|
all_stadiums.extend(scrape_mls_stadiums())
|
|
all_stadiums.extend(scrape_wnba_stadiums())
|
|
all_stadiums.extend(scrape_nwsl_stadiums())
|
|
all_stadiums.extend(scrape_cbb_stadiums())
|
|
|
|
return all_stadiums
|
|
|
|
|
|
# =============================================================================
|
|
# HELPERS
|
|
# =============================================================================
|
|
|
|
def get_team_abbrev(team_name: str, sport: str) -> str:
|
|
"""Get team abbreviation from full name."""
|
|
teams = {
|
|
'NBA': NBA_TEAMS,
|
|
'MLB': MLB_TEAMS,
|
|
'NHL': NHL_TEAMS,
|
|
'NFL': NFL_TEAMS,
|
|
'WNBA': WNBA_TEAMS,
|
|
'MLS': MLS_TEAMS,
|
|
'NWSL': NWSL_TEAMS,
|
|
}.get(sport, {})
|
|
|
|
for abbrev, info in teams.items():
|
|
if info['name'].lower() == team_name.lower():
|
|
return abbrev
|
|
if team_name.lower() in info['name'].lower():
|
|
return abbrev
|
|
|
|
# Return first 3 letters as fallback
|
|
return team_name[:3].upper()
|
|
|
|
|
|
# =============================================================================
|
|
# MAIN ORCHESTRATOR
|
|
# =============================================================================
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description='Scrape sports schedules')
|
|
parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], default='all')
|
|
parser.add_argument('--season', type=int, default=2026, help='Season year (ending year)')
|
|
parser.add_argument('--stadiums-only', action='store_true', help='Only scrape stadium data (legacy method)')
|
|
parser.add_argument('--stadiums-update', action='store_true', help='Scrape ALL stadium data for all 8 sports (comprehensive)')
|
|
parser.add_argument('--output', type=str, default='./data', help='Output directory')
|
|
|
|
args = parser.parse_args()
|
|
output_dir = Path(args.output)
|
|
|
|
all_games = []
|
|
all_stadiums = []
|
|
|
|
# Scrape stadiums
|
|
print("\n" + "="*60)
|
|
print("SCRAPING STADIUMS")
|
|
print("="*60)
|
|
|
|
if args.stadiums_update:
|
|
print("Using comprehensive stadium scrapers for all sports...")
|
|
all_stadiums.extend(scrape_all_stadiums())
|
|
print(f" Total stadiums scraped: {len(all_stadiums)}")
|
|
else:
|
|
all_stadiums.extend(scrape_stadiums_hifld())
|
|
all_stadiums.extend(generate_stadiums_from_teams())
|
|
|
|
# If stadiums-only mode, export and exit
|
|
if args.stadiums_only:
|
|
export_to_json([], all_stadiums, output_dir)
|
|
return
|
|
|
|
# Scrape schedules using sport modules
|
|
if args.sport in ['nba', 'all']:
|
|
print("\n" + "="*60)
|
|
print(f"SCRAPING NBA {args.season}")
|
|
print("="*60)
|
|
nba_games = scrape_nba_games(args.season)
|
|
nba_season = get_nba_season_string(args.season)
|
|
nba_games = assign_stable_ids(nba_games, 'NBA', nba_season)
|
|
all_games.extend(nba_games)
|
|
|
|
if args.sport in ['mlb', 'all']:
|
|
print("\n" + "="*60)
|
|
print(f"SCRAPING MLB {args.season}")
|
|
print("="*60)
|
|
mlb_games = scrape_mlb_games(args.season)
|
|
mlb_games = assign_stable_ids(mlb_games, 'MLB', str(args.season))
|
|
all_games.extend(mlb_games)
|
|
|
|
if args.sport in ['nhl', 'all']:
|
|
print("\n" + "="*60)
|
|
print(f"SCRAPING NHL {args.season}")
|
|
print("="*60)
|
|
nhl_games = scrape_nhl_games(args.season)
|
|
nhl_season = get_nhl_season_string(args.season)
|
|
nhl_games = assign_stable_ids(nhl_games, 'NHL', nhl_season)
|
|
all_games.extend(nhl_games)
|
|
|
|
if args.sport in ['nfl', 'all']:
|
|
print("\n" + "="*60)
|
|
print(f"SCRAPING NFL {args.season}")
|
|
print("="*60)
|
|
nfl_games = scrape_nfl_games(args.season)
|
|
nfl_season = get_nfl_season_string(args.season)
|
|
nfl_games = assign_stable_ids(nfl_games, 'NFL', nfl_season)
|
|
all_games.extend(nfl_games)
|
|
|
|
# Non-core sports (TODO: Extract to modules)
|
|
if args.sport in ['wnba', 'all']:
|
|
print("\n" + "="*60)
|
|
print(f"SCRAPING WNBA {args.season}")
|
|
print("="*60)
|
|
wnba_sources = [
|
|
ScraperSource('ESPN', scrape_wnba_espn, priority=1, min_games=100),
|
|
ScraperSource('Basketball-Reference', scrape_wnba_basketball_reference, priority=2, min_games=100),
|
|
ScraperSource('CBS Sports', scrape_wnba_cbssports, priority=3, min_games=50),
|
|
]
|
|
wnba_games = scrape_with_fallback('WNBA', args.season, wnba_sources)
|
|
wnba_games = assign_stable_ids(wnba_games, 'WNBA', str(args.season))
|
|
all_games.extend(wnba_games)
|
|
|
|
if args.sport in ['mls', 'all']:
|
|
print("\n" + "="*60)
|
|
print(f"SCRAPING MLS {args.season}")
|
|
print("="*60)
|
|
mls_sources = [
|
|
ScraperSource('ESPN', scrape_mls_espn, priority=1, min_games=200),
|
|
ScraperSource('FBref', scrape_mls_fbref, priority=2, min_games=100),
|
|
ScraperSource('MLSSoccer.com', scrape_mls_mlssoccer, priority=3, min_games=100),
|
|
]
|
|
mls_games = scrape_with_fallback('MLS', args.season, mls_sources)
|
|
mls_games = assign_stable_ids(mls_games, 'MLS', str(args.season))
|
|
all_games.extend(mls_games)
|
|
|
|
if args.sport in ['nwsl', 'all']:
|
|
print("\n" + "="*60)
|
|
print(f"SCRAPING NWSL {args.season}")
|
|
print("="*60)
|
|
nwsl_sources = [
|
|
ScraperSource('ESPN', scrape_nwsl_espn, priority=1, min_games=100),
|
|
ScraperSource('FBref', scrape_nwsl_fbref, priority=2, min_games=50),
|
|
ScraperSource('NWSL.com', scrape_nwsl_nwslsoccer, priority=3, min_games=50),
|
|
]
|
|
nwsl_games = scrape_with_fallback('NWSL', args.season, nwsl_sources)
|
|
nwsl_games = assign_stable_ids(nwsl_games, 'NWSL', str(args.season))
|
|
all_games.extend(nwsl_games)
|
|
|
|
if args.sport in ['cbb', 'all']:
|
|
print("\n" + "="*60)
|
|
print(f"SCRAPING CBB {args.season}")
|
|
print("="*60)
|
|
cbb_sources = [
|
|
ScraperSource('ESPN', scrape_cbb_espn, priority=1, min_games=1000),
|
|
ScraperSource('Sports-Reference', scrape_cbb_sports_reference, priority=2, min_games=500),
|
|
ScraperSource('CBS Sports', scrape_cbb_cbssports, priority=3, min_games=300),
|
|
]
|
|
cbb_games = scrape_with_fallback('CBB', args.season, cbb_sources)
|
|
cbb_season = f"{args.season-1}-{str(args.season)[2:]}"
|
|
cbb_games = assign_stable_ids(cbb_games, 'CBB', cbb_season)
|
|
all_games.extend(cbb_games)
|
|
|
|
# Export
|
|
print("\n" + "="*60)
|
|
print("EXPORTING DATA")
|
|
print("="*60)
|
|
|
|
export_to_json(all_games, all_stadiums, output_dir)
|
|
|
|
# Summary
|
|
print("\n" + "="*60)
|
|
print("SUMMARY")
|
|
print("="*60)
|
|
print(f"Total games scraped: {len(all_games)}")
|
|
print(f"Total stadiums: {len(all_stadiums)}")
|
|
|
|
by_sport = {}
|
|
for g in all_games:
|
|
by_sport[g.sport] = by_sport.get(g.sport, 0) + 1
|
|
for sport, count in by_sport.items():
|
|
print(f" {sport}: {count} games")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|