Files
Sportstime/Scripts/scrape_schedules.py
Trey t 8f1803b10d feat(02.1-01): integrate MLS module with scrape_schedules.py
- Import MLS_TEAMS, get_mls_team_abbrev, scrape_mls_stadiums from mls.py
- Remove inline MLS_TEAMS dict (now imported from module)
- Remove inline MLS stadium scraper functions (now in mls.py)
- Update TODO comments to reflect MLS extraction complete

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 00:52:17 -06:00

631 lines
22 KiB
Python

#!/usr/bin/env python3
"""
Sports Schedule Scraper Orchestrator
This script coordinates scraping across sport-specific modules:
- core.py: Shared utilities, data classes, fallback system
- mlb.py: MLB scrapers
- nba.py: NBA scrapers
- nhl.py: NHL scrapers
- nfl.py: NFL scrapers
Non-core sports (WNBA, NWSL, CBB) remain inline pending extraction; MLS teams and stadium scrapers now live in mls.py (MLS game scrapers are still inline below).
Usage:
python scrape_schedules.py --sport nba --season 2026
python scrape_schedules.py --sport all --season 2026
python scrape_schedules.py --stadiums-only
"""
import argparse
import csv
import json
import time
from collections import defaultdict
from dataclasses import asdict
from datetime import datetime
from io import StringIO
from pathlib import Path
from typing import Optional
import requests
# Import from core module
from core import (
Game,
Stadium,
ScraperSource,
StadiumScraperSource,
fetch_page,
scrape_with_fallback,
scrape_stadiums_with_fallback,
assign_stable_ids,
export_to_json,
)
# Import from sport modules (core 4 sports)
from mlb import (
scrape_mlb_games,
scrape_mlb_stadiums,
MLB_TEAMS,
)
from nba import (
scrape_nba_games,
scrape_nba_stadiums,
get_nba_season_string,
NBA_TEAMS,
)
from nhl import (
scrape_nhl_games,
scrape_nhl_stadiums,
get_nhl_season_string,
NHL_TEAMS,
)
from nfl import (
scrape_nfl_games,
scrape_nfl_stadiums,
get_nfl_season_string,
NFL_TEAMS,
)
from mls import (
MLS_TEAMS,
get_mls_team_abbrev,
scrape_mls_stadiums,
MLS_STADIUM_SOURCES,
)
# =============================================================================
# NON-CORE SPORT TEAM MAPPINGS
# TODO: Extract to separate modules (wnba.py, nwsl.py, cbb.py)
# NOTE: MLS_TEAMS is now imported from mls.py
# =============================================================================
# WNBA team metadata keyed by team abbreviation.
# Each value holds the full franchise name, home city, and current home arena.
# Used by get_team_abbrev() to map full names back to abbreviations.
WNBA_TEAMS = {
    'ATL': {'name': 'Atlanta Dream', 'city': 'Atlanta', 'arena': 'Gateway Center Arena'},
    'CHI': {'name': 'Chicago Sky', 'city': 'Chicago', 'arena': 'Wintrust Arena'},
    'CON': {'name': 'Connecticut Sun', 'city': 'Uncasville', 'arena': 'Mohegan Sun Arena'},
    'DAL': {'name': 'Dallas Wings', 'city': 'Arlington', 'arena': 'College Park Center'},
    'GSV': {'name': 'Golden State Valkyries', 'city': 'San Francisco', 'arena': 'Chase Center'},
    'IND': {'name': 'Indiana Fever', 'city': 'Indianapolis', 'arena': 'Gainbridge Fieldhouse'},
    'LVA': {'name': 'Las Vegas Aces', 'city': 'Las Vegas', 'arena': 'Michelob Ultra Arena'},
    'LA': {'name': 'Los Angeles Sparks', 'city': 'Los Angeles', 'arena': 'Crypto.com Arena'},
    'MIN': {'name': 'Minnesota Lynx', 'city': 'Minneapolis', 'arena': 'Target Center'},
    'NY': {'name': 'New York Liberty', 'city': 'Brooklyn', 'arena': 'Barclays Center'},
    'PHO': {'name': 'Phoenix Mercury', 'city': 'Phoenix', 'arena': 'Footprint Center'},
    'SEA': {'name': 'Seattle Storm', 'city': 'Seattle', 'arena': 'Climate Pledge Arena'},
    'WAS': {'name': 'Washington Mystics', 'city': 'Washington', 'arena': 'Entertainment & Sports Arena'},
}
# NWSL team metadata keyed by team abbreviation.
# Each value holds the full club name, home city, and current home stadium.
# Used by get_team_abbrev() to map full names back to abbreviations.
NWSL_TEAMS = {
    'LA': {'name': 'Angel City FC', 'city': 'Los Angeles', 'stadium': 'BMO Stadium'},
    'SJ': {'name': 'Bay FC', 'city': 'San Jose', 'stadium': 'PayPal Park'},
    'CHI': {'name': 'Chicago Red Stars', 'city': 'Bridgeview', 'stadium': 'SeatGeek Stadium'},
    'HOU': {'name': 'Houston Dash', 'city': 'Houston', 'stadium': 'Shell Energy Stadium'},
    'KC': {'name': 'Kansas City Current', 'city': 'Kansas City', 'stadium': 'CPKC Stadium'},
    'NJ': {'name': 'NJ/NY Gotham FC', 'city': 'Harrison', 'stadium': 'Red Bull Arena'},
    'NC': {'name': 'North Carolina Courage', 'city': 'Cary', 'stadium': 'WakeMed Soccer Park'},
    'ORL': {'name': 'Orlando Pride', 'city': 'Orlando', 'stadium': 'Inter&Co Stadium'},
    'POR': {'name': 'Portland Thorns FC', 'city': 'Portland', 'stadium': 'Providence Park'},
    'SEA': {'name': 'Seattle Reign FC', 'city': 'Seattle', 'stadium': 'Lumen Field'},
    'SD': {'name': 'San Diego Wave FC', 'city': 'San Diego', 'stadium': 'Snapdragon Stadium'},
    'UTA': {'name': 'Utah Royals FC', 'city': 'Sandy', 'stadium': 'America First Field'},
    'WAS': {'name': 'Washington Spirit', 'city': 'Washington', 'stadium': 'Audi Field'},
}
# =============================================================================
# NON-CORE SPORT SCRAPERS
# TODO: Extract to separate modules (wnba.py, mls.py, nwsl.py, cbb.py)
# =============================================================================
def _scrape_espn_schedule(sport: str, league: str, season: int, date_range: tuple[str, str]) -> list[Game]:
    """
    Fetch a season schedule from the ESPN scoreboard API.

    Shared helper for non-core sports (WNBA, MLS, NWSL, CBB).

    Args:
        sport: ESPN sport path segment (e.g. 'basketball', 'soccer').
        league: ESPN league slug (e.g. 'wnba', 'usa.1').
        season: Season year used to label the games.
        date_range: (start, end) dates in YYYYMMDD form for the 'dates' param.

    Returns:
        List of Game records; empty on any request failure (best-effort).
    """
    games: list[Game] = []
    # Map ESPN league slugs to our canonical sport labels.
    sport_upper = {
        'wnba': 'WNBA',
        'usa.1': 'MLS',
        'usa.nwsl': 'NWSL',
        'mens-college-basketball': 'CBB'
    }.get(league, league.upper())
    print(f"Fetching {sport_upper} {season} from ESPN API...")
    url = f"https://site.api.espn.com/apis/site/v2/sports/{sport}/{league}/scoreboard"
    params = {
        'dates': f"{date_range[0]}-{date_range[1]}",
        'limit': 1000
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        for event in data.get('events', []):
            try:
                # ISO-ish timestamp like '2026-05-01T23:00Z': date is the
                # first 10 chars, HH:MM follows the 'T' when present.
                raw_date = event.get('date', '')
                date_str = raw_date[:10]
                time_str = raw_date[11:16] if len(raw_date) > 11 else None
                competitions = event.get('competitions', [{}])
                if not competitions:
                    continue
                comp = competitions[0]
                competitors = comp.get('competitors', [])
                if len(competitors) < 2:
                    continue
                home_team = None
                away_team = None
                home_abbrev = None
                away_abbrev = None
                for team in competitors:
                    team_data = team.get('team', {})
                    team_name = team_data.get('displayName', team_data.get('name', ''))
                    team_abbrev = team_data.get('abbreviation', '')
                    if team.get('homeAway') == 'home':
                        home_team = team_name
                        home_abbrev = team_abbrev
                    else:
                        away_team = team_name
                        away_abbrev = team_abbrev
                if not home_team or not away_team:
                    continue
                venue = comp.get('venue', {}).get('fullName', '')
                # Resolve abbreviations once so the game id and the Game record
                # agree. Previously the id was built before the fallback was
                # applied, so a missing ESPN abbreviation yielded ids like
                # 'mls_2026-01-10__' while the record carried the fallback.
                home_abbrev = home_abbrev or get_team_abbrev(home_team, sport_upper)
                away_abbrev = away_abbrev or get_team_abbrev(away_team, sport_upper)
                game_id = f"{sport_upper.lower()}_{date_str}_{away_abbrev}_{home_abbrev}".lower()
                game = Game(
                    id=game_id,
                    sport=sport_upper,
                    season=str(season),
                    date=date_str,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=home_abbrev,
                    away_team_abbrev=away_abbrev,
                    venue=venue,
                    source='espn.com'
                )
                games.append(game)
            except Exception:
                # Best-effort: skip malformed events rather than abort the run.
                continue
        print(f" Found {len(games)} games from ESPN")
    except Exception as e:
        print(f"Error fetching ESPN {sport_upper}: {e}")
    return games
def scrape_wnba_espn(season: int) -> list[Game]:
    """Fetch WNBA schedule from ESPN API."""
    # WNBA regular-season window: May 1 through Oct 31.
    window = (f"{season}0501", f"{season}1031")
    return _scrape_espn_schedule('basketball', 'wnba', season, window)
def scrape_mls_espn(season: int) -> list[Game]:
    """Fetch MLS schedule from ESPN API."""
    # MLS window: Feb 1 through Dec 31 (covers season plus playoffs).
    window = (f"{season}0201", f"{season}1231")
    return _scrape_espn_schedule('soccer', 'usa.1', season, window)
def scrape_nwsl_espn(season: int) -> list[Game]:
    """Fetch NWSL schedule from ESPN API."""
    # NWSL window: Mar 1 through Nov 30.
    window = (f"{season}0301", f"{season}1130")
    return _scrape_espn_schedule('soccer', 'usa.nwsl', season, window)
def scrape_cbb_espn(season: int) -> list[Game]:
    """Fetch College Basketball schedule from ESPN API (D1 only)."""
    # CBB spans two calendar years: Nov 1 of the prior year to Apr 15.
    window = (f"{season - 1}1101", f"{season}0415")
    return _scrape_espn_schedule('basketball', 'mens-college-basketball', season, window)
def scrape_wnba_basketball_reference(season: int) -> list[Game]:
    """Scrape the WNBA schedule for *season* from Basketball-Reference.

    Returns an empty list if the page, schedule table, or table body is
    unavailable. Malformed rows are skipped (best-effort).
    """
    games: list[Game] = []
    url = f"https://www.basketball-reference.com/wnba/years/{season}_games.html"
    print(f"Scraping WNBA {season} from Basketball-Reference...")
    soup = fetch_page(url, 'basketball-reference.com')
    if not soup:
        return games
    table = soup.find('table', {'id': 'schedule'})
    if not table:
        return games
    tbody = table.find('tbody')
    if not tbody:
        return games
    for row in tbody.find_all('tr'):
        # Skip repeated header rows embedded inside the body.
        row_classes = row.get('class')
        if row_classes and 'thead' in row_classes:
            continue
        try:
            date_cell = row.find('th', {'data-stat': 'date_game'})
            if not date_cell:
                continue
            date_link = date_cell.find('a')
            date_str = date_link.text if date_link else date_cell.text
            visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
            home_cell = row.find('td', {'data-stat': 'home_team_name'})
            if not visitor_cell or not home_cell:
                continue
            visitor_link = visitor_cell.find('a')
            home_link = home_cell.find('a')
            away_team = visitor_link.text if visitor_link else visitor_cell.text
            home_team = home_link.text if home_link else home_cell.text
            try:
                # Site format, e.g. 'Fri, May 16, 2025'.
                parsed_date = datetime.strptime(date_str.strip(), '%a, %b %d, %Y')
            except ValueError:
                # Was a bare `except:` which also swallowed SystemExit /
                # KeyboardInterrupt; strptime only raises ValueError here.
                continue
            date_formatted = parsed_date.strftime('%Y-%m-%d')
            away_abbrev = get_team_abbrev(away_team, 'WNBA')
            home_abbrev = get_team_abbrev(home_team, 'WNBA')
            game_id = f"wnba_{date_formatted}_{away_abbrev}_{home_abbrev}".lower().replace(' ', '')
            game = Game(
                id=game_id,
                sport='WNBA',
                season=str(season),
                date=date_formatted,
                time=None,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=home_abbrev,
                away_team_abbrev=away_abbrev,
                venue='',
                source='basketball-reference.com'
            )
            games.append(game)
        except Exception:
            # Best-effort: a single malformed row must not abort the scrape.
            continue
    print(f" Found {len(games)} games from Basketball-Reference")
    return games
def scrape_wnba_cbssports(season: int) -> list[Game]:
    """Fetch WNBA schedule from CBS Sports (stub; returns no games yet)."""
    print(f"Fetching WNBA {season} from CBS Sports...")
    # Placeholder - CBS Sports scraping would go here
    scraped: list[Game] = []
    print(f" Found {len(scraped)} games from CBS Sports")
    return scraped
def scrape_mls_fbref(season: int) -> list[Game]:
    """Scrape MLS schedule from FBref (stub; returns no games yet)."""
    print(f"Scraping MLS {season} from FBref...")
    # Placeholder - FBref scraping would go here
    scraped: list[Game] = []
    print(f" Found {len(scraped)} games from FBref")
    return scraped
def scrape_mls_mlssoccer(season: int) -> list[Game]:
    """Scrape MLS schedule from MLSSoccer.com (stub; returns no games yet)."""
    print(f"Scraping MLS {season} from MLSSoccer.com...")
    # Placeholder - MLSSoccer.com scraping would go here
    scraped: list[Game] = []
    print(f" Found {len(scraped)} games from MLSSoccer.com")
    return scraped
def scrape_nwsl_fbref(season: int) -> list[Game]:
    """Scrape NWSL schedule from FBref (stub; returns no games yet)."""
    print(f"Scraping NWSL {season} from FBref...")
    # Placeholder - FBref scraping would go here
    scraped: list[Game] = []
    print(f" Found {len(scraped)} games from FBref")
    return scraped
def scrape_nwsl_nwslsoccer(season: int) -> list[Game]:
    """Scrape NWSL schedule from NWSL.com (stub; returns no games yet)."""
    print(f"Scraping NWSL {season} from NWSL.com...")
    # Placeholder - NWSL.com scraping would go here
    scraped: list[Game] = []
    print(f" Found {len(scraped)} games from NWSL.com")
    return scraped
def scrape_cbb_sports_reference(season: int) -> list[Game]:
    """Scrape College Basketball schedule from Sports-Reference (stub)."""
    print(f"Scraping CBB {season} from Sports-Reference...")
    # Placeholder - Sports-Reference scraping would go here
    scraped: list[Game] = []
    print(f" Found {len(scraped)} games from Sports-Reference")
    return scraped
def scrape_cbb_cbssports(season: int) -> list[Game]:
    """Fetch College Basketball schedule from CBS Sports (stub)."""
    print(f"Fetching CBB {season} from CBS Sports...")
    # Placeholder - CBS Sports scraping would go here
    scraped: list[Game] = []
    print(f" Found {len(scraped)} games from CBS Sports")
    return scraped
# =============================================================================
# NON-CORE STADIUM SCRAPERS
# TODO: Extract to separate modules (wnba.py, nwsl.py, cbb.py)
# NOTE: scrape_mls_stadiums() is now imported from mls.py
# =============================================================================
def scrape_wnba_stadiums() -> list[Stadium]:
    """Fetch WNBA arena data (hardcoded; stub returns nothing yet)."""
    print("\nWNBA STADIUMS")
    print("-" * 40)
    # Would include WNBA arena data here
    arenas: list[Stadium] = []
    print(f" Found {len(arenas)} WNBA arenas")
    return arenas
def scrape_nwsl_stadiums() -> list[Stadium]:
    """Fetch NWSL stadium data (hardcoded; stub returns nothing yet)."""
    print("\nNWSL STADIUMS")
    print("-" * 40)
    # Would include NWSL stadium data here
    grounds: list[Stadium] = []
    print(f" Found {len(grounds)} NWSL stadiums")
    return grounds
def scrape_cbb_stadiums() -> list[Stadium]:
    """Fetch College Basketball arena data (stub returns nothing yet)."""
    print("\nCBB STADIUMS")
    print("-" * 40)
    # Would include CBB arena data here
    arenas: list[Stadium] = []
    print(f" Found {len(arenas)} CBB arenas")
    return arenas
# =============================================================================
# LEGACY STADIUM FUNCTIONS
# =============================================================================
def scrape_stadiums_hifld() -> list[Stadium]:
    """Legacy: Scrape stadium records from HIFLD open data (stub)."""
    # Placeholder for legacy HIFLD scraping
    legacy: list[Stadium] = []
    return legacy
def generate_stadiums_from_teams() -> list[Stadium]:
    """Generate stadium entries from team data with hardcoded coordinates.

    Kept as a placeholder since the sport modules provide their own
    stadium scrapers; currently returns no entries.
    """
    generated: list[Stadium] = []
    return generated
def scrape_all_stadiums() -> list[Stadium]:
    """Comprehensive stadium scraping across all eight supported sports."""
    # Core sports (module scrapers) first, then the inline non-core scrapers;
    # order matches the original hand-written sequence.
    scrapers = (
        scrape_mlb_stadiums,
        scrape_nba_stadiums,
        scrape_nhl_stadiums,
        scrape_nfl_stadiums,
        scrape_mls_stadiums,
        scrape_wnba_stadiums,
        scrape_nwsl_stadiums,
        scrape_cbb_stadiums,
    )
    collected: list[Stadium] = []
    for scraper in scrapers:
        collected.extend(scraper())
    return collected
# =============================================================================
# HELPERS
# =============================================================================
def get_team_abbrev(team_name: str, sport: str) -> str:
    """Resolve a team abbreviation from a (possibly partial) team name.

    Args:
        team_name: Full or partial team name, case-insensitive.
        sport: League key ('NBA', 'MLB', 'NHL', 'NFL', 'WNBA', 'MLS', 'NWSL').

    Returns:
        The matching abbreviation, or the first three letters of *team_name*
        uppercased when no team matches (also for unknown sports, e.g. CBB).
    """
    teams = {
        'NBA': NBA_TEAMS,
        'MLB': MLB_TEAMS,
        'NHL': NHL_TEAMS,
        'NFL': NFL_TEAMS,
        'WNBA': WNBA_TEAMS,
        'MLS': MLS_TEAMS,
        'NWSL': NWSL_TEAMS,
    }.get(sport, {})
    needle = team_name.lower()
    # Pass 1: exact full-name match. Previously exact and substring checks ran
    # in a single loop, so a substring hit on an earlier dict entry could
    # shadow an exact match on a later entry.
    for abbrev, info in teams.items():
        if info['name'].lower() == needle:
            return abbrev
    # Pass 2: partial match (query contained within the full name).
    for abbrev, info in teams.items():
        if needle in info['name'].lower():
            return abbrev
    # Fallback: first 3 letters of the supplied name.
    return team_name[:3].upper()
# =============================================================================
# MAIN ORCHESTRATOR
# =============================================================================
def main():
    """CLI entry point.

    Parses arguments, scrapes stadiums (legacy or comprehensive mode),
    then scrapes the requested sport schedules unless --stadiums-only,
    exports everything to JSON under --output, and prints a summary.
    """
    parser = argparse.ArgumentParser(description='Scrape sports schedules')
    parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], default='all')
    parser.add_argument('--season', type=int, default=2026, help='Season year (ending year)')
    parser.add_argument('--stadiums-only', action='store_true', help='Only scrape stadium data (legacy method)')
    parser.add_argument('--stadiums-update', action='store_true', help='Scrape ALL stadium data for all 8 sports (comprehensive)')
    parser.add_argument('--output', type=str, default='./data', help='Output directory')
    args = parser.parse_args()
    output_dir = Path(args.output)
    all_games = []
    all_stadiums = []

    def _banner(title: str) -> None:
        # Uniform section header; was copy-pasted at every section.
        print("\n" + "=" * 60)
        print(title)
        print("=" * 60)

    # --- Stadiums ---
    _banner("SCRAPING STADIUMS")
    if args.stadiums_update:
        print("Using comprehensive stadium scrapers for all sports...")
        all_stadiums.extend(scrape_all_stadiums())
        print(f" Total stadiums scraped: {len(all_stadiums)}")
    else:
        all_stadiums.extend(scrape_stadiums_hifld())
        all_stadiums.extend(generate_stadiums_from_teams())
    # If stadiums-only mode, export and exit early.
    if args.stadiums_only:
        export_to_json([], all_stadiums, output_dir)
        return

    # --- Core sports (dedicated modules) ---
    if args.sport in ('nba', 'all'):
        _banner(f"SCRAPING NBA {args.season}")
        nba_games = scrape_nba_games(args.season)
        nba_season = get_nba_season_string(args.season)
        all_games.extend(assign_stable_ids(nba_games, 'NBA', nba_season))
    if args.sport in ('mlb', 'all'):
        _banner(f"SCRAPING MLB {args.season}")
        mlb_games = scrape_mlb_games(args.season)
        all_games.extend(assign_stable_ids(mlb_games, 'MLB', str(args.season)))
    if args.sport in ('nhl', 'all'):
        _banner(f"SCRAPING NHL {args.season}")
        nhl_games = scrape_nhl_games(args.season)
        nhl_season = get_nhl_season_string(args.season)
        all_games.extend(assign_stable_ids(nhl_games, 'NHL', nhl_season))
    if args.sport in ('nfl', 'all'):
        _banner(f"SCRAPING NFL {args.season}")
        nfl_games = scrape_nfl_games(args.season)
        nfl_season = get_nfl_season_string(args.season)
        all_games.extend(assign_stable_ids(nfl_games, 'NFL', nfl_season))

    # --- Non-core sports: prioritized source chains with fallback ---
    # TODO: Extract to modules (wnba.py, nwsl.py, cbb.py; MLS game scrapers).
    if args.sport in ('wnba', 'all'):
        _banner(f"SCRAPING WNBA {args.season}")
        wnba_sources = [
            ScraperSource('ESPN', scrape_wnba_espn, priority=1, min_games=100),
            ScraperSource('Basketball-Reference', scrape_wnba_basketball_reference, priority=2, min_games=100),
            ScraperSource('CBS Sports', scrape_wnba_cbssports, priority=3, min_games=50),
        ]
        wnba_games = scrape_with_fallback('WNBA', args.season, wnba_sources)
        all_games.extend(assign_stable_ids(wnba_games, 'WNBA', str(args.season)))
    if args.sport in ('mls', 'all'):
        _banner(f"SCRAPING MLS {args.season}")
        mls_sources = [
            ScraperSource('ESPN', scrape_mls_espn, priority=1, min_games=200),
            ScraperSource('FBref', scrape_mls_fbref, priority=2, min_games=100),
            ScraperSource('MLSSoccer.com', scrape_mls_mlssoccer, priority=3, min_games=100),
        ]
        mls_games = scrape_with_fallback('MLS', args.season, mls_sources)
        all_games.extend(assign_stable_ids(mls_games, 'MLS', str(args.season)))
    if args.sport in ('nwsl', 'all'):
        _banner(f"SCRAPING NWSL {args.season}")
        nwsl_sources = [
            ScraperSource('ESPN', scrape_nwsl_espn, priority=1, min_games=100),
            ScraperSource('FBref', scrape_nwsl_fbref, priority=2, min_games=50),
            ScraperSource('NWSL.com', scrape_nwsl_nwslsoccer, priority=3, min_games=50),
        ]
        nwsl_games = scrape_with_fallback('NWSL', args.season, nwsl_sources)
        all_games.extend(assign_stable_ids(nwsl_games, 'NWSL', str(args.season)))
    if args.sport in ('cbb', 'all'):
        _banner(f"SCRAPING CBB {args.season}")
        cbb_sources = [
            ScraperSource('ESPN', scrape_cbb_espn, priority=1, min_games=1000),
            ScraperSource('Sports-Reference', scrape_cbb_sports_reference, priority=2, min_games=500),
            ScraperSource('CBS Sports', scrape_cbb_cbssports, priority=3, min_games=300),
        ]
        cbb_games = scrape_with_fallback('CBB', args.season, cbb_sources)
        # CBB spans two calendar years, e.g. season 2026 -> '2025-26'.
        cbb_season = f"{args.season - 1}-{str(args.season)[2:]}"
        all_games.extend(assign_stable_ids(cbb_games, 'CBB', cbb_season))

    # --- Export ---
    _banner("EXPORTING DATA")
    export_to_json(all_games, all_stadiums, output_dir)

    # --- Summary ---
    _banner("SUMMARY")
    print(f"Total games scraped: {len(all_games)}")
    print(f"Total stadiums: {len(all_stadiums)}")
    by_sport = {}
    for g in all_games:
        by_sport[g.sport] = by_sport.get(g.sport, 0) + 1
    for sport, count in by_sport.items():
        print(f" {sport}: {count} games")
# Script entry point: delegate to the CLI orchestrator.
if __name__ == '__main__':
    main()