Files
Sportstime/Scripts/scrape_schedules.py
Trey t 8790d2ad73 Remove CFB/NASCAR/PGA and streamline to 8 supported sports
- Remove College Football, NASCAR, and PGA from scraper and app
- Clean all data files (stadiums, games, pipeline reports)
- Update Sport.swift enum and all UI components
- Add sportstime.py CLI tool for pipeline management
- Add DATA_SCRAPING.md documentation
- Add WNBA/MLS/NWSL implementation documentation
- Scraper now supports: NBA, MLB, NHL, NFL, WNBA, MLS, NWSL, CBB

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-09 23:22:13 -06:00

3360 lines
140 KiB
Python

#!/usr/bin/env python3
"""
Sports Schedule Scraper for SportsTime App
Scrapes NBA, MLB, NHL schedules from multiple sources for cross-validation.
Usage:
python scrape_schedules.py --sport nba --season 2026
python scrape_schedules.py --sport all --season 2026
python scrape_schedules.py --stadiums-only
"""
import argparse
import json
import time
import re
from datetime import datetime, timedelta
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import Optional
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Rate limiting
REQUEST_DELAY = 3.0  # seconds between requests to same domain
last_request_time = {}


def rate_limit(domain: str):
    """Sleep as needed so requests to *domain* are at least REQUEST_DELAY apart.

    Records the time of the current request in the module-level
    last_request_time map after any required sleep.
    """
    now = time.time()
    previous = last_request_time.get(domain)
    if previous is not None:
        remaining = REQUEST_DELAY - (now - previous)
        if remaining > 0:
            time.sleep(remaining)
    last_request_time[domain] = time.time()
def fetch_page(url: str, domain: str) -> Optional[BeautifulSoup]:
    """Download *url* and return parsed HTML, or None if the request fails.

    Requests to the same *domain* are throttled via rate_limit(). Browser-like
    headers are sent so sites that block obvious bots still respond.
    """
    rate_limit(domain)
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0',
    }
    try:
        resp = requests.get(url, headers=browser_headers, timeout=30)
        resp.raise_for_status()
        return BeautifulSoup(resp.content, 'html.parser')
    except Exception as exc:
        print(f"Error fetching {url}: {exc}")
        return None
# =============================================================================
# DATA CLASSES
# =============================================================================
@dataclass
class Game:
    """A single scheduled game, normalized across sports and sources."""
    id: str                    # unique identifier for the game record
    sport: str                 # league label, e.g. 'NBA', 'MLB'
    season: str
    date: str                  # YYYY-MM-DD
    time: Optional[str]        # HH:MM (24hr, ET)
    home_team: str
    away_team: str
    home_team_abbrev: str
    away_team_abbrev: str
    venue: str
    source: str                # domain the record was scraped/fetched from
    is_playoff: bool = False
    broadcast: Optional[str] = None
@dataclass
class Stadium:
    """A venue record with location and capacity metadata."""
    id: str
    name: str
    city: str
    state: str
    latitude: float
    longitude: float
    capacity: int
    sport: str                 # league the venue is listed under
    team_abbrevs: list         # abbreviations of teams that play here
    source: str                # where the venue data came from
    year_opened: Optional[int] = None
# =============================================================================
# MULTI-SOURCE FALLBACK SYSTEM
# =============================================================================
from dataclasses import field
from typing import Callable
@dataclass
class ScraperSource:
    """One candidate data source for schedule scraping."""
    name: str
    scraper_func: Callable[[int], list]  # Takes season, returns list[Game]
    priority: int = 1    # Lower = higher priority (1 is best)
    min_games: int = 10  # Minimum games to consider successful


def scrape_with_fallback(
    sport: str,
    season: int,
    sources: list[ScraperSource],
    verbose: bool = True
) -> list:
    """
    Try each source in priority order and return the first adequate result.

    Args:
        sport: Sport name for logging
        season: Season year
        sources: List of ScraperSource configs (sorted here by priority)
        verbose: Whether to print status messages
    Returns:
        List of Game objects from the first source that yields at least its
        min_games threshold, or [] if every source fails.
    """
    ordered = sorted(sources, key=lambda src: src.priority)
    total = len(ordered)
    for position, src in enumerate(ordered, start=1):
        try:
            if verbose:
                print(f" [{position}/{total}] Trying {src.name}...")
            result = src.scraper_func(season)
            found = len(result) if result else 0
            if result and found >= src.min_games:
                if verbose:
                    print(f"{src.name} returned {found} games")
                return result
            if verbose:
                print(f"{src.name} returned only {found} games (min: {src.min_games})")
        except Exception as exc:
            if verbose:
                print(f"{src.name} failed: {exc}")
            continue
    # Every source either raised or came back under its threshold
    if verbose:
        print(f" ⚠ All {total} sources failed for {sport}")
    return []
@dataclass
class StadiumScraperSource:
    """One candidate data source for stadium/venue scraping."""
    name: str
    scraper_func: Callable[[], list]  # Returns list[Stadium]
    priority: int = 1   # Lower = higher priority (1 is best)
    min_venues: int = 5  # Minimum venues to consider successful


def scrape_stadiums_with_fallback(
    sport: str,
    sources: list[StadiumScraperSource],
    verbose: bool = True
) -> list:
    """
    Try each stadium source in priority order until one yields enough venues.

    Args:
        sport: Sport name for logging
        sources: List of StadiumScraperSource configs (sorted here by priority)
        verbose: Whether to print status messages
    Returns:
        List of Stadium objects from the first source meeting its min_venues
        threshold, or [] if every source fails.
    """
    ordered = sorted(sources, key=lambda src: src.priority)
    total = len(ordered)
    for position, src in enumerate(ordered, start=1):
        try:
            if verbose:
                print(f" [{position}/{total}] Trying {src.name}...")
            result = src.scraper_func()
            found = len(result) if result else 0
            if result and found >= src.min_venues:
                if verbose:
                    print(f"{src.name} returned {found} venues")
                return result
            if verbose:
                print(f"{src.name} returned only {found} venues (min: {src.min_venues})")
        except Exception as exc:
            if verbose:
                print(f"{src.name} failed: {exc}")
            continue
    # Every source either raised or came back under its threshold
    if verbose:
        print(f" ⚠ All {total} sources failed for {sport}")
    return []
# =============================================================================
# TEAM MAPPINGS
# =============================================================================
# NBA: abbreviation -> team name, home city, home arena.
# NOTE(review): some keys (BRK, CHO, PHO) follow Basketball-Reference-style
# abbreviations rather than the NBA's official ones — confirm consumers expect these.
NBA_TEAMS = {
'ATL': {'name': 'Atlanta Hawks', 'city': 'Atlanta', 'arena': 'State Farm Arena'},
'BOS': {'name': 'Boston Celtics', 'city': 'Boston', 'arena': 'TD Garden'},
'BRK': {'name': 'Brooklyn Nets', 'city': 'Brooklyn', 'arena': 'Barclays Center'},
'CHO': {'name': 'Charlotte Hornets', 'city': 'Charlotte', 'arena': 'Spectrum Center'},
'CHI': {'name': 'Chicago Bulls', 'city': 'Chicago', 'arena': 'United Center'},
'CLE': {'name': 'Cleveland Cavaliers', 'city': 'Cleveland', 'arena': 'Rocket Mortgage FieldHouse'},
'DAL': {'name': 'Dallas Mavericks', 'city': 'Dallas', 'arena': 'American Airlines Center'},
'DEN': {'name': 'Denver Nuggets', 'city': 'Denver', 'arena': 'Ball Arena'},
'DET': {'name': 'Detroit Pistons', 'city': 'Detroit', 'arena': 'Little Caesars Arena'},
'GSW': {'name': 'Golden State Warriors', 'city': 'San Francisco', 'arena': 'Chase Center'},
'HOU': {'name': 'Houston Rockets', 'city': 'Houston', 'arena': 'Toyota Center'},
'IND': {'name': 'Indiana Pacers', 'city': 'Indianapolis', 'arena': 'Gainbridge Fieldhouse'},
'LAC': {'name': 'Los Angeles Clippers', 'city': 'Inglewood', 'arena': 'Intuit Dome'},
'LAL': {'name': 'Los Angeles Lakers', 'city': 'Los Angeles', 'arena': 'Crypto.com Arena'},
'MEM': {'name': 'Memphis Grizzlies', 'city': 'Memphis', 'arena': 'FedExForum'},
'MIA': {'name': 'Miami Heat', 'city': 'Miami', 'arena': 'Kaseya Center'},
'MIL': {'name': 'Milwaukee Bucks', 'city': 'Milwaukee', 'arena': 'Fiserv Forum'},
'MIN': {'name': 'Minnesota Timberwolves', 'city': 'Minneapolis', 'arena': 'Target Center'},
'NOP': {'name': 'New Orleans Pelicans', 'city': 'New Orleans', 'arena': 'Smoothie King Center'},
'NYK': {'name': 'New York Knicks', 'city': 'New York', 'arena': 'Madison Square Garden'},
'OKC': {'name': 'Oklahoma City Thunder', 'city': 'Oklahoma City', 'arena': 'Paycom Center'},
'ORL': {'name': 'Orlando Magic', 'city': 'Orlando', 'arena': 'Kia Center'},
'PHI': {'name': 'Philadelphia 76ers', 'city': 'Philadelphia', 'arena': 'Wells Fargo Center'},
'PHO': {'name': 'Phoenix Suns', 'city': 'Phoenix', 'arena': 'Footprint Center'},
'POR': {'name': 'Portland Trail Blazers', 'city': 'Portland', 'arena': 'Moda Center'},
'SAC': {'name': 'Sacramento Kings', 'city': 'Sacramento', 'arena': 'Golden 1 Center'},
'SAS': {'name': 'San Antonio Spurs', 'city': 'San Antonio', 'arena': 'Frost Bank Center'},
'TOR': {'name': 'Toronto Raptors', 'city': 'Toronto', 'arena': 'Scotiabank Arena'},
'UTA': {'name': 'Utah Jazz', 'city': 'Salt Lake City', 'arena': 'Delta Center'},
'WAS': {'name': 'Washington Wizards', 'city': 'Washington', 'arena': 'Capital One Arena'},
}
# MLB: abbreviation -> team name, home city, home stadium.
# NOTE(review): OAK maps to Sacramento / Sutter Health Park (Athletics' temporary
# home) — confirm this is intentional for the seasons being scraped.
MLB_TEAMS = {
'ARI': {'name': 'Arizona Diamondbacks', 'city': 'Phoenix', 'stadium': 'Chase Field'},
'ATL': {'name': 'Atlanta Braves', 'city': 'Atlanta', 'stadium': 'Truist Park'},
'BAL': {'name': 'Baltimore Orioles', 'city': 'Baltimore', 'stadium': 'Oriole Park at Camden Yards'},
'BOS': {'name': 'Boston Red Sox', 'city': 'Boston', 'stadium': 'Fenway Park'},
'CHC': {'name': 'Chicago Cubs', 'city': 'Chicago', 'stadium': 'Wrigley Field'},
'CHW': {'name': 'Chicago White Sox', 'city': 'Chicago', 'stadium': 'Guaranteed Rate Field'},
'CIN': {'name': 'Cincinnati Reds', 'city': 'Cincinnati', 'stadium': 'Great American Ball Park'},
'CLE': {'name': 'Cleveland Guardians', 'city': 'Cleveland', 'stadium': 'Progressive Field'},
'COL': {'name': 'Colorado Rockies', 'city': 'Denver', 'stadium': 'Coors Field'},
'DET': {'name': 'Detroit Tigers', 'city': 'Detroit', 'stadium': 'Comerica Park'},
'HOU': {'name': 'Houston Astros', 'city': 'Houston', 'stadium': 'Minute Maid Park'},
'KCR': {'name': 'Kansas City Royals', 'city': 'Kansas City', 'stadium': 'Kauffman Stadium'},
'LAA': {'name': 'Los Angeles Angels', 'city': 'Anaheim', 'stadium': 'Angel Stadium'},
'LAD': {'name': 'Los Angeles Dodgers', 'city': 'Los Angeles', 'stadium': 'Dodger Stadium'},
'MIA': {'name': 'Miami Marlins', 'city': 'Miami', 'stadium': 'LoanDepot Park'},
'MIL': {'name': 'Milwaukee Brewers', 'city': 'Milwaukee', 'stadium': 'American Family Field'},
'MIN': {'name': 'Minnesota Twins', 'city': 'Minneapolis', 'stadium': 'Target Field'},
'NYM': {'name': 'New York Mets', 'city': 'New York', 'stadium': 'Citi Field'},
'NYY': {'name': 'New York Yankees', 'city': 'New York', 'stadium': 'Yankee Stadium'},
'OAK': {'name': 'Oakland Athletics', 'city': 'Sacramento', 'stadium': 'Sutter Health Park'},
'PHI': {'name': 'Philadelphia Phillies', 'city': 'Philadelphia', 'stadium': 'Citizens Bank Park'},
'PIT': {'name': 'Pittsburgh Pirates', 'city': 'Pittsburgh', 'stadium': 'PNC Park'},
'SDP': {'name': 'San Diego Padres', 'city': 'San Diego', 'stadium': 'Petco Park'},
'SFG': {'name': 'San Francisco Giants', 'city': 'San Francisco', 'stadium': 'Oracle Park'},
'SEA': {'name': 'Seattle Mariners', 'city': 'Seattle', 'stadium': 'T-Mobile Park'},
'STL': {'name': 'St. Louis Cardinals', 'city': 'St. Louis', 'stadium': 'Busch Stadium'},
'TBR': {'name': 'Tampa Bay Rays', 'city': 'St. Petersburg', 'stadium': 'Tropicana Field'},
'TEX': {'name': 'Texas Rangers', 'city': 'Arlington', 'stadium': 'Globe Life Field'},
'TOR': {'name': 'Toronto Blue Jays', 'city': 'Toronto', 'stadium': 'Rogers Centre'},
'WSN': {'name': 'Washington Nationals', 'city': 'Washington', 'stadium': 'Nationals Park'},
}
# NHL: abbreviation -> team name, home city, home arena.
# NOTE(review): 'ARI' maps to Utah Hockey Club — presumably the legacy Arizona
# abbreviation retained after relocation; confirm against downstream consumers.
NHL_TEAMS = {
'ANA': {'name': 'Anaheim Ducks', 'city': 'Anaheim', 'arena': 'Honda Center'},
'ARI': {'name': 'Utah Hockey Club', 'city': 'Salt Lake City', 'arena': 'Delta Center'},
'BOS': {'name': 'Boston Bruins', 'city': 'Boston', 'arena': 'TD Garden'},
'BUF': {'name': 'Buffalo Sabres', 'city': 'Buffalo', 'arena': 'KeyBank Center'},
'CGY': {'name': 'Calgary Flames', 'city': 'Calgary', 'arena': 'Scotiabank Saddledome'},
'CAR': {'name': 'Carolina Hurricanes', 'city': 'Raleigh', 'arena': 'PNC Arena'},
'CHI': {'name': 'Chicago Blackhawks', 'city': 'Chicago', 'arena': 'United Center'},
'COL': {'name': 'Colorado Avalanche', 'city': 'Denver', 'arena': 'Ball Arena'},
'CBJ': {'name': 'Columbus Blue Jackets', 'city': 'Columbus', 'arena': 'Nationwide Arena'},
'DAL': {'name': 'Dallas Stars', 'city': 'Dallas', 'arena': 'American Airlines Center'},
'DET': {'name': 'Detroit Red Wings', 'city': 'Detroit', 'arena': 'Little Caesars Arena'},
'EDM': {'name': 'Edmonton Oilers', 'city': 'Edmonton', 'arena': 'Rogers Place'},
'FLA': {'name': 'Florida Panthers', 'city': 'Sunrise', 'arena': 'Amerant Bank Arena'},
'LAK': {'name': 'Los Angeles Kings', 'city': 'Los Angeles', 'arena': 'Crypto.com Arena'},
'MIN': {'name': 'Minnesota Wild', 'city': 'St. Paul', 'arena': 'Xcel Energy Center'},
'MTL': {'name': 'Montreal Canadiens', 'city': 'Montreal', 'arena': 'Bell Centre'},
'NSH': {'name': 'Nashville Predators', 'city': 'Nashville', 'arena': 'Bridgestone Arena'},
'NJD': {'name': 'New Jersey Devils', 'city': 'Newark', 'arena': 'Prudential Center'},
'NYI': {'name': 'New York Islanders', 'city': 'Elmont', 'arena': 'UBS Arena'},
'NYR': {'name': 'New York Rangers', 'city': 'New York', 'arena': 'Madison Square Garden'},
'OTT': {'name': 'Ottawa Senators', 'city': 'Ottawa', 'arena': 'Canadian Tire Centre'},
'PHI': {'name': 'Philadelphia Flyers', 'city': 'Philadelphia', 'arena': 'Wells Fargo Center'},
'PIT': {'name': 'Pittsburgh Penguins', 'city': 'Pittsburgh', 'arena': 'PPG Paints Arena'},
'SJS': {'name': 'San Jose Sharks', 'city': 'San Jose', 'arena': 'SAP Center'},
'SEA': {'name': 'Seattle Kraken', 'city': 'Seattle', 'arena': 'Climate Pledge Arena'},
'STL': {'name': 'St. Louis Blues', 'city': 'St. Louis', 'arena': 'Enterprise Center'},
'TBL': {'name': 'Tampa Bay Lightning', 'city': 'Tampa', 'arena': 'Amalie Arena'},
'TOR': {'name': 'Toronto Maple Leafs', 'city': 'Toronto', 'arena': 'Scotiabank Arena'},
'VAN': {'name': 'Vancouver Canucks', 'city': 'Vancouver', 'arena': 'Rogers Arena'},
'VGK': {'name': 'Vegas Golden Knights', 'city': 'Las Vegas', 'arena': 'T-Mobile Arena'},
'WSH': {'name': 'Washington Capitals', 'city': 'Washington', 'arena': 'Capital One Arena'},
'WPG': {'name': 'Winnipeg Jets', 'city': 'Winnipeg', 'arena': 'Canada Life Centre'},
}
# WNBA: abbreviation -> team name, home city, home arena.
WNBA_TEAMS = {
'ATL': {'name': 'Atlanta Dream', 'city': 'College Park', 'arena': 'Gateway Center Arena'},
'CHI': {'name': 'Chicago Sky', 'city': 'Chicago', 'arena': 'Wintrust Arena'},
'CON': {'name': 'Connecticut Sun', 'city': 'Uncasville', 'arena': 'Mohegan Sun Arena'},
'DAL': {'name': 'Dallas Wings', 'city': 'Arlington', 'arena': 'College Park Center'},
'GSV': {'name': 'Golden State Valkyries', 'city': 'San Francisco', 'arena': 'Chase Center'},
'IND': {'name': 'Indiana Fever', 'city': 'Indianapolis', 'arena': 'Gainbridge Fieldhouse'},
'LVA': {'name': 'Las Vegas Aces', 'city': 'Las Vegas', 'arena': 'Michelob Ultra Arena'},
'LAS': {'name': 'Los Angeles Sparks', 'city': 'Los Angeles', 'arena': 'Crypto.com Arena'},
'MIN': {'name': 'Minnesota Lynx', 'city': 'Minneapolis', 'arena': 'Target Center'},
'NYL': {'name': 'New York Liberty', 'city': 'Brooklyn', 'arena': 'Barclays Center'},
'PHX': {'name': 'Phoenix Mercury', 'city': 'Phoenix', 'arena': 'Footprint Center'},
'SEA': {'name': 'Seattle Storm', 'city': 'Seattle', 'arena': 'Climate Pledge Arena'},
'WAS': {'name': 'Washington Mystics', 'city': 'Washington', 'arena': 'Entertainment & Sports Arena'},
}
# MLS: abbreviation -> club name, home city, home stadium.
MLS_TEAMS = {
'ATL': {'name': 'Atlanta United FC', 'city': 'Atlanta', 'stadium': 'Mercedes-Benz Stadium'},
'ATX': {'name': 'Austin FC', 'city': 'Austin', 'stadium': 'Q2 Stadium'},
'CLT': {'name': 'Charlotte FC', 'city': 'Charlotte', 'stadium': 'Bank of America Stadium'},
'CHI': {'name': 'Chicago Fire FC', 'city': 'Chicago', 'stadium': 'Soldier Field'},
'CIN': {'name': 'FC Cincinnati', 'city': 'Cincinnati', 'stadium': 'TQL Stadium'},
'COL': {'name': 'Colorado Rapids', 'city': 'Commerce City', 'stadium': 'Dick\'s Sporting Goods Park'},
'CLB': {'name': 'Columbus Crew', 'city': 'Columbus', 'stadium': 'Lower.com Field'},
'DAL': {'name': 'FC Dallas', 'city': 'Frisco', 'stadium': 'Toyota Stadium'},
'DCU': {'name': 'D.C. United', 'city': 'Washington', 'stadium': 'Audi Field'},
'HOU': {'name': 'Houston Dynamo FC', 'city': 'Houston', 'stadium': 'Shell Energy Stadium'},
'LAG': {'name': 'LA Galaxy', 'city': 'Carson', 'stadium': 'Dignity Health Sports Park'},
'LAFC': {'name': 'Los Angeles FC', 'city': 'Los Angeles', 'stadium': 'BMO Stadium'},
'MIA': {'name': 'Inter Miami CF', 'city': 'Fort Lauderdale', 'stadium': 'Chase Stadium'},
'MIN': {'name': 'Minnesota United FC', 'city': 'St. Paul', 'stadium': 'Allianz Field'},
'MTL': {'name': 'CF Montréal', 'city': 'Montreal', 'stadium': 'Stade Saputo'},
'NSH': {'name': 'Nashville SC', 'city': 'Nashville', 'stadium': 'Geodis Park'},
'NER': {'name': 'New England Revolution', 'city': 'Foxborough', 'stadium': 'Gillette Stadium'},
'NYC': {'name': 'New York City FC', 'city': 'New York', 'stadium': 'Yankee Stadium'},
'RBNY': {'name': 'New York Red Bulls', 'city': 'Harrison', 'stadium': 'Red Bull Arena'},
'ORL': {'name': 'Orlando City SC', 'city': 'Orlando', 'stadium': 'Inter&Co Stadium'},
'PHI': {'name': 'Philadelphia Union', 'city': 'Chester', 'stadium': 'Subaru Park'},
'POR': {'name': 'Portland Timbers', 'city': 'Portland', 'stadium': 'Providence Park'},
'RSL': {'name': 'Real Salt Lake', 'city': 'Sandy', 'stadium': 'America First Field'},
'SJE': {'name': 'San Jose Earthquakes', 'city': 'San Jose', 'stadium': 'PayPal Park'},
'SEA': {'name': 'Seattle Sounders FC', 'city': 'Seattle', 'stadium': 'Lumen Field'},
'SKC': {'name': 'Sporting Kansas City', 'city': 'Kansas City', 'stadium': 'Children\'s Mercy Park'},
'STL': {'name': 'St. Louis City SC', 'city': 'St. Louis', 'stadium': 'CityPark'},
'TOR': {'name': 'Toronto FC', 'city': 'Toronto', 'stadium': 'BMO Field'},
'VAN': {'name': 'Vancouver Whitecaps FC', 'city': 'Vancouver', 'stadium': 'BC Place'},
'SDG': {'name': 'San Diego FC', 'city': 'San Diego', 'stadium': 'Snapdragon Stadium'},
}
# NWSL: abbreviation -> club name, home city, home stadium.
NWSL_TEAMS = {
'ANG': {'name': 'Angel City FC', 'city': 'Los Angeles', 'stadium': 'BMO Stadium'},
'BAY': {'name': 'Bay FC', 'city': 'San Jose', 'stadium': 'PayPal Park'},
'CHI': {'name': 'Chicago Red Stars', 'city': 'Chicago', 'stadium': 'SeatGeek Stadium'},
'HOU': {'name': 'Houston Dash', 'city': 'Houston', 'stadium': 'Shell Energy Stadium'},
'KCC': {'name': 'Kansas City Current', 'city': 'Kansas City', 'stadium': 'CPKC Stadium'},
'NJY': {'name': 'NJ/NY Gotham FC', 'city': 'Harrison', 'stadium': 'Red Bull Arena'},
'NCC': {'name': 'North Carolina Courage', 'city': 'Cary', 'stadium': 'WakeMed Soccer Park'},
'ORL': {'name': 'Orlando Pride', 'city': 'Orlando', 'stadium': 'Inter&Co Stadium'},
'POR': {'name': 'Portland Thorns FC', 'city': 'Portland', 'stadium': 'Providence Park'},
'RGN': {'name': 'Seattle Reign FC', 'city': 'Seattle', 'stadium': 'Lumen Field'},
'SDW': {'name': 'San Diego Wave FC', 'city': 'San Diego', 'stadium': 'Snapdragon Stadium'},
'UTA': {'name': 'Utah Royals FC', 'city': 'Sandy', 'stadium': 'America First Field'},
'WSH': {'name': 'Washington Spirit', 'city': 'Washington', 'stadium': 'Audi Field'},
}
# NFL Teams and Stadiums
# NFL: abbreviation -> team name, home city, home stadium.
NFL_TEAMS = {
'ARI': {'name': 'Arizona Cardinals', 'city': 'Glendale', 'stadium': 'State Farm Stadium'},
'ATL': {'name': 'Atlanta Falcons', 'city': 'Atlanta', 'stadium': 'Mercedes-Benz Stadium'},
'BAL': {'name': 'Baltimore Ravens', 'city': 'Baltimore', 'stadium': 'M&T Bank Stadium'},
'BUF': {'name': 'Buffalo Bills', 'city': 'Orchard Park', 'stadium': 'Highmark Stadium'},
'CAR': {'name': 'Carolina Panthers', 'city': 'Charlotte', 'stadium': 'Bank of America Stadium'},
'CHI': {'name': 'Chicago Bears', 'city': 'Chicago', 'stadium': 'Soldier Field'},
'CIN': {'name': 'Cincinnati Bengals', 'city': 'Cincinnati', 'stadium': 'Paycor Stadium'},
'CLE': {'name': 'Cleveland Browns', 'city': 'Cleveland', 'stadium': 'Cleveland Browns Stadium'},
'DAL': {'name': 'Dallas Cowboys', 'city': 'Arlington', 'stadium': 'AT&T Stadium'},
'DEN': {'name': 'Denver Broncos', 'city': 'Denver', 'stadium': 'Empower Field at Mile High'},
'DET': {'name': 'Detroit Lions', 'city': 'Detroit', 'stadium': 'Ford Field'},
'GB': {'name': 'Green Bay Packers', 'city': 'Green Bay', 'stadium': 'Lambeau Field'},
'HOU': {'name': 'Houston Texans', 'city': 'Houston', 'stadium': 'NRG Stadium'},
'IND': {'name': 'Indianapolis Colts', 'city': 'Indianapolis', 'stadium': 'Lucas Oil Stadium'},
'JAX': {'name': 'Jacksonville Jaguars', 'city': 'Jacksonville', 'stadium': 'EverBank Stadium'},
'KC': {'name': 'Kansas City Chiefs', 'city': 'Kansas City', 'stadium': 'GEHA Field at Arrowhead Stadium'},
'LV': {'name': 'Las Vegas Raiders', 'city': 'Las Vegas', 'stadium': 'Allegiant Stadium'},
'LAC': {'name': 'Los Angeles Chargers', 'city': 'Inglewood', 'stadium': 'SoFi Stadium'},
'LAR': {'name': 'Los Angeles Rams', 'city': 'Inglewood', 'stadium': 'SoFi Stadium'},
'MIA': {'name': 'Miami Dolphins', 'city': 'Miami Gardens', 'stadium': 'Hard Rock Stadium'},
'MIN': {'name': 'Minnesota Vikings', 'city': 'Minneapolis', 'stadium': 'U.S. Bank Stadium'},
'NE': {'name': 'New England Patriots', 'city': 'Foxborough', 'stadium': 'Gillette Stadium'},
'NO': {'name': 'New Orleans Saints', 'city': 'New Orleans', 'stadium': 'Caesars Superdome'},
'NYG': {'name': 'New York Giants', 'city': 'East Rutherford', 'stadium': 'MetLife Stadium'},
'NYJ': {'name': 'New York Jets', 'city': 'East Rutherford', 'stadium': 'MetLife Stadium'},
'PHI': {'name': 'Philadelphia Eagles', 'city': 'Philadelphia', 'stadium': 'Lincoln Financial Field'},
'PIT': {'name': 'Pittsburgh Steelers', 'city': 'Pittsburgh', 'stadium': 'Acrisure Stadium'},
'SF': {'name': 'San Francisco 49ers', 'city': 'Santa Clara', 'stadium': 'Levi\'s Stadium'},
'SEA': {'name': 'Seattle Seahawks', 'city': 'Seattle', 'stadium': 'Lumen Field'},
'TB': {'name': 'Tampa Bay Buccaneers', 'city': 'Tampa', 'stadium': 'Raymond James Stadium'},
'TEN': {'name': 'Tennessee Titans', 'city': 'Nashville', 'stadium': 'Nissan Stadium'},
'WAS': {'name': 'Washington Commanders', 'city': 'Landover', 'stadium': 'Northwest Stadium'},
}
# =============================================================================
# SCRAPERS - NBA
# =============================================================================
def scrape_nba_basketball_reference(season: int) -> list[Game]:
    """
    Scrape NBA schedule from Basketball-Reference.
    URL: https://www.basketball-reference.com/leagues/NBA_{YEAR}_games-{month}.html
    Season year is the ending year (e.g., 2025 for 2024-25 season).

    Returns a list of Game objects; months or rows that fail to fetch/parse
    are skipped rather than aborting the whole scrape.
    """
    games = []
    months = ['october', 'november', 'december', 'january', 'february', 'march', 'april', 'may', 'june']
    print(f"Scraping NBA {season} from Basketball-Reference...")
    for month in months:
        url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html"
        soup = fetch_page(url, 'basketball-reference.com')
        if not soup:
            continue
        table = soup.find('table', {'id': 'schedule'})
        if not table:
            continue
        tbody = table.find('tbody')
        if not tbody:
            continue
        for row in tbody.find_all('tr'):
            # Skip repeated header rows embedded inside the body
            if row.get('class') and 'thead' in row.get('class'):
                continue
            cells = row.find_all(['td', 'th'])
            if len(cells) < 6:
                continue
            try:
                # Date lives in the row-header cell
                date_cell = row.find('th', {'data-stat': 'date_game'})
                if not date_cell:
                    continue
                date_link = date_cell.find('a')
                date_str = date_link.text if date_link else date_cell.text
                # Tip-off time (may be absent)
                time_cell = row.find('td', {'data-stat': 'game_start_time'})
                time_str = time_cell.text.strip() if time_cell else None
                # Teams
                visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
                home_cell = row.find('td', {'data-stat': 'home_team_name'})
                if not visitor_cell or not home_cell:
                    continue
                visitor_link = visitor_cell.find('a')
                home_link = home_cell.find('a')
                away_team = visitor_link.text if visitor_link else visitor_cell.text
                home_team = home_link.text if home_link else home_cell.text
                # Arena
                arena_cell = row.find('td', {'data-stat': 'arena_name'})
                arena = arena_cell.text.strip() if arena_cell else ''
                # Convert date like "Tue, Oct 22, 2024" to ISO; skip rows that
                # don't match (was a bare except: — narrowed to ValueError)
                try:
                    parsed_date = datetime.strptime(date_str.strip(), '%a, %b %d, %Y')
                    date_formatted = parsed_date.strftime('%Y-%m-%d')
                except ValueError:
                    continue
                # Game ID uses the first 3 letters of each team NAME (not the
                # abbreviation) — kept as-is for backward compatibility
                game_id = f"nba_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')
                game = Game(
                    id=game_id,
                    sport='NBA',
                    season=f"{season-1}-{str(season)[2:]}",
                    date=date_formatted,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=get_team_abbrev(home_team, 'NBA'),
                    away_team_abbrev=get_team_abbrev(away_team, 'NBA'),
                    venue=arena,
                    source='basketball-reference.com'
                )
                games.append(game)
            except Exception as e:
                print(f" Error parsing row: {e}")
                continue
    print(f" Found {len(games)} games from Basketball-Reference")
    return games
def scrape_nba_espn(season: int) -> list[Game]:
    """
    Scrape NBA schedule from ESPN.
    URL: https://www.espn.com/nba/schedule/_/date/{YYYYMMDD}

    NOTE: ESPN renders schedules with JavaScript, so this plain-HTML pass
    currently collects nothing; a full implementation would need Selenium.
    """
    games = []
    print(f"Scraping NBA {season} from ESPN...")
    # The NBA season spans October of the prior year through June
    season_start = datetime(season - 1, 10, 1)
    season_end = datetime(season, 6, 30)
    cursor = season_start
    while cursor <= season_end:
        stamp = cursor.strftime('%Y%m%d')
        url = f"https://www.espn.com/nba/schedule/_/date/{stamp}"
        page = fetch_page(url, 'espn.com')
        if page:
            # ESPN uses JavaScript rendering, so we need to parse what's available
            # This is a simplified version - full implementation would need Selenium
            pass
        cursor += timedelta(days=7)  # Sample weekly to respect rate limits
    print(f" Found {len(games)} games from ESPN")
    return games
def scrape_nba_cbssports(season: int) -> list[Game]:
    """
    Fetch NBA schedule from CBS Sports.
    CBS Sports provides a JSON API for schedule data.

    NOTE: the game date is currently a placeholder (today's date) — the page's
    per-section dates are not yet parsed.
    """
    games = []
    print(f"Fetching NBA {season} from CBS Sports...")
    # CBS Sports has a schedule endpoint
    url = "https://www.cbssports.com/nba/schedule/"
    page = fetch_page(url, 'cbssports.com')
    if not page:
        return games
    # Walk every schedule table and pull matchups row by row
    for schedule_table in page.find_all('table', class_='TableBase-table'):
        for matchup_row in schedule_table.find_all('tr'):
            try:
                if len(matchup_row.find_all('td')) < 2:
                    continue
                team_links = matchup_row.find_all('a', class_='TeamName')
                if len(team_links) < 2:
                    continue
                away_team = team_links[0].get_text(strip=True)
                home_team = team_links[1].get_text(strip=True)
                # Get date from table section
                date_formatted = datetime.now().strftime('%Y-%m-%d')  # Placeholder
                game_id = f"nba_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')
                games.append(Game(
                    id=game_id,
                    sport='NBA',
                    season=str(season),
                    date=date_formatted,
                    time=None,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=get_team_abbrev(home_team, 'NBA'),
                    away_team_abbrev=get_team_abbrev(away_team, 'NBA'),
                    venue='',
                    source='cbssports.com'
                ))
            except Exception:
                continue
    print(f" Found {len(games)} games from CBS Sports")
    return games
# =============================================================================
# SCRAPERS - MLB
# =============================================================================
def scrape_mlb_baseball_reference(season: int) -> list[Game]:
    """
    Scrape MLB schedule from Baseball-Reference.
    URL: https://www.baseball-reference.com/leagues/majors/{YEAR}-schedule.shtml

    Games are grouped under h3 date headers; each game paragraph is attributed
    to the most recently seen date. Rows without a resolvable date are skipped.
    """
    games = []
    url = f"https://www.baseball-reference.com/leagues/majors/{season}-schedule.shtml"
    print(f"Scraping MLB {season} from Baseball-Reference...")
    soup = fetch_page(url, 'baseball-reference.com')
    if not soup:
        return games
    # Baseball-Reference groups games by date in h3 headers
    current_date = None
    # Find the schedule section; fall back to the whole document
    schedule_div = soup.find('div', {'id': 'all_schedule'})
    if not schedule_div:
        schedule_div = soup
    # Process all elements in order so each game inherits the preceding date
    for element in schedule_div.find_all(['h3', 'p', 'div']):
        if element.name == 'h3':
            # Parse date like "Thursday, March 27, 2025"; several formats seen.
            # (Was a nested bare except: — narrowed to ValueError, which is
            # what strptime raises on a non-matching format.)
            date_text = element.get_text(strip=True)
            for fmt in ['%A, %B %d, %Y', '%B %d, %Y', '%a, %b %d, %Y']:
                try:
                    current_date = datetime.strptime(date_text, fmt).strftime('%Y-%m-%d')
                    break
                except ValueError:
                    continue
        elif element.name == 'p' and 'game' in element.get('class', []):
            if not current_date:
                continue
            try:
                links = element.find_all('a')
                if len(links) >= 2:
                    away_team = links[0].text.strip()
                    home_team = links[1].text.strip()
                    # Generate unique game ID from abbreviations
                    away_abbrev = get_team_abbrev(away_team, 'MLB')
                    home_abbrev = get_team_abbrev(home_team, 'MLB')
                    game_id = f"mlb_br_{current_date}_{away_abbrev}_{home_abbrev}".lower()
                    game = Game(
                        id=game_id,
                        sport='MLB',
                        season=str(season),
                        date=current_date,
                        time=None,
                        home_team=home_team,
                        away_team=away_team,
                        home_team_abbrev=home_abbrev,
                        away_team_abbrev=away_abbrev,
                        venue='',
                        source='baseball-reference.com'
                    )
                    games.append(game)
            except Exception:
                # Best-effort per-game parse: malformed entries are skipped
                continue
    print(f" Found {len(games)} games from Baseball-Reference")
    return games
def scrape_mlb_statsapi(season: int) -> list[Game]:
    """
    Fetch MLB schedule from official Stats API (JSON).
    URL: https://statsapi.mlb.com/api/v1/schedule?sportId=1&season={YEAR}&gameType=R
    """
    games = []
    url = f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&season={season}&gameType=R&hydrate=team,venue"
    print(f"Fetching MLB {season} from Stats API...")
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        payload = resp.json()
        # Response is grouped by calendar date, each with its own game list
        for day in payload.get('dates', []):
            day_str = day.get('date', '')
            for entry in day.get('games', []):
                try:
                    sides = entry.get('teams', {})
                    away_info = sides.get('away', {}).get('team', {})
                    home_info = sides.get('home', {}).get('team', {})
                    venue_info = entry.get('venue', {})
                    # gameDate is an ISO timestamp; keep just HH:MM
                    stamp = entry.get('gameDate', '')
                    start = stamp.split('T')[1][:5] if 'T' in stamp else None
                    games.append(Game(
                        id='',  # Will be assigned by assign_stable_ids
                        sport='MLB',
                        season=str(season),
                        date=day_str,
                        time=start,
                        home_team=home_info.get('name', ''),
                        away_team=away_info.get('name', ''),
                        home_team_abbrev=home_info.get('abbreviation', ''),
                        away_team_abbrev=away_info.get('abbreviation', ''),
                        venue=venue_info.get('name', ''),
                        source='statsapi.mlb.com'
                    ))
                except Exception:
                    continue
    except Exception as exc:
        print(f" Error fetching MLB API: {exc}")
    print(f" Found {len(games)} games from MLB Stats API")
    return games
def scrape_mlb_espn(season: int) -> list[Game]:
    """Fetch MLB schedule from ESPN's scoreboard API over the regular season window."""
    games = []
    print(f"Fetching MLB {season} from ESPN API...")
    # MLB regular season: Late March - Early October
    window_start = f"{season}0320"
    window_end = f"{season}1010"
    url = "https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard"
    params = {
        'dates': f"{window_start}-{window_end}",
        'limit': 1000
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    try:
        resp = requests.get(url, params=params, headers=headers, timeout=30)
        resp.raise_for_status()
        payload = resp.json()
        for event in payload.get('events', []):
            try:
                # Event date is ISO-like: YYYY-MM-DDTHH:MM...
                raw_date = event.get('date', '')
                date_str = raw_date[:10]
                time_str = raw_date[11:16] if len(raw_date) > 11 else None
                competitions = event.get('competitions', [{}])
                if not competitions:
                    continue
                competition = competitions[0]
                competitors = competition.get('competitors', [])
                if len(competitors) < 2:
                    continue
                home_team = away_team = home_abbrev = away_abbrev = None
                for side in competitors:
                    info = side.get('team', {})
                    label = info.get('displayName', info.get('name', ''))
                    abbr = info.get('abbreviation', '')
                    if side.get('homeAway') == 'home':
                        home_team, home_abbrev = label, abbr
                    else:
                        away_team, away_abbrev = label, abbr
                if not home_team or not away_team:
                    continue
                venue = competition.get('venue', {}).get('fullName', '')
                game_id = f"mlb_{date_str}_{away_abbrev}_{home_abbrev}".lower()
                games.append(Game(
                    id=game_id,
                    sport='MLB',
                    season=str(season),
                    date=date_str,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=home_abbrev or get_team_abbrev(home_team, 'MLB'),
                    away_team_abbrev=away_abbrev or get_team_abbrev(away_team, 'MLB'),
                    venue=venue,
                    source='espn.com'
                ))
            except Exception:
                continue
        print(f" Found {len(games)} games from ESPN")
    except Exception as exc:
        print(f"Error fetching ESPN MLB: {exc}")
    return games
# =============================================================================
# SCRAPERS - NHL
# =============================================================================
def scrape_nhl_hockey_reference(season: int) -> list[Game]:
    """
    Scrape NHL schedule from Hockey-Reference.
    URL: https://www.hockey-reference.com/leagues/NHL_{YEAR}_games.html

    Args:
        season: Season ending year (e.g. 2026 for the 2025-26 season).

    Returns:
        List of Game records; empty on fetch/parse failure.
    """
    games = []
    url = f"https://www.hockey-reference.com/leagues/NHL_{season}_games.html"
    print(f"Scraping NHL {season} from Hockey-Reference...")
    soup = fetch_page(url, 'hockey-reference.com')
    if not soup:
        return games
    table = soup.find('table', {'id': 'games'})
    if not table:
        print(" Could not find games table")
        return games
    tbody = table.find('tbody')
    if not tbody:
        return games
    for row in tbody.find_all('tr'):
        try:
            cells = row.find_all(['td', 'th'])
            if len(cells) < 5:
                continue
            # Date lives in a <th>, sometimes wrapped in a link.
            date_cell = row.find('th', {'data-stat': 'date_game'})
            if not date_cell:
                continue
            date_link = date_cell.find('a')
            date_str = date_link.text if date_link else date_cell.text
            # Teams
            visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
            home_cell = row.find('td', {'data-stat': 'home_team_name'})
            if not visitor_cell or not home_cell:
                continue
            visitor_link = visitor_cell.find('a')
            home_link = home_cell.find('a')
            away_team = visitor_link.text if visitor_link else visitor_cell.text
            home_team = home_link.text if home_link else home_cell.text
            # Normalize date; skip rows whose date does not parse.
            # (was a bare `except:` — narrowed to the strptime failure)
            try:
                parsed_date = datetime.strptime(date_str.strip(), '%Y-%m-%d')
                date_formatted = parsed_date.strftime('%Y-%m-%d')
            except ValueError:
                continue
            game_id = f"nhl_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')
            game = Game(
                id=game_id,
                sport='NHL',
                season=f"{season-1}-{str(season)[2:]}",
                date=date_formatted,
                time=None,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=get_team_abbrev(home_team, 'NHL'),
                away_team_abbrev=get_team_abbrev(away_team, 'NHL'),
                venue='',
                source='hockey-reference.com'
            )
            games.append(game)
        except Exception:
            # Best-effort per-row parsing: one malformed row must not abort the scrape.
            continue
    print(f" Found {len(games)} games from Hockey-Reference")
    return games
def scrape_nhl_api(season: int) -> list[Game]:
    """
    Fetch NHL schedule from official API (JSON).
    URL: https://api-web.nhle.com/v1/schedule/{YYYY-MM-DD}

    NOTE(review): currently a stub — it only prints a status line and
    always returns an empty list. A full implementation would need to
    iterate the date-keyed endpoint (or per-club schedule endpoints).
    """
    games = []
    print(f"Fetching NHL {season} from NHL API...")
    # NHL API provides club schedules
    # We'd need to iterate through dates or teams
    # Simplified implementation here
    return games
def scrape_nhl_espn(season: int) -> list[Game]:
    """
    Fetch NHL schedule from ESPN API.

    Args:
        season: Season ending year (e.g. 2026 for the 2025-26 season).

    Returns:
        List of Game records; empty on request failure.
    """
    games = []
    print(f"Fetching NHL {season} from ESPN API...")
    # NHL regular season: October - April (spans calendar years)
    start = f"{season-1}1001"
    end = f"{season}0430"
    url = "https://site.api.espn.com/apis/site/v2/sports/hockey/nhl/scoreboard"
    params = {
        'dates': f"{start}-{end}",
        'limit': 1000
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        events = data.get('events', [])
        for event in events:
            try:
                # ESPN event dates are ISO-8601 (YYYY-MM-DDTHH:MM...).
                date_str = event.get('date', '')[:10]
                time_str = event.get('date', '')[11:16] if len(event.get('date', '')) > 11 else None
                competitions = event.get('competitions', [{}])
                if not competitions:
                    continue
                comp = competitions[0]
                competitors = comp.get('competitors', [])
                if len(competitors) < 2:
                    continue
                home_team = away_team = home_abbrev = away_abbrev = None
                for team in competitors:
                    team_data = team.get('team', {})
                    team_name = team_data.get('displayName', team_data.get('name', ''))
                    team_abbrev = team_data.get('abbreviation', '')
                    if team.get('homeAway') == 'home':
                        home_team = team_name
                        home_abbrev = team_abbrev
                    else:
                        away_team = team_name
                        away_abbrev = team_abbrev
                if not home_team or not away_team:
                    continue
                venue = comp.get('venue', {}).get('fullName', '')
                game_id = f"nhl_{date_str}_{away_abbrev}_{home_abbrev}".lower()
                game = Game(
                    id=game_id,
                    sport='NHL',
                    # FIX: use the same "YYYY-YY" season label as the
                    # Hockey-Reference NHL scraper so games from the two
                    # sources can be cross-validated (was str(season)).
                    season=f"{season-1}-{str(season)[2:]}",
                    date=date_str,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=home_abbrev or get_team_abbrev(home_team, 'NHL'),
                    away_team_abbrev=away_abbrev or get_team_abbrev(away_team, 'NHL'),
                    venue=venue,
                    source='espn.com'
                )
                games.append(game)
            except Exception:
                # Skip malformed events rather than aborting the whole fetch.
                continue
        print(f" Found {len(games)} games from ESPN")
    except Exception as e:
        print(f"Error fetching ESPN NHL: {e}")
    return games
# =============================================================================
# SCRAPERS - ESPN API (WNBA, MLS, NWSL)
# =============================================================================
def scrape_espn_schedule(sport: str, league: str, season: int, date_range: tuple[str, str]) -> list[Game]:
    """
    Fetch schedule from ESPN API.
    Args:
        sport: 'basketball' or 'soccer'
        league: 'wnba', 'usa.1' (MLS), 'usa.nwsl' (NWSL)
        season: Season year
        date_range: (start_date, end_date) in YYYYMMDD format
    Returns:
        List of Game records parsed from the scoreboard feed; empty list on
        request failure. Individual events that fail to parse are skipped.
    """
    games = []
    # Map ESPN league slugs to this pipeline's sport codes; unknown slugs
    # fall back to the upper-cased slug itself.
    sport_upper = {
        'wnba': 'WNBA',
        'usa.1': 'MLS',
        'usa.nwsl': 'NWSL',
        'nfl': 'NFL',
        'mens-college-basketball': 'CBB'
    }.get(league, league.upper())
    print(f"Fetching {sport_upper} {season} from ESPN API...")
    url = f"https://site.api.espn.com/apis/site/v2/sports/{sport}/{league}/scoreboard"
    params = {
        'dates': f"{date_range[0]}-{date_range[1]}",
        'limit': 1000
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        events = data.get('events', [])
        for event in events:
            try:
                # Parse date/time; event['date'] is ISO-8601 (YYYY-MM-DDTHH:MM...).
                date_str = event.get('date', '')[:10]  # YYYY-MM-DD
                time_str = event.get('date', '')[11:16] if len(event.get('date', '')) > 11 else None
                # Get teams from the first (only) competition of the event.
                competitions = event.get('competitions', [{}])
                if not competitions:
                    continue
                comp = competitions[0]
                competitors = comp.get('competitors', [])
                if len(competitors) < 2:
                    continue
                home_team = None
                away_team = None
                home_abbrev = None
                away_abbrev = None
                for team in competitors:
                    team_data = team.get('team', {})
                    team_name = team_data.get('displayName', team_data.get('name', ''))
                    team_abbrev = team_data.get('abbreviation', '')
                    if team.get('homeAway') == 'home':
                        home_team = team_name
                        home_abbrev = team_abbrev
                    else:
                        away_team = team_name
                        away_abbrev = team_abbrev
                if not home_team or not away_team:
                    continue
                # Get venue
                venue = comp.get('venue', {}).get('fullName', '')
                game_id = f"{sport_upper.lower()}_{date_str}_{away_abbrev}_{home_abbrev}".lower()
                game = Game(
                    id=game_id,
                    sport=sport_upper,
                    season=str(season),
                    date=date_str,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    # Prefer ESPN's abbreviation; fall back to the local mapping.
                    home_team_abbrev=home_abbrev or get_team_abbrev(home_team, sport_upper),
                    away_team_abbrev=away_abbrev or get_team_abbrev(away_team, sport_upper),
                    venue=venue,
                    source='espn.com'
                )
                games.append(game)
            except Exception as e:
                # Best-effort: skip malformed events.
                continue
        print(f" Found {len(games)} games from ESPN")
    except Exception as e:
        print(f"Error fetching ESPN {sport_upper}: {e}")
    return games
def scrape_wnba_espn(season: int) -> list[Game]:
    """Fetch WNBA schedule from ESPN API."""
    # Regular season window: May through October.
    window = (f"{season}0501", f"{season}1031")
    return scrape_espn_schedule('basketball', 'wnba', season, window)
def scrape_mls_espn(season: int) -> list[Game]:
    """Fetch MLS schedule from ESPN API."""
    # Season window: February through December.
    window = (f"{season}0201", f"{season}1231")
    return scrape_espn_schedule('soccer', 'usa.1', season, window)
def scrape_nwsl_espn(season: int) -> list[Game]:
    """Fetch NWSL schedule from ESPN API."""
    # Season window: March through November.
    window = (f"{season}0301", f"{season}1130")
    return scrape_espn_schedule('soccer', 'usa.nwsl', season, window)
def scrape_nfl_espn(season: int) -> list[Game]:
    """Fetch NFL schedule from ESPN API."""
    # Season spans calendar years: September of the prior year
    # through the end of February of the ending year.
    window = (f"{season-1}0901", f"{season}0228")
    return scrape_espn_schedule('football', 'nfl', season, window)
def scrape_nfl_pro_football_reference(season: int) -> list[Game]:
    """
    Scrape NFL schedule from Pro-Football-Reference.
    URL: https://www.pro-football-reference.com/years/{YEAR}/games.htm
    Season year is the starting year (e.g., 2025 for 2025-26 season)

    Args:
        season: Season ending year; PFR pages are keyed by the starting year.

    Returns:
        List of Game records; empty on fetch/parse failure.
    """
    games = []
    year = season - 1  # PFR uses starting year
    url = f"https://www.pro-football-reference.com/years/{year}/games.htm"
    print(f"Scraping NFL {season} from Pro-Football-Reference...")
    soup = fetch_page(url, 'pro-football-reference.com')
    if not soup:
        return games
    table = soup.find('table', {'id': 'games'})
    if not table:
        print(" Could not find games table")
        return games
    tbody = table.find('tbody')
    if not tbody:
        return games
    for row in tbody.find_all('tr'):
        # Repeated in-body header rows carry class "thead"; skip them.
        if row.get('class') and 'thead' in row.get('class'):
            continue
        try:
            date_cell = row.find('td', {'data-stat': 'game_date'})
            if not date_cell:
                continue
            date_str = date_cell.text.strip()
            # PFR lists winner/loser; game_location tells us who was home.
            winner_cell = row.find('td', {'data-stat': 'winner'})
            loser_cell = row.find('td', {'data-stat': 'loser'})
            location_cell = row.find('td', {'data-stat': 'game_location'})
            if not winner_cell or not loser_cell:
                continue
            winner_link = winner_cell.find('a')
            loser_link = loser_cell.find('a')
            winner = winner_link.text if winner_link else winner_cell.text.strip()
            loser = loser_link.text if loser_link else loser_cell.text.strip()
            # Determine home/away - '@' in game_location means winner was away
            is_at_loser = location_cell and '@' in location_cell.text
            if is_at_loser:
                home_team, away_team = loser, winner
            else:
                home_team, away_team = winner, loser
            # Dates appear either as ISO ("2025-09-07") or "September 7".
            # (was a bare `except:` — narrowed to the strptime failure)
            try:
                if '-' in date_str:
                    parsed_date = datetime.strptime(date_str, '%Y-%m-%d')
                else:
                    # Month-name dates lack a year; Jan/Feb games fall in the
                    # calendar year after the season's starting year.
                    month_str = date_str.split()[0]
                    if month_str in ['January', 'February']:
                        date_with_year = f"{date_str}, {year + 1}"
                    else:
                        date_with_year = f"{date_str}, {year}"
                    parsed_date = datetime.strptime(date_with_year, '%B %d, %Y')
                date_formatted = parsed_date.strftime('%Y-%m-%d')
            except ValueError:
                continue
            game_id = f"nfl_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')
            game = Game(
                id=game_id,
                sport='NFL',
                season=str(season),
                date=date_formatted,
                time=None,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=get_team_abbrev(home_team, 'NFL'),
                away_team_abbrev=get_team_abbrev(away_team, 'NFL'),
                venue='',
                source='pro-football-reference.com'
            )
            games.append(game)
        except Exception:
            # One malformed row must not abort the scrape.
            continue
    print(f" Found {len(games)} games from Pro-Football-Reference")
    return games
def scrape_nfl_cbssports(season: int) -> list[Game]:
    """
    Scrape NFL schedule from CBS Sports API.
    Provides more structured data than web scraping.

    NOTE(review): the static markup parsed here does not expose the game
    date, so every Game is stamped with today's date (see placeholder
    below) — the ids and dates from this source are not reliable for
    cross-validation until date parsing is implemented.
    """
    games = []
    year = season - 1  # CBS uses starting year
    print(f"Fetching NFL {season} from CBS Sports...")
    # CBS Sports schedule endpoint
    url = f"https://www.cbssports.com/nfl/schedule/{year}/regular/"
    soup = fetch_page(url, 'cbssports.com')
    if not soup:
        return games
    # Find game tables
    tables = soup.find_all('table', class_='TableBase-table')
    for table in tables:
        rows = table.find_all('tr')
        for row in rows:
            try:
                cells = row.find_all('td')
                if len(cells) < 3:
                    continue
                # Parse matchup: first cell is the away team, second the home team.
                away_cell = cells[0] if len(cells) > 0 else None
                home_cell = cells[1] if len(cells) > 1 else None
                if not away_cell or not home_cell:
                    continue
                away_team = away_cell.get_text(strip=True)
                home_team = home_cell.get_text(strip=True)
                if not away_team or not home_team:
                    continue
                # CBS includes @ symbol
                away_team = away_team.replace('@', '').strip()
                # Get date from parent section if available
                date_formatted = datetime.now().strftime('%Y-%m-%d')  # Placeholder
                game_id = f"nfl_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')
                game = Game(
                    id=game_id,
                    sport='NFL',
                    season=str(season),
                    date=date_formatted,
                    time=None,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=get_team_abbrev(home_team, 'NFL'),
                    away_team_abbrev=get_team_abbrev(away_team, 'NFL'),
                    venue='',
                    source='cbssports.com'
                )
                games.append(game)
            except Exception:
                # Best-effort: skip malformed rows.
                continue
    print(f" Found {len(games)} games from CBS Sports")
    return games
def scrape_cbb_espn(season: int) -> list[Game]:
    """Fetch College Basketball schedule from ESPN API (D1 only)."""
    # Season spans calendar years: November through mid-April.
    window = (f"{season-1}1101", f"{season}0415")
    return scrape_espn_schedule('basketball', 'mens-college-basketball', season, window)
def scrape_cbb_sports_reference(season: int) -> list[Game]:
    """
    Scrape College Basketball schedule from Sports-Reference.
    URL: https://www.sports-reference.com/cbb/seasons/{YEAR}-schedule.html

    Args:
        season: Season ending year (e.g. 2026 for the 2025-26 season).

    Returns:
        List of Game records; empty on fetch/parse failure.
    """
    games = []
    url = f"https://www.sports-reference.com/cbb/seasons/{season}-schedule.html"
    print(f"Scraping CBB {season} from Sports-Reference...")
    soup = fetch_page(url, 'sports-reference.com')
    if not soup:
        return games
    table = soup.find('table', {'id': 'schedule'})
    if not table:
        print(" Could not find schedule table")
        return games
    tbody = table.find('tbody')
    if not tbody:
        return games
    for row in tbody.find_all('tr'):
        # Skip repeated in-body header rows.
        if row.get('class') and 'thead' in row.get('class'):
            continue
        try:
            date_cell = row.find('td', {'data-stat': 'date_game'})
            if not date_cell:
                continue
            date_str = date_cell.text.strip()
            home_cell = row.find('td', {'data-stat': 'home_team_name'})
            away_cell = row.find('td', {'data-stat': 'away_team_name'})
            if not home_cell or not away_cell:
                continue
            home_team = home_cell.get_text(strip=True)
            away_team = away_cell.get_text(strip=True)
            # Dates look like "Nov 4, 2025"; skip rows that don't parse.
            try:
                parsed_date = datetime.strptime(date_str, '%b %d, %Y')
                date_formatted = parsed_date.strftime('%Y-%m-%d')
            except ValueError:
                continue
            game_id = f"cbb_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')
            game = Game(
                id=game_id,
                sport='CBB',
                season=str(season),
                date=date_formatted,
                time=None,
                home_team=home_team,
                away_team=away_team,
                # BUG FIX: abbreviations were previously swapped — home was
                # derived from away_team and vice versa.
                home_team_abbrev=home_team[:3].upper(),
                away_team_abbrev=away_team[:3].upper(),
                venue='',
                source='sports-reference.com'
            )
            games.append(game)
        except Exception:
            continue
    print(f" Found {len(games)} games from Sports-Reference")
    return games
def scrape_cbb_cbssports(season: int) -> list[Game]:
    """
    Fetch College Basketball schedule from CBS Sports.

    NOTE: the static markup does not expose the game date, so Games are
    stamped with today's date (placeholder below).

    Returns:
        List of Game records; empty on fetch failure.
    """
    games = []
    print(f"Fetching CBB {season} from CBS Sports...")
    url = "https://www.cbssports.com/college-basketball/schedule/"
    soup = fetch_page(url, 'cbssports.com')
    if not soup:
        return games
    tables = soup.find_all('table', class_='TableBase-table')
    for table in tables:
        rows = table.find_all('tr')
        for row in rows:
            try:
                cells = row.find_all('td')
                if len(cells) < 2:
                    continue
                # Team names are in anchor elements with class "TeamName":
                # first is the visitor, second the host.
                team_cells = row.find_all('a', class_='TeamName')
                if len(team_cells) < 2:
                    continue
                away_team = team_cells[0].get_text(strip=True)
                home_team = team_cells[1].get_text(strip=True)
                date_formatted = datetime.now().strftime('%Y-%m-%d')  # placeholder date
                game_id = f"cbb_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')
                game = Game(
                    id=game_id,
                    sport='CBB',
                    season=str(season),
                    date=date_formatted,
                    time=None,
                    home_team=home_team,
                    away_team=away_team,
                    # BUG FIX: abbreviations were previously swapped — home was
                    # derived from away_team and vice versa.
                    home_team_abbrev=home_team[:3].upper(),
                    away_team_abbrev=away_team[:3].upper(),
                    venue='',
                    source='cbssports.com'
                )
                games.append(game)
            except Exception:
                continue
    print(f" Found {len(games)} games from CBS Sports")
    return games
def scrape_wnba_cbssports(season: int) -> list[Game]:
    """Fetch WNBA schedule from CBS Sports."""
    games = []
    print(f"Fetching WNBA {season} from CBS Sports...")
    url = "https://www.cbssports.com/wnba/schedule/"
    soup = fetch_page(url, 'cbssports.com')
    if not soup:
        return games
    for table in soup.find_all('table', class_='TableBase-table'):
        for row in table.find_all('tr'):
            try:
                if len(row.find_all('td')) < 2:
                    continue
                # First TeamName anchor is the visitor, second the host.
                links = row.find_all('a', class_='TeamName')
                if len(links) < 2:
                    continue
                visitor = links[0].get_text(strip=True)
                host = links[1].get_text(strip=True)
                # Date is not exposed in the static markup; stamp with today.
                today = datetime.now().strftime('%Y-%m-%d')
                gid = f"wnba_{today}_{visitor[:3]}_{host[:3]}".lower().replace(' ', '')
                games.append(Game(
                    id=gid,
                    sport='WNBA',
                    season=str(season),
                    date=today,
                    time=None,
                    home_team=host,
                    away_team=visitor,
                    home_team_abbrev=get_team_abbrev(host, 'WNBA'),
                    away_team_abbrev=get_team_abbrev(visitor, 'WNBA'),
                    venue='',
                    source='cbssports.com'
                ))
            except Exception:
                continue
    print(f" Found {len(games)} games from CBS Sports")
    return games
def scrape_mls_mlssoccer(season: int) -> list[Game]:
    """
    Fetch MLS schedule from official MLSSoccer.com.

    NOTE(review): the page is rendered client-side; this parser only reads
    any static <table> fallback content, and stamps each Game with today's
    date because no date is available in that markup.
    """
    games = []
    print(f"Fetching MLS {season} from MLSSoccer.com...")
    url = f"https://www.mlssoccer.com/schedule/{season}"
    soup = fetch_page(url, 'mlssoccer.com')
    if not soup:
        return games
    # MLS schedule is typically rendered via JavaScript
    # This is a fallback parser for any static content
    tables = soup.find_all('table')
    for table in tables:
        rows = table.find_all('tr')
        for row in rows:
            try:
                cells = row.find_all('td')
                if len(cells) < 2:
                    continue
                # Column convention assumed: away team first, home team second.
                away_team = cells[0].get_text(strip=True) if cells else ''
                home_team = cells[1].get_text(strip=True) if len(cells) > 1 else ''
                if not away_team or not home_team:
                    continue
                date_formatted = datetime.now().strftime('%Y-%m-%d')  # placeholder: no date in static markup
                game_id = f"mls_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')
                game = Game(
                    id=game_id,
                    sport='MLS',
                    season=str(season),
                    date=date_formatted,
                    time=None,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=get_team_abbrev(home_team, 'MLS'),
                    away_team_abbrev=get_team_abbrev(away_team, 'MLS'),
                    venue='',
                    source='mlssoccer.com'
                )
                games.append(game)
            except Exception:
                # Best-effort: skip malformed rows.
                continue
    print(f" Found {len(games)} games from MLSSoccer.com")
    return games
def scrape_nwsl_nwslsoccer(season: int) -> list[Game]:
    """Fetch NWSL schedule from official NWSL site."""
    games = []
    print(f"Fetching NWSL {season} from NWSL.com...")
    url = f"https://www.nwslsoccer.com/schedule/{season}"
    soup = fetch_page(url, 'nwslsoccer.com')
    if not soup:
        return games
    for table in soup.find_all('table'):
        for row in table.find_all('tr'):
            try:
                cells = row.find_all('td')
                if len(cells) < 2:
                    continue
                # Assumed column order: visitor first, host second.
                visitor = cells[0].get_text(strip=True)
                host = cells[1].get_text(strip=True)
                if not (visitor and host):
                    continue
                # No date in the static markup; stamp with today.
                today = datetime.now().strftime('%Y-%m-%d')
                gid = f"nwsl_{today}_{visitor[:3]}_{host[:3]}".lower().replace(' ', '')
                games.append(Game(
                    id=gid,
                    sport='NWSL',
                    season=str(season),
                    date=today,
                    time=None,
                    home_team=host,
                    away_team=visitor,
                    home_team_abbrev=get_team_abbrev(host, 'NWSL'),
                    away_team_abbrev=get_team_abbrev(visitor, 'NWSL'),
                    venue='',
                    source='nwslsoccer.com'
                ))
            except Exception:
                continue
    print(f" Found {len(games)} games from NWSL.com")
    return games
# =============================================================================
# SCRAPERS - WNBA (Basketball-Reference fallback)
# =============================================================================
def scrape_wnba_basketball_reference(season: int) -> list[Game]:
    """
    Scrape WNBA schedule from Basketball-Reference.
    URL: https://www.basketball-reference.com/wnba/years/{YEAR}_games.html

    Args:
        season: Season year (WNBA seasons fit in one calendar year).

    Returns:
        List of Game records; empty on fetch/parse failure.
    """
    games = []
    url = f"https://www.basketball-reference.com/wnba/years/{season}_games.html"
    print(f"Scraping WNBA {season} from Basketball-Reference...")
    soup = fetch_page(url, 'basketball-reference.com')
    if not soup:
        return games
    table = soup.find('table', {'id': 'schedule'})
    if not table:
        print(" Could not find schedule table")
        return games
    tbody = table.find('tbody')
    if not tbody:
        return games
    for row in tbody.find_all('tr'):
        # Skip repeated in-body header rows.
        if row.get('class') and 'thead' in row.get('class'):
            continue
        cells = row.find_all(['td', 'th'])
        if len(cells) < 6:
            continue
        try:
            # Date lives in a <th>, sometimes wrapped in a link.
            date_cell = row.find('th', {'data-stat': 'date_game'})
            if not date_cell:
                continue
            date_link = date_cell.find('a')
            date_str = date_link.text if date_link else date_cell.text
            # Parse time
            time_cell = row.find('td', {'data-stat': 'game_start_time'})
            time_str = time_cell.text.strip() if time_cell else None
            # Parse teams
            visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
            home_cell = row.find('td', {'data-stat': 'home_team_name'})
            if not visitor_cell or not home_cell:
                continue
            visitor_link = visitor_cell.find('a')
            home_link = home_cell.find('a')
            away_team = visitor_link.text if visitor_link else visitor_cell.text
            home_team = home_link.text if home_link else home_cell.text
            # Parse arena
            arena_cell = row.find('td', {'data-stat': 'arena_name'})
            arena = arena_cell.text.strip() if arena_cell else ''
            # Dates look like "Fri, May 16, 2025"; skip rows that don't parse.
            # (was a bare `except:` — narrowed to the strptime failure)
            try:
                parsed_date = datetime.strptime(date_str.strip(), '%a, %b %d, %Y')
                date_formatted = parsed_date.strftime('%Y-%m-%d')
            except ValueError:
                continue
            game_id = f"wnba_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')
            game = Game(
                id=game_id,
                sport='WNBA',
                season=str(season),
                date=date_formatted,
                time=time_str,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=get_team_abbrev(home_team, 'WNBA'),
                away_team_abbrev=get_team_abbrev(away_team, 'WNBA'),
                venue=arena,
                source='basketball-reference.com'
            )
            games.append(game)
        except Exception:
            # One malformed row must not abort the scrape.
            continue
    print(f" Found {len(games)} games from Basketball-Reference")
    return games
# =============================================================================
# SCRAPERS - MLS
# =============================================================================
def scrape_mls_fbref(season: int) -> list[Game]:
    """
    Scrape MLS schedule from FBref.
    URL: https://fbref.com/en/comps/22/{YEAR}/schedule/{YEAR}-Major-League-Soccer-Scores-and-Fixtures

    Args:
        season: Season year.

    Returns:
        List of Game records; empty on fetch/parse failure.
    """
    games = []
    url = f"https://fbref.com/en/comps/22/{season}/schedule/{season}-Major-League-Soccer-Scores-and-Fixtures"
    print(f"Scraping MLS {season} from FBref...")
    soup = fetch_page(url, 'fbref.com')
    if not soup:
        return games
    # Table id varies ('sched_all' or 'sched_<...>'); try exact then pattern.
    table = soup.find('table', {'id': 'sched_all'}) or soup.find('table', {'id': re.compile(r'sched.*')})
    if not table:
        print(" Could not find schedule table")
        return games
    tbody = table.find('tbody')
    if not tbody:
        return games
    for row in tbody.find_all('tr'):
        # Skip visual spacer rows.
        if row.get('class') and 'spacer' in row.get('class'):
            continue
        try:
            # Parse date
            date_cell = row.find('td', {'data-stat': 'date'})
            if not date_cell:
                continue
            date_str = date_cell.text.strip()
            # Parse time
            time_cell = row.find('td', {'data-stat': 'time'})
            time_str = time_cell.text.strip() if time_cell else None
            # Parse teams
            home_cell = row.find('td', {'data-stat': 'home_team'})
            away_cell = row.find('td', {'data-stat': 'away_team'})
            if not home_cell or not away_cell:
                continue
            home_team = home_cell.text.strip()
            away_team = away_cell.text.strip()
            # Parse venue
            venue_cell = row.find('td', {'data-stat': 'venue'})
            venue = venue_cell.text.strip() if venue_cell else ''
            # Validate the ISO date; skip rows that don't parse.
            # (was a bare `except:` — narrowed to the strptime failure)
            try:
                parsed_date = datetime.strptime(date_str, '%Y-%m-%d')
                date_formatted = parsed_date.strftime('%Y-%m-%d')
            except ValueError:
                continue
            game_id = f"mls_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')
            game = Game(
                id=game_id,
                sport='MLS',
                season=str(season),
                date=date_formatted,
                time=time_str,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=get_team_abbrev(home_team, 'MLS'),
                away_team_abbrev=get_team_abbrev(away_team, 'MLS'),
                venue=venue,
                source='fbref.com'
            )
            games.append(game)
        except Exception:
            # One malformed row must not abort the scrape.
            continue
    print(f" Found {len(games)} games from FBref")
    return games
# =============================================================================
# SCRAPERS - NWSL
# =============================================================================
def scrape_nwsl_fbref(season: int) -> list[Game]:
    """
    Scrape NWSL schedule from FBref.
    URL: https://fbref.com/en/comps/182/{YEAR}/schedule/{YEAR}-NWSL-Scores-and-Fixtures

    Args:
        season: Season year.

    Returns:
        List of Game records; empty on fetch/parse failure.
    """
    games = []
    url = f"https://fbref.com/en/comps/182/{season}/schedule/{season}-NWSL-Scores-and-Fixtures"
    print(f"Scraping NWSL {season} from FBref...")
    soup = fetch_page(url, 'fbref.com')
    if not soup:
        return games
    # Table id varies ('sched_all' or 'sched_<...>'); try exact then pattern.
    table = soup.find('table', {'id': 'sched_all'}) or soup.find('table', {'id': re.compile(r'sched.*')})
    if not table:
        print(" Could not find schedule table")
        return games
    tbody = table.find('tbody')
    if not tbody:
        return games
    for row in tbody.find_all('tr'):
        # Skip visual spacer rows.
        if row.get('class') and 'spacer' in row.get('class'):
            continue
        try:
            # Parse date
            date_cell = row.find('td', {'data-stat': 'date'})
            if not date_cell:
                continue
            date_str = date_cell.text.strip()
            # Parse time
            time_cell = row.find('td', {'data-stat': 'time'})
            time_str = time_cell.text.strip() if time_cell else None
            # Parse teams
            home_cell = row.find('td', {'data-stat': 'home_team'})
            away_cell = row.find('td', {'data-stat': 'away_team'})
            if not home_cell or not away_cell:
                continue
            home_team = home_cell.text.strip()
            away_team = away_cell.text.strip()
            # Parse venue
            venue_cell = row.find('td', {'data-stat': 'venue'})
            venue = venue_cell.text.strip() if venue_cell else ''
            # Validate the ISO date; skip rows that don't parse.
            # (was a bare `except:` — narrowed to the strptime failure)
            try:
                parsed_date = datetime.strptime(date_str, '%Y-%m-%d')
                date_formatted = parsed_date.strftime('%Y-%m-%d')
            except ValueError:
                continue
            game_id = f"nwsl_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')
            game = Game(
                id=game_id,
                sport='NWSL',
                season=str(season),
                date=date_formatted,
                time=time_str,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=get_team_abbrev(home_team, 'NWSL'),
                away_team_abbrev=get_team_abbrev(away_team, 'NWSL'),
                venue=venue,
                source='fbref.com'
            )
            games.append(game)
        except Exception:
            # One malformed row must not abort the scrape.
            continue
    print(f" Found {len(games)} games from FBref")
    return games
# =============================================================================
# STADIUM SCRAPER
# =============================================================================
def scrape_stadiums_hifld() -> list[Stadium]:
    """
    Fetch stadium data from HIFLD Open Data (US Government).
    Returns GeoJSON with coordinates.

    Returns:
        List of Stadium records for supported leagues; empty on fetch failure.
    """
    stadiums = []
    url = "https://services1.arcgis.com/Hp6G80Pky0om7QvQ/arcgis/rest/services/Major_Sport_Venues/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json"
    print("Fetching stadiums from HIFLD Open Data...")
    # BUG FIX: the old code first admitted NFL venues, then dropped them
    # because 'NFL' was missing from sport_map. Single map now covers all
    # leagues this pipeline keeps from this dataset.
    sport_map = {'NBA': 'NBA', 'MLB': 'MLB', 'NHL': 'NHL', 'NFL': 'NFL'}
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        for feature in data.get('features', []):
            attrs = feature.get('attributes', {})
            geom = feature.get('geometry', {})
            league = attrs.get('LEAGUE', '')
            if league not in sport_map:
                continue
            stadium = Stadium(
                id=f"hifld_{attrs.get('OBJECTID', '')}",
                name=attrs.get('NAME', ''),
                city=attrs.get('CITY', ''),
                state=attrs.get('STATE', ''),
                latitude=geom.get('y', 0),
                longitude=geom.get('x', 0),
                # CAPACITY may be null in the feed; coerce to 0.
                capacity=attrs.get('CAPACITY', 0) or 0,
                sport=sport_map[league],
                team_abbrevs=[attrs.get('TEAM', '')],
                source='hifld.gov',
                year_opened=attrs.get('YEAR_OPEN')
            )
            stadiums.append(stadium)
    except Exception as e:
        print(f" Error fetching HIFLD data: {e}")
    print(f" Found {len(stadiums)} stadiums from HIFLD")
    return stadiums
# =============================================================================
# SPORT-SPECIFIC STADIUM SCRAPERS
# =============================================================================
def scrape_mlb_stadiums_scorebot() -> list[Stadium]:
    """
    Source 1: MLBScoreBot/ballparks GitHub (public domain).
    """
    url = "https://raw.githubusercontent.com/MLBScoreBot/ballparks/main/ballparks.json"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    stadiums = []
    for park_name, park in response.json().items():
        # Coordinates are stored scaled by 1e6 (presumably micro-degrees);
        # divide to get decimal degrees. Missing/zero values map to 0.
        raw_lat = park.get('lat')
        raw_lng = park.get('long')
        stadiums.append(Stadium(
            id=f"mlb_{park_name.lower().replace(' ', '_')[:30]}",
            name=park_name,
            city=park.get('city', ''),
            state=park.get('state', ''),
            latitude=raw_lat / 1000000 if raw_lat else 0,
            longitude=raw_lng / 1000000 if raw_lng else 0,
            capacity=park.get('capacity', 0),
            sport='MLB',
            team_abbrevs=[park.get('team', '')],
            source='github.com/MLBScoreBot'
        ))
    return stadiums
def scrape_mlb_stadiums_geojson() -> list[Stadium]:
    """
    Source 2: cageyjames/GeoJSON-Ballparks GitHub.
    """
    url = "https://raw.githubusercontent.com/cageyjames/GeoJSON-Ballparks/master/ballparks.geojson"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    payload = response.json()
    stadiums = []
    for feature in payload.get('features', []):
        props = feature.get('properties', {})
        # GeoJSON coordinates are [longitude, latitude].
        coords = feature.get('geometry', {}).get('coordinates', [0, 0])
        # The dataset mixes leagues; keep MLB parks only.
        if props.get('League', '').upper() != 'MLB':
            continue
        park_name = props.get('Ballpark', '')
        stadiums.append(Stadium(
            id=f"mlb_{park_name.lower().replace(' ', '_')[:30]}",
            name=park_name,
            city=props.get('City', ''),
            state=props.get('State', ''),
            latitude=coords[1] if len(coords) > 1 else 0,
            longitude=coords[0] if len(coords) > 0 else 0,
            capacity=0,  # Not in this dataset
            sport='MLB',
            team_abbrevs=[props.get('Team', '')],
            source='github.com/cageyjames'
        ))
    return stadiums
def scrape_mlb_stadiums_hardcoded() -> list[Stadium]:
    """
    Source 3: Hardcoded MLB ballparks (fallback).

    Used when both remote sources fail or return too few venues.
    Coordinates are decimal degrees; capacity is seats.
    """
    # Static snapshot of the 30 MLB ballparks; keyed by venue name.
    mlb_ballparks = {
        'Chase Field': {'city': 'Phoenix', 'state': 'AZ', 'lat': 33.4453, 'lng': -112.0667, 'capacity': 48519, 'teams': ['ARI']},
        'Truist Park': {'city': 'Atlanta', 'state': 'GA', 'lat': 33.8907, 'lng': -84.4677, 'capacity': 41084, 'teams': ['ATL']},
        'Oriole Park at Camden Yards': {'city': 'Baltimore', 'state': 'MD', 'lat': 39.2839, 'lng': -76.6216, 'capacity': 44970, 'teams': ['BAL']},
        'Fenway Park': {'city': 'Boston', 'state': 'MA', 'lat': 42.3467, 'lng': -71.0972, 'capacity': 37755, 'teams': ['BOS']},
        'Wrigley Field': {'city': 'Chicago', 'state': 'IL', 'lat': 41.9484, 'lng': -87.6553, 'capacity': 41649, 'teams': ['CHC']},
        'Guaranteed Rate Field': {'city': 'Chicago', 'state': 'IL', 'lat': 41.8299, 'lng': -87.6338, 'capacity': 40615, 'teams': ['CHW']},
        'Great American Ball Park': {'city': 'Cincinnati', 'state': 'OH', 'lat': 39.0979, 'lng': -84.5082, 'capacity': 42319, 'teams': ['CIN']},
        'Progressive Field': {'city': 'Cleveland', 'state': 'OH', 'lat': 41.4958, 'lng': -81.6853, 'capacity': 34830, 'teams': ['CLE']},
        'Coors Field': {'city': 'Denver', 'state': 'CO', 'lat': 39.7559, 'lng': -104.9942, 'capacity': 50144, 'teams': ['COL']},
        'Comerica Park': {'city': 'Detroit', 'state': 'MI', 'lat': 42.3390, 'lng': -83.0485, 'capacity': 41083, 'teams': ['DET']},
        'Minute Maid Park': {'city': 'Houston', 'state': 'TX', 'lat': 29.7573, 'lng': -95.3555, 'capacity': 41168, 'teams': ['HOU']},
        'Kauffman Stadium': {'city': 'Kansas City', 'state': 'MO', 'lat': 39.0517, 'lng': -94.4803, 'capacity': 37903, 'teams': ['KCR']},
        'Angel Stadium': {'city': 'Anaheim', 'state': 'CA', 'lat': 33.8003, 'lng': -117.8827, 'capacity': 45517, 'teams': ['LAA']},
        'Dodger Stadium': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0739, 'lng': -118.2400, 'capacity': 56000, 'teams': ['LAD']},
        'LoanDepot Park': {'city': 'Miami', 'state': 'FL', 'lat': 25.7781, 'lng': -80.2196, 'capacity': 36742, 'teams': ['MIA']},
        'American Family Field': {'city': 'Milwaukee', 'state': 'WI', 'lat': 43.0280, 'lng': -87.9712, 'capacity': 41900, 'teams': ['MIL']},
        'Target Field': {'city': 'Minneapolis', 'state': 'MN', 'lat': 44.9818, 'lng': -93.2775, 'capacity': 38544, 'teams': ['MIN']},
        'Citi Field': {'city': 'Queens', 'state': 'NY', 'lat': 40.7571, 'lng': -73.8458, 'capacity': 41922, 'teams': ['NYM']},
        'Yankee Stadium': {'city': 'Bronx', 'state': 'NY', 'lat': 40.8296, 'lng': -73.9262, 'capacity': 46537, 'teams': ['NYY']},
        'Oakland Coliseum': {'city': 'Oakland', 'state': 'CA', 'lat': 37.7516, 'lng': -122.2005, 'capacity': 46847, 'teams': ['OAK']},
        'Citizens Bank Park': {'city': 'Philadelphia', 'state': 'PA', 'lat': 39.9061, 'lng': -75.1665, 'capacity': 42901, 'teams': ['PHI']},
        'PNC Park': {'city': 'Pittsburgh', 'state': 'PA', 'lat': 40.4469, 'lng': -80.0057, 'capacity': 38362, 'teams': ['PIT']},
        'Petco Park': {'city': 'San Diego', 'state': 'CA', 'lat': 32.7073, 'lng': -117.1566, 'capacity': 40209, 'teams': ['SDP']},
        'Oracle Park': {'city': 'San Francisco', 'state': 'CA', 'lat': 37.7786, 'lng': -122.3893, 'capacity': 41915, 'teams': ['SFG']},
        'T-Mobile Park': {'city': 'Seattle', 'state': 'WA', 'lat': 47.5914, 'lng': -122.3325, 'capacity': 47929, 'teams': ['SEA']},
        'Busch Stadium': {'city': 'St. Louis', 'state': 'MO', 'lat': 38.6226, 'lng': -90.1928, 'capacity': 45538, 'teams': ['STL']},
        'Tropicana Field': {'city': 'St. Petersburg', 'state': 'FL', 'lat': 27.7682, 'lng': -82.6534, 'capacity': 25000, 'teams': ['TBR']},
        'Globe Life Field': {'city': 'Arlington', 'state': 'TX', 'lat': 32.7473, 'lng': -97.0844, 'capacity': 40300, 'teams': ['TEX']},
        'Rogers Centre': {'city': 'Toronto', 'state': 'ON', 'lat': 43.6414, 'lng': -79.3894, 'capacity': 49282, 'teams': ['TOR']},
        'Nationals Park': {'city': 'Washington', 'state': 'DC', 'lat': 38.8729, 'lng': -77.0074, 'capacity': 41339, 'teams': ['WSN']},
    }
    stadiums = []
    for name, info in mlb_ballparks.items():
        stadium = Stadium(
            # Id mirrors the remote sources' scheme so merging/dedup works.
            id=f"mlb_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='MLB',
            team_abbrevs=info['teams'],
            source='mlb_hardcoded'
        )
        stadiums.append(stadium)
    return stadiums
def scrape_mlb_stadiums() -> list[Stadium]:
    """Collect MLB ballpark data, trying each configured source in priority order."""
    print("\nMLB STADIUMS")
    print("-" * 40)
    # (label, fetcher, priority) — lower priority number is tried first.
    source_specs = (
        ('MLBScoreBot', scrape_mlb_stadiums_scorebot, 1),
        ('GeoJSON-Ballparks', scrape_mlb_stadiums_geojson, 2),
        ('Hardcoded', scrape_mlb_stadiums_hardcoded, 3),
    )
    sources = [
        StadiumScraperSource(label, fetcher, priority=prio, min_venues=25)
        for label, fetcher, prio in source_specs
    ]
    return scrape_stadiums_with_fallback('MLB', sources)
def scrape_nfl_stadiums_scorebot() -> list[Stadium]:
    """
    Source 1: NFLScoreBot/stadiums GitHub (public domain).
    """
    url = "https://raw.githubusercontent.com/NFLScoreBot/stadiums/main/stadiums.json"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    venues: list[Stadium] = []
    for name, info in response.json().items():
        raw_lat = info.get('lat')
        raw_lng = info.get('long')
        venues.append(Stadium(
            id=f"nfl_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info.get('city', ''),
            state=info.get('state', ''),
            # Raw values are divided by 1e6 — presumably micro-degree
            # encoding in this dataset; TODO confirm against the repo.
            latitude=raw_lat / 1000000 if raw_lat else 0,
            longitude=raw_lng / 1000000 if raw_lng else 0,
            capacity=info.get('capacity', 0),
            sport='NFL',
            team_abbrevs=info.get('teams', []),
            source='github.com/NFLScoreBot'
        ))
    return venues
def scrape_nfl_stadiums_geojson() -> list[Stadium]:
    """
    Source 2: brianhatchl/nfl-stadiums GeoJSON gist.
    """
    url = "https://gist.githubusercontent.com/brianhatchl/6265918/raw/dbe6acfe5deb48f51ce5a4c4f8f5dded4f02b9bd/nfl_stadiums.geojson"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    payload = response.json()
    results: list[Stadium] = []
    for feature in payload.get('features', []):
        props = feature.get('properties', {})
        coords = feature.get('geometry', {}).get('coordinates', [0, 0])
        # GeoJSON orders coordinates as [longitude, latitude].
        lng = coords[0] if len(coords) > 0 else 0
        lat = coords[1] if len(coords) > 1 else 0
        venue_name = props.get('Stadium', '')
        results.append(Stadium(
            id=f"nfl_{venue_name.lower().replace(' ', '_')[:30]}",
            name=venue_name,
            city=props.get('City', ''),
            state=props.get('State', ''),
            latitude=lat,
            longitude=lng,
            capacity=int(props.get('Capacity', 0) or 0),
            sport='NFL',
            team_abbrevs=[props.get('Team', '')],
            source='gist.github.com/brianhatchl'
        ))
    return results
def scrape_nfl_stadiums_hardcoded() -> list[Stadium]:
    """
    Source 3: Hardcoded NFL stadiums (fallback).
    """
    nfl_stadiums_data = {
        'State Farm Stadium': {'city': 'Glendale', 'state': 'AZ', 'lat': 33.5276, 'lng': -112.2626, 'capacity': 63400, 'teams': ['ARI']},
        'Mercedes-Benz Stadium': {'city': 'Atlanta', 'state': 'GA', 'lat': 33.7553, 'lng': -84.4006, 'capacity': 71000, 'teams': ['ATL']},
        'M&T Bank Stadium': {'city': 'Baltimore', 'state': 'MD', 'lat': 39.2780, 'lng': -76.6227, 'capacity': 71008, 'teams': ['BAL']},
        'Highmark Stadium': {'city': 'Orchard Park', 'state': 'NY', 'lat': 42.7738, 'lng': -78.7870, 'capacity': 71608, 'teams': ['BUF']},
        'Bank of America Stadium': {'city': 'Charlotte', 'state': 'NC', 'lat': 35.2258, 'lng': -80.8528, 'capacity': 75523, 'teams': ['CAR']},
        'Soldier Field': {'city': 'Chicago', 'state': 'IL', 'lat': 41.8623, 'lng': -87.6167, 'capacity': 61500, 'teams': ['CHI']},
        'Paycor Stadium': {'city': 'Cincinnati', 'state': 'OH', 'lat': 39.0954, 'lng': -84.5160, 'capacity': 65515, 'teams': ['CIN']},
        'Cleveland Browns Stadium': {'city': 'Cleveland', 'state': 'OH', 'lat': 41.5061, 'lng': -81.6995, 'capacity': 67895, 'teams': ['CLE']},
        'AT&T Stadium': {'city': 'Arlington', 'state': 'TX', 'lat': 32.7480, 'lng': -97.0928, 'capacity': 80000, 'teams': ['DAL']},
        'Empower Field at Mile High': {'city': 'Denver', 'state': 'CO', 'lat': 39.7439, 'lng': -105.0201, 'capacity': 76125, 'teams': ['DEN']},
        'Ford Field': {'city': 'Detroit', 'state': 'MI', 'lat': 42.3400, 'lng': -83.0456, 'capacity': 65000, 'teams': ['DET']},
        'Lambeau Field': {'city': 'Green Bay', 'state': 'WI', 'lat': 44.5013, 'lng': -88.0622, 'capacity': 81435, 'teams': ['GB']},
        'NRG Stadium': {'city': 'Houston', 'state': 'TX', 'lat': 29.6847, 'lng': -95.4107, 'capacity': 72220, 'teams': ['HOU']},
        'Lucas Oil Stadium': {'city': 'Indianapolis', 'state': 'IN', 'lat': 39.7601, 'lng': -86.1639, 'capacity': 67000, 'teams': ['IND']},
        'EverBank Stadium': {'city': 'Jacksonville', 'state': 'FL', 'lat': 30.3239, 'lng': -81.6373, 'capacity': 67814, 'teams': ['JAX']},
        'GEHA Field at Arrowhead Stadium': {'city': 'Kansas City', 'state': 'MO', 'lat': 39.0489, 'lng': -94.4839, 'capacity': 76416, 'teams': ['KC']},
        'Allegiant Stadium': {'city': 'Las Vegas', 'state': 'NV', 'lat': 36.0909, 'lng': -115.1833, 'capacity': 65000, 'teams': ['LV']},
        'SoFi Stadium': {'city': 'Inglewood', 'state': 'CA', 'lat': 33.9535, 'lng': -118.3392, 'capacity': 70240, 'teams': ['LAC', 'LAR']},
        'Hard Rock Stadium': {'city': 'Miami Gardens', 'state': 'FL', 'lat': 25.9580, 'lng': -80.2389, 'capacity': 64767, 'teams': ['MIA']},
        'U.S. Bank Stadium': {'city': 'Minneapolis', 'state': 'MN', 'lat': 44.9736, 'lng': -93.2575, 'capacity': 66655, 'teams': ['MIN']},
        'Gillette Stadium': {'city': 'Foxborough', 'state': 'MA', 'lat': 42.0909, 'lng': -71.2643, 'capacity': 65878, 'teams': ['NE']},
        'Caesars Superdome': {'city': 'New Orleans', 'state': 'LA', 'lat': 29.9511, 'lng': -90.0812, 'capacity': 73208, 'teams': ['NO']},
        'MetLife Stadium': {'city': 'East Rutherford', 'state': 'NJ', 'lat': 40.8135, 'lng': -74.0745, 'capacity': 82500, 'teams': ['NYG', 'NYJ']},
        'Lincoln Financial Field': {'city': 'Philadelphia', 'state': 'PA', 'lat': 39.9008, 'lng': -75.1675, 'capacity': 69596, 'teams': ['PHI']},
        'Acrisure Stadium': {'city': 'Pittsburgh', 'state': 'PA', 'lat': 40.4468, 'lng': -80.0158, 'capacity': 68400, 'teams': ['PIT']},
        'Levi\'s Stadium': {'city': 'Santa Clara', 'state': 'CA', 'lat': 37.4032, 'lng': -121.9698, 'capacity': 68500, 'teams': ['SF']},
        'Lumen Field': {'city': 'Seattle', 'state': 'WA', 'lat': 47.5952, 'lng': -122.3316, 'capacity': 68740, 'teams': ['SEA']},
        'Raymond James Stadium': {'city': 'Tampa', 'state': 'FL', 'lat': 27.9759, 'lng': -82.5033, 'capacity': 65618, 'teams': ['TB']},
        'Nissan Stadium': {'city': 'Nashville', 'state': 'TN', 'lat': 36.1665, 'lng': -86.7713, 'capacity': 69143, 'teams': ['TEN']},
        'Commanders Field': {'city': 'Landover', 'state': 'MD', 'lat': 38.9076, 'lng': -76.8645, 'capacity': 67617, 'teams': ['WAS']},
    }
    # One Stadium record per curated venue entry above.
    return [
        Stadium(
            id=f"nfl_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='NFL',
            team_abbrevs=info['teams'],
            source='nfl_hardcoded'
        )
        for name, info in nfl_stadiums_data.items()
    ]
def scrape_nfl_stadiums() -> list[Stadium]:
    """Collect NFL stadium data, trying each configured source in priority order."""
    print("\nNFL STADIUMS")
    print("-" * 40)
    # (label, fetcher, priority) — lower priority number is tried first.
    source_specs = (
        ('NFLScoreBot', scrape_nfl_stadiums_scorebot, 1),
        ('GeoJSON-Gist', scrape_nfl_stadiums_geojson, 2),
        ('Hardcoded', scrape_nfl_stadiums_hardcoded, 3),
    )
    sources = [
        StadiumScraperSource(label, fetcher, priority=prio, min_venues=28)
        for label, fetcher, prio in source_specs
    ]
    return scrape_stadiums_with_fallback('NFL', sources)
def scrape_mls_stadiums_geojson() -> list[Stadium]:
    """
    Source 1: gavinr/usa-soccer GeoJSON.
    """
    url = "https://raw.githubusercontent.com/gavinr/usa-soccer/master/mls.geojson"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    payload = response.json()
    results: list[Stadium] = []
    for feature in payload.get('features', []):
        props = feature.get('properties', {})
        coords = feature.get('geometry', {}).get('coordinates', [0, 0])
        # GeoJSON orders coordinates as [longitude, latitude].
        lng = coords[0] if len(coords) > 0 else 0
        lat = coords[1] if len(coords) > 1 else 0
        venue_name = props.get('stadium', '')
        results.append(Stadium(
            id=f"mls_{venue_name.lower().replace(' ', '_')[:30]}",
            name=venue_name,
            city=props.get('city', ''),
            state=props.get('state', ''),
            latitude=lat,
            longitude=lng,
            capacity=props.get('capacity', 0),
            sport='MLS',
            team_abbrevs=[props.get('team', '')],
            source='github.com/gavinr'
        ))
    return results
def scrape_mls_stadiums_csv() -> list[Stadium]:
    """
    Source 2: gavinr/usa-soccer CSV.
    """
    import csv
    from io import StringIO
    url = "https://raw.githubusercontent.com/gavinr/usa-soccer/master/mls.csv"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    venues: list[Stadium] = []
    for row in csv.DictReader(StringIO(response.text)):
        venue_name = row.get('stadium', '')
        venues.append(Stadium(
            id=f"mls_{venue_name.lower().replace(' ', '_')[:30]}",
            name=venue_name,
            city=row.get('city', ''),
            state=row.get('state', ''),
            # `or 0` guards against empty-string cells before conversion.
            latitude=float(row.get('lat', 0) or 0),
            longitude=float(row.get('lng', 0) or 0),
            capacity=int(row.get('capacity', 0) or 0),
            sport='MLS',
            team_abbrevs=[row.get('team', '')],
            source='github.com/gavinr/csv'
        ))
    return venues
def scrape_mls_stadiums_hardcoded() -> list[Stadium]:
    """
    Source 3: Hardcoded MLS stadiums (fallback).
    """
    mls_stadiums_data = {
        'Mercedes-Benz Stadium': {'city': 'Atlanta', 'state': 'GA', 'lat': 33.7553, 'lng': -84.4006, 'capacity': 42500, 'team': 'ATL'},
        'Q2 Stadium': {'city': 'Austin', 'state': 'TX', 'lat': 30.3879, 'lng': -97.7195, 'capacity': 20738, 'team': 'ATX'},
        'Audi Field': {'city': 'Washington', 'state': 'DC', 'lat': 38.8687, 'lng': -77.0128, 'capacity': 20000, 'team': 'DC'},
        'TQL Stadium': {'city': 'Cincinnati', 'state': 'OH', 'lat': 39.1107, 'lng': -84.5228, 'capacity': 26000, 'team': 'CIN'},
        'Lower.com Field': {'city': 'Columbus', 'state': 'OH', 'lat': 39.9689, 'lng': -83.0172, 'capacity': 20371, 'team': 'CLB'},
        'Toyota Stadium': {'city': 'Frisco', 'state': 'TX', 'lat': 33.1542, 'lng': -96.8350, 'capacity': 20500, 'team': 'DAL'},
        'Dick\'s Sporting Goods Park': {'city': 'Commerce City', 'state': 'CO', 'lat': 39.8056, 'lng': -104.8919, 'capacity': 18061, 'team': 'COL'},
        'Shell Energy Stadium': {'city': 'Houston', 'state': 'TX', 'lat': 29.7523, 'lng': -95.3526, 'capacity': 22039, 'team': 'HOU'},
        'Dignity Health Sports Park': {'city': 'Carson', 'state': 'CA', 'lat': 33.8644, 'lng': -118.2611, 'capacity': 27000, 'team': 'LA'},
        'BMO Stadium': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0128, 'lng': -118.2841, 'capacity': 22000, 'team': 'LAFC'},
        'Chase Stadium': {'city': 'Fort Lauderdale', 'state': 'FL', 'lat': 26.1931, 'lng': -80.1606, 'capacity': 21550, 'team': 'MIA'},
        'Allianz Field': {'city': 'Saint Paul', 'state': 'MN', 'lat': 44.9530, 'lng': -93.1653, 'capacity': 19400, 'team': 'MIN'},
        'Stade Saputo': {'city': 'Montreal', 'state': 'QC', 'lat': 45.5629, 'lng': -73.5528, 'capacity': 19619, 'team': 'MTL'},
        'Geodis Park': {'city': 'Nashville', 'state': 'TN', 'lat': 36.1306, 'lng': -86.7658, 'capacity': 30000, 'team': 'NSH'},
        'Yankee Stadium': {'city': 'Bronx', 'state': 'NY', 'lat': 40.8296, 'lng': -73.9262, 'capacity': 30321, 'team': 'NYC'},
        'Red Bull Arena': {'city': 'Harrison', 'state': 'NJ', 'lat': 40.7369, 'lng': -74.1503, 'capacity': 25000, 'team': 'NYRB'},
        'Inter&Co Stadium': {'city': 'Orlando', 'state': 'FL', 'lat': 28.5412, 'lng': -81.3896, 'capacity': 25500, 'team': 'ORL'},
        'Subaru Park': {'city': 'Chester', 'state': 'PA', 'lat': 39.8328, 'lng': -75.3789, 'capacity': 18500, 'team': 'PHI'},
        'Providence Park': {'city': 'Portland', 'state': 'OR', 'lat': 45.5217, 'lng': -122.6918, 'capacity': 25218, 'team': 'POR'},
        'America First Field': {'city': 'Sandy', 'state': 'UT', 'lat': 40.5829, 'lng': -111.8933, 'capacity': 20213, 'team': 'RSL'},
        'PayPal Park': {'city': 'San Jose', 'state': 'CA', 'lat': 37.3512, 'lng': -121.9251, 'capacity': 18000, 'team': 'SJ'},
        'Lumen Field': {'city': 'Seattle', 'state': 'WA', 'lat': 47.5952, 'lng': -122.3316, 'capacity': 69000, 'team': 'SEA'},
        'Children\'s Mercy Park': {'city': 'Kansas City', 'state': 'KS', 'lat': 39.1218, 'lng': -94.8231, 'capacity': 18467, 'team': 'SKC'},
        'CityPark': {'city': 'St. Louis', 'state': 'MO', 'lat': 38.6316, 'lng': -90.2094, 'capacity': 22500, 'team': 'STL'},
        'BMO Field': {'city': 'Toronto', 'state': 'ON', 'lat': 43.6332, 'lng': -79.4185, 'capacity': 30000, 'team': 'TOR'},
        'BC Place': {'city': 'Vancouver', 'state': 'BC', 'lat': 49.2768, 'lng': -123.1117, 'capacity': 22120, 'team': 'VAN'},
    }
    # One Stadium record per curated venue entry above (single home team each).
    return [
        Stadium(
            id=f"mls_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='MLS',
            team_abbrevs=[info['team']],
            source='mls_hardcoded'
        )
        for name, info in mls_stadiums_data.items()
    ]
def scrape_mls_stadiums() -> list[Stadium]:
    """Collect MLS stadium data, trying each configured source in priority order."""
    print("\nMLS STADIUMS")
    print("-" * 40)
    # (label, fetcher, priority) — lower priority number is tried first.
    source_specs = (
        ('gavinr GeoJSON', scrape_mls_stadiums_geojson, 1),
        ('gavinr CSV', scrape_mls_stadiums_csv, 2),
        ('Hardcoded', scrape_mls_stadiums_hardcoded, 3),
    )
    sources = [
        StadiumScraperSource(label, fetcher, priority=prio, min_venues=20)
        for label, fetcher, prio in source_specs
    ]
    return scrape_stadiums_with_fallback('MLS', sources)
def scrape_nhl_stadiums() -> list[Stadium]:
    """
    Fetch NHL arena data.

    The NHL standings API does not expose venue details, so arena names,
    coordinates and capacities come from the curated table below. The API
    request is kept only as a best-effort availability probe.

    Bug fix: previously the hardcoded table was built *inside* the try
    block wrapping the API call, so any network failure returned an empty
    list even though all the data is local. The table is now built
    unconditionally, and the probe's failure is merely logged.

    Returns:
        list[Stadium]: one record per NHL home arena.
    """
    print("  Fetching NHL arenas from NHL API...")
    # Best-effort probe only — the response is not used for venue data.
    try:
        response = requests.get("https://api-web.nhle.com/v1/standings/now", timeout=30)
        response.raise_for_status()
    except Exception as e:
        print(f"  Error fetching NHL arenas: {e}")
    # Curated NHL arenas with coordinates (authoritative source).
    nhl_arenas = {
        'TD Garden': {'city': 'Boston', 'state': 'MA', 'lat': 42.3662, 'lng': -71.0621, 'capacity': 17850, 'teams': ['BOS']},
        'KeyBank Center': {'city': 'Buffalo', 'state': 'NY', 'lat': 42.8750, 'lng': -78.8764, 'capacity': 19070, 'teams': ['BUF']},
        'Little Caesars Arena': {'city': 'Detroit', 'state': 'MI', 'lat': 42.3411, 'lng': -83.0553, 'capacity': 19515, 'teams': ['DET']},
        'Amerant Bank Arena': {'city': 'Sunrise', 'state': 'FL', 'lat': 26.1584, 'lng': -80.3256, 'capacity': 19250, 'teams': ['FLA']},
        'Bell Centre': {'city': 'Montreal', 'state': 'QC', 'lat': 45.4961, 'lng': -73.5693, 'capacity': 21302, 'teams': ['MTL']},
        'Canadian Tire Centre': {'city': 'Ottawa', 'state': 'ON', 'lat': 45.2969, 'lng': -75.9272, 'capacity': 18652, 'teams': ['OTT']},
        'Amalie Arena': {'city': 'Tampa', 'state': 'FL', 'lat': 27.9426, 'lng': -82.4519, 'capacity': 19092, 'teams': ['TBL']},
        'Scotiabank Arena': {'city': 'Toronto', 'state': 'ON', 'lat': 43.6435, 'lng': -79.3791, 'capacity': 18800, 'teams': ['TOR']},
        'PNC Arena': {'city': 'Raleigh', 'state': 'NC', 'lat': 35.8033, 'lng': -78.7220, 'capacity': 18680, 'teams': ['CAR']},
        'Nationwide Arena': {'city': 'Columbus', 'state': 'OH', 'lat': 39.9692, 'lng': -83.0061, 'capacity': 18500, 'teams': ['CBJ']},
        'Prudential Center': {'city': 'Newark', 'state': 'NJ', 'lat': 40.7334, 'lng': -74.1713, 'capacity': 16514, 'teams': ['NJD']},
        'UBS Arena': {'city': 'Elmont', 'state': 'NY', 'lat': 40.7170, 'lng': -73.7260, 'capacity': 17255, 'teams': ['NYI']},
        'Madison Square Garden': {'city': 'New York', 'state': 'NY', 'lat': 40.7505, 'lng': -73.9934, 'capacity': 18006, 'teams': ['NYR']},
        'Wells Fargo Center': {'city': 'Philadelphia', 'state': 'PA', 'lat': 39.9012, 'lng': -75.1720, 'capacity': 19500, 'teams': ['PHI']},
        'PPG Paints Arena': {'city': 'Pittsburgh', 'state': 'PA', 'lat': 40.4395, 'lng': -79.9892, 'capacity': 18387, 'teams': ['PIT']},
        'Capital One Arena': {'city': 'Washington', 'state': 'DC', 'lat': 38.8982, 'lng': -77.0209, 'capacity': 18573, 'teams': ['WSH']},
        'United Center': {'city': 'Chicago', 'state': 'IL', 'lat': 41.8807, 'lng': -87.6742, 'capacity': 19717, 'teams': ['CHI']},
        'Ball Arena': {'city': 'Denver', 'state': 'CO', 'lat': 39.7487, 'lng': -105.0077, 'capacity': 18007, 'teams': ['COL']},
        'American Airlines Center': {'city': 'Dallas', 'state': 'TX', 'lat': 32.7905, 'lng': -96.8103, 'capacity': 18532, 'teams': ['DAL']},
        'Xcel Energy Center': {'city': 'Saint Paul', 'state': 'MN', 'lat': 44.9448, 'lng': -93.1010, 'capacity': 17954, 'teams': ['MIN']},
        'Bridgestone Arena': {'city': 'Nashville', 'state': 'TN', 'lat': 36.1592, 'lng': -86.7785, 'capacity': 17159, 'teams': ['NSH']},
        'Enterprise Center': {'city': 'St. Louis', 'state': 'MO', 'lat': 38.6268, 'lng': -90.2025, 'capacity': 18096, 'teams': ['STL']},
        'Canada Life Centre': {'city': 'Winnipeg', 'state': 'MB', 'lat': 49.8928, 'lng': -97.1437, 'capacity': 15321, 'teams': ['WPG']},
        'Honda Center': {'city': 'Anaheim', 'state': 'CA', 'lat': 33.8078, 'lng': -117.8765, 'capacity': 17174, 'teams': ['ANA']},
        'Footprint Center': {'city': 'Tempe', 'state': 'AZ', 'lat': 33.4457, 'lng': -112.0712, 'capacity': 16210, 'teams': ['UTA']},
        'SAP Center': {'city': 'San Jose', 'state': 'CA', 'lat': 37.3327, 'lng': -121.9012, 'capacity': 17562, 'teams': ['SJS']},
        'Rogers Arena': {'city': 'Vancouver', 'state': 'BC', 'lat': 49.2778, 'lng': -123.1089, 'capacity': 18910, 'teams': ['VAN']},
        'T-Mobile Arena': {'city': 'Las Vegas', 'state': 'NV', 'lat': 36.1028, 'lng': -115.1784, 'capacity': 17500, 'teams': ['VGK']},
        'Climate Pledge Arena': {'city': 'Seattle', 'state': 'WA', 'lat': 47.6220, 'lng': -122.3540, 'capacity': 17100, 'teams': ['SEA']},
        'Crypto.com Arena': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0430, 'lng': -118.2673, 'capacity': 18230, 'teams': ['LAK']},
        'Rogers Place': {'city': 'Edmonton', 'state': 'AB', 'lat': 53.5469, 'lng': -113.4979, 'capacity': 18347, 'teams': ['EDM']},
        'Scotiabank Saddledome': {'city': 'Calgary', 'state': 'AB', 'lat': 51.0374, 'lng': -114.0519, 'capacity': 19289, 'teams': ['CGY']},
    }
    stadiums = [
        Stadium(
            id=f"nhl_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='NHL',
            team_abbrevs=info['teams'],
            source='nhl_hardcoded'
        )
        for name, info in nhl_arenas.items()
    ]
    print(f"  Found {len(stadiums)} NHL arenas")
    return stadiums
def scrape_nba_stadiums() -> list[Stadium]:
    """
    Fetch NBA arena data (hardcoded with accurate coordinates).
    """
    print(" Loading NBA arenas...")
    nba_arenas = {
        'State Farm Arena': {'city': 'Atlanta', 'state': 'GA', 'lat': 33.7573, 'lng': -84.3963, 'capacity': 18118, 'teams': ['ATL']},
        'TD Garden': {'city': 'Boston', 'state': 'MA', 'lat': 42.3662, 'lng': -71.0621, 'capacity': 19156, 'teams': ['BOS']},
        'Barclays Center': {'city': 'Brooklyn', 'state': 'NY', 'lat': 40.6826, 'lng': -73.9754, 'capacity': 17732, 'teams': ['BKN']},
        'Spectrum Center': {'city': 'Charlotte', 'state': 'NC', 'lat': 35.2251, 'lng': -80.8392, 'capacity': 19077, 'teams': ['CHA']},
        'United Center': {'city': 'Chicago', 'state': 'IL', 'lat': 41.8807, 'lng': -87.6742, 'capacity': 20917, 'teams': ['CHI']},
        'Rocket Mortgage FieldHouse': {'city': 'Cleveland', 'state': 'OH', 'lat': 41.4965, 'lng': -81.6882, 'capacity': 19432, 'teams': ['CLE']},
        'American Airlines Center': {'city': 'Dallas', 'state': 'TX', 'lat': 32.7905, 'lng': -96.8103, 'capacity': 19200, 'teams': ['DAL']},
        'Ball Arena': {'city': 'Denver', 'state': 'CO', 'lat': 39.7487, 'lng': -105.0077, 'capacity': 19520, 'teams': ['DEN']},
        'Little Caesars Arena': {'city': 'Detroit', 'state': 'MI', 'lat': 42.3411, 'lng': -83.0553, 'capacity': 20332, 'teams': ['DET']},
        'Chase Center': {'city': 'San Francisco', 'state': 'CA', 'lat': 37.7680, 'lng': -122.3879, 'capacity': 18064, 'teams': ['GSW']},
        'Toyota Center': {'city': 'Houston', 'state': 'TX', 'lat': 29.7508, 'lng': -95.3621, 'capacity': 18055, 'teams': ['HOU']},
        'Gainbridge Fieldhouse': {'city': 'Indianapolis', 'state': 'IN', 'lat': 39.7640, 'lng': -86.1555, 'capacity': 17923, 'teams': ['IND']},
        'Intuit Dome': {'city': 'Inglewood', 'state': 'CA', 'lat': 33.9425, 'lng': -118.3419, 'capacity': 18000, 'teams': ['LAC']},
        'Crypto.com Arena': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0430, 'lng': -118.2673, 'capacity': 18997, 'teams': ['LAL']},
        'FedExForum': {'city': 'Memphis', 'state': 'TN', 'lat': 35.1382, 'lng': -90.0506, 'capacity': 17794, 'teams': ['MEM']},
        'Kaseya Center': {'city': 'Miami', 'state': 'FL', 'lat': 25.7814, 'lng': -80.1870, 'capacity': 19600, 'teams': ['MIA']},
        'Fiserv Forum': {'city': 'Milwaukee', 'state': 'WI', 'lat': 43.0451, 'lng': -87.9174, 'capacity': 17341, 'teams': ['MIL']},
        'Target Center': {'city': 'Minneapolis', 'state': 'MN', 'lat': 44.9795, 'lng': -93.2761, 'capacity': 18978, 'teams': ['MIN']},
        'Smoothie King Center': {'city': 'New Orleans', 'state': 'LA', 'lat': 29.9490, 'lng': -90.0821, 'capacity': 16867, 'teams': ['NOP']},
        'Madison Square Garden': {'city': 'New York', 'state': 'NY', 'lat': 40.7505, 'lng': -73.9934, 'capacity': 19812, 'teams': ['NYK']},
        'Paycom Center': {'city': 'Oklahoma City', 'state': 'OK', 'lat': 35.4634, 'lng': -97.5151, 'capacity': 18203, 'teams': ['OKC']},
        'Kia Center': {'city': 'Orlando', 'state': 'FL', 'lat': 28.5392, 'lng': -81.3839, 'capacity': 18846, 'teams': ['ORL']},
        'Wells Fargo Center': {'city': 'Philadelphia', 'state': 'PA', 'lat': 39.9012, 'lng': -75.1720, 'capacity': 20478, 'teams': ['PHI']},
        'Footprint Center': {'city': 'Phoenix', 'state': 'AZ', 'lat': 33.4457, 'lng': -112.0712, 'capacity': 17071, 'teams': ['PHX']},
        'Moda Center': {'city': 'Portland', 'state': 'OR', 'lat': 45.5316, 'lng': -122.6668, 'capacity': 19393, 'teams': ['POR']},
        'Golden 1 Center': {'city': 'Sacramento', 'state': 'CA', 'lat': 38.5802, 'lng': -121.4997, 'capacity': 17608, 'teams': ['SAC']},
        'Frost Bank Center': {'city': 'San Antonio', 'state': 'TX', 'lat': 29.4270, 'lng': -98.4375, 'capacity': 18418, 'teams': ['SAS']},
        'Scotiabank Arena': {'city': 'Toronto', 'state': 'ON', 'lat': 43.6435, 'lng': -79.3791, 'capacity': 19800, 'teams': ['TOR']},
        'Delta Center': {'city': 'Salt Lake City', 'state': 'UT', 'lat': 40.7683, 'lng': -111.9011, 'capacity': 18306, 'teams': ['UTA']},
        'Capital One Arena': {'city': 'Washington', 'state': 'DC', 'lat': 38.8982, 'lng': -77.0209, 'capacity': 20356, 'teams': ['WAS']},
    }
    # One Stadium record per curated arena entry above.
    stadiums = [
        Stadium(
            id=f"nba_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='NBA',
            team_abbrevs=info['teams'],
            source='nba_hardcoded'
        )
        for name, info in nba_arenas.items()
    ]
    print(f" Found {len(stadiums)} NBA arenas")
    return stadiums
def scrape_wnba_stadiums() -> list[Stadium]:
    """
    Fetch WNBA arena data (hardcoded with accurate coordinates).
    """
    print(" Loading WNBA arenas...")
    wnba_arenas = {
        'Gateway Center Arena': {'city': 'College Park', 'state': 'GA', 'lat': 33.6532, 'lng': -84.4474, 'capacity': 3500, 'teams': ['ATL']},
        'Wintrust Arena': {'city': 'Chicago', 'state': 'IL', 'lat': 41.8658, 'lng': -87.6169, 'capacity': 10387, 'teams': ['CHI']},
        'Mohegan Sun Arena': {'city': 'Uncasville', 'state': 'CT', 'lat': 41.4932, 'lng': -72.0889, 'capacity': 10000, 'teams': ['CON']},
        'College Park Center': {'city': 'Arlington', 'state': 'TX', 'lat': 32.7299, 'lng': -97.1100, 'capacity': 7000, 'teams': ['DAL']},
        'Chase Center': {'city': 'San Francisco', 'state': 'CA', 'lat': 37.7680, 'lng': -122.3879, 'capacity': 18064, 'teams': ['GSV']},
        'Gainbridge Fieldhouse': {'city': 'Indianapolis', 'state': 'IN', 'lat': 39.7640, 'lng': -86.1555, 'capacity': 17923, 'teams': ['IND']},
        'Michelob ULTRA Arena': {'city': 'Las Vegas', 'state': 'NV', 'lat': 36.0909, 'lng': -115.1761, 'capacity': 12000, 'teams': ['LVA']},
        'Crypto.com Arena': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0430, 'lng': -118.2673, 'capacity': 18997, 'teams': ['LAS']},
        'Target Center': {'city': 'Minneapolis', 'state': 'MN', 'lat': 44.9795, 'lng': -93.2761, 'capacity': 20000, 'teams': ['MIN']},
        'Barclays Center': {'city': 'Brooklyn', 'state': 'NY', 'lat': 40.6826, 'lng': -73.9754, 'capacity': 17732, 'teams': ['NYL']},
        'Footprint Center': {'city': 'Phoenix', 'state': 'AZ', 'lat': 33.4457, 'lng': -112.0712, 'capacity': 17071, 'teams': ['PHX']},
        'Climate Pledge Arena': {'city': 'Seattle', 'state': 'WA', 'lat': 47.6220, 'lng': -122.3540, 'capacity': 18100, 'teams': ['SEA']},
        'Entertainment & Sports Arena': {'city': 'Washington', 'state': 'DC', 'lat': 38.8688, 'lng': -76.9731, 'capacity': 4200, 'teams': ['WAS']},
    }
    # One Stadium record per curated arena entry above.
    stadiums = [
        Stadium(
            id=f"wnba_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='WNBA',
            team_abbrevs=info['teams'],
            source='wnba_hardcoded'
        )
        for name, info in wnba_arenas.items()
    ]
    print(f" Found {len(stadiums)} WNBA arenas")
    return stadiums
def scrape_nwsl_stadiums() -> list[Stadium]:
    """
    Fetch NWSL stadium data (hardcoded with accurate coordinates).
    """
    print(" Loading NWSL stadiums...")
    nwsl_stadiums = {
        'BMO Stadium': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0128, 'lng': -118.2841, 'capacity': 22000, 'teams': ['ANG']},
        'WakeMed Soccer Park': {'city': 'Cary', 'state': 'NC', 'lat': 35.7645, 'lng': -78.7761, 'capacity': 10000, 'teams': ['NCC']},
        'SeatGeek Stadium': {'city': 'Bridgeview', 'state': 'IL', 'lat': 41.7653, 'lng': -87.8020, 'capacity': 20000, 'teams': ['CHI']},
        'Shell Energy Stadium': {'city': 'Houston', 'state': 'TX', 'lat': 29.7523, 'lng': -95.3526, 'capacity': 22039, 'teams': ['HOU']},
        'CPKC Stadium': {'city': 'Kansas City', 'state': 'MO', 'lat': 39.1243, 'lng': -94.8232, 'capacity': 11500, 'teams': ['KCC']},
        'Lynn Family Stadium': {'city': 'Louisville', 'state': 'KY', 'lat': 38.2210, 'lng': -85.7388, 'capacity': 15304, 'teams': ['LOU']},
        'Red Bull Arena': {'city': 'Harrison', 'state': 'NJ', 'lat': 40.7369, 'lng': -74.1503, 'capacity': 25000, 'teams': ['NJG']},
        'Inter&Co Stadium': {'city': 'Orlando', 'state': 'FL', 'lat': 28.5412, 'lng': -81.3896, 'capacity': 25500, 'teams': ['ORL']},
        'Providence Park': {'city': 'Portland', 'state': 'OR', 'lat': 45.5217, 'lng': -122.6918, 'capacity': 25218, 'teams': ['POR']},
        'Snapdragon Stadium': {'city': 'San Diego', 'state': 'CA', 'lat': 32.7839, 'lng': -117.1194, 'capacity': 32000, 'teams': ['SDW']},
        'PayPal Park': {'city': 'San Jose', 'state': 'CA', 'lat': 37.3512, 'lng': -121.9251, 'capacity': 18000, 'teams': ['SJE']},
        'Lumen Field': {'city': 'Seattle', 'state': 'WA', 'lat': 47.5952, 'lng': -122.3316, 'capacity': 69000, 'teams': ['SEA']},
        'America First Field': {'city': 'Sandy', 'state': 'UT', 'lat': 40.5829, 'lng': -111.8933, 'capacity': 20213, 'teams': ['UTA']},
        'Audi Field': {'city': 'Washington', 'state': 'DC', 'lat': 38.8687, 'lng': -77.0128, 'capacity': 20000, 'teams': ['WAS']},
    }
    # One Stadium record per curated venue entry above.
    stadiums = [
        Stadium(
            id=f"nwsl_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='NWSL',
            team_abbrevs=info['teams'],
            source='nwsl_hardcoded'
        )
        for name, info in nwsl_stadiums.items()
    ]
    print(f" Found {len(stadiums)} NWSL stadiums")
    return stadiums
def scrape_cbb_stadiums() -> list[Stadium]:
    """
    Fetch CBB (College Basketball) arena data from Wikipedia.
    This scrapes the List of NCAA Division I basketball arenas.
    """
    stadiums: list[Stadium] = []
    url = "https://en.wikipedia.org/wiki/List_of_NCAA_Division_I_basketball_arenas"
    print(" Fetching CBB arenas from Wikipedia...")
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        for table in soup.find_all('table', class_='wikitable'):
            for row in table.find_all('tr')[1:]:  # first row is the header
                cells = row.find_all(['td', 'th'])
                if len(cells) < 4:
                    continue
                try:
                    arena_name = cells[0].get_text(strip=True)
                    city_state = cells[1].get_text(strip=True)
                    capacity_text = cells[2].get_text(strip=True)
                    team = cells[3].get_text(strip=True)
                    # Capacity cells look like "12,345" — keep digits only.
                    capacity = int(re.sub(r'[^\d]', '', capacity_text) or 0)
                    if ',' in city_state:
                        city = city_state.split(',')[0].strip()
                        state = city_state.split(',')[-1].strip()
                    else:
                        city, state = city_state, ''
                    if not arena_name or capacity <= 0:
                        continue
                    stadiums.append(Stadium(
                        id=f"cbb_{arena_name.lower().replace(' ', '_')[:30]}",
                        name=arena_name,
                        city=city,
                        state=state,
                        latitude=0,  # the Wikipedia table carries no coordinates
                        longitude=0,
                        capacity=capacity,
                        sport='CBB',
                        team_abbrevs=[team[:3].upper()] if team else [],
                        source='wikipedia'
                    ))
                except (ValueError, IndexError):
                    continue  # malformed row; skip it
        print(f" Found {len(stadiums)} CBB arenas")
    except Exception as e:
        print(f" Error fetching CBB arenas: {e}")
    return stadiums
def scrape_all_stadiums() -> list[Stadium]:
    """
    Scrape stadium/venue data for ALL 8 sports.
    Returns a combined list of all venues.
    """
    print("\n" + "="*60)
    print("SCRAPING ALL STADIUMS/VENUES")
    print("="*60)
    scrapers = (
        # Pro leagues
        scrape_nba_stadiums,
        scrape_mlb_stadiums,
        scrape_nhl_stadiums,
        scrape_nfl_stadiums,
        scrape_wnba_stadiums,
        scrape_mls_stadiums,
        scrape_nwsl_stadiums,
        # College sports
        scrape_cbb_stadiums,
    )
    all_stadiums: list[Stadium] = []
    for scraper in scrapers:
        all_stadiums.extend(scraper())
    print(f"\n TOTAL: {len(all_stadiums)} stadiums/venues across all sports")
    return all_stadiums
def _build_manual_stadiums(sport: str, teams: dict, coords: dict, venue_key: str) -> list[Stadium]:
    """Build manually-curated Stadium records for one league.

    Args:
        sport: League code stored on each record (e.g. 'NBA').
        teams: League team table mapping abbrev -> info dict; the venue name
            is read from info[venue_key] and the city from info['city'].
        coords: Venue name -> (lat, lon) or (lat, lon, state, capacity).
            Trailing fields default to '' and 0; venues absent from the table
            fall back to (0, 0) so every team still yields a record.
        venue_key: 'arena' or 'stadium', matching the team table's schema.

    Returns:
        One Stadium per team, with source='manual'.
    """
    stadiums = []
    for abbrev, info in teams.items():
        venue = info[venue_key]
        data = coords.get(venue, (0, 0))
        stadiums.append(Stadium(
            id=f"manual_{sport.lower()}_{abbrev.lower()}",
            name=venue,
            city=info['city'],
            state=data[2] if len(data) > 2 else '',
            latitude=data[0],
            longitude=data[1],
            capacity=data[3] if len(data) > 3 else 0,
            sport=sport,
            team_abbrevs=[abbrev],
            source='manual'
        ))
    return stadiums
def generate_stadiums_from_teams() -> list[Stadium]:
    """
    Generate stadium data from team mappings with manual coordinates.
    This serves as a fallback/validation source.

    Coordinate tables are manually curated per league. NBA entries carry only
    (lat, lon); every other league also includes state and capacity.
    """
    # NBA Arenas with coordinates (manually curated)
    nba_coords = {
        'State Farm Arena': (33.7573, -84.3963),
        'TD Garden': (42.3662, -71.0621),
        'Barclays Center': (40.6826, -73.9754),
        'Spectrum Center': (35.2251, -80.8392),
        'United Center': (41.8807, -87.6742),
        'Rocket Mortgage FieldHouse': (41.4965, -81.6882),
        'American Airlines Center': (32.7905, -96.8103),
        'Ball Arena': (39.7487, -105.0077),
        'Little Caesars Arena': (42.3411, -83.0553),
        'Chase Center': (37.7680, -122.3879),
        'Toyota Center': (29.7508, -95.3621),
        'Gainbridge Fieldhouse': (39.7640, -86.1555),
        'Intuit Dome': (33.9425, -118.3419),
        'Crypto.com Arena': (34.0430, -118.2673),
        'FedExForum': (35.1382, -90.0506),
        'Kaseya Center': (25.7814, -80.1870),
        'Fiserv Forum': (43.0451, -87.9174),
        'Target Center': (44.9795, -93.2761),
        'Smoothie King Center': (29.9490, -90.0821),
        'Madison Square Garden': (40.7505, -73.9934),
        'Paycom Center': (35.4634, -97.5151),
        'Kia Center': (28.5392, -81.3839),
        'Wells Fargo Center': (39.9012, -75.1720),
        'Footprint Center': (33.4457, -112.0712),
        'Moda Center': (45.5316, -122.6668),
        'Golden 1 Center': (38.5802, -121.4997),
        'Frost Bank Center': (29.4270, -98.4375),
        'Scotiabank Arena': (43.6435, -79.3791),
        'Delta Center': (40.7683, -111.9011),
        'Capital One Arena': (38.8982, -77.0209),
    }
    # MLB Stadiums with coordinates
    mlb_coords = {
        'Chase Field': (33.4453, -112.0667, 'AZ', 48686),
        'Truist Park': (33.8907, -84.4678, 'GA', 41084),
        'Oriole Park at Camden Yards': (39.2838, -76.6218, 'MD', 45971),
        'Fenway Park': (42.3467, -71.0972, 'MA', 37755),
        'Wrigley Field': (41.9484, -87.6553, 'IL', 41649),
        'Guaranteed Rate Field': (41.8299, -87.6338, 'IL', 40615),
        'Great American Ball Park': (39.0979, -84.5082, 'OH', 42319),
        'Progressive Field': (41.4962, -81.6852, 'OH', 34830),
        'Coors Field': (39.7559, -104.9942, 'CO', 50144),
        'Comerica Park': (42.3390, -83.0485, 'MI', 41083),
        'Minute Maid Park': (29.7573, -95.3555, 'TX', 41168),
        'Kauffman Stadium': (39.0517, -94.4803, 'MO', 37903),
        'Angel Stadium': (33.8003, -117.8827, 'CA', 45517),
        'Dodger Stadium': (34.0739, -118.2400, 'CA', 56000),
        'LoanDepot Park': (25.7781, -80.2196, 'FL', 36742),
        'American Family Field': (43.0280, -87.9712, 'WI', 41900),
        'Target Field': (44.9817, -93.2776, 'MN', 38544),
        'Citi Field': (40.7571, -73.8458, 'NY', 41922),
        'Yankee Stadium': (40.8296, -73.9262, 'NY', 46537),
        'Sutter Health Park': (38.5802, -121.5097, 'CA', 14014),
        'Citizens Bank Park': (39.9061, -75.1665, 'PA', 42792),
        'PNC Park': (40.4469, -80.0057, 'PA', 38362),
        'Petco Park': (32.7076, -117.1570, 'CA', 40209),
        'Oracle Park': (37.7786, -122.3893, 'CA', 41265),
        'T-Mobile Park': (47.5914, -122.3325, 'WA', 47929),
        'Busch Stadium': (38.6226, -90.1928, 'MO', 45494),
        'Tropicana Field': (27.7682, -82.6534, 'FL', 25000),
        'Globe Life Field': (32.7473, -97.0845, 'TX', 40300),
        'Rogers Centre': (43.6414, -79.3894, 'ON', 49282),
        'Nationals Park': (38.8730, -77.0074, 'DC', 41339),
    }
    # NHL Arenas with coordinates
    nhl_coords = {
        'Honda Center': (33.8078, -117.8765, 'CA', 17174),
        'Delta Center': (40.7683, -111.9011, 'UT', 18306),
        'TD Garden': (42.3662, -71.0621, 'MA', 17565),
        'KeyBank Center': (42.8750, -78.8764, 'NY', 19070),
        'Scotiabank Saddledome': (51.0374, -114.0519, 'AB', 19289),
        'PNC Arena': (35.8034, -78.7220, 'NC', 18680),
        'United Center': (41.8807, -87.6742, 'IL', 19717),
        'Ball Arena': (39.7487, -105.0077, 'CO', 18007),
        'Nationwide Arena': (39.9693, -83.0061, 'OH', 18500),
        'American Airlines Center': (32.7905, -96.8103, 'TX', 18532),
        'Little Caesars Arena': (42.3411, -83.0553, 'MI', 19515),
        'Rogers Place': (53.5469, -113.4978, 'AB', 18347),
        'Amerant Bank Arena': (26.1584, -80.3256, 'FL', 19250),
        'Crypto.com Arena': (34.0430, -118.2673, 'CA', 18230),
        'Xcel Energy Center': (44.9448, -93.1010, 'MN', 17954),
        'Bell Centre': (45.4961, -73.5693, 'QC', 21302),
        'Bridgestone Arena': (36.1592, -86.7785, 'TN', 17159),
        'Prudential Center': (40.7334, -74.1712, 'NJ', 16514),
        'UBS Arena': (40.7161, -73.7246, 'NY', 17255),
        'Madison Square Garden': (40.7505, -73.9934, 'NY', 18006),
        'Canadian Tire Centre': (45.2969, -75.9272, 'ON', 18652),
        'Wells Fargo Center': (39.9012, -75.1720, 'PA', 19543),
        'PPG Paints Arena': (40.4395, -79.9892, 'PA', 18387),
        'SAP Center': (37.3327, -121.9010, 'CA', 17562),
        'Climate Pledge Arena': (47.6221, -122.3540, 'WA', 17100),
        'Enterprise Center': (38.6268, -90.2025, 'MO', 18096),
        'Amalie Arena': (27.9426, -82.4519, 'FL', 19092),
        'Scotiabank Arena': (43.6435, -79.3791, 'ON', 18819),
        'Rogers Arena': (49.2778, -123.1089, 'BC', 18910),
        'T-Mobile Arena': (36.1028, -115.1784, 'NV', 17500),
        'Capital One Arena': (38.8982, -77.0209, 'DC', 18573),
        'Canada Life Centre': (49.8928, -97.1436, 'MB', 15321),
    }
    # WNBA Arenas with coordinates
    wnba_coords = {
        'Gateway Center Arena': (33.6534, -84.4480, 'GA', 3500),
        'Wintrust Arena': (41.8622, -87.6164, 'IL', 10387),
        'Mohegan Sun Arena': (41.4946, -72.0874, 'CT', 10000),
        'College Park Center': (32.7298, -97.1137, 'TX', 7000),
        'Chase Center': (37.7680, -122.3879, 'CA', 18064),
        'Gainbridge Fieldhouse': (39.7640, -86.1555, 'IN', 17274),
        'Michelob Ultra Arena': (36.0929, -115.1757, 'NV', 12000),
        'Crypto.com Arena': (34.0430, -118.2673, 'CA', 19068),
        'Target Center': (44.9795, -93.2761, 'MN', 17500),
        'Barclays Center': (40.6826, -73.9754, 'NY', 17732),
        'Footprint Center': (33.4457, -112.0712, 'AZ', 17000),
        'Climate Pledge Arena': (47.6221, -122.3540, 'WA', 17100),
        'Entertainment & Sports Arena': (38.8701, -76.9728, 'DC', 4200),
    }
    # MLS Stadiums with coordinates
    mls_coords = {
        'Mercedes-Benz Stadium': (33.7553, -84.4006, 'GA', 71000),
        'Q2 Stadium': (30.3876, -97.7200, 'TX', 20738),
        'Bank of America Stadium': (35.2258, -80.8528, 'NC', 74867),
        'Soldier Field': (41.8623, -87.6167, 'IL', 61500),
        'TQL Stadium': (39.1113, -84.5212, 'OH', 26000),
        "Dick's Sporting Goods Park": (39.8056, -104.8919, 'CO', 18061),
        'Lower.com Field': (39.9689, -83.0173, 'OH', 20371),
        'Toyota Stadium': (33.1546, -96.8353, 'TX', 20500),
        'Audi Field': (38.8686, -77.0128, 'DC', 20000),
        'Shell Energy Stadium': (29.7523, -95.3522, 'TX', 22039),
        'Dignity Health Sports Park': (33.8644, -118.2611, 'CA', 27000),
        'BMO Stadium': (34.0128, -118.2841, 'CA', 22000),
        'Chase Stadium': (26.1902, -80.1630, 'FL', 21550),
        'Allianz Field': (44.9532, -93.1653, 'MN', 19400),
        'Stade Saputo': (45.5628, -73.5530, 'QC', 19619),
        'Geodis Park': (36.1303, -86.7663, 'TN', 30000),
        'Gillette Stadium': (42.0909, -71.2643, 'MA', 65878),
        'Yankee Stadium': (40.8296, -73.9262, 'NY', 46537),
        'Red Bull Arena': (40.7368, -74.1503, 'NJ', 25000),
        'Inter&Co Stadium': (28.5411, -81.3899, 'FL', 25500),
        'Subaru Park': (39.8328, -75.3789, 'PA', 18500),
        'Providence Park': (45.5217, -122.6917, 'OR', 25218),
        'America First Field': (40.5828, -111.8933, 'UT', 20213),
        'PayPal Park': (37.3513, -121.9253, 'CA', 18000),
        'Lumen Field': (47.5952, -122.3316, 'WA', 68740),
        "Children's Mercy Park": (39.1218, -94.8234, 'KS', 18467),
        'CityPark': (38.6322, -90.2094, 'MO', 22500),
        'BMO Field': (43.6332, -79.4186, 'ON', 30000),
        'BC Place': (49.2768, -123.1118, 'BC', 54320),
        'Snapdragon Stadium': (32.7839, -117.1224, 'CA', 35000),
    }
    # NWSL Stadiums with coordinates
    nwsl_coords = {
        'BMO Stadium': (34.0128, -118.2841, 'CA', 22000),
        'PayPal Park': (37.3513, -121.9253, 'CA', 18000),
        'SeatGeek Stadium': (41.6462, -87.7304, 'IL', 20000),
        'Shell Energy Stadium': (29.7523, -95.3522, 'TX', 22039),
        'CPKC Stadium': (39.0851, -94.5582, 'KS', 11500),
        'Red Bull Arena': (40.7368, -74.1503, 'NJ', 25000),
        'WakeMed Soccer Park': (35.8589, -78.7989, 'NC', 10000),
        'Inter&Co Stadium': (28.5411, -81.3899, 'FL', 25500),
        'Providence Park': (45.5217, -122.6917, 'OR', 25218),
        'Lumen Field': (47.5952, -122.3316, 'WA', 68740),
        'Snapdragon Stadium': (32.7839, -117.1224, 'CA', 35000),
        'America First Field': (40.5828, -111.8933, 'UT', 20213),
        'Audi Field': (38.8686, -77.0128, 'DC', 20000),
    }
    # NFL Stadiums with coordinates
    nfl_coords = {
        'State Farm Stadium': (33.5276, -112.2626, 'AZ', 63400),
        'Mercedes-Benz Stadium': (33.7553, -84.4006, 'GA', 71000),
        'M&T Bank Stadium': (39.2780, -76.6227, 'MD', 71008),
        'Highmark Stadium': (42.7738, -78.7870, 'NY', 71608),
        'Bank of America Stadium': (35.2258, -80.8528, 'NC', 74867),
        'Soldier Field': (41.8623, -87.6167, 'IL', 61500),
        'Paycor Stadium': (39.0954, -84.5160, 'OH', 65515),
        'Cleveland Browns Stadium': (41.5061, -81.6995, 'OH', 67431),
        'AT&T Stadium': (32.7480, -97.0928, 'TX', 80000),
        'Empower Field at Mile High': (39.7439, -105.0201, 'CO', 76125),
        'Ford Field': (42.3400, -83.0456, 'MI', 65000),
        'Lambeau Field': (44.5013, -88.0622, 'WI', 81435),
        'NRG Stadium': (29.6847, -95.4107, 'TX', 72220),
        'Lucas Oil Stadium': (39.7601, -86.1639, 'IN', 67000),
        'EverBank Stadium': (30.3239, -81.6373, 'FL', 67814),
        'GEHA Field at Arrowhead Stadium': (39.0489, -94.4839, 'MO', 76416),
        'Allegiant Stadium': (36.0909, -115.1833, 'NV', 65000),
        'SoFi Stadium': (33.9535, -118.3392, 'CA', 70240),
        'Hard Rock Stadium': (25.9580, -80.2389, 'FL', 65326),
        'U.S. Bank Stadium': (44.9737, -93.2577, 'MN', 66655),
        'Gillette Stadium': (42.0909, -71.2643, 'MA', 65878),
        'Caesars Superdome': (29.9511, -90.0812, 'LA', 73208),
        'MetLife Stadium': (40.8128, -74.0742, 'NJ', 82500),
        'Lincoln Financial Field': (39.9008, -75.1674, 'PA', 69176),
        'Acrisure Stadium': (40.4468, -80.0158, 'PA', 68400),
        "Levi's Stadium": (37.4032, -121.9698, 'CA', 68500),
        'Lumen Field': (47.5952, -122.3316, 'WA', 68740),
        'Raymond James Stadium': (27.9759, -82.5033, 'FL', 65618),
        'Nissan Stadium': (36.1665, -86.7713, 'TN', 69143),
        'Northwest Stadium': (38.9076, -76.8645, 'MD', 67617),
    }
    # Output order matches the original per-league ordering.
    stadiums = []
    stadiums.extend(_build_manual_stadiums('NBA', NBA_TEAMS, nba_coords, 'arena'))
    stadiums.extend(_build_manual_stadiums('MLB', MLB_TEAMS, mlb_coords, 'stadium'))
    stadiums.extend(_build_manual_stadiums('NHL', NHL_TEAMS, nhl_coords, 'arena'))
    stadiums.extend(_build_manual_stadiums('WNBA', WNBA_TEAMS, wnba_coords, 'arena'))
    stadiums.extend(_build_manual_stadiums('MLS', MLS_TEAMS, mls_coords, 'stadium'))
    stadiums.extend(_build_manual_stadiums('NWSL', NWSL_TEAMS, nwsl_coords, 'stadium'))
    stadiums.extend(_build_manual_stadiums('NFL', NFL_TEAMS, nfl_coords, 'stadium'))
    return stadiums
# =============================================================================
# HELPERS
# =============================================================================
def assign_stable_ids(games: list[Game], sport: str, season: str) -> list[Game]:
    """
    Assign IDs based on matchup + date.
    Format: {sport}_{season}_{away}_{home}_{MMDD} (or {MMDD}_2 for doubleheaders)
    When games are rescheduled, the old ID becomes orphaned and a new one is created.
    Use --delete-all before import to clean up orphaned records.
    """
    sport_key = sport.lower()
    season_key = season.replace('-', '')
    # Occurrence counter per base ID; a second hit means a doubleheader.
    seen: dict[str, int] = {}
    for game in games:
        # Pull MMDD out of a YYYY-MM-DD date; anything malformed gets "0000".
        parts = game.date.split('-')
        mmdd = parts[1] + parts[2] if len(parts) == 3 else "0000"
        base = (
            f"{sport_key}_{season_key}_"
            f"{game.away_team_abbrev.lower()}_{game.home_team_abbrev.lower()}_{mmdd}"
        )
        count = seen.get(base, 0) + 1
        seen[base] = count
        # First game keeps the bare ID; later games get a _2, _3, ... suffix.
        game.id = base if count == 1 else f"{base}_{count}"
    return games
def get_team_abbrev(team_name: str, sport: str) -> str:
    """Get team abbreviation from full name."""
    league_tables = {
        'NBA': NBA_TEAMS,
        'MLB': MLB_TEAMS,
        'NHL': NHL_TEAMS,
        'WNBA': WNBA_TEAMS,
        'MLS': MLS_TEAMS,
        'NWSL': NWSL_TEAMS,
    }
    roster = league_tables.get(sport, {})
    needle = team_name.lower()
    for code, meta in roster.items():
        # Substring match subsumes the exact-equality case.
        if needle in meta['name'].lower():
            return code
    # No match anywhere: fall back to the first 3 letters.
    return team_name[:3].upper()
def validate_games(games_by_source: dict) -> dict:
    """
    Cross-validate games from multiple sources.

    The first source in ``games_by_source`` is treated as the primary and
    every other source is compared against it by game ID.

    Returns a dict of discrepancy lists:
        missing_in_source: games present in one source but absent from the
                           other (checked in BOTH directions)
        date_mismatch:     same game ID, different scheduled date
        time_mismatch:     same game ID, different start time
        venue_mismatch:    same game ID, different venue
    """
    discrepancies = {
        'missing_in_source': [],
        'date_mismatch': [],
        'time_mismatch': [],
        'venue_mismatch': [],
    }
    sources = list(games_by_source.keys())
    if len(sources) < 2:
        # Nothing to cross-check against.
        return discrepancies
    primary = sources[0]
    primary_games = {g.id: g for g in games_by_source[primary]}
    for source in sources[1:]:
        secondary_games = {g.id: g for g in games_by_source[source]}
        # Games the secondary source is missing.
        for game_id in primary_games:
            if game_id not in secondary_games:
                discrepancies['missing_in_source'].append({
                    'game_id': game_id,
                    'present_in': primary,
                    'missing_in': source
                })
        # Reverse direction: games the primary source is missing.
        for game_id in secondary_games:
            if game_id not in primary_games:
                discrepancies['missing_in_source'].append({
                    'game_id': game_id,
                    'present_in': source,
                    'missing_in': primary
                })
        # Field-level comparison for games both sources have.
        for game_id, game in primary_games.items():
            other = secondary_games.get(game_id)
            if other is None:
                continue
            if game.date != other.date:
                discrepancies['date_mismatch'].append({
                    'game_id': game_id,
                    primary: game.date,
                    source: other.date,
                })
            # NOTE(review): the exact attribute names for start time and venue
            # are not visible in this file; getattr keeps these checks safe if
            # the Game dataclass names them differently -- confirm against its
            # definition.
            if getattr(game, 'time', None) != getattr(other, 'time', None):
                discrepancies['time_mismatch'].append({
                    'game_id': game_id,
                    primary: getattr(game, 'time', None),
                    source: getattr(other, 'time', None),
                })
            if getattr(game, 'venue', None) != getattr(other, 'venue', None):
                discrepancies['venue_mismatch'].append({
                    'game_id': game_id,
                    primary: getattr(game, 'venue', None),
                    source: getattr(other, 'venue', None),
                })
    return discrepancies
def export_to_json(games: list[Game], stadiums: list[Stadium], output_dir: Path):
    """
    Export scraped data to organized JSON files.
    Structure:
    data/
    games/
    mlb_2025.json
    nba_2025.json
    ...
    canonical/
    stadiums.json
    stadiums.json (legacy, for backward compatibility)

    Also writes combined games.json plus games.csv / stadiums.csv at the
    root for backward compatibility and easy viewing.
    """
    from collections import defaultdict  # local import, matching file style
    output_dir.mkdir(parents=True, exist_ok=True)
    # Create subdirectories
    games_dir = output_dir / 'games'
    canonical_dir = output_dir / 'canonical'
    games_dir.mkdir(exist_ok=True)
    canonical_dir.mkdir(exist_ok=True)
    # Group games by sport and season, keyed e.g. "mlb_2025".
    games_by_sport_season = defaultdict(list)
    for game in games:
        games_by_sport_season[f"{game.sport.lower()}_{game.season}"].append(game)
    # Export games by sport/season
    total_exported = 0
    for key, sport_games in games_by_sport_season.items():
        games_data = [asdict(g) for g in sport_games]
        filepath = games_dir / f"{key}.json"
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(games_data, f, indent=2)
        print(f" Exported {len(sport_games):,} games to games/{key}.json")
        total_exported += len(sport_games)
    # Export combined games.json for backward compatibility
    all_games_data = [asdict(g) for g in games]
    with open(output_dir / 'games.json', 'w', encoding='utf-8') as f:
        json.dump(all_games_data, f, indent=2)
    # Export stadiums to canonical/
    stadiums_data = [asdict(s) for s in stadiums]
    with open(canonical_dir / 'stadiums.json', 'w', encoding='utf-8') as f:
        json.dump(stadiums_data, f, indent=2)
    # Also export to root for backward compatibility
    with open(output_dir / 'stadiums.json', 'w', encoding='utf-8') as f:
        json.dump(stadiums_data, f, indent=2)
    # Export as CSV for easy viewing
    if games:
        pd.DataFrame(all_games_data).to_csv(output_dir / 'games.csv', index=False)
    if stadiums:
        pd.DataFrame(stadiums_data).to_csv(output_dir / 'stadiums.csv', index=False)
    print(f"\nExported {total_exported:,} games across {len(games_by_sport_season)} sport/season files")
    print(f"Exported {len(stadiums):,} stadiums to canonical/stadiums.json")
# =============================================================================
# MAIN
# =============================================================================
def main():
    """CLI entry point: scrape stadiums and schedules, then export to disk."""
    parser = argparse.ArgumentParser(description='Scrape sports schedules')
    parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], default='all')
    parser.add_argument('--season', type=int, default=2026, help='Season year (ending year)')
    parser.add_argument('--stadiums-only', action='store_true', help='Only scrape stadium data (legacy method)')
    parser.add_argument('--stadiums-update', action='store_true', help='Scrape ALL stadium data for all 8 sports (comprehensive)')
    parser.add_argument('--output', type=str, default='./data', help='Output directory')
    args = parser.parse_args()
    output_dir = Path(args.output)
    all_games = []
    all_stadiums = []
    # Scrape stadiums
    print("\n" + "="*60)
    print("SCRAPING STADIUMS")
    print("="*60)
    if args.stadiums_update:
        # Comprehensive scraping for all 8 supported sports.
        # (Message previously said "11 sports" -- stale after CFB/NASCAR/PGA removal.)
        print("Using comprehensive stadium scrapers for all 8 sports...")
        all_stadiums.extend(scrape_all_stadiums())
        print(f" Total stadiums scraped: {len(all_stadiums)}")
    else:
        # Legacy method (HIFLD + manual team mappings)
        all_stadiums.extend(scrape_stadiums_hifld())
        all_stadiums.extend(generate_stadiums_from_teams())
    # If stadiums-only mode, export and exit (skip schedule scraping)
    if args.stadiums_only:
        export_to_json([], all_stadiums, output_dir)
        return
    # Scrape schedules with multi-source fallback.
    # Each entry: (league code, cross_year, prioritized sources).
    # cross_year leagues span two calendar years, so their season label is
    # e.g. "2025-26" instead of "2026". Order here is the processing order.
    league_configs = [
        ('NBA', True, [
            ScraperSource('Basketball-Reference', scrape_nba_basketball_reference, priority=1, min_games=500),
            ScraperSource('ESPN', scrape_nba_espn, priority=2, min_games=500),
            ScraperSource('CBS Sports', scrape_nba_cbssports, priority=3, min_games=100),
        ]),
        ('MLB', False, [
            ScraperSource('MLB Stats API', scrape_mlb_statsapi, priority=1, min_games=1000),
            ScraperSource('Baseball-Reference', scrape_mlb_baseball_reference, priority=2, min_games=500),
            ScraperSource('ESPN', scrape_mlb_espn, priority=3, min_games=500),
        ]),
        ('NHL', True, [
            ScraperSource('Hockey-Reference', scrape_nhl_hockey_reference, priority=1, min_games=500),
            ScraperSource('ESPN', scrape_nhl_espn, priority=2, min_games=500),
            ScraperSource('NHL API', scrape_nhl_api, priority=3, min_games=100),
        ]),
        ('WNBA', False, [
            ScraperSource('ESPN', scrape_wnba_espn, priority=1, min_games=100),
            ScraperSource('Basketball-Reference', scrape_wnba_basketball_reference, priority=2, min_games=100),
            ScraperSource('CBS Sports', scrape_wnba_cbssports, priority=3, min_games=50),
        ]),
        ('MLS', False, [
            ScraperSource('ESPN', scrape_mls_espn, priority=1, min_games=200),
            ScraperSource('FBref', scrape_mls_fbref, priority=2, min_games=100),
            ScraperSource('MLSSoccer.com', scrape_mls_mlssoccer, priority=3, min_games=100),
        ]),
        ('NWSL', False, [
            ScraperSource('ESPN', scrape_nwsl_espn, priority=1, min_games=100),
            ScraperSource('FBref', scrape_nwsl_fbref, priority=2, min_games=50),
            ScraperSource('NWSL.com', scrape_nwsl_nwslsoccer, priority=3, min_games=50),
        ]),
        ('NFL', True, [
            ScraperSource('ESPN', scrape_nfl_espn, priority=1, min_games=200),
            ScraperSource('Pro-Football-Reference', scrape_nfl_pro_football_reference, priority=2, min_games=200),
            ScraperSource('CBS Sports', scrape_nfl_cbssports, priority=3, min_games=100),
        ]),
        ('CBB', True, [
            ScraperSource('ESPN', scrape_cbb_espn, priority=1, min_games=1000),
            ScraperSource('Sports-Reference', scrape_cbb_sports_reference, priority=2, min_games=500),
            ScraperSource('CBS Sports', scrape_cbb_cbssports, priority=3, min_games=300),
        ]),
    ]
    for league, cross_year, sources in league_configs:
        if args.sport not in (league.lower(), 'all'):
            continue
        print("\n" + "="*60)
        print(f"SCRAPING {league} {args.season}")
        print("="*60)
        games = scrape_with_fallback(league, args.season, sources)
        # e.g. "2025-26" for cross-year leagues, "2026" otherwise
        season_label = f"{args.season-1}-{str(args.season)[2:]}" if cross_year else str(args.season)
        games = assign_stable_ids(games, league, season_label)
        all_games.extend(games)
    # Export
    print("\n" + "="*60)
    print("EXPORTING DATA")
    print("="*60)
    export_to_json(all_games, all_stadiums, output_dir)
    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Total games scraped: {len(all_games)}")
    print(f"Total stadiums: {len(all_stadiums)}")
    # Games by sport
    by_sport = {}
    for g in all_games:
        by_sport[g.sport] = by_sport.get(g.sport, 0) + 1
    for sport, count in by_sport.items():
        print(f" {sport}: {count} games")
if __name__ == '__main__':
    main()