- Remove College Football, NASCAR, and PGA from scraper and app - Clean all data files (stadiums, games, pipeline reports) - Update Sport.swift enum and all UI components - Add sportstime.py CLI tool for pipeline management - Add DATA_SCRAPING.md documentation - Add WNBA/MLS/NWSL implementation documentation - Scraper now supports: NBA, MLB, NHL, NFL, WNBA, MLS, NWSL, CBB Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
3360 lines
140 KiB
Python
3360 lines
140 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Sports Schedule Scraper for SportsTime App
|
|
Scrapes NBA, MLB, NHL, NFL, WNBA, MLS, NWSL, and CBB schedules from multiple sources for cross-validation.
|
|
|
|
Usage:
|
|
python scrape_schedules.py --sport nba --season 2026
|
|
python scrape_schedules.py --sport all --season 2026
|
|
python scrape_schedules.py --stadiums-only
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import time
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from dataclasses import dataclass, asdict
|
|
from typing import Optional
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import pandas as pd
|
|
|
|
# Rate limiting: shared by rate_limit()/fetch_page() below.
REQUEST_DELAY = 3.0  # seconds between requests to same domain
last_request_time = {}  # domain -> epoch seconds of the most recent request to it
|
|
|
|
|
|
def rate_limit(domain: str):
    """Sleep just long enough that requests to *domain* stay REQUEST_DELAY apart.

    Reads and updates the module-level ``last_request_time`` map; the first
    request to a domain is never delayed.
    """
    previous = last_request_time.get(domain)
    if previous is not None:
        remaining = REQUEST_DELAY - (time.time() - previous)
        if remaining > 0:
            time.sleep(remaining)
    last_request_time[domain] = time.time()
|
|
|
|
|
|
def fetch_page(url: str, domain: str) -> Optional[BeautifulSoup]:
    """Download *url* (rate-limited per *domain*) and return parsed HTML.

    Sends browser-like headers to avoid trivial bot blocking. On any
    request, HTTP-status, or parse failure a message is printed and
    ``None`` is returned instead of raising.
    """
    rate_limit(domain)

    # Mimic a desktop Chrome request; some of these sites reject bare clients.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0',
    }

    try:
        response = requests.get(url, headers=browser_headers, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except Exception as exc:
        print(f"Error fetching {url}: {exc}")
        return None
|
|
|
|
|
|
# =============================================================================
|
|
# DATA CLASSES
|
|
# =============================================================================
|
|
|
|
@dataclass
class Game:
    """One scheduled game, normalized to a common shape across all sources."""
    id: str  # unique game id; '' from some sources until assigned downstream
    sport: str  # league label, e.g. 'NBA', 'MLB'
    season: str  # season label, e.g. '2024-25' (NBA) or '2025' (MLB)
    date: str  # YYYY-MM-DD
    time: Optional[str]  # HH:MM (24hr, ET)
    home_team: str  # full home team name
    away_team: str  # full away team name
    home_team_abbrev: str
    away_team_abbrev: str
    venue: str  # venue name; '' when the source does not provide it
    source: str  # domain the record was scraped from
    is_playoff: bool = False
    broadcast: Optional[str] = None  # TV/streaming info when available
|
|
|
|
|
|
@dataclass
class Stadium:
    """A venue record with location and capacity metadata."""
    id: str
    name: str
    city: str
    state: str
    latitude: float
    longitude: float
    capacity: int
    sport: str  # league label this venue entry belongs to
    team_abbrevs: list  # abbreviations of teams that play home games here
    source: str  # where the venue data was scraped from
    year_opened: Optional[int] = None  # None when the source omits it
|
|
|
|
|
|
# =============================================================================
|
|
# MULTI-SOURCE FALLBACK SYSTEM
|
|
# =============================================================================
|
|
|
|
from dataclasses import field
|
|
from typing import Callable
|
|
|
|
@dataclass
class ScraperSource:
    """Represents a single data source for scraping.

    Consumed by scrape_with_fallback(), which tries sources in ascending
    priority order until one returns at least ``min_games`` games.
    """
    name: str  # human-readable label used in log output
    scraper_func: Callable[[int], list]  # Takes season, returns list[Game]
    priority: int = 1  # Lower = higher priority (1 is best)
    min_games: int = 10  # Minimum games to consider successful
|
|
|
|
|
|
def scrape_with_fallback(
    sport: str,
    season: int,
    sources: list[ScraperSource],
    verbose: bool = True
) -> list:
    """Return games from the first source that yields enough results.

    Sources are attempted in ascending ``priority`` order. A source counts
    as successful when it returns at least its ``min_games`` games; on a
    short result or any exception the next source is tried.

    Args:
        sport: Sport name, used only for log messages.
        season: Season year forwarded to each scraper function.
        sources: ScraperSource configs; re-sorted here by priority.
        verbose: Whether to print per-attempt status messages.

    Returns:
        List of Game objects from the first successful source, or [] when
        every source fails.
    """
    ordered = sorted(sources, key=lambda src: src.priority)
    total = len(ordered)

    for attempt_no, src in enumerate(ordered, start=1):
        try:
            if verbose:
                attempt = f"[{attempt_no}/{total}]"
                print(f"  {attempt} Trying {src.name}...")

            games = src.scraper_func(season)

            if games and len(games) >= src.min_games:
                if verbose:
                    print(f"  ✓ {src.name} returned {len(games)} games")
                return games

            if verbose:
                count = len(games) if games else 0
                print(f"  ✗ {src.name} returned only {count} games (min: {src.min_games})")

        except Exception as exc:
            if verbose:
                print(f"  ✗ {src.name} failed: {exc}")
            continue

    # Every source either errored out or came back short.
    if verbose:
        print(f"  ⚠ All {total} sources failed for {sport}")
    return []
|
|
|
|
|
|
@dataclass
class StadiumScraperSource:
    """Represents a single data source for stadium scraping.

    Consumed by scrape_stadiums_with_fallback(), which tries sources in
    ascending priority order until one returns at least ``min_venues``.
    """
    name: str  # human-readable label used in log output
    scraper_func: Callable[[], list]  # Returns list[Stadium]
    priority: int = 1  # Lower = higher priority (1 is best)
    min_venues: int = 5  # Minimum venues to consider successful
|
|
|
|
|
|
def scrape_stadiums_with_fallback(
    sport: str,
    sources: list[StadiumScraperSource],
    verbose: bool = True
) -> list:
    """Return venues from the first stadium source that yields enough results.

    Sources are attempted in ascending ``priority`` order. A source counts
    as successful when it returns at least its ``min_venues`` venues; on a
    short result or any exception the next source is tried.

    Args:
        sport: Sport name, used only for log messages.
        sources: StadiumScraperSource configs; re-sorted here by priority.
        verbose: Whether to print per-attempt status messages.

    Returns:
        List of Stadium objects from the first successful source, or []
        when every source fails.
    """
    ordered = sorted(sources, key=lambda src: src.priority)
    total = len(ordered)

    for attempt_no, src in enumerate(ordered, start=1):
        try:
            if verbose:
                attempt = f"[{attempt_no}/{total}]"
                print(f"  {attempt} Trying {src.name}...")

            stadiums = src.scraper_func()

            if stadiums and len(stadiums) >= src.min_venues:
                if verbose:
                    print(f"  ✓ {src.name} returned {len(stadiums)} venues")
                return stadiums

            if verbose:
                count = len(stadiums) if stadiums else 0
                print(f"  ✗ {src.name} returned only {count} venues (min: {src.min_venues})")

        except Exception as exc:
            if verbose:
                print(f"  ✗ {src.name} failed: {exc}")
            continue

    # Every source either errored out or came back short.
    if verbose:
        print(f"  ⚠ All {total} sources failed for {sport}")
    return []
|
|
|
|
|
|
# =============================================================================
|
|
# TEAM MAPPINGS
|
|
# =============================================================================
|
|
|
|
# NBA team metadata keyed by abbreviation. NOTE(review): BRK/CHO/PHO look like
# Basketball-Reference-style codes rather than official NBA ones (BKN/CHA/PHX)
# -- confirm against get_team_abbrev before relying on them elsewhere.
NBA_TEAMS = {
    'ATL': {'name': 'Atlanta Hawks', 'city': 'Atlanta', 'arena': 'State Farm Arena'},
    'BOS': {'name': 'Boston Celtics', 'city': 'Boston', 'arena': 'TD Garden'},
    'BRK': {'name': 'Brooklyn Nets', 'city': 'Brooklyn', 'arena': 'Barclays Center'},
    'CHO': {'name': 'Charlotte Hornets', 'city': 'Charlotte', 'arena': 'Spectrum Center'},
    'CHI': {'name': 'Chicago Bulls', 'city': 'Chicago', 'arena': 'United Center'},
    'CLE': {'name': 'Cleveland Cavaliers', 'city': 'Cleveland', 'arena': 'Rocket Mortgage FieldHouse'},
    'DAL': {'name': 'Dallas Mavericks', 'city': 'Dallas', 'arena': 'American Airlines Center'},
    'DEN': {'name': 'Denver Nuggets', 'city': 'Denver', 'arena': 'Ball Arena'},
    'DET': {'name': 'Detroit Pistons', 'city': 'Detroit', 'arena': 'Little Caesars Arena'},
    'GSW': {'name': 'Golden State Warriors', 'city': 'San Francisco', 'arena': 'Chase Center'},
    'HOU': {'name': 'Houston Rockets', 'city': 'Houston', 'arena': 'Toyota Center'},
    'IND': {'name': 'Indiana Pacers', 'city': 'Indianapolis', 'arena': 'Gainbridge Fieldhouse'},
    'LAC': {'name': 'Los Angeles Clippers', 'city': 'Inglewood', 'arena': 'Intuit Dome'},
    'LAL': {'name': 'Los Angeles Lakers', 'city': 'Los Angeles', 'arena': 'Crypto.com Arena'},
    'MEM': {'name': 'Memphis Grizzlies', 'city': 'Memphis', 'arena': 'FedExForum'},
    'MIA': {'name': 'Miami Heat', 'city': 'Miami', 'arena': 'Kaseya Center'},
    'MIL': {'name': 'Milwaukee Bucks', 'city': 'Milwaukee', 'arena': 'Fiserv Forum'},
    'MIN': {'name': 'Minnesota Timberwolves', 'city': 'Minneapolis', 'arena': 'Target Center'},
    'NOP': {'name': 'New Orleans Pelicans', 'city': 'New Orleans', 'arena': 'Smoothie King Center'},
    'NYK': {'name': 'New York Knicks', 'city': 'New York', 'arena': 'Madison Square Garden'},
    'OKC': {'name': 'Oklahoma City Thunder', 'city': 'Oklahoma City', 'arena': 'Paycom Center'},
    'ORL': {'name': 'Orlando Magic', 'city': 'Orlando', 'arena': 'Kia Center'},
    'PHI': {'name': 'Philadelphia 76ers', 'city': 'Philadelphia', 'arena': 'Wells Fargo Center'},
    'PHO': {'name': 'Phoenix Suns', 'city': 'Phoenix', 'arena': 'Footprint Center'},
    'POR': {'name': 'Portland Trail Blazers', 'city': 'Portland', 'arena': 'Moda Center'},
    'SAC': {'name': 'Sacramento Kings', 'city': 'Sacramento', 'arena': 'Golden 1 Center'},
    'SAS': {'name': 'San Antonio Spurs', 'city': 'San Antonio', 'arena': 'Frost Bank Center'},
    'TOR': {'name': 'Toronto Raptors', 'city': 'Toronto', 'arena': 'Scotiabank Arena'},
    'UTA': {'name': 'Utah Jazz', 'city': 'Salt Lake City', 'arena': 'Delta Center'},
    'WAS': {'name': 'Washington Wizards', 'city': 'Washington', 'arena': 'Capital One Arena'},
}
|
|
|
|
# MLB team metadata keyed by abbreviation (Baseball-Reference-style codes:
# CHW, KCR, SDP, SFG, TBR, WSN).
MLB_TEAMS = {
    'ARI': {'name': 'Arizona Diamondbacks', 'city': 'Phoenix', 'stadium': 'Chase Field'},
    'ATL': {'name': 'Atlanta Braves', 'city': 'Atlanta', 'stadium': 'Truist Park'},
    'BAL': {'name': 'Baltimore Orioles', 'city': 'Baltimore', 'stadium': 'Oriole Park at Camden Yards'},
    'BOS': {'name': 'Boston Red Sox', 'city': 'Boston', 'stadium': 'Fenway Park'},
    'CHC': {'name': 'Chicago Cubs', 'city': 'Chicago', 'stadium': 'Wrigley Field'},
    'CHW': {'name': 'Chicago White Sox', 'city': 'Chicago', 'stadium': 'Guaranteed Rate Field'},
    'CIN': {'name': 'Cincinnati Reds', 'city': 'Cincinnati', 'stadium': 'Great American Ball Park'},
    'CLE': {'name': 'Cleveland Guardians', 'city': 'Cleveland', 'stadium': 'Progressive Field'},
    'COL': {'name': 'Colorado Rockies', 'city': 'Denver', 'stadium': 'Coors Field'},
    'DET': {'name': 'Detroit Tigers', 'city': 'Detroit', 'stadium': 'Comerica Park'},
    'HOU': {'name': 'Houston Astros', 'city': 'Houston', 'stadium': 'Minute Maid Park'},
    'KCR': {'name': 'Kansas City Royals', 'city': 'Kansas City', 'stadium': 'Kauffman Stadium'},
    'LAA': {'name': 'Los Angeles Angels', 'city': 'Anaheim', 'stadium': 'Angel Stadium'},
    'LAD': {'name': 'Los Angeles Dodgers', 'city': 'Los Angeles', 'stadium': 'Dodger Stadium'},
    'MIA': {'name': 'Miami Marlins', 'city': 'Miami', 'stadium': 'LoanDepot Park'},
    'MIL': {'name': 'Milwaukee Brewers', 'city': 'Milwaukee', 'stadium': 'American Family Field'},
    'MIN': {'name': 'Minnesota Twins', 'city': 'Minneapolis', 'stadium': 'Target Field'},
    'NYM': {'name': 'New York Mets', 'city': 'New York', 'stadium': 'Citi Field'},
    'NYY': {'name': 'New York Yankees', 'city': 'New York', 'stadium': 'Yankee Stadium'},
    # Athletics listed at their interim Sacramento home, not Oakland.
    'OAK': {'name': 'Oakland Athletics', 'city': 'Sacramento', 'stadium': 'Sutter Health Park'},
    'PHI': {'name': 'Philadelphia Phillies', 'city': 'Philadelphia', 'stadium': 'Citizens Bank Park'},
    'PIT': {'name': 'Pittsburgh Pirates', 'city': 'Pittsburgh', 'stadium': 'PNC Park'},
    'SDP': {'name': 'San Diego Padres', 'city': 'San Diego', 'stadium': 'Petco Park'},
    'SFG': {'name': 'San Francisco Giants', 'city': 'San Francisco', 'stadium': 'Oracle Park'},
    'SEA': {'name': 'Seattle Mariners', 'city': 'Seattle', 'stadium': 'T-Mobile Park'},
    'STL': {'name': 'St. Louis Cardinals', 'city': 'St. Louis', 'stadium': 'Busch Stadium'},
    'TBR': {'name': 'Tampa Bay Rays', 'city': 'St. Petersburg', 'stadium': 'Tropicana Field'},
    'TEX': {'name': 'Texas Rangers', 'city': 'Arlington', 'stadium': 'Globe Life Field'},
    'TOR': {'name': 'Toronto Blue Jays', 'city': 'Toronto', 'stadium': 'Rogers Centre'},
    'WSN': {'name': 'Washington Nationals', 'city': 'Washington', 'stadium': 'Nationals Park'},
}
|
|
|
|
# NHL team metadata keyed by abbreviation.
NHL_TEAMS = {
    'ANA': {'name': 'Anaheim Ducks', 'city': 'Anaheim', 'arena': 'Honda Center'},
    # NOTE(review): key 'ARI' (Arizona) maps to the relocated Utah franchise;
    # the key was likely kept for source compatibility -- confirm before
    # renaming it (e.g. to 'UTA'), since lookups elsewhere may use 'ARI'.
    'ARI': {'name': 'Utah Hockey Club', 'city': 'Salt Lake City', 'arena': 'Delta Center'},
    'BOS': {'name': 'Boston Bruins', 'city': 'Boston', 'arena': 'TD Garden'},
    'BUF': {'name': 'Buffalo Sabres', 'city': 'Buffalo', 'arena': 'KeyBank Center'},
    'CGY': {'name': 'Calgary Flames', 'city': 'Calgary', 'arena': 'Scotiabank Saddledome'},
    'CAR': {'name': 'Carolina Hurricanes', 'city': 'Raleigh', 'arena': 'PNC Arena'},
    'CHI': {'name': 'Chicago Blackhawks', 'city': 'Chicago', 'arena': 'United Center'},
    'COL': {'name': 'Colorado Avalanche', 'city': 'Denver', 'arena': 'Ball Arena'},
    'CBJ': {'name': 'Columbus Blue Jackets', 'city': 'Columbus', 'arena': 'Nationwide Arena'},
    'DAL': {'name': 'Dallas Stars', 'city': 'Dallas', 'arena': 'American Airlines Center'},
    'DET': {'name': 'Detroit Red Wings', 'city': 'Detroit', 'arena': 'Little Caesars Arena'},
    'EDM': {'name': 'Edmonton Oilers', 'city': 'Edmonton', 'arena': 'Rogers Place'},
    'FLA': {'name': 'Florida Panthers', 'city': 'Sunrise', 'arena': 'Amerant Bank Arena'},
    'LAK': {'name': 'Los Angeles Kings', 'city': 'Los Angeles', 'arena': 'Crypto.com Arena'},
    'MIN': {'name': 'Minnesota Wild', 'city': 'St. Paul', 'arena': 'Xcel Energy Center'},
    'MTL': {'name': 'Montreal Canadiens', 'city': 'Montreal', 'arena': 'Bell Centre'},
    'NSH': {'name': 'Nashville Predators', 'city': 'Nashville', 'arena': 'Bridgestone Arena'},
    'NJD': {'name': 'New Jersey Devils', 'city': 'Newark', 'arena': 'Prudential Center'},
    'NYI': {'name': 'New York Islanders', 'city': 'Elmont', 'arena': 'UBS Arena'},
    'NYR': {'name': 'New York Rangers', 'city': 'New York', 'arena': 'Madison Square Garden'},
    'OTT': {'name': 'Ottawa Senators', 'city': 'Ottawa', 'arena': 'Canadian Tire Centre'},
    'PHI': {'name': 'Philadelphia Flyers', 'city': 'Philadelphia', 'arena': 'Wells Fargo Center'},
    'PIT': {'name': 'Pittsburgh Penguins', 'city': 'Pittsburgh', 'arena': 'PPG Paints Arena'},
    'SJS': {'name': 'San Jose Sharks', 'city': 'San Jose', 'arena': 'SAP Center'},
    'SEA': {'name': 'Seattle Kraken', 'city': 'Seattle', 'arena': 'Climate Pledge Arena'},
    'STL': {'name': 'St. Louis Blues', 'city': 'St. Louis', 'arena': 'Enterprise Center'},
    'TBL': {'name': 'Tampa Bay Lightning', 'city': 'Tampa', 'arena': 'Amalie Arena'},
    'TOR': {'name': 'Toronto Maple Leafs', 'city': 'Toronto', 'arena': 'Scotiabank Arena'},
    'VAN': {'name': 'Vancouver Canucks', 'city': 'Vancouver', 'arena': 'Rogers Arena'},
    'VGK': {'name': 'Vegas Golden Knights', 'city': 'Las Vegas', 'arena': 'T-Mobile Arena'},
    'WSH': {'name': 'Washington Capitals', 'city': 'Washington', 'arena': 'Capital One Arena'},
    'WPG': {'name': 'Winnipeg Jets', 'city': 'Winnipeg', 'arena': 'Canada Life Centre'},
}
|
|
|
|
# WNBA team metadata keyed by abbreviation.
WNBA_TEAMS = {
    'ATL': {'name': 'Atlanta Dream', 'city': 'College Park', 'arena': 'Gateway Center Arena'},
    'CHI': {'name': 'Chicago Sky', 'city': 'Chicago', 'arena': 'Wintrust Arena'},
    'CON': {'name': 'Connecticut Sun', 'city': 'Uncasville', 'arena': 'Mohegan Sun Arena'},
    'DAL': {'name': 'Dallas Wings', 'city': 'Arlington', 'arena': 'College Park Center'},
    'GSV': {'name': 'Golden State Valkyries', 'city': 'San Francisco', 'arena': 'Chase Center'},
    'IND': {'name': 'Indiana Fever', 'city': 'Indianapolis', 'arena': 'Gainbridge Fieldhouse'},
    'LVA': {'name': 'Las Vegas Aces', 'city': 'Las Vegas', 'arena': 'Michelob Ultra Arena'},
    'LAS': {'name': 'Los Angeles Sparks', 'city': 'Los Angeles', 'arena': 'Crypto.com Arena'},
    'MIN': {'name': 'Minnesota Lynx', 'city': 'Minneapolis', 'arena': 'Target Center'},
    'NYL': {'name': 'New York Liberty', 'city': 'Brooklyn', 'arena': 'Barclays Center'},
    'PHX': {'name': 'Phoenix Mercury', 'city': 'Phoenix', 'arena': 'Footprint Center'},
    'SEA': {'name': 'Seattle Storm', 'city': 'Seattle', 'arena': 'Climate Pledge Arena'},
    'WAS': {'name': 'Washington Mystics', 'city': 'Washington', 'arena': 'Entertainment & Sports Arena'},
}
|
|
|
|
# MLS team metadata keyed by abbreviation. Note some keys are 3-4 chars
# (e.g. 'LAFC', 'RBNY') -- don't assume fixed-width codes.
MLS_TEAMS = {
    'ATL': {'name': 'Atlanta United FC', 'city': 'Atlanta', 'stadium': 'Mercedes-Benz Stadium'},
    'ATX': {'name': 'Austin FC', 'city': 'Austin', 'stadium': 'Q2 Stadium'},
    'CLT': {'name': 'Charlotte FC', 'city': 'Charlotte', 'stadium': 'Bank of America Stadium'},
    'CHI': {'name': 'Chicago Fire FC', 'city': 'Chicago', 'stadium': 'Soldier Field'},
    'CIN': {'name': 'FC Cincinnati', 'city': 'Cincinnati', 'stadium': 'TQL Stadium'},
    'COL': {'name': 'Colorado Rapids', 'city': 'Commerce City', 'stadium': 'Dick\'s Sporting Goods Park'},
    'CLB': {'name': 'Columbus Crew', 'city': 'Columbus', 'stadium': 'Lower.com Field'},
    'DAL': {'name': 'FC Dallas', 'city': 'Frisco', 'stadium': 'Toyota Stadium'},
    'DCU': {'name': 'D.C. United', 'city': 'Washington', 'stadium': 'Audi Field'},
    'HOU': {'name': 'Houston Dynamo FC', 'city': 'Houston', 'stadium': 'Shell Energy Stadium'},
    'LAG': {'name': 'LA Galaxy', 'city': 'Carson', 'stadium': 'Dignity Health Sports Park'},
    'LAFC': {'name': 'Los Angeles FC', 'city': 'Los Angeles', 'stadium': 'BMO Stadium'},
    'MIA': {'name': 'Inter Miami CF', 'city': 'Fort Lauderdale', 'stadium': 'Chase Stadium'},
    'MIN': {'name': 'Minnesota United FC', 'city': 'St. Paul', 'stadium': 'Allianz Field'},
    'MTL': {'name': 'CF Montréal', 'city': 'Montreal', 'stadium': 'Stade Saputo'},
    'NSH': {'name': 'Nashville SC', 'city': 'Nashville', 'stadium': 'Geodis Park'},
    'NER': {'name': 'New England Revolution', 'city': 'Foxborough', 'stadium': 'Gillette Stadium'},
    'NYC': {'name': 'New York City FC', 'city': 'New York', 'stadium': 'Yankee Stadium'},
    'RBNY': {'name': 'New York Red Bulls', 'city': 'Harrison', 'stadium': 'Red Bull Arena'},
    'ORL': {'name': 'Orlando City SC', 'city': 'Orlando', 'stadium': 'Inter&Co Stadium'},
    'PHI': {'name': 'Philadelphia Union', 'city': 'Chester', 'stadium': 'Subaru Park'},
    'POR': {'name': 'Portland Timbers', 'city': 'Portland', 'stadium': 'Providence Park'},
    'RSL': {'name': 'Real Salt Lake', 'city': 'Sandy', 'stadium': 'America First Field'},
    'SJE': {'name': 'San Jose Earthquakes', 'city': 'San Jose', 'stadium': 'PayPal Park'},
    'SEA': {'name': 'Seattle Sounders FC', 'city': 'Seattle', 'stadium': 'Lumen Field'},
    'SKC': {'name': 'Sporting Kansas City', 'city': 'Kansas City', 'stadium': 'Children\'s Mercy Park'},
    'STL': {'name': 'St. Louis City SC', 'city': 'St. Louis', 'stadium': 'CityPark'},
    'TOR': {'name': 'Toronto FC', 'city': 'Toronto', 'stadium': 'BMO Field'},
    'VAN': {'name': 'Vancouver Whitecaps FC', 'city': 'Vancouver', 'stadium': 'BC Place'},
    'SDG': {'name': 'San Diego FC', 'city': 'San Diego', 'stadium': 'Snapdragon Stadium'},
}
|
|
|
|
# NWSL team metadata keyed by abbreviation.
NWSL_TEAMS = {
    'ANG': {'name': 'Angel City FC', 'city': 'Los Angeles', 'stadium': 'BMO Stadium'},
    'BAY': {'name': 'Bay FC', 'city': 'San Jose', 'stadium': 'PayPal Park'},
    'CHI': {'name': 'Chicago Red Stars', 'city': 'Chicago', 'stadium': 'SeatGeek Stadium'},
    'HOU': {'name': 'Houston Dash', 'city': 'Houston', 'stadium': 'Shell Energy Stadium'},
    'KCC': {'name': 'Kansas City Current', 'city': 'Kansas City', 'stadium': 'CPKC Stadium'},
    'NJY': {'name': 'NJ/NY Gotham FC', 'city': 'Harrison', 'stadium': 'Red Bull Arena'},
    'NCC': {'name': 'North Carolina Courage', 'city': 'Cary', 'stadium': 'WakeMed Soccer Park'},
    'ORL': {'name': 'Orlando Pride', 'city': 'Orlando', 'stadium': 'Inter&Co Stadium'},
    'POR': {'name': 'Portland Thorns FC', 'city': 'Portland', 'stadium': 'Providence Park'},
    'RGN': {'name': 'Seattle Reign FC', 'city': 'Seattle', 'stadium': 'Lumen Field'},
    'SDW': {'name': 'San Diego Wave FC', 'city': 'San Diego', 'stadium': 'Snapdragon Stadium'},
    'UTA': {'name': 'Utah Royals FC', 'city': 'Sandy', 'stadium': 'America First Field'},
    'WSH': {'name': 'Washington Spirit', 'city': 'Washington', 'stadium': 'Audi Field'},
}
|
|
|
|
# NFL Teams and Stadiums, keyed by abbreviation. Note LAC/LAR share SoFi
# Stadium and NYG/NYJ share MetLife Stadium, so stadium names are not
# unique across keys.
NFL_TEAMS = {
    'ARI': {'name': 'Arizona Cardinals', 'city': 'Glendale', 'stadium': 'State Farm Stadium'},
    'ATL': {'name': 'Atlanta Falcons', 'city': 'Atlanta', 'stadium': 'Mercedes-Benz Stadium'},
    'BAL': {'name': 'Baltimore Ravens', 'city': 'Baltimore', 'stadium': 'M&T Bank Stadium'},
    'BUF': {'name': 'Buffalo Bills', 'city': 'Orchard Park', 'stadium': 'Highmark Stadium'},
    'CAR': {'name': 'Carolina Panthers', 'city': 'Charlotte', 'stadium': 'Bank of America Stadium'},
    'CHI': {'name': 'Chicago Bears', 'city': 'Chicago', 'stadium': 'Soldier Field'},
    'CIN': {'name': 'Cincinnati Bengals', 'city': 'Cincinnati', 'stadium': 'Paycor Stadium'},
    'CLE': {'name': 'Cleveland Browns', 'city': 'Cleveland', 'stadium': 'Cleveland Browns Stadium'},
    'DAL': {'name': 'Dallas Cowboys', 'city': 'Arlington', 'stadium': 'AT&T Stadium'},
    'DEN': {'name': 'Denver Broncos', 'city': 'Denver', 'stadium': 'Empower Field at Mile High'},
    'DET': {'name': 'Detroit Lions', 'city': 'Detroit', 'stadium': 'Ford Field'},
    'GB': {'name': 'Green Bay Packers', 'city': 'Green Bay', 'stadium': 'Lambeau Field'},
    'HOU': {'name': 'Houston Texans', 'city': 'Houston', 'stadium': 'NRG Stadium'},
    'IND': {'name': 'Indianapolis Colts', 'city': 'Indianapolis', 'stadium': 'Lucas Oil Stadium'},
    'JAX': {'name': 'Jacksonville Jaguars', 'city': 'Jacksonville', 'stadium': 'EverBank Stadium'},
    'KC': {'name': 'Kansas City Chiefs', 'city': 'Kansas City', 'stadium': 'GEHA Field at Arrowhead Stadium'},
    'LV': {'name': 'Las Vegas Raiders', 'city': 'Las Vegas', 'stadium': 'Allegiant Stadium'},
    'LAC': {'name': 'Los Angeles Chargers', 'city': 'Inglewood', 'stadium': 'SoFi Stadium'},
    'LAR': {'name': 'Los Angeles Rams', 'city': 'Inglewood', 'stadium': 'SoFi Stadium'},
    'MIA': {'name': 'Miami Dolphins', 'city': 'Miami Gardens', 'stadium': 'Hard Rock Stadium'},
    'MIN': {'name': 'Minnesota Vikings', 'city': 'Minneapolis', 'stadium': 'U.S. Bank Stadium'},
    'NE': {'name': 'New England Patriots', 'city': 'Foxborough', 'stadium': 'Gillette Stadium'},
    'NO': {'name': 'New Orleans Saints', 'city': 'New Orleans', 'stadium': 'Caesars Superdome'},
    'NYG': {'name': 'New York Giants', 'city': 'East Rutherford', 'stadium': 'MetLife Stadium'},
    'NYJ': {'name': 'New York Jets', 'city': 'East Rutherford', 'stadium': 'MetLife Stadium'},
    'PHI': {'name': 'Philadelphia Eagles', 'city': 'Philadelphia', 'stadium': 'Lincoln Financial Field'},
    'PIT': {'name': 'Pittsburgh Steelers', 'city': 'Pittsburgh', 'stadium': 'Acrisure Stadium'},
    'SF': {'name': 'San Francisco 49ers', 'city': 'Santa Clara', 'stadium': 'Levi\'s Stadium'},
    'SEA': {'name': 'Seattle Seahawks', 'city': 'Seattle', 'stadium': 'Lumen Field'},
    'TB': {'name': 'Tampa Bay Buccaneers', 'city': 'Tampa', 'stadium': 'Raymond James Stadium'},
    'TEN': {'name': 'Tennessee Titans', 'city': 'Nashville', 'stadium': 'Nissan Stadium'},
    'WAS': {'name': 'Washington Commanders', 'city': 'Landover', 'stadium': 'Northwest Stadium'},
}
|
|
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPERS - NBA
|
|
# =============================================================================
|
|
|
|
def scrape_nba_basketball_reference(season: int) -> list[Game]:
    """
    Scrape NBA schedule from Basketball-Reference.
    URL: https://www.basketball-reference.com/leagues/NBA_{YEAR}_games-{month}.html
    Season year is the ending year (e.g., 2025 for 2024-25 season).

    Walks the per-month schedule pages (October through June), parsing one
    Game per table row. Rows with missing or unparseable data are skipped
    individually so one bad row doesn't abort the month.
    """
    games = []
    months = ['october', 'november', 'december', 'january', 'february', 'march', 'april', 'may', 'june']

    print(f"Scraping NBA {season} from Basketball-Reference...")

    for month in months:
        url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html"
        soup = fetch_page(url, 'basketball-reference.com')

        if not soup:
            continue

        table = soup.find('table', {'id': 'schedule'})
        if not table:
            continue

        tbody = table.find('tbody')
        if not tbody:
            continue

        for row in tbody.find_all('tr'):
            # Skip the repeated header rows embedded inside tbody.
            row_classes = row.get('class')
            if row_classes and 'thead' in row_classes:
                continue

            cells = row.find_all(['td', 'th'])
            if len(cells) < 6:
                continue

            try:
                # Parse date (cell text like "Tue, Oct 22, 2024")
                date_cell = row.find('th', {'data-stat': 'date_game'})
                if not date_cell:
                    continue
                date_link = date_cell.find('a')
                date_str = date_link.text if date_link else date_cell.text

                # Parse tip-off time (Eastern, as published by the site)
                time_cell = row.find('td', {'data-stat': 'game_start_time'})
                time_str = time_cell.text.strip() if time_cell else None

                # Parse teams
                visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
                home_cell = row.find('td', {'data-stat': 'home_team_name'})

                if not visitor_cell or not home_cell:
                    continue

                visitor_link = visitor_cell.find('a')
                home_link = home_cell.find('a')

                away_team = visitor_link.text if visitor_link else visitor_cell.text
                home_team = home_link.text if home_link else home_cell.text

                # Parse arena
                arena_cell = row.find('td', {'data-stat': 'arena_name'})
                arena = arena_cell.text.strip() if arena_cell else ''

                # Convert date; was a bare `except:` before, which would also
                # swallow KeyboardInterrupt/SystemExit. strptime raises
                # ValueError on a format mismatch.
                try:
                    parsed_date = datetime.strptime(date_str.strip(), '%a, %b %d, %Y')
                    date_formatted = parsed_date.strftime('%Y-%m-%d')
                except ValueError:
                    continue

                # Generate game ID from the first 3 letters of each team name
                # (kept as-is for backward compatibility with stored data).
                game_id = f"nba_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

                game = Game(
                    id=game_id,
                    sport='NBA',
                    season=f"{season-1}-{str(season)[2:]}",  # e.g. "2024-25"
                    date=date_formatted,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=get_team_abbrev(home_team, 'NBA'),
                    away_team_abbrev=get_team_abbrev(away_team, 'NBA'),
                    venue=arena,
                    source='basketball-reference.com'
                )
                games.append(game)

            except Exception as e:
                # Best-effort: report and move on to the next row.
                print(f"  Error parsing row: {e}")
                continue

    print(f"  Found {len(games)} games from Basketball-Reference")
    return games
|
|
|
|
|
|
def scrape_nba_espn(season: int) -> list[Game]:
    """
    Scrape NBA schedule from ESPN.
    URL: https://www.espn.com/nba/schedule/_/date/{YYYYMMDD}

    NOTE(review): unfinished placeholder. It fetches weekly schedule pages
    across the season window but never parses them (ESPN renders the
    schedule client-side with JavaScript), so it always returns an empty
    list while still consuming rate-limited requests. A real implementation
    would need Selenium or ESPN's JSON API.
    """
    games = []
    print(f"Scraping NBA {season} from ESPN...")

    # Determine date range for season (NBA runs Oct of the prior calendar
    # year through Jun of the season year).
    start_date = datetime(season - 1, 10, 1)  # October of previous year
    end_date = datetime(season, 6, 30)  # June of season year

    current_date = start_date
    while current_date <= end_date:
        date_str = current_date.strftime('%Y%m%d')
        url = f"https://www.espn.com/nba/schedule/_/date/{date_str}"

        soup = fetch_page(url, 'espn.com')
        if soup:
            # ESPN uses JavaScript rendering, so we need to parse what's available
            # This is a simplified version - full implementation would need Selenium
            pass

        current_date += timedelta(days=7)  # Sample weekly to respect rate limits

    print(f"  Found {len(games)} games from ESPN")
    return games
|
|
|
|
|
|
def scrape_nba_cbssports(season: int) -> list[Game]:
    """
    Fetch NBA schedule from CBS Sports.
    CBS Sports provides a JSON API for schedule data.

    NOTE(review): only the single current schedule page is fetched, and
    every parsed game is stamped with *today's* date (see the placeholder
    below), so Game.date is not the real game date. The per-section date
    headers on the page need to be parsed before this source is reliable.
    """
    games = []
    print(f"Fetching NBA {season} from CBS Sports...")

    # CBS Sports has a schedule endpoint
    url = "https://www.cbssports.com/nba/schedule/"

    soup = fetch_page(url, 'cbssports.com')
    if not soup:
        return games

    # Find all game rows
    tables = soup.find_all('table', class_='TableBase-table')

    for table in tables:
        rows = table.find_all('tr')
        for row in rows:
            try:
                cells = row.find_all('td')
                if len(cells) < 2:
                    continue

                # Parse teams from row; away team is listed first on CBS.
                team_cells = row.find_all('a', class_='TeamName')
                if len(team_cells) < 2:
                    continue

                away_team = team_cells[0].get_text(strip=True)
                home_team = team_cells[1].get_text(strip=True)

                # Get date from table section
                # FIXME: uses the scrape date, not the game date.
                date_formatted = datetime.now().strftime('%Y-%m-%d')  # Placeholder

                game_id = f"nba_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

                game = Game(
                    id=game_id,
                    sport='NBA',
                    season=str(season),
                    date=date_formatted,
                    time=None,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=get_team_abbrev(home_team, 'NBA'),
                    away_team_abbrev=get_team_abbrev(away_team, 'NBA'),
                    venue='',
                    source='cbssports.com'
                )
                games.append(game)

            except Exception:
                # Best-effort: skip rows that don't match the expected markup.
                continue

    print(f"  Found {len(games)} games from CBS Sports")
    return games
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPERS - MLB
|
|
# =============================================================================
|
|
|
|
def scrape_mlb_baseball_reference(season: int) -> list[Game]:
    """
    Scrape MLB schedule from Baseball-Reference.
    URL: https://www.baseball-reference.com/leagues/majors/{YEAR}-schedule.shtml

    The page groups games under per-day <h3> date headers, so we walk the
    document in order and keep track of the most recent date header seen;
    game paragraphs before the first header are skipped (no date context).
    Games scraped here carry no time or venue.
    """
    games = []
    url = f"https://www.baseball-reference.com/leagues/majors/{season}-schedule.shtml"

    print(f"Scraping MLB {season} from Baseball-Reference...")
    soup = fetch_page(url, 'baseball-reference.com')

    if not soup:
        return games

    # Baseball-Reference groups games by date in h3 headers
    current_date = None

    # Find the schedule section; fall back to the whole page if the
    # wrapper div is missing.
    schedule_div = soup.find('div', {'id': 'all_schedule'})
    if not schedule_div:
        schedule_div = soup

    # Process all elements in document order to track date context
    for element in schedule_div.find_all(['h3', 'p', 'div']):
        # Check for date header
        if element.name == 'h3':
            date_text = element.get_text(strip=True)
            # Parse date like "Thursday, March 27, 2025". Try the known
            # header formats; strptime raises ValueError on mismatch
            # (previously hidden behind bare `except:` clauses).
            for fmt in ['%A, %B %d, %Y', '%B %d, %Y', '%a, %b %d, %Y']:
                try:
                    current_date = datetime.strptime(date_text, fmt).strftime('%Y-%m-%d')
                    break
                except ValueError:
                    continue

        # Check for game entries
        elif element.name == 'p' and 'game' in element.get('class', []):
            if not current_date:
                continue

            try:
                links = element.find_all('a')
                if len(links) >= 2:
                    away_team = links[0].text.strip()
                    home_team = links[1].text.strip()

                    # Generate unique game ID
                    away_abbrev = get_team_abbrev(away_team, 'MLB')
                    home_abbrev = get_team_abbrev(home_team, 'MLB')
                    game_id = f"mlb_br_{current_date}_{away_abbrev}_{home_abbrev}".lower()

                    game = Game(
                        id=game_id,
                        sport='MLB',
                        season=str(season),
                        date=current_date,
                        time=None,
                        home_team=home_team,
                        away_team=away_team,
                        home_team_abbrev=home_abbrev,
                        away_team_abbrev=away_abbrev,
                        venue='',
                        source='baseball-reference.com'
                    )
                    games.append(game)

            except Exception:
                # Best-effort: skip malformed game paragraphs.
                continue

    print(f"  Found {len(games)} games from Baseball-Reference")
    return games
|
|
|
|
|
|
def scrape_mlb_statsapi(season: int) -> list[Game]:
    """
    Fetch the MLB regular-season schedule from the official Stats API (JSON).
    URL: https://statsapi.mlb.com/api/v1/schedule?sportId=1&season={YEAR}&gameType=R

    Returns Game records with an empty id (stable ids are assigned later)
    and source='statsapi.mlb.com'. Returns an empty list on request failure.
    """
    games = []
    url = f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&season={season}&gameType=R&hydrate=team,venue"

    print(f"Fetching MLB {season} from Stats API...")

    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        payload = resp.json()

        # Payload is grouped by date; each date carries its list of games.
        for day in payload.get('dates', []):
            day_date = day.get('date', '')

            for entry in day.get('games', []):
                try:
                    matchup = entry.get('teams', {})
                    away_info = matchup.get('away', {}).get('team', {})
                    home_info = matchup.get('home', {}).get('team', {})
                    venue_info = entry.get('venue', {})

                    # 'gameDate' is an ISO timestamp; keep only HH:MM when present.
                    stamp = entry.get('gameDate', '')
                    time_str = stamp.split('T')[1][:5] if 'T' in stamp else None

                    games.append(Game(
                        id='',  # Will be assigned by assign_stable_ids
                        sport='MLB',
                        season=str(season),
                        date=day_date,
                        time=time_str,
                        home_team=home_info.get('name', ''),
                        away_team=away_info.get('name', ''),
                        home_team_abbrev=home_info.get('abbreviation', ''),
                        away_team_abbrev=away_info.get('abbreviation', ''),
                        venue=venue_info.get('name', ''),
                        source='statsapi.mlb.com'
                    ))

                except Exception:
                    # Skip malformed game entries rather than aborting the run.
                    continue

    except Exception as e:
        print(f" Error fetching MLB API: {e}")

    print(f" Found {len(games)} games from MLB Stats API")
    return games
|
|
|
|
|
|
def scrape_mlb_espn(season: int) -> list[Game]:
    """
    Fetch the MLB schedule from the ESPN scoreboard API.

    Delegates to the shared scrape_espn_schedule() helper (the same parser
    used for WNBA/MLS/NWSL) instead of duplicating its event-parsing loop;
    output is identical: Game records with sport='MLB', source='espn.com',
    and ids of the form 'mlb_{date}_{away}_{home}'.

    Args:
        season: Calendar year of the MLB season (the regular season runs
            late March through early October of the same year).

    Returns:
        List of Game records; empty on request failure.
    """
    # MLB regular season: Late March - Early October
    start = f"{season}0320"
    end = f"{season}1010"
    return scrape_espn_schedule('baseball', 'mlb', season, (start, end))
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPERS - NHL
|
|
# =============================================================================
|
|
|
|
def scrape_nhl_hockey_reference(season: int) -> list[Game]:
    """
    Scrape the NHL schedule from Hockey-Reference.
    URL: https://www.hockey-reference.com/leagues/NHL_{YEAR}_games.html

    Returns Game records with source='hockey-reference.com' (no times or
    venues — the games table does not expose them here).
    """
    games = []
    url = f"https://www.hockey-reference.com/leagues/NHL_{season}_games.html"

    print(f"Scraping NHL {season} from Hockey-Reference...")
    soup = fetch_page(url, 'hockey-reference.com')

    if not soup:
        return games

    table = soup.find('table', {'id': 'games'})
    if not table:
        print(" Could not find games table")
        return games

    tbody = table.find('tbody')
    if not tbody:
        return games

    for row in tbody.find_all('tr'):
        try:
            if len(row.find_all(['td', 'th'])) < 5:
                continue

            # Date lives in a <th data-stat="date_game">, usually wrapped in a link.
            date_cell = row.find('th', {'data-stat': 'date_game'})
            if not date_cell:
                continue
            anchor = date_cell.find('a')
            raw_date = anchor.text if anchor else date_cell.text

            visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
            home_cell = row.find('td', {'data-stat': 'home_team_name'})
            if not visitor_cell or not home_cell:
                continue

            v_link = visitor_cell.find('a')
            h_link = home_cell.find('a')
            away_team = v_link.text if v_link else visitor_cell.text
            home_team = h_link.text if h_link else home_cell.text

            # Normalize the date; skip rows whose date cell is not YYYY-MM-DD.
            try:
                date_formatted = datetime.strptime(raw_date.strip(), '%Y-%m-%d').strftime('%Y-%m-%d')
            except Exception:
                continue

            game_id = f"nhl_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

            games.append(Game(
                id=game_id,
                sport='NHL',
                # Hockey-Reference keys seasons by ending year; label as e.g. "2025-26".
                season=f"{season-1}-{str(season)[2:]}",
                date=date_formatted,
                time=None,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=get_team_abbrev(home_team, 'NHL'),
                away_team_abbrev=get_team_abbrev(away_team, 'NHL'),
                venue='',
                source='hockey-reference.com'
            ))

        except Exception:
            continue

    print(f" Found {len(games)} games from Hockey-Reference")
    return games
|
|
|
|
|
|
def scrape_nhl_api(season: int) -> list[Game]:
    """
    Fetch the NHL schedule from the official API (JSON).
    URL: https://api-web.nhle.com/v1/schedule/{YYYY-MM-DD}

    NOTE: currently a stub. The NHL API exposes per-club / per-date
    schedules, so a full implementation would need to iterate over dates
    or teams; for now this always returns an empty list.
    """
    print(f"Fetching NHL {season} from NHL API...")
    return []
|
|
|
|
|
|
def scrape_nhl_espn(season: int) -> list[Game]:
    """
    Fetch the NHL schedule from the ESPN scoreboard API.

    Delegates to the shared scrape_espn_schedule() helper (the same parser
    used for WNBA/MLS/NWSL) instead of duplicating its event-parsing loop;
    output is identical: Game records with sport='NHL', source='espn.com',
    and ids of the form 'nhl_{date}_{away}_{home}'.

    Args:
        season: Ending year of the NHL season (e.g. 2026 for 2025-26);
            the regular season spans October through April.

    Returns:
        List of Game records; empty on request failure.
    """
    # NHL regular season: October - April (spans calendar years)
    start = f"{season-1}1001"
    end = f"{season}0430"
    return scrape_espn_schedule('hockey', 'nhl', season, (start, end))
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPERS - ESPN API (WNBA, MLS, NWSL)
|
|
# =============================================================================
|
|
|
|
def scrape_espn_schedule(sport: str, league: str, season: int, date_range: tuple[str, str]) -> list[Game]:
    """
    Fetch schedule from ESPN API.

    Shared helper behind the per-league ESPN wrappers: requests the
    scoreboard endpoint for the given date window and parses each event
    in the JSON payload into a Game record.

    Args:
        sport: 'basketball' or 'soccer'
        league: 'wnba', 'usa.1' (MLS), 'usa.nwsl' (NWSL)
        season: Season year
        date_range: (start_date, end_date) in YYYYMMDD format

    Returns:
        List of Game records with source='espn.com'; empty on request failure.
    """
    games = []
    # Map ESPN league slugs to the app's sport labels; unknown slugs fall
    # back to the uppercased slug itself (e.g. 'mlb' -> 'MLB').
    sport_upper = {
        'wnba': 'WNBA',
        'usa.1': 'MLS',
        'usa.nwsl': 'NWSL',
        'nfl': 'NFL',
        'mens-college-basketball': 'CBB'
    }.get(league, league.upper())

    print(f"Fetching {sport_upper} {season} from ESPN API...")

    url = f"https://site.api.espn.com/apis/site/v2/sports/{sport}/{league}/scoreboard"
    params = {
        'dates': f"{date_range[0]}-{date_range[1]}",
        'limit': 1000
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()

        events = data.get('events', [])

        for event in events:
            try:
                # Parse date/time from the event's ISO timestamp
                # (first 10 chars = YYYY-MM-DD, chars 11-16 = HH:MM).
                date_str = event.get('date', '')[:10]  # YYYY-MM-DD
                time_str = event.get('date', '')[11:16] if len(event.get('date', '')) > 11 else None

                # Get teams from the first competition of the event.
                competitions = event.get('competitions', [{}])
                if not competitions:
                    continue

                comp = competitions[0]
                competitors = comp.get('competitors', [])

                if len(competitors) < 2:
                    continue

                home_team = None
                away_team = None
                home_abbrev = None
                away_abbrev = None

                # Each competitor is tagged 'home' or 'away' via 'homeAway'.
                for team in competitors:
                    team_data = team.get('team', {})
                    team_name = team_data.get('displayName', team_data.get('name', ''))
                    team_abbrev = team_data.get('abbreviation', '')

                    if team.get('homeAway') == 'home':
                        home_team = team_name
                        home_abbrev = team_abbrev
                    else:
                        away_team = team_name
                        away_abbrev = team_abbrev

                if not home_team or not away_team:
                    continue

                # Get venue
                venue = comp.get('venue', {}).get('fullName', '')

                # Deterministic per-source id: '{sport}_{date}_{away}_{home}'.
                game_id = f"{sport_upper.lower()}_{date_str}_{away_abbrev}_{home_abbrev}".lower()

                game = Game(
                    id=game_id,
                    sport=sport_upper,
                    season=str(season),
                    date=date_str,
                    time=time_str,
                    home_team=home_team,
                    away_team=away_team,
                    # Fall back to the local abbreviation table when ESPN
                    # omits one.
                    home_team_abbrev=home_abbrev or get_team_abbrev(home_team, sport_upper),
                    away_team_abbrev=away_abbrev or get_team_abbrev(away_team, sport_upper),
                    venue=venue,
                    source='espn.com'
                )
                games.append(game)

            except Exception as e:
                # Malformed events are skipped; the rest of the payload
                # is still processed.
                continue

        print(f" Found {len(games)} games from ESPN")

    except Exception as e:
        print(f"Error fetching ESPN {sport_upper}: {e}")

    return games
|
|
|
|
|
|
def scrape_wnba_espn(season: int) -> list[Game]:
    """Fetch the WNBA schedule from the ESPN API (season window: May-October)."""
    window = (f"{season}0501", f"{season}1031")
    return scrape_espn_schedule('basketball', 'wnba', season, window)
|
|
|
|
|
|
def scrape_mls_espn(season: int) -> list[Game]:
    """Fetch the MLS schedule from the ESPN API (season window: February-December)."""
    window = (f"{season}0201", f"{season}1231")
    return scrape_espn_schedule('soccer', 'usa.1', season, window)
|
|
|
|
|
|
def scrape_nwsl_espn(season: int) -> list[Game]:
    """Fetch the NWSL schedule from the ESPN API (season window: March-November)."""
    window = (f"{season}0301", f"{season}1130")
    return scrape_espn_schedule('soccer', 'usa.nwsl', season, window)
|
|
|
|
|
|
def scrape_nfl_espn(season: int) -> list[Game]:
    """
    Fetch the NFL schedule from the ESPN API.

    The season spans calendar years (September through February), so the
    window starts in the prior year.
    """
    window = (f"{season-1}0901", f"{season}0228")
    return scrape_espn_schedule('football', 'nfl', season, window)
|
|
|
|
|
|
def scrape_nfl_pro_football_reference(season: int) -> list[Game]:
    """
    Scrape NFL schedule from Pro-Football-Reference.
    URL: https://www.pro-football-reference.com/years/{YEAR}/games.htm

    Here `season` is the ENDING year of the NFL season (e.g. 2026 for the
    2025-26 season), consistent with scrape_nfl_espn(); PFR pages are keyed
    by the STARTING year, hence the `season - 1` below.

    NOTE(review): rows are read via the 'winner'/'loser' data-stat columns,
    with home/away inferred from '@' in 'game_location' — presumably this
    matches completed-season pages; verify against a future-season page
    before relying on it for upcoming schedules.
    """
    games = []
    year = season - 1  # PFR uses starting year
    url = f"https://www.pro-football-reference.com/years/{year}/games.htm"

    print(f"Scraping NFL {season} from Pro-Football-Reference...")
    soup = fetch_page(url, 'pro-football-reference.com')

    if not soup:
        return games

    table = soup.find('table', {'id': 'games'})
    if not table:
        print(" Could not find games table")
        return games

    tbody = table.find('tbody')
    if not tbody:
        return games

    for row in tbody.find_all('tr'):
        # Skip repeated in-body header rows.
        if row.get('class') and 'thead' in row.get('class'):
            continue

        try:
            # Parse date
            date_cell = row.find('td', {'data-stat': 'game_date'})
            if not date_cell:
                continue
            date_str = date_cell.text.strip()

            # Parse teams
            winner_cell = row.find('td', {'data-stat': 'winner'})
            loser_cell = row.find('td', {'data-stat': 'loser'})
            home_cell = row.find('td', {'data-stat': 'game_location'})

            if not winner_cell or not loser_cell:
                continue

            winner_link = winner_cell.find('a')
            loser_link = loser_cell.find('a')

            winner = winner_link.text if winner_link else winner_cell.text.strip()
            loser = loser_link.text if loser_link else loser_cell.text.strip()

            # Determine home/away - '@' in game_location means winner was away
            is_at_loser = home_cell and '@' in home_cell.text
            if is_at_loser:
                home_team, away_team = loser, winner
            else:
                home_team, away_team = winner, loser

            # Convert date (e.g., "September 7" or "2025-09-07")
            try:
                if '-' in date_str:
                    parsed_date = datetime.strptime(date_str, '%Y-%m-%d')
                else:
                    # Month-name dates lack a year: January/February games
                    # belong to the following calendar year.
                    month_str = date_str.split()[0]
                    if month_str in ['January', 'February']:
                        date_with_year = f"{date_str}, {year + 1}"
                    else:
                        date_with_year = f"{date_str}, {year}"
                    parsed_date = datetime.strptime(date_with_year, '%B %d, %Y')
                date_formatted = parsed_date.strftime('%Y-%m-%d')
            except:
                continue

            # Deterministic per-source id built from date and 3-letter
            # name prefixes.
            game_id = f"nfl_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

            game = Game(
                id=game_id,
                sport='NFL',
                season=str(season),
                date=date_formatted,
                time=None,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=get_team_abbrev(home_team, 'NFL'),
                away_team_abbrev=get_team_abbrev(away_team, 'NFL'),
                venue='',
                source='pro-football-reference.com'
            )
            games.append(game)

        except Exception:
            continue

    print(f" Found {len(games)} games from Pro-Football-Reference")
    return games
|
|
|
|
|
|
def scrape_nfl_cbssports(season: int) -> list[Game]:
    """
    Scrape NFL schedule from CBS Sports API.
    Provides more structured data than web scraping.

    LIMITATION: game dates are not parsed from the page — every record is
    stamped with today's date (see the "Placeholder" line below), so this
    source is only useful for matchup cross-validation, not scheduling.
    """
    games = []
    year = season - 1  # CBS uses starting year
    print(f"Fetching NFL {season} from CBS Sports...")

    # CBS Sports schedule endpoint
    url = f"https://www.cbssports.com/nfl/schedule/{year}/regular/"

    soup = fetch_page(url, 'cbssports.com')
    if not soup:
        return games

    # Find game tables
    tables = soup.find_all('table', class_='TableBase-table')

    for table in tables:
        rows = table.find_all('tr')
        for row in rows:
            try:
                cells = row.find_all('td')
                if len(cells) < 3:
                    continue

                # Parse matchup: first cell = away team, second = home team.
                away_cell = cells[0] if len(cells) > 0 else None
                home_cell = cells[1] if len(cells) > 1 else None

                if not away_cell or not home_cell:
                    continue

                away_team = away_cell.get_text(strip=True)
                home_team = home_cell.get_text(strip=True)

                if not away_team or not home_team:
                    continue

                # CBS includes @ symbol
                away_team = away_team.replace('@', '').strip()

                # Get date from parent section if available
                date_formatted = datetime.now().strftime('%Y-%m-%d')  # Placeholder

                game_id = f"nfl_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

                game = Game(
                    id=game_id,
                    sport='NFL',
                    season=str(season),
                    date=date_formatted,
                    time=None,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=get_team_abbrev(home_team, 'NFL'),
                    away_team_abbrev=get_team_abbrev(away_team, 'NFL'),
                    venue='',
                    source='cbssports.com'
                )
                games.append(game)

            except Exception:
                continue

    print(f" Found {len(games)} games from CBS Sports")
    return games
|
|
|
|
|
|
def scrape_cbb_espn(season: int) -> list[Game]:
    """
    Fetch the College Basketball schedule from the ESPN API (D1 only).

    The season spans calendar years (November through mid-April), so the
    window starts in the prior year.
    """
    window = (f"{season-1}1101", f"{season}0415")
    return scrape_espn_schedule('basketball', 'mens-college-basketball', season, window)
|
|
|
|
|
|
def scrape_cbb_sports_reference(season: int) -> list[Game]:
    """
    Scrape College Basketball schedule from Sports-Reference.
    URL: https://www.sports-reference.com/cbb/seasons/{YEAR}-schedule.html

    Abbreviations are derived as the first three letters of each school
    name (uppercased) since CBB teams are not in the abbreviation table.

    Returns:
        List of Game records with source='sports-reference.com';
        empty on fetch failure or if the schedule table is missing.
    """
    games = []
    url = f"https://www.sports-reference.com/cbb/seasons/{season}-schedule.html"

    print(f"Scraping CBB {season} from Sports-Reference...")
    soup = fetch_page(url, 'sports-reference.com')

    if not soup:
        return games

    table = soup.find('table', {'id': 'schedule'})
    if not table:
        print(" Could not find schedule table")
        return games

    tbody = table.find('tbody')
    if not tbody:
        return games

    for row in tbody.find_all('tr'):
        # Skip repeated in-body header rows.
        if row.get('class') and 'thead' in row.get('class'):
            continue

        try:
            date_cell = row.find('td', {'data-stat': 'date_game'})
            if not date_cell:
                continue
            date_str = date_cell.text.strip()

            home_cell = row.find('td', {'data-stat': 'home_team_name'})
            away_cell = row.find('td', {'data-stat': 'away_team_name'})

            if not home_cell or not away_cell:
                continue

            home_team = home_cell.get_text(strip=True)
            away_team = away_cell.get_text(strip=True)

            # Dates appear as e.g. "Nov 4, 2025"; skip unparseable rows.
            try:
                parsed_date = datetime.strptime(date_str, '%b %d, %Y')
                date_formatted = parsed_date.strftime('%Y-%m-%d')
            except:
                continue

            game_id = f"cbb_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

            game = Game(
                id=game_id,
                sport='CBB',
                season=str(season),
                date=date_formatted,
                time=None,
                home_team=home_team,
                away_team=away_team,
                # BUG FIX: these were previously swapped (home abbrev was
                # derived from the away team and vice versa).
                home_team_abbrev=home_team[:3].upper(),
                away_team_abbrev=away_team[:3].upper(),
                venue='',
                source='sports-reference.com'
            )
            games.append(game)

        except Exception:
            continue

    print(f" Found {len(games)} games from Sports-Reference")
    return games
|
|
|
|
|
|
def scrape_cbb_cbssports(season: int) -> list[Game]:
    """
    Fetch College Basketball schedule from CBS Sports.

    LIMITATION: game dates are not parsed from the page — every record is
    stamped with today's date, so this source is only useful for matchup
    cross-validation. Abbreviations are the first three letters of each
    school name, uppercased.
    """
    games = []
    print(f"Fetching CBB {season} from CBS Sports...")

    url = "https://www.cbssports.com/college-basketball/schedule/"

    soup = fetch_page(url, 'cbssports.com')
    if not soup:
        return games

    tables = soup.find_all('table', class_='TableBase-table')

    for table in tables:
        rows = table.find_all('tr')
        for row in rows:
            try:
                cells = row.find_all('td')
                if len(cells) < 2:
                    continue

                # Team names are in <a class="TeamName"> anchors: away first.
                team_cells = row.find_all('a', class_='TeamName')
                if len(team_cells) < 2:
                    continue

                away_team = team_cells[0].get_text(strip=True)
                home_team = team_cells[1].get_text(strip=True)

                # Placeholder date (page layout does not carry per-row dates).
                date_formatted = datetime.now().strftime('%Y-%m-%d')

                game_id = f"cbb_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

                game = Game(
                    id=game_id,
                    sport='CBB',
                    season=str(season),
                    date=date_formatted,
                    time=None,
                    home_team=home_team,
                    away_team=away_team,
                    # BUG FIX: these were previously swapped (home abbrev was
                    # derived from the away team and vice versa).
                    home_team_abbrev=home_team[:3].upper(),
                    away_team_abbrev=away_team[:3].upper(),
                    venue='',
                    source='cbssports.com'
                )
                games.append(game)

            except Exception:
                continue

    print(f" Found {len(games)} games from CBS Sports")
    return games
|
|
|
|
|
|
def scrape_wnba_cbssports(season: int) -> list[Game]:
    """
    Fetch the WNBA schedule from CBS Sports.

    Team names come from <a class="TeamName"> anchors (away listed first);
    dates are stamped with today's date since the static page layout does
    not carry per-row dates.
    """
    games = []
    print(f"Fetching WNBA {season} from CBS Sports...")

    soup = fetch_page("https://www.cbssports.com/wnba/schedule/", 'cbssports.com')
    if not soup:
        return games

    for schedule_table in soup.find_all('table', class_='TableBase-table'):
        for row in schedule_table.find_all('tr'):
            try:
                if len(row.find_all('td')) < 2:
                    continue

                anchors = row.find_all('a', class_='TeamName')
                if len(anchors) < 2:
                    continue

                away_team = anchors[0].get_text(strip=True)
                home_team = anchors[1].get_text(strip=True)

                today = datetime.now().strftime('%Y-%m-%d')
                game_id = f"wnba_{today}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

                games.append(Game(
                    id=game_id,
                    sport='WNBA',
                    season=str(season),
                    date=today,
                    time=None,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=get_team_abbrev(home_team, 'WNBA'),
                    away_team_abbrev=get_team_abbrev(away_team, 'WNBA'),
                    venue='',
                    source='cbssports.com'
                ))

            except Exception:
                continue

    print(f" Found {len(games)} games from CBS Sports")
    return games
|
|
|
|
|
|
def scrape_mls_mlssoccer(season: int) -> list[Game]:
    """
    Fetch the MLS schedule from the official MLSSoccer.com site.

    The schedule page is typically rendered via JavaScript; this is a
    best-effort fallback parser over any static <table> content. Dates are
    stamped with today's date (not parsed from the page).
    """
    games = []
    print(f"Fetching MLS {season} from MLSSoccer.com...")

    soup = fetch_page(f"https://www.mlssoccer.com/schedule/{season}", 'mlssoccer.com')
    if not soup:
        return games

    for schedule_table in soup.find_all('table'):
        for row in schedule_table.find_all('tr'):
            try:
                cols = row.find_all('td')
                if len(cols) < 2:
                    continue

                away_team = cols[0].get_text(strip=True)
                home_team = cols[1].get_text(strip=True)
                if not away_team or not home_team:
                    continue

                today = datetime.now().strftime('%Y-%m-%d')
                game_id = f"mls_{today}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

                games.append(Game(
                    id=game_id,
                    sport='MLS',
                    season=str(season),
                    date=today,
                    time=None,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=get_team_abbrev(home_team, 'MLS'),
                    away_team_abbrev=get_team_abbrev(away_team, 'MLS'),
                    venue='',
                    source='mlssoccer.com'
                ))

            except Exception:
                continue

    print(f" Found {len(games)} games from MLSSoccer.com")
    return games
|
|
|
|
|
|
def scrape_nwsl_nwslsoccer(season: int) -> list[Game]:
    """
    Fetch NWSL schedule from official NWSL site.

    Best-effort parser over any static <table> content on the schedule
    page. LIMITATION: dates are stamped with today's date rather than
    parsed from the page, so this source is only useful for matchup
    cross-validation.
    """
    games = []
    print(f"Fetching NWSL {season} from NWSL.com...")

    url = f"https://www.nwslsoccer.com/schedule/{season}"

    soup = fetch_page(url, 'nwslsoccer.com')
    if not soup:
        return games

    tables = soup.find_all('table')

    for table in tables:
        rows = table.find_all('tr')
        for row in rows:
            try:
                cells = row.find_all('td')
                if len(cells) < 2:
                    continue

                # First cell = away team, second = home team.
                away_team = cells[0].get_text(strip=True) if cells else ''
                home_team = cells[1].get_text(strip=True) if len(cells) > 1 else ''

                if not away_team or not home_team:
                    continue

                # Placeholder date (not available in the static markup).
                date_formatted = datetime.now().strftime('%Y-%m-%d')

                game_id = f"nwsl_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

                game = Game(
                    id=game_id,
                    sport='NWSL',
                    season=str(season),
                    date=date_formatted,
                    time=None,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=get_team_abbrev(home_team, 'NWSL'),
                    away_team_abbrev=get_team_abbrev(away_team, 'NWSL'),
                    venue='',
                    source='nwslsoccer.com'
                )
                games.append(game)

            except Exception:
                continue

    print(f" Found {len(games)} games from NWSL.com")
    return games
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPERS - WNBA (Basketball-Reference fallback)
|
|
# =============================================================================
|
|
|
|
def scrape_wnba_basketball_reference(season: int) -> list[Game]:
    """
    Scrape WNBA schedule from Basketball-Reference.
    URL: https://www.basketball-reference.com/wnba/years/{YEAR}_games.html

    Parses the 'schedule' table row by row (date, tip-off time, teams,
    arena). Returns Game records with source='basketball-reference.com';
    empty on fetch failure or if the table is missing.
    """
    games = []
    url = f"https://www.basketball-reference.com/wnba/years/{season}_games.html"

    print(f"Scraping WNBA {season} from Basketball-Reference...")
    soup = fetch_page(url, 'basketball-reference.com')

    if not soup:
        return games

    table = soup.find('table', {'id': 'schedule'})
    if not table:
        print(" Could not find schedule table")
        return games

    tbody = table.find('tbody')
    if not tbody:
        return games

    for row in tbody.find_all('tr'):
        # Skip repeated in-body header rows.
        if row.get('class') and 'thead' in row.get('class'):
            continue

        cells = row.find_all(['td', 'th'])
        if len(cells) < 6:
            continue

        try:
            # Parse date (a <th data-stat="date_game">, usually a link).
            date_cell = row.find('th', {'data-stat': 'date_game'})
            if not date_cell:
                continue
            date_link = date_cell.find('a')
            date_str = date_link.text if date_link else date_cell.text

            # Parse time
            time_cell = row.find('td', {'data-stat': 'game_start_time'})
            time_str = time_cell.text.strip() if time_cell else None

            # Parse teams
            visitor_cell = row.find('td', {'data-stat': 'visitor_team_name'})
            home_cell = row.find('td', {'data-stat': 'home_team_name'})

            if not visitor_cell or not home_cell:
                continue

            visitor_link = visitor_cell.find('a')
            home_link = home_cell.find('a')

            away_team = visitor_link.text if visitor_link else visitor_cell.text
            home_team = home_link.text if home_link else home_cell.text

            # Parse arena
            arena_cell = row.find('td', {'data-stat': 'arena_name'})
            arena = arena_cell.text.strip() if arena_cell else ''

            # Convert date (format like "Fri, May 16, 2025"); skip
            # rows that do not parse.
            try:
                parsed_date = datetime.strptime(date_str.strip(), '%a, %b %d, %Y')
                date_formatted = parsed_date.strftime('%Y-%m-%d')
            except:
                continue

            game_id = f"wnba_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

            game = Game(
                id=game_id,
                sport='WNBA',
                season=str(season),
                date=date_formatted,
                time=time_str,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=get_team_abbrev(home_team, 'WNBA'),
                away_team_abbrev=get_team_abbrev(away_team, 'WNBA'),
                venue=arena,
                source='basketball-reference.com'
            )
            games.append(game)

        except Exception as e:
            continue

    print(f" Found {len(games)} games from Basketball-Reference")
    return games
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPERS - MLS
|
|
# =============================================================================
|
|
|
|
def scrape_mls_fbref(season: int) -> list[Game]:
    """
    Scrape the MLS schedule from FBref.
    URL: https://fbref.com/en/comps/22/{YEAR}/schedule/{YEAR}-Major-League-Soccer-Scores-and-Fixtures

    Reads the fixtures table (date, kickoff time, home/away teams, venue)
    and returns Game records with source='fbref.com'.
    """
    games = []
    url = f"https://fbref.com/en/comps/22/{season}/schedule/{season}-Major-League-Soccer-Scores-and-Fixtures"

    print(f"Scraping MLS {season} from FBref...")
    soup = fetch_page(url, 'fbref.com')

    if not soup:
        return games

    # The table id varies by season; try the fixed id first, then any
    # id starting with "sched".
    table = soup.find('table', {'id': 'sched_all'}) or soup.find('table', {'id': re.compile(r'sched.*')})
    if not table:
        print(" Could not find schedule table")
        return games

    tbody = table.find('tbody')
    if not tbody:
        return games

    for row in tbody.find_all('tr'):
        # FBref inserts blank spacer rows between match weeks.
        if row.get('class') and 'spacer' in row.get('class'):
            continue

        try:
            date_cell = row.find('td', {'data-stat': 'date'})
            if not date_cell:
                continue
            raw_date = date_cell.text.strip()

            time_cell = row.find('td', {'data-stat': 'time'})
            kickoff = time_cell.text.strip() if time_cell else None

            home_cell = row.find('td', {'data-stat': 'home_team'})
            away_cell = row.find('td', {'data-stat': 'away_team'})
            if not home_cell or not away_cell:
                continue

            home_team = home_cell.text.strip()
            away_team = away_cell.text.strip()

            venue_cell = row.find('td', {'data-stat': 'venue'})
            venue = venue_cell.text.strip() if venue_cell else ''

            # Normalize the date; skip rows that are not YYYY-MM-DD.
            try:
                date_formatted = datetime.strptime(raw_date, '%Y-%m-%d').strftime('%Y-%m-%d')
            except Exception:
                continue

            game_id = f"mls_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

            games.append(Game(
                id=game_id,
                sport='MLS',
                season=str(season),
                date=date_formatted,
                time=kickoff,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=get_team_abbrev(home_team, 'MLS'),
                away_team_abbrev=get_team_abbrev(away_team, 'MLS'),
                venue=venue,
                source='fbref.com'
            ))

        except Exception:
            continue

    print(f" Found {len(games)} games from FBref")
    return games
|
|
|
|
|
|
# =============================================================================
|
|
# SCRAPERS - NWSL
|
|
# =============================================================================
|
|
|
|
def scrape_nwsl_fbref(season: int) -> list[Game]:
    """
    Scrape the NWSL schedule for one season from FBref.

    URL: https://fbref.com/en/comps/182/{YEAR}/schedule/{YEAR}-NWSL-Scores-and-Fixtures

    Args:
        season: Four-digit year of the NWSL season (e.g. 2026).

    Returns:
        List of Game records parsed from the fixtures table; an empty list
        when the page or its schedule table cannot be retrieved.
    """
    games: list[Game] = []
    url = f"https://fbref.com/en/comps/182/{season}/schedule/{season}-NWSL-Scores-and-Fixtures"

    print(f"Scraping NWSL {season} from FBref...")
    soup = fetch_page(url, 'fbref.com')

    if not soup:
        return games

    # FBref occasionally varies the table id, so fall back to a prefix match
    # when the canonical 'sched_all' id is absent.
    table = soup.find('table', {'id': 'sched_all'}) or soup.find('table', {'id': re.compile(r'sched.*')})
    if not table:
        print(" Could not find schedule table")
        return games

    tbody = table.find('tbody')
    if not tbody:
        return games

    for row in tbody.find_all('tr'):
        # Spacer rows separate matchweeks and carry no fixture data.
        if row.get('class') and 'spacer' in row.get('class'):
            continue

        try:
            # Parse date
            date_cell = row.find('td', {'data-stat': 'date'})
            if not date_cell:
                continue
            date_str = date_cell.text.strip()

            # Parse time (may be absent for unscheduled fixtures)
            time_cell = row.find('td', {'data-stat': 'time'})
            time_str = time_cell.text.strip() if time_cell else None

            # Parse teams
            home_cell = row.find('td', {'data-stat': 'home_team'})
            away_cell = row.find('td', {'data-stat': 'away_team'})

            if not home_cell or not away_cell:
                continue

            home_team = home_cell.text.strip()
            away_team = away_cell.text.strip()

            # Parse venue
            venue_cell = row.find('td', {'data-stat': 'venue'})
            venue = venue_cell.text.strip() if venue_cell else ''

            # Validate/normalize the date. FBref already uses ISO YYYY-MM-DD;
            # was a bare `except:` -- narrowed to ValueError so unrelated
            # bugs are no longer silently swallowed here.
            try:
                parsed_date = datetime.strptime(date_str, '%Y-%m-%d')
            except ValueError:
                continue
            date_formatted = parsed_date.strftime('%Y-%m-%d')

            game_id = f"nwsl_{date_formatted}_{away_team[:3]}_{home_team[:3]}".lower().replace(' ', '')

            game = Game(
                id=game_id,
                sport='NWSL',
                season=str(season),
                date=date_formatted,
                time=time_str,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=get_team_abbrev(home_team, 'NWSL'),
                away_team_abbrev=get_team_abbrev(away_team, 'NWSL'),
                venue=venue,
                source='fbref.com'
            )
            games.append(game)

        except Exception:
            # Best-effort row parsing: a malformed row should not abort the
            # whole scrape. (Previously bound the exception to an unused name.)
            continue

    print(f" Found {len(games)} games from FBref")
    return games
|
|
|
|
|
|
# =============================================================================
|
|
# STADIUM SCRAPER
|
|
# =============================================================================
|
|
|
|
def scrape_stadiums_hifld() -> list[Stadium]:
    """
    Fetch stadium data from HIFLD Open Data (US Government ArcGIS service).

    The service returns Esri JSON features; `outSR=4326` requests WGS84,
    so geometry x/y map directly to longitude/latitude.

    Returns:
        List of Stadium records for NBA/MLB/NHL venues; an empty list on
        network or parsing failure (errors are printed, never raised).
    """
    stadiums = []
    url = "https://services1.arcgis.com/Hp6G80Pky0om7QvQ/arcgis/rest/services/Major_Sport_Venues/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json"

    print("Fetching stadiums from HIFLD Open Data...")

    # Leagues we keep, mapped to the app's sport identifiers.
    # (The previous version first admitted NFL through one filter only to
    # drop it at a second map-membership check -- collapsed into a single
    # membership test against this map.)
    sport_map = {'NBA': 'NBA', 'MLB': 'MLB', 'NHL': 'NHL'}

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()

        for feature in data.get('features', []):
            attrs = feature.get('attributes', {})
            geom = feature.get('geometry', {})

            league = attrs.get('LEAGUE', '')
            if league not in sport_map:
                continue

            stadium = Stadium(
                id=f"hifld_{attrs.get('OBJECTID', '')}",
                name=attrs.get('NAME', ''),
                city=attrs.get('CITY', ''),
                state=attrs.get('STATE', ''),
                latitude=geom.get('y', 0),
                longitude=geom.get('x', 0),
                capacity=attrs.get('CAPACITY', 0) or 0,  # coerce None to 0
                sport=sport_map[league],
                team_abbrevs=[attrs.get('TEAM', '')],
                source='hifld.gov',
                year_opened=attrs.get('YEAR_OPEN')
            )
            stadiums.append(stadium)

    except Exception as e:
        # Best-effort source: report and return whatever was collected.
        print(f" Error fetching HIFLD data: {e}")

    print(f" Found {len(stadiums)} stadiums from HIFLD")
    return stadiums
|
|
|
|
|
|
# =============================================================================
|
|
# SPORT-SPECIFIC STADIUM SCRAPERS
|
|
# =============================================================================
|
|
|
|
def scrape_mlb_stadiums_scorebot() -> list[Stadium]:
    """
    Source 1: MLBScoreBot/ballparks GitHub (public domain).

    Coordinates in this dataset appear to be stored as integer
    microdegrees, hence the division by 1e6 -- TODO confirm against the
    upstream file. Raises on HTTP failure so the caller's fallback chain
    can advance to the next source.
    """
    url = "https://raw.githubusercontent.com/MLBScoreBot/ballparks/main/ballparks.json"

    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    def _degrees(raw):
        # Scale stored integer value to decimal degrees; 0 when missing/falsy.
        return raw / 1000000 if raw else 0

    parks = []
    for park_name, park in resp.json().items():
        parks.append(Stadium(
            id=f"mlb_{park_name.lower().replace(' ', '_')[:30]}",
            name=park_name,
            city=park.get('city', ''),
            state=park.get('state', ''),
            latitude=_degrees(park.get('lat', 0)),
            longitude=_degrees(park.get('long', 0)),
            capacity=park.get('capacity', 0),
            sport='MLB',
            team_abbrevs=[park.get('team', '')],
            source='github.com/MLBScoreBot'
        ))

    return parks
|
|
|
|
|
|
def scrape_mlb_stadiums_geojson() -> list[Stadium]:
    """
    Source 2: cageyjames/GeoJSON-Ballparks GitHub.

    GeoJSON point coordinates are [longitude, latitude]. The dataset has
    no capacity field, so capacity is recorded as 0. Only features whose
    League property is MLB are kept.
    """
    url = "https://raw.githubusercontent.com/cageyjames/GeoJSON-Ballparks/master/ballparks.geojson"

    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    collected = []
    for feature in resp.json().get('features', []):
        props = feature.get('properties', {})

        # Only include MLB stadiums (filter by League)
        if props.get('League', '').upper() != 'MLB':
            continue

        lonlat = feature.get('geometry', {}).get('coordinates', [0, 0])
        lon = lonlat[0] if len(lonlat) > 0 else 0
        lat = lonlat[1] if len(lonlat) > 1 else 0

        collected.append(Stadium(
            id=f"mlb_{props.get('Ballpark', '').lower().replace(' ', '_')[:30]}",
            name=props.get('Ballpark', ''),
            city=props.get('City', ''),
            state=props.get('State', ''),
            latitude=lat,
            longitude=lon,
            capacity=0,  # not provided by this dataset
            sport='MLB',
            team_abbrevs=[props.get('Team', '')],
            source='github.com/cageyjames'
        ))

    return collected
|
|
|
|
|
|
def scrape_mlb_stadiums_hardcoded() -> list[Stadium]:
    """
    Source 3: Hardcoded MLB ballparks (fallback).

    Last-resort static table used when both network sources fail; never
    performs I/O, so it cannot raise.
    """
    mlb_ballparks = {
        'Chase Field': {'city': 'Phoenix', 'state': 'AZ', 'lat': 33.4453, 'lng': -112.0667, 'capacity': 48519, 'teams': ['ARI']},
        'Truist Park': {'city': 'Atlanta', 'state': 'GA', 'lat': 33.8907, 'lng': -84.4677, 'capacity': 41084, 'teams': ['ATL']},
        'Oriole Park at Camden Yards': {'city': 'Baltimore', 'state': 'MD', 'lat': 39.2839, 'lng': -76.6216, 'capacity': 44970, 'teams': ['BAL']},
        'Fenway Park': {'city': 'Boston', 'state': 'MA', 'lat': 42.3467, 'lng': -71.0972, 'capacity': 37755, 'teams': ['BOS']},
        'Wrigley Field': {'city': 'Chicago', 'state': 'IL', 'lat': 41.9484, 'lng': -87.6553, 'capacity': 41649, 'teams': ['CHC']},
        'Guaranteed Rate Field': {'city': 'Chicago', 'state': 'IL', 'lat': 41.8299, 'lng': -87.6338, 'capacity': 40615, 'teams': ['CHW']},
        'Great American Ball Park': {'city': 'Cincinnati', 'state': 'OH', 'lat': 39.0979, 'lng': -84.5082, 'capacity': 42319, 'teams': ['CIN']},
        'Progressive Field': {'city': 'Cleveland', 'state': 'OH', 'lat': 41.4958, 'lng': -81.6853, 'capacity': 34830, 'teams': ['CLE']},
        'Coors Field': {'city': 'Denver', 'state': 'CO', 'lat': 39.7559, 'lng': -104.9942, 'capacity': 50144, 'teams': ['COL']},
        'Comerica Park': {'city': 'Detroit', 'state': 'MI', 'lat': 42.3390, 'lng': -83.0485, 'capacity': 41083, 'teams': ['DET']},
        'Minute Maid Park': {'city': 'Houston', 'state': 'TX', 'lat': 29.7573, 'lng': -95.3555, 'capacity': 41168, 'teams': ['HOU']},
        'Kauffman Stadium': {'city': 'Kansas City', 'state': 'MO', 'lat': 39.0517, 'lng': -94.4803, 'capacity': 37903, 'teams': ['KCR']},
        'Angel Stadium': {'city': 'Anaheim', 'state': 'CA', 'lat': 33.8003, 'lng': -117.8827, 'capacity': 45517, 'teams': ['LAA']},
        'Dodger Stadium': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0739, 'lng': -118.2400, 'capacity': 56000, 'teams': ['LAD']},
        'LoanDepot Park': {'city': 'Miami', 'state': 'FL', 'lat': 25.7781, 'lng': -80.2196, 'capacity': 36742, 'teams': ['MIA']},
        'American Family Field': {'city': 'Milwaukee', 'state': 'WI', 'lat': 43.0280, 'lng': -87.9712, 'capacity': 41900, 'teams': ['MIL']},
        'Target Field': {'city': 'Minneapolis', 'state': 'MN', 'lat': 44.9818, 'lng': -93.2775, 'capacity': 38544, 'teams': ['MIN']},
        'Citi Field': {'city': 'Queens', 'state': 'NY', 'lat': 40.7571, 'lng': -73.8458, 'capacity': 41922, 'teams': ['NYM']},
        'Yankee Stadium': {'city': 'Bronx', 'state': 'NY', 'lat': 40.8296, 'lng': -73.9262, 'capacity': 46537, 'teams': ['NYY']},
        'Oakland Coliseum': {'city': 'Oakland', 'state': 'CA', 'lat': 37.7516, 'lng': -122.2005, 'capacity': 46847, 'teams': ['OAK']},
        'Citizens Bank Park': {'city': 'Philadelphia', 'state': 'PA', 'lat': 39.9061, 'lng': -75.1665, 'capacity': 42901, 'teams': ['PHI']},
        'PNC Park': {'city': 'Pittsburgh', 'state': 'PA', 'lat': 40.4469, 'lng': -80.0057, 'capacity': 38362, 'teams': ['PIT']},
        'Petco Park': {'city': 'San Diego', 'state': 'CA', 'lat': 32.7073, 'lng': -117.1566, 'capacity': 40209, 'teams': ['SDP']},
        'Oracle Park': {'city': 'San Francisco', 'state': 'CA', 'lat': 37.7786, 'lng': -122.3893, 'capacity': 41915, 'teams': ['SFG']},
        'T-Mobile Park': {'city': 'Seattle', 'state': 'WA', 'lat': 47.5914, 'lng': -122.3325, 'capacity': 47929, 'teams': ['SEA']},
        'Busch Stadium': {'city': 'St. Louis', 'state': 'MO', 'lat': 38.6226, 'lng': -90.1928, 'capacity': 45538, 'teams': ['STL']},
        'Tropicana Field': {'city': 'St. Petersburg', 'state': 'FL', 'lat': 27.7682, 'lng': -82.6534, 'capacity': 25000, 'teams': ['TBR']},
        'Globe Life Field': {'city': 'Arlington', 'state': 'TX', 'lat': 32.7473, 'lng': -97.0844, 'capacity': 40300, 'teams': ['TEX']},
        'Rogers Centre': {'city': 'Toronto', 'state': 'ON', 'lat': 43.6414, 'lng': -79.3894, 'capacity': 49282, 'teams': ['TOR']},
        'Nationals Park': {'city': 'Washington', 'state': 'DC', 'lat': 38.8729, 'lng': -77.0074, 'capacity': 41339, 'teams': ['WSN']},
    }

    # Materialize each table row as a Stadium record.
    return [
        Stadium(
            id=f"mlb_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='MLB',
            team_abbrevs=info['teams'],
            source='mlb_hardcoded'
        )
        for name, info in mlb_ballparks.items()
    ]
|
|
|
|
|
|
def scrape_mlb_stadiums() -> list[Stadium]:
    """
    Fetch MLB stadium data with multi-source fallback.

    Tries MLBScoreBot first, then GeoJSON-Ballparks, finally the
    hardcoded table; each source must yield at least 25 venues to count.
    """
    print("\nMLB STADIUMS")
    print("-" * 40)

    ordered_sources = [
        StadiumScraperSource('MLBScoreBot', scrape_mlb_stadiums_scorebot, priority=1, min_venues=25),
        StadiumScraperSource('GeoJSON-Ballparks', scrape_mlb_stadiums_geojson, priority=2, min_venues=25),
        StadiumScraperSource('Hardcoded', scrape_mlb_stadiums_hardcoded, priority=3, min_venues=25),
    ]
    return scrape_stadiums_with_fallback('MLB', ordered_sources)
|
|
|
|
|
|
def scrape_nfl_stadiums_scorebot() -> list[Stadium]:
    """
    Source 1: NFLScoreBot/stadiums GitHub (public domain).

    Coordinates appear to be stored as integer microdegrees, hence the
    division by 1e6 -- TODO confirm against the upstream file. Raises on
    HTTP failure so the caller's fallback chain can advance.
    """
    url = "https://raw.githubusercontent.com/NFLScoreBot/stadiums/main/stadiums.json"

    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    def _degrees(raw):
        # Scale stored integer value to decimal degrees; 0 when missing/falsy.
        return raw / 1000000 if raw else 0

    venues = []
    for venue_name, venue in resp.json().items():
        venues.append(Stadium(
            id=f"nfl_{venue_name.lower().replace(' ', '_')[:30]}",
            name=venue_name,
            city=venue.get('city', ''),
            state=venue.get('state', ''),
            latitude=_degrees(venue.get('lat', 0)),
            longitude=_degrees(venue.get('long', 0)),
            capacity=venue.get('capacity', 0),
            sport='NFL',
            team_abbrevs=venue.get('teams', []),  # shared stadiums list several teams
            source='github.com/NFLScoreBot'
        ))

    return venues
|
|
|
|
|
|
def scrape_nfl_stadiums_geojson() -> list[Stadium]:
    """
    Source 2: brianhatchl/nfl-stadiums GeoJSON gist.

    GeoJSON point coordinates are [longitude, latitude]; Capacity may be
    a string or missing, so it is coerced through int() with a 0 default.
    """
    url = "https://gist.githubusercontent.com/brianhatchl/6265918/raw/dbe6acfe5deb48f51ce5a4c4f8f5dded4f02b9bd/nfl_stadiums.geojson"

    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    collected = []
    for feature in resp.json().get('features', []):
        props = feature.get('properties', {})

        lonlat = feature.get('geometry', {}).get('coordinates', [0, 0])
        lon = lonlat[0] if len(lonlat) > 0 else 0
        lat = lonlat[1] if len(lonlat) > 1 else 0

        collected.append(Stadium(
            id=f"nfl_{props.get('Stadium', '').lower().replace(' ', '_')[:30]}",
            name=props.get('Stadium', ''),
            city=props.get('City', ''),
            state=props.get('State', ''),
            latitude=lat,
            longitude=lon,
            capacity=int(props.get('Capacity', 0) or 0),
            sport='NFL',
            team_abbrevs=[props.get('Team', '')],
            source='gist.github.com/brianhatchl'
        ))

    return collected
|
|
|
|
|
|
def scrape_nfl_stadiums_hardcoded() -> list[Stadium]:
    """
    Source 3: Hardcoded NFL stadiums (fallback).

    Last-resort static table used when both network sources fail; never
    performs I/O, so it cannot raise.
    """
    nfl_stadiums_data = {
        'State Farm Stadium': {'city': 'Glendale', 'state': 'AZ', 'lat': 33.5276, 'lng': -112.2626, 'capacity': 63400, 'teams': ['ARI']},
        'Mercedes-Benz Stadium': {'city': 'Atlanta', 'state': 'GA', 'lat': 33.7553, 'lng': -84.4006, 'capacity': 71000, 'teams': ['ATL']},
        'M&T Bank Stadium': {'city': 'Baltimore', 'state': 'MD', 'lat': 39.2780, 'lng': -76.6227, 'capacity': 71008, 'teams': ['BAL']},
        'Highmark Stadium': {'city': 'Orchard Park', 'state': 'NY', 'lat': 42.7738, 'lng': -78.7870, 'capacity': 71608, 'teams': ['BUF']},
        'Bank of America Stadium': {'city': 'Charlotte', 'state': 'NC', 'lat': 35.2258, 'lng': -80.8528, 'capacity': 75523, 'teams': ['CAR']},
        'Soldier Field': {'city': 'Chicago', 'state': 'IL', 'lat': 41.8623, 'lng': -87.6167, 'capacity': 61500, 'teams': ['CHI']},
        'Paycor Stadium': {'city': 'Cincinnati', 'state': 'OH', 'lat': 39.0954, 'lng': -84.5160, 'capacity': 65515, 'teams': ['CIN']},
        'Cleveland Browns Stadium': {'city': 'Cleveland', 'state': 'OH', 'lat': 41.5061, 'lng': -81.6995, 'capacity': 67895, 'teams': ['CLE']},
        'AT&T Stadium': {'city': 'Arlington', 'state': 'TX', 'lat': 32.7480, 'lng': -97.0928, 'capacity': 80000, 'teams': ['DAL']},
        'Empower Field at Mile High': {'city': 'Denver', 'state': 'CO', 'lat': 39.7439, 'lng': -105.0201, 'capacity': 76125, 'teams': ['DEN']},
        'Ford Field': {'city': 'Detroit', 'state': 'MI', 'lat': 42.3400, 'lng': -83.0456, 'capacity': 65000, 'teams': ['DET']},
        'Lambeau Field': {'city': 'Green Bay', 'state': 'WI', 'lat': 44.5013, 'lng': -88.0622, 'capacity': 81435, 'teams': ['GB']},
        'NRG Stadium': {'city': 'Houston', 'state': 'TX', 'lat': 29.6847, 'lng': -95.4107, 'capacity': 72220, 'teams': ['HOU']},
        'Lucas Oil Stadium': {'city': 'Indianapolis', 'state': 'IN', 'lat': 39.7601, 'lng': -86.1639, 'capacity': 67000, 'teams': ['IND']},
        'EverBank Stadium': {'city': 'Jacksonville', 'state': 'FL', 'lat': 30.3239, 'lng': -81.6373, 'capacity': 67814, 'teams': ['JAX']},
        'GEHA Field at Arrowhead Stadium': {'city': 'Kansas City', 'state': 'MO', 'lat': 39.0489, 'lng': -94.4839, 'capacity': 76416, 'teams': ['KC']},
        'Allegiant Stadium': {'city': 'Las Vegas', 'state': 'NV', 'lat': 36.0909, 'lng': -115.1833, 'capacity': 65000, 'teams': ['LV']},
        'SoFi Stadium': {'city': 'Inglewood', 'state': 'CA', 'lat': 33.9535, 'lng': -118.3392, 'capacity': 70240, 'teams': ['LAC', 'LAR']},
        'Hard Rock Stadium': {'city': 'Miami Gardens', 'state': 'FL', 'lat': 25.9580, 'lng': -80.2389, 'capacity': 64767, 'teams': ['MIA']},
        'U.S. Bank Stadium': {'city': 'Minneapolis', 'state': 'MN', 'lat': 44.9736, 'lng': -93.2575, 'capacity': 66655, 'teams': ['MIN']},
        'Gillette Stadium': {'city': 'Foxborough', 'state': 'MA', 'lat': 42.0909, 'lng': -71.2643, 'capacity': 65878, 'teams': ['NE']},
        'Caesars Superdome': {'city': 'New Orleans', 'state': 'LA', 'lat': 29.9511, 'lng': -90.0812, 'capacity': 73208, 'teams': ['NO']},
        'MetLife Stadium': {'city': 'East Rutherford', 'state': 'NJ', 'lat': 40.8135, 'lng': -74.0745, 'capacity': 82500, 'teams': ['NYG', 'NYJ']},
        'Lincoln Financial Field': {'city': 'Philadelphia', 'state': 'PA', 'lat': 39.9008, 'lng': -75.1675, 'capacity': 69596, 'teams': ['PHI']},
        'Acrisure Stadium': {'city': 'Pittsburgh', 'state': 'PA', 'lat': 40.4468, 'lng': -80.0158, 'capacity': 68400, 'teams': ['PIT']},
        'Levi\'s Stadium': {'city': 'Santa Clara', 'state': 'CA', 'lat': 37.4032, 'lng': -121.9698, 'capacity': 68500, 'teams': ['SF']},
        'Lumen Field': {'city': 'Seattle', 'state': 'WA', 'lat': 47.5952, 'lng': -122.3316, 'capacity': 68740, 'teams': ['SEA']},
        'Raymond James Stadium': {'city': 'Tampa', 'state': 'FL', 'lat': 27.9759, 'lng': -82.5033, 'capacity': 65618, 'teams': ['TB']},
        'Nissan Stadium': {'city': 'Nashville', 'state': 'TN', 'lat': 36.1665, 'lng': -86.7713, 'capacity': 69143, 'teams': ['TEN']},
        'Commanders Field': {'city': 'Landover', 'state': 'MD', 'lat': 38.9076, 'lng': -76.8645, 'capacity': 67617, 'teams': ['WAS']},
    }

    # Materialize each table row as a Stadium record.
    return [
        Stadium(
            id=f"nfl_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='NFL',
            team_abbrevs=info['teams'],
            source='nfl_hardcoded'
        )
        for name, info in nfl_stadiums_data.items()
    ]
|
|
|
|
|
|
def scrape_nfl_stadiums() -> list[Stadium]:
    """
    Fetch NFL stadium data with multi-source fallback.

    Tries NFLScoreBot first, then the GeoJSON gist, finally the
    hardcoded table; each source must yield at least 28 venues to count.
    """
    print("\nNFL STADIUMS")
    print("-" * 40)

    ordered_sources = [
        StadiumScraperSource('NFLScoreBot', scrape_nfl_stadiums_scorebot, priority=1, min_venues=28),
        StadiumScraperSource('GeoJSON-Gist', scrape_nfl_stadiums_geojson, priority=2, min_venues=28),
        StadiumScraperSource('Hardcoded', scrape_nfl_stadiums_hardcoded, priority=3, min_venues=28),
    ]
    return scrape_stadiums_with_fallback('NFL', ordered_sources)
|
|
|
|
|
|
def scrape_mls_stadiums_geojson() -> list[Stadium]:
    """
    Source 1: gavinr/usa-soccer GeoJSON.

    GeoJSON point coordinates are [longitude, latitude]. Raises on HTTP
    failure so the caller's fallback chain can advance.
    """
    url = "https://raw.githubusercontent.com/gavinr/usa-soccer/master/mls.geojson"

    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    collected = []
    for feature in resp.json().get('features', []):
        props = feature.get('properties', {})

        lonlat = feature.get('geometry', {}).get('coordinates', [0, 0])
        lon = lonlat[0] if len(lonlat) > 0 else 0
        lat = lonlat[1] if len(lonlat) > 1 else 0

        collected.append(Stadium(
            id=f"mls_{props.get('stadium', '').lower().replace(' ', '_')[:30]}",
            name=props.get('stadium', ''),
            city=props.get('city', ''),
            state=props.get('state', ''),
            latitude=lat,
            longitude=lon,
            capacity=props.get('capacity', 0),
            sport='MLS',
            team_abbrevs=[props.get('team', '')],
            source='github.com/gavinr'
        ))

    return collected
|
|
|
|
|
|
def scrape_mls_stadiums_csv() -> list[Stadium]:
    """
    Source 2: gavinr/usa-soccer CSV.

    Same upstream project as source 1, different serialization; lat/lng
    and capacity arrive as strings and are coerced (empty -> 0).
    """
    import csv
    from io import StringIO

    url = "https://raw.githubusercontent.com/gavinr/usa-soccer/master/mls.csv"

    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    collected = []
    for record in csv.DictReader(StringIO(resp.text)):
        collected.append(Stadium(
            id=f"mls_{record.get('stadium', '').lower().replace(' ', '_')[:30]}",
            name=record.get('stadium', ''),
            city=record.get('city', ''),
            state=record.get('state', ''),
            latitude=float(record.get('lat', 0) or 0),
            longitude=float(record.get('lng', 0) or 0),
            capacity=int(record.get('capacity', 0) or 0),
            sport='MLS',
            team_abbrevs=[record.get('team', '')],
            source='github.com/gavinr/csv'
        ))

    return collected
|
|
|
|
|
|
def scrape_mls_stadiums_hardcoded() -> list[Stadium]:
    """
    Source 3: Hardcoded MLS stadiums (fallback).

    Last-resort static table used when both network sources fail; never
    performs I/O, so it cannot raise.
    """
    mls_stadiums_data = {
        'Mercedes-Benz Stadium': {'city': 'Atlanta', 'state': 'GA', 'lat': 33.7553, 'lng': -84.4006, 'capacity': 42500, 'team': 'ATL'},
        'Q2 Stadium': {'city': 'Austin', 'state': 'TX', 'lat': 30.3879, 'lng': -97.7195, 'capacity': 20738, 'team': 'ATX'},
        'Audi Field': {'city': 'Washington', 'state': 'DC', 'lat': 38.8687, 'lng': -77.0128, 'capacity': 20000, 'team': 'DC'},
        'TQL Stadium': {'city': 'Cincinnati', 'state': 'OH', 'lat': 39.1107, 'lng': -84.5228, 'capacity': 26000, 'team': 'CIN'},
        'Lower.com Field': {'city': 'Columbus', 'state': 'OH', 'lat': 39.9689, 'lng': -83.0172, 'capacity': 20371, 'team': 'CLB'},
        'Toyota Stadium': {'city': 'Frisco', 'state': 'TX', 'lat': 33.1542, 'lng': -96.8350, 'capacity': 20500, 'team': 'DAL'},
        'Dick\'s Sporting Goods Park': {'city': 'Commerce City', 'state': 'CO', 'lat': 39.8056, 'lng': -104.8919, 'capacity': 18061, 'team': 'COL'},
        'Shell Energy Stadium': {'city': 'Houston', 'state': 'TX', 'lat': 29.7523, 'lng': -95.3526, 'capacity': 22039, 'team': 'HOU'},
        'Dignity Health Sports Park': {'city': 'Carson', 'state': 'CA', 'lat': 33.8644, 'lng': -118.2611, 'capacity': 27000, 'team': 'LA'},
        'BMO Stadium': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0128, 'lng': -118.2841, 'capacity': 22000, 'team': 'LAFC'},
        'Chase Stadium': {'city': 'Fort Lauderdale', 'state': 'FL', 'lat': 26.1931, 'lng': -80.1606, 'capacity': 21550, 'team': 'MIA'},
        'Allianz Field': {'city': 'Saint Paul', 'state': 'MN', 'lat': 44.9530, 'lng': -93.1653, 'capacity': 19400, 'team': 'MIN'},
        'Stade Saputo': {'city': 'Montreal', 'state': 'QC', 'lat': 45.5629, 'lng': -73.5528, 'capacity': 19619, 'team': 'MTL'},
        'Geodis Park': {'city': 'Nashville', 'state': 'TN', 'lat': 36.1306, 'lng': -86.7658, 'capacity': 30000, 'team': 'NSH'},
        'Yankee Stadium': {'city': 'Bronx', 'state': 'NY', 'lat': 40.8296, 'lng': -73.9262, 'capacity': 30321, 'team': 'NYC'},
        'Red Bull Arena': {'city': 'Harrison', 'state': 'NJ', 'lat': 40.7369, 'lng': -74.1503, 'capacity': 25000, 'team': 'NYRB'},
        'Inter&Co Stadium': {'city': 'Orlando', 'state': 'FL', 'lat': 28.5412, 'lng': -81.3896, 'capacity': 25500, 'team': 'ORL'},
        'Subaru Park': {'city': 'Chester', 'state': 'PA', 'lat': 39.8328, 'lng': -75.3789, 'capacity': 18500, 'team': 'PHI'},
        'Providence Park': {'city': 'Portland', 'state': 'OR', 'lat': 45.5217, 'lng': -122.6918, 'capacity': 25218, 'team': 'POR'},
        'America First Field': {'city': 'Sandy', 'state': 'UT', 'lat': 40.5829, 'lng': -111.8933, 'capacity': 20213, 'team': 'RSL'},
        'PayPal Park': {'city': 'San Jose', 'state': 'CA', 'lat': 37.3512, 'lng': -121.9251, 'capacity': 18000, 'team': 'SJ'},
        'Lumen Field': {'city': 'Seattle', 'state': 'WA', 'lat': 47.5952, 'lng': -122.3316, 'capacity': 69000, 'team': 'SEA'},
        'Children\'s Mercy Park': {'city': 'Kansas City', 'state': 'KS', 'lat': 39.1218, 'lng': -94.8231, 'capacity': 18467, 'team': 'SKC'},
        'CityPark': {'city': 'St. Louis', 'state': 'MO', 'lat': 38.6316, 'lng': -90.2094, 'capacity': 22500, 'team': 'STL'},
        'BMO Field': {'city': 'Toronto', 'state': 'ON', 'lat': 43.6332, 'lng': -79.4185, 'capacity': 30000, 'team': 'TOR'},
        'BC Place': {'city': 'Vancouver', 'state': 'BC', 'lat': 49.2768, 'lng': -123.1117, 'capacity': 22120, 'team': 'VAN'},
    }

    # Materialize each table row as a Stadium record (single team per venue
    # in this table, wrapped in a one-element list for the shared schema).
    return [
        Stadium(
            id=f"mls_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='MLS',
            team_abbrevs=[info['team']],
            source='mls_hardcoded'
        )
        for name, info in mls_stadiums_data.items()
    ]
|
|
|
|
|
|
def scrape_mls_stadiums() -> list[Stadium]:
    """
    Fetch MLS stadium data with multi-source fallback.

    Tries the gavinr GeoJSON first, then the same project's CSV, finally
    the hardcoded table; each source must yield at least 20 venues.
    """
    print("\nMLS STADIUMS")
    print("-" * 40)

    ordered_sources = [
        StadiumScraperSource('gavinr GeoJSON', scrape_mls_stadiums_geojson, priority=1, min_venues=20),
        StadiumScraperSource('gavinr CSV', scrape_mls_stadiums_csv, priority=2, min_venues=20),
        StadiumScraperSource('Hardcoded', scrape_mls_stadiums_hardcoded, priority=3, min_venues=20),
    ]
    return scrape_stadiums_with_fallback('MLS', ordered_sources)
|
|
|
|
|
|
def scrape_nhl_stadiums() -> list[Stadium]:
    """
    Return NHL arena data from a curated table.

    The NHL standings API does not expose arena coordinates, so locations
    come from this hardcoded table. The previous version first issued an
    API request whose result was entirely discarded (venue/team fields and
    a `seen_venues` set were built and never used) and -- worse -- built
    the table *inside* that request's try block, so any network failure
    returned an empty list instead of the fallback data. The dead API call
    is removed and the table is now built unconditionally.

    Returns:
        List of 32 Stadium records, one per NHL arena.
    """
    print(" Loading NHL arenas...")

    # NOTE(review): 'Footprint Center' listed with city 'Tempe' and team
    # 'UTA' looks stale (Utah plays at Delta Center in Salt Lake City) --
    # verify before relying on this entry.
    nhl_arenas = {
        'TD Garden': {'city': 'Boston', 'state': 'MA', 'lat': 42.3662, 'lng': -71.0621, 'capacity': 17850, 'teams': ['BOS']},
        'KeyBank Center': {'city': 'Buffalo', 'state': 'NY', 'lat': 42.8750, 'lng': -78.8764, 'capacity': 19070, 'teams': ['BUF']},
        'Little Caesars Arena': {'city': 'Detroit', 'state': 'MI', 'lat': 42.3411, 'lng': -83.0553, 'capacity': 19515, 'teams': ['DET']},
        'Amerant Bank Arena': {'city': 'Sunrise', 'state': 'FL', 'lat': 26.1584, 'lng': -80.3256, 'capacity': 19250, 'teams': ['FLA']},
        'Bell Centre': {'city': 'Montreal', 'state': 'QC', 'lat': 45.4961, 'lng': -73.5693, 'capacity': 21302, 'teams': ['MTL']},
        'Canadian Tire Centre': {'city': 'Ottawa', 'state': 'ON', 'lat': 45.2969, 'lng': -75.9272, 'capacity': 18652, 'teams': ['OTT']},
        'Amalie Arena': {'city': 'Tampa', 'state': 'FL', 'lat': 27.9426, 'lng': -82.4519, 'capacity': 19092, 'teams': ['TBL']},
        'Scotiabank Arena': {'city': 'Toronto', 'state': 'ON', 'lat': 43.6435, 'lng': -79.3791, 'capacity': 18800, 'teams': ['TOR']},
        'PNC Arena': {'city': 'Raleigh', 'state': 'NC', 'lat': 35.8033, 'lng': -78.7220, 'capacity': 18680, 'teams': ['CAR']},
        'Nationwide Arena': {'city': 'Columbus', 'state': 'OH', 'lat': 39.9692, 'lng': -83.0061, 'capacity': 18500, 'teams': ['CBJ']},
        'Prudential Center': {'city': 'Newark', 'state': 'NJ', 'lat': 40.7334, 'lng': -74.1713, 'capacity': 16514, 'teams': ['NJD']},
        'UBS Arena': {'city': 'Elmont', 'state': 'NY', 'lat': 40.7170, 'lng': -73.7260, 'capacity': 17255, 'teams': ['NYI']},
        'Madison Square Garden': {'city': 'New York', 'state': 'NY', 'lat': 40.7505, 'lng': -73.9934, 'capacity': 18006, 'teams': ['NYR']},
        'Wells Fargo Center': {'city': 'Philadelphia', 'state': 'PA', 'lat': 39.9012, 'lng': -75.1720, 'capacity': 19500, 'teams': ['PHI']},
        'PPG Paints Arena': {'city': 'Pittsburgh', 'state': 'PA', 'lat': 40.4395, 'lng': -79.9892, 'capacity': 18387, 'teams': ['PIT']},
        'Capital One Arena': {'city': 'Washington', 'state': 'DC', 'lat': 38.8982, 'lng': -77.0209, 'capacity': 18573, 'teams': ['WSH']},
        'United Center': {'city': 'Chicago', 'state': 'IL', 'lat': 41.8807, 'lng': -87.6742, 'capacity': 19717, 'teams': ['CHI']},
        'Ball Arena': {'city': 'Denver', 'state': 'CO', 'lat': 39.7487, 'lng': -105.0077, 'capacity': 18007, 'teams': ['COL']},
        'American Airlines Center': {'city': 'Dallas', 'state': 'TX', 'lat': 32.7905, 'lng': -96.8103, 'capacity': 18532, 'teams': ['DAL']},
        'Xcel Energy Center': {'city': 'Saint Paul', 'state': 'MN', 'lat': 44.9448, 'lng': -93.1010, 'capacity': 17954, 'teams': ['MIN']},
        'Bridgestone Arena': {'city': 'Nashville', 'state': 'TN', 'lat': 36.1592, 'lng': -86.7785, 'capacity': 17159, 'teams': ['NSH']},
        'Enterprise Center': {'city': 'St. Louis', 'state': 'MO', 'lat': 38.6268, 'lng': -90.2025, 'capacity': 18096, 'teams': ['STL']},
        'Canada Life Centre': {'city': 'Winnipeg', 'state': 'MB', 'lat': 49.8928, 'lng': -97.1437, 'capacity': 15321, 'teams': ['WPG']},
        'Honda Center': {'city': 'Anaheim', 'state': 'CA', 'lat': 33.8078, 'lng': -117.8765, 'capacity': 17174, 'teams': ['ANA']},
        'Footprint Center': {'city': 'Tempe', 'state': 'AZ', 'lat': 33.4457, 'lng': -112.0712, 'capacity': 16210, 'teams': ['UTA']},
        'SAP Center': {'city': 'San Jose', 'state': 'CA', 'lat': 37.3327, 'lng': -121.9012, 'capacity': 17562, 'teams': ['SJS']},
        'Rogers Arena': {'city': 'Vancouver', 'state': 'BC', 'lat': 49.2778, 'lng': -123.1089, 'capacity': 18910, 'teams': ['VAN']},
        'T-Mobile Arena': {'city': 'Las Vegas', 'state': 'NV', 'lat': 36.1028, 'lng': -115.1784, 'capacity': 17500, 'teams': ['VGK']},
        'Climate Pledge Arena': {'city': 'Seattle', 'state': 'WA', 'lat': 47.6220, 'lng': -122.3540, 'capacity': 17100, 'teams': ['SEA']},
        'Crypto.com Arena': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0430, 'lng': -118.2673, 'capacity': 18230, 'teams': ['LAK']},
        'Rogers Place': {'city': 'Edmonton', 'state': 'AB', 'lat': 53.5469, 'lng': -113.4979, 'capacity': 18347, 'teams': ['EDM']},
        'Scotiabank Saddledome': {'city': 'Calgary', 'state': 'AB', 'lat': 51.0374, 'lng': -114.0519, 'capacity': 19289, 'teams': ['CGY']},
    }

    stadiums = []
    for name, info in nhl_arenas.items():
        stadium = Stadium(
            id=f"nhl_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='NHL',
            team_abbrevs=info['teams'],
            source='nhl_hardcoded'
        )
        stadiums.append(stadium)

    print(f" Found {len(stadiums)} NHL arenas")
    return stadiums
|
|
|
|
|
|
def scrape_nba_stadiums() -> list[Stadium]:
    """
    Return the 30 NBA home arenas as Stadium records.

    The data is hardcoded (name, city, state, coordinates, capacity,
    resident team abbreviations) rather than scraped, so no network
    access is required.
    """
    print(" Loading NBA arenas...")

    nba_arenas = {
        'State Farm Arena': {'city': 'Atlanta', 'state': 'GA', 'lat': 33.7573, 'lng': -84.3963, 'capacity': 18118, 'teams': ['ATL']},
        'TD Garden': {'city': 'Boston', 'state': 'MA', 'lat': 42.3662, 'lng': -71.0621, 'capacity': 19156, 'teams': ['BOS']},
        'Barclays Center': {'city': 'Brooklyn', 'state': 'NY', 'lat': 40.6826, 'lng': -73.9754, 'capacity': 17732, 'teams': ['BKN']},
        'Spectrum Center': {'city': 'Charlotte', 'state': 'NC', 'lat': 35.2251, 'lng': -80.8392, 'capacity': 19077, 'teams': ['CHA']},
        'United Center': {'city': 'Chicago', 'state': 'IL', 'lat': 41.8807, 'lng': -87.6742, 'capacity': 20917, 'teams': ['CHI']},
        'Rocket Mortgage FieldHouse': {'city': 'Cleveland', 'state': 'OH', 'lat': 41.4965, 'lng': -81.6882, 'capacity': 19432, 'teams': ['CLE']},
        'American Airlines Center': {'city': 'Dallas', 'state': 'TX', 'lat': 32.7905, 'lng': -96.8103, 'capacity': 19200, 'teams': ['DAL']},
        'Ball Arena': {'city': 'Denver', 'state': 'CO', 'lat': 39.7487, 'lng': -105.0077, 'capacity': 19520, 'teams': ['DEN']},
        'Little Caesars Arena': {'city': 'Detroit', 'state': 'MI', 'lat': 42.3411, 'lng': -83.0553, 'capacity': 20332, 'teams': ['DET']},
        'Chase Center': {'city': 'San Francisco', 'state': 'CA', 'lat': 37.7680, 'lng': -122.3879, 'capacity': 18064, 'teams': ['GSW']},
        'Toyota Center': {'city': 'Houston', 'state': 'TX', 'lat': 29.7508, 'lng': -95.3621, 'capacity': 18055, 'teams': ['HOU']},
        'Gainbridge Fieldhouse': {'city': 'Indianapolis', 'state': 'IN', 'lat': 39.7640, 'lng': -86.1555, 'capacity': 17923, 'teams': ['IND']},
        'Intuit Dome': {'city': 'Inglewood', 'state': 'CA', 'lat': 33.9425, 'lng': -118.3419, 'capacity': 18000, 'teams': ['LAC']},
        'Crypto.com Arena': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0430, 'lng': -118.2673, 'capacity': 18997, 'teams': ['LAL']},
        'FedExForum': {'city': 'Memphis', 'state': 'TN', 'lat': 35.1382, 'lng': -90.0506, 'capacity': 17794, 'teams': ['MEM']},
        'Kaseya Center': {'city': 'Miami', 'state': 'FL', 'lat': 25.7814, 'lng': -80.1870, 'capacity': 19600, 'teams': ['MIA']},
        'Fiserv Forum': {'city': 'Milwaukee', 'state': 'WI', 'lat': 43.0451, 'lng': -87.9174, 'capacity': 17341, 'teams': ['MIL']},
        'Target Center': {'city': 'Minneapolis', 'state': 'MN', 'lat': 44.9795, 'lng': -93.2761, 'capacity': 18978, 'teams': ['MIN']},
        'Smoothie King Center': {'city': 'New Orleans', 'state': 'LA', 'lat': 29.9490, 'lng': -90.0821, 'capacity': 16867, 'teams': ['NOP']},
        'Madison Square Garden': {'city': 'New York', 'state': 'NY', 'lat': 40.7505, 'lng': -73.9934, 'capacity': 19812, 'teams': ['NYK']},
        'Paycom Center': {'city': 'Oklahoma City', 'state': 'OK', 'lat': 35.4634, 'lng': -97.5151, 'capacity': 18203, 'teams': ['OKC']},
        'Kia Center': {'city': 'Orlando', 'state': 'FL', 'lat': 28.5392, 'lng': -81.3839, 'capacity': 18846, 'teams': ['ORL']},
        'Wells Fargo Center': {'city': 'Philadelphia', 'state': 'PA', 'lat': 39.9012, 'lng': -75.1720, 'capacity': 20478, 'teams': ['PHI']},
        'Footprint Center': {'city': 'Phoenix', 'state': 'AZ', 'lat': 33.4457, 'lng': -112.0712, 'capacity': 17071, 'teams': ['PHX']},
        'Moda Center': {'city': 'Portland', 'state': 'OR', 'lat': 45.5316, 'lng': -122.6668, 'capacity': 19393, 'teams': ['POR']},
        'Golden 1 Center': {'city': 'Sacramento', 'state': 'CA', 'lat': 38.5802, 'lng': -121.4997, 'capacity': 17608, 'teams': ['SAC']},
        'Frost Bank Center': {'city': 'San Antonio', 'state': 'TX', 'lat': 29.4270, 'lng': -98.4375, 'capacity': 18418, 'teams': ['SAS']},
        'Scotiabank Arena': {'city': 'Toronto', 'state': 'ON', 'lat': 43.6435, 'lng': -79.3791, 'capacity': 19800, 'teams': ['TOR']},
        'Delta Center': {'city': 'Salt Lake City', 'state': 'UT', 'lat': 40.7683, 'lng': -111.9011, 'capacity': 18306, 'teams': ['UTA']},
        'Capital One Arena': {'city': 'Washington', 'state': 'DC', 'lat': 38.8982, 'lng': -77.0209, 'capacity': 20356, 'teams': ['WAS']},
    }

    # One Stadium per arena; the id is the snake_cased name truncated to 30 chars.
    stadiums = [
        Stadium(
            id=f"nba_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='NBA',
            team_abbrevs=info['teams'],
            source='nba_hardcoded',
        )
        for name, info in nba_arenas.items()
    ]

    print(f" Found {len(stadiums)} NBA arenas")
    return stadiums
|
|
|
|
|
|
def scrape_wnba_stadiums() -> list[Stadium]:
    """
    Return the WNBA home arenas as Stadium records.

    The data is hardcoded (name, city, state, coordinates, capacity,
    resident team abbreviations) rather than scraped, so no network
    access is required.
    """
    print(" Loading WNBA arenas...")

    wnba_arenas = {
        'Gateway Center Arena': {'city': 'College Park', 'state': 'GA', 'lat': 33.6532, 'lng': -84.4474, 'capacity': 3500, 'teams': ['ATL']},
        'Wintrust Arena': {'city': 'Chicago', 'state': 'IL', 'lat': 41.8658, 'lng': -87.6169, 'capacity': 10387, 'teams': ['CHI']},
        'Mohegan Sun Arena': {'city': 'Uncasville', 'state': 'CT', 'lat': 41.4932, 'lng': -72.0889, 'capacity': 10000, 'teams': ['CON']},
        'College Park Center': {'city': 'Arlington', 'state': 'TX', 'lat': 32.7299, 'lng': -97.1100, 'capacity': 7000, 'teams': ['DAL']},
        'Chase Center': {'city': 'San Francisco', 'state': 'CA', 'lat': 37.7680, 'lng': -122.3879, 'capacity': 18064, 'teams': ['GSV']},
        'Gainbridge Fieldhouse': {'city': 'Indianapolis', 'state': 'IN', 'lat': 39.7640, 'lng': -86.1555, 'capacity': 17923, 'teams': ['IND']},
        'Michelob ULTRA Arena': {'city': 'Las Vegas', 'state': 'NV', 'lat': 36.0909, 'lng': -115.1761, 'capacity': 12000, 'teams': ['LVA']},
        'Crypto.com Arena': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0430, 'lng': -118.2673, 'capacity': 18997, 'teams': ['LAS']},
        'Target Center': {'city': 'Minneapolis', 'state': 'MN', 'lat': 44.9795, 'lng': -93.2761, 'capacity': 20000, 'teams': ['MIN']},
        'Barclays Center': {'city': 'Brooklyn', 'state': 'NY', 'lat': 40.6826, 'lng': -73.9754, 'capacity': 17732, 'teams': ['NYL']},
        'Footprint Center': {'city': 'Phoenix', 'state': 'AZ', 'lat': 33.4457, 'lng': -112.0712, 'capacity': 17071, 'teams': ['PHX']},
        'Climate Pledge Arena': {'city': 'Seattle', 'state': 'WA', 'lat': 47.6220, 'lng': -122.3540, 'capacity': 18100, 'teams': ['SEA']},
        'Entertainment & Sports Arena': {'city': 'Washington', 'state': 'DC', 'lat': 38.8688, 'lng': -76.9731, 'capacity': 4200, 'teams': ['WAS']},
    }

    # One Stadium per arena; the id is the snake_cased name truncated to 30 chars.
    stadiums = [
        Stadium(
            id=f"wnba_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='WNBA',
            team_abbrevs=info['teams'],
            source='wnba_hardcoded',
        )
        for name, info in wnba_arenas.items()
    ]

    print(f" Found {len(stadiums)} WNBA arenas")
    return stadiums
|
|
|
|
|
|
def scrape_nwsl_stadiums() -> list[Stadium]:
    """
    Return the NWSL home stadiums as Stadium records.

    The data is hardcoded (name, city, state, coordinates, capacity,
    resident team abbreviations) rather than scraped, so no network
    access is required.
    """
    print(" Loading NWSL stadiums...")

    nwsl_stadiums = {
        'BMO Stadium': {'city': 'Los Angeles', 'state': 'CA', 'lat': 34.0128, 'lng': -118.2841, 'capacity': 22000, 'teams': ['ANG']},
        'WakeMed Soccer Park': {'city': 'Cary', 'state': 'NC', 'lat': 35.7645, 'lng': -78.7761, 'capacity': 10000, 'teams': ['NCC']},
        'SeatGeek Stadium': {'city': 'Bridgeview', 'state': 'IL', 'lat': 41.7653, 'lng': -87.8020, 'capacity': 20000, 'teams': ['CHI']},
        'Shell Energy Stadium': {'city': 'Houston', 'state': 'TX', 'lat': 29.7523, 'lng': -95.3526, 'capacity': 22039, 'teams': ['HOU']},
        'CPKC Stadium': {'city': 'Kansas City', 'state': 'MO', 'lat': 39.1243, 'lng': -94.8232, 'capacity': 11500, 'teams': ['KCC']},
        'Lynn Family Stadium': {'city': 'Louisville', 'state': 'KY', 'lat': 38.2210, 'lng': -85.7388, 'capacity': 15304, 'teams': ['LOU']},
        'Red Bull Arena': {'city': 'Harrison', 'state': 'NJ', 'lat': 40.7369, 'lng': -74.1503, 'capacity': 25000, 'teams': ['NJG']},
        'Inter&Co Stadium': {'city': 'Orlando', 'state': 'FL', 'lat': 28.5412, 'lng': -81.3896, 'capacity': 25500, 'teams': ['ORL']},
        'Providence Park': {'city': 'Portland', 'state': 'OR', 'lat': 45.5217, 'lng': -122.6918, 'capacity': 25218, 'teams': ['POR']},
        'Snapdragon Stadium': {'city': 'San Diego', 'state': 'CA', 'lat': 32.7839, 'lng': -117.1194, 'capacity': 32000, 'teams': ['SDW']},
        'PayPal Park': {'city': 'San Jose', 'state': 'CA', 'lat': 37.3512, 'lng': -121.9251, 'capacity': 18000, 'teams': ['SJE']},
        'Lumen Field': {'city': 'Seattle', 'state': 'WA', 'lat': 47.5952, 'lng': -122.3316, 'capacity': 69000, 'teams': ['SEA']},
        'America First Field': {'city': 'Sandy', 'state': 'UT', 'lat': 40.5829, 'lng': -111.8933, 'capacity': 20213, 'teams': ['UTA']},
        'Audi Field': {'city': 'Washington', 'state': 'DC', 'lat': 38.8687, 'lng': -77.0128, 'capacity': 20000, 'teams': ['WAS']},
    }

    # One Stadium per venue; the id is the snake_cased name truncated to 30 chars.
    stadiums = [
        Stadium(
            id=f"nwsl_{name.lower().replace(' ', '_')[:30]}",
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='NWSL',
            team_abbrevs=info['teams'],
            source='nwsl_hardcoded',
        )
        for name, info in nwsl_stadiums.items()
    ]

    print(f" Found {len(stadiums)} NWSL stadiums")
    return stadiums
|
|
|
|
|
|
def scrape_cbb_stadiums() -> list[Stadium]:
    """
    Fetch CBB (College Basketball) arena data from Wikipedia.

    Scrapes every "wikitable" on the List of NCAA Division I basketball
    arenas page. The tables carry no coordinates, so latitude/longitude
    are recorded as 0. Returns an empty list on any fetch/parse failure.
    """
    stadiums = []
    url = "https://en.wikipedia.org/wiki/List_of_NCAA_Division_I_basketball_arenas"

    print(" Fetching CBB arenas from Wikipedia...")
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'lxml')

        # Walk every data row of every wikitable on the page.
        for table in soup.find_all('table', class_='wikitable'):
            for row in table.find_all('tr')[1:]:  # skip header row
                cells = row.find_all(['td', 'th'])
                if len(cells) < 4:
                    continue
                try:
                    arena_name = cells[0].get_text(strip=True)
                    city_state = cells[1].get_text(strip=True) if len(cells) > 1 else ''
                    capacity_text = cells[2].get_text(strip=True) if len(cells) > 2 else '0'
                    team = cells[3].get_text(strip=True) if len(cells) > 3 else ''

                    # Capacity arrives like "12,345" — keep digits only.
                    capacity = int(re.sub(r'[^\d]', '', capacity_text) or 0)

                    # Location cell is typically "City, ST".
                    if ',' in city_state:
                        city = city_state.split(',')[0].strip()
                        state = city_state.split(',')[-1].strip()
                    else:
                        city, state = city_state, ''

                    # Skip rows without a usable name/capacity.
                    if not arena_name or capacity <= 0:
                        continue

                    stadiums.append(Stadium(
                        id=f"cbb_{arena_name.lower().replace(' ', '_')[:30]}",
                        name=arena_name,
                        city=city,
                        state=state,
                        latitude=0,  # Wikipedia doesn't have coords in table
                        longitude=0,
                        capacity=capacity,
                        sport='CBB',
                        team_abbrevs=[team[:3].upper()] if team else [],
                        source='wikipedia'
                    ))
                except (ValueError, IndexError):
                    # Malformed row — ignore and keep going.
                    continue

        print(f" Found {len(stadiums)} CBB arenas")
    except Exception as e:
        print(f" Error fetching CBB arenas: {e}")

    return stadiums
|
|
|
|
|
|
def scrape_all_stadiums() -> list[Stadium]:
    """
    Scrape stadium/venue data for ALL 8 sports.

    Runs each per-league scraper in turn (pro leagues first, college
    basketball last) and returns the combined list of venues.
    """
    print("\n" + "="*60)
    print("SCRAPING ALL STADIUMS/VENUES")
    print("="*60)

    # Pro leagues, then college sports — order matters for output grouping.
    scrapers = (
        scrape_nba_stadiums,
        scrape_mlb_stadiums,
        scrape_nhl_stadiums,
        scrape_nfl_stadiums,
        scrape_wnba_stadiums,
        scrape_mls_stadiums,
        scrape_nwsl_stadiums,
        scrape_cbb_stadiums,
    )

    all_stadiums = []
    for scraper in scrapers:
        all_stadiums.extend(scraper())

    print(f"\n TOTAL: {len(all_stadiums)} stadiums/venues across all sports")

    return all_stadiums
|
|
|
|
|
|
def generate_stadiums_from_teams() -> list[Stadium]:
    """
    Generate stadium data from team mappings with manual coordinates.

    This serves as a fallback/validation source. For each league, a
    manually-curated coordinate table (keyed by venue name) is joined
    against that league's *_TEAMS mapping; venues missing from the table
    fall back to zeroed coordinates so the record is still emitted.

    Returns:
        One Stadium per team across NBA, MLB, NHL, WNBA, MLS, NWSL and NFL,
        each with source='manual' and id 'manual_{sport}_{abbrev}'.
    """

    def _league_stadiums(teams: dict, coords: dict, sport: str, venue_key: str) -> list[Stadium]:
        """Build one Stadium per team for a single league.

        coords values are either (lat, lng) or (lat, lng, state, capacity);
        unknown venues get (0, 0, '', 0). venue_key is 'arena' or 'stadium'
        depending on which key the league's team mapping uses.
        """
        result = []
        for abbrev, info in teams.items():
            venue = info[venue_key]
            data = coords.get(venue, (0, 0, '', 0))
            result.append(Stadium(
                id=f"manual_{sport.lower()}_{abbrev.lower()}",
                name=venue,
                city=info['city'],
                state=data[2] if len(data) > 2 else '',
                latitude=data[0],
                longitude=data[1],
                capacity=data[3] if len(data) > 3 else 0,
                sport=sport,
                team_abbrevs=[abbrev],
                source='manual'
            ))
        return result

    # NBA Arenas with coordinates (manually curated; lat/lng only, so
    # state/capacity come out empty/zero for NBA records).
    nba_coords = {
        'State Farm Arena': (33.7573, -84.3963),
        'TD Garden': (42.3662, -71.0621),
        'Barclays Center': (40.6826, -73.9754),
        'Spectrum Center': (35.2251, -80.8392),
        'United Center': (41.8807, -87.6742),
        'Rocket Mortgage FieldHouse': (41.4965, -81.6882),
        'American Airlines Center': (32.7905, -96.8103),
        'Ball Arena': (39.7487, -105.0077),
        'Little Caesars Arena': (42.3411, -83.0553),
        'Chase Center': (37.7680, -122.3879),
        'Toyota Center': (29.7508, -95.3621),
        'Gainbridge Fieldhouse': (39.7640, -86.1555),
        'Intuit Dome': (33.9425, -118.3419),
        'Crypto.com Arena': (34.0430, -118.2673),
        'FedExForum': (35.1382, -90.0506),
        'Kaseya Center': (25.7814, -80.1870),
        'Fiserv Forum': (43.0451, -87.9174),
        'Target Center': (44.9795, -93.2761),
        'Smoothie King Center': (29.9490, -90.0821),
        'Madison Square Garden': (40.7505, -73.9934),
        'Paycom Center': (35.4634, -97.5151),
        'Kia Center': (28.5392, -81.3839),
        'Wells Fargo Center': (39.9012, -75.1720),
        'Footprint Center': (33.4457, -112.0712),
        'Moda Center': (45.5316, -122.6668),
        'Golden 1 Center': (38.5802, -121.4997),
        'Frost Bank Center': (29.4270, -98.4375),
        'Scotiabank Arena': (43.6435, -79.3791),
        'Delta Center': (40.7683, -111.9011),
        'Capital One Arena': (38.8982, -77.0209),
    }

    # MLB Stadiums with coordinates
    mlb_coords = {
        'Chase Field': (33.4453, -112.0667, 'AZ', 48686),
        'Truist Park': (33.8907, -84.4678, 'GA', 41084),
        'Oriole Park at Camden Yards': (39.2838, -76.6218, 'MD', 45971),
        'Fenway Park': (42.3467, -71.0972, 'MA', 37755),
        'Wrigley Field': (41.9484, -87.6553, 'IL', 41649),
        'Guaranteed Rate Field': (41.8299, -87.6338, 'IL', 40615),
        'Great American Ball Park': (39.0979, -84.5082, 'OH', 42319),
        'Progressive Field': (41.4962, -81.6852, 'OH', 34830),
        'Coors Field': (39.7559, -104.9942, 'CO', 50144),
        'Comerica Park': (42.3390, -83.0485, 'MI', 41083),
        'Minute Maid Park': (29.7573, -95.3555, 'TX', 41168),
        'Kauffman Stadium': (39.0517, -94.4803, 'MO', 37903),
        'Angel Stadium': (33.8003, -117.8827, 'CA', 45517),
        'Dodger Stadium': (34.0739, -118.2400, 'CA', 56000),
        'LoanDepot Park': (25.7781, -80.2196, 'FL', 36742),
        'American Family Field': (43.0280, -87.9712, 'WI', 41900),
        'Target Field': (44.9817, -93.2776, 'MN', 38544),
        'Citi Field': (40.7571, -73.8458, 'NY', 41922),
        'Yankee Stadium': (40.8296, -73.9262, 'NY', 46537),
        'Sutter Health Park': (38.5802, -121.5097, 'CA', 14014),
        'Citizens Bank Park': (39.9061, -75.1665, 'PA', 42792),
        'PNC Park': (40.4469, -80.0057, 'PA', 38362),
        'Petco Park': (32.7076, -117.1570, 'CA', 40209),
        'Oracle Park': (37.7786, -122.3893, 'CA', 41265),
        'T-Mobile Park': (47.5914, -122.3325, 'WA', 47929),
        'Busch Stadium': (38.6226, -90.1928, 'MO', 45494),
        'Tropicana Field': (27.7682, -82.6534, 'FL', 25000),
        'Globe Life Field': (32.7473, -97.0845, 'TX', 40300),
        'Rogers Centre': (43.6414, -79.3894, 'ON', 49282),
        'Nationals Park': (38.8730, -77.0074, 'DC', 41339),
    }

    # NHL Arenas with coordinates
    nhl_coords = {
        'Honda Center': (33.8078, -117.8765, 'CA', 17174),
        'Delta Center': (40.7683, -111.9011, 'UT', 18306),
        'TD Garden': (42.3662, -71.0621, 'MA', 17565),
        'KeyBank Center': (42.8750, -78.8764, 'NY', 19070),
        'Scotiabank Saddledome': (51.0374, -114.0519, 'AB', 19289),
        'PNC Arena': (35.8034, -78.7220, 'NC', 18680),
        'United Center': (41.8807, -87.6742, 'IL', 19717),
        'Ball Arena': (39.7487, -105.0077, 'CO', 18007),
        'Nationwide Arena': (39.9693, -83.0061, 'OH', 18500),
        'American Airlines Center': (32.7905, -96.8103, 'TX', 18532),
        'Little Caesars Arena': (42.3411, -83.0553, 'MI', 19515),
        'Rogers Place': (53.5469, -113.4978, 'AB', 18347),
        'Amerant Bank Arena': (26.1584, -80.3256, 'FL', 19250),
        'Crypto.com Arena': (34.0430, -118.2673, 'CA', 18230),
        'Xcel Energy Center': (44.9448, -93.1010, 'MN', 17954),
        'Bell Centre': (45.4961, -73.5693, 'QC', 21302),
        'Bridgestone Arena': (36.1592, -86.7785, 'TN', 17159),
        'Prudential Center': (40.7334, -74.1712, 'NJ', 16514),
        'UBS Arena': (40.7161, -73.7246, 'NY', 17255),
        'Madison Square Garden': (40.7505, -73.9934, 'NY', 18006),
        'Canadian Tire Centre': (45.2969, -75.9272, 'ON', 18652),
        'Wells Fargo Center': (39.9012, -75.1720, 'PA', 19543),
        'PPG Paints Arena': (40.4395, -79.9892, 'PA', 18387),
        'SAP Center': (37.3327, -121.9010, 'CA', 17562),
        'Climate Pledge Arena': (47.6221, -122.3540, 'WA', 17100),
        'Enterprise Center': (38.6268, -90.2025, 'MO', 18096),
        'Amalie Arena': (27.9426, -82.4519, 'FL', 19092),
        'Scotiabank Arena': (43.6435, -79.3791, 'ON', 18819),
        'Rogers Arena': (49.2778, -123.1089, 'BC', 18910),
        'T-Mobile Arena': (36.1028, -115.1784, 'NV', 17500),
        'Capital One Arena': (38.8982, -77.0209, 'DC', 18573),
        'Canada Life Centre': (49.8928, -97.1436, 'MB', 15321),
    }

    # WNBA Arenas with coordinates
    wnba_coords = {
        'Gateway Center Arena': (33.6534, -84.4480, 'GA', 3500),
        'Wintrust Arena': (41.8622, -87.6164, 'IL', 10387),
        'Mohegan Sun Arena': (41.4946, -72.0874, 'CT', 10000),
        'College Park Center': (32.7298, -97.1137, 'TX', 7000),
        'Chase Center': (37.7680, -122.3879, 'CA', 18064),
        'Gainbridge Fieldhouse': (39.7640, -86.1555, 'IN', 17274),
        'Michelob Ultra Arena': (36.0929, -115.1757, 'NV', 12000),
        'Crypto.com Arena': (34.0430, -118.2673, 'CA', 19068),
        'Target Center': (44.9795, -93.2761, 'MN', 17500),
        'Barclays Center': (40.6826, -73.9754, 'NY', 17732),
        'Footprint Center': (33.4457, -112.0712, 'AZ', 17000),
        'Climate Pledge Arena': (47.6221, -122.3540, 'WA', 17100),
        'Entertainment & Sports Arena': (38.8701, -76.9728, 'DC', 4200),
    }

    # MLS Stadiums with coordinates
    mls_coords = {
        'Mercedes-Benz Stadium': (33.7553, -84.4006, 'GA', 71000),
        'Q2 Stadium': (30.3876, -97.7200, 'TX', 20738),
        'Bank of America Stadium': (35.2258, -80.8528, 'NC', 74867),
        'Soldier Field': (41.8623, -87.6167, 'IL', 61500),
        'TQL Stadium': (39.1113, -84.5212, 'OH', 26000),
        "Dick's Sporting Goods Park": (39.8056, -104.8919, 'CO', 18061),
        'Lower.com Field': (39.9689, -83.0173, 'OH', 20371),
        'Toyota Stadium': (33.1546, -96.8353, 'TX', 20500),
        'Audi Field': (38.8686, -77.0128, 'DC', 20000),
        'Shell Energy Stadium': (29.7523, -95.3522, 'TX', 22039),
        'Dignity Health Sports Park': (33.8644, -118.2611, 'CA', 27000),
        'BMO Stadium': (34.0128, -118.2841, 'CA', 22000),
        'Chase Stadium': (26.1902, -80.1630, 'FL', 21550),
        'Allianz Field': (44.9532, -93.1653, 'MN', 19400),
        'Stade Saputo': (45.5628, -73.5530, 'QC', 19619),
        'Geodis Park': (36.1303, -86.7663, 'TN', 30000),
        'Gillette Stadium': (42.0909, -71.2643, 'MA', 65878),
        'Yankee Stadium': (40.8296, -73.9262, 'NY', 46537),
        'Red Bull Arena': (40.7368, -74.1503, 'NJ', 25000),
        'Inter&Co Stadium': (28.5411, -81.3899, 'FL', 25500),
        'Subaru Park': (39.8328, -75.3789, 'PA', 18500),
        'Providence Park': (45.5217, -122.6917, 'OR', 25218),
        'America First Field': (40.5828, -111.8933, 'UT', 20213),
        'PayPal Park': (37.3513, -121.9253, 'CA', 18000),
        'Lumen Field': (47.5952, -122.3316, 'WA', 68740),
        "Children's Mercy Park": (39.1218, -94.8234, 'KS', 18467),
        'CityPark': (38.6322, -90.2094, 'MO', 22500),
        'BMO Field': (43.6332, -79.4186, 'ON', 30000),
        'BC Place': (49.2768, -123.1118, 'BC', 54320),
        'Snapdragon Stadium': (32.7839, -117.1224, 'CA', 35000),
    }

    # NWSL Stadiums with coordinates
    # NOTE(review): no entry for 'Lynn Family Stadium' (Louisville) — if
    # NWSL_TEAMS maps a team there, that record gets zeroed coords; verify.
    nwsl_coords = {
        'BMO Stadium': (34.0128, -118.2841, 'CA', 22000),
        'PayPal Park': (37.3513, -121.9253, 'CA', 18000),
        'SeatGeek Stadium': (41.6462, -87.7304, 'IL', 20000),
        'Shell Energy Stadium': (29.7523, -95.3522, 'TX', 22039),
        'CPKC Stadium': (39.0851, -94.5582, 'KS', 11500),
        'Red Bull Arena': (40.7368, -74.1503, 'NJ', 25000),
        'WakeMed Soccer Park': (35.8589, -78.7989, 'NC', 10000),
        'Inter&Co Stadium': (28.5411, -81.3899, 'FL', 25500),
        'Providence Park': (45.5217, -122.6917, 'OR', 25218),
        'Lumen Field': (47.5952, -122.3316, 'WA', 68740),
        'Snapdragon Stadium': (32.7839, -117.1224, 'CA', 35000),
        'America First Field': (40.5828, -111.8933, 'UT', 20213),
        'Audi Field': (38.8686, -77.0128, 'DC', 20000),
    }

    # NFL Stadiums with coordinates
    nfl_coords = {
        'State Farm Stadium': (33.5276, -112.2626, 'AZ', 63400),
        'Mercedes-Benz Stadium': (33.7553, -84.4006, 'GA', 71000),
        'M&T Bank Stadium': (39.2780, -76.6227, 'MD', 71008),
        'Highmark Stadium': (42.7738, -78.7870, 'NY', 71608),
        'Bank of America Stadium': (35.2258, -80.8528, 'NC', 74867),
        'Soldier Field': (41.8623, -87.6167, 'IL', 61500),
        'Paycor Stadium': (39.0954, -84.5160, 'OH', 65515),
        'Cleveland Browns Stadium': (41.5061, -81.6995, 'OH', 67431),
        'AT&T Stadium': (32.7480, -97.0928, 'TX', 80000),
        'Empower Field at Mile High': (39.7439, -105.0201, 'CO', 76125),
        'Ford Field': (42.3400, -83.0456, 'MI', 65000),
        'Lambeau Field': (44.5013, -88.0622, 'WI', 81435),
        'NRG Stadium': (29.6847, -95.4107, 'TX', 72220),
        'Lucas Oil Stadium': (39.7601, -86.1639, 'IN', 67000),
        'EverBank Stadium': (30.3239, -81.6373, 'FL', 67814),
        'GEHA Field at Arrowhead Stadium': (39.0489, -94.4839, 'MO', 76416),
        'Allegiant Stadium': (36.0909, -115.1833, 'NV', 65000),
        'SoFi Stadium': (33.9535, -118.3392, 'CA', 70240),
        'Hard Rock Stadium': (25.9580, -80.2389, 'FL', 65326),
        'U.S. Bank Stadium': (44.9737, -93.2577, 'MN', 66655),
        'Gillette Stadium': (42.0909, -71.2643, 'MA', 65878),
        'Caesars Superdome': (29.9511, -90.0812, 'LA', 73208),
        'MetLife Stadium': (40.8128, -74.0742, 'NJ', 82500),
        'Lincoln Financial Field': (39.9008, -75.1674, 'PA', 69176),
        'Acrisure Stadium': (40.4468, -80.0158, 'PA', 68400),
        "Levi's Stadium": (37.4032, -121.9698, 'CA', 68500),
        'Lumen Field': (47.5952, -122.3316, 'WA', 68740),
        'Raymond James Stadium': (27.9759, -82.5033, 'FL', 65618),
        'Nissan Stadium': (36.1665, -86.7713, 'TN', 69143),
        'Northwest Stadium': (38.9076, -76.8645, 'MD', 67617),
    }

    # Same output order as the original implementation: NBA, MLB, NHL,
    # WNBA, MLS, NWSL, NFL. 'arena' vs 'stadium' matches each *_TEAMS map.
    stadiums = []
    stadiums.extend(_league_stadiums(NBA_TEAMS, nba_coords, 'NBA', 'arena'))
    stadiums.extend(_league_stadiums(MLB_TEAMS, mlb_coords, 'MLB', 'stadium'))
    stadiums.extend(_league_stadiums(NHL_TEAMS, nhl_coords, 'NHL', 'arena'))
    stadiums.extend(_league_stadiums(WNBA_TEAMS, wnba_coords, 'WNBA', 'arena'))
    stadiums.extend(_league_stadiums(MLS_TEAMS, mls_coords, 'MLS', 'stadium'))
    stadiums.extend(_league_stadiums(NWSL_TEAMS, nwsl_coords, 'NWSL', 'stadium'))
    stadiums.extend(_league_stadiums(NFL_TEAMS, nfl_coords, 'NFL', 'stadium'))

    return stadiums
|
|
|
|
|
|
# =============================================================================
|
|
# HELPERS
|
|
# =============================================================================
|
|
|
|
def assign_stable_ids(games: list[Game], sport: str, season: str) -> list[Game]:
    """
    Assign IDs based on matchup + date.
    Format: {sport}_{season}_{away}_{home}_{MMDD} (or {MMDD}_2 for doubleheaders)

    When games are rescheduled, the old ID becomes orphaned and a new one is created.
    Use --delete-all before import to clean up orphaned records.
    """
    sport_key = sport.lower()
    season_key = season.replace('-', '')

    # How many times each base ID has occurred so far (doubleheader detection).
    occurrences: dict[str, int] = {}

    for game in games:
        # MMDD comes from the ISO date string (YYYY-MM-DD); "0000" if malformed.
        parts = game.date.split('-')
        mmdd = parts[1] + parts[2] if len(parts) == 3 else "0000"

        base_id = (
            f"{sport_key}_{season_key}_"
            f"{game.away_team_abbrev.lower()}_{game.home_team_abbrev.lower()}_{mmdd}"
        )
        count = occurrences.get(base_id, 0) + 1
        occurrences[base_id] = count

        # Second and later games of a doubleheader get a numeric suffix (_2, _3, ...).
        game.id = base_id if count == 1 else f"{base_id}_{count}"

    return games
|
|
|
|
|
|
def get_team_abbrev(team_name: str, sport: str) -> str:
    """Get team abbreviation from full name.

    Matches the first team (in mapping order) whose full name equals or
    contains the given name, case-insensitively. Falls back to the first
    three letters uppercased when no mapping matches.
    """
    league_maps = {
        'NBA': NBA_TEAMS,
        'MLB': MLB_TEAMS,
        'NHL': NHL_TEAMS,
        'WNBA': WNBA_TEAMS,
        'MLS': MLS_TEAMS,
        'NWSL': NWSL_TEAMS,
    }
    teams = league_maps.get(sport, {})
    needle = team_name.lower()

    for abbrev, info in teams.items():
        full_name = info['name'].lower()
        # Exact match or substring match (e.g. "Lakers" within "Los Angeles Lakers").
        if full_name == needle or needle in full_name:
            return abbrev

    # No mapping matched — fall back to first three letters.
    return team_name[:3].upper()
|
|
|
|
|
|
def validate_games(games_by_source: dict) -> dict:
    """
    Cross-validate games from multiple sources.

    The first source in the dict is treated as primary; every other source
    is compared against it. Detects games missing from either side and
    date disagreements for games present in both.

    Args:
        games_by_source: mapping of source name -> list of game records
            (objects exposing at least .id and .date).

    Returns:
        dict with lists under 'missing_in_source', 'date_mismatch',
        'time_mismatch' and 'venue_mismatch'. With fewer than two sources
        all lists are empty.
    """
    discrepancies = {
        'missing_in_source': [],
        'date_mismatch': [],
        # Reserved buckets: start-time/venue field names aren't compared yet.
        'time_mismatch': [],
        'venue_mismatch': [],
    }

    sources = list(games_by_source.keys())
    if len(sources) < 2:
        return discrepancies

    primary = sources[0]
    primary_games = {g.id: g for g in games_by_source[primary]}

    for source in sources[1:]:
        secondary_games = {g.id: g for g in games_by_source[source]}

        # Games the primary has that this source lacks; for shared IDs,
        # flag any disagreement on the scheduled date.
        for game_id, game in primary_games.items():
            if game_id not in secondary_games:
                discrepancies['missing_in_source'].append({
                    'game_id': game_id,
                    'present_in': primary,
                    'missing_in': source
                })
            elif game.date != secondary_games[game_id].date:
                discrepancies['date_mismatch'].append({
                    'game_id': game_id,
                    'primary_date': game.date,
                    'secondary_date': secondary_games[game_id].date,
                    'source': source,
                })

        # Symmetric check: games this source has that the primary lacks.
        for game_id in secondary_games:
            if game_id not in primary_games:
                discrepancies['missing_in_source'].append({
                    'game_id': game_id,
                    'present_in': source,
                    'missing_in': primary
                })

    return discrepancies
|
|
|
|
|
|
def export_to_json(games: list[Game], stadiums: list[Stadium], output_dir: Path):
    """
    Export scraped data to organized JSON files.

    Structure:
        data/
            games/
                mlb_2025.json
                nba_2025.json
                ...
            canonical/
                stadiums.json
            stadiums.json (legacy, for backward compatibility)

    Also writes games.json (combined), plus games.csv / stadiums.csv
    mirrors for easy viewing.

    Args:
        games: all scraped games (any mix of sports/seasons).
        stadiums: all scraped venues.
        output_dir: root output directory; created if missing.
    """
    from collections import defaultdict

    output_dir.mkdir(parents=True, exist_ok=True)

    # Create subdirectories
    games_dir = output_dir / 'games'
    canonical_dir = output_dir / 'canonical'
    games_dir.mkdir(exist_ok=True)
    canonical_dir.mkdir(exist_ok=True)

    # Group games by "{sport}_{season}" so each league/season gets its own file.
    games_by_sport_season = defaultdict(list)
    for game in games:
        games_by_sport_season[f"{game.sport.lower()}_{game.season}"].append(game)

    # Export games by sport/season
    total_exported = 0
    for key, sport_games in games_by_sport_season.items():
        games_data = [asdict(g) for g in sport_games]
        with open(games_dir / f"{key}.json", 'w') as f:
            json.dump(games_data, f, indent=2)
        print(f" Exported {len(sport_games):,} games to games/{key}.json")
        total_exported += len(sport_games)

    # Combined games.json kept for backward compatibility.
    all_games_data = [asdict(g) for g in games]
    with open(output_dir / 'games.json', 'w') as f:
        json.dump(all_games_data, f, indent=2)

    # Stadiums: canonical location plus the legacy root copy.
    stadiums_data = [asdict(s) for s in stadiums]
    for stadiums_path in (canonical_dir / 'stadiums.json', output_dir / 'stadiums.json'):
        with open(stadiums_path, 'w') as f:
            json.dump(stadiums_data, f, indent=2)

    # CSV mirrors for easy viewing
    if games:
        pd.DataFrame(all_games_data).to_csv(output_dir / 'games.csv', index=False)
    if stadiums:
        pd.DataFrame(stadiums_data).to_csv(output_dir / 'stadiums.csv', index=False)

    print(f"\nExported {total_exported:,} games across {len(games_by_sport_season)} sport/season files")
    print(f"Exported {len(stadiums):,} stadiums to canonical/stadiums.json")
|
|
|
|
|
|
# =============================================================================
|
|
# MAIN
|
|
# =============================================================================
|
|
|
|
def main():
    """CLI entry point: scrape stadium data and per-sport schedules, then export.

    Flags:
        --sport           One league (nba/mlb/nhl/nfl/wnba/mls/nwsl/cbb) or 'all'.
        --season          Season ending year (e.g. 2026 for the 2025-26 season).
        --stadiums-only   Scrape stadiums (legacy method) and exit.
        --stadiums-update Use the comprehensive stadium scrapers for all 8 sports.
        --output          Output directory (default ./data).
    """
    parser = argparse.ArgumentParser(description='Scrape sports schedules')
    parser.add_argument('--sport', choices=['nba', 'mlb', 'nhl', 'nfl', 'wnba', 'mls', 'nwsl', 'cbb', 'all'], default='all')
    parser.add_argument('--season', type=int, default=2026, help='Season year (ending year)')
    parser.add_argument('--stadiums-only', action='store_true', help='Only scrape stadium data (legacy method)')
    parser.add_argument('--stadiums-update', action='store_true', help='Scrape ALL stadium data for all 8 sports (comprehensive)')
    parser.add_argument('--output', type=str, default='./data', help='Output directory')

    args = parser.parse_args()
    output_dir = Path(args.output)

    all_games = []
    all_stadiums = []

    # Scrape stadiums
    print("\n" + "="*60)
    print("SCRAPING STADIUMS")
    print("="*60)

    if args.stadiums_update:
        # Comprehensive scraping for all 8 supported sports
        # (CFB/NASCAR/PGA were removed; the count here must match --stadiums-update help text).
        print("Using comprehensive stadium scrapers for all 8 sports...")
        all_stadiums.extend(scrape_all_stadiums())
        print(f"  Total stadiums scraped: {len(all_stadiums)}")
    else:
        # Legacy method (HIFLD + manual team mappings)
        all_stadiums.extend(scrape_stadiums_hifld())
        all_stadiums.extend(generate_stadiums_from_teams())

    # If stadiums-only mode, export and exit (skip schedule scraping)
    if args.stadiums_only:
        export_to_json([], all_stadiums, output_dir)
        return

    # Per-sport scrape configuration: (sport name, prioritized sources, cross_year).
    # Order is preserved from the original hand-written sections so printed
    # output and export ordering stay identical under --sport all.
    # cross_year=True leagues label seasons like "2025-26"; the rest use the
    # plain ending year (e.g. "2026").
    sport_configs = [
        ('NBA', [
            ScraperSource('Basketball-Reference', scrape_nba_basketball_reference, priority=1, min_games=500),
            ScraperSource('ESPN', scrape_nba_espn, priority=2, min_games=500),
            ScraperSource('CBS Sports', scrape_nba_cbssports, priority=3, min_games=100),
        ], True),
        ('MLB', [
            ScraperSource('MLB Stats API', scrape_mlb_statsapi, priority=1, min_games=1000),
            ScraperSource('Baseball-Reference', scrape_mlb_baseball_reference, priority=2, min_games=500),
            ScraperSource('ESPN', scrape_mlb_espn, priority=3, min_games=500),
        ], False),
        ('NHL', [
            ScraperSource('Hockey-Reference', scrape_nhl_hockey_reference, priority=1, min_games=500),
            ScraperSource('ESPN', scrape_nhl_espn, priority=2, min_games=500),
            ScraperSource('NHL API', scrape_nhl_api, priority=3, min_games=100),
        ], True),
        ('WNBA', [
            ScraperSource('ESPN', scrape_wnba_espn, priority=1, min_games=100),
            ScraperSource('Basketball-Reference', scrape_wnba_basketball_reference, priority=2, min_games=100),
            ScraperSource('CBS Sports', scrape_wnba_cbssports, priority=3, min_games=50),
        ], False),
        ('MLS', [
            ScraperSource('ESPN', scrape_mls_espn, priority=1, min_games=200),
            ScraperSource('FBref', scrape_mls_fbref, priority=2, min_games=100),
            ScraperSource('MLSSoccer.com', scrape_mls_mlssoccer, priority=3, min_games=100),
        ], False),
        ('NWSL', [
            ScraperSource('ESPN', scrape_nwsl_espn, priority=1, min_games=100),
            ScraperSource('FBref', scrape_nwsl_fbref, priority=2, min_games=50),
            ScraperSource('NWSL.com', scrape_nwsl_nwslsoccer, priority=3, min_games=50),
        ], False),
        ('NFL', [
            ScraperSource('ESPN', scrape_nfl_espn, priority=1, min_games=200),
            ScraperSource('Pro-Football-Reference', scrape_nfl_pro_football_reference, priority=2, min_games=200),
            ScraperSource('CBS Sports', scrape_nfl_cbssports, priority=3, min_games=100),
        ], True),
        ('CBB', [
            ScraperSource('ESPN', scrape_cbb_espn, priority=1, min_games=1000),
            ScraperSource('Sports-Reference', scrape_cbb_sports_reference, priority=2, min_games=500),
            ScraperSource('CBS Sports', scrape_cbb_cbssports, priority=3, min_games=300),
        ], True),
    ]

    # Scrape schedules with multi-source fallback
    for sport_name, sources, cross_year in sport_configs:
        if args.sport not in (sport_name.lower(), 'all'):
            continue

        print("\n" + "="*60)
        print(f"SCRAPING {sport_name} {args.season}")
        print("="*60)

        sport_games = scrape_with_fallback(sport_name, args.season, sources)
        # e.g. 2026 -> "2025-26" for cross-year leagues, "2026" otherwise
        season_label = f"{args.season-1}-{str(args.season)[2:]}" if cross_year else str(args.season)
        sport_games = assign_stable_ids(sport_games, sport_name, season_label)
        all_games.extend(sport_games)

    # Export
    print("\n" + "="*60)
    print("EXPORTING DATA")
    print("="*60)

    export_to_json(all_games, all_stadiums, output_dir)

    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Total games scraped: {len(all_games)}")
    print(f"Total stadiums: {len(all_stadiums)}")

    # Games by sport (dict insertion order follows scrape order)
    by_sport = {}
    for g in all_games:
        by_sport[g.sport] = by_sport.get(g.sport, 0) + 1
    for sport, count in by_sport.items():
        print(f"  {sport}: {count} games")
# Standard entry guard: run the scraper pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()