#!/usr/bin/env python3
"""
NFL schedule and stadium scrapers for SportsTime.

This module provides:
- NFL game scrapers (ESPN, Pro-Football-Reference, CBS Sports)
- NFL stadium scrapers (ScoreBot, GeoJSON, hardcoded)
- Multi-source fallback configurations
"""

from datetime import datetime
from typing import Optional

import requests

# Support both direct execution and import from parent directory
try:
    from core import (
        Game,
        Stadium,
        ScraperSource,
        StadiumScraperSource,
        fetch_page,
        scrape_with_fallback,
        scrape_stadiums_with_fallback,
    )
except ImportError:
    from Scripts.core import (
        Game,
        Stadium,
        ScraperSource,
        StadiumScraperSource,
        fetch_page,
        scrape_with_fallback,
        scrape_stadiums_with_fallback,
    )


__all__ = [
    # Team data
    'NFL_TEAMS',
    # Game scrapers
    'scrape_nfl_espn',
    'scrape_nfl_pro_football_reference',
    'scrape_nfl_cbssports',
    # Stadium scrapers
    'scrape_nfl_stadiums',
    'scrape_nfl_stadiums_scorebot',
    'scrape_nfl_stadiums_geojson',
    'scrape_nfl_stadiums_hardcoded',
    # Source configurations
    'NFL_GAME_SOURCES',
    'NFL_STADIUM_SOURCES',
    # Convenience functions
    'scrape_nfl_games',
    'get_nfl_season_string',
]


# =============================================================================
# TEAM MAPPINGS
# =============================================================================

# Keyed by team abbreviation; each value carries the team's full name, the
# city of its home venue and the venue name.
NFL_TEAMS = {
    'ARI': {'name': 'Arizona Cardinals', 'city': 'Glendale', 'stadium': 'State Farm Stadium'},
    'ATL': {'name': 'Atlanta Falcons', 'city': 'Atlanta', 'stadium': 'Mercedes-Benz Stadium'},
    'BAL': {'name': 'Baltimore Ravens', 'city': 'Baltimore', 'stadium': 'M&T Bank Stadium'},
    'BUF': {'name': 'Buffalo Bills', 'city': 'Orchard Park', 'stadium': 'Highmark Stadium'},
    'CAR': {'name': 'Carolina Panthers', 'city': 'Charlotte', 'stadium': 'Bank of America Stadium'},
    'CHI': {'name': 'Chicago Bears', 'city': 'Chicago', 'stadium': 'Soldier Field'},
    'CIN': {'name': 'Cincinnati Bengals', 'city': 'Cincinnati', 'stadium': 'Paycor Stadium'},
    'CLE': {'name': 'Cleveland Browns', 'city': 'Cleveland', 'stadium': 'Cleveland Browns Stadium'},
    'DAL': {'name': 'Dallas Cowboys', 'city': 'Arlington', 'stadium': 'AT&T Stadium'},
    'DEN': {'name': 'Denver Broncos', 'city': 'Denver', 'stadium': 'Empower Field at Mile High'},
    'DET': {'name': 'Detroit Lions', 'city': 'Detroit', 'stadium': 'Ford Field'},
    'GB': {'name': 'Green Bay Packers', 'city': 'Green Bay', 'stadium': 'Lambeau Field'},
    'HOU': {'name': 'Houston Texans', 'city': 'Houston', 'stadium': 'NRG Stadium'},
    'IND': {'name': 'Indianapolis Colts', 'city': 'Indianapolis', 'stadium': 'Lucas Oil Stadium'},
    'JAX': {'name': 'Jacksonville Jaguars', 'city': 'Jacksonville', 'stadium': 'EverBank Stadium'},
    'KC': {'name': 'Kansas City Chiefs', 'city': 'Kansas City', 'stadium': 'GEHA Field at Arrowhead Stadium'},
    'LV': {'name': 'Las Vegas Raiders', 'city': 'Las Vegas', 'stadium': 'Allegiant Stadium'},
    'LAC': {'name': 'Los Angeles Chargers', 'city': 'Inglewood', 'stadium': 'SoFi Stadium'},
    'LAR': {'name': 'Los Angeles Rams', 'city': 'Inglewood', 'stadium': 'SoFi Stadium'},
    'MIA': {'name': 'Miami Dolphins', 'city': 'Miami Gardens', 'stadium': 'Hard Rock Stadium'},
    'MIN': {'name': 'Minnesota Vikings', 'city': 'Minneapolis', 'stadium': 'U.S. Bank Stadium'},
    'NE': {'name': 'New England Patriots', 'city': 'Foxborough', 'stadium': 'Gillette Stadium'},
    'NO': {'name': 'New Orleans Saints', 'city': 'New Orleans', 'stadium': 'Caesars Superdome'},
    'NYG': {'name': 'New York Giants', 'city': 'East Rutherford', 'stadium': 'MetLife Stadium'},
    'NYJ': {'name': 'New York Jets', 'city': 'East Rutherford', 'stadium': 'MetLife Stadium'},
    'PHI': {'name': 'Philadelphia Eagles', 'city': 'Philadelphia', 'stadium': 'Lincoln Financial Field'},
    'PIT': {'name': 'Pittsburgh Steelers', 'city': 'Pittsburgh', 'stadium': 'Acrisure Stadium'},
    'SF': {'name': 'San Francisco 49ers', 'city': 'Santa Clara', 'stadium': "Levi's Stadium"},
    'SEA': {'name': 'Seattle Seahawks', 'city': 'Seattle', 'stadium': 'Lumen Field'},
    'TB': {'name': 'Tampa Bay Buccaneers', 'city': 'Tampa', 'stadium': 'Raymond James Stadium'},
    'TEN': {'name': 'Tennessee Titans', 'city': 'Nashville', 'stadium': 'Nissan Stadium'},
    'WAS': {'name': 'Washington Commanders', 'city': 'Landover', 'stadium': 'Northwest Stadium'},
}


def get_nfl_team_abbrev(team_name: str) -> str:
    """
    Get NFL team abbreviation from a team name.

    Matching is case-insensitive and tried from most to least specific so a
    loose match can never shadow a precise one:

    1. the input already is a known abbreviation (e.g. "NE" -> "NE");
    2. the input equals a full team name (e.g. "Dallas Cowboys" -> "DAL");
    3. the input is a fragment of a full team name (e.g. "Cowboys" -> "DAL").
       Ambiguous fragments such as "New York" resolve to the first matching
       entry in NFL_TEAMS.

    Args:
        team_name: Full name, name fragment or abbreviation of a team.

    Returns:
        The matching key of NFL_TEAMS; otherwise the first three characters
        of the (stripped) input, upper-cased. Empty or whitespace-only input
        returns an empty string.
    """
    cleaned = (team_name or '').strip()
    if not cleaned:
        # An empty string is a substring of every name, so without this guard
        # it would resolve to the first team in NFL_TEAMS.
        return ''

    # Known abbreviations pass straight through. Checking this first keeps
    # short codes from being captured by the substring pass below
    # ("NE" is inside "Minnesota", "CAR" is inside "Cardinals").
    as_code = cleaned.upper()
    if as_code in NFL_TEAMS:
        return as_code

    needle = cleaned.lower()
    lowered_names = [(abbrev, info['name'].lower()) for abbrev, info in NFL_TEAMS.items()]

    for abbrev, full_name in lowered_names:
        if full_name == needle:
            return abbrev

    for abbrev, full_name in lowered_names:
        if needle in full_name:
            return abbrev

    # Return first 3 letters as fallback
    return cleaned[:3].upper()


def get_nfl_season_string(season: int) -> str:
    """
    Get NFL season string in "2025-26" format.

    Args:
        season: The ending year of the season (e.g., 2026 for 2025-26 season)

    Returns:
        Season string like "2025-26"
    """
    return f"{season-1}-{str(season)[2:]}"


# =============================================================================
# GAME SCRAPERS
# =============================================================================

def _parse_espn_event(event: dict, season: int) -> Optional[Game]:
    """Convert one ESPN scoreboard event into a Game, or None if it is incomplete."""
    raw_date = event.get('date', '')
    # NOTE(review): ESPN timestamps look like ISO-8601 in UTC; if so, late
    # kick-offs will carry the following calendar day - confirm with consumers.
    date_str = raw_date[:10]  # YYYY-MM-DD
    time_str = raw_date[11:16] if len(raw_date) > 11 else None

    competitions = event.get('competitions', [{}])
    if not competitions:
        return None

    comp = competitions[0]
    competitors = comp.get('competitors', [])
    if len(competitors) < 2:
        return None

    home_team = away_team = None
    home_abbrev = away_abbrev = None
    for entry in competitors:
        team_data = entry.get('team', {})
        name = team_data.get('displayName', team_data.get('name', ''))
        abbrev = team_data.get('abbreviation', '')
        if entry.get('homeAway') == 'home':
            home_team, home_abbrev = name, abbrev
        else:
            away_team, away_abbrev = name, abbrev

    if not home_team or not away_team:
        return None

    # Resolve abbreviations BEFORE building the id so the id and the
    # *_team_abbrev fields always agree, even when ESPN omits an abbreviation.
    home_abbrev = home_abbrev or get_nfl_team_abbrev(home_team)
    away_abbrev = away_abbrev or get_nfl_team_abbrev(away_team)

    venue = comp.get('venue', {}).get('fullName', '')

    return Game(
        id=f"nfl_{date_str}_{away_abbrev}_{home_abbrev}".lower(),
        sport='NFL',
        season=get_nfl_season_string(season),
        date=date_str,
        time=time_str,
        home_team=home_team,
        away_team=away_team,
        home_team_abbrev=home_abbrev,
        away_team_abbrev=away_abbrev,
        venue=venue,
        source='espn.com'
    )


def _scrape_espn_schedule(sport: str, league: str, season: int, date_range: tuple[str, str]) -> list[Game]:
    """
    Fetch schedule from ESPN API.

    Best-effort: any request/decoding failure is printed and whatever was
    collected so far (possibly nothing) is returned; nothing is raised.

    Args:
        sport: 'football'
        league: 'nfl'
        season: Season year
        date_range: (start_date, end_date) in YYYYMMDD format

    Returns:
        List of Game objects parsed from the scoreboard response.
    """
    games = []
    print(f"Fetching NFL {season} from ESPN API...")

    url = f"https://site.api.espn.com/apis/site/v2/sports/{sport}/{league}/scoreboard"
    params = {
        'dates': f"{date_range[0]}-{date_range[1]}",
        'limit': 1000
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()

        skipped = 0
        for event in data.get('events', []):
            try:
                game = _parse_espn_event(event, season)
            except Exception:
                # One malformed event must not discard the rest of the season.
                skipped += 1
                continue
            if game is not None:
                games.append(game)

        print(f"  Found {len(games)} games from ESPN")
        if skipped:
            print(f"  Skipped {skipped} malformed ESPN events")

    except Exception as e:
        # Deliberately broad: this is the boundary the fallback chain relies
        # on to get a (possibly empty) list instead of an exception.
        print(f"Error fetching ESPN NFL: {e}")

    return games


def scrape_nfl_espn(season: int) -> list[Game]:
    """Fetch NFL schedule from ESPN API for the season ending in `season`."""
    # NFL season: September - February (spans years)
    start = f"{season-1}0901"
    end = f"{season}0228"
    return _scrape_espn_schedule('football', 'nfl', season, (start, end))


def _parse_pfr_date(date_str: str, year: int) -> Optional[str]:
    """
    Normalise a Pro-Football-Reference date cell to YYYY-MM-DD.

    Accepts either an ISO date ("2025-09-07") or a "Month day" string
    ("September 7"); for the latter, January/February are placed in
    `year + 1`, every other month in `year`.

    Returns:
        The formatted date, or None when the text is not a date.
    """
    try:
        if '-' in date_str:
            parsed = datetime.strptime(date_str, '%Y-%m-%d')
        else:
            # Add year based on month
            month = date_str.split()[0]
            game_year = year + 1 if month in ('January', 'February') else year
            parsed = datetime.strptime(f"{date_str}, {game_year}", '%B %d, %Y')
    except (ValueError, IndexError):
        # ValueError: text is not a date; IndexError: the cell was empty.
        return None
    return parsed.strftime('%Y-%m-%d')


def scrape_nfl_pro_football_reference(season: int) -> list[Game]:
    """
    Scrape NFL schedule from Pro-Football-Reference.

    URL: https://www.pro-football-reference.com/years/{YEAR}/games.htm
    The URL year is the starting year of the season, i.e. `season - 1`.

    Args:
        season: Season ending year (e.g., 2026 for 2025-26 season)

    Returns:
        List of Game objects; empty when the page or its games table is
        unavailable. Rows that cannot be parsed are skipped.
    """
    games = []
    year = season - 1  # PFR uses starting year
    url = f"https://www.pro-football-reference.com/years/{year}/games.htm"

    print(f"Scraping NFL {season} from Pro-Football-Reference...")
    soup = fetch_page(url, 'pro-football-reference.com')
    if not soup:
        return games

    table = soup.find('table', {'id': 'games'})
    if not table:
        print("  Could not find games table")
        return games

    tbody = table.find('tbody')
    if not tbody:
        return games

    for row in tbody.find_all('tr'):
        # Repeated header rows inside the body carry the 'thead' class.
        if 'thead' in (row.get('class') or []):
            continue

        try:
            date_cell = row.find('td', {'data-stat': 'game_date'})
            if date_cell is None:
                continue

            winner_cell = row.find('td', {'data-stat': 'winner'})
            loser_cell = row.find('td', {'data-stat': 'loser'})
            location_cell = row.find('td', {'data-stat': 'game_location'})
            if winner_cell is None or loser_cell is None:
                continue

            winner_link = winner_cell.find('a')
            loser_link = loser_cell.find('a')
            winner = (winner_link.text if winner_link else winner_cell.text).strip()
            loser = (loser_link.text if loser_link else loser_cell.text).strip()
            if not winner or not loser:
                # A row without both team names cannot describe a game.
                continue

            # Determine home/away - '@' in game_location means winner was away
            if location_cell is not None and '@' in location_cell.text:
                home_team, away_team = loser, winner
            else:
                home_team, away_team = winner, loser

            # Convert date (e.g., "September 7" or "2025-09-07")
            date_formatted = _parse_pfr_date(date_cell.text.strip(), year)
            if date_formatted is None:
                continue

            away_abbrev = get_nfl_team_abbrev(away_team)
            home_abbrev = get_nfl_team_abbrev(home_team)
            game_id = f"nfl_{date_formatted}_{away_abbrev}_{home_abbrev}".lower().replace(' ', '')

            games.append(Game(
                id=game_id,
                sport='NFL',
                season=get_nfl_season_string(season),
                date=date_formatted,
                time=None,
                home_team=home_team,
                away_team=away_team,
                home_team_abbrev=home_abbrev,
                away_team_abbrev=away_abbrev,
                venue='',
                source='pro-football-reference.com'
            ))

        except Exception:
            # Best-effort scrape: an unexpected row shape skips only that row.
            continue

    print(f"  Found {len(games)} games from Pro-Football-Reference")
    return games


def scrape_nfl_cbssports(season: int) -> list[Game]:
    """
    Scrape NFL schedule from CBS Sports.

    Provides structured schedule data via web scraping.

    TODO(review): the game date is NOT scraped. Every game is stamped with
    the date on which the scraper runs, so dates are wrong and repeat
    match-ups get identical ids. Treat this source as a last resort until the
    date is read from the page.

    Args:
        season: Season ending year (e.g., 2026 for 2025-26 season)

    Returns:
        List of Game objects; empty when the page cannot be fetched.
    """
    games = []
    year = season - 1  # CBS uses starting year

    print(f"Fetching NFL {season} from CBS Sports...")

    # CBS Sports schedule endpoint
    url = f"https://www.cbssports.com/nfl/schedule/{year}/regular/"

    soup = fetch_page(url, 'cbssports.com')
    if not soup:
        return games

    # Find game tables
    for table in soup.find_all('table', class_='TableBase-table'):
        for row in table.find_all('tr'):
            try:
                cells = row.find_all('td')
                if len(cells) < 3:
                    continue

                # Parse matchup: first cell is the away side, second the home side
                away_team = cells[0].get_text(strip=True)
                home_team = cells[1].get_text(strip=True)
                if not away_team or not home_team:
                    continue

                # CBS includes @ symbol
                away_team = away_team.replace('@', '').strip()

                # Placeholder - see the TODO in the docstring.
                date_formatted = datetime.now().strftime('%Y-%m-%d')

                away_abbrev = get_nfl_team_abbrev(away_team)
                home_abbrev = get_nfl_team_abbrev(home_team)
                game_id = f"nfl_{date_formatted}_{away_abbrev}_{home_abbrev}".lower().replace(' ', '')

                games.append(Game(
                    id=game_id,
                    sport='NFL',
                    season=get_nfl_season_string(season),
                    date=date_formatted,
                    time=None,
                    home_team=home_team,
                    away_team=away_team,
                    home_team_abbrev=home_abbrev,
                    away_team_abbrev=away_abbrev,
                    venue='',
                    source='cbssports.com'
                ))

            except Exception:
                # Best-effort scrape: an unexpected row shape skips only that row.
                continue

    print(f"  Found {len(games)} games from CBS Sports")
    return games


# =============================================================================
# STADIUM SCRAPERS
# =============================================================================

def _nfl_stadium_id(name: str) -> str:
    """Build a stadium id: lower-cased name, spaces to underscores, at most 30 chars, 'nfl_' prefix."""
    return f"nfl_{name.lower().replace(' ', '_')[:30]}"


def scrape_nfl_stadiums_scorebot() -> list[Stadium]:
    """
    Source 1: NFLScoreBot/stadiums GitHub (public domain).

    Returns:
        List of Stadium objects, one per entry of the downloaded JSON object.

    Raises:
        requests.RequestException: if the download fails or returns an
            HTTP error status.
    """
    url = "https://raw.githubusercontent.com/NFLScoreBot/stadiums/main/stadiums.json"

    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    stadiums = []
    for name, info in data.items():
        lat = info.get('lat')
        lng = info.get('long')
        stadiums.append(Stadium(
            id=_nfl_stadium_id(name),
            name=name,
            city=info.get('city', ''),
            state=info.get('state', ''),
            # NOTE(review): the division assumes the feed stores coordinates
            # scaled by 1,000,000 - confirm against the upstream file.
            latitude=lat / 1000000 if lat else 0,
            longitude=lng / 1000000 if lng else 0,
            capacity=info.get('capacity', 0),
            sport='NFL',
            team_abbrevs=info.get('teams', []),
            source='github.com/NFLScoreBot'
        ))

    return stadiums


def scrape_nfl_stadiums_geojson() -> list[Stadium]:
    """
    Source 2: brianhatchl/nfl-stadiums GeoJSON gist.

    Features with a null/missing geometry or properties object are kept with
    zero coordinates / empty fields rather than aborting the whole scrape.

    Returns:
        List of Stadium objects, one per GeoJSON feature.

    Raises:
        requests.RequestException: if the download fails or returns an
            HTTP error status.
    """
    url = "https://gist.githubusercontent.com/brianhatchl/6265918/raw/dbe6acfe5deb48f51ce5a4c4f8f5dded4f02b9bd/nfl_stadiums.geojson"

    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    stadiums = []
    for feature in data.get('features', []):
        # `or` guards against explicit JSON nulls, which .get() defaults miss.
        props = feature.get('properties') or {}
        coords = (feature.get('geometry') or {}).get('coordinates') or [0, 0]
        name = props.get('Stadium', '')

        stadiums.append(Stadium(
            id=_nfl_stadium_id(name),
            name=name,
            city=props.get('City', ''),
            state=props.get('State', ''),
            # The code reads coordinates as [longitude, latitude].
            latitude=coords[1] if len(coords) > 1 else 0,
            longitude=coords[0] if len(coords) > 0 else 0,
            capacity=int(props.get('Capacity', 0) or 0),
            sport='NFL',
            # NOTE(review): 'Team' is passed through untouched; verify that
            # the gist stores abbreviations rather than full team names.
            team_abbrevs=[props.get('Team', '')],
            source='gist.github.com/brianhatchl'
        ))

    return stadiums


def scrape_nfl_stadiums_hardcoded() -> list[Stadium]:
    """
    Source 3: Hardcoded NFL stadiums (fallback).

    Needs no network access, so it always returns the full built-in list.
    """
    nfl_stadiums_data = {
        'State Farm Stadium': {'city': 'Glendale', 'state': 'AZ', 'lat': 33.5276, 'lng': -112.2626, 'capacity': 63400, 'teams': ['ARI']},
        'Mercedes-Benz Stadium': {'city': 'Atlanta', 'state': 'GA', 'lat': 33.7553, 'lng': -84.4006, 'capacity': 71000, 'teams': ['ATL']},
        'M&T Bank Stadium': {'city': 'Baltimore', 'state': 'MD', 'lat': 39.2780, 'lng': -76.6227, 'capacity': 71008, 'teams': ['BAL']},
        'Highmark Stadium': {'city': 'Orchard Park', 'state': 'NY', 'lat': 42.7738, 'lng': -78.7870, 'capacity': 71608, 'teams': ['BUF']},
        'Bank of America Stadium': {'city': 'Charlotte', 'state': 'NC', 'lat': 35.2258, 'lng': -80.8528, 'capacity': 75523, 'teams': ['CAR']},
        'Soldier Field': {'city': 'Chicago', 'state': 'IL', 'lat': 41.8623, 'lng': -87.6167, 'capacity': 61500, 'teams': ['CHI']},
        'Paycor Stadium': {'city': 'Cincinnati', 'state': 'OH', 'lat': 39.0954, 'lng': -84.5160, 'capacity': 65515, 'teams': ['CIN']},
        'Cleveland Browns Stadium': {'city': 'Cleveland', 'state': 'OH', 'lat': 41.5061, 'lng': -81.6995, 'capacity': 67895, 'teams': ['CLE']},
        'AT&T Stadium': {'city': 'Arlington', 'state': 'TX', 'lat': 32.7480, 'lng': -97.0928, 'capacity': 80000, 'teams': ['DAL']},
        'Empower Field at Mile High': {'city': 'Denver', 'state': 'CO', 'lat': 39.7439, 'lng': -105.0201, 'capacity': 76125, 'teams': ['DEN']},
        'Ford Field': {'city': 'Detroit', 'state': 'MI', 'lat': 42.3400, 'lng': -83.0456, 'capacity': 65000, 'teams': ['DET']},
        'Lambeau Field': {'city': 'Green Bay', 'state': 'WI', 'lat': 44.5013, 'lng': -88.0622, 'capacity': 81435, 'teams': ['GB']},
        'NRG Stadium': {'city': 'Houston', 'state': 'TX', 'lat': 29.6847, 'lng': -95.4107, 'capacity': 72220, 'teams': ['HOU']},
        'Lucas Oil Stadium': {'city': 'Indianapolis', 'state': 'IN', 'lat': 39.7601, 'lng': -86.1639, 'capacity': 67000, 'teams': ['IND']},
        'EverBank Stadium': {'city': 'Jacksonville', 'state': 'FL', 'lat': 30.3239, 'lng': -81.6373, 'capacity': 67814, 'teams': ['JAX']},
        'GEHA Field at Arrowhead Stadium': {'city': 'Kansas City', 'state': 'MO', 'lat': 39.0489, 'lng': -94.4839, 'capacity': 76416, 'teams': ['KC']},
        'Allegiant Stadium': {'city': 'Las Vegas', 'state': 'NV', 'lat': 36.0909, 'lng': -115.1833, 'capacity': 65000, 'teams': ['LV']},
        'SoFi Stadium': {'city': 'Inglewood', 'state': 'CA', 'lat': 33.9535, 'lng': -118.3392, 'capacity': 70240, 'teams': ['LAC', 'LAR']},
        'Hard Rock Stadium': {'city': 'Miami Gardens', 'state': 'FL', 'lat': 25.9580, 'lng': -80.2389, 'capacity': 64767, 'teams': ['MIA']},
        'U.S. Bank Stadium': {'city': 'Minneapolis', 'state': 'MN', 'lat': 44.9736, 'lng': -93.2575, 'capacity': 66655, 'teams': ['MIN']},
        'Gillette Stadium': {'city': 'Foxborough', 'state': 'MA', 'lat': 42.0909, 'lng': -71.2643, 'capacity': 65878, 'teams': ['NE']},
        'Caesars Superdome': {'city': 'New Orleans', 'state': 'LA', 'lat': 29.9511, 'lng': -90.0812, 'capacity': 73208, 'teams': ['NO']},
        'MetLife Stadium': {'city': 'East Rutherford', 'state': 'NJ', 'lat': 40.8135, 'lng': -74.0745, 'capacity': 82500, 'teams': ['NYG', 'NYJ']},
        'Lincoln Financial Field': {'city': 'Philadelphia', 'state': 'PA', 'lat': 39.9008, 'lng': -75.1675, 'capacity': 69596, 'teams': ['PHI']},
        'Acrisure Stadium': {'city': 'Pittsburgh', 'state': 'PA', 'lat': 40.4468, 'lng': -80.0158, 'capacity': 68400, 'teams': ['PIT']},
        "Levi's Stadium": {'city': 'Santa Clara', 'state': 'CA', 'lat': 37.4032, 'lng': -121.9698, 'capacity': 68500, 'teams': ['SF']},
        'Lumen Field': {'city': 'Seattle', 'state': 'WA', 'lat': 47.5952, 'lng': -122.3316, 'capacity': 68740, 'teams': ['SEA']},
        'Raymond James Stadium': {'city': 'Tampa', 'state': 'FL', 'lat': 27.9759, 'lng': -82.5033, 'capacity': 65618, 'teams': ['TB']},
        'Nissan Stadium': {'city': 'Nashville', 'state': 'TN', 'lat': 36.1665, 'lng': -86.7713, 'capacity': 69143, 'teams': ['TEN']},
        'Northwest Stadium': {'city': 'Landover', 'state': 'MD', 'lat': 38.9076, 'lng': -76.8645, 'capacity': 67617, 'teams': ['WAS']},
    }

    return [
        Stadium(
            id=_nfl_stadium_id(name),
            name=name,
            city=info['city'],
            state=info['state'],
            latitude=info['lat'],
            longitude=info['lng'],
            capacity=info['capacity'],
            sport='NFL',
            team_abbrevs=info['teams'],
            source='nfl_hardcoded'
        )
        for name, info in nfl_stadiums_data.items()
    ]


def scrape_nfl_stadiums() -> list[Stadium]:
    """
    Fetch NFL stadium data with multi-source fallback.

    Prints a section header, then delegates to scrape_stadiums_with_fallback
    with NFL_STADIUM_SOURCES.
    """
    print("\nNFL STADIUMS")
    print("-" * 40)

    return scrape_stadiums_with_fallback('NFL', NFL_STADIUM_SOURCES)


# =============================================================================
# SOURCE CONFIGURATIONS
# =============================================================================

NFL_GAME_SOURCES = [
    ScraperSource('ESPN', scrape_nfl_espn, priority=1, min_games=200),
    ScraperSource('Pro-Football-Reference', scrape_nfl_pro_football_reference, priority=2, min_games=200),
    ScraperSource('CBS Sports', scrape_nfl_cbssports, priority=3, min_games=100),
]

NFL_STADIUM_SOURCES = [
    StadiumScraperSource('NFLScoreBot', scrape_nfl_stadiums_scorebot, priority=1, min_venues=28),
    StadiumScraperSource('GeoJSON-Gist', scrape_nfl_stadiums_geojson, priority=2, min_venues=28),
    StadiumScraperSource('Hardcoded', scrape_nfl_stadiums_hardcoded, priority=3, min_venues=28),
]


# =============================================================================
# CONVENIENCE FUNCTIONS
# =============================================================================

def scrape_nfl_games(season: int) -> list[Game]:
    """
    Scrape NFL games for a season using multi-source fallback.

    Args:
        season: Season ending year (e.g., 2026 for 2025-26 season)

    Returns:
        List of Game objects from the first successful source
    """
    print(f"\nNFL {get_nfl_season_string(season)} SCHEDULE")
    print("-" * 40)

    return scrape_with_fallback('NFL', season, NFL_GAME_SOURCES)