This commit is contained in:
Trey t
2026-01-19 22:12:53 -06:00
parent 11c0ae70d2
commit a8b0491571
19 changed files with 1328 additions and 525 deletions

View File

@@ -7,7 +7,16 @@
"Skill(superpowers:subagent-driven-development)",
"Bash(git add:*)",
"Bash(git commit:*)",
"WebSearch"
"WebSearch",
"Bash(wc:*)",
"Bash(find:*)",
"Skill(lead-designer)",
"Skill(frontend-design:frontend-design)",
"Bash(python3:*)",
"Bash(ls:*)",
"Bash(xargs basename:*)",
"Bash(python -m sportstime_parser:*)",
"Bash(python -m py_compile:*)"
]
}
}

View File

@@ -221,11 +221,11 @@ def get_scraper(sport: str, season: int):
def cmd_scrape(args: argparse.Namespace) -> int:
"""Execute the scrape command."""
from .models.game import save_games
from .models.team import save_teams
from .models.stadium import save_stadiums
"""Execute the scrape command with canonical output format."""
import json
from .validators.report import generate_report, validate_games
from .normalizers.timezone import get_stadium_timezone
from .validators.schema import SchemaValidationError, validate_batch
logger = get_logger()
@@ -282,14 +282,60 @@ def cmd_scrape(args: argparse.Namespace) -> int:
logger.info(f"Review items: {report.summary.review_count}")
if not args.dry_run:
# Save output files
# Build mappings for canonical conversion
stadium_timezone_map: dict[str, str] = {}
for stadium in result.stadiums:
tz = get_stadium_timezone(stadium.state, stadium.timezone)
stadium_timezone_map[stadium.id] = tz
stadium_team_abbrevs: dict[str, list[str]] = {}
for team in result.teams:
if team.stadium_id:
if team.stadium_id not in stadium_team_abbrevs:
stadium_team_abbrevs[team.stadium_id] = []
stadium_team_abbrevs[team.stadium_id].append(team.abbreviation)
# Convert to canonical format
canonical_stadiums = [
s.to_canonical_dict(primary_team_abbrevs=stadium_team_abbrevs.get(s.id, []))
for s in result.stadiums
]
canonical_teams = [t.to_canonical_dict() for t in result.teams]
canonical_games = [
g.to_canonical_dict(stadium_timezone=stadium_timezone_map.get(g.stadium_id, "America/New_York"))
for g in result.games
]
# Validate canonical output
stadium_errors = validate_batch(canonical_stadiums, "stadium", fail_fast=False)
team_errors = validate_batch(canonical_teams, "team", fail_fast=False)
game_errors = validate_batch(canonical_games, "game", fail_fast=False)
if stadium_errors or team_errors or game_errors:
for idx, errors in stadium_errors:
for e in errors:
logger.error(f"Stadium {result.stadiums[idx].id}: {e}")
for idx, errors in team_errors:
for e in errors:
logger.error(f"Team {result.teams[idx].id}: {e}")
for idx, errors in game_errors[:10]:
for e in errors:
logger.error(f"Game {result.games[idx].id}: {e}")
if len(game_errors) > 10:
logger.error(f"... and {len(game_errors) - 10} more game errors")
raise SchemaValidationError("canonical", ["Schema validation failed"])
# Save canonical output files
games_file = OUTPUT_DIR / f"games_{sport}_{args.season}.json"
teams_file = OUTPUT_DIR / f"teams_{sport}.json"
stadiums_file = OUTPUT_DIR / f"stadiums_{sport}.json"
save_games(result.games, str(games_file))
save_teams(result.teams, str(teams_file))
save_stadiums(result.stadiums, str(stadiums_file))
with open(games_file, "w", encoding="utf-8") as f:
json.dump(canonical_games, f, indent=2)
with open(teams_file, "w", encoding="utf-8") as f:
json.dump(canonical_teams, f, indent=2)
with open(stadiums_file, "w", encoding="utf-8") as f:
json.dump(canonical_stadiums, f, indent=2)
# Save validation report
report_path = report.save()
@@ -307,6 +353,11 @@ def cmd_scrape(args: argparse.Namespace) -> int:
failure_count += 1
continue
except SchemaValidationError as e:
log_failure(f"{sport.upper()}: {e}")
failure_count += 1
continue
except Exception as e:
log_failure(f"{sport.upper()}: {e}")
logger.exception("Scraping failed")

View File

@@ -3,6 +3,7 @@
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
from zoneinfo import ZoneInfo
import json
@@ -64,9 +65,53 @@ class Game:
"raw_stadium": self.raw_stadium,
}
def to_canonical_dict(
    self,
    stadium_timezone: str,
    is_playoff: bool = False,
    broadcast: Optional[str] = None,
) -> dict:
    """Convert to canonical dictionary format matching iOS app schema.

    Args:
        stadium_timezone: IANA timezone of the stadium (e.g., 'America/Chicago').
            Used to localize naive game datetimes before UTC conversion.
        is_playoff: Whether this is a playoff game
        broadcast: Broadcast network info (e.g., 'ESPN'); None when unknown

    Returns:
        Dictionary with field names matching JSONCanonicalGame in BootstrapService.swift
    """
    # Convert game_date to UTC
    if self.game_date.tzinfo is None:
        # Localize naive datetime to stadium timezone first
        # (a naive game_date is assumed to be stadium-local wall-clock time).
        local_tz = ZoneInfo(stadium_timezone)
        local_dt = self.game_date.replace(tzinfo=local_tz)
    else:
        local_dt = self.game_date
    utc_dt = local_dt.astimezone(ZoneInfo("UTC"))

    # Format season as string (e.g., 2025 -> "2025-26" for NBA/NHL, "2025" for MLB)
    if self.sport in ("nba", "nhl"):
        season_str = f"{self.season}-{str(self.season + 1)[-2:]}"
    else:
        season_str = str(self.season)

    return {
        "canonical_id": self.id,
        "sport": self.sport,
        "season": season_str,
        # Emitted with a literal 'Z' suffix (the format the schema validator expects).
        "game_datetime_utc": utc_dt.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "home_team_canonical_id": self.home_team_id,
        "away_team_canonical_id": self.away_team_id,
        "stadium_canonical_id": self.stadium_id,
        "is_playoff": is_playoff,
        "broadcast": broadcast,
    }
@classmethod
def from_dict(cls, data: dict) -> "Game":
"""Create a Game from a dictionary."""
"""Create a Game from a dictionary (internal format)."""
game_date = data["game_date"]
if isinstance(game_date, str):
game_date = datetime.fromisoformat(game_date)
@@ -89,6 +134,26 @@ class Game:
raw_stadium=data.get("raw_stadium"),
)
@classmethod
def from_canonical_dict(cls, data: dict) -> "Game":
    """Create a Game from a canonical dictionary (iOS app format).

    Args:
        data: Dictionary with canonical field names as produced by
            to_canonical_dict(), e.g. game_datetime_utc of the form
            '2025-03-27T23:10:00Z'.

    Returns:
        A Game with status 'scheduled' (the canonical format carries no status).
    """
    # datetime.fromisoformat() only accepts a trailing 'Z' on Python >= 3.11;
    # normalize it to an explicit UTC offset so older interpreters parse it too.
    raw_dt = data["game_datetime_utc"]
    if isinstance(raw_dt, str) and raw_dt.endswith("Z"):
        raw_dt = raw_dt[:-1] + "+00:00"
    game_date = datetime.fromisoformat(raw_dt)

    # Parse season string (e.g., "2025-26" -> 2025, or "2025" -> 2025)
    season_str = data["season"]
    season = int(season_str.split("-")[0])

    return cls(
        id=data["canonical_id"],
        sport=data["sport"],
        season=season,
        home_team_id=data["home_team_canonical_id"],
        away_team_id=data["away_team_canonical_id"],
        stadium_id=data["stadium_canonical_id"],
        game_date=game_date,
        status="scheduled",
    )
def to_json(self) -> str:
"""Serialize to JSON string."""
return json.dumps(self.to_dict(), indent=2)
@@ -106,7 +171,10 @@ def save_games(games: list[Game], filepath: str) -> None:
def load_games(filepath: str) -> list[Game]:
"""Load a list of games from a JSON file."""
"""Load a list of games from a JSON file (auto-detects format)."""
with open(filepath, "r", encoding="utf-8") as f:
data = json.load(f)
# Detect format: canonical has "canonical_id" and "game_datetime_utc", internal has "id"
if data and "canonical_id" in data[0] and "game_datetime_utc" in data[0]:
return [Game.from_canonical_dict(d) for d in data]
return [Game.from_dict(d) for d in data]

View File

@@ -60,9 +60,32 @@ class Stadium:
"timezone": self.timezone,
}
def to_canonical_dict(self, primary_team_abbrevs: list[str] | None = None) -> dict:
    """Convert to canonical dictionary format matching iOS app schema.

    Args:
        primary_team_abbrevs: List of team abbreviations that play at this stadium.
            If None, defaults to empty list.

    Returns:
        Dictionary with field names matching JSONCanonicalStadium in BootstrapService.swift
    """
    return {
        "canonical_id": self.id,
        "name": self.name,
        "city": self.city,
        "state": self.state,
        "latitude": self.latitude,
        "longitude": self.longitude,
        # Canonical schema requires an int; unknown capacity becomes 0.
        "capacity": self.capacity if self.capacity is not None else 0,
        "sport": self.sport,
        "primary_team_abbrevs": primary_team_abbrevs or [],
        # Internal field is named opened_year; canonical key is year_opened.
        "year_opened": self.opened_year,
    }
@classmethod
def from_dict(cls, data: dict) -> "Stadium":
"""Create a Stadium from a dictionary."""
"""Create a Stadium from a dictionary (internal format)."""
return cls(
id=data["id"],
sport=data["sport"],
@@ -80,6 +103,22 @@ class Stadium:
timezone=data.get("timezone"),
)
@classmethod
def from_canonical_dict(cls, data: dict) -> "Stadium":
    """Create a Stadium from a canonical dictionary (iOS app format).

    Args:
        data: Dictionary with canonical field names as produced by
            to_canonical_dict().

    Returns:
        A Stadium instance; country is always 'USA' and timezone is left
        unset because the canonical format carries neither.
    """
    return cls(
        id=data["canonical_id"],
        sport=data["sport"],
        name=data["name"],
        city=data["city"],
        state=data["state"],
        country="USA",  # Canonical format doesn't include country
        latitude=data["latitude"],
        longitude=data["longitude"],
        # capacity/year_opened are optional in the canonical payload.
        capacity=data.get("capacity"),
        opened_year=data.get("year_opened"),
    )
def to_json(self) -> str:
"""Serialize to JSON string."""
return json.dumps(self.to_dict(), indent=2)
@@ -102,7 +141,10 @@ def save_stadiums(stadiums: list[Stadium], filepath: str) -> None:
def load_stadiums(filepath: str) -> list[Stadium]:
"""Load a list of stadiums from a JSON file."""
"""Load a list of stadiums from a JSON file (auto-detects format)."""
with open(filepath, "r", encoding="utf-8") as f:
data = json.load(f)
# Detect format: canonical has "canonical_id", internal has "id"
if data and "canonical_id" in data[0]:
return [Stadium.from_canonical_dict(d) for d in data]
return [Stadium.from_dict(d) for d in data]

View File

@@ -54,9 +54,28 @@ class Team:
"stadium_id": self.stadium_id,
}
def to_canonical_dict(self) -> dict:
    """Convert to canonical dictionary format matching iOS app schema.

    Returns:
        Dictionary with field names matching JSONCanonicalTeam in BootstrapService.swift
    """
    return {
        "canonical_id": self.id,
        "name": self.name,
        "abbreviation": self.abbreviation,
        "sport": self.sport,
        "city": self.city,
        # Canonical schema requires a string; a missing stadium becomes "".
        # NOTE(review): this makes None -> "" lossy on a canonical round-trip.
        "stadium_canonical_id": self.stadium_id or "",
        "conference_id": self.conference,
        "division_id": self.division,
        "primary_color": self.primary_color,
        "secondary_color": self.secondary_color,
    }
@classmethod
def from_dict(cls, data: dict) -> "Team":
"""Create a Team from a dictionary."""
"""Create a Team from a dictionary (internal format)."""
return cls(
id=data["id"],
sport=data["sport"],
@@ -72,6 +91,23 @@ class Team:
stadium_id=data.get("stadium_id"),
)
@classmethod
def from_canonical_dict(cls, data: dict) -> "Team":
    """Create a Team from a canonical dictionary (iOS app format).

    Args:
        data: Dictionary with canonical field names as produced by
            to_canonical_dict().

    Returns:
        A Team instance. full_name is rebuilt as "<city> <name>" since the
        canonical format does not carry it.
    """
    return cls(
        id=data["canonical_id"],
        sport=data["sport"],
        city=data["city"],
        name=data["name"],
        full_name=f"{data['city']} {data['name']}",  # Reconstruct full_name
        abbreviation=data["abbreviation"],
        conference=data.get("conference_id"),
        division=data.get("division_id"),
        primary_color=data.get("primary_color"),
        secondary_color=data.get("secondary_color"),
        # NOTE(review): to_canonical_dict writes "" for a missing stadium, so
        # this may be "" rather than None after a round-trip — confirm callers
        # treat empty string as "no stadium".
        stadium_id=data.get("stadium_canonical_id"),
    )
def to_json(self) -> str:
"""Serialize to JSON string."""
return json.dumps(self.to_dict(), indent=2)
@@ -89,7 +125,10 @@ def save_teams(teams: list[Team], filepath: str) -> None:
def load_teams(filepath: str) -> list[Team]:
"""Load a list of teams from a JSON file."""
"""Load a list of teams from a JSON file (auto-detects format)."""
with open(filepath, "r", encoding="utf-8") as f:
data = json.load(f)
# Detect format: canonical has "canonical_id", internal has "id"
if data and "canonical_id" in data[0]:
return [Team.from_canonical_dict(d) for d in data]
return [Team.from_dict(d) for d in data]

View File

@@ -240,6 +240,13 @@ STADIUM_MAPPINGS: dict[str, dict[str, StadiumInfo]] = {
"stadium_nwsl_america_first_field": StadiumInfo("stadium_nwsl_america_first_field", "America First Field", "Sandy", "UT", "USA", "nwsl", 40.5830, -111.8933),
"stadium_nwsl_audi_field": StadiumInfo("stadium_nwsl_audi_field", "Audi Field", "Washington", "DC", "USA", "nwsl", 38.8687, -77.0128),
"stadium_nwsl_paypal_park": StadiumInfo("stadium_nwsl_paypal_park", "PayPal Park", "San Jose", "CA", "USA", "nwsl", 37.3511, -121.9250),
# Boston Legacy FC venues
"stadium_nwsl_gillette_stadium": StadiumInfo("stadium_nwsl_gillette_stadium", "Gillette Stadium", "Foxborough", "MA", "USA", "nwsl", 42.0909, -71.2643),
"stadium_nwsl_centreville_bank_stadium": StadiumInfo("stadium_nwsl_centreville_bank_stadium", "Centreville Bank Stadium", "Pawtucket", "RI", "USA", "nwsl", 41.8770, -71.3910),
# Denver Summit FC venues
"stadium_nwsl_empower_field": StadiumInfo("stadium_nwsl_empower_field", "Empower Field at Mile High", "Denver", "CO", "USA", "nwsl", 39.7439, -105.0201, "America/Denver"),
"stadium_nwsl_dicks_sporting_goods_park": StadiumInfo("stadium_nwsl_dicks_sporting_goods_park", "Dick's Sporting Goods Park", "Commerce City", "CO", "USA", "nwsl", 39.8056, -104.8922, "America/Denver"),
"stadium_nwsl_centennial_stadium": StadiumInfo("stadium_nwsl_centennial_stadium", "Centennial Stadium", "Centennial", "CO", "USA", "nwsl", 39.6000, -104.8800, "America/Denver"),
},
}

View File

@@ -265,6 +265,8 @@ TEAM_MAPPINGS: dict[str, dict[str, tuple[str, str, str]]] = {
"SLC": ("team_nwsl_slc", "Utah Royals", "Utah"),
"WAS": ("team_nwsl_was", "Washington Spirit", "Washington"),
"BFC": ("team_nwsl_bfc", "Bay FC", "San Francisco"),
"BOS": ("team_nwsl_bos", "Boston Legacy FC", "Boston"),
"DEN": ("team_nwsl_den", "Denver Summit FC", "Denver"),
},
}

View File

@@ -185,9 +185,12 @@ class BaseScraper(ABC):
"""
sources = self._get_sources()
last_error: Optional[str] = None
sources_tried = 0
max_sources_to_try = 2 # Don't try all sources if first few return nothing
for source in sources:
self._logger.info(f"Trying source: {source}")
sources_tried += 1
try:
# Scrape raw data
@@ -195,6 +198,12 @@ class BaseScraper(ABC):
if not raw_games:
log_warning(f"No games found from {source}")
# If multiple sources return nothing, the schedule likely doesn't exist
if sources_tried >= max_sources_to_try:
return ScrapeResult(
success=False,
error_message=f"No schedule data available (tried {sources_tried} sources)",
)
continue
self._logger.info(f"Found {len(raw_games)} raw games from {source}")
@@ -216,7 +225,9 @@ class BaseScraper(ABC):
except Exception as e:
last_error = str(e)
log_error(f"Failed to scrape from {source}: {e}", exc_info=True)
# Discard partial data and try next source
# If we've tried enough sources, bail out
if sources_tried >= max_sources_to_try:
break
continue
# All sources failed

View File

@@ -1,6 +1,6 @@
"""MLB scraper implementation with multi-source fallback."""
from datetime import datetime, date
from datetime import datetime, date, timedelta
from typing import Optional
from bs4 import BeautifulSoup
@@ -45,7 +45,10 @@ class MLBScraper(BaseScraper):
def _get_sources(self) -> list[str]:
"""Return source list in priority order."""
return ["baseball_reference", "mlb_api", "espn"]
# MLB API is best - returns full schedule in one request
# ESPN caps at ~25 results for baseball
# Baseball-Reference requires HTML parsing
return ["mlb_api", "espn", "baseball_reference"]
def _get_source_url(self, source: str, **kwargs) -> str:
"""Build URL for a source."""
@@ -215,43 +218,29 @@ class MLBScraper(BaseScraper):
)
def _scrape_mlb_api(self) -> list[RawGameData]:
"""Scrape games from MLB Stats API.
"""Scrape games from MLB Stats API using full season query."""
# Build date range for entire season (March-November)
season_months = self._get_season_months()
start_year, start_month = season_months[0]
end_year, end_month = season_months[-1]
MLB API allows date range queries.
"""
all_games: list[RawGameData] = []
# Get last day of end month
if end_month == 12:
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
else:
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
# Query by month to avoid hitting API limits
for year, month in self._get_season_months():
start_date = date(year, month, 1)
start_date = date(start_year, start_month, 1)
# Get last day of month
if month == 12:
end_date = date(year + 1, 1, 1)
else:
end_date = date(year, month + 1, 1)
url = f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&startDate={start_date.strftime('%Y-%m-%d')}&endDate={end_date.strftime('%Y-%m-%d')}"
self._logger.info(f"Fetching MLB schedule: {start_date} to {end_date}")
# Adjust end date to last day of month
from datetime import timedelta
end_date = end_date - timedelta(days=1)
url = self._get_source_url(
"mlb_api",
start_date=start_date.strftime("%Y-%m-%d"),
end_date=end_date.strftime("%Y-%m-%d"),
)
try:
data = self.session.get_json(url)
games = self._parse_mlb_api_response(data, url)
all_games.extend(games)
self._logger.debug(f"Found {len(games)} games in {year}-{month:02d}")
except Exception as e:
self._logger.debug(f"MLB API error for {year}-{month}: {e}")
continue
return all_games
try:
data = self.session.get_json(url)
return self._parse_mlb_api_response(data, url)
except Exception as e:
self._logger.error(f"MLB API error: {e}")
return []
def _parse_mlb_api_response(
self,
@@ -345,33 +334,30 @@ class MLBScraper(BaseScraper):
)
def _scrape_espn(self) -> list[RawGameData]:
"""Scrape games from ESPN API."""
all_games: list[RawGameData] = []
"""Scrape games from ESPN API using date range query."""
# Build date range for entire season (March-November)
season_months = self._get_season_months()
start_year, start_month = season_months[0]
end_year, end_month = season_months[-1]
for year, month in self._get_season_months():
# Get number of days in month
if month == 12:
next_month = date(year + 1, 1, 1)
else:
next_month = date(year, month + 1, 1)
# Get last day of end month
if end_month == 12:
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
else:
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
days_in_month = (next_month - date(year, month, 1)).days
start_date = date(start_year, start_month, 1)
date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
for day in range(1, days_in_month + 1):
try:
game_date = date(year, month, day)
date_str = game_date.strftime("%Y%m%d")
url = self._get_source_url("espn", date=date_str)
url = f"https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard?limit=3000&dates={date_range}"
self._logger.info(f"Fetching MLB schedule: {date_range}")
data = self.session.get_json(url)
games = self._parse_espn_response(data, url)
all_games.extend(games)
except Exception as e:
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
continue
return all_games
try:
data = self.session.get_json(url)
return self._parse_espn_response(data, url)
except Exception as e:
self._logger.error(f"ESPN error: {e}")
return []
def _parse_espn_response(
self,

View File

@@ -1,6 +1,6 @@
"""MLS scraper implementation with multi-source fallback."""
from datetime import datetime, date
from datetime import datetime, date, timedelta
from typing import Optional
from .base import BaseScraper, RawGameData, ScrapeResult
@@ -78,33 +78,30 @@ class MLSScraper(BaseScraper):
raise ValueError(f"Unknown source: {source}")
def _scrape_espn(self) -> list[RawGameData]:
"""Scrape games from ESPN API."""
all_games: list[RawGameData] = []
"""Scrape games from ESPN API using date range query."""
# Build date range for entire season (Feb-November)
season_months = self._get_season_months()
start_year, start_month = season_months[0]
end_year, end_month = season_months[-1]
for year, month in self._get_season_months():
# Get number of days in month
if month == 12:
next_month = date(year + 1, 1, 1)
else:
next_month = date(year, month + 1, 1)
# Get last day of end month
if end_month == 12:
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
else:
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
days_in_month = (next_month - date(year, month, 1)).days
start_date = date(start_year, start_month, 1)
date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
for day in range(1, days_in_month + 1):
try:
game_date = date(year, month, day)
date_str = game_date.strftime("%Y%m%d")
url = self._get_source_url("espn", date=date_str)
url = f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.1/scoreboard?limit=1000&dates={date_range}"
self._logger.info(f"Fetching MLS schedule: {date_range}")
data = self.session.get_json(url)
games = self._parse_espn_response(data, url)
all_games.extend(games)
except Exception as e:
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
continue
return all_games
try:
data = self.session.get_json(url)
return self._parse_espn_response(data, url)
except Exception as e:
self._logger.error(f"ESPN error: {e}")
return []
def _parse_espn_response(
self,

View File

@@ -95,9 +95,11 @@ class NBAScraper(BaseScraper):
BR organizes games by month with separate pages.
Format: https://www.basketball-reference.com/leagues/NBA_YYYY_games-month.html
where YYYY is the ending year of the season.
Bails early if first few months have no data (season doesn't exist).
"""
all_games: list[RawGameData] = []
end_year = self.season + 1
consecutive_empty_months = 0
for month in BR_MONTHS:
url = self._get_source_url("basketball_reference", month=month, year=end_year)
@@ -105,13 +107,23 @@ class NBAScraper(BaseScraper):
try:
html = self.session.get_html(url)
games = self._parse_basketball_reference(html, url)
all_games.extend(games)
self._logger.debug(f"Found {len(games)} games in {month}")
if games:
all_games.extend(games)
consecutive_empty_months = 0
self._logger.debug(f"Found {len(games)} games in {month}")
else:
consecutive_empty_months += 1
except Exception as e:
# Some months may not exist (e.g., no games in August)
self._logger.debug(f"No data for {month}: {e}")
continue
consecutive_empty_months += 1
# If first 3 months (Oct, Nov, Dec) all have no data, season doesn't exist
if consecutive_empty_months >= 3 and not all_games:
self._logger.info(f"No games found in first {consecutive_empty_months} months, season likely doesn't exist")
break
return all_games
@@ -247,8 +259,11 @@ class NBAScraper(BaseScraper):
ESPN API returns games for a specific date range.
We iterate through each day of the season.
Bails out early if no games found after checking first month.
"""
all_games: list[RawGameData] = []
consecutive_empty_days = 0
max_empty_days = 45 # Bail after ~1.5 months of no games
for year, month in self._get_season_months():
# Get number of days in month
@@ -267,10 +282,25 @@ class NBAScraper(BaseScraper):
data = self.session.get_json(url)
games = self._parse_espn_response(data, url)
all_games.extend(games)
if games:
all_games.extend(games)
consecutive_empty_days = 0
else:
consecutive_empty_days += 1
# Bail early if no games found for a long stretch
if consecutive_empty_days >= max_empty_days:
self._logger.info(f"No games found for {max_empty_days} consecutive days, stopping ESPN scrape")
return all_games
except Exception as e:
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
consecutive_empty_days += 1
if consecutive_empty_days >= max_empty_days:
self._logger.info(f"Too many consecutive failures, stopping ESPN scrape")
return all_games
continue
return all_games

View File

@@ -1,6 +1,6 @@
"""NWSL scraper implementation with multi-source fallback."""
from datetime import datetime, date
from datetime import datetime, date, timedelta
from typing import Optional
from .base import BaseScraper, RawGameData, ScrapeResult
@@ -73,33 +73,30 @@ class NWSLScraper(BaseScraper):
raise ValueError(f"Unknown source: {source}")
def _scrape_espn(self) -> list[RawGameData]:
"""Scrape games from ESPN API."""
all_games: list[RawGameData] = []
"""Scrape games from ESPN API using date range query."""
# Build date range for entire season (March-November)
season_months = self._get_season_months()
start_year, start_month = season_months[0]
end_year, end_month = season_months[-1]
for year, month in self._get_season_months():
# Get number of days in month
if month == 12:
next_month = date(year + 1, 1, 1)
else:
next_month = date(year, month + 1, 1)
# Get last day of end month
if end_month == 12:
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
else:
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
days_in_month = (next_month - date(year, month, 1)).days
start_date = date(start_year, start_month, 1)
date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
for day in range(1, days_in_month + 1):
try:
game_date = date(year, month, day)
date_str = game_date.strftime("%Y%m%d")
url = self._get_source_url("espn", date=date_str)
url = f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.nwsl/scoreboard?limit=1000&dates={date_range}"
self._logger.info(f"Fetching NWSL schedule: {date_range}")
data = self.session.get_json(url)
games = self._parse_espn_response(data, url)
all_games.extend(games)
except Exception as e:
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
continue
return all_games
try:
data = self.session.get_json(url)
return self._parse_espn_response(data, url)
except Exception as e:
self._logger.error(f"ESPN error: {e}")
return []
def _parse_espn_response(
self,

View File

@@ -1,6 +1,6 @@
"""WNBA scraper implementation with multi-source fallback."""
from datetime import datetime, date
from datetime import datetime, date, timedelta
from typing import Optional
from .base import BaseScraper, RawGameData, ScrapeResult
@@ -73,33 +73,30 @@ class WNBAScraper(BaseScraper):
raise ValueError(f"Unknown source: {source}")
def _scrape_espn(self) -> list[RawGameData]:
"""Scrape games from ESPN API."""
all_games: list[RawGameData] = []
"""Scrape games from ESPN API using date range query."""
# Build date range for entire season (May-October)
season_months = self._get_season_months()
start_year, start_month = season_months[0]
end_year, end_month = season_months[-1]
for year, month in self._get_season_months():
# Get number of days in month
if month == 12:
next_month = date(year + 1, 1, 1)
else:
next_month = date(year, month + 1, 1)
# Get last day of end month
if end_month == 12:
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
else:
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
days_in_month = (next_month - date(year, month, 1)).days
start_date = date(start_year, start_month, 1)
date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
for day in range(1, days_in_month + 1):
try:
game_date = date(year, month, day)
date_str = game_date.strftime("%Y%m%d")
url = self._get_source_url("espn", date=date_str)
url = f"https://site.api.espn.com/apis/site/v2/sports/basketball/wnba/scoreboard?limit=1000&dates={date_range}"
self._logger.info(f"Fetching WNBA schedule: {date_range}")
data = self.session.get_json(url)
games = self._parse_espn_response(data, url)
all_games.extend(games)
except Exception as e:
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
continue
return all_games
try:
data = self.session.get_json(url)
return self._parse_espn_response(data, url)
except Exception as e:
self._logger.error(f"ESPN error: {e}")
return []
def _parse_espn_response(
self,

View File

@@ -8,10 +8,25 @@ from .report import (
validate_games,
)
from .schema import (
SchemaValidationError,
validate_canonical_stadium,
validate_canonical_team,
validate_canonical_game,
validate_and_raise,
validate_batch,
)
__all__ = [
"ValidationReport",
"ValidationSummary",
"generate_report",
"detect_duplicate_games",
"validate_games",
"SchemaValidationError",
"validate_canonical_stadium",
"validate_canonical_team",
"validate_canonical_game",
"validate_and_raise",
"validate_batch",
]

View File

@@ -0,0 +1,246 @@
"""JSON Schema validation for canonical output matching iOS app expectations.
This module defines schemas that match the Swift structs in BootstrapService.swift:
- JSONCanonicalStadium
- JSONCanonicalTeam
- JSONCanonicalGame
Validation is performed at runtime before outputting JSON to ensure
Python output matches what the iOS app expects.
"""
import re
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union
class SchemaValidationError(Exception):
    """Raised when canonical output fails schema validation.

    Attributes:
        model_type: Which canonical model failed ('stadium', 'team', or 'game').
        errors: Human-readable descriptions of the individual failures.
    """

    def __init__(self, model_type: str, errors: list[str]):
        self.model_type = model_type
        self.errors = errors
        # Render one indented bullet per error under a single summary line.
        bullets = "\n".join(f"  - {err}" for err in errors)
        super().__init__(f"{model_type} schema validation failed:\n{bullets}")
# ISO8601 UTC datetime pattern: YYYY-MM-DDTHH:MM:SSZ
# (requires a literal trailing 'Z'; numeric offsets like '+00:00' do not match)
ISO8601_UTC_PATTERN = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$")

# Season format patterns
SEASON_SPLIT_PATTERN = re.compile(r"^\d{4}-\d{2}$")  # e.g., "2025-26" (cross-year seasons)
SEASON_SINGLE_PATTERN = re.compile(r"^\d{4}$")  # e.g., "2025" (single-year seasons, e.g. MLB)
@dataclass
class FieldSpec:
    """Specification for a field in the canonical schema."""

    # JSON key name of the field.
    name: str
    # Whether the key must be present in the payload.
    required: bool
    # Expected type, or tuple of acceptable types (as accepted by isinstance).
    field_type: Union[type, tuple]
    # Optional extra check applied to non-None values; returns truthy when valid.
    validator: Optional[Callable] = None
# Schema definitions matching Swift structs in BootstrapService.swift

# Mirrors JSONCanonicalStadium.
STADIUM_SCHEMA: list[FieldSpec] = [
    FieldSpec("canonical_id", required=True, field_type=str),
    FieldSpec("name", required=True, field_type=str),
    FieldSpec("city", required=True, field_type=str),
    FieldSpec("state", required=True, field_type=str),
    FieldSpec("latitude", required=True, field_type=(int, float)),
    FieldSpec("longitude", required=True, field_type=(int, float)),
    FieldSpec("capacity", required=True, field_type=int),
    FieldSpec("sport", required=True, field_type=str),
    FieldSpec("primary_team_abbrevs", required=True, field_type=list),
    FieldSpec("year_opened", required=False, field_type=(int, type(None))),
]

# Mirrors JSONCanonicalTeam.
TEAM_SCHEMA: list[FieldSpec] = [
    FieldSpec("canonical_id", required=True, field_type=str),
    FieldSpec("name", required=True, field_type=str),
    FieldSpec("abbreviation", required=True, field_type=str),
    FieldSpec("sport", required=True, field_type=str),
    FieldSpec("city", required=True, field_type=str),
    FieldSpec("stadium_canonical_id", required=True, field_type=str),
    FieldSpec("conference_id", required=False, field_type=(str, type(None))),
    FieldSpec("division_id", required=False, field_type=(str, type(None))),
    FieldSpec("primary_color", required=False, field_type=(str, type(None))),
    FieldSpec("secondary_color", required=False, field_type=(str, type(None))),
]

# Mirrors JSONCanonicalGame. season and game_datetime_utc carry extra
# format validators (regexes defined above).
GAME_SCHEMA: list[FieldSpec] = [
    FieldSpec("canonical_id", required=True, field_type=str),
    FieldSpec("sport", required=True, field_type=str),
    FieldSpec(
        "season",
        required=True,
        field_type=str,
        # Accepts either "YYYY-YY" (split season) or "YYYY" (single year).
        validator=lambda v: SEASON_SPLIT_PATTERN.match(v) or SEASON_SINGLE_PATTERN.match(v),
    ),
    FieldSpec(
        "game_datetime_utc",
        required=True,
        field_type=str,
        validator=lambda v: ISO8601_UTC_PATTERN.match(v),
    ),
    FieldSpec("home_team_canonical_id", required=True, field_type=str),
    FieldSpec("away_team_canonical_id", required=True, field_type=str),
    FieldSpec("stadium_canonical_id", required=True, field_type=str),
    FieldSpec("is_playoff", required=True, field_type=bool),
    FieldSpec("broadcast", required=False, field_type=(str, type(None))),
]
def validate_field(data: dict[str, Any], spec: FieldSpec) -> list[str]:
    """Validate a single field against its specification.

    Checks presence, type, and (optionally) a custom validator. ``bool`` is
    rejected for fields that do not explicitly allow it: in Python
    ``isinstance(True, int)`` is True, which would otherwise let JSON
    booleans slip through ``int``/``float`` field specs (e.g. capacity).

    Args:
        data: The dictionary to validate
        spec: The field specification

    Returns:
        List of error messages (empty if valid)
    """
    errors = []

    if spec.name not in data:
        if spec.required:
            errors.append(f"Missing required field: {spec.name}")
        return errors

    value = data[spec.name]

    # Normalize the allowed types to a tuple for the bool membership check.
    allowed = spec.field_type if isinstance(spec.field_type, tuple) else (spec.field_type,)

    # Check type. bool subclasses int, so explicitly reject bools unless the
    # schema names bool as an accepted type (matches JSON Schema semantics,
    # where true/false are not valid integers).
    type_ok = isinstance(value, spec.field_type)
    if isinstance(value, bool) and bool not in allowed:
        type_ok = False

    if not type_ok:
        expected = spec.field_type.__name__ if isinstance(spec.field_type, type) else str(spec.field_type)
        actual = type(value).__name__
        errors.append(f"Field '{spec.name}' has wrong type: expected {expected}, got {actual} (value: {value!r})")
        return errors

    # Check custom validator (never invoked for None values).
    if spec.validator and value is not None:
        if not spec.validator(value):
            errors.append(f"Field '{spec.name}' failed validation: {value!r}")

    return errors
def validate_canonical_stadium(data: dict[str, Any]) -> list[str]:
    """Validate a canonical stadium dictionary.

    Args:
        data: Stadium dictionary from to_canonical_dict()

    Returns:
        List of error messages (empty if valid)
    """
    errors = [err for spec in STADIUM_SCHEMA for err in validate_field(data, spec)]

    # Element-wise check: every entry of primary_team_abbrevs must be a string.
    abbrevs = data.get("primary_team_abbrevs")
    if isinstance(abbrevs, list):
        errors.extend(
            f"primary_team_abbrevs[{i}] must be string, got {type(abbrev).__name__}"
            for i, abbrev in enumerate(abbrevs)
            if not isinstance(abbrev, str)
        )
    return errors
def validate_canonical_team(data: dict[str, Any]) -> list[str]:
    """Validate a canonical team dictionary.

    Args:
        data: Team dictionary from to_canonical_dict()

    Returns:
        List of error messages (empty if valid)
    """
    # Flatten the per-field error lists into one result.
    return [err for spec in TEAM_SCHEMA for err in validate_field(data, spec)]
def validate_canonical_game(data: dict[str, Any]) -> list[str]:
    """Validate a canonical game dictionary.

    Args:
        data: Game dictionary from to_canonical_dict()

    Returns:
        List of error messages (empty if valid)
    """
    # Flatten the per-field error lists into one result.
    return [err for spec in GAME_SCHEMA for err in validate_field(data, spec)]
def validate_and_raise(data: dict[str, Any], model_type: str) -> None:
    """Validate a canonical dictionary and raise on error.

    Args:
        data: Dictionary from to_canonical_dict()
        model_type: One of 'stadium', 'team', 'game'

    Raises:
        SchemaValidationError: If validation fails
        ValueError: If model_type is unknown
    """
    try:
        validator = {
            "stadium": validate_canonical_stadium,
            "team": validate_canonical_team,
            "game": validate_canonical_game,
        }[model_type]
    except KeyError:
        raise ValueError(f"Unknown model type: {model_type}") from None

    errors = validator(data)
    if errors:
        raise SchemaValidationError(model_type, errors)
def validate_batch(
    items: list[dict[str, Any]],
    model_type: str,
    fail_fast: bool = True,
) -> list[tuple[int, list[str]]]:
    """Validate a batch of canonical dictionaries.

    Args:
        items: List of dictionaries from to_canonical_dict()
        model_type: One of 'stadium', 'team', 'game'
        fail_fast: If True, raise on first error; if False, collect all errors

    Returns:
        List of (index, errors) tuples for items with validation errors

    Raises:
        SchemaValidationError: If fail_fast=True and validation fails
        ValueError: If model_type is unknown
    """
    dispatch = {
        "stadium": validate_canonical_stadium,
        "team": validate_canonical_team,
        "game": validate_canonical_game,
    }
    validator = dispatch.get(model_type)
    if validator is None:
        raise ValueError(f"Unknown model type: {model_type}")

    failures: list[tuple[int, list[str]]] = []
    for index, payload in enumerate(items):
        item_errors = validator(payload)
        if not item_errors:
            continue
        if fail_fast:
            raise SchemaValidationError(
                model_type,
                [f"Item {index}: {e}" for e in item_errors],
            )
        failures.append((index, item_errors))
    return failures