wip
This commit is contained in:
@@ -221,11 +221,11 @@ def get_scraper(sport: str, season: int):
|
||||
|
||||
|
||||
def cmd_scrape(args: argparse.Namespace) -> int:
|
||||
"""Execute the scrape command."""
|
||||
from .models.game import save_games
|
||||
from .models.team import save_teams
|
||||
from .models.stadium import save_stadiums
|
||||
"""Execute the scrape command with canonical output format."""
|
||||
import json
|
||||
from .validators.report import generate_report, validate_games
|
||||
from .normalizers.timezone import get_stadium_timezone
|
||||
from .validators.schema import SchemaValidationError, validate_batch
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
@@ -282,14 +282,60 @@ def cmd_scrape(args: argparse.Namespace) -> int:
|
||||
logger.info(f"Review items: {report.summary.review_count}")
|
||||
|
||||
if not args.dry_run:
|
||||
# Save output files
|
||||
# Build mappings for canonical conversion
|
||||
stadium_timezone_map: dict[str, str] = {}
|
||||
for stadium in result.stadiums:
|
||||
tz = get_stadium_timezone(stadium.state, stadium.timezone)
|
||||
stadium_timezone_map[stadium.id] = tz
|
||||
|
||||
stadium_team_abbrevs: dict[str, list[str]] = {}
|
||||
for team in result.teams:
|
||||
if team.stadium_id:
|
||||
if team.stadium_id not in stadium_team_abbrevs:
|
||||
stadium_team_abbrevs[team.stadium_id] = []
|
||||
stadium_team_abbrevs[team.stadium_id].append(team.abbreviation)
|
||||
|
||||
# Convert to canonical format
|
||||
canonical_stadiums = [
|
||||
s.to_canonical_dict(primary_team_abbrevs=stadium_team_abbrevs.get(s.id, []))
|
||||
for s in result.stadiums
|
||||
]
|
||||
canonical_teams = [t.to_canonical_dict() for t in result.teams]
|
||||
canonical_games = [
|
||||
g.to_canonical_dict(stadium_timezone=stadium_timezone_map.get(g.stadium_id, "America/New_York"))
|
||||
for g in result.games
|
||||
]
|
||||
|
||||
# Validate canonical output
|
||||
stadium_errors = validate_batch(canonical_stadiums, "stadium", fail_fast=False)
|
||||
team_errors = validate_batch(canonical_teams, "team", fail_fast=False)
|
||||
game_errors = validate_batch(canonical_games, "game", fail_fast=False)
|
||||
|
||||
if stadium_errors or team_errors or game_errors:
|
||||
for idx, errors in stadium_errors:
|
||||
for e in errors:
|
||||
logger.error(f"Stadium {result.stadiums[idx].id}: {e}")
|
||||
for idx, errors in team_errors:
|
||||
for e in errors:
|
||||
logger.error(f"Team {result.teams[idx].id}: {e}")
|
||||
for idx, errors in game_errors[:10]:
|
||||
for e in errors:
|
||||
logger.error(f"Game {result.games[idx].id}: {e}")
|
||||
if len(game_errors) > 10:
|
||||
logger.error(f"... and {len(game_errors) - 10} more game errors")
|
||||
raise SchemaValidationError("canonical", ["Schema validation failed"])
|
||||
|
||||
# Save canonical output files
|
||||
games_file = OUTPUT_DIR / f"games_{sport}_{args.season}.json"
|
||||
teams_file = OUTPUT_DIR / f"teams_{sport}.json"
|
||||
stadiums_file = OUTPUT_DIR / f"stadiums_{sport}.json"
|
||||
|
||||
save_games(result.games, str(games_file))
|
||||
save_teams(result.teams, str(teams_file))
|
||||
save_stadiums(result.stadiums, str(stadiums_file))
|
||||
with open(games_file, "w", encoding="utf-8") as f:
|
||||
json.dump(canonical_games, f, indent=2)
|
||||
with open(teams_file, "w", encoding="utf-8") as f:
|
||||
json.dump(canonical_teams, f, indent=2)
|
||||
with open(stadiums_file, "w", encoding="utf-8") as f:
|
||||
json.dump(canonical_stadiums, f, indent=2)
|
||||
|
||||
# Save validation report
|
||||
report_path = report.save()
|
||||
@@ -307,6 +353,11 @@ def cmd_scrape(args: argparse.Namespace) -> int:
|
||||
failure_count += 1
|
||||
continue
|
||||
|
||||
except SchemaValidationError as e:
|
||||
log_failure(f"{sport.upper()}: {e}")
|
||||
failure_count += 1
|
||||
continue
|
||||
|
||||
except Exception as e:
|
||||
log_failure(f"{sport.upper()}: {e}")
|
||||
logger.exception("Scraping failed")
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from zoneinfo import ZoneInfo
|
||||
import json
|
||||
|
||||
|
||||
@@ -64,9 +65,53 @@ class Game:
|
||||
"raw_stadium": self.raw_stadium,
|
||||
}
|
||||
|
||||
def to_canonical_dict(
|
||||
self,
|
||||
stadium_timezone: str,
|
||||
is_playoff: bool = False,
|
||||
broadcast: Optional[str] = None,
|
||||
) -> dict:
|
||||
"""Convert to canonical dictionary format matching iOS app schema.
|
||||
|
||||
Args:
|
||||
stadium_timezone: IANA timezone of the stadium (e.g., 'America/Chicago')
|
||||
is_playoff: Whether this is a playoff game
|
||||
broadcast: Broadcast network info (e.g., 'ESPN')
|
||||
|
||||
Returns:
|
||||
Dictionary with field names matching JSONCanonicalGame in BootstrapService.swift
|
||||
"""
|
||||
# Convert game_date to UTC
|
||||
if self.game_date.tzinfo is None:
|
||||
# Localize naive datetime to stadium timezone first
|
||||
local_tz = ZoneInfo(stadium_timezone)
|
||||
local_dt = self.game_date.replace(tzinfo=local_tz)
|
||||
else:
|
||||
local_dt = self.game_date
|
||||
|
||||
utc_dt = local_dt.astimezone(ZoneInfo("UTC"))
|
||||
|
||||
# Format season as string (e.g., 2025 -> "2025-26" for NBA/NHL, "2025" for MLB)
|
||||
if self.sport in ("nba", "nhl"):
|
||||
season_str = f"{self.season}-{str(self.season + 1)[-2:]}"
|
||||
else:
|
||||
season_str = str(self.season)
|
||||
|
||||
return {
|
||||
"canonical_id": self.id,
|
||||
"sport": self.sport,
|
||||
"season": season_str,
|
||||
"game_datetime_utc": utc_dt.strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||
"home_team_canonical_id": self.home_team_id,
|
||||
"away_team_canonical_id": self.away_team_id,
|
||||
"stadium_canonical_id": self.stadium_id,
|
||||
"is_playoff": is_playoff,
|
||||
"broadcast": broadcast,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict) -> "Game":
|
||||
"""Create a Game from a dictionary."""
|
||||
"""Create a Game from a dictionary (internal format)."""
|
||||
game_date = data["game_date"]
|
||||
if isinstance(game_date, str):
|
||||
game_date = datetime.fromisoformat(game_date)
|
||||
@@ -89,6 +134,26 @@ class Game:
|
||||
raw_stadium=data.get("raw_stadium"),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_canonical_dict(cls, data: dict) -> "Game":
|
||||
"""Create a Game from a canonical dictionary (iOS app format)."""
|
||||
game_date = datetime.fromisoformat(data["game_datetime_utc"])
|
||||
|
||||
# Parse season string (e.g., "2025-26" -> 2025, or "2025" -> 2025)
|
||||
season_str = data["season"]
|
||||
season = int(season_str.split("-")[0])
|
||||
|
||||
return cls(
|
||||
id=data["canonical_id"],
|
||||
sport=data["sport"],
|
||||
season=season,
|
||||
home_team_id=data["home_team_canonical_id"],
|
||||
away_team_id=data["away_team_canonical_id"],
|
||||
stadium_id=data["stadium_canonical_id"],
|
||||
game_date=game_date,
|
||||
status="scheduled",
|
||||
)
|
||||
|
||||
def to_json(self) -> str:
|
||||
"""Serialize to JSON string."""
|
||||
return json.dumps(self.to_dict(), indent=2)
|
||||
@@ -106,7 +171,10 @@ def save_games(games: list[Game], filepath: str) -> None:
|
||||
|
||||
|
||||
def load_games(filepath: str) -> list[Game]:
|
||||
"""Load a list of games from a JSON file."""
|
||||
"""Load a list of games from a JSON file (auto-detects format)."""
|
||||
with open(filepath, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
# Detect format: canonical has "canonical_id" and "game_datetime_utc", internal has "id"
|
||||
if data and "canonical_id" in data[0] and "game_datetime_utc" in data[0]:
|
||||
return [Game.from_canonical_dict(d) for d in data]
|
||||
return [Game.from_dict(d) for d in data]
|
||||
|
||||
@@ -60,9 +60,32 @@ class Stadium:
|
||||
"timezone": self.timezone,
|
||||
}
|
||||
|
||||
def to_canonical_dict(self, primary_team_abbrevs: list[str] | None = None) -> dict:
|
||||
"""Convert to canonical dictionary format matching iOS app schema.
|
||||
|
||||
Args:
|
||||
primary_team_abbrevs: List of team abbreviations that play at this stadium.
|
||||
If None, defaults to empty list.
|
||||
|
||||
Returns:
|
||||
Dictionary with field names matching JSONCanonicalStadium in BootstrapService.swift
|
||||
"""
|
||||
return {
|
||||
"canonical_id": self.id,
|
||||
"name": self.name,
|
||||
"city": self.city,
|
||||
"state": self.state,
|
||||
"latitude": self.latitude,
|
||||
"longitude": self.longitude,
|
||||
"capacity": self.capacity if self.capacity is not None else 0,
|
||||
"sport": self.sport,
|
||||
"primary_team_abbrevs": primary_team_abbrevs or [],
|
||||
"year_opened": self.opened_year,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict) -> "Stadium":
|
||||
"""Create a Stadium from a dictionary."""
|
||||
"""Create a Stadium from a dictionary (internal format)."""
|
||||
return cls(
|
||||
id=data["id"],
|
||||
sport=data["sport"],
|
||||
@@ -80,6 +103,22 @@ class Stadium:
|
||||
timezone=data.get("timezone"),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_canonical_dict(cls, data: dict) -> "Stadium":
|
||||
"""Create a Stadium from a canonical dictionary (iOS app format)."""
|
||||
return cls(
|
||||
id=data["canonical_id"],
|
||||
sport=data["sport"],
|
||||
name=data["name"],
|
||||
city=data["city"],
|
||||
state=data["state"],
|
||||
country="USA", # Canonical format doesn't include country
|
||||
latitude=data["latitude"],
|
||||
longitude=data["longitude"],
|
||||
capacity=data.get("capacity"),
|
||||
opened_year=data.get("year_opened"),
|
||||
)
|
||||
|
||||
def to_json(self) -> str:
|
||||
"""Serialize to JSON string."""
|
||||
return json.dumps(self.to_dict(), indent=2)
|
||||
@@ -102,7 +141,10 @@ def save_stadiums(stadiums: list[Stadium], filepath: str) -> None:
|
||||
|
||||
|
||||
def load_stadiums(filepath: str) -> list[Stadium]:
|
||||
"""Load a list of stadiums from a JSON file."""
|
||||
"""Load a list of stadiums from a JSON file (auto-detects format)."""
|
||||
with open(filepath, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
# Detect format: canonical has "canonical_id", internal has "id"
|
||||
if data and "canonical_id" in data[0]:
|
||||
return [Stadium.from_canonical_dict(d) for d in data]
|
||||
return [Stadium.from_dict(d) for d in data]
|
||||
|
||||
@@ -54,9 +54,28 @@ class Team:
|
||||
"stadium_id": self.stadium_id,
|
||||
}
|
||||
|
||||
def to_canonical_dict(self) -> dict:
|
||||
"""Convert to canonical dictionary format matching iOS app schema.
|
||||
|
||||
Returns:
|
||||
Dictionary with field names matching JSONCanonicalTeam in BootstrapService.swift
|
||||
"""
|
||||
return {
|
||||
"canonical_id": self.id,
|
||||
"name": self.name,
|
||||
"abbreviation": self.abbreviation,
|
||||
"sport": self.sport,
|
||||
"city": self.city,
|
||||
"stadium_canonical_id": self.stadium_id or "",
|
||||
"conference_id": self.conference,
|
||||
"division_id": self.division,
|
||||
"primary_color": self.primary_color,
|
||||
"secondary_color": self.secondary_color,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict) -> "Team":
|
||||
"""Create a Team from a dictionary."""
|
||||
"""Create a Team from a dictionary (internal format)."""
|
||||
return cls(
|
||||
id=data["id"],
|
||||
sport=data["sport"],
|
||||
@@ -72,6 +91,23 @@ class Team:
|
||||
stadium_id=data.get("stadium_id"),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_canonical_dict(cls, data: dict) -> "Team":
|
||||
"""Create a Team from a canonical dictionary (iOS app format)."""
|
||||
return cls(
|
||||
id=data["canonical_id"],
|
||||
sport=data["sport"],
|
||||
city=data["city"],
|
||||
name=data["name"],
|
||||
full_name=f"{data['city']} {data['name']}", # Reconstruct full_name
|
||||
abbreviation=data["abbreviation"],
|
||||
conference=data.get("conference_id"),
|
||||
division=data.get("division_id"),
|
||||
primary_color=data.get("primary_color"),
|
||||
secondary_color=data.get("secondary_color"),
|
||||
stadium_id=data.get("stadium_canonical_id"),
|
||||
)
|
||||
|
||||
def to_json(self) -> str:
|
||||
"""Serialize to JSON string."""
|
||||
return json.dumps(self.to_dict(), indent=2)
|
||||
@@ -89,7 +125,10 @@ def save_teams(teams: list[Team], filepath: str) -> None:
|
||||
|
||||
|
||||
def load_teams(filepath: str) -> list[Team]:
|
||||
"""Load a list of teams from a JSON file."""
|
||||
"""Load a list of teams from a JSON file (auto-detects format)."""
|
||||
with open(filepath, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
# Detect format: canonical has "canonical_id", internal has "id"
|
||||
if data and "canonical_id" in data[0]:
|
||||
return [Team.from_canonical_dict(d) for d in data]
|
||||
return [Team.from_dict(d) for d in data]
|
||||
|
||||
@@ -240,6 +240,13 @@ STADIUM_MAPPINGS: dict[str, dict[str, StadiumInfo]] = {
|
||||
"stadium_nwsl_america_first_field": StadiumInfo("stadium_nwsl_america_first_field", "America First Field", "Sandy", "UT", "USA", "nwsl", 40.5830, -111.8933),
|
||||
"stadium_nwsl_audi_field": StadiumInfo("stadium_nwsl_audi_field", "Audi Field", "Washington", "DC", "USA", "nwsl", 38.8687, -77.0128),
|
||||
"stadium_nwsl_paypal_park": StadiumInfo("stadium_nwsl_paypal_park", "PayPal Park", "San Jose", "CA", "USA", "nwsl", 37.3511, -121.9250),
|
||||
# Boston Legacy FC venues
|
||||
"stadium_nwsl_gillette_stadium": StadiumInfo("stadium_nwsl_gillette_stadium", "Gillette Stadium", "Foxborough", "MA", "USA", "nwsl", 42.0909, -71.2643),
|
||||
"stadium_nwsl_centreville_bank_stadium": StadiumInfo("stadium_nwsl_centreville_bank_stadium", "Centreville Bank Stadium", "Pawtucket", "RI", "USA", "nwsl", 41.8770, -71.3910),
|
||||
# Denver Summit FC venues
|
||||
"stadium_nwsl_empower_field": StadiumInfo("stadium_nwsl_empower_field", "Empower Field at Mile High", "Denver", "CO", "USA", "nwsl", 39.7439, -105.0201, "America/Denver"),
|
||||
"stadium_nwsl_dicks_sporting_goods_park": StadiumInfo("stadium_nwsl_dicks_sporting_goods_park", "Dick's Sporting Goods Park", "Commerce City", "CO", "USA", "nwsl", 39.8056, -104.8922, "America/Denver"),
|
||||
"stadium_nwsl_centennial_stadium": StadiumInfo("stadium_nwsl_centennial_stadium", "Centennial Stadium", "Centennial", "CO", "USA", "nwsl", 39.6000, -104.8800, "America/Denver"),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -265,6 +265,8 @@ TEAM_MAPPINGS: dict[str, dict[str, tuple[str, str, str]]] = {
|
||||
"SLC": ("team_nwsl_slc", "Utah Royals", "Utah"),
|
||||
"WAS": ("team_nwsl_was", "Washington Spirit", "Washington"),
|
||||
"BFC": ("team_nwsl_bfc", "Bay FC", "San Francisco"),
|
||||
"BOS": ("team_nwsl_bos", "Boston Legacy FC", "Boston"),
|
||||
"DEN": ("team_nwsl_den", "Denver Summit FC", "Denver"),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -185,9 +185,12 @@ class BaseScraper(ABC):
|
||||
"""
|
||||
sources = self._get_sources()
|
||||
last_error: Optional[str] = None
|
||||
sources_tried = 0
|
||||
max_sources_to_try = 2 # Don't try all sources if first few return nothing
|
||||
|
||||
for source in sources:
|
||||
self._logger.info(f"Trying source: {source}")
|
||||
sources_tried += 1
|
||||
|
||||
try:
|
||||
# Scrape raw data
|
||||
@@ -195,6 +198,12 @@ class BaseScraper(ABC):
|
||||
|
||||
if not raw_games:
|
||||
log_warning(f"No games found from {source}")
|
||||
# If multiple sources return nothing, the schedule likely doesn't exist
|
||||
if sources_tried >= max_sources_to_try:
|
||||
return ScrapeResult(
|
||||
success=False,
|
||||
error_message=f"No schedule data available (tried {sources_tried} sources)",
|
||||
)
|
||||
continue
|
||||
|
||||
self._logger.info(f"Found {len(raw_games)} raw games from {source}")
|
||||
@@ -216,7 +225,9 @@ class BaseScraper(ABC):
|
||||
except Exception as e:
|
||||
last_error = str(e)
|
||||
log_error(f"Failed to scrape from {source}: {e}", exc_info=True)
|
||||
# Discard partial data and try next source
|
||||
# If we've tried enough sources, bail out
|
||||
if sources_tried >= max_sources_to_try:
|
||||
break
|
||||
continue
|
||||
|
||||
# All sources failed
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""MLB scraper implementation with multi-source fallback."""
|
||||
|
||||
from datetime import datetime, date
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
@@ -45,7 +45,10 @@ class MLBScraper(BaseScraper):
|
||||
|
||||
def _get_sources(self) -> list[str]:
|
||||
"""Return source list in priority order."""
|
||||
return ["baseball_reference", "mlb_api", "espn"]
|
||||
# MLB API is best - returns full schedule in one request
|
||||
# ESPN caps at ~25 results for baseball
|
||||
# Baseball-Reference requires HTML parsing
|
||||
return ["mlb_api", "espn", "baseball_reference"]
|
||||
|
||||
def _get_source_url(self, source: str, **kwargs) -> str:
|
||||
"""Build URL for a source."""
|
||||
@@ -215,43 +218,29 @@ class MLBScraper(BaseScraper):
|
||||
)
|
||||
|
||||
def _scrape_mlb_api(self) -> list[RawGameData]:
|
||||
"""Scrape games from MLB Stats API.
|
||||
"""Scrape games from MLB Stats API using full season query."""
|
||||
# Build date range for entire season (March-November)
|
||||
season_months = self._get_season_months()
|
||||
start_year, start_month = season_months[0]
|
||||
end_year, end_month = season_months[-1]
|
||||
|
||||
MLB API allows date range queries.
|
||||
"""
|
||||
all_games: list[RawGameData] = []
|
||||
# Get last day of end month
|
||||
if end_month == 12:
|
||||
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
|
||||
else:
|
||||
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
|
||||
|
||||
# Query by month to avoid hitting API limits
|
||||
for year, month in self._get_season_months():
|
||||
start_date = date(year, month, 1)
|
||||
start_date = date(start_year, start_month, 1)
|
||||
|
||||
# Get last day of month
|
||||
if month == 12:
|
||||
end_date = date(year + 1, 1, 1)
|
||||
else:
|
||||
end_date = date(year, month + 1, 1)
|
||||
url = f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&startDate={start_date.strftime('%Y-%m-%d')}&endDate={end_date.strftime('%Y-%m-%d')}"
|
||||
self._logger.info(f"Fetching MLB schedule: {start_date} to {end_date}")
|
||||
|
||||
# Adjust end date to last day of month
|
||||
from datetime import timedelta
|
||||
end_date = end_date - timedelta(days=1)
|
||||
|
||||
url = self._get_source_url(
|
||||
"mlb_api",
|
||||
start_date=start_date.strftime("%Y-%m-%d"),
|
||||
end_date=end_date.strftime("%Y-%m-%d"),
|
||||
)
|
||||
|
||||
try:
|
||||
data = self.session.get_json(url)
|
||||
games = self._parse_mlb_api_response(data, url)
|
||||
all_games.extend(games)
|
||||
self._logger.debug(f"Found {len(games)} games in {year}-{month:02d}")
|
||||
|
||||
except Exception as e:
|
||||
self._logger.debug(f"MLB API error for {year}-{month}: {e}")
|
||||
continue
|
||||
|
||||
return all_games
|
||||
try:
|
||||
data = self.session.get_json(url)
|
||||
return self._parse_mlb_api_response(data, url)
|
||||
except Exception as e:
|
||||
self._logger.error(f"MLB API error: {e}")
|
||||
return []
|
||||
|
||||
def _parse_mlb_api_response(
|
||||
self,
|
||||
@@ -345,33 +334,30 @@ class MLBScraper(BaseScraper):
|
||||
)
|
||||
|
||||
def _scrape_espn(self) -> list[RawGameData]:
|
||||
"""Scrape games from ESPN API."""
|
||||
all_games: list[RawGameData] = []
|
||||
"""Scrape games from ESPN API using date range query."""
|
||||
# Build date range for entire season (March-November)
|
||||
season_months = self._get_season_months()
|
||||
start_year, start_month = season_months[0]
|
||||
end_year, end_month = season_months[-1]
|
||||
|
||||
for year, month in self._get_season_months():
|
||||
# Get number of days in month
|
||||
if month == 12:
|
||||
next_month = date(year + 1, 1, 1)
|
||||
else:
|
||||
next_month = date(year, month + 1, 1)
|
||||
# Get last day of end month
|
||||
if end_month == 12:
|
||||
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
|
||||
else:
|
||||
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
|
||||
|
||||
days_in_month = (next_month - date(year, month, 1)).days
|
||||
start_date = date(start_year, start_month, 1)
|
||||
date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
|
||||
|
||||
for day in range(1, days_in_month + 1):
|
||||
try:
|
||||
game_date = date(year, month, day)
|
||||
date_str = game_date.strftime("%Y%m%d")
|
||||
url = self._get_source_url("espn", date=date_str)
|
||||
url = f"https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard?limit=3000&dates={date_range}"
|
||||
self._logger.info(f"Fetching MLB schedule: {date_range}")
|
||||
|
||||
data = self.session.get_json(url)
|
||||
games = self._parse_espn_response(data, url)
|
||||
all_games.extend(games)
|
||||
|
||||
except Exception as e:
|
||||
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
|
||||
continue
|
||||
|
||||
return all_games
|
||||
try:
|
||||
data = self.session.get_json(url)
|
||||
return self._parse_espn_response(data, url)
|
||||
except Exception as e:
|
||||
self._logger.error(f"ESPN error: {e}")
|
||||
return []
|
||||
|
||||
def _parse_espn_response(
|
||||
self,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""MLS scraper implementation with multi-source fallback."""
|
||||
|
||||
from datetime import datetime, date
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
@@ -78,33 +78,30 @@ class MLSScraper(BaseScraper):
|
||||
raise ValueError(f"Unknown source: {source}")
|
||||
|
||||
def _scrape_espn(self) -> list[RawGameData]:
|
||||
"""Scrape games from ESPN API."""
|
||||
all_games: list[RawGameData] = []
|
||||
"""Scrape games from ESPN API using date range query."""
|
||||
# Build date range for entire season (Feb-November)
|
||||
season_months = self._get_season_months()
|
||||
start_year, start_month = season_months[0]
|
||||
end_year, end_month = season_months[-1]
|
||||
|
||||
for year, month in self._get_season_months():
|
||||
# Get number of days in month
|
||||
if month == 12:
|
||||
next_month = date(year + 1, 1, 1)
|
||||
else:
|
||||
next_month = date(year, month + 1, 1)
|
||||
# Get last day of end month
|
||||
if end_month == 12:
|
||||
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
|
||||
else:
|
||||
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
|
||||
|
||||
days_in_month = (next_month - date(year, month, 1)).days
|
||||
start_date = date(start_year, start_month, 1)
|
||||
date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
|
||||
|
||||
for day in range(1, days_in_month + 1):
|
||||
try:
|
||||
game_date = date(year, month, day)
|
||||
date_str = game_date.strftime("%Y%m%d")
|
||||
url = self._get_source_url("espn", date=date_str)
|
||||
url = f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.1/scoreboard?limit=1000&dates={date_range}"
|
||||
self._logger.info(f"Fetching MLS schedule: {date_range}")
|
||||
|
||||
data = self.session.get_json(url)
|
||||
games = self._parse_espn_response(data, url)
|
||||
all_games.extend(games)
|
||||
|
||||
except Exception as e:
|
||||
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
|
||||
continue
|
||||
|
||||
return all_games
|
||||
try:
|
||||
data = self.session.get_json(url)
|
||||
return self._parse_espn_response(data, url)
|
||||
except Exception as e:
|
||||
self._logger.error(f"ESPN error: {e}")
|
||||
return []
|
||||
|
||||
def _parse_espn_response(
|
||||
self,
|
||||
|
||||
@@ -95,9 +95,11 @@ class NBAScraper(BaseScraper):
|
||||
BR organizes games by month with separate pages.
|
||||
Format: https://www.basketball-reference.com/leagues/NBA_YYYY_games-month.html
|
||||
where YYYY is the ending year of the season.
|
||||
Bails early if first few months have no data (season doesn't exist).
|
||||
"""
|
||||
all_games: list[RawGameData] = []
|
||||
end_year = self.season + 1
|
||||
consecutive_empty_months = 0
|
||||
|
||||
for month in BR_MONTHS:
|
||||
url = self._get_source_url("basketball_reference", month=month, year=end_year)
|
||||
@@ -105,13 +107,23 @@ class NBAScraper(BaseScraper):
|
||||
try:
|
||||
html = self.session.get_html(url)
|
||||
games = self._parse_basketball_reference(html, url)
|
||||
all_games.extend(games)
|
||||
self._logger.debug(f"Found {len(games)} games in {month}")
|
||||
|
||||
if games:
|
||||
all_games.extend(games)
|
||||
consecutive_empty_months = 0
|
||||
self._logger.debug(f"Found {len(games)} games in {month}")
|
||||
else:
|
||||
consecutive_empty_months += 1
|
||||
|
||||
except Exception as e:
|
||||
# Some months may not exist (e.g., no games in August)
|
||||
self._logger.debug(f"No data for {month}: {e}")
|
||||
continue
|
||||
consecutive_empty_months += 1
|
||||
|
||||
# If first 3 months (Oct, Nov, Dec) all have no data, season doesn't exist
|
||||
if consecutive_empty_months >= 3 and not all_games:
|
||||
self._logger.info(f"No games found in first {consecutive_empty_months} months, season likely doesn't exist")
|
||||
break
|
||||
|
||||
return all_games
|
||||
|
||||
@@ -247,8 +259,11 @@ class NBAScraper(BaseScraper):
|
||||
|
||||
ESPN API returns games for a specific date range.
|
||||
We iterate through each day of the season.
|
||||
Bails out early if no games found after checking first month.
|
||||
"""
|
||||
all_games: list[RawGameData] = []
|
||||
consecutive_empty_days = 0
|
||||
max_empty_days = 45 # Bail after ~1.5 months of no games
|
||||
|
||||
for year, month in self._get_season_months():
|
||||
# Get number of days in month
|
||||
@@ -267,10 +282,25 @@ class NBAScraper(BaseScraper):
|
||||
|
||||
data = self.session.get_json(url)
|
||||
games = self._parse_espn_response(data, url)
|
||||
all_games.extend(games)
|
||||
|
||||
if games:
|
||||
all_games.extend(games)
|
||||
consecutive_empty_days = 0
|
||||
else:
|
||||
consecutive_empty_days += 1
|
||||
|
||||
# Bail early if no games found for a long stretch
|
||||
if consecutive_empty_days >= max_empty_days:
|
||||
self._logger.info(f"No games found for {max_empty_days} consecutive days, stopping ESPN scrape")
|
||||
return all_games
|
||||
|
||||
except Exception as e:
|
||||
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
|
||||
consecutive_empty_days += 1
|
||||
|
||||
if consecutive_empty_days >= max_empty_days:
|
||||
self._logger.info(f"Too many consecutive failures, stopping ESPN scrape")
|
||||
return all_games
|
||||
continue
|
||||
|
||||
return all_games
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""NWSL scraper implementation with multi-source fallback."""
|
||||
|
||||
from datetime import datetime, date
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
@@ -73,33 +73,30 @@ class NWSLScraper(BaseScraper):
|
||||
raise ValueError(f"Unknown source: {source}")
|
||||
|
||||
def _scrape_espn(self) -> list[RawGameData]:
|
||||
"""Scrape games from ESPN API."""
|
||||
all_games: list[RawGameData] = []
|
||||
"""Scrape games from ESPN API using date range query."""
|
||||
# Build date range for entire season (March-November)
|
||||
season_months = self._get_season_months()
|
||||
start_year, start_month = season_months[0]
|
||||
end_year, end_month = season_months[-1]
|
||||
|
||||
for year, month in self._get_season_months():
|
||||
# Get number of days in month
|
||||
if month == 12:
|
||||
next_month = date(year + 1, 1, 1)
|
||||
else:
|
||||
next_month = date(year, month + 1, 1)
|
||||
# Get last day of end month
|
||||
if end_month == 12:
|
||||
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
|
||||
else:
|
||||
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
|
||||
|
||||
days_in_month = (next_month - date(year, month, 1)).days
|
||||
start_date = date(start_year, start_month, 1)
|
||||
date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
|
||||
|
||||
for day in range(1, days_in_month + 1):
|
||||
try:
|
||||
game_date = date(year, month, day)
|
||||
date_str = game_date.strftime("%Y%m%d")
|
||||
url = self._get_source_url("espn", date=date_str)
|
||||
url = f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.nwsl/scoreboard?limit=1000&dates={date_range}"
|
||||
self._logger.info(f"Fetching NWSL schedule: {date_range}")
|
||||
|
||||
data = self.session.get_json(url)
|
||||
games = self._parse_espn_response(data, url)
|
||||
all_games.extend(games)
|
||||
|
||||
except Exception as e:
|
||||
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
|
||||
continue
|
||||
|
||||
return all_games
|
||||
try:
|
||||
data = self.session.get_json(url)
|
||||
return self._parse_espn_response(data, url)
|
||||
except Exception as e:
|
||||
self._logger.error(f"ESPN error: {e}")
|
||||
return []
|
||||
|
||||
def _parse_espn_response(
|
||||
self,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""WNBA scraper implementation with multi-source fallback."""
|
||||
|
||||
from datetime import datetime, date
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
@@ -73,33 +73,30 @@ class WNBAScraper(BaseScraper):
|
||||
raise ValueError(f"Unknown source: {source}")
|
||||
|
||||
def _scrape_espn(self) -> list[RawGameData]:
|
||||
"""Scrape games from ESPN API."""
|
||||
all_games: list[RawGameData] = []
|
||||
"""Scrape games from ESPN API using date range query."""
|
||||
# Build date range for entire season (May-October)
|
||||
season_months = self._get_season_months()
|
||||
start_year, start_month = season_months[0]
|
||||
end_year, end_month = season_months[-1]
|
||||
|
||||
for year, month in self._get_season_months():
|
||||
# Get number of days in month
|
||||
if month == 12:
|
||||
next_month = date(year + 1, 1, 1)
|
||||
else:
|
||||
next_month = date(year, month + 1, 1)
|
||||
# Get last day of end month
|
||||
if end_month == 12:
|
||||
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
|
||||
else:
|
||||
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
|
||||
|
||||
days_in_month = (next_month - date(year, month, 1)).days
|
||||
start_date = date(start_year, start_month, 1)
|
||||
date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
|
||||
|
||||
for day in range(1, days_in_month + 1):
|
||||
try:
|
||||
game_date = date(year, month, day)
|
||||
date_str = game_date.strftime("%Y%m%d")
|
||||
url = self._get_source_url("espn", date=date_str)
|
||||
url = f"https://site.api.espn.com/apis/site/v2/sports/basketball/wnba/scoreboard?limit=1000&dates={date_range}"
|
||||
self._logger.info(f"Fetching WNBA schedule: {date_range}")
|
||||
|
||||
data = self.session.get_json(url)
|
||||
games = self._parse_espn_response(data, url)
|
||||
all_games.extend(games)
|
||||
|
||||
except Exception as e:
|
||||
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
|
||||
continue
|
||||
|
||||
return all_games
|
||||
try:
|
||||
data = self.session.get_json(url)
|
||||
return self._parse_espn_response(data, url)
|
||||
except Exception as e:
|
||||
self._logger.error(f"ESPN error: {e}")
|
||||
return []
|
||||
|
||||
def _parse_espn_response(
|
||||
self,
|
||||
|
||||
@@ -8,10 +8,25 @@ from .report import (
|
||||
validate_games,
|
||||
)
|
||||
|
||||
from .schema import (
|
||||
SchemaValidationError,
|
||||
validate_canonical_stadium,
|
||||
validate_canonical_team,
|
||||
validate_canonical_game,
|
||||
validate_and_raise,
|
||||
validate_batch,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"ValidationReport",
|
||||
"ValidationSummary",
|
||||
"generate_report",
|
||||
"detect_duplicate_games",
|
||||
"validate_games",
|
||||
"SchemaValidationError",
|
||||
"validate_canonical_stadium",
|
||||
"validate_canonical_team",
|
||||
"validate_canonical_game",
|
||||
"validate_and_raise",
|
||||
"validate_batch",
|
||||
]
|
||||
|
||||
246
Scripts/sportstime_parser/validators/schema.py
Normal file
246
Scripts/sportstime_parser/validators/schema.py
Normal file
@@ -0,0 +1,246 @@
|
||||
"""JSON Schema validation for canonical output matching iOS app expectations.
|
||||
|
||||
This module defines schemas that match the Swift structs in BootstrapService.swift:
|
||||
- JSONCanonicalStadium
|
||||
- JSONCanonicalTeam
|
||||
- JSONCanonicalGame
|
||||
|
||||
Validation is performed at runtime before outputting JSON to ensure
|
||||
Python output matches what the iOS app expects.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable, Optional, Union
|
||||
|
||||
|
||||
class SchemaValidationError(Exception):
    """Raised when canonical output fails schema validation."""

    def __init__(self, model_type: str, errors: list[str]):
        # Keep the raw pieces available for programmatic inspection.
        self.model_type = model_type
        self.errors = errors
        header = f"{model_type} schema validation failed:"
        details = "\n".join(f"  - {err}" for err in errors)
        super().__init__(f"{header}\n{details}")
|
||||
|
||||
|
||||
# ISO8601 UTC datetime pattern: YYYY-MM-DDTHH:MM:SSZ
# Deliberately strict: only the trailing-'Z' form is accepted (no "+00:00"
# offsets, no fractional seconds), keeping output uniform for the consumer.
ISO8601_UTC_PATTERN = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$")

# Season format patterns
SEASON_SPLIT_PATTERN = re.compile(r"^\d{4}-\d{2}$")  # e.g., "2025-26" (season spanning two calendar years)
SEASON_SINGLE_PATTERN = re.compile(r"^\d{4}$")  # e.g., "2025" (single calendar-year season)
|
||||
|
||||
|
||||
@dataclass
class FieldSpec:
    """Specification for a single field in a canonical schema."""

    # JSON key expected in the canonical dictionary.
    name: str
    # If True, the key must be present; optional keys may be absent entirely.
    required: bool
    # Accepted type, or a tuple of accepted types (passed to isinstance()).
    field_type: Union[type, tuple]
    # Optional extra check applied to non-None values; truthy result = valid.
    validator: Optional[Callable] = None
|
||||
|
||||
|
||||
# Schema definitions matching Swift structs in BootstrapService.swift
|
||||
|
||||
# Mirrors the Swift JSONCanonicalStadium struct (see module docstring).
STADIUM_SCHEMA: list[FieldSpec] = [
    FieldSpec("canonical_id", required=True, field_type=str),
    FieldSpec("name", required=True, field_type=str),
    FieldSpec("city", required=True, field_type=str),
    FieldSpec("state", required=True, field_type=str),
    # Coordinates may parse from JSON as int or float, so accept both.
    FieldSpec("latitude", required=True, field_type=(int, float)),
    FieldSpec("longitude", required=True, field_type=(int, float)),
    FieldSpec("capacity", required=True, field_type=int),
    FieldSpec("sport", required=True, field_type=str),
    # List contents are checked element-wise in validate_canonical_stadium().
    FieldSpec("primary_team_abbrevs", required=True, field_type=list),
    # Optional: may be absent or explicitly null.
    FieldSpec("year_opened", required=False, field_type=(int, type(None))),
]
|
||||
|
||||
# Mirrors the Swift JSONCanonicalTeam struct (see module docstring).
TEAM_SCHEMA: list[FieldSpec] = [
    FieldSpec("canonical_id", required=True, field_type=str),
    FieldSpec("name", required=True, field_type=str),
    FieldSpec("abbreviation", required=True, field_type=str),
    FieldSpec("sport", required=True, field_type=str),
    FieldSpec("city", required=True, field_type=str),
    FieldSpec("stadium_canonical_id", required=True, field_type=str),
    # Optional fields: each may be absent or explicitly null.
    FieldSpec("conference_id", required=False, field_type=(str, type(None))),
    FieldSpec("division_id", required=False, field_type=(str, type(None))),
    FieldSpec("primary_color", required=False, field_type=(str, type(None))),
    FieldSpec("secondary_color", required=False, field_type=(str, type(None))),
]
|
||||
|
||||
# Mirrors the Swift JSONCanonicalGame struct (see module docstring).
GAME_SCHEMA: list[FieldSpec] = [
    FieldSpec("canonical_id", required=True, field_type=str),
    FieldSpec("sport", required=True, field_type=str),
    # Accepts either a split season ("2025-26") or a single year ("2025").
    FieldSpec(
        "season",
        required=True,
        field_type=str,
        validator=lambda v: SEASON_SPLIT_PATTERN.match(v) or SEASON_SINGLE_PATTERN.match(v),
    ),
    # Must use the strict trailing-'Z' ISO8601 form, e.g. "2025-10-21T23:30:00Z".
    FieldSpec(
        "game_datetime_utc",
        required=True,
        field_type=str,
        validator=lambda v: ISO8601_UTC_PATTERN.match(v),
    ),
    FieldSpec("home_team_canonical_id", required=True, field_type=str),
    FieldSpec("away_team_canonical_id", required=True, field_type=str),
    FieldSpec("stadium_canonical_id", required=True, field_type=str),
    FieldSpec("is_playoff", required=True, field_type=bool),
    # Optional: may be absent or explicitly null.
    FieldSpec("broadcast", required=False, field_type=(str, type(None))),
]
|
||||
|
||||
|
||||
def validate_field(data: dict[str, Any], spec: "FieldSpec") -> list[str]:
    """Validate a single field against its specification.

    Args:
        data: The dictionary to validate
        spec: The field specification

    Returns:
        List of error messages (empty if valid)
    """
    errors: list[str] = []

    if spec.name not in data:
        if spec.required:
            errors.append(f"Missing required field: {spec.name}")
        # Optional fields may be absent entirely; nothing more to check.
        return errors

    value = data[spec.name]

    # Check type. Note bool is a subclass of int, so isinstance(True, int)
    # is True -- explicitly reject booleans unless bool itself is an accepted
    # type, since JSON consumers distinguish booleans from numbers.
    allowed = spec.field_type if isinstance(spec.field_type, tuple) else (spec.field_type,)
    bool_masquerading_as_number = isinstance(value, bool) and bool not in allowed
    if bool_masquerading_as_number or not isinstance(value, spec.field_type):
        expected = spec.field_type.__name__ if isinstance(spec.field_type, type) else str(spec.field_type)
        actual = type(value).__name__
        errors.append(f"Field '{spec.name}' has wrong type: expected {expected}, got {actual} (value: {value!r})")
        return errors

    # Custom validator runs only on non-None values; a truthy return (e.g. a
    # re.Match) counts as valid.
    if spec.validator and value is not None:
        if not spec.validator(value):
            errors.append(f"Field '{spec.name}' failed validation: {value!r}")

    return errors
|
||||
|
||||
|
||||
def validate_canonical_stadium(data: dict[str, Any]) -> list[str]:
    """Validate a canonical stadium dictionary.

    Args:
        data: Stadium dictionary from to_canonical_dict()

    Returns:
        List of error messages (empty if valid)
    """
    problems: list[str] = []
    for field_spec in STADIUM_SCHEMA:
        problems += validate_field(data, field_spec)

    # Element-level check: every entry of primary_team_abbrevs must be a str.
    abbrevs = data.get("primary_team_abbrevs")
    if isinstance(abbrevs, list):
        problems += [
            f"primary_team_abbrevs[{idx}] must be string, got {type(entry).__name__}"
            for idx, entry in enumerate(abbrevs)
            if not isinstance(entry, str)
        ]

    return problems
|
||||
|
||||
|
||||
def validate_canonical_team(data: dict[str, Any]) -> list[str]:
    """Validate a canonical team dictionary.

    Args:
        data: Team dictionary from to_canonical_dict()

    Returns:
        List of error messages (empty if valid)
    """
    # Flatten the per-field error lists into a single message list.
    return [
        message
        for field_spec in TEAM_SCHEMA
        for message in validate_field(data, field_spec)
    ]
|
||||
|
||||
|
||||
def validate_canonical_game(data: dict[str, Any]) -> list[str]:
    """Validate a canonical game dictionary.

    Args:
        data: Game dictionary from to_canonical_dict()

    Returns:
        List of error messages (empty if valid)
    """
    # Run every field spec; an empty accumulated list means the game is valid.
    messages: list[str] = []
    for field_spec in GAME_SCHEMA:
        messages += validate_field(data, field_spec)
    return messages
|
||||
|
||||
|
||||
def validate_and_raise(data: dict[str, Any], model_type: str) -> None:
    """Validate a canonical dictionary and raise on error.

    Args:
        data: Dictionary from to_canonical_dict()
        model_type: One of 'stadium', 'team', 'game'

    Raises:
        SchemaValidationError: If validation fails
        ValueError: If model_type is unknown
    """
    # Dispatch on model_type; reject anything outside the known trio.
    if model_type == "stadium":
        validator = validate_canonical_stadium
    elif model_type == "team":
        validator = validate_canonical_team
    elif model_type == "game":
        validator = validate_canonical_game
    else:
        raise ValueError(f"Unknown model type: {model_type}")

    problems = validator(data)
    if problems:
        raise SchemaValidationError(model_type, problems)
|
||||
|
||||
|
||||
def validate_batch(
    items: list[dict[str, Any]],
    model_type: str,
    fail_fast: bool = True,
) -> list[tuple[int, list[str]]]:
    """Validate a batch of canonical dictionaries.

    Args:
        items: List of dictionaries from to_canonical_dict()
        model_type: One of 'stadium', 'team', 'game'
        fail_fast: If True, raise on first error; if False, collect all errors

    Returns:
        List of (index, errors) tuples for items with validation errors

    Raises:
        SchemaValidationError: If fail_fast=True and validation fails
        ValueError: If model_type is unknown
    """
    validator_by_type = {
        "stadium": validate_canonical_stadium,
        "team": validate_canonical_team,
        "game": validate_canonical_game,
    }
    validator = validator_by_type.get(model_type)
    if validator is None:
        raise ValueError(f"Unknown model type: {model_type}")

    failures: list[tuple[int, list[str]]] = []
    for index, item in enumerate(items):
        messages = validator(item)
        if not messages:
            continue
        if fail_fast:
            # Prefix each message with the item index so the exception
            # pinpoints the offending record.
            raise SchemaValidationError(
                model_type,
                [f"Item {index}: {msg}" for msg in messages],
            )
        failures.append((index, messages))

    return failures
|
||||
Reference in New Issue
Block a user