This commit is contained in:
Trey t
2026-01-19 22:12:53 -06:00
parent 11c0ae70d2
commit a8b0491571
19 changed files with 1328 additions and 525 deletions

View File

@@ -221,11 +221,11 @@ def get_scraper(sport: str, season: int):
def cmd_scrape(args: argparse.Namespace) -> int:
"""Execute the scrape command."""
from .models.game import save_games
from .models.team import save_teams
from .models.stadium import save_stadiums
"""Execute the scrape command with canonical output format."""
import json
from .validators.report import generate_report, validate_games
from .normalizers.timezone import get_stadium_timezone
from .validators.schema import SchemaValidationError, validate_batch
logger = get_logger()
@@ -282,14 +282,60 @@ def cmd_scrape(args: argparse.Namespace) -> int:
logger.info(f"Review items: {report.summary.review_count}")
if not args.dry_run:
# Save output files
# Build mappings for canonical conversion
stadium_timezone_map: dict[str, str] = {}
for stadium in result.stadiums:
tz = get_stadium_timezone(stadium.state, stadium.timezone)
stadium_timezone_map[stadium.id] = tz
stadium_team_abbrevs: dict[str, list[str]] = {}
for team in result.teams:
if team.stadium_id:
if team.stadium_id not in stadium_team_abbrevs:
stadium_team_abbrevs[team.stadium_id] = []
stadium_team_abbrevs[team.stadium_id].append(team.abbreviation)
# Convert to canonical format
canonical_stadiums = [
s.to_canonical_dict(primary_team_abbrevs=stadium_team_abbrevs.get(s.id, []))
for s in result.stadiums
]
canonical_teams = [t.to_canonical_dict() for t in result.teams]
canonical_games = [
g.to_canonical_dict(stadium_timezone=stadium_timezone_map.get(g.stadium_id, "America/New_York"))
for g in result.games
]
# Validate canonical output
stadium_errors = validate_batch(canonical_stadiums, "stadium", fail_fast=False)
team_errors = validate_batch(canonical_teams, "team", fail_fast=False)
game_errors = validate_batch(canonical_games, "game", fail_fast=False)
if stadium_errors or team_errors or game_errors:
for idx, errors in stadium_errors:
for e in errors:
logger.error(f"Stadium {result.stadiums[idx].id}: {e}")
for idx, errors in team_errors:
for e in errors:
logger.error(f"Team {result.teams[idx].id}: {e}")
for idx, errors in game_errors[:10]:
for e in errors:
logger.error(f"Game {result.games[idx].id}: {e}")
if len(game_errors) > 10:
logger.error(f"... and {len(game_errors) - 10} more game errors")
raise SchemaValidationError("canonical", ["Schema validation failed"])
# Save canonical output files
games_file = OUTPUT_DIR / f"games_{sport}_{args.season}.json"
teams_file = OUTPUT_DIR / f"teams_{sport}.json"
stadiums_file = OUTPUT_DIR / f"stadiums_{sport}.json"
save_games(result.games, str(games_file))
save_teams(result.teams, str(teams_file))
save_stadiums(result.stadiums, str(stadiums_file))
with open(games_file, "w", encoding="utf-8") as f:
json.dump(canonical_games, f, indent=2)
with open(teams_file, "w", encoding="utf-8") as f:
json.dump(canonical_teams, f, indent=2)
with open(stadiums_file, "w", encoding="utf-8") as f:
json.dump(canonical_stadiums, f, indent=2)
# Save validation report
report_path = report.save()
@@ -307,6 +353,11 @@ def cmd_scrape(args: argparse.Namespace) -> int:
failure_count += 1
continue
except SchemaValidationError as e:
log_failure(f"{sport.upper()}: {e}")
failure_count += 1
continue
except Exception as e:
log_failure(f"{sport.upper()}: {e}")
logger.exception("Scraping failed")