wip

2026-01-19 22:12:53 -06:00
parent 11c0ae70d2
commit a8b0491571
19 changed files with 1328 additions and 525 deletions
--- a/Scripts/sportstime_parser/cli.py
+++ b/Scripts/sportstime_parser/cli.py
@@ -221,11 +221,11 @@ def get_scraper(sport: str, season: int):


 def cmd_scrape(args: argparse.Namespace) -> int:
-    """Execute the scrape command."""
-    from .models.game import save_games
-    from .models.team import save_teams
-    from .models.stadium import save_stadiums
+    """Execute the scrape command with canonical output format."""
+    import json
    from .validators.report import generate_report, validate_games
+    from .normalizers.timezone import get_stadium_timezone
+    from .validators.schema import SchemaValidationError, validate_batch

    logger = get_logger()

@@ -282,14 +282,60 @@ def cmd_scrape(args: argparse.Namespace) -> int:
            logger.info(f"Review items: {report.summary.review_count}")

            if not args.dry_run:
-                # Save output files
+                # Build mappings for canonical conversion
+                stadium_timezone_map: dict[str, str] = {}
+                for stadium in result.stadiums:
+                    tz = get_stadium_timezone(stadium.state, stadium.timezone)
+                    stadium_timezone_map[stadium.id] = tz
+
+                stadium_team_abbrevs: dict[str, list[str]] = {}
+                for team in result.teams:
+                    if team.stadium_id:
+                        if team.stadium_id not in stadium_team_abbrevs:
+                            stadium_team_abbrevs[team.stadium_id] = []
+                        stadium_team_abbrevs[team.stadium_id].append(team.abbreviation)
+
+                # Convert to canonical format
+                canonical_stadiums = [
+                    s.to_canonical_dict(primary_team_abbrevs=stadium_team_abbrevs.get(s.id, []))
+                    for s in result.stadiums
+                ]
+                canonical_teams = [t.to_canonical_dict() for t in result.teams]
+                canonical_games = [
+                    g.to_canonical_dict(stadium_timezone=stadium_timezone_map.get(g.stadium_id, "America/New_York"))
+                    for g in result.games
+                ]
+
+                # Validate canonical output
+                stadium_errors = validate_batch(canonical_stadiums, "stadium", fail_fast=False)
+                team_errors = validate_batch(canonical_teams, "team", fail_fast=False)
+                game_errors = validate_batch(canonical_games, "game", fail_fast=False)
+
+                if stadium_errors or team_errors or game_errors:
+                    for idx, errors in stadium_errors:
+                        for e in errors:
+                            logger.error(f"Stadium {result.stadiums[idx].id}: {e}")
+                    for idx, errors in team_errors:
+                        for e in errors:
+                            logger.error(f"Team {result.teams[idx].id}: {e}")
+                    for idx, errors in game_errors[:10]:
+                        for e in errors:
+                            logger.error(f"Game {result.games[idx].id}: {e}")
+                    if len(game_errors) > 10:
+                        logger.error(f"... and {len(game_errors) - 10} more game errors")
+                    raise SchemaValidationError("canonical", ["Schema validation failed"])
+
+                # Save canonical output files
                games_file = OUTPUT_DIR / f"games_{sport}_{args.season}.json"
                teams_file = OUTPUT_DIR / f"teams_{sport}.json"
                stadiums_file = OUTPUT_DIR / f"stadiums_{sport}.json"

-                save_games(result.games, str(games_file))
-                save_teams(result.teams, str(teams_file))
-                save_stadiums(result.stadiums, str(stadiums_file))
+                with open(games_file, "w", encoding="utf-8") as f:
+                    json.dump(canonical_games, f, indent=2)
+                with open(teams_file, "w", encoding="utf-8") as f:
+                    json.dump(canonical_teams, f, indent=2)
+                with open(stadiums_file, "w", encoding="utf-8") as f:
+                    json.dump(canonical_stadiums, f, indent=2)

                # Save validation report
                report_path = report.save()
@@ -307,6 +353,11 @@ def cmd_scrape(args: argparse.Namespace) -> int:
            failure_count += 1
            continue

+        except SchemaValidationError as e:
+            log_failure(f"{sport.upper()}: {e}")
+            failure_count += 1
+            continue
+
        except Exception as e:
            log_failure(f"{sport.upper()}: {e}")
            logger.exception("Scraping failed")