wip
This commit is contained in:
@@ -221,11 +221,11 @@ def get_scraper(sport: str, season: int):
|
||||
|
||||
|
||||
def cmd_scrape(args: argparse.Namespace) -> int:
|
||||
"""Execute the scrape command."""
|
||||
from .models.game import save_games
|
||||
from .models.team import save_teams
|
||||
from .models.stadium import save_stadiums
|
||||
"""Execute the scrape command with canonical output format."""
|
||||
import json
|
||||
from .validators.report import generate_report, validate_games
|
||||
from .normalizers.timezone import get_stadium_timezone
|
||||
from .validators.schema import SchemaValidationError, validate_batch
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
@@ -282,14 +282,60 @@ def cmd_scrape(args: argparse.Namespace) -> int:
|
||||
logger.info(f"Review items: {report.summary.review_count}")
|
||||
|
||||
if not args.dry_run:
|
||||
# Save output files
|
||||
# Build mappings for canonical conversion
|
||||
stadium_timezone_map: dict[str, str] = {}
|
||||
for stadium in result.stadiums:
|
||||
tz = get_stadium_timezone(stadium.state, stadium.timezone)
|
||||
stadium_timezone_map[stadium.id] = tz
|
||||
|
||||
stadium_team_abbrevs: dict[str, list[str]] = {}
|
||||
for team in result.teams:
|
||||
if team.stadium_id:
|
||||
if team.stadium_id not in stadium_team_abbrevs:
|
||||
stadium_team_abbrevs[team.stadium_id] = []
|
||||
stadium_team_abbrevs[team.stadium_id].append(team.abbreviation)
|
||||
|
||||
# Convert to canonical format
|
||||
canonical_stadiums = [
|
||||
s.to_canonical_dict(primary_team_abbrevs=stadium_team_abbrevs.get(s.id, []))
|
||||
for s in result.stadiums
|
||||
]
|
||||
canonical_teams = [t.to_canonical_dict() for t in result.teams]
|
||||
canonical_games = [
|
||||
g.to_canonical_dict(stadium_timezone=stadium_timezone_map.get(g.stadium_id, "America/New_York"))
|
||||
for g in result.games
|
||||
]
|
||||
|
||||
# Validate canonical output
|
||||
stadium_errors = validate_batch(canonical_stadiums, "stadium", fail_fast=False)
|
||||
team_errors = validate_batch(canonical_teams, "team", fail_fast=False)
|
||||
game_errors = validate_batch(canonical_games, "game", fail_fast=False)
|
||||
|
||||
if stadium_errors or team_errors or game_errors:
|
||||
for idx, errors in stadium_errors:
|
||||
for e in errors:
|
||||
logger.error(f"Stadium {result.stadiums[idx].id}: {e}")
|
||||
for idx, errors in team_errors:
|
||||
for e in errors:
|
||||
logger.error(f"Team {result.teams[idx].id}: {e}")
|
||||
for idx, errors in game_errors[:10]:
|
||||
for e in errors:
|
||||
logger.error(f"Game {result.games[idx].id}: {e}")
|
||||
if len(game_errors) > 10:
|
||||
logger.error(f"... and {len(game_errors) - 10} more game errors")
|
||||
raise SchemaValidationError("canonical", ["Schema validation failed"])
|
||||
|
||||
# Save canonical output files
|
||||
games_file = OUTPUT_DIR / f"games_{sport}_{args.season}.json"
|
||||
teams_file = OUTPUT_DIR / f"teams_{sport}.json"
|
||||
stadiums_file = OUTPUT_DIR / f"stadiums_{sport}.json"
|
||||
|
||||
save_games(result.games, str(games_file))
|
||||
save_teams(result.teams, str(teams_file))
|
||||
save_stadiums(result.stadiums, str(stadiums_file))
|
||||
with open(games_file, "w", encoding="utf-8") as f:
|
||||
json.dump(canonical_games, f, indent=2)
|
||||
with open(teams_file, "w", encoding="utf-8") as f:
|
||||
json.dump(canonical_teams, f, indent=2)
|
||||
with open(stadiums_file, "w", encoding="utf-8") as f:
|
||||
json.dump(canonical_stadiums, f, indent=2)
|
||||
|
||||
# Save validation report
|
||||
report_path = report.save()
|
||||
@@ -307,6 +353,11 @@ def cmd_scrape(args: argparse.Namespace) -> int:
|
||||
failure_count += 1
|
||||
continue
|
||||
|
||||
except SchemaValidationError as e:
|
||||
log_failure(f"{sport.upper()}: {e}")
|
||||
failure_count += 1
|
||||
continue
|
||||
|
||||
except Exception as e:
|
||||
log_failure(f"{sport.upper()}: {e}")
|
||||
logger.exception("Scraping failed")
|
||||
|
||||
Reference in New Issue
Block a user