feat(scripts): complete data pipeline remediation

Scripts changes:
- Add WNBA abbreviation aliases to team_resolver.py
- Fix NHL stadium coordinates in stadium_resolver.py
- Add validate_aliases.py script for orphan detection
- Update scrapers with improved error handling
- Add DATA_AUDIT.md and REMEDIATION_PLAN.md documentation
- Update alias JSON files with new mappings

iOS bundle updates:
- Update games_canonical.json with latest scraped data
- Update teams_canonical.json and stadiums_canonical.json
- Sync alias files with Scripts versions

All 5 remediation phases complete.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 18:58:47 -06:00
parent 51419fccf2
commit 8ea3e6112a
21 changed files with 56592 additions and 35714 deletions
@@ -186,7 +186,9 @@ class BaseScraper(ABC):
        sources = self._get_sources()
        last_error: Optional[str] = None
        sources_tried = 0
-        max_sources_to_try = 2  # Don't try all sources if first few return nothing
+        # Allow 3 sources to be tried. This enables NHL to fall back to NHL API
+        # for venue data since Hockey Reference doesn't provide it.
+        max_sources_to_try = 3

        for source in sources:
            self._logger.info(f"Trying source: {source}")
@@ -42,7 +42,8 @@ class MLSScraper(BaseScraper):

    def _get_sources(self) -> list[str]:
        """Return source list in priority order."""
-        return ["espn", "fbref"]
+        # FBref scraper not yet implemented - TODO for future
+        return ["espn"]

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build URL for a source."""
@@ -60,7 +60,8 @@ class NBAScraper(BaseScraper):

    def _get_sources(self) -> list[str]:
        """Return source list in priority order."""
-        return ["basketball_reference", "espn", "cbs"]
+        # CBS scraper not yet implemented - TODO for future
+        return ["basketball_reference", "espn"]

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build URL for a source."""
@@ -48,7 +48,8 @@ class NFLScraper(BaseScraper):

    def _get_sources(self) -> list[str]:
        """Return source list in priority order."""
-        return ["espn", "pro_football_reference", "cbs"]
+        # CBS scraper not yet implemented - TODO for future
+        return ["espn", "pro_football_reference"]

    def _get_source_url(self, source: str, **kwargs) -> str:
        """Build URL for a source."""
@@ -531,6 +531,16 @@ class NHLScraper(BaseScraper):

            stadium_id = stadium_result.canonical_id

+        # Fallback: Use home team's default stadium if no venue provided
+        # This is common for Hockey-Reference which doesn't have venue data
+        if not stadium_id:
+            home_team_data = TEAM_MAPPINGS.get("nhl", {})
+            home_abbrev = self._get_abbreviation(home_result.canonical_id)
+            for abbrev, (team_id, _, _, default_stadium) in home_team_data.items():
+                if team_id == home_result.canonical_id:
+                    stadium_id = default_stadium
+                    break
+
        # Get abbreviations for game ID
        home_abbrev = self._get_abbreviation(home_result.canonical_id)
        away_abbrev = self._get_abbreviation(away_result.canonical_id)