This commit is contained in:
Trey t
2026-01-19 22:12:53 -06:00
parent 11c0ae70d2
commit a8b0491571
19 changed files with 1328 additions and 525 deletions

View File

@@ -95,9 +95,11 @@ class NBAScraper(BaseScraper):
BR organizes games by month with separate pages.
Format: https://www.basketball-reference.com/leagues/NBA_YYYY_games-month.html
where YYYY is the ending year of the season.
Bails early if the first three months checked yield no games or fail to load (the season likely doesn't exist).
"""
all_games: list[RawGameData] = []
end_year = self.season + 1
consecutive_empty_months = 0
for month in BR_MONTHS:
url = self._get_source_url("basketball_reference", month=month, year=end_year)
@@ -105,13 +107,23 @@ class NBAScraper(BaseScraper):
try:
html = self.session.get_html(url)
games = self._parse_basketball_reference(html, url)
all_games.extend(games)
self._logger.debug(f"Found {len(games)} games in {month}")
if games:
all_games.extend(games)
consecutive_empty_months = 0
self._logger.debug(f"Found {len(games)} games in {month}")
else:
consecutive_empty_months += 1
except Exception as e:
# Some months may not exist (e.g., no games in August)
self._logger.debug(f"No data for {month}: {e}")
continue
consecutive_empty_months += 1
# If first 3 months (Oct, Nov, Dec) all have no data, season doesn't exist
if consecutive_empty_months >= 3 and not all_games:
self._logger.info(f"No games found in first {consecutive_empty_months} months, season likely doesn't exist")
break
return all_games
@@ -247,8 +259,11 @@ class NBAScraper(BaseScraper):
ESPN API returns games for a specific date range.
We iterate through each day of the season.
Bails out early after max_empty_days (45) consecutive days with no games or with request errors.
"""
all_games: list[RawGameData] = []
consecutive_empty_days = 0
max_empty_days = 45 # Bail after ~1.5 months of no games
for year, month in self._get_season_months():
# Get number of days in month
@@ -267,10 +282,25 @@ class NBAScraper(BaseScraper):
data = self.session.get_json(url)
games = self._parse_espn_response(data, url)
all_games.extend(games)
if games:
all_games.extend(games)
consecutive_empty_days = 0
else:
consecutive_empty_days += 1
# Bail early if no games found for a long stretch
if consecutive_empty_days >= max_empty_days:
self._logger.info(f"No games found for {max_empty_days} consecutive days, stopping ESPN scrape")
return all_games
except Exception as e:
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
consecutive_empty_days += 1
if consecutive_empty_days >= max_empty_days:
self._logger.info(f"Too many consecutive failures, stopping ESPN scrape")
return all_games
continue
return all_games