This commit is contained in:
Trey t
2026-01-19 22:12:53 -06:00
parent 11c0ae70d2
commit a8b0491571
19 changed files with 1328 additions and 525 deletions

View File

@@ -185,9 +185,12 @@ class BaseScraper(ABC):
"""
sources = self._get_sources()
last_error: Optional[str] = None
sources_tried = 0
max_sources_to_try = 2 # Don't try all sources if first few return nothing
for source in sources:
self._logger.info(f"Trying source: {source}")
sources_tried += 1
try:
# Scrape raw data
@@ -195,6 +198,12 @@ class BaseScraper(ABC):
if not raw_games:
log_warning(f"No games found from {source}")
# If multiple sources return nothing, the schedule likely doesn't exist
if sources_tried >= max_sources_to_try:
return ScrapeResult(
success=False,
error_message=f"No schedule data available (tried {sources_tried} sources)",
)
continue
self._logger.info(f"Found {len(raw_games)} raw games from {source}")
@@ -216,7 +225,9 @@ class BaseScraper(ABC):
except Exception as e:
last_error = str(e)
log_error(f"Failed to scrape from {source}: {e}", exc_info=True)
# Discard partial data and try next source
# If we've tried enough sources, bail out
if sources_tried >= max_sources_to_try:
break
continue
# All sources failed

View File

@@ -1,6 +1,6 @@
"""MLB scraper implementation with multi-source fallback."""
from datetime import datetime, date
from datetime import datetime, date, timedelta
from typing import Optional
from bs4 import BeautifulSoup
@@ -45,7 +45,10 @@ class MLBScraper(BaseScraper):
def _get_sources(self) -> list[str]:
    """Return the source identifiers in the order they should be tried.

    Returns:
        ``["mlb_api", "espn", "baseball_reference"]``.
    """
    # Ordering rationale:
    #   - MLB API is best: returns the full schedule in one request.
    #   - ESPN caps at ~25 results for baseball.
    #   - Baseball-Reference requires HTML parsing.
    return ["mlb_api", "espn", "baseball_reference"]
def _get_source_url(self, source: str, **kwargs) -> str:
"""Build URL for a source."""
@@ -215,43 +218,29 @@ class MLBScraper(BaseScraper):
)
def _scrape_mlb_api(self) -> list[RawGameData]:
    """Scrape games from the MLB Stats API using one full-season query.

    The requested range runs from the first day of the first month
    returned by ``self._get_season_months()`` through the last day of the
    final month.

    Returns:
        The result of ``self._parse_mlb_api_response`` for the fetched
        payload, or an empty list when no season months are available or
        when fetching/parsing raises.
    """
    season_months = self._get_season_months()
    if not season_months:
        # Indexing an empty list below would raise IndexError.
        self._logger.error("MLB API error: no season months configured")
        return []

    start_year, start_month = season_months[0]
    end_year, end_month = season_months[-1]
    start_date = date(start_year, start_month, 1)

    # Last day of the final month = first day of the next month - 1 day.
    if end_month == 12:
        next_month_start = date(end_year + 1, 1, 1)
    else:
        next_month_start = date(end_year, end_month + 1, 1)
    end_date = next_month_start - timedelta(days=1)

    url = (
        "https://statsapi.mlb.com/api/v1/schedule?sportId=1"
        f"&startDate={start_date.strftime('%Y-%m-%d')}"
        f"&endDate={end_date.strftime('%Y-%m-%d')}"
    )
    self._logger.info(f"Fetching MLB schedule: {start_date} to {end_date}")

    try:
        data = self.session.get_json(url)
        return self._parse_mlb_api_response(data, url)
    except Exception as e:
        # Best-effort: log the failure and report "no games" instead of
        # propagating the exception.
        self._logger.error(f"MLB API error: {e}")
        return []
def _parse_mlb_api_response(
self,
@@ -345,33 +334,30 @@ class MLBScraper(BaseScraper):
)
def _scrape_espn(self) -> list[RawGameData]:
    """Scrape MLB games from the ESPN scoreboard API with one date-range query.

    The range runs from the first day of the first month returned by
    ``self._get_season_months()`` through the last day of the final month,
    sent as ``dates=YYYYMMDD-YYYYMMDD`` with ``limit=3000``.

    Returns:
        The result of ``self._parse_espn_response`` for the fetched
        payload, or an empty list when no season months are available or
        when fetching/parsing raises.
    """
    season_months = self._get_season_months()
    if not season_months:
        # Indexing an empty list below would raise IndexError.
        self._logger.error("ESPN error: no season months configured")
        return []

    start_year, start_month = season_months[0]
    end_year, end_month = season_months[-1]
    start_date = date(start_year, start_month, 1)

    # Last day of the final month = first day of the next month - 1 day.
    if end_month == 12:
        next_month_start = date(end_year + 1, 1, 1)
    else:
        next_month_start = date(end_year, end_month + 1, 1)
    end_date = next_month_start - timedelta(days=1)

    date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
    url = f"https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard?limit=3000&dates={date_range}"
    self._logger.info(f"Fetching MLB schedule: {date_range}")

    try:
        data = self.session.get_json(url)
        return self._parse_espn_response(data, url)
    except Exception as e:
        # Best-effort: log the failure and report "no games" instead of
        # propagating the exception.
        self._logger.error(f"ESPN error: {e}")
        return []
def _parse_espn_response(
self,

View File

@@ -1,6 +1,6 @@
"""MLS scraper implementation with multi-source fallback."""
from datetime import datetime, date
from datetime import datetime, date, timedelta
from typing import Optional
from .base import BaseScraper, RawGameData, ScrapeResult
@@ -78,33 +78,30 @@ class MLSScraper(BaseScraper):
raise ValueError(f"Unknown source: {source}")
def _scrape_espn(self) -> list[RawGameData]:
    """Scrape MLS games from the ESPN scoreboard API with one date-range query.

    The range runs from the first day of the first month returned by
    ``self._get_season_months()`` through the last day of the final month,
    sent as ``dates=YYYYMMDD-YYYYMMDD`` with ``limit=1000``.

    Returns:
        The result of ``self._parse_espn_response`` for the fetched
        payload, or an empty list when no season months are available or
        when fetching/parsing raises.
    """
    season_months = self._get_season_months()
    if not season_months:
        # Indexing an empty list below would raise IndexError.
        self._logger.error("ESPN error: no season months configured")
        return []

    start_year, start_month = season_months[0]
    end_year, end_month = season_months[-1]
    start_date = date(start_year, start_month, 1)

    # Last day of the final month = first day of the next month - 1 day.
    if end_month == 12:
        next_month_start = date(end_year + 1, 1, 1)
    else:
        next_month_start = date(end_year, end_month + 1, 1)
    end_date = next_month_start - timedelta(days=1)

    date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
    url = f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.1/scoreboard?limit=1000&dates={date_range}"
    self._logger.info(f"Fetching MLS schedule: {date_range}")

    try:
        data = self.session.get_json(url)
        return self._parse_espn_response(data, url)
    except Exception as e:
        # Best-effort: log the failure and report "no games" instead of
        # propagating the exception.
        self._logger.error(f"ESPN error: {e}")
        return []
def _parse_espn_response(
self,

View File

@@ -95,9 +95,11 @@ class NBAScraper(BaseScraper):
BR organizes games by month with separate pages.
Format: https://www.basketball-reference.com/leagues/NBA_YYYY_games-month.html
where YYYY is the ending year of the season.
Bails early if first few months have no data (season doesn't exist).
"""
all_games: list[RawGameData] = []
end_year = self.season + 1
consecutive_empty_months = 0
for month in BR_MONTHS:
url = self._get_source_url("basketball_reference", month=month, year=end_year)
@@ -105,13 +107,23 @@ class NBAScraper(BaseScraper):
try:
html = self.session.get_html(url)
games = self._parse_basketball_reference(html, url)
all_games.extend(games)
self._logger.debug(f"Found {len(games)} games in {month}")
if games:
all_games.extend(games)
consecutive_empty_months = 0
self._logger.debug(f"Found {len(games)} games in {month}")
else:
consecutive_empty_months += 1
except Exception as e:
# Some months may not exist (e.g., no games in August)
self._logger.debug(f"No data for {month}: {e}")
continue
consecutive_empty_months += 1
# If first 3 months (Oct, Nov, Dec) all have no data, season doesn't exist
if consecutive_empty_months >= 3 and not all_games:
self._logger.info(f"No games found in first {consecutive_empty_months} months, season likely doesn't exist")
break
return all_games
@@ -247,8 +259,11 @@ class NBAScraper(BaseScraper):
ESPN API returns games for a specific date range.
We iterate through each day of the season.
Bails out early if no games found after checking first month.
"""
all_games: list[RawGameData] = []
consecutive_empty_days = 0
max_empty_days = 45 # Bail after ~1.5 months of no games
for year, month in self._get_season_months():
# Get number of days in month
@@ -267,10 +282,25 @@ class NBAScraper(BaseScraper):
data = self.session.get_json(url)
games = self._parse_espn_response(data, url)
all_games.extend(games)
if games:
all_games.extend(games)
consecutive_empty_days = 0
else:
consecutive_empty_days += 1
# Bail early if no games found for a long stretch
if consecutive_empty_days >= max_empty_days:
self._logger.info(f"No games found for {max_empty_days} consecutive days, stopping ESPN scrape")
return all_games
except Exception as e:
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
consecutive_empty_days += 1
if consecutive_empty_days >= max_empty_days:
self._logger.info(f"Too many consecutive failures, stopping ESPN scrape")
return all_games
continue
return all_games

View File

@@ -1,6 +1,6 @@
"""NWSL scraper implementation with multi-source fallback."""
from datetime import datetime, date
from datetime import datetime, date, timedelta
from typing import Optional
from .base import BaseScraper, RawGameData, ScrapeResult
@@ -73,33 +73,30 @@ class NWSLScraper(BaseScraper):
raise ValueError(f"Unknown source: {source}")
def _scrape_espn(self) -> list[RawGameData]:
    """Scrape NWSL games from the ESPN scoreboard API with one date-range query.

    The range runs from the first day of the first month returned by
    ``self._get_season_months()`` through the last day of the final month,
    sent as ``dates=YYYYMMDD-YYYYMMDD`` with ``limit=1000``.

    Returns:
        The result of ``self._parse_espn_response`` for the fetched
        payload, or an empty list when no season months are available or
        when fetching/parsing raises.
    """
    season_months = self._get_season_months()
    if not season_months:
        # Indexing an empty list below would raise IndexError.
        self._logger.error("ESPN error: no season months configured")
        return []

    start_year, start_month = season_months[0]
    end_year, end_month = season_months[-1]
    start_date = date(start_year, start_month, 1)

    # Last day of the final month = first day of the next month - 1 day.
    if end_month == 12:
        next_month_start = date(end_year + 1, 1, 1)
    else:
        next_month_start = date(end_year, end_month + 1, 1)
    end_date = next_month_start - timedelta(days=1)

    date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
    url = f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.nwsl/scoreboard?limit=1000&dates={date_range}"
    self._logger.info(f"Fetching NWSL schedule: {date_range}")

    try:
        data = self.session.get_json(url)
        return self._parse_espn_response(data, url)
    except Exception as e:
        # Best-effort: log the failure and report "no games" instead of
        # propagating the exception.
        self._logger.error(f"ESPN error: {e}")
        return []
def _parse_espn_response(
self,

View File

@@ -1,6 +1,6 @@
"""WNBA scraper implementation with multi-source fallback."""
from datetime import datetime, date
from datetime import datetime, date, timedelta
from typing import Optional
from .base import BaseScraper, RawGameData, ScrapeResult
@@ -73,33 +73,30 @@ class WNBAScraper(BaseScraper):
raise ValueError(f"Unknown source: {source}")
def _scrape_espn(self) -> list[RawGameData]:
    """Scrape WNBA games from the ESPN scoreboard API with one date-range query.

    The range runs from the first day of the first month returned by
    ``self._get_season_months()`` through the last day of the final month,
    sent as ``dates=YYYYMMDD-YYYYMMDD`` with ``limit=1000``.

    Returns:
        The result of ``self._parse_espn_response`` for the fetched
        payload, or an empty list when no season months are available or
        when fetching/parsing raises.
    """
    season_months = self._get_season_months()
    if not season_months:
        # Indexing an empty list below would raise IndexError.
        self._logger.error("ESPN error: no season months configured")
        return []

    start_year, start_month = season_months[0]
    end_year, end_month = season_months[-1]
    start_date = date(start_year, start_month, 1)

    # Last day of the final month = first day of the next month - 1 day.
    if end_month == 12:
        next_month_start = date(end_year + 1, 1, 1)
    else:
        next_month_start = date(end_year, end_month + 1, 1)
    end_date = next_month_start - timedelta(days=1)

    date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
    url = f"https://site.api.espn.com/apis/site/v2/sports/basketball/wnba/scoreboard?limit=1000&dates={date_range}"
    self._logger.info(f"Fetching WNBA schedule: {date_range}")

    try:
        data = self.session.get_json(url)
        return self._parse_espn_response(data, url)
    except Exception as e:
        # Best-effort: log the failure and report "no games" instead of
        # propagating the exception.
        self._logger.error(f"ESPN error: {e}")
        return []
def _parse_espn_response(
self,