wip
This commit is contained in:
@@ -185,9 +185,12 @@ class BaseScraper(ABC):
|
||||
"""
|
||||
sources = self._get_sources()
|
||||
last_error: Optional[str] = None
|
||||
sources_tried = 0
|
||||
max_sources_to_try = 2 # Don't try all sources if first few return nothing
|
||||
|
||||
for source in sources:
|
||||
self._logger.info(f"Trying source: {source}")
|
||||
sources_tried += 1
|
||||
|
||||
try:
|
||||
# Scrape raw data
|
||||
@@ -195,6 +198,12 @@ class BaseScraper(ABC):
|
||||
|
||||
if not raw_games:
|
||||
log_warning(f"No games found from {source}")
|
||||
# If multiple sources return nothing, the schedule likely doesn't exist
|
||||
if sources_tried >= max_sources_to_try:
|
||||
return ScrapeResult(
|
||||
success=False,
|
||||
error_message=f"No schedule data available (tried {sources_tried} sources)",
|
||||
)
|
||||
continue
|
||||
|
||||
self._logger.info(f"Found {len(raw_games)} raw games from {source}")
|
||||
@@ -216,7 +225,9 @@ class BaseScraper(ABC):
|
||||
except Exception as e:
|
||||
last_error = str(e)
|
||||
log_error(f"Failed to scrape from {source}: {e}", exc_info=True)
|
||||
# Discard partial data and try next source
|
||||
# If we've tried enough sources, bail out
|
||||
if sources_tried >= max_sources_to_try:
|
||||
break
|
||||
continue
|
||||
|
||||
# All sources failed
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""MLB scraper implementation with multi-source fallback."""
|
||||
|
||||
from datetime import datetime, date
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
@@ -45,7 +45,10 @@ class MLBScraper(BaseScraper):
|
||||
|
||||
def _get_sources(self) -> list[str]:
|
||||
"""Return source list in priority order."""
|
||||
return ["baseball_reference", "mlb_api", "espn"]
|
||||
# MLB API is best - returns full schedule in one request
|
||||
# ESPN caps at ~25 results for baseball
|
||||
# Baseball-Reference requires HTML parsing
|
||||
return ["mlb_api", "espn", "baseball_reference"]
|
||||
|
||||
def _get_source_url(self, source: str, **kwargs) -> str:
|
||||
"""Build URL for a source."""
|
||||
@@ -215,43 +218,29 @@ class MLBScraper(BaseScraper):
|
||||
)
|
||||
|
||||
def _scrape_mlb_api(self) -> list[RawGameData]:
|
||||
"""Scrape games from MLB Stats API.
|
||||
"""Scrape games from MLB Stats API using full season query."""
|
||||
# Build date range for entire season (March-November)
|
||||
season_months = self._get_season_months()
|
||||
start_year, start_month = season_months[0]
|
||||
end_year, end_month = season_months[-1]
|
||||
|
||||
MLB API allows date range queries.
|
||||
"""
|
||||
all_games: list[RawGameData] = []
|
||||
# Get last day of end month
|
||||
if end_month == 12:
|
||||
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
|
||||
else:
|
||||
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
|
||||
|
||||
# Query by month to avoid hitting API limits
|
||||
for year, month in self._get_season_months():
|
||||
start_date = date(year, month, 1)
|
||||
start_date = date(start_year, start_month, 1)
|
||||
|
||||
# Get last day of month
|
||||
if month == 12:
|
||||
end_date = date(year + 1, 1, 1)
|
||||
else:
|
||||
end_date = date(year, month + 1, 1)
|
||||
url = f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&startDate={start_date.strftime('%Y-%m-%d')}&endDate={end_date.strftime('%Y-%m-%d')}"
|
||||
self._logger.info(f"Fetching MLB schedule: {start_date} to {end_date}")
|
||||
|
||||
# Adjust end date to last day of month
|
||||
from datetime import timedelta
|
||||
end_date = end_date - timedelta(days=1)
|
||||
|
||||
url = self._get_source_url(
|
||||
"mlb_api",
|
||||
start_date=start_date.strftime("%Y-%m-%d"),
|
||||
end_date=end_date.strftime("%Y-%m-%d"),
|
||||
)
|
||||
|
||||
try:
|
||||
data = self.session.get_json(url)
|
||||
games = self._parse_mlb_api_response(data, url)
|
||||
all_games.extend(games)
|
||||
self._logger.debug(f"Found {len(games)} games in {year}-{month:02d}")
|
||||
|
||||
except Exception as e:
|
||||
self._logger.debug(f"MLB API error for {year}-{month}: {e}")
|
||||
continue
|
||||
|
||||
return all_games
|
||||
try:
|
||||
data = self.session.get_json(url)
|
||||
return self._parse_mlb_api_response(data, url)
|
||||
except Exception as e:
|
||||
self._logger.error(f"MLB API error: {e}")
|
||||
return []
|
||||
|
||||
def _parse_mlb_api_response(
|
||||
self,
|
||||
@@ -345,33 +334,30 @@ class MLBScraper(BaseScraper):
|
||||
)
|
||||
|
||||
def _scrape_espn(self) -> list[RawGameData]:
|
||||
"""Scrape games from ESPN API."""
|
||||
all_games: list[RawGameData] = []
|
||||
"""Scrape games from ESPN API using date range query."""
|
||||
# Build date range for entire season (March-November)
|
||||
season_months = self._get_season_months()
|
||||
start_year, start_month = season_months[0]
|
||||
end_year, end_month = season_months[-1]
|
||||
|
||||
for year, month in self._get_season_months():
|
||||
# Get number of days in month
|
||||
if month == 12:
|
||||
next_month = date(year + 1, 1, 1)
|
||||
else:
|
||||
next_month = date(year, month + 1, 1)
|
||||
# Get last day of end month
|
||||
if end_month == 12:
|
||||
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
|
||||
else:
|
||||
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
|
||||
|
||||
days_in_month = (next_month - date(year, month, 1)).days
|
||||
start_date = date(start_year, start_month, 1)
|
||||
date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
|
||||
|
||||
for day in range(1, days_in_month + 1):
|
||||
try:
|
||||
game_date = date(year, month, day)
|
||||
date_str = game_date.strftime("%Y%m%d")
|
||||
url = self._get_source_url("espn", date=date_str)
|
||||
url = f"https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard?limit=3000&dates={date_range}"
|
||||
self._logger.info(f"Fetching MLB schedule: {date_range}")
|
||||
|
||||
data = self.session.get_json(url)
|
||||
games = self._parse_espn_response(data, url)
|
||||
all_games.extend(games)
|
||||
|
||||
except Exception as e:
|
||||
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
|
||||
continue
|
||||
|
||||
return all_games
|
||||
try:
|
||||
data = self.session.get_json(url)
|
||||
return self._parse_espn_response(data, url)
|
||||
except Exception as e:
|
||||
self._logger.error(f"ESPN error: {e}")
|
||||
return []
|
||||
|
||||
def _parse_espn_response(
|
||||
self,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""MLS scraper implementation with multi-source fallback."""
|
||||
|
||||
from datetime import datetime, date
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
@@ -78,33 +78,30 @@ class MLSScraper(BaseScraper):
|
||||
raise ValueError(f"Unknown source: {source}")
|
||||
|
||||
def _scrape_espn(self) -> list[RawGameData]:
|
||||
"""Scrape games from ESPN API."""
|
||||
all_games: list[RawGameData] = []
|
||||
"""Scrape games from ESPN API using date range query."""
|
||||
# Build date range for entire season (Feb-November)
|
||||
season_months = self._get_season_months()
|
||||
start_year, start_month = season_months[0]
|
||||
end_year, end_month = season_months[-1]
|
||||
|
||||
for year, month in self._get_season_months():
|
||||
# Get number of days in month
|
||||
if month == 12:
|
||||
next_month = date(year + 1, 1, 1)
|
||||
else:
|
||||
next_month = date(year, month + 1, 1)
|
||||
# Get last day of end month
|
||||
if end_month == 12:
|
||||
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
|
||||
else:
|
||||
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
|
||||
|
||||
days_in_month = (next_month - date(year, month, 1)).days
|
||||
start_date = date(start_year, start_month, 1)
|
||||
date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
|
||||
|
||||
for day in range(1, days_in_month + 1):
|
||||
try:
|
||||
game_date = date(year, month, day)
|
||||
date_str = game_date.strftime("%Y%m%d")
|
||||
url = self._get_source_url("espn", date=date_str)
|
||||
url = f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.1/scoreboard?limit=1000&dates={date_range}"
|
||||
self._logger.info(f"Fetching MLS schedule: {date_range}")
|
||||
|
||||
data = self.session.get_json(url)
|
||||
games = self._parse_espn_response(data, url)
|
||||
all_games.extend(games)
|
||||
|
||||
except Exception as e:
|
||||
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
|
||||
continue
|
||||
|
||||
return all_games
|
||||
try:
|
||||
data = self.session.get_json(url)
|
||||
return self._parse_espn_response(data, url)
|
||||
except Exception as e:
|
||||
self._logger.error(f"ESPN error: {e}")
|
||||
return []
|
||||
|
||||
def _parse_espn_response(
|
||||
self,
|
||||
|
||||
@@ -95,9 +95,11 @@ class NBAScraper(BaseScraper):
|
||||
BR organizes games by month with separate pages.
|
||||
Format: https://www.basketball-reference.com/leagues/NBA_YYYY_games-month.html
|
||||
where YYYY is the ending year of the season.
|
||||
Bails early if first few months have no data (season doesn't exist).
|
||||
"""
|
||||
all_games: list[RawGameData] = []
|
||||
end_year = self.season + 1
|
||||
consecutive_empty_months = 0
|
||||
|
||||
for month in BR_MONTHS:
|
||||
url = self._get_source_url("basketball_reference", month=month, year=end_year)
|
||||
@@ -105,13 +107,23 @@ class NBAScraper(BaseScraper):
|
||||
try:
|
||||
html = self.session.get_html(url)
|
||||
games = self._parse_basketball_reference(html, url)
|
||||
all_games.extend(games)
|
||||
self._logger.debug(f"Found {len(games)} games in {month}")
|
||||
|
||||
if games:
|
||||
all_games.extend(games)
|
||||
consecutive_empty_months = 0
|
||||
self._logger.debug(f"Found {len(games)} games in {month}")
|
||||
else:
|
||||
consecutive_empty_months += 1
|
||||
|
||||
except Exception as e:
|
||||
# Some months may not exist (e.g., no games in August)
|
||||
self._logger.debug(f"No data for {month}: {e}")
|
||||
continue
|
||||
consecutive_empty_months += 1
|
||||
|
||||
# If first 3 months (Oct, Nov, Dec) all have no data, season doesn't exist
|
||||
if consecutive_empty_months >= 3 and not all_games:
|
||||
self._logger.info(f"No games found in first {consecutive_empty_months} months, season likely doesn't exist")
|
||||
break
|
||||
|
||||
return all_games
|
||||
|
||||
@@ -247,8 +259,11 @@ class NBAScraper(BaseScraper):
|
||||
|
||||
ESPN API returns games for a specific date range.
|
||||
We iterate through each day of the season.
|
||||
Bails out early if no games found after checking first month.
|
||||
"""
|
||||
all_games: list[RawGameData] = []
|
||||
consecutive_empty_days = 0
|
||||
max_empty_days = 45 # Bail after ~1.5 months of no games
|
||||
|
||||
for year, month in self._get_season_months():
|
||||
# Get number of days in month
|
||||
@@ -267,10 +282,25 @@ class NBAScraper(BaseScraper):
|
||||
|
||||
data = self.session.get_json(url)
|
||||
games = self._parse_espn_response(data, url)
|
||||
all_games.extend(games)
|
||||
|
||||
if games:
|
||||
all_games.extend(games)
|
||||
consecutive_empty_days = 0
|
||||
else:
|
||||
consecutive_empty_days += 1
|
||||
|
||||
# Bail early if no games found for a long stretch
|
||||
if consecutive_empty_days >= max_empty_days:
|
||||
self._logger.info(f"No games found for {max_empty_days} consecutive days, stopping ESPN scrape")
|
||||
return all_games
|
||||
|
||||
except Exception as e:
|
||||
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
|
||||
consecutive_empty_days += 1
|
||||
|
||||
if consecutive_empty_days >= max_empty_days:
|
||||
self._logger.info(f"Too many consecutive failures, stopping ESPN scrape")
|
||||
return all_games
|
||||
continue
|
||||
|
||||
return all_games
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""NWSL scraper implementation with multi-source fallback."""
|
||||
|
||||
from datetime import datetime, date
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
@@ -73,33 +73,30 @@ class NWSLScraper(BaseScraper):
|
||||
raise ValueError(f"Unknown source: {source}")
|
||||
|
||||
def _scrape_espn(self) -> list[RawGameData]:
|
||||
"""Scrape games from ESPN API."""
|
||||
all_games: list[RawGameData] = []
|
||||
"""Scrape games from ESPN API using date range query."""
|
||||
# Build date range for entire season (March-November)
|
||||
season_months = self._get_season_months()
|
||||
start_year, start_month = season_months[0]
|
||||
end_year, end_month = season_months[-1]
|
||||
|
||||
for year, month in self._get_season_months():
|
||||
# Get number of days in month
|
||||
if month == 12:
|
||||
next_month = date(year + 1, 1, 1)
|
||||
else:
|
||||
next_month = date(year, month + 1, 1)
|
||||
# Get last day of end month
|
||||
if end_month == 12:
|
||||
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
|
||||
else:
|
||||
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
|
||||
|
||||
days_in_month = (next_month - date(year, month, 1)).days
|
||||
start_date = date(start_year, start_month, 1)
|
||||
date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
|
||||
|
||||
for day in range(1, days_in_month + 1):
|
||||
try:
|
||||
game_date = date(year, month, day)
|
||||
date_str = game_date.strftime("%Y%m%d")
|
||||
url = self._get_source_url("espn", date=date_str)
|
||||
url = f"https://site.api.espn.com/apis/site/v2/sports/soccer/usa.nwsl/scoreboard?limit=1000&dates={date_range}"
|
||||
self._logger.info(f"Fetching NWSL schedule: {date_range}")
|
||||
|
||||
data = self.session.get_json(url)
|
||||
games = self._parse_espn_response(data, url)
|
||||
all_games.extend(games)
|
||||
|
||||
except Exception as e:
|
||||
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
|
||||
continue
|
||||
|
||||
return all_games
|
||||
try:
|
||||
data = self.session.get_json(url)
|
||||
return self._parse_espn_response(data, url)
|
||||
except Exception as e:
|
||||
self._logger.error(f"ESPN error: {e}")
|
||||
return []
|
||||
|
||||
def _parse_espn_response(
|
||||
self,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""WNBA scraper implementation with multi-source fallback."""
|
||||
|
||||
from datetime import datetime, date
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
|
||||
from .base import BaseScraper, RawGameData, ScrapeResult
|
||||
@@ -73,33 +73,30 @@ class WNBAScraper(BaseScraper):
|
||||
raise ValueError(f"Unknown source: {source}")
|
||||
|
||||
def _scrape_espn(self) -> list[RawGameData]:
|
||||
"""Scrape games from ESPN API."""
|
||||
all_games: list[RawGameData] = []
|
||||
"""Scrape games from ESPN API using date range query."""
|
||||
# Build date range for entire season (May-October)
|
||||
season_months = self._get_season_months()
|
||||
start_year, start_month = season_months[0]
|
||||
end_year, end_month = season_months[-1]
|
||||
|
||||
for year, month in self._get_season_months():
|
||||
# Get number of days in month
|
||||
if month == 12:
|
||||
next_month = date(year + 1, 1, 1)
|
||||
else:
|
||||
next_month = date(year, month + 1, 1)
|
||||
# Get last day of end month
|
||||
if end_month == 12:
|
||||
end_date = date(end_year + 1, 1, 1) - timedelta(days=1)
|
||||
else:
|
||||
end_date = date(end_year, end_month + 1, 1) - timedelta(days=1)
|
||||
|
||||
days_in_month = (next_month - date(year, month, 1)).days
|
||||
start_date = date(start_year, start_month, 1)
|
||||
date_range = f"{start_date.strftime('%Y%m%d')}-{end_date.strftime('%Y%m%d')}"
|
||||
|
||||
for day in range(1, days_in_month + 1):
|
||||
try:
|
||||
game_date = date(year, month, day)
|
||||
date_str = game_date.strftime("%Y%m%d")
|
||||
url = self._get_source_url("espn", date=date_str)
|
||||
url = f"https://site.api.espn.com/apis/site/v2/sports/basketball/wnba/scoreboard?limit=1000&dates={date_range}"
|
||||
self._logger.info(f"Fetching WNBA schedule: {date_range}")
|
||||
|
||||
data = self.session.get_json(url)
|
||||
games = self._parse_espn_response(data, url)
|
||||
all_games.extend(games)
|
||||
|
||||
except Exception as e:
|
||||
self._logger.debug(f"ESPN error for {year}-{month}-{day}: {e}")
|
||||
continue
|
||||
|
||||
return all_games
|
||||
try:
|
||||
data = self.session.get_json(url)
|
||||
return self._parse_espn_response(data, url)
|
||||
except Exception as e:
|
||||
self._logger.error(f"ESPN error: {e}")
|
||||
return []
|
||||
|
||||
def _parse_espn_response(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user