feat(scripts): rewrite parser as modular Python CLI

Replace monolithic scraping scripts with sportstime_parser package:

- Multi-source scrapers with automatic fallback for 7 sports
- Canonical ID generation for games, teams, and stadiums
- Fuzzy matching with configurable thresholds for name resolution
- CloudKit Web Services uploader with JWT auth, diff-based updates
- Resumable uploads with checkpoint state persistence
- Validation reports with manual review items and suggested matches
- Comprehensive test suite (249 tests)

CLI: sportstime-parser scrape|validate|upload|status|retry|clear

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-01-10 21:06:12 -06:00
parent 284a10d9e1
commit eeaf900e5a
109 changed files with 18415 additions and 266211 deletions

View File

@@ -0,0 +1,79 @@
<!DOCTYPE html>
<html>
<head>
<title>2025-26 NBA Schedule - Edge Cases | Basketball-Reference.com</title>
</head>
<body>
<table id="schedule" class="stats_table">
<thead>
<tr>
<th data-stat="date_game">Date</th>
<th data-stat="game_start_time">Start (ET)</th>
<th data-stat="visitor_team_name">Visitor/Neutral</th>
<th data-stat="visitor_pts">PTS</th>
<th data-stat="home_team_name">Home/Neutral</th>
<th data-stat="home_pts">PTS</th>
<th data-stat="arena_name">Arena</th>
<th data-stat="game_remarks">Notes</th>
</tr>
</thead>
<tbody>
<!-- Postponed game -->
<tr>
<th data-stat="date_game">Sun, Jan 11, 2026</th>
<td data-stat="game_start_time">7:30p</td>
<td data-stat="visitor_team_name">Los Angeles Lakers</td>
<td data-stat="visitor_pts"></td>
<td data-stat="home_team_name">Phoenix Suns</td>
<td data-stat="home_pts"></td>
<td data-stat="arena_name">Footprint Center</td>
<td data-stat="game_remarks">Postponed - Weather</td>
</tr>
<!-- Neutral site game (Mexico City) -->
<tr>
<th data-stat="date_game">Sat, Nov 8, 2025</th>
<td data-stat="game_start_time">7:00p</td>
<td data-stat="visitor_team_name">Miami Heat</td>
<td data-stat="visitor_pts">105</td>
<td data-stat="home_team_name">Washington Wizards</td>
<td data-stat="home_pts">99</td>
<td data-stat="arena_name">Arena CDMX</td>
<td data-stat="game_remarks">NBA Mexico City Games</td>
</tr>
<!-- Cancelled game -->
<tr>
<th data-stat="date_game">Wed, Dec 3, 2025</th>
<td data-stat="game_start_time">8:00p</td>
<td data-stat="visitor_team_name">Portland Trail Blazers</td>
<td data-stat="visitor_pts"></td>
<td data-stat="home_team_name">Sacramento Kings</td>
<td data-stat="home_pts"></td>
<td data-stat="arena_name">Golden 1 Center</td>
<td data-stat="game_remarks">Cancelled</td>
</tr>
<!-- Completed overtime game with high scores -->
<tr>
<th data-stat="date_game">Sun, Mar 15, 2026</th>
<td data-stat="game_start_time">3:30p</td>
<td data-stat="visitor_team_name">Indiana Pacers</td>
<td data-stat="visitor_pts">147</td>
<td data-stat="home_team_name">Atlanta Hawks</td>
<td data-stat="home_pts">150</td>
<td data-stat="arena_name">State Farm Arena</td>
<td data-stat="game_remarks">OT</td>
</tr>
<!-- Completed late-night game (10:30p ET) with an empty notes cell -->
<tr>
<th data-stat="date_game">Mon, Feb 2, 2026</th>
<td data-stat="game_start_time">10:30p</td>
<td data-stat="visitor_team_name">Golden State Warriors</td>
<td data-stat="visitor_pts">118</td>
<td data-stat="home_team_name">Los Angeles Clippers</td>
<td data-stat="home_pts">115</td>
<td data-stat="arena_name">Intuit Dome</td>
<td data-stat="game_remarks"></td>
</tr>
</tbody>
</table>
</body>
</html>

View File

@@ -0,0 +1,94 @@
<!DOCTYPE html>
<html>
<head>
<title>2025-26 NBA Schedule - October | Basketball-Reference.com</title>
</head>
<body>
<table id="schedule" class="stats_table">
<thead>
<tr>
<th data-stat="date_game">Date</th>
<th data-stat="game_start_time">Start (ET)</th>
<th data-stat="visitor_team_name">Visitor/Neutral</th>
<th data-stat="visitor_pts">PTS</th>
<th data-stat="home_team_name">Home/Neutral</th>
<th data-stat="home_pts">PTS</th>
<th data-stat="arena_name">Arena</th>
<th data-stat="game_remarks">Notes</th>
</tr>
</thead>
<tbody>
<tr>
<th data-stat="date_game">Wed, Oct 22, 2025</th>
<td data-stat="game_start_time">7:30p</td>
<td data-stat="visitor_team_name">Boston Celtics</td>
<td data-stat="visitor_pts">112</td>
<td data-stat="home_team_name">Cleveland Cavaliers</td>
<td data-stat="home_pts">108</td>
<td data-stat="arena_name">Rocket Mortgage FieldHouse</td>
<td data-stat="game_remarks"></td>
</tr>
<tr>
<th data-stat="date_game">Wed, Oct 22, 2025</th>
<td data-stat="game_start_time">10:00p</td>
<td data-stat="visitor_team_name">Denver Nuggets</td>
<td data-stat="visitor_pts">119</td>
<td data-stat="home_team_name">Los Angeles Lakers</td>
<td data-stat="home_pts">127</td>
<td data-stat="arena_name">Crypto.com Arena</td>
<td data-stat="game_remarks"></td>
</tr>
<tr>
<th data-stat="date_game">Thu, Oct 23, 2025</th>
<td data-stat="game_start_time">7:00p</td>
<td data-stat="visitor_team_name">Houston Rockets</td>
<td data-stat="visitor_pts"></td>
<td data-stat="home_team_name">Oklahoma City Thunder</td>
<td data-stat="home_pts"></td>
<td data-stat="arena_name">Paycom Center</td>
<td data-stat="game_remarks"></td>
</tr>
<tr>
<th data-stat="date_game">Thu, Oct 23, 2025</th>
<td data-stat="game_start_time">7:30p</td>
<td data-stat="visitor_team_name">New York Knicks</td>
<td data-stat="visitor_pts"></td>
<td data-stat="home_team_name">Brooklyn Nets</td>
<td data-stat="home_pts"></td>
<td data-stat="arena_name">Barclays Center</td>
<td data-stat="game_remarks"></td>
</tr>
<tr>
<th data-stat="date_game">Fri, Oct 24, 2025</th>
<td data-stat="game_start_time">7:00p</td>
<td data-stat="visitor_team_name">Chicago Bulls</td>
<td data-stat="visitor_pts"></td>
<td data-stat="home_team_name">Miami Heat</td>
<td data-stat="home_pts"></td>
<td data-stat="arena_name">Kaseya Center</td>
<td data-stat="game_remarks"></td>
</tr>
<tr>
<th data-stat="date_game">Sat, Oct 25, 2025</th>
<td data-stat="game_start_time">7:30p</td>
<td data-stat="visitor_team_name">Toronto Raptors</td>
<td data-stat="visitor_pts"></td>
<td data-stat="home_team_name">Boston Celtics</td>
<td data-stat="home_pts"></td>
<td data-stat="arena_name">TD Garden</td>
<td data-stat="game_remarks"></td>
</tr>
<tr>
<th data-stat="date_game">Sun, Oct 26, 2025</th>
<td data-stat="game_start_time">8:00p</td>
<td data-stat="visitor_team_name">Minnesota Timberwolves</td>
<td data-stat="visitor_pts"></td>
<td data-stat="home_team_name">Dallas Mavericks</td>
<td data-stat="home_pts"></td>
<td data-stat="arena_name">American Airlines Center</td>
<td data-stat="game_remarks"></td>
</tr>
</tbody>
</table>
</body>
</html>

View File

@@ -0,0 +1,245 @@
{
"leagues": [
{
"id": "46",
"uid": "s:40~l:46",
"name": "National Basketball Association",
"abbreviation": "NBA"
}
],
"season": {
"type": 2,
"year": 2026
},
"day": {
"date": "2025-10-22T00:00:00Z"
},
"events": [
{
"id": "401584721",
"uid": "s:40~l:46~e:401584721",
"date": "2025-10-22T23:30:00Z",
"name": "Boston Celtics at Cleveland Cavaliers",
"shortName": "BOS @ CLE",
"competitions": [
{
"id": "401584721",
"uid": "s:40~l:46~e:401584721~c:401584721",
"date": "2025-10-22T23:30:00Z",
"attendance": 20562,
"type": {
"id": "1",
"abbreviation": "STD"
},
"venue": {
"id": "5064",
"fullName": "Rocket Mortgage FieldHouse",
"address": {
"city": "Cleveland",
"state": "OH"
},
"capacity": 19432,
"indoor": true
},
"competitors": [
{
"id": "5",
"uid": "s:40~l:46~t:5",
"type": "team",
"order": 0,
"homeAway": "home",
"team": {
"id": "5",
"uid": "s:40~l:46~t:5",
"location": "Cleveland",
"name": "Cavaliers",
"abbreviation": "CLE",
"displayName": "Cleveland Cavaliers"
},
"score": "108",
"winner": false
},
{
"id": "2",
"uid": "s:40~l:46~t:2",
"type": "team",
"order": 1,
"homeAway": "away",
"team": {
"id": "2",
"uid": "s:40~l:46~t:2",
"location": "Boston",
"name": "Celtics",
"abbreviation": "BOS",
"displayName": "Boston Celtics"
},
"score": "112",
"winner": true
}
],
"status": {
"clock": 0,
"displayClock": "0:00",
"period": 4,
"type": {
"id": "3",
"name": "STATUS_FINAL",
"state": "post",
"completed": true
}
}
}
]
},
{
"id": "401584722",
"uid": "s:40~l:46~e:401584722",
"date": "2025-10-23T02:00:00Z",
"name": "Denver Nuggets at Los Angeles Lakers",
"shortName": "DEN @ LAL",
"competitions": [
{
"id": "401584722",
"uid": "s:40~l:46~e:401584722~c:401584722",
"date": "2025-10-23T02:00:00Z",
"type": {
"id": "1",
"abbreviation": "STD"
},
"venue": {
"id": "5091",
"fullName": "Crypto.com Arena",
"address": {
"city": "Los Angeles",
"state": "CA"
},
"capacity": 19068,
"indoor": true
},
"competitors": [
{
"id": "13",
"uid": "s:40~l:46~t:13",
"type": "team",
"order": 0,
"homeAway": "home",
"team": {
"id": "13",
"uid": "s:40~l:46~t:13",
"location": "Los Angeles",
"name": "Lakers",
"abbreviation": "LAL",
"displayName": "Los Angeles Lakers"
},
"score": "127",
"winner": true
},
{
"id": "7",
"uid": "s:40~l:46~t:7",
"type": "team",
"order": 1,
"homeAway": "away",
"team": {
"id": "7",
"uid": "s:40~l:46~t:7",
"location": "Denver",
"name": "Nuggets",
"abbreviation": "DEN",
"displayName": "Denver Nuggets"
},
"score": "119",
"winner": false
}
],
"status": {
"clock": 0,
"displayClock": "0:00",
"period": 4,
"type": {
"id": "3",
"name": "STATUS_FINAL",
"state": "post",
"completed": true
}
}
}
]
},
{
"id": "401584723",
"uid": "s:40~l:46~e:401584723",
"date": "2025-10-24T00:00:00Z",
"name": "Houston Rockets at Oklahoma City Thunder",
"shortName": "HOU @ OKC",
"competitions": [
{
"id": "401584723",
"uid": "s:40~l:46~e:401584723~c:401584723",
"date": "2025-10-24T00:00:00Z",
"type": {
"id": "1",
"abbreviation": "STD"
},
"venue": {
"id": "4922",
"fullName": "Paycom Center",
"address": {
"city": "Oklahoma City",
"state": "OK"
},
"capacity": 18203,
"indoor": true
},
"competitors": [
{
"id": "25",
"uid": "s:40~l:46~t:25",
"type": "team",
"order": 0,
"homeAway": "home",
"team": {
"id": "25",
"uid": "s:40~l:46~t:25",
"location": "Oklahoma City",
"name": "Thunder",
"abbreviation": "OKC",
"displayName": "Oklahoma City Thunder"
},
"score": null,
"winner": null
},
{
"id": "10",
"uid": "s:40~l:46~t:10",
"type": "team",
"order": 1,
"homeAway": "away",
"team": {
"id": "10",
"uid": "s:40~l:46~t:10",
"location": "Houston",
"name": "Rockets",
"abbreviation": "HOU",
"displayName": "Houston Rockets"
},
"score": null,
"winner": null
}
],
"status": {
"clock": 0,
"displayClock": "0:00",
"period": 0,
"type": {
"id": "1",
"name": "STATUS_SCHEDULED",
"state": "pre",
"completed": false
}
}
}
]
}
]
}