Initial commit: Conjuga Spanish conjugation app
Includes SwiftData dual-store architecture (local reference + CloudKit user data), JSON-based data seeding, 20 tense guides, 20 grammar notes, SRS review system, course vocabulary, and widget support. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
453
Conjuga/Scripts/scrape_all_courses.py
Normal file
453
Conjuga/Scripts/scrape_all_courses.py
Normal file
@@ -0,0 +1,453 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Scrape 7 LanGo Spanish course packs from Brainscape, plus example sentences
|
||||
from SpanishDict. Outputs all_courses_data.json with all courses, decks, cards,
|
||||
and examples organized by week.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
# Site root, used to resolve relative deck hrefs into absolute URLs.
BASE_URL = "https://www.brainscape.com"
# Destination for the scraped JSON. NOTE(review): hardcoded absolute path to
# one developer's machine — consider deriving from __file__ instead.
OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/all_courses_data.json"
# Maximum number of example sentences kept per vocabulary word.
MAX_EXAMPLES = 3

# The seven LanGo Spanish course packs to scrape, in curriculum order.
PACK_URLS = [
    "https://www.brainscape.com/packs/lango-spanish-beginner-ii-16514996",
    "https://www.brainscape.com/packs/lango-spanish-beginner-iii-conversation-18477688",
    "https://www.brainscape.com/packs/lango-spanish-intermediate-i-21508666",
    "https://www.brainscape.com/packs/lango-spanish-intermediate-ii-21906841",
    "https://www.brainscape.com/packs/lango-spanish-intermediate-iii-spanish-through-stories-20677744",
    "https://www.brainscape.com/packs/lango-spanish-advanced-i-21511244",
    "https://www.brainscape.com/packs/lango-spanish-advanced-ii-21649461",
]

# Desktop Chrome user agent so the site serves the full desktop markup.
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parsing helpers (copied from scrape_brainscape.py and scrape_examples.py)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def parse_title_and_week(text):
    """Extract week number and clean title from page text.

    Accepts "Week N: Title", "Semana N: Title", or "Semana N Title"
    (case-insensitive). Returns (week, title); week 0 when no pattern matches.
    """
    match = re.match(r'(?:Week|Semana)\s+(\d+)[:\s]+(.+)', text, re.IGNORECASE)
    if match is None:
        return 0, text.strip()
    return int(match.group(1)), match.group(2).strip()
|
||||
|
||||
|
||||
def parse_cards(text):
    """Parse flashcard Q/A pairs from page text.

    Brainscape renders each card as a numbered block: a line holding the card
    number, then the front text, the back text, and assorted UI chrome. We
    scan for number lines and collect the following non-chrome lines until
    the next sequential card number or a section marker appears.

    Returns a list of {"front": ..., "back": ...} dicts.
    """
    cards = []
    lines = text.split('\n')

    # UI chrome lines that never belong to card content.
    skip = {'Q', 'A', 'Study These Flashcards', '', 'Brainscape', 'Find Flashcards',
            'Make Flashcards', 'How It Works', 'Educators', 'Businesses', 'Academy',
            'Log in', 'Get Started'}

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        if re.match(r'^\d+$', line):
            num = int(line)
            parts = []
            j = i + 1
            # Collect up to 6 candidate content lines for this card.
            while j < len(lines) and len(parts) < 6:
                nextline = lines[j].strip()

                # Stop at the next sequential card number.
                if re.match(r'^\d+$', nextline) and int(nextline) == num + 1:
                    break
                # Stop at pack/deck section markers.
                if nextline.startswith('LanGo Spanish') or nextline.startswith('Decks in class'):
                    break
                if re.match(r'^(?:Week|Semana) \d+', nextline):
                    break
                if nextline in skip:
                    j += 1
                    continue

                parts.append(nextline)
                j += 1

            # First two collected lines are front/back; extras are ignored.
            if len(parts) >= 2:
                cards.append({
                    "front": parts[0],
                    "back": parts[1],
                })
            i = j
        else:
            i += 1

    # Drop pseudo-cards created by page chrome. The deck-count filter now
    # matches any "Decks in class (N)" instead of the hard-coded "(39)",
    # consistent with the startswith check in the collection loop above.
    cards = [c for c in cards if not re.match(r'^(?:Week|Semana) \d+', c['front'])
             and not re.match(r'^Decks in class \(\d+\)$', c['front'])
             and c['front'] != '# Cards'
             and not c['front'].startswith('LanGo Spanish')
             and not c['front'].startswith('You may prefer')]
    return cards
|
||||
|
||||
|
||||
def extract_word_for_lookup(front):
    """Extract the best lookup word from a card front.

    Strips leading Spanish articles (single or paired forms), keeps only the
    first comma/slash-separated alternative, and lowercases the result.
    """
    word = front.strip()
    # Drop a leading article ("el", "la", ...) then a paired form ("el/la", ...).
    for pattern in (r'^(el|la|los|las|un|una)\s+', r'^(el/la|los/las)\s+'):
        word = re.sub(pattern, '', word, flags=re.IGNORECASE)
    # Keep only the first alternative when several are listed.
    word = word.partition(',')[0].strip()
    word = word.partition('/')[0].strip()
    return word.lower().strip()
|
||||
|
||||
|
||||
def parse_examples(text, lookup_word):
    """Parse example sentences from SpanishDict page text.

    Handles two layouts: Spanish and English fused on a single line
    ("Hola.Hello."), and a Spanish line followed within two lines by its
    English translation. Collects at most MAX_EXAMPLES pairs.
    """
    examples = []
    needle = lookup_word.lower()
    lines = text.split('\n')

    for idx, raw in enumerate(lines):
        stripped = raw.strip()
        if not stripped or len(stripped) < 15:
            continue

        # Case 1: sentence-final punctuation immediately followed by a
        # capitalized English sentence on the same line.
        fused = re.match(r'^(.+?[.!?])([A-Z].+)$', stripped)
        if fused:
            es = fused.group(1).strip()
            en = fused.group(2).strip()
            if needle in es.lower() and len(es) > 10 and len(en) > 5:
                examples.append({"es": es, "en": en})
                if len(examples) >= MAX_EXAMPLES:
                    break
            continue

        # Case 2: Spanish line, translation within the next two lines.
        if needle in stripped.lower() and 15 < len(stripped) < 300:
            for follow in range(idx + 1, min(idx + 3, len(lines))):
                candidate = lines[follow].strip()
                if not candidate:
                    continue
                # Heuristic for "looks English": starts uppercase and has no
                # Spanish-specific characters.
                if (candidate[0].isupper() and
                        not any(ch in candidate for ch in ('á', 'é', 'í', 'ó', 'ú', 'ñ', '¿', '¡'))):
                    examples.append({"es": stripped, "en": candidate})
                    break

        if len(examples) >= MAX_EXAMPLES:
            break

    return examples
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scraping logic
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def discover_deck_urls(page, pack_url):
    """Visit a pack page and discover all deck URLs within it.

    Args:
        page: a Playwright page object.
        pack_url: absolute URL of a Brainscape pack.

    Returns:
        (course_name, deck_urls) — the human-readable course name and the
        site-relative deck hrefs, de-duplicated in page order.

    Fix: removed the unused local ``pack_id`` (computed from the URL but
    never referenced).
    """
    print(f"\nDiscovering decks in {pack_url}...")
    await page.goto(pack_url, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)

    # Scroll to load all content (deck list is lazily rendered).
    for _ in range(10):
        await page.evaluate("window.scrollBy(0, 1000)")
        await page.wait_for_timeout(300)

    # Find all deck links matching /flashcards/*/packs/*
    links = await page.eval_on_selector_all(
        'a[href*="/flashcards/"]',
        'els => els.map(e => e.getAttribute("href"))'
    )

    deck_urls = []
    seen = set()
    for href in links:
        if href and '/flashcards/' in href and '/packs/' in href:
            # Normalize absolute URLs to site-relative paths.
            if href.startswith('http'):
                href = href.replace(BASE_URL, '')
            if href not in seen:
                seen.add(href)
                deck_urls.append(href)

    # Extract course name from the page text.
    text = await page.inner_text("body")
    course_name = None
    # Try to find the "LanGo Spanish | ..." breadcrumb pattern.
    m = re.search(r'(LanGo Spanish\s*\|\s*[^>\n]+)', text)
    if m:
        course_name = m.group(1).strip()
        # Clean trailing breadcrumb noise and a "Flashcards" suffix.
        course_name = re.sub(r'\s*>\s*$', '', course_name).strip()
        course_name = re.sub(r'\s*Flashcards\s*$', '', course_name).strip()
    else:
        # Fallback: derive a readable name from the URL slug.
        slug = pack_url.rstrip('/').split('/')[-1]
        slug = re.sub(r'-\d+$', '', slug)
        course_name = slug.replace('-', ' ').title()

    print(f" Course: {course_name}")
    print(f" Found {len(deck_urls)} deck URLs")
    return course_name, deck_urls
|
||||
|
||||
|
||||
async def scrape_deck(page, url):
    """Scrape a single deck page for flashcard data.

    Returns a dict with week number, title, reversed-deck flag, card count,
    the parsed cards, and the (relative) deck URL.
    """
    target = BASE_URL + url if url.startswith('/') else url
    await page.goto(target, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)

    # Scroll so lazily rendered cards end up in the DOM.
    for _ in range(5):
        await page.evaluate("window.scrollBy(0, 1000)")
        await page.wait_for_timeout(300)

    text = await page.inner_text("body")

    # Title extraction: breadcrumb form "> Week N ... > Flashcards" first,
    # then a plain "Week N ... Flashcards" heading, finally the URL slug.
    title_match = re.search(r'>\s*((?:Week|Semana)\s+\d+[:\s].+?)\s*>\s*Flashcards', text)
    if title_match:
        raw_title = title_match.group(1).strip()
    else:
        heading_match = re.search(r'((?:Week|Semana)\s+\d+[:\s].+?)\s*Flashcards', text)
        if heading_match:
            raw_title = heading_match.group(1).strip()
        else:
            pieces = url.split('/')
            slug = pieces[2] if len(pieces) > 2 else url
            cleaned = re.sub(r'-\d+$', '', slug)
            cleaned = re.sub(r'-al-rev(e|é)s$', ' AL REVÉS', cleaned)
            raw_title = cleaned.replace('-', ' ').title()
            # Slug-derived titles may lack a week prefix; default to week 0.
            if not re.match(r'Week\s+(\d+)', raw_title, re.IGNORECASE):
                raw_title = "Week 0: " + raw_title

    week, title = parse_title_and_week(raw_title)
    cards = parse_cards(text)
    # "AL REVÉS" decks are reversed (English on the card front).
    is_reversed = "al rev" in url.lower() or "AL REVÉS" in raw_title.upper()

    return {
        "week": week,
        "title": title,
        "isReversed": is_reversed,
        "cardCount": len(cards),
        "cards": cards,
        "url": url,
    }
|
||||
|
||||
|
||||
async def scrape_examples_for_word(page, lookup):
    """Scrape example sentences from SpanishDict for a single word.

    Best-effort: any navigation/parsing failure yields an empty list.
    """
    try:
        await page.goto(
            f"https://www.spanishdict.com/translate/{lookup}",
            wait_until="domcontentloaded",
            timeout=15000,
        )
        await page.wait_for_timeout(2000)
        body_text = await page.inner_text("body")
        return parse_examples(body_text, lookup)
    except Exception:
        # A failed lookup for one word is non-fatal.
        return []
|
||||
|
||||
|
||||
def save_progress(data):
    """Save current data to output file (UTF-8, human-readable JSON)."""
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    with open(OUTPUT, 'w', encoding='utf-8') as sink:
        sink.write(payload)
|
||||
|
||||
|
||||
def load_progress():
    """Load existing progress if available; None when absent or unreadable."""
    if not os.path.exists(OUTPUT):
        return None
    try:
        with open(OUTPUT) as source:
            return json.load(source)
    except (json.JSONDecodeError, KeyError):
        # Corrupt or partial checkpoint: start fresh rather than crash.
        return None
|
||||
|
||||
|
||||
async def main():
    """Two-phase scraper entry point.

    Phase 1: for each pack URL, discover deck pages and scrape card data.
    Phase 2: look up example sentences on SpanishDict for every unique
    (non-reversed) card front.

    Progress is checkpointed to OUTPUT after each course and every 20
    example words, so an interrupted run can resume.

    Fix: the original set ``_examples_done = True`` on every course and then
    immediately popped the flag before the final save. That wiped the resume
    marker that ``load_progress``/``completed_courses`` rely on, forcing a
    full re-scrape on every run. The flag is now kept in the final save
    (downstream JSON consumers ignore unknown keys).
    """
    # Check for existing progress from a previous (interrupted) run.
    existing = load_progress()
    completed_courses = set()
    examples_done = {}  # lookup word -> list of {"es", "en"} examples

    if existing and 'courses' in existing:
        for course in existing['courses']:
            if course.get('_examples_done'):
                completed_courses.add(course['course'])
            # Collect already-scraped examples so Phase 2 can skip them.
            for week in course.get('weeks', []):
                for deck in week.get('decks', []):
                    for card in deck.get('cards', []):
                        if card.get('examples'):
                            lookup = extract_word_for_lookup(card['front'])
                            examples_done[lookup] = card['examples']
        print(f"Loaded progress: {len(completed_courses)} completed courses, {len(examples_done)} words with examples")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent=USER_AGENT)
        page = await context.new_page()

        all_courses = []

        # Carry fully-completed courses over from the previous run.
        if existing and 'courses' in existing:
            for course in existing['courses']:
                if course['course'] in completed_courses:
                    all_courses.append(course)

        # ---------------------------------------------------------------
        # Phase 1: Discover decks and scrape cards for each course pack
        # ---------------------------------------------------------------
        for pack_url in PACK_URLS:
            course_name, deck_urls = await discover_deck_urls(page, pack_url)

            # Skip packs whose course finished in a previous run.
            if course_name in completed_courses:
                print(f" Skipping {course_name} (already completed)")
                continue

            await page.wait_for_timeout(300)

            all_decks = []
            total_cards = 0

            for i, deck_url in enumerate(deck_urls):
                slug = deck_url.split('/')[2] if len(deck_url.split('/')) > 2 else deck_url
                print(f" [{i+1}/{len(deck_urls)}] Scraping {slug[:60]}...")
                try:
                    deck = await scrape_deck(page, deck_url)
                    all_decks.append(deck)
                    total_cards += deck["cardCount"]
                    print(f" -> Week {deck['week']}: {deck['title']} ({deck['cardCount']} cards)")
                except Exception as e:
                    # One bad deck shouldn't abort the whole course.
                    print(f" ERROR: {e}")

                await page.wait_for_timeout(300)

            # Organize decks by week number.
            weeks = {}
            for deck in all_decks:
                w = deck["week"]
                if w not in weeks:
                    weeks[w] = []
                weeks[w].append({
                    "title": deck["title"],
                    "isReversed": deck["isReversed"],
                    "cardCount": deck["cardCount"],
                    "cards": deck["cards"],
                })

            course_data = {
                "course": course_name,
                "totalDecks": len(all_decks),
                "totalCards": total_cards,
                "_examples_done": False,  # flipped to True after Phase 2
                "weeks": [
                    {"week": w, "decks": weeks[w]}
                    for w in sorted(weeks.keys())
                ],
            }
            all_courses.append(course_data)

            # Checkpoint after each course.
            save_progress({"courses": all_courses})
            print(f" Saved {course_name}: {len(all_decks)} decks, {total_cards} cards")

        # ---------------------------------------------------------------
        # Phase 2: Scrape example sentences from SpanishDict
        # ---------------------------------------------------------------
        print("\n" + "=" * 60)
        print("Phase 2: Scraping example sentences from SpanishDict")
        print("=" * 60)

        # Collect all unique lookup words across all courses, skipping
        # reversed decks (their fronts are English).
        unique_words = {}  # lookup -> original front
        for course in all_courses:
            for week in course['weeks']:
                for deck in week['decks']:
                    if deck.get('isReversed'):
                        continue
                    for card in deck['cards']:
                        front = card['front']
                        lookup = extract_word_for_lookup(front)
                        if lookup and lookup not in unique_words:
                            unique_words[lookup] = front

        print(f"Found {len(unique_words)} unique words to look up")
        print(f"Already have examples for {len(examples_done)} words")

        words_scraped = 0
        total_words = len(unique_words)

        for i, (lookup, original) in enumerate(unique_words.items()):
            if lookup in examples_done:
                continue

            print(f"[{i+1}/{total_words}] {lookup}...", end=" ", flush=True)
            try:
                examples = await scrape_examples_for_word(page, lookup)
                examples_done[lookup] = examples
                if examples:
                    print(f"{len(examples)} examples")
                else:
                    print("no examples")
            except Exception as e:
                # Record the failure as "no examples" so we don't retry forever.
                print(f"error: {e}")
                examples_done[lookup] = []

            words_scraped += 1

            # Checkpoint every 20 newly scraped words.
            if words_scraped % 20 == 0:
                # Attach examples to cards before saving.
                _attach_examples(all_courses, examples_done)
                save_progress({"courses": all_courses})
                print(f" [saved progress - {len(examples_done)} words done]")

            await page.wait_for_timeout(300)

        await browser.close()

    # ---------------------------------------------------------------
    # Final: attach all examples to cards and save
    # ---------------------------------------------------------------
    _attach_examples(all_courses, examples_done)

    # Mark every course complete so a future run resumes instead of
    # re-scraping (see docstring: the flag is intentionally kept now).
    for course in all_courses:
        course['_examples_done'] = True

    save_progress({"courses": all_courses})

    total_decks = sum(c['totalDecks'] for c in all_courses)
    total_cards = sum(c['totalCards'] for c in all_courses)
    print(f"\nDone! {len(all_courses)} courses, {total_decks} decks, {total_cards} cards")
    print(f"Examples scraped for {len(examples_done)} unique words")
    print(f"Output: {OUTPUT}")
|
||||
|
||||
|
||||
def _attach_examples(courses, examples_done):
    """Attach scraped examples to card objects in place.

    Cards whose lookup word has scraped examples get them; cards without a
    match get an empty ``examples`` list unless one is already present.
    """
    for course in courses:
        for week in course['weeks']:
            for deck in week['decks']:
                for card in deck['cards']:
                    key = extract_word_for_lookup(card['front'])
                    scraped = examples_done.get(key)
                    if scraped:
                        card['examples'] = scraped
                    elif 'examples' not in card:
                        card['examples'] = []
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: run the full two-phase scrape (decks, then examples).
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user