Initial commit: Conjuga Spanish conjugation app
Includes SwiftData dual-store architecture (local reference + CloudKit user data), JSON-based data seeding, 20 tense guides, 20 grammar notes, SRS review system, course vocabulary, and widget support. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
453
Conjuga/Scripts/scrape_all_courses.py
Normal file
453
Conjuga/Scripts/scrape_all_courses.py
Normal file
@@ -0,0 +1,453 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Scrape 7 LanGo Spanish course packs from Brainscape, plus example sentences
|
||||
from SpanishDict. Outputs all_courses_data.json with all courses, decks, cards,
|
||||
and examples organized by week.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
# Site root, used to resolve relative deck hrefs into absolute URLs.
BASE_URL = "https://www.brainscape.com"
# Destination for the scraped JSON. NOTE(review): hardcoded absolute path to
# one developer's machine — consider deriving from __file__ instead.
OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/all_courses_data.json"
# Maximum number of example sentences kept per vocabulary word.
MAX_EXAMPLES = 3

# The seven LanGo Spanish course packs to scrape, in curriculum order.
PACK_URLS = [
    "https://www.brainscape.com/packs/lango-spanish-beginner-ii-16514996",
    "https://www.brainscape.com/packs/lango-spanish-beginner-iii-conversation-18477688",
    "https://www.brainscape.com/packs/lango-spanish-intermediate-i-21508666",
    "https://www.brainscape.com/packs/lango-spanish-intermediate-ii-21906841",
    "https://www.brainscape.com/packs/lango-spanish-intermediate-iii-spanish-through-stories-20677744",
    "https://www.brainscape.com/packs/lango-spanish-advanced-i-21511244",
    "https://www.brainscape.com/packs/lango-spanish-advanced-ii-21649461",
]

# Desktop Chrome user agent so the site serves the full desktop markup.
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parsing helpers (copied from scrape_brainscape.py and scrape_examples.py)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def parse_title_and_week(text):
    """Extract week number and clean title from page text.

    Accepts "Week N: Title", "Semana N: Title", or "Semana N Title"
    (case-insensitive). Returns (week, title); week 0 when no pattern matches.
    """
    match = re.match(r'(?:Week|Semana)\s+(\d+)[:\s]+(.+)', text, re.IGNORECASE)
    if match is None:
        return 0, text.strip()
    return int(match.group(1)), match.group(2).strip()
|
||||
|
||||
|
||||
def parse_cards(text):
    """Parse flashcard Q/A pairs from page text.

    Brainscape renders each card as a numbered block: a line holding the card
    number, then the front text, the back text, and assorted UI chrome. We
    scan for number lines and collect the following non-chrome lines until
    the next sequential card number or a section marker appears.

    Returns a list of {"front": ..., "back": ...} dicts.
    """
    cards = []
    lines = text.split('\n')

    # UI chrome lines that never belong to card content.
    skip = {'Q', 'A', 'Study These Flashcards', '', 'Brainscape', 'Find Flashcards',
            'Make Flashcards', 'How It Works', 'Educators', 'Businesses', 'Academy',
            'Log in', 'Get Started'}

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        if re.match(r'^\d+$', line):
            num = int(line)
            parts = []
            j = i + 1
            # Collect up to 6 candidate content lines for this card.
            while j < len(lines) and len(parts) < 6:
                nextline = lines[j].strip()

                # Stop at the next sequential card number.
                if re.match(r'^\d+$', nextline) and int(nextline) == num + 1:
                    break
                # Stop at pack/deck section markers.
                if nextline.startswith('LanGo Spanish') or nextline.startswith('Decks in class'):
                    break
                if re.match(r'^(?:Week|Semana) \d+', nextline):
                    break
                if nextline in skip:
                    j += 1
                    continue

                parts.append(nextline)
                j += 1

            # First two collected lines are front/back; extras are ignored.
            if len(parts) >= 2:
                cards.append({
                    "front": parts[0],
                    "back": parts[1],
                })
            i = j
        else:
            i += 1

    # Drop pseudo-cards created by page chrome. The deck-count filter now
    # matches any "Decks in class (N)" instead of the hard-coded "(39)",
    # consistent with the startswith check in the collection loop above.
    cards = [c for c in cards if not re.match(r'^(?:Week|Semana) \d+', c['front'])
             and not re.match(r'^Decks in class \(\d+\)$', c['front'])
             and c['front'] != '# Cards'
             and not c['front'].startswith('LanGo Spanish')
             and not c['front'].startswith('You may prefer')]
    return cards
|
||||
|
||||
|
||||
def extract_word_for_lookup(front):
    """Extract the best lookup word from a card front.

    Strips leading Spanish articles (single or paired forms), keeps only the
    first comma/slash-separated alternative, and lowercases the result.
    """
    word = front.strip()
    # Drop a leading article ("el", "la", ...) then a paired form ("el/la", ...).
    for pattern in (r'^(el|la|los|las|un|una)\s+', r'^(el/la|los/las)\s+'):
        word = re.sub(pattern, '', word, flags=re.IGNORECASE)
    # Keep only the first alternative when several are listed.
    word = word.partition(',')[0].strip()
    word = word.partition('/')[0].strip()
    return word.lower().strip()
|
||||
|
||||
|
||||
def parse_examples(text, lookup_word):
    """Parse example sentences from SpanishDict page text.

    Handles two layouts: Spanish and English fused on a single line
    ("Hola.Hello."), and a Spanish line followed within two lines by its
    English translation. Collects at most MAX_EXAMPLES pairs.
    """
    examples = []
    needle = lookup_word.lower()
    lines = text.split('\n')

    for idx, raw in enumerate(lines):
        stripped = raw.strip()
        if not stripped or len(stripped) < 15:
            continue

        # Case 1: sentence-final punctuation immediately followed by a
        # capitalized English sentence on the same line.
        fused = re.match(r'^(.+?[.!?])([A-Z].+)$', stripped)
        if fused:
            es = fused.group(1).strip()
            en = fused.group(2).strip()
            if needle in es.lower() and len(es) > 10 and len(en) > 5:
                examples.append({"es": es, "en": en})
                if len(examples) >= MAX_EXAMPLES:
                    break
            continue

        # Case 2: Spanish line, translation within the next two lines.
        if needle in stripped.lower() and 15 < len(stripped) < 300:
            for follow in range(idx + 1, min(idx + 3, len(lines))):
                candidate = lines[follow].strip()
                if not candidate:
                    continue
                # Heuristic for "looks English": starts uppercase and has no
                # Spanish-specific characters.
                if (candidate[0].isupper() and
                        not any(ch in candidate for ch in ('á', 'é', 'í', 'ó', 'ú', 'ñ', '¿', '¡'))):
                    examples.append({"es": stripped, "en": candidate})
                    break

        if len(examples) >= MAX_EXAMPLES:
            break

    return examples
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scraping logic
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def discover_deck_urls(page, pack_url):
    """Visit a pack page and discover all deck URLs within it.

    Args:
        page: a Playwright page object.
        pack_url: absolute URL of a Brainscape pack.

    Returns:
        (course_name, deck_urls) — the human-readable course name and the
        site-relative deck hrefs, de-duplicated in page order.

    Fix: removed the unused local ``pack_id`` (computed from the URL but
    never referenced).
    """
    print(f"\nDiscovering decks in {pack_url}...")
    await page.goto(pack_url, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)

    # Scroll to load all content (deck list is lazily rendered).
    for _ in range(10):
        await page.evaluate("window.scrollBy(0, 1000)")
        await page.wait_for_timeout(300)

    # Find all deck links matching /flashcards/*/packs/*
    links = await page.eval_on_selector_all(
        'a[href*="/flashcards/"]',
        'els => els.map(e => e.getAttribute("href"))'
    )

    deck_urls = []
    seen = set()
    for href in links:
        if href and '/flashcards/' in href and '/packs/' in href:
            # Normalize absolute URLs to site-relative paths.
            if href.startswith('http'):
                href = href.replace(BASE_URL, '')
            if href not in seen:
                seen.add(href)
                deck_urls.append(href)

    # Extract course name from the page text.
    text = await page.inner_text("body")
    course_name = None
    # Try to find the "LanGo Spanish | ..." breadcrumb pattern.
    m = re.search(r'(LanGo Spanish\s*\|\s*[^>\n]+)', text)
    if m:
        course_name = m.group(1).strip()
        # Clean trailing breadcrumb noise and a "Flashcards" suffix.
        course_name = re.sub(r'\s*>\s*$', '', course_name).strip()
        course_name = re.sub(r'\s*Flashcards\s*$', '', course_name).strip()
    else:
        # Fallback: derive a readable name from the URL slug.
        slug = pack_url.rstrip('/').split('/')[-1]
        slug = re.sub(r'-\d+$', '', slug)
        course_name = slug.replace('-', ' ').title()

    print(f" Course: {course_name}")
    print(f" Found {len(deck_urls)} deck URLs")
    return course_name, deck_urls
|
||||
|
||||
|
||||
async def scrape_deck(page, url):
    """Scrape a single deck page for flashcard data.

    Returns a dict with week number, title, reversed-deck flag, card count,
    the parsed cards, and the (relative) deck URL.
    """
    target = BASE_URL + url if url.startswith('/') else url
    await page.goto(target, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)

    # Scroll so lazily rendered cards end up in the DOM.
    for _ in range(5):
        await page.evaluate("window.scrollBy(0, 1000)")
        await page.wait_for_timeout(300)

    text = await page.inner_text("body")

    # Title extraction: breadcrumb form "> Week N ... > Flashcards" first,
    # then a plain "Week N ... Flashcards" heading, finally the URL slug.
    title_match = re.search(r'>\s*((?:Week|Semana)\s+\d+[:\s].+?)\s*>\s*Flashcards', text)
    if title_match:
        raw_title = title_match.group(1).strip()
    else:
        heading_match = re.search(r'((?:Week|Semana)\s+\d+[:\s].+?)\s*Flashcards', text)
        if heading_match:
            raw_title = heading_match.group(1).strip()
        else:
            pieces = url.split('/')
            slug = pieces[2] if len(pieces) > 2 else url
            cleaned = re.sub(r'-\d+$', '', slug)
            cleaned = re.sub(r'-al-rev(e|é)s$', ' AL REVÉS', cleaned)
            raw_title = cleaned.replace('-', ' ').title()
            # Slug-derived titles may lack a week prefix; default to week 0.
            if not re.match(r'Week\s+(\d+)', raw_title, re.IGNORECASE):
                raw_title = "Week 0: " + raw_title

    week, title = parse_title_and_week(raw_title)
    cards = parse_cards(text)
    # "AL REVÉS" decks are reversed (English on the card front).
    is_reversed = "al rev" in url.lower() or "AL REVÉS" in raw_title.upper()

    return {
        "week": week,
        "title": title,
        "isReversed": is_reversed,
        "cardCount": len(cards),
        "cards": cards,
        "url": url,
    }
|
||||
|
||||
|
||||
async def scrape_examples_for_word(page, lookup):
    """Scrape example sentences from SpanishDict for a single word.

    Best-effort: any navigation/parsing failure yields an empty list.
    """
    try:
        await page.goto(
            f"https://www.spanishdict.com/translate/{lookup}",
            wait_until="domcontentloaded",
            timeout=15000,
        )
        await page.wait_for_timeout(2000)
        body_text = await page.inner_text("body")
        return parse_examples(body_text, lookup)
    except Exception:
        # A failed lookup for one word is non-fatal.
        return []
|
||||
|
||||
|
||||
def save_progress(data):
    """Save current data to output file (UTF-8, human-readable JSON)."""
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    with open(OUTPUT, 'w', encoding='utf-8') as sink:
        sink.write(payload)
|
||||
|
||||
|
||||
def load_progress():
    """Load existing progress if available; None when absent or unreadable."""
    if not os.path.exists(OUTPUT):
        return None
    try:
        with open(OUTPUT) as source:
            return json.load(source)
    except (json.JSONDecodeError, KeyError):
        # Corrupt or partial checkpoint: start fresh rather than crash.
        return None
|
||||
|
||||
|
||||
async def main():
    """Two-phase scraper entry point.

    Phase 1: for each pack URL, discover deck pages and scrape card data.
    Phase 2: look up example sentences on SpanishDict for every unique
    (non-reversed) card front.

    Progress is checkpointed to OUTPUT after each course and every 20
    example words, so an interrupted run can resume.

    Fix: the original set ``_examples_done = True`` on every course and then
    immediately popped the flag before the final save. That wiped the resume
    marker that ``load_progress``/``completed_courses`` rely on, forcing a
    full re-scrape on every run. The flag is now kept in the final save
    (downstream JSON consumers ignore unknown keys).
    """
    # Check for existing progress from a previous (interrupted) run.
    existing = load_progress()
    completed_courses = set()
    examples_done = {}  # lookup word -> list of {"es", "en"} examples

    if existing and 'courses' in existing:
        for course in existing['courses']:
            if course.get('_examples_done'):
                completed_courses.add(course['course'])
            # Collect already-scraped examples so Phase 2 can skip them.
            for week in course.get('weeks', []):
                for deck in week.get('decks', []):
                    for card in deck.get('cards', []):
                        if card.get('examples'):
                            lookup = extract_word_for_lookup(card['front'])
                            examples_done[lookup] = card['examples']
        print(f"Loaded progress: {len(completed_courses)} completed courses, {len(examples_done)} words with examples")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent=USER_AGENT)
        page = await context.new_page()

        all_courses = []

        # Carry fully-completed courses over from the previous run.
        if existing and 'courses' in existing:
            for course in existing['courses']:
                if course['course'] in completed_courses:
                    all_courses.append(course)

        # ---------------------------------------------------------------
        # Phase 1: Discover decks and scrape cards for each course pack
        # ---------------------------------------------------------------
        for pack_url in PACK_URLS:
            course_name, deck_urls = await discover_deck_urls(page, pack_url)

            # Skip packs whose course finished in a previous run.
            if course_name in completed_courses:
                print(f" Skipping {course_name} (already completed)")
                continue

            await page.wait_for_timeout(300)

            all_decks = []
            total_cards = 0

            for i, deck_url in enumerate(deck_urls):
                slug = deck_url.split('/')[2] if len(deck_url.split('/')) > 2 else deck_url
                print(f" [{i+1}/{len(deck_urls)}] Scraping {slug[:60]}...")
                try:
                    deck = await scrape_deck(page, deck_url)
                    all_decks.append(deck)
                    total_cards += deck["cardCount"]
                    print(f" -> Week {deck['week']}: {deck['title']} ({deck['cardCount']} cards)")
                except Exception as e:
                    # One bad deck shouldn't abort the whole course.
                    print(f" ERROR: {e}")

                await page.wait_for_timeout(300)

            # Organize decks by week number.
            weeks = {}
            for deck in all_decks:
                w = deck["week"]
                if w not in weeks:
                    weeks[w] = []
                weeks[w].append({
                    "title": deck["title"],
                    "isReversed": deck["isReversed"],
                    "cardCount": deck["cardCount"],
                    "cards": deck["cards"],
                })

            course_data = {
                "course": course_name,
                "totalDecks": len(all_decks),
                "totalCards": total_cards,
                "_examples_done": False,  # flipped to True after Phase 2
                "weeks": [
                    {"week": w, "decks": weeks[w]}
                    for w in sorted(weeks.keys())
                ],
            }
            all_courses.append(course_data)

            # Checkpoint after each course.
            save_progress({"courses": all_courses})
            print(f" Saved {course_name}: {len(all_decks)} decks, {total_cards} cards")

        # ---------------------------------------------------------------
        # Phase 2: Scrape example sentences from SpanishDict
        # ---------------------------------------------------------------
        print("\n" + "=" * 60)
        print("Phase 2: Scraping example sentences from SpanishDict")
        print("=" * 60)

        # Collect all unique lookup words across all courses, skipping
        # reversed decks (their fronts are English).
        unique_words = {}  # lookup -> original front
        for course in all_courses:
            for week in course['weeks']:
                for deck in week['decks']:
                    if deck.get('isReversed'):
                        continue
                    for card in deck['cards']:
                        front = card['front']
                        lookup = extract_word_for_lookup(front)
                        if lookup and lookup not in unique_words:
                            unique_words[lookup] = front

        print(f"Found {len(unique_words)} unique words to look up")
        print(f"Already have examples for {len(examples_done)} words")

        words_scraped = 0
        total_words = len(unique_words)

        for i, (lookup, original) in enumerate(unique_words.items()):
            if lookup in examples_done:
                continue

            print(f"[{i+1}/{total_words}] {lookup}...", end=" ", flush=True)
            try:
                examples = await scrape_examples_for_word(page, lookup)
                examples_done[lookup] = examples
                if examples:
                    print(f"{len(examples)} examples")
                else:
                    print("no examples")
            except Exception as e:
                # Record the failure as "no examples" so we don't retry forever.
                print(f"error: {e}")
                examples_done[lookup] = []

            words_scraped += 1

            # Checkpoint every 20 newly scraped words.
            if words_scraped % 20 == 0:
                # Attach examples to cards before saving.
                _attach_examples(all_courses, examples_done)
                save_progress({"courses": all_courses})
                print(f" [saved progress - {len(examples_done)} words done]")

            await page.wait_for_timeout(300)

        await browser.close()

    # ---------------------------------------------------------------
    # Final: attach all examples to cards and save
    # ---------------------------------------------------------------
    _attach_examples(all_courses, examples_done)

    # Mark every course complete so a future run resumes instead of
    # re-scraping (see docstring: the flag is intentionally kept now).
    for course in all_courses:
        course['_examples_done'] = True

    save_progress({"courses": all_courses})

    total_decks = sum(c['totalDecks'] for c in all_courses)
    total_cards = sum(c['totalCards'] for c in all_courses)
    print(f"\nDone! {len(all_courses)} courses, {total_decks} decks, {total_cards} cards")
    print(f"Examples scraped for {len(examples_done)} unique words")
    print(f"Output: {OUTPUT}")
|
||||
|
||||
|
||||
def _attach_examples(courses, examples_done):
    """Attach scraped examples to card objects in place.

    Cards whose lookup word has scraped examples get them; cards without a
    match get an empty ``examples`` list unless one is already present.
    """
    for course in courses:
        for week in course['weeks']:
            for deck in week['decks']:
                for card in deck['cards']:
                    key = extract_word_for_lookup(card['front'])
                    scraped = examples_done.get(key)
                    if scraped:
                        card['examples'] = scraped
                    elif 'examples' not in card:
                        card['examples'] = []
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: run the full two-phase scrape (decks, then examples).
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user