Initial commit: Conjuga Spanish conjugation app

Includes SwiftData dual-store architecture (local reference + CloudKit user data), JSON-based data seeding, 20 tense guides, 20 grammar notes, SRS review system, course vocabulary, and widget support.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 20:58:33 -05:00
commit 4b467ec136
95 changed files with 82599 additions and 0 deletions
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""
+Scrape 2-3 example sentences per vocab word from SpanishDict.
+Reads words from course_data.json, outputs examples to course_examples.json.
+"""
+
+import asyncio
+import json
+import re
+import os
+from playwright.async_api import async_playwright
+
+# Absolute, machine-specific paths: INPUT is the course data read by main();
+# OUTPUT is the examples file main() writes and reloads to resume a run.
+INPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_data.json"
+OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_examples.json"
+# Upper bound on examples kept per word; parse_examples() stops once reached.
+MAX_EXAMPLES = 3
+
+def extract_word_for_lookup(front):
+    """Extract the best lookup word from a card front.
+
+    A leading article (single, or a slash-joined pair) is dropped, then only
+    the first of any comma- or slash-separated alternatives is kept.
+
+    e.g. 'barato, barata' -> 'barato'
+    e.g. 'el/la periodista' -> 'periodista'
+    e.g. 'un/una artista' -> 'artista'
+
+    Args:
+        front: Text shown on the front of a vocab card.
+
+    Returns:
+        The lower-cased lookup word; '' if nothing is left after cleaning.
+    """
+    word = front.strip()
+    # Remove one leading article. Slash-joined pairs are covered as well as
+    # single articles so that 'un/una artista' loses the whole pair; without
+    # that, the slash split below would reduce it to the bare article 'un'.
+    word = re.sub(
+        r'^(?:el/la|los/las|un/una|unos/unas|el|la|los|las|un|una)\s+',
+        '',
+        word,
+        flags=re.IGNORECASE,
+    )
+    # Keep only the first alternative: 'barato, barata' -> 'barato',
+    # 'alto/alta' -> 'alto'. split() returns the whole string when the
+    # separator is absent, so no membership test is needed.
+    word = word.split(',')[0].strip()
+    word = word.split('/')[0].strip()
+    return word.lower()
+
+
+def parse_examples(text, lookup_word, max_examples=None):
+    """Parse example sentences from SpanishDict page text.
+
+    Two layouts are recognised, line by line:
+
+    * inline: Spanish and English fused on one line with no space between
+      the closing punctuation and the English capital, e.g.
+      "Esta tienda es muy barata.This store is really cheap."
+    * stacked: a Spanish line containing the word, with its English
+      translation as the first non-empty line among the next two lines.
+
+    Args:
+        text: Page text, with lines separated by newlines.
+        lookup_word: Word the Spanish sentence must contain
+            (case-insensitive substring match).
+        max_examples: Maximum number of examples to return; defaults to the
+            module-level MAX_EXAMPLES.
+
+    Returns:
+        A list of {"es": ..., "en": ...} dicts in page order, without
+        repeated Spanish sentences.
+    """
+    limit = MAX_EXAMPLES if max_examples is None else max_examples
+    needle = lookup_word.lower()
+    # An empty needle is a substring of every line, so it would accept
+    # arbitrary page text as an "example".
+    if not needle.strip() or limit <= 0:
+        return []
+
+    # Characters that mark a line as Spanish rather than English. Candidates
+    # are lower-cased before the check so 'Él ...' or 'Ñu ...' are caught too.
+    spanish_marks = set('áéíóúüñ¿¡')
+    examples = []
+    seen_es = set()
+    lines = text.split('\n')
+
+    def record(es, en):
+        # The same sentence can occur more than once in the page text; keep
+        # the first occurrence so duplicates do not use up the limit.
+        if es not in seen_es:
+            seen_es.add(es)
+            examples.append({"es": es, "en": en})
+
+    for i, raw in enumerate(lines):
+        if len(examples) >= limit:
+            break
+        line = raw.strip()
+        if len(line) < 15:
+            continue
+
+        # Inline layout: shortest prefix ending in . ! or ? that is directly
+        # followed by an ASCII capital is taken as the Spanish sentence.
+        inline_match = re.match(r'^(.+?[.!?])([A-Z].+)$', line)
+        if inline_match:
+            es = inline_match.group(1).strip()
+            en = inline_match.group(2).strip()
+            if needle in es.lower() and len(es) > 10 and len(en) > 5:
+                record(es, en)
+                continue
+            # Otherwise fall through: the line may still be a stacked example.
+
+        # Stacked layout: Spanish line here, English on a following line.
+        if needle in line.lower() and 15 < len(line) < 300:
+            for following in lines[i + 1:i + 3]:
+                candidate = following.strip()
+                if not candidate:
+                    continue
+                # Looks English: starts with a capital and carries no
+                # Spanish-only characters.
+                if (candidate[0].isupper()
+                        and spanish_marks.isdisjoint(candidate.lower())):
+                    record(line, candidate)
+                # Only the first non-empty following line is considered.
+                break
+
+    return examples
+
+
+async def scrape_word(page, word, lookup):
+    """Scrape examples for a single word.
+
+    Args:
+        page: Playwright page used to load the SpanishDict entry.
+        word: Original card front; not used here, kept so existing callers
+            keep working.
+        lookup: Cleaned lookup word; it forms the URL path and is the word
+            the returned examples must contain.
+
+    Returns:
+        The list produced by parse_examples(), or [] when loading or reading
+        the page fails.
+    """
+    import sys
+    from urllib.parse import quote
+
+    # Percent-encode the word: a space, '?', '#' or '/' in a card front would
+    # otherwise alter the URL structure instead of being part of the path.
+    url = f"https://www.spanishdict.com/translate/{quote(lookup, safe='')}"
+    try:
+        await page.goto(url, wait_until="domcontentloaded", timeout=15000)
+        # Fixed 2 s pause after DOMContentLoaded before reading the text;
+        # presumably gives client-side rendering time to finish.
+        await page.wait_for_timeout(2000)
+        text = await page.inner_text("body")
+    except Exception as e:
+        # Best effort: one failed page must not stop the whole run, but the
+        # failure is reported instead of being silently discarded.
+        print(f"scrape failed for {lookup!r}: {e}", file=sys.stderr)
+        return []
+    return parse_examples(text, lookup)
+
+
+async def main():
+    """Scrape examples for every course word and write them to OUTPUT.
+
+    Reads the course JSON at INPUT, collects one lookup word per card front
+    from non-reversed decks, scrapes each word whose front is not already a
+    key in OUTPUT, and stores the examples keyed by the original card front.
+    Progress is written every 20 scraped words and again on exit, including
+    when the run fails or is interrupted, so a later run can resume.
+    """
+    # Load course data. It holds accented Spanish text, so read it as UTF-8
+    # rather than with the platform's default encoding.
+    with open(INPUT, encoding='utf-8') as f:
+        data = json.load(f)
+
+    # Collect unique words (front values from non-reversed decks)
+    words = {}  # lookup -> original front
+    for week in data['weeks']:
+        for deck in week['decks']:
+            if deck.get('isReversed'):
+                continue
+            for card in deck['cards']:
+                front = card['front']
+                lookup = extract_word_for_lookup(front)
+                if lookup and lookup not in words:
+                    words[lookup] = front
+
+    print(f"Found {len(words)} unique words to look up")
+
+    # Load existing progress if any. OUTPUT is written as UTF-8 below, so it
+    # is read back with the same encoding.
+    results = {}
+    if os.path.exists(OUTPUT):
+        with open(OUTPUT, encoding='utf-8') as f:
+            results = json.load(f)
+        print(f"Loaded {len(results)} existing results")
+
+    def save_results():
+        with open(OUTPUT, 'w', encoding='utf-8') as f:
+            json.dump(results, f, ensure_ascii=False, indent=2)
+
+    total = len(words)
+    scraped = 0  # words fetched during this run (skipped words excluded)
+
+    try:
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            try:
+                ctx = await browser.new_context(
+                    user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+                )
+                page = await ctx.new_page()
+
+                for i, (lookup, original) in enumerate(words.items()):
+                    # Skip already scraped
+                    if original in results:
+                        continue
+
+                    print(f"[{i+1}/{total}] {lookup}...", end=" ", flush=True)
+                    try:
+                        examples = await scrape_word(page, original, lookup)
+                    except Exception as e:
+                        print(f"error: {e}")
+                        examples = []
+                    else:
+                        if examples:
+                            print(f"{len(examples)} examples")
+                        else:
+                            print("no examples")
+                    results[original] = examples
+                    scraped += 1
+
+                    # Save progress every 20 scraped words
+                    if scraped % 20 == 0:
+                        save_results()
+                        print(f"  [saved {len(results)} results]")
+
+                    # Short pause between consecutive page loads
+                    await page.wait_for_timeout(300)
+            finally:
+                # Close the browser even when the loop is cut short.
+                await browser.close()
+    finally:
+        # Save results on every exit path so an interrupted run keeps what
+        # it has scraped so far.
+        save_results()
+
+    # Count only words of this course that actually have examples; entries
+    # stored as [] (no examples found) are not successes.
+    found = sum(1 for original in words.values() if results.get(original))
+    print(f"\nDone! {found}/{total} words with examples → {OUTPUT}")
+
+
+# Start the scraper only when this file is executed as a script; importing
+# the module does not launch a browser.
+if __name__ == "__main__":
+    asyncio.run(main())