Initial commit: Conjuga Spanish conjugation app

Includes SwiftData dual-store architecture (local reference + CloudKit user data),
JSON-based data seeding, 20 tense guides, 20 grammar notes, SRS review system,
course vocabulary, and widget support.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-04-09 20:58:33 -05:00
commit 4b467ec136
95 changed files with 82599 additions and 0 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,14 @@
#!/usr/bin/env swift
// Generates a pre-built SwiftData store (default.store) that ships in the
// app bundle so no first-launch seeding is needed.
import Foundation
import SwiftData
// NOTE: this cannot run standalone — the @Model types it would persist only
// exist once the app target is compiled. Store generation therefore happens
// inside the app via DataLoader.buildPreloadedStore(); this stub only
// documents the procedure for whoever runs it by hand.
print("Use DataLoader.buildPreloadedStore() from within the app to generate the store.")
print("Then copy the .store file to the bundle.")

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,160 @@
{
"stats": {
"verbs": 1750,
"verbForms": 209014,
"irregularSpans": 14078,
"tenseGuides": 20
},
"sampleVerb": {
"id": 1,
"infinitive": "ser",
"english": "to be",
"rank": 1,
"ending": "er",
"reflexive": 0,
"level": "basic",
"hasConjuuData": true
},
"sampleForms": [
{
"verbId": 1,
"tenseId": "ind_presente",
"personIndex": 0,
"form": "soy",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_presente",
"personIndex": 1,
"form": "eres",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_presente",
"personIndex": 2,
"form": "es",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_presente",
"personIndex": 3,
"form": "somos",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_presente",
"personIndex": 4,
"form": "sois",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_presente",
"personIndex": 5,
"form": "son",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_preterito",
"personIndex": 0,
"form": "fui",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_preterito",
"personIndex": 1,
"form": "fuiste",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_preterito",
"personIndex": 2,
"form": "fue",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_preterito",
"personIndex": 3,
"form": "fuimos",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_preterito",
"personIndex": 4,
"form": "fuisteis",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_preterito",
"personIndex": 5,
"form": "fueron",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_imperfecto",
"personIndex": 0,
"form": "era",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_imperfecto",
"personIndex": 1,
"form": "eras",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_imperfecto",
"personIndex": 2,
"form": "era",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_imperfecto",
"personIndex": 3,
"form": "éramos",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_imperfecto",
"personIndex": 4,
"form": "erais",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_imperfecto",
"personIndex": 5,
"form": "eran",
"regularity": "irregular"
},
{
"verbId": 1,
"tenseId": "ind_futuro",
"personIndex": 0,
"form": "seré",
"regularity": "ordinary"
},
{
"verbId": 1,
"tenseId": "ind_futuro",
"personIndex": 1,
"form": "serás",
"regularity": "ordinary"
}
]
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,550 @@
#!/usr/bin/env python3
"""
Merge ConjuGato + Conjuu ES data into unified JSON for Conjuga app.
Sources:
- ConjuGato: 1,750 verbs (verb.md), irregular forms, spans, irregularity bitmasks
- Conjuu ES: 621 verbs with full conjugation tables, tense guides, conjugation rules
Output: conjuga_data.json with all verbs, forms, spans, guides
"""
import csv
import json
import re
import sqlite3
import os
import plistlib
import subprocess
# NOTE(review): machine-specific absolute paths — this script only runs on
# the original workstation with ConjuGato and Conjuu ES installed locally.
BASE = "/Users/treyt/Desktop/code/Spanish"
# ConjuGato's bundled SQLite database (verbs, irregular forms, spans).
CONJUGATO_DB = "/Applications/ConjuGato.app/WrappedBundle/Verbs.sqlite"
# Conjuu ES resources: conjugation CSV, localized guide strings,
# conjugation-rule plist, and the directory holding the per-level CSVs.
CONJUU_VOCAB = "/Applications/Conjuu ES.app/Contents/Resources/Vocabulary.csv"
CONJUU_GUIDE = "/Applications/Conjuu ES.app/Contents/Resources/en.lproj/Guide.strings"
CONJUU_RULES = "/Applications/Conjuu ES.app/Contents/Resources/GuideTableEntries.plist"
CONJUU_LEVELS = "/Applications/Conjuu ES.app/Contents/Resources"
# Merged JSON consumed by the app's data-seeding step.
OUTPUT = os.path.join(BASE, "Conjuga", "Scripts", "conjuga_data.json")
# ─── Tense metadata ───
# The 20 tenses the app supports, in display order. `id` is the stable
# internal key used throughout the merged JSON; `spanish` matches the tense
# names used by the Conjuu ES Vocabulary.csv.
TENSES = [
    {"id": "ind_presente", "spanish": "Indicativo Presente", "english": "Present", "mood": "Indicative", "order": 0},
    {"id": "ind_preterito", "spanish": "Indicativo Pretérito", "english": "Preterite", "mood": "Indicative", "order": 1},
    {"id": "ind_imperfecto", "spanish": "Indicativo Imperfecto", "english": "Imperfect", "mood": "Indicative", "order": 2},
    {"id": "ind_futuro", "spanish": "Indicativo Futuro", "english": "Future", "mood": "Indicative", "order": 3},
    {"id": "ind_perfecto", "spanish": "Indicativo Perfecto", "english": "Present Perfect", "mood": "Indicative", "order": 4},
    {"id": "ind_pluscuamperfecto", "spanish": "Indicativo Pluscuamperfecto", "english": "Pluperfect", "mood": "Indicative", "order": 5},
    {"id": "ind_futuro_perfecto", "spanish": "Indicativo Futuro Perfecto", "english": "Future Perfect", "mood": "Indicative", "order": 6},
    {"id": "ind_preterito_anterior", "spanish": "Indicativo Pretérito Anterior", "english": "Preterite Perfect", "mood": "Indicative", "order": 7},
    {"id": "cond_presente", "spanish": "Condicional Presente", "english": "Conditional", "mood": "Conditional", "order": 8},
    {"id": "cond_perfecto", "spanish": "Condicional Perfecto", "english": "Conditional Perfect", "mood": "Conditional", "order": 9},
    {"id": "subj_presente", "spanish": "Subjuntivo Presente", "english": "Present Subjunctive", "mood": "Subjunctive", "order": 10},
    {"id": "subj_imperfecto_1", "spanish": "Subjuntivo Imperfecto I", "english": "Past Subjunctive (ra)", "mood": "Subjunctive", "order": 11},
    {"id": "subj_imperfecto_2", "spanish": "Subjuntivo Imperfecto II", "english": "Past Subjunctive (se)", "mood": "Subjunctive", "order": 12},
    {"id": "subj_perfecto", "spanish": "Subjuntivo Perfecto", "english": "Subjunctive Perfect", "mood": "Subjunctive", "order": 13},
    {"id": "subj_pluscuamperfecto_1", "spanish": "Subjuntivo Pluscuamperfecto I", "english": "Subjunctive Pluperfect (ra)", "mood": "Subjunctive", "order": 14},
    {"id": "subj_pluscuamperfecto_2", "spanish": "Subjuntivo Pluscuamperfecto II", "english": "Subjunctive Pluperfect (se)", "mood": "Subjunctive", "order": 15},
    {"id": "subj_futuro", "spanish": "Subjuntivo Futuro", "english": "Subjunctive Future", "mood": "Subjunctive", "order": 16},
    {"id": "subj_futuro_perfecto", "spanish": "Subjuntivo Futuro Perfecto", "english": "Subjunctive Future Perfect", "mood": "Subjunctive", "order": 17},
    {"id": "imp_afirmativo", "spanish": "Imperativo Afirmativo", "english": "Imperative", "mood": "Imperative", "order": 18},
    {"id": "imp_negativo", "spanish": "Imperativo Negativo", "english": "Negative Imperative", "mood": "Imperative", "order": 19},
]
# Spanish display name -> internal tense id (used to key Conjuu CSV rows).
TENSE_LOOKUP = {t["spanish"]: t["id"] for t in TENSES}
PERSONS = ["yo", "", "él/ella/Ud.", "nosotros", "vosotros", "ellos/ellas/Uds."]
# Regular person endings (indices 0-5) appended to the verb stem, per
# ending class and simple tense. Imperative index 0 is empty on purpose —
# there is no first-person-singular imperative.
# Fix: the -er/-ir preterite third-person-singular ending was missing
# (empty string, which would have produced e.g. "com" instead of "comió");
# it is "ió" (comió, vivió).
ENDINGS = {
    "ar": {
        "ind_presente": ["o", "as", "a", "amos", "áis", "an"],
        "ind_preterito": ["é", "aste", "ó", "amos", "asteis", "aron"],
        "ind_imperfecto": ["aba", "abas", "aba", "ábamos", "abais", "aban"],
        "ind_futuro": ["aré", "arás", "ará", "aremos", "aréis", "arán"],
        "cond_presente": ["aría", "arías", "aría", "aríamos", "aríais", "arían"],
        "subj_presente": ["e", "es", "e", "emos", "éis", "en"],
        "subj_imperfecto_1": ["ara", "aras", "ara", "áramos", "arais", "aran"],
        "subj_imperfecto_2": ["ase", "ases", "ase", "ásemos", "aseis", "asen"],
        "subj_futuro": ["are", "ares", "are", "áremos", "areis", "aren"],
        "imp_afirmativo": ["", "a", "e", "emos", "ad", "en"],
        "imp_negativo": ["", "es", "e", "emos", "éis", "en"],
    },
    "er": {
        "ind_presente": ["o", "es", "e", "emos", "éis", "en"],
        "ind_preterito": ["í", "iste", "ió", "imos", "isteis", "ieron"],
        "ind_imperfecto": ["ía", "ías", "ía", "íamos", "íais", "ían"],
        "ind_futuro": ["eré", "erás", "erá", "eremos", "eréis", "erán"],
        "cond_presente": ["ería", "erías", "ería", "eríamos", "eríais", "erían"],
        "subj_presente": ["a", "as", "a", "amos", "áis", "an"],
        "subj_imperfecto_1": ["iera", "ieras", "iera", "iéramos", "ierais", "ieran"],
        "subj_imperfecto_2": ["iese", "ieses", "iese", "iésemos", "ieseis", "iesen"],
        "subj_futuro": ["iere", "ieres", "iere", "iéremos", "iereis", "ieren"],
        "imp_afirmativo": ["", "e", "a", "amos", "ed", "an"],
        "imp_negativo": ["", "as", "a", "amos", "áis", "an"],
    },
    "ir": {
        "ind_presente": ["o", "es", "e", "imos", "ís", "en"],
        "ind_preterito": ["í", "iste", "ió", "imos", "isteis", "ieron"],
        "ind_imperfecto": ["ía", "ías", "ía", "íamos", "íais", "ían"],
        "ind_futuro": ["iré", "irás", "irá", "iremos", "iréis", "irán"],
        "cond_presente": ["iría", "irías", "iría", "iríamos", "iríais", "irían"],
        "subj_presente": ["a", "as", "a", "amos", "áis", "an"],
        "subj_imperfecto_1": ["iera", "ieras", "iera", "iéramos", "ierais", "ieran"],
        "subj_imperfecto_2": ["iese", "ieses", "iese", "iésemos", "ieseis", "iesen"],
        "subj_futuro": ["iere", "ieres", "iere", "iéremos", "iereis", "ieren"],
        "imp_afirmativo": ["", "e", "a", "amos", "id", "an"],
        "imp_negativo": ["", "as", "a", "amos", "áis", "an"],
    },
}
# Compound tenses: auxiliary haber forms
# Conjugated "haber" for each compound tense, indexed by person 0-5; the
# past participle is appended to each to build the full compound form.
HABER = {
    "ind_perfecto": ["he", "has", "ha", "hemos", "habéis", "han"],
    "ind_pluscuamperfecto": ["había", "habías", "había", "habíamos", "habíais", "habían"],
    "ind_futuro_perfecto": ["habré", "habrás", "habrá", "habremos", "habréis", "habrán"],
    "ind_preterito_anterior": ["hube", "hubiste", "hubo", "hubimos", "hubisteis", "hubieron"],
    "cond_perfecto": ["habría", "habrías", "habría", "habríamos", "habríais", "habrían"],
    "subj_perfecto": ["haya", "hayas", "haya", "hayamos", "hayáis", "hayan"],
    "subj_pluscuamperfecto_1": ["hubiera", "hubieras", "hubiera", "hubiéramos", "hubierais", "hubieran"],
    "subj_pluscuamperfecto_2": ["hubiese", "hubieses", "hubiese", "hubiésemos", "hubieseis", "hubiesen"],
    "subj_futuro_perfecto": ["hubiere", "hubieres", "hubiere", "hubiéremos", "hubiereis", "hubieren"],
}
def get_ending_type(infinitive):
    """Classify an infinitive as an "ar", "er", or "ir" verb.

    Reflexive infinitives (…arse/…erse/…irse) are classified by the
    underlying verb, and -ír (e.g. oír) counts as "ir". Anything
    unrecognized falls back to "ar".
    """
    lowered = infinitive.lower()
    base = lowered[:-2] if lowered.endswith(("arse", "erse", "irse")) else lowered
    if base.endswith("ar"):
        return "ar"
    if base.endswith("er"):
        return "er"
    if base.endswith(("ir", "ír")):
        return "ir"
    return "ar"
def get_stem(infinitive, ending_type):
    """Return the conjugation stem of an infinitive.

    Strips a trailing reflexive "se" first, then drops the two-character
    ending. Every branch of the original reduced to removing the last two
    characters, so this does it unconditionally; `ending_type` is kept for
    interface compatibility.
    """
    base = infinitive.lower()
    if base.endswith("se"):
        base = base[:-2]
    return base[:-2]
def get_participle(infinitive, ending_type):
    """Return the regular past participle: stem + "ado" for -ar verbs,
    stem + "ido" for everything else."""
    suffix = "ado" if ending_type == "ar" else "ido"
    return get_stem(infinitive, ending_type) + suffix
def conjugate_regular(infinitive, tense_id, ending_type):
    """Return the six regular forms of `infinitive` for `tense_id`.

    Compound tenses combine conjugated "haber" with the regular past
    participle. Future and conditional attach person endings to the full
    infinitive (reflexive "se" stripped). All other tenses are
    stem + the ENDINGS table entry. Unknown tense ids yield six empty
    strings.

    Bug fix: the future/conditional branch previously glued the full
    infinitive onto the stem-relative ENDINGS entries via
    rstrip("se")/lstrip(...) char-set stripping (producing e.g.
    "hablarré" / "hablararía"), and the correct fallback code after the
    early return was unreachable.
    """
    if tense_id in HABER:
        participle = get_participle(infinitive, ending_type)
        return [f"{aux} {participle}" for aux in HABER[tense_id]]
    if tense_id in ("ind_futuro", "cond_presente"):
        # Future/conditional stems are the whole infinitive (minus any
        # reflexive "se"), with person endings appended.
        base = infinitive.lower()
        if base.endswith("se"):
            base = base[:-2]
        person_endings = {
            "ind_futuro": ["é", "ás", "á", "emos", "éis", "án"],
            "cond_presente": ["ía", "ías", "ía", "íamos", "íais", "ían"],
        }[tense_id]
        return [base + e for e in person_endings]
    if tense_id in ENDINGS[ending_type]:
        stem = get_stem(infinitive, ending_type)
        return [stem + e for e in ENDINGS[ending_type][tense_id]]
    return [""] * 6
def conjugate_future_cond(infinitive, tense_id, ending_type):
    """Return the six future/conditional forms built on the full
    infinitive (reflexive "se" removed), or None for any other tense id.
    `ending_type` is unused but kept for interface compatibility."""
    person_endings = {
        "ind_futuro": ["é", "ás", "á", "emos", "éis", "án"],
        "cond_presente": ["ía", "ías", "ía", "íamos", "íais", "ían"],
    }.get(tense_id)
    if person_endings is None:
        return None
    stem = infinitive.lower()
    if stem.endswith("se"):
        stem = stem[:-2]
    return [stem + e for e in person_endings]
# ─── Step 1: Load ConjuGato verbs ───
print("Loading ConjuGato data...")
conn = sqlite3.connect(CONJUGATO_DB)
cursor = conn.cursor()
# Verbs
# One row per verb; `Ending` is a numeric code remapped below to
# "ar"/"er"/"ir" (codes 1/2/4 — presumably flag bits; TODO confirm).
cursor.execute("SELECT Id, Rank, Ending, Reflexive, Spanish, English FROM Verb ORDER BY Rank")
conjugato_verbs = {}
for row in cursor.fetchall():
    vid, rank, ending, reflexive, spanish, english = row
    ending_map = {1: "ar", 2: "er", 4: "ir"}
    conjugato_verbs[vid] = {
        "id": vid,
        "rank": rank,
        "ending": ending_map.get(ending, "ar"),
        "reflexive": reflexive,
        "infinitive": spanish,
        "english": english,
    }
# Irregular verb forms
# Fully spelled-out irregular forms, keyed by the encoded VerbFormId
# (decoded later by decode_verb_form_id).
cursor.execute("SELECT VerbFormId, Form FROM IrregularVerbForm ORDER BY VerbFormId")
irregular_forms = {}
for vfid, form in cursor.fetchall():
    irregular_forms[vfid] = form
# Irregular spans
# Character ranges within a form marked as irregular, kept as plain dicts
# and re-keyed in Step 5.
cursor.execute("SELECT Id, VerbFormId, Type, Pattern, Start, End FROM IrregularSpan ORDER BY Id")
irregular_spans = []
for sid, vfid, stype, pattern, start, end in cursor.fetchall():
    irregular_spans.append({
        "verbFormId": vfid,
        "type": stype,
        "pattern": pattern,
        "start": start,
        "end": end,
    })
# Irregularity bitmasks
# Per-verb irregularity flags; all columns after VerbId are kept as a
# column-name -> value dict. Only membership (vid in irregularity_data)
# is consulted later.
cursor.execute("SELECT * FROM Irregularity ORDER BY VerbId")
irregularity_cols = [d[0] for d in cursor.description]
irregularity_data = {}
for row in cursor.fetchall():
    verb_id = row[0]
    irregularity_data[verb_id] = dict(zip(irregularity_cols[1:], row[1:]))
conn.close()
print(f" {len(conjugato_verbs)} verbs, {len(irregular_forms)} irregular forms, {len(irregular_spans)} spans")
# ─── Step 2: Load Conjuu ES conjugations ───
print("Loading Conjuu ES data...")
conjuu_verbs = {}
# Vocabulary.csv: one row per (verb, tense) with six person forms.
# Column layout (as used below): 0 verb, 1 regularity, 2 Spanish tense
# name, 3-8 person forms, 9 English gloss, 13 frequency rank.
with open(CONJUU_VOCAB, 'r') as f:
    for row in csv.reader(f):
        verb_name = row[0]
        tense_spanish = row[2]
        tense_id = TENSE_LOOKUP.get(tense_spanish)
        if not tense_id:
            # Tense name outside our 20-tense table — ignore the row.
            continue
        regularity = row[1]
        forms = row[3:9]  # yo, tú, él, nosotros, vosotros, ellos
        english = row[9]
        rank = int(row[13]) if row[13] else 99999  # missing rank sinks to the end
        key = verb_name.lower()
        if key not in conjuu_verbs:
            conjuu_verbs[key] = {
                "infinitive": verb_name,
                "english": english,
                "rank": rank,
                "tenses": {},
            }
        conjuu_verbs[key]["tenses"][tense_id] = {
            "regularity": regularity,
            "forms": forms,
        }
print(f" {len(conjuu_verbs)} verbs with conjugations")
# ─── Step 3: Load tense guides ───
print("Loading tense guides...")
# Guide.strings is an old-style plist; convert it to XML via plutil so
# plistlib can parse it.
result = subprocess.run(['plutil', '-convert', 'xml1', '-o', '-', CONJUU_GUIDE], capture_output=True)
guide_data = plistlib.loads(result.stdout)
tense_guides = {}
# Keys look like "LL<TenseName>GuideTop" / "LL<TenseName>GuideBottom";
# collect the Top/Bottom pair for each tense name.
for key, value in guide_data.items():
    m = re.match(r'LL(.+)Guide(Top|Bottom)', key)
    if m:
        tense_name = m.group(1)
        part = m.group(2)
        if tense_name not in tense_guides:
            tense_guides[tense_name] = {}
        tense_guides[tense_name][part] = value
guides_output = []
for t in TENSES:
    # Conjuu's guide keys drop the mood prefix, so strip it before matching.
    guide_key = t["spanish"].replace("Indicativo ", "").replace("Condicional ", "").replace("Subjuntivo ", "").replace("Imperativo ", "")
    # Try exact match first, then various key patterns
    guide = None
    for gk, gv in tense_guides.items():
        if gk == guide_key or gk == t["spanish"] or gk.replace(" ", "") == guide_key.replace(" ", ""):
            guide = gv
            break
    if not guide:
        # Try partial match
        for gk, gv in tense_guides.items():
            if guide_key.lower() in gk.lower() or gk.lower() in guide_key.lower():
                guide = gv
                break
    # No guide found -> English tense name as title with an empty body.
    guides_output.append({
        "tenseId": t["id"],
        "title": guide.get("Top", t["english"]) if guide else t["english"],
        "body": guide.get("Bottom", "") if guide else "",
    })
print(f" {len(guides_output)} tense guides")
# ─── Step 4: Load difficulty levels ───
print("Loading difficulty levels...")
# Conjuu's curated difficulty buckets, one CSV per level; the first CSV
# column is the verb name. A verb appearing in multiple files keeps the
# level from the file processed last.
level_files = [
    ("basic", "Basic.csv"),
    ("elementary_1", "Elementary-1.csv"),
    ("elementary_2", "Elementary-2.csv"),
    ("elementary_3", "Elementary-3.csv"),
    ("intermediate_1", "Intermediate-1.csv"),
    ("intermediate_2", "Intermediate-2.csv"),
    ("intermediate_3", "Intermediate-3.csv"),
    ("intermediate_4", "Intermediate-4.csv"),
]
level_verbs = {}
for level_id, filename in level_files:
    path = os.path.join(CONJUU_LEVELS, filename)
    with open(path, 'r') as f:
        for row in csv.reader(f):
            level_verbs[row[0].lower()] = level_id
print(f" {len(level_verbs)} verbs with curated levels")
# ─── Step 5: Merge everything ───
print("Merging data...")
# Map ConjuGato VerbFormId encoding
# VerbFormId = (1000 + VerbId) * 10000 + MTPP
# M: 1=Indicative, 2=Subjunctive, 3=Imperative
# T: tense within mood
# PP: person (01-08; 07/08 appear to be voseo forms and are skipped later)
CONJUGATO_TENSE_MAP = {
    # (mood, tense) -> tense_id
    (1, 1): "ind_presente",
    (1, 2): "ind_preterito",
    (1, 3): "ind_imperfecto",
    (1, 6): "cond_presente",
    (1, 7): "ind_futuro",
    (2, 1): "subj_presente",
    (2, 3): "subj_imperfecto_1",
    (2, 4): "subj_imperfecto_2",
    (2, 7): "subj_futuro",
    (3, 0): "imp_afirmativo",  # person-specific
}
# NOTE(review): no compound tenses here — presumably ConjuGato stores only
# simple-tense irregular forms, and compound forms are always generated
# from haber + participle. Confirm against the source DB.
def decode_verb_form_id(vfid):
    """Decode a ConjuGato VerbFormId into (verb_id, tense_id, person_index).

    The id is 8 digits, VVVV M T PP: VVVV = 1000 + verb id, M = mood
    (1 indicative, 2 subjunctive, 3 imperative), T = tense within mood,
    PP = person 01-06 (07/08 are voseo forms and map to None).
    Returns (None, None, None) when vfid is not 8 digits; tense_id is None
    for (mood, tense) pairs absent from CONJUGATO_TENSE_MAP.

    Bug fix: negative imperatives are encoded with tense digit 8 (the
    encoder writes ...*10000 + 3800 + person). The old check
    `person >= 800` could never fire because the person field is only two
    digits, so every imperative decoded as affirmative.
    """
    s = str(vfid)
    if len(s) != 8:
        return None, None, None
    verb_id = int(s[:4]) - 1000
    mood = int(s[4])
    tense_num = int(s[5])
    person = int(s[6:8])
    # Handle imperative: tense digit 8 marks the negative form.
    if mood == 3:
        tense_id = "imp_negativo" if tense_num == 8 else "imp_afirmativo"
    else:
        tense_id = CONJUGATO_TENSE_MAP.get((mood, tense_num))
    if 1 <= person <= 6:
        person_idx = person - 1
    else:
        person_idx = None  # vos/voseo (07/08) or out of range — skip for now
    return verb_id, tense_id, person_idx
def assign_level(rank):
    """Map a frequency rank to a difficulty bucket.

    Used as the fallback when a verb has no curated Conjuu level.
    """
    thresholds = (
        (25, "basic"),
        (100, "elementary"),
        (300, "intermediate"),
        (700, "advanced"),
    )
    for ceiling, level in thresholds:
        if rank <= ceiling:
            return level
    return "expert"
# Build unified verb list
# ConjuGato is the master list (all verbs, ranked); Conjuu ES supplies
# pre-verified conjugation tables where available. Verbs without Conjuu
# data get forms generated from the regular tables, patched with
# ConjuGato's spelled-out irregular forms.
all_verbs = []
verb_forms = []
spans_output = []
for vid, cv in sorted(conjugato_verbs.items(), key=lambda x: x[1]["rank"]):
    infinitive = cv["infinitive"]
    inf_lower = infinitive.lower()
    ending = cv["ending"]
    rank = cv["rank"]
    # Check Conjuu ES for this verb
    conjuu = conjuu_verbs.get(inf_lower)
    # Determine level: curated Conjuu level wins, else rank-based bucket.
    level = level_verbs.get(inf_lower, assign_level(rank))
    verb_entry = {
        "id": vid,
        "infinitive": infinitive,
        "english": cv["english"],
        "rank": rank,
        "ending": ending,
        "reflexive": cv["reflexive"],
        "level": level,
        "hasConjuuData": conjuu is not None,
    }
    all_verbs.append(verb_entry)
    # Generate forms for each tense
    for tense in TENSES:
        tid = tense["id"]
        if conjuu and tid in conjuu["tenses"]:
            # Use Conjuu ES data (pre-computed)
            td = conjuu["tenses"][tid]
            forms = td["forms"]
            regularity = td["regularity"]
        else:
            # Generate from rules or ConjuGato irregular forms
            regularity = "ordinary"
            # Check if we have irregular forms from ConjuGato
            has_irregular = vid in irregularity_data
            if tid in HABER:
                # Compound tense: conjugated haber + regular participle.
                participle = get_participle(infinitive, ending)
                # NOTE(review): despite the original "check for irregular
                # participle" comment, no such lookup happens — compound
                # tenses always use the regular participle (e.g. "hacido"
                # rather than "hecho"). TODO confirm intended.
                forms = [f"{aux} {participle}" for aux in HABER[tid]]
                regularity = "ordinary"
            elif tid in ("ind_futuro", "cond_presente"):
                # Future/conditional use full infinitive as stem
                base = infinitive.lower()
                if base.endswith("se"):
                    base = base[:-2]
                endings_map = {
                    "ind_futuro": ["é", "ás", "á", "emos", "éis", "án"],
                    "cond_presente": ["ía", "ías", "ía", "íamos", "íais", "ían"],
                }
                forms = [base + e for e in endings_map[tid]]
                # Check for irregular future/conditional stems from ConjuGato
                if has_irregular:
                    # Encode the VerbFormId the same way the DB does
                    # (VVVV M T PP — see decode_verb_form_id) and override
                    # any person that has a stored irregular form.
                    for pi in range(6):
                        mood_tense = (1, 7) if tid == "ind_futuro" else (1, 6)
                        vfid = (1000 + vid) * 10000 + mood_tense[0] * 1000 + mood_tense[1] * 100 + (pi + 1)
                        if vfid in irregular_forms:
                            forms[pi] = irregular_forms[vfid]
                            regularity = "irregular"
            else:
                # Simple tense: stem + regular endings table.
                stem = get_stem(infinitive, ending)
                if tid in ENDINGS.get(ending, {}):
                    forms = [stem + e for e in ENDINGS[ending][tid]]
                else:
                    # No regular table for this tense — blanks are
                    # filtered out when records are emitted below.
                    forms = [""] * 6
                # Override with ConjuGato irregular forms
                if has_irregular:
                    # (mood, tense) digits for each simple tense that
                    # ConjuGato stores irregular forms for.
                    mood_map = {
                        "ind_presente": (1, 1), "ind_preterito": (1, 2),
                        "ind_imperfecto": (1, 3),
                        "subj_presente": (2, 1), "subj_imperfecto_1": (2, 3),
                        "subj_imperfecto_2": (2, 4), "subj_futuro": (2, 7),
                    }
                    if tid in mood_map:
                        mt = mood_map[tid]
                        for pi in range(6):
                            vfid = (1000 + vid) * 10000 + mt[0] * 1000 + mt[1] * 100 + (pi + 1)
                            if vfid in irregular_forms:
                                forms[pi] = irregular_forms[vfid]
                                regularity = "irregular"
                    elif tid == "imp_afirmativo":
                        # Imperative affirmative: mood digit 3, tense digit 0.
                        for pi in range(6):
                            vfid = (1000 + vid) * 10000 + 3000 + (pi + 1)
                            if vfid in irregular_forms:
                                forms[pi] = irregular_forms[vfid]
                                regularity = "irregular"
                    elif tid == "imp_negativo":
                        # Imperative negative: mood digit 3, tense digit 8.
                        for pi in range(6):
                            vfid = (1000 + vid) * 10000 + 3800 + (pi + 1)
                            if vfid in irregular_forms:
                                forms[pi] = irregular_forms[vfid]
                                regularity = "irregular"
        # Emit one record per non-empty person form.
        for pi, form in enumerate(forms):
            if form:
                verb_forms.append({
                    "verbId": vid,
                    "tenseId": tid,
                    "personIndex": pi,
                    "form": form,
                    "regularity": regularity,
                })
# Build spans referencing verb forms
print("Processing irregular spans...")
# Re-key each ConjuGato span from its encoded VerbFormId to
# (verbId, tenseId, personIndex). Spans that cannot be decoded (wrong id
# length, unmapped tense, voseo persons) or that reference an unknown
# verb are dropped.
for span in irregular_spans:
    vfid = span["verbFormId"]
    verb_id, tense_id, person_idx = decode_verb_form_id(vfid)
    if verb_id is None or tense_id is None or person_idx is None:
        continue
    if verb_id not in conjugato_verbs:
        continue
    spans_output.append({
        "verbId": verb_id,
        "tenseId": tense_id,
        "personIndex": person_idx,
        "type": span["type"],
        "pattern": span["pattern"],
        "start": span["start"],
        "end": span["end"],
    })
# ─── Step 6: Output ───
print("Writing output...")
output = {
    "tenses": TENSES,
    "persons": PERSONS,
    "verbs": all_verbs,
    "verbForms": verb_forms,
    "irregularSpans": spans_output,
    "tenseGuides": guides_output,
}
# Compact JSON (indent=None) — this file ships with the app.
with open(OUTPUT, 'w', encoding='utf-8') as f:
    json.dump(output, f, ensure_ascii=False, indent=None)
# Also write a pretty version for debugging
# (counts plus a small data sample, human-readable).
with open(OUTPUT.replace('.json', '_debug.json'), 'w', encoding='utf-8') as f:
    json.dump({
        "stats": {
            "verbs": len(all_verbs),
            "verbForms": len(verb_forms),
            "irregularSpans": len(spans_output),
            "tenseGuides": len(guides_output),
        },
        "sampleVerb": all_verbs[0] if all_verbs else None,
        "sampleForms": verb_forms[:20],
    }, f, ensure_ascii=False, indent=2)
file_size = os.path.getsize(OUTPUT) / (1024 * 1024)
print(f"\nDone!")
print(f" Verbs: {len(all_verbs)}")
print(f" Verb forms: {len(verb_forms)}")
print(f" Irregular spans: {len(spans_output)}")
print(f" Tense guides: {len(guides_output)}")
print(f" Output: {OUTPUT} ({file_size:.1f} MB)")

View File

@@ -0,0 +1,453 @@
#!/usr/bin/env python3
"""
Scrape 7 LanGo Spanish course packs from Brainscape, plus example sentences
from SpanishDict. Outputs all_courses_data.json with all courses, decks, cards,
and examples organized by week.
"""
import asyncio
import json
import re
import os
from playwright.async_api import async_playwright
BASE_URL = "https://www.brainscape.com"
# Output JSON; also serves as the resume checkpoint between runs.
OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/all_courses_data.json"
# Maximum example sentences collected per vocabulary word.
MAX_EXAMPLES = 3
# The seven LanGo Spanish course packs to scrape.
PACK_URLS = [
    "https://www.brainscape.com/packs/lango-spanish-beginner-ii-16514996",
    "https://www.brainscape.com/packs/lango-spanish-beginner-iii-conversation-18477688",
    "https://www.brainscape.com/packs/lango-spanish-intermediate-i-21508666",
    "https://www.brainscape.com/packs/lango-spanish-intermediate-ii-21906841",
    "https://www.brainscape.com/packs/lango-spanish-intermediate-iii-spanish-through-stories-20677744",
    "https://www.brainscape.com/packs/lango-spanish-advanced-i-21511244",
    "https://www.brainscape.com/packs/lango-spanish-advanced-ii-21649461",
]
# Desktop Chrome user-agent string used for all page loads.
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
# ---------------------------------------------------------------------------
# Parsing helpers (copied from scrape_brainscape.py and scrape_examples.py)
# ---------------------------------------------------------------------------
def parse_title_and_week(text):
    """Split "Week N: Title" / "Semana N Title" into (week, title).

    Returns (0, stripped text) when no week prefix is present.
    """
    match = re.match(r'(?:Week|Semana)\s+(\d+)[:\s]+(.+)', text, re.IGNORECASE)
    if match is None:
        return 0, text.strip()
    week_num, title = match.groups()
    return int(week_num), title.strip()
def parse_cards(text):
    """Parse flashcard Q/A pairs from page text.

    The page renders cards as a numbered list: a line with the card
    number, then the front and back text, with UI chrome interleaved.
    Scans for a bare-number line, then gathers up to six following
    content lines (stopping at the next card number or a section marker);
    the first two gathered lines become the card's front and back.
    """
    cards = []
    lines = text.split('\n')
    # Known UI chrome lines that never belong to a card.
    skip = {'Q', 'A', 'Study These Flashcards', '', 'Brainscape', 'Find Flashcards',
            'Make Flashcards', 'How It Works', 'Educators', 'Businesses', 'Academy',
            'Log in', 'Get Started'}
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if re.match(r'^\d+$', line):
            num = int(line)
            parts = []
            j = i + 1
            while j < len(lines) and len(parts) < 6:
                nextline = lines[j].strip()
                # Stop at the next card's number...
                if re.match(r'^\d+$', nextline) and int(nextline) == num + 1:
                    break
                # ...or at deck/class navigation sections.
                if nextline.startswith('LanGo Spanish') or nextline.startswith('Decks in class'):
                    break
                if re.match(r'^(?:Week|Semana) \d+', nextline):
                    break
                if nextline in skip:
                    j += 1
                    continue
                parts.append(nextline)
                j += 1
            if len(parts) >= 2:
                cards.append({
                    "front": parts[0],
                    "back": parts[1],
                })
            i = j
        else:
            i += 1
    # Drop leftover navigation rows that slipped through as card fronts.
    cards = [c for c in cards if not re.match(r'^(?:Week|Semana) \d+', c['front'])
             and c['front'] not in ('Decks in class (39)', '# Cards')
             and not c['front'].startswith('LanGo Spanish')
             and not c['front'].startswith('You may prefer')]
    return cards
def extract_word_for_lookup(front):
    """Reduce a card front to a single lowercase dictionary headword.

    Strips a leading Spanish article (plain or paired like "el/la"),
    then keeps only the text before the first comma or slash.
    """
    word = front.strip()
    word = re.sub(r'^(el|la|los|las|un|una)\s+', '', word, flags=re.IGNORECASE)
    word = re.sub(r'^(el/la|los/las)\s+', '', word, flags=re.IGNORECASE)
    for separator in (',', '/'):
        if separator in word:
            word = word.split(separator)[0].strip()
    return word.lower().strip()
def parse_examples(text, lookup_word):
    """Parse example sentences from SpanishDict page text.

    Heuristic line scan; collects at most MAX_EXAMPLES dicts of
    {"es": ..., "en": ...}. Two patterns are recognized: Spanish and
    English fused on one line (split at sentence-ending punctuation
    before a capital), or a Spanish line containing the word followed
    within two lines by an English-looking line.
    """
    examples = []
    lines = text.split('\n')
    for i, line in enumerate(lines):
        l = line.strip()
        if not l or len(l) < 15:
            continue
        # Case 1: "Spanish sentence.English sentence" run together.
        inline_match = re.match(r'^(.+?[.!?])([A-Z].+)$', l)
        if inline_match:
            es = inline_match.group(1).strip()
            en = inline_match.group(2).strip()
            if lookup_word.lower() in es.lower() and len(es) > 10 and len(en) > 5:
                examples.append({"es": es, "en": en})
                if len(examples) >= MAX_EXAMPLES:
                    break
            continue
        # Case 2: Spanish candidate line; look ahead up to two lines for
        # its translation. Only the first non-empty lookahead line is
        # examined (the trailing break below exits either way).
        if lookup_word.lower() in l.lower() and len(l) > 15 and len(l) < 300:
            for j in range(i + 1, min(i + 3, len(lines))):
                next_l = lines[j].strip()
                if not next_l:
                    continue
                # English heuristic: starts uppercase and contains no
                # Spanish-specific characters.
                if (next_l[0].isupper() and
                        not any(c in next_l for c in ['á', 'é', 'í', 'ó', 'ú', 'ñ', '¿', '¡'])):
                    examples.append({"es": l, "en": next_l})
                    if len(examples) >= MAX_EXAMPLES:
                        break
                break
        if len(examples) >= MAX_EXAMPLES:
            break
    return examples
# ---------------------------------------------------------------------------
# Scraping logic
# ---------------------------------------------------------------------------
async def discover_deck_urls(page, pack_url):
    """Visit a pack page and discover all deck URLs within it.

    Returns (course_name, deck_urls) where deck_urls are site-relative
    paths in page order. Scrolls repeatedly so lazily-loaded links render.
    """
    print(f"\nDiscovering decks in {pack_url}...")
    await page.goto(pack_url, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)
    # Scroll to load all content
    for _ in range(10):
        await page.evaluate("window.scrollBy(0, 1000)")
        await page.wait_for_timeout(300)
    # Extract pack ID from URL
    # NOTE(review): pack_id is computed but never used below — dead code?
    pack_id = pack_url.rstrip('/').split('-')[-1]
    # Find all deck links matching /flashcards/*/packs/*
    links = await page.eval_on_selector_all(
        'a[href*="/flashcards/"]',
        'els => els.map(e => e.getAttribute("href"))'
    )
    deck_urls = []
    seen = set()
    for href in links:
        if href and '/flashcards/' in href and '/packs/' in href:
            # Normalize absolute links to site-relative; dedupe in order.
            if href.startswith('http'):
                href = href.replace(BASE_URL, '')
            if href not in seen:
                seen.add(href)
                deck_urls.append(href)
    # Extract course name from the page
    text = await page.inner_text("body")
    course_name = None
    # Try to find "LanGo Spanish | ..." pattern
    m = re.search(r'(LanGo Spanish\s*\|\s*[^>\n]+)', text)
    if m:
        course_name = m.group(1).strip()
        # Clean trailing noise
        course_name = re.sub(r'\s*>\s*$', '', course_name).strip()
        # Remove "Flashcards" suffix if present
        course_name = re.sub(r'\s*Flashcards\s*$', '', course_name).strip()
    else:
        # Fallback: derive from URL slug
        slug = pack_url.rstrip('/').split('/')[-1]
        slug = re.sub(r'-\d+$', '', slug)
        course_name = slug.replace('-', ' ').title()
    print(f" Course: {course_name}")
    print(f" Found {len(deck_urls)} deck URLs")
    return course_name, deck_urls
async def scrape_deck(page, url):
    """Scrape a single deck page for flashcard data.

    `url` may be site-relative or absolute. Returns a dict with the week
    number, cleaned title, reversed-deck flag, parsed cards, and the
    original URL.
    """
    full_url = BASE_URL + url if url.startswith('/') else url
    await page.goto(full_url, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)
    # Scroll so lazily-rendered cards end up in the page text.
    for _ in range(5):
        await page.evaluate("window.scrollBy(0, 1000)")
        await page.wait_for_timeout(300)
    text = await page.inner_text("body")
    # Extract title — handle both "Week N:" and "Semana N" patterns
    title_match = re.search(r'>\s*((?:Week|Semana)\s+\d+[:\s].+?)\s*>\s*Flashcards', text)
    if title_match:
        raw_title = title_match.group(1).strip()
    else:
        # Breadcrumb not found — try a plain heading, then reconstruct a
        # title from the URL slug as a last resort.
        heading_match = re.search(r'((?:Week|Semana)\s+\d+[:\s].+?)\s*Flashcards', text)
        if heading_match:
            raw_title = heading_match.group(1).strip()
        else:
            slug = url.split('/')[2] if len(url.split('/')) > 2 else url
            slug_clean = re.sub(r'-\d+$', '', slug)
            slug_clean = re.sub(r'-al-rev(e|é)s$', ' AL REVÉS', slug_clean)
            raw_title = slug_clean.replace('-', ' ').title()
    # Force a "Week 0" prefix so parse_title_and_week always matches.
    # NOTE(review): this check only matches "Week", not "Semana", so
    # Semana-titled decks get prefixed and end up as week 0 — TODO confirm.
    wm = re.match(r'Week\s+(\d+)', raw_title, re.IGNORECASE)
    if not wm:
        raw_title = "Week 0: " + raw_title
    week, title = parse_title_and_week(raw_title)
    cards = parse_cards(text)
    # "AL REVÉS" decks are the reversed (English -> Spanish) variants.
    is_reversed = "al rev" in url.lower() or "AL REVÉS" in raw_title.upper()
    return {
        "week": week,
        "title": title,
        "isReversed": is_reversed,
        "cardCount": len(cards),
        "cards": cards,
        "url": url,
    }
async def scrape_examples_for_word(page, lookup):
    """Scrape example sentences from SpanishDict for a single word.

    Best-effort: any navigation or parsing failure yields an empty list
    so one bad word never aborts the run.
    """
    url = f"https://www.spanishdict.com/translate/{lookup}"
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=15000)
        await page.wait_for_timeout(2000)
        text = await page.inner_text("body")
        return parse_examples(text, lookup)
    except Exception:
        # Deliberate swallow: skip words whose page fails to load.
        return []
def save_progress(data):
    """Write the accumulated scrape data to OUTPUT as pretty JSON.

    Called after every course so an interrupted run can be resumed.
    """
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    with open(OUTPUT, 'w', encoding='utf-8') as sink:
        sink.write(serialized)
def load_progress():
    """Return previously saved scrape data from OUTPUT.

    Yields None when the file is absent or cannot be parsed as JSON,
    so a fresh run starts from scratch.
    """
    if not os.path.exists(OUTPUT):
        return None
    try:
        with open(OUTPUT) as source:
            return json.load(source)
    except (json.JSONDecodeError, KeyError):
        return None
async def main():
    """Two-phase scraper with crash-safe resume.

    Phase 1 discovers and scrapes every deck in each course pack;
    Phase 2 fetches SpanishDict example sentences for each unique word.
    Progress is written to OUTPUT after every completed course and
    every 20 example lookups so the script can be re-run and resume.
    """
    # ---------------------------------------------------------------
    # Resume: reload previously saved progress.  save_progress() only
    # runs after a course's decks are fully scraped, so any course
    # present in the file is Phase-1 complete.  (The old
    # `_examples_done` flag was never saved as True — it was popped
    # before the final save — which silently disabled resume entirely;
    # completion is now inferred from presence instead.)
    # ---------------------------------------------------------------
    existing = load_progress()
    completed_courses = set()
    examples_done = {}  # lookup -> examples list
    if existing and 'courses' in existing:
        for course in existing['courses']:
            course.pop('_examples_done', None)  # strip legacy flag from older runs
            completed_courses.add(course['course'])
            # Collect already-scraped examples so Phase 2 can skip them
            for week in course.get('weeks', []):
                for deck in week.get('decks', []):
                    for card in deck.get('cards', []):
                        if card.get('examples'):
                            lookup = extract_word_for_lookup(card['front'])
                            examples_done[lookup] = card['examples']
    print(f"Loaded progress: {len(completed_courses)} completed courses, {len(examples_done)} words with examples")
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent=USER_AGENT)
        page = await context.new_page()
        all_courses = []
        # Carry previously completed courses straight into the output.
        if existing and 'courses' in existing:
            for course in existing['courses']:
                if course['course'] in completed_courses:
                    all_courses.append(course)
        # ---------------------------------------------------------------
        # Phase 1: Discover decks and scrape cards for each course pack
        # ---------------------------------------------------------------
        for pack_url in PACK_URLS:
            course_name, deck_urls = await discover_deck_urls(page, pack_url)
            # Skip if already completed
            if course_name in completed_courses:
                print(f" Skipping {course_name} (already completed)")
                continue
            await page.wait_for_timeout(300)
            all_decks = []
            total_cards = 0
            for i, deck_url in enumerate(deck_urls):
                slug = deck_url.split('/')[2] if len(deck_url.split('/')) > 2 else deck_url
                print(f" [{i+1}/{len(deck_urls)}] Scraping {slug[:60]}...")
                try:
                    deck = await scrape_deck(page, deck_url)
                    all_decks.append(deck)
                    total_cards += deck["cardCount"]
                    print(f" -> Week {deck['week']}: {deck['title']} ({deck['cardCount']} cards)")
                except Exception as e:
                    print(f" ERROR: {e}")
                # Politeness delay between deck requests.
                await page.wait_for_timeout(300)
            # Organize by week
            weeks = {}
            for deck in all_decks:
                w = deck["week"]
                if w not in weeks:
                    weeks[w] = []
                weeks[w].append({
                    "title": deck["title"],
                    "isReversed": deck["isReversed"],
                    "cardCount": deck["cardCount"],
                    "cards": deck["cards"],
                })
            course_data = {
                "course": course_name,
                "totalDecks": len(all_decks),
                "totalCards": total_cards,
                "weeks": [
                    {"week": w, "decks": weeks[w]}
                    for w in sorted(weeks.keys())
                ],
            }
            all_courses.append(course_data)
            # Save after each course so a crash never loses a finished course
            save_progress({"courses": all_courses})
            print(f" Saved {course_name}: {len(all_decks)} decks, {total_cards} cards")
        # ---------------------------------------------------------------
        # Phase 2: Scrape example sentences from SpanishDict
        # ---------------------------------------------------------------
        print("\n" + "=" * 60)
        print("Phase 2: Scraping example sentences from SpanishDict")
        print("=" * 60)
        # Collect all unique words across all courses (non-reversed decks)
        unique_words = {}  # lookup -> original front
        for course in all_courses:
            for week in course['weeks']:
                for deck in week['decks']:
                    if deck.get('isReversed'):
                        continue
                    for card in deck['cards']:
                        front = card['front']
                        lookup = extract_word_for_lookup(front)
                        if lookup and lookup not in unique_words:
                            unique_words[lookup] = front
        print(f"Found {len(unique_words)} unique words to look up")
        print(f"Already have examples for {len(examples_done)} words")
        words_scraped = 0
        total_words = len(unique_words)
        for i, (lookup, original) in enumerate(unique_words.items()):
            if lookup in examples_done:
                continue
            print(f"[{i+1}/{total_words}] {lookup}...", end=" ", flush=True)
            try:
                examples = await scrape_examples_for_word(page, lookup)
                examples_done[lookup] = examples
                if examples:
                    print(f"{len(examples)} examples")
                else:
                    print("no examples")
            except Exception as e:
                print(f"error: {e}")
                examples_done[lookup] = []
            words_scraped += 1
            # Save progress every 20 words
            if words_scraped % 20 == 0:
                # Attach examples to cards before saving
                _attach_examples(all_courses, examples_done)
                save_progress({"courses": all_courses})
                print(f" [saved progress - {len(examples_done)} words done]")
            await page.wait_for_timeout(300)
        await browser.close()
    # ---------------------------------------------------------------
    # Final: attach all examples to cards and save
    # ---------------------------------------------------------------
    _attach_examples(all_courses, examples_done)
    save_progress({"courses": all_courses})
    total_decks = sum(c['totalDecks'] for c in all_courses)
    total_cards = sum(c['totalCards'] for c in all_courses)
    print(f"\nDone! {len(all_courses)} courses, {total_decks} decks, {total_cards} cards")
    print(f"Examples scraped for {len(examples_done)} unique words")
    print(f"Output: {OUTPUT}")
def _attach_examples(courses, examples_done):
    """Attach scraped examples to card objects in place.

    Cards whose lookup word has no scraped examples get an empty
    ``examples`` list unless they already carry one.
    """
    for course_entry in courses:
        for week_entry in course_entry['weeks']:
            for deck_entry in week_entry['decks']:
                for card_entry in deck_entry['cards']:
                    key = extract_word_for_lookup(card_entry['front'])
                    scraped = examples_done.get(key)
                    if scraped:
                        card_entry['examples'] = scraped
                    else:
                        card_entry.setdefault('examples', [])
# Entry point: run the two-phase scrape (decks, then example sentences).
if __name__ == "__main__":
    asyncio.run(main())

# --- file boundary (diff-viewer artifact: "View File" / "@@ -0,0 +1,238 @@") ---
#!/usr/bin/env python3
"""
Scrape all 39 LanGo Spanish Beginner I decks from Brainscape using Playwright.
Outputs course_data.json with all decks and cards organized by week.
"""
import asyncio
import json
import re
from playwright.async_api import async_playwright
# Brainscape site root; deck paths in DECK_URLS are appended to this.
BASE_URL = "https://www.brainscape.com"
# Numeric id of the course pack being scraped (also embedded in each deck URL).
# NOTE(review): not referenced elsewhere in this file — confirm before removing.
PACK_ID = "18164266"
# Absolute path where the scraped course JSON is written.
OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_data.json"
# All 39 deck URLs of the "LanGo Spanish | Beginner I" pack, relative to
# BASE_URL: 8 weeks of topics, most paired with an "al-reves" (reversed)
# deck that drills the same vocabulary with front/back swapped.
DECK_URLS = [
    "/flashcards/week-1-greetings-los-saludos-10176532/packs/18164266",
    "/flashcards/week-1-greetings-los-saludos-al-reves-12745728/packs/18164266",
    "/flashcards/week-2-adjectives-los-adjetivos-12745741/packs/18164266",
    "/flashcards/week-2-adjectives-los-adjetivos-al-reves-12745829/packs/18164266",
    "/flashcards/week-2-numbers-los-numeros-12797877/packs/18164266",
    "/flashcards/week-2-numbers-los-numeros-al-reves-13698219/packs/18164266",
    "/flashcards/week-2-professions-las-profesiones-12740531/packs/18164266",
    "/flashcards/week-2-professions-las-profesiones-al-re-12745832/packs/18164266",
    "/flashcards/week-3-house-la-casa-10216249/packs/18164266",
    "/flashcards/week-3-house-la-casa-al-reves-12745837/packs/18164266",
    "/flashcards/week-3-ar-verbs-10207117/packs/18164266",
    "/flashcards/week-3-ar-verbs-al-reves-12745833/packs/18164266",
    "/flashcards/week-3-er-verbs-12745857/packs/18164266",
    "/flashcards/week-3-er-verbs-al-reves-12745888/packs/18164266",
    "/flashcards/week-3-ir-verbs-10207120/packs/18164266",
    "/flashcards/week-3-ir-verbs-al-reves-12745835/packs/18164266",
    "/flashcards/week-4-family-la-familia-10266419/packs/18164266",
    "/flashcards/week-4-family-la-familia-al-reves-12745978/packs/18164266",
    "/flashcards/week-4-e-ie-stem-changing-verbs-10270069/packs/18164266",
    "/flashcards/week-4-e-ie-stem-changing-verbs-al-reves-12749152/packs/18164266",
    "/flashcards/week-4-e-i-stem-changing-verbs-10270070/packs/18164266",
    "/flashcards/week-4-e-i-stem-changing-verbs-al-reves-12749160/packs/18164266",
    "/flashcards/week-4-o-ue-stem-changing-verbs-10270071/packs/18164266",
    "/flashcards/week-4-o-ue-stem-changing-verbs-al-reves-12749172/packs/18164266",
    "/flashcards/week-4-exceptional-yo-forms-10286213/packs/18164266",
    "/flashcards/week-4-exceptional-yo-forms-al-reves-12749234/packs/18164266",
    "/flashcards/week-5-reflexive-verbs-los-verbos-reflex-10270072/packs/18164266",
    "/flashcards/week-5-reflexive-verbs-los-verbos-reflex-12745842/packs/18164266",
    "/flashcards/week-5-daily-routine-la-rutina-cotidiana-11869082/packs/18164266",
    "/flashcards/week-5-daily-routine-la-rutina-cotidiana-12745840/packs/18164266",
    "/flashcards/week-6-city-la-ciudad-10232784/packs/18164266",
    "/flashcards/week-6-city-la-ciudad-al-reves-12745942/packs/18164266",
    "/flashcards/week-6-time-expressions-las-expresiones-12797878/packs/18164266",
    "/flashcards/week-6-time-expressions-las-expresiones-13698220/packs/18164266",
    "/flashcards/week-7-idioms-with-the-verb-tener-los-mo-11951594/packs/18164266",
    "/flashcards/week-8-prepositions-and-negation-las-pre-11951441/packs/18164266",
    "/flashcards/week-8-prepositions-and-negation-las-pre-16094943/packs/18164266",
    "/flashcards/week-8-hobbies-los-pasatiempos-10232782/packs/18164266",
    "/flashcards/week-8-hobbies-los-pasatiempos-al-reves-12745838/packs/18164266",
]
def parse_title_and_week(text):
    """Split a "Week N: Title" heading into ``(week_number, title)``.

    Returns ``(0, <stripped text>)`` when the heading carries no
    recognizable week prefix.
    """
    match = re.match(r'Week\s+(\d+):\s*(.+)', text, re.IGNORECASE)
    if match is None:
        return 0, text.strip()
    week_no, title = match.groups()
    return int(week_no), title.strip()
def parse_cards(text):
    """Extract ``{"front", "back"}`` flashcard pairs from raw page text.

    Cards appear as a numbered list; each number is followed by Q/A
    content lines interleaved with site chrome, which is filtered out.
    """
    # Site-chrome lines that never belong to card content.
    noise = {'Q', 'A', 'Study These Flashcards', '', 'Brainscape', 'Find Flashcards',
             'Make Flashcards', 'How It Works', 'Educators', 'Businesses', 'Academy',
             'Log in', 'Get Started'}
    number_re = re.compile(r'^\d+$')
    week_re = re.compile(r'^Week \d+:')
    rows = text.split('\n')
    total = len(rows)
    found = []
    idx = 0
    while idx < total:
        current = rows[idx].strip()
        if not number_re.match(current):
            idx += 1
            continue
        card_no = int(current)
        # Gather up to 6 content lines until the next card begins.
        content = []
        cursor = idx + 1
        while cursor < total and len(content) < 6:
            candidate = rows[cursor].strip()
            # The next card's number ends this card's content.
            if number_re.match(candidate) and int(candidate) == card_no + 1:
                break
            # Deck list / footer marks the end of the card area.
            if candidate.startswith('LanGo Spanish') or candidate.startswith('Decks in class'):
                break
            # Another deck's title leaking in also ends the card.
            if week_re.match(candidate):
                break
            if candidate in noise:
                cursor += 1
                continue
            content.append(candidate)
            cursor += 1
        if len(content) >= 2:
            found.append({"front": content[0], "back": content[1]})
        idx = cursor
    # Post-filter: drop entries that are really deck titles or page chrome.
    return [c for c in found
            if not week_re.match(c['front'])
            and c['front'] not in ('Decks in class (39)', '# Cards')
            and not c['front'].startswith('LanGo Spanish')
            and not c['front'].startswith('You may prefer')]
async def scrape_deck(page, url):
    """Scrape a single Brainscape deck page into a metadata + cards dict.

    Falls back through several title-extraction strategies (breadcrumb,
    page heading, URL slug) and defaults to "Week 0" when no week number
    can be found in the slug.
    """
    full_url = BASE_URL + url
    await page.goto(full_url, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)
    # Scroll to load lazy content
    for _ in range(5):
        await page.evaluate("window.scrollBy(0, 1000)")
        await page.wait_for_timeout(300)
    text = await page.inner_text("body")
    # Extract title — try multiple patterns.
    # Breadcrumb: "LanGo Spanish | Beginner I > Week N: Title > Flashcards"
    title_match = re.search(r'>\s*(Week\s+\d+:.+?)\s*>\s*Flashcards', text)
    if title_match:
        raw_title = title_match.group(1).strip()
    else:
        # Heading: "Week N: Title (Subtitle) Flashcards"
        heading_match = re.search(r'(Week\s+\d+:.+?)\s*Flashcards', text)
        if heading_match:
            raw_title = heading_match.group(1).strip()
        else:
            # Last resort: reconstruct from the URL slug, e.g.
            # "week-5-reflexive-verbs-los-verbos-reflex-10270072"
            slug = url.split('/')[2]
            slug_clean = re.sub(r'-\d+$', '', slug)  # remove trailing ID
            slug_clean = re.sub(r'-al-rev(e|é)s$', ' AL REVÉS', slug_clean)
            raw_title = slug_clean.replace('-', ' ').title()
            # Prefix "Week 0:" when the slug carried no week number.
            # (The original `if wm: raw_title = raw_title` branch was a
            # no-op; only the inverted case does anything.)
            wm = re.match(r'Week\s+(\d+)', raw_title, re.IGNORECASE)
            if not wm:
                raw_title = "Week 0: " + raw_title
    week, title = parse_title_and_week(raw_title)
    cards = parse_cards(text)
    # "al revés" decks repeat the vocabulary with front/back swapped.
    is_reversed = "al rev" in url.lower() or "AL REVÉS" in raw_title.upper()
    return {
        "week": week,
        "title": title,
        "isReversed": is_reversed,
        "cardCount": len(cards),
        "cards": cards,
        "url": url,
    }
async def main():
    """Scrape every deck in DECK_URLS and write the course data,
    grouped by week, to OUTPUT as JSON."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        )
        page = await context.new_page()
        scraped_decks = []
        card_total = 0
        for idx, deck_url in enumerate(DECK_URLS):
            print(f"[{idx+1}/{len(DECK_URLS)}] Scraping {deck_url.split('/')[2][:50]}...")
            try:
                deck = await scrape_deck(page, deck_url)
                scraped_decks.append(deck)
                card_total += deck["cardCount"]
                print(f" → Week {deck['week']}: {deck['title']} ({deck['cardCount']} cards)")
            except Exception as exc:
                print(f" ERROR: {exc}")
            # Be polite
            await page.wait_for_timeout(500)
        await browser.close()
    # Group decks by week number.
    by_week = {}
    for deck in scraped_decks:
        by_week.setdefault(deck["week"], []).append({
            "title": deck["title"],
            "isReversed": deck["isReversed"],
            "cardCount": deck["cardCount"],
            "cards": deck["cards"],
        })
    output = {
        "course": "LanGo Spanish | Beginner I",
        "totalDecks": len(scraped_decks),
        "totalCards": card_total,
        "weeks": [{"week": w, "decks": by_week[w]} for w in sorted(by_week)],
    }
    with open(OUTPUT, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"\nDone! {len(scraped_decks)} decks, {card_total} cards → {OUTPUT}")
# Entry point: scrape all decks and write course_data.json.
if __name__ == "__main__":
    asyncio.run(main())

# --- file boundary (diff-viewer artifact: "View File" / "@@ -0,0 +1,166 @@") ---
#!/usr/bin/env python3
"""
Scrape 2-3 example sentences per vocab word from SpanishDict.
Reads words from course_data.json, outputs examples to course_examples.json.
"""
import asyncio
import json
import re
import os
from playwright.async_api import async_playwright
# Input: course/deck/card JSON produced by the Brainscape scraper.
INPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_data.json"
# Output: mapping of card front -> list of {"es", "en"} example pairs.
OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_examples.json"
# Cap on example sentences kept per word.
MAX_EXAMPLES = 3
def extract_word_for_lookup(front):
    """Reduce a card front to a single lowercase dictionary headword.

    e.g. 'barato, barata' -> 'barato'
         'el/la periodista' -> 'periodista'
    """
    term = front.strip()
    # Strip leading articles: plain first, then paired ("el/la") forms.
    term = re.sub(r'^(el|la|los|las|un|una)\s+', '', term, flags=re.IGNORECASE)
    term = re.sub(r'^(el/la|los/las)\s+', '', term, flags=re.IGNORECASE)
    # Keep only the first variant when alternatives are listed.
    if ',' in term:
        term = term.split(',', 1)[0].strip()
    if '/' in term:
        term = term.split('/', 1)[0].strip()
    return term.lower().strip()
def parse_examples(text, lookup_word):
    """Parse up to MAX_EXAMPLES ``{"es", "en"}`` sentence pairs from the
    body text of a SpanishDict translate page.

    Two layouts are handled:
      1. Spanish and English fused on one line with no space between the
         period and the English capital, e.g.
         "Esta tienda es muy barata.This store is really cheap."
      2. A Spanish sentence followed by its English rendering on one of
         the next two lines.
    Only sentences containing *lookup_word* are accepted.
    """
    examples = []
    lines = text.split('\n')
    for i, line in enumerate(lines):
        l = line.strip()
        if not l or len(l) < 15:
            continue
        # Layout 1: "Spanish.English" inline on a single line.
        inline_match = re.match(r'^(.+?[.!?])([A-Z].+)$', l)
        if inline_match:
            es = inline_match.group(1).strip()
            en = inline_match.group(2).strip()
            # Verify the Spanish half contains our word (case-insensitive)
            # and both halves are plausibly sentence-length.
            if lookup_word.lower() in es.lower() and len(es) > 10 and len(en) > 5:
                examples.append({"es": es, "en": en})
                if len(examples) >= MAX_EXAMPLES:
                    break
            continue
        # Layout 2: standalone Spanish sentence containing the word, with
        # the English translation expected on one of the next two lines.
        if lookup_word.lower() in l.lower() and len(l) > 15 and len(l) < 300:
            for j in range(i + 1, min(i + 3, len(lines))):
                next_l = lines[j].strip()
                if not next_l:
                    continue
                # Heuristic for "looks English": capitalized and free of
                # Spanish-only characters.
                if (next_l[0].isupper() and
                        not any(c in next_l for c in ['á', 'é', 'í', 'ó', 'ú', 'ñ', '¿', '¡'])):
                    examples.append({"es": l, "en": next_l})
                    # Stop at the first translation candidate; the
                    # MAX_EXAMPLES cap is enforced by the check below
                    # (a redundant cap-check-then-break was removed here).
                    break
        if len(examples) >= MAX_EXAMPLES:
            break
    return examples
async def scrape_word(page, word, lookup):
    """Fetch the SpanishDict translate page for *lookup* and return its
    parsed example sentences.

    *word* (the original card front) is kept for interface compatibility
    with existing callers but is not used; *lookup* is the normalized
    headword. Returns [] on any navigation/timeout error (best-effort).
    """
    url = f"https://www.spanishdict.com/translate/{lookup}"
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=15000)
        # Let client-side rendering populate the examples section.
        await page.wait_for_timeout(2000)
        text = await page.inner_text("body")
        return parse_examples(text, lookup)
    except Exception:
        # Deliberate best-effort: a failed lookup just has no examples.
        # (Unused `as e` binding removed.)
        return []
async def main():
    """Scrape example sentences for every unique vocab word in the course
    data, saving progress to OUTPUT every 20 words so an interrupted run
    can be resumed."""
    # Load course data
    with open(INPUT) as f:
        data = json.load(f)
    # Collect unique words (front values from non-reversed decks);
    # reversed decks repeat the same vocabulary with sides swapped.
    words = {}  # lookup -> original front
    for week in data['weeks']:
        for deck in week['decks']:
            if deck.get('isReversed'):
                continue
            for card in deck['cards']:
                front = card['front']
                lookup = extract_word_for_lookup(front)
                if lookup and lookup not in words:
                    words[lookup] = front
    print(f"Found {len(words)} unique words to look up")
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        ctx = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        )
        page = await ctx.new_page()
        # Load existing progress if any
        results = {}
        if os.path.exists(OUTPUT):
            with open(OUTPUT) as f:
                results = json.load(f)
            print(f"Loaded {len(results)} existing results")
        # Count only entries that actually have examples, so resumed runs
        # report the same "found" statistic as fresh ones.  (Previously
        # this started at len(results), which also counted words whose
        # lookup had returned no examples.)
        found = sum(1 for v in results.values() if v)
        total = len(words)
        for i, (lookup, original) in enumerate(words.items()):
            # Skip already scraped
            if original in results:
                continue
            print(f"[{i+1}/{total}] {lookup}...", end=" ", flush=True)
            try:
                examples = await scrape_word(page, original, lookup)
                if examples:
                    results[original] = examples
                    found += 1
                    print(f"{len(examples)} examples")
                else:
                    results[original] = []
                    print("no examples")
            except Exception as e:
                print(f"error: {e}")
                results[original] = []
            # Save progress every 20 words
            if (i + 1) % 20 == 0:
                with open(OUTPUT, 'w', encoding='utf-8') as f:
                    json.dump(results, f, ensure_ascii=False, indent=2)
                print(f" [saved {len(results)} results]")
            # Politeness delay between requests.
            await page.wait_for_timeout(300)
        await browser.close()
    # Save results
    with open(OUTPUT, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nDone! {found}/{total} words with examples → {OUTPUT}")
# Entry point: scrape example sentences for all course vocabulary.
if __name__ == "__main__":
    asyncio.run(main())