Files
Spanish/Conjuga/Scripts/merge_data.py
Trey t 4b467ec136 Initial commit: Conjuga Spanish conjugation app
Includes SwiftData dual-store architecture (local reference + CloudKit user data),
JSON-based data seeding, 20 tense guides, 20 grammar notes, SRS review system,
course vocabulary, and widget support.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 20:58:33 -05:00

551 lines
22 KiB
Python

#!/usr/bin/env python3
"""
Merge ConjuGato + Conjuu ES data into unified JSON for Conjuga app.
Sources:
- ConjuGato: 1,750 verbs (verb.md), irregular forms, spans, irregularity bitmasks
- Conjuu ES: 621 verbs with full conjugation tables, tense guides, conjugation rules
Output: conjuga_data.json with all verbs, forms, spans, guides
"""
import csv
import json
import re
import sqlite3
import os
import plistlib
import subprocess
# Where the project lives and where the generated JSON is written.
BASE = "/Users/treyt/Desktop/code/Spanish"
OUTPUT = os.path.join(BASE, "Conjuga", "Scripts", "conjuga_data.json")

# ConjuGato input: a single SQLite database inside the app bundle.
CONJUGATO_DB = "/Applications/ConjuGato.app/WrappedBundle/Verbs.sqlite"

# Conjuu ES inputs: all of them sit under the app's Resources directory,
# which is also where the per-level CSV files are read from.
CONJUU_LEVELS = "/Applications/Conjuu ES.app/Contents/Resources"
CONJUU_VOCAB = f"{CONJUU_LEVELS}/Vocabulary.csv"
CONJUU_GUIDE = f"{CONJUU_LEVELS}/en.lproj/Guide.strings"
CONJUU_RULES = f"{CONJUU_LEVELS}/GuideTableEntries.plist"
# ─── Tense metadata ───
# One row per tense: (id, Spanish name, English name, mood). The position of
# a row in this table becomes the tense's "order" value.
_TENSE_ROWS = (
    ("ind_presente", "Indicativo Presente", "Present", "Indicative"),
    ("ind_preterito", "Indicativo Pretérito", "Preterite", "Indicative"),
    ("ind_imperfecto", "Indicativo Imperfecto", "Imperfect", "Indicative"),
    ("ind_futuro", "Indicativo Futuro", "Future", "Indicative"),
    ("ind_perfecto", "Indicativo Perfecto", "Present Perfect", "Indicative"),
    ("ind_pluscuamperfecto", "Indicativo Pluscuamperfecto", "Pluperfect", "Indicative"),
    ("ind_futuro_perfecto", "Indicativo Futuro Perfecto", "Future Perfect", "Indicative"),
    ("ind_preterito_anterior", "Indicativo Pretérito Anterior", "Preterite Perfect", "Indicative"),
    ("cond_presente", "Condicional Presente", "Conditional", "Conditional"),
    ("cond_perfecto", "Condicional Perfecto", "Conditional Perfect", "Conditional"),
    ("subj_presente", "Subjuntivo Presente", "Present Subjunctive", "Subjunctive"),
    ("subj_imperfecto_1", "Subjuntivo Imperfecto I", "Past Subjunctive (ra)", "Subjunctive"),
    ("subj_imperfecto_2", "Subjuntivo Imperfecto II", "Past Subjunctive (se)", "Subjunctive"),
    ("subj_perfecto", "Subjuntivo Perfecto", "Subjunctive Perfect", "Subjunctive"),
    ("subj_pluscuamperfecto_1", "Subjuntivo Pluscuamperfecto I", "Subjunctive Pluperfect (ra)", "Subjunctive"),
    ("subj_pluscuamperfecto_2", "Subjuntivo Pluscuamperfecto II", "Subjunctive Pluperfect (se)", "Subjunctive"),
    ("subj_futuro", "Subjuntivo Futuro", "Subjunctive Future", "Subjunctive"),
    ("subj_futuro_perfecto", "Subjuntivo Futuro Perfecto", "Subjunctive Future Perfect", "Subjunctive"),
    ("imp_afirmativo", "Imperativo Afirmativo", "Imperative", "Imperative"),
    ("imp_negativo", "Imperativo Negativo", "Negative Imperative", "Imperative"),
)

# Tense records as written to the output JSON (key order is id, spanish,
# english, mood, order).
TENSES = [
    {"id": tense_id, "spanish": spanish, "english": english, "mood": mood, "order": position}
    for position, (tense_id, spanish, english, mood) in enumerate(_TENSE_ROWS)
]

# Spanish tense name -> tense id, used to recognise tense names in the CSV.
TENSE_LOOKUP = {entry["spanish"]: entry["id"] for entry in TENSES}
# Labels for the six grammatical persons, in the same order as the six-element
# form lists (yo, tú, él, nosotros, vosotros, ellos). The second label was an
# empty string, which left the second-person singular unlabeled in the output.
PERSONS = ["yo", "tú", "él/ella/Ud.", "nosotros", "vosotros", "ellos/ellas/Uds."]
# Regular endings for the simple tenses, keyed by infinitive class and then by
# tense id. Each list holds six entries in person order (yo, tú, él, nosotros,
# vosotros, ellos). An empty string means "no form for this person" (the
# imperative has no first-person singular).
#
# The -er/-ir preterite third-person singular ending is "ió" (comió, vivió);
# it was previously an empty string, so that form was never generated
# correctly for regular verbs.
ENDINGS = {
    "ar": {
        "ind_presente": ["o", "as", "a", "amos", "áis", "an"],
        "ind_preterito": ["é", "aste", "ó", "amos", "asteis", "aron"],
        "ind_imperfecto": ["aba", "abas", "aba", "ábamos", "abais", "aban"],
        "ind_futuro": ["aré", "arás", "ará", "aremos", "aréis", "arán"],
        "cond_presente": ["aría", "arías", "aría", "aríamos", "aríais", "arían"],
        "subj_presente": ["e", "es", "e", "emos", "éis", "en"],
        "subj_imperfecto_1": ["ara", "aras", "ara", "áramos", "arais", "aran"],
        "subj_imperfecto_2": ["ase", "ases", "ase", "ásemos", "aseis", "asen"],
        "subj_futuro": ["are", "ares", "are", "áremos", "areis", "aren"],
        "imp_afirmativo": ["", "a", "e", "emos", "ad", "en"],
        "imp_negativo": ["", "es", "e", "emos", "éis", "en"],
    },
    "er": {
        "ind_presente": ["o", "es", "e", "emos", "éis", "en"],
        "ind_preterito": ["í", "iste", "ió", "imos", "isteis", "ieron"],
        "ind_imperfecto": ["ía", "ías", "ía", "íamos", "íais", "ían"],
        "ind_futuro": ["eré", "erás", "erá", "eremos", "eréis", "erán"],
        "cond_presente": ["ería", "erías", "ería", "eríamos", "eríais", "erían"],
        "subj_presente": ["a", "as", "a", "amos", "áis", "an"],
        "subj_imperfecto_1": ["iera", "ieras", "iera", "iéramos", "ierais", "ieran"],
        "subj_imperfecto_2": ["iese", "ieses", "iese", "iésemos", "ieseis", "iesen"],
        "subj_futuro": ["iere", "ieres", "iere", "iéremos", "iereis", "ieren"],
        "imp_afirmativo": ["", "e", "a", "amos", "ed", "an"],
        "imp_negativo": ["", "as", "a", "amos", "áis", "an"],
    },
    "ir": {
        "ind_presente": ["o", "es", "e", "imos", "ís", "en"],
        "ind_preterito": ["í", "iste", "ió", "imos", "isteis", "ieron"],
        "ind_imperfecto": ["ía", "ías", "ía", "íamos", "íais", "ían"],
        "ind_futuro": ["iré", "irás", "irá", "iremos", "iréis", "irán"],
        "cond_presente": ["iría", "irías", "iría", "iríamos", "iríais", "irían"],
        "subj_presente": ["a", "as", "a", "amos", "áis", "an"],
        "subj_imperfecto_1": ["iera", "ieras", "iera", "iéramos", "ierais", "ieran"],
        "subj_imperfecto_2": ["iese", "ieses", "iese", "iésemos", "ieseis", "iesen"],
        "subj_futuro": ["iere", "ieres", "iere", "iéremos", "iereis", "ieren"],
        "imp_afirmativo": ["", "e", "a", "amos", "id", "an"],
        "imp_negativo": ["", "as", "a", "amos", "áis", "an"],
    },
}
# Compound tenses: the conjugated auxiliary "haber" for each compound tense id,
# six forms per tense in person order. Each tense is written as one
# whitespace-separated string and split into its list of forms.
HABER = {
    tense_key: auxiliaries.split()
    for tense_key, auxiliaries in (
        ("ind_perfecto", "he has ha hemos habéis han"),
        ("ind_pluscuamperfecto", "había habías había habíamos habíais habían"),
        ("ind_futuro_perfecto", "habré habrás habrá habremos habréis habrán"),
        ("ind_preterito_anterior", "hube hubiste hubo hubimos hubisteis hubieron"),
        ("cond_perfecto", "habría habrías habría habríamos habríais habrían"),
        ("subj_perfecto", "haya hayas haya hayamos hayáis hayan"),
        ("subj_pluscuamperfecto_1", "hubiera hubieras hubiera hubiéramos hubierais hubieran"),
        ("subj_pluscuamperfecto_2", "hubiese hubieses hubiese hubiésemos hubieseis hubiesen"),
        ("subj_futuro_perfecto", "hubiere hubieres hubiere hubiéremos hubiereis hubieren"),
    )
}
def get_ending_type(infinitive):
    """Return the conjugation class ("ar", "er" or "ir") of an infinitive.

    The check is case-insensitive. A reflexive "-se" suffix is ignored,
    including on accented "-írse" verbs such as "reírse", which previously
    fell through to the "ar" default. Accented "-ír" infinitives count as
    "ir". Anything that matches no known ending is treated as "ar".
    """
    core = infinitive.lower()
    # Drop the reflexive pronoun so the real infinitive ending is exposed.
    if core.endswith(("arse", "erse", "irse", "írse")):
        core = core[:-2]
    if core.endswith("ar"):
        return "ar"
    if core.endswith("er"):
        return "er"
    if core.endswith(("ir", "ír")):
        return "ir"
    # Fallback kept from the original behaviour for unrecognised input.
    return "ar"
def get_stem(infinitive, ending_type):
    """Return the lower-cased stem of an infinitive.

    A trailing reflexive "se" is removed first, then the last two characters
    (the infinitive ending) are cut off. ``ending_type`` is accepted for
    interface compatibility; the result is the same for every ending type.
    """
    base = infinitive.lower()
    without_pronoun = base[:-2] if base.endswith("se") else base
    return without_pronoun[:-2]
def get_participle(infinitive, ending_type):
    """Return the regular past participle: stem + "ado" for "ar", else stem + "ido"."""
    suffix = "ado" if ending_type == "ar" else "ido"
    return get_stem(infinitive, ending_type) + suffix
def conjugate_regular(infinitive, tense_id, ending_type):
    """Return the six regular forms of a verb in one tense.

    Args:
        infinitive: Verb infinitive, optionally with a reflexive "se" suffix.
        tense_id: One of the tense ids used as keys in HABER / ENDINGS.
        ending_type: "ar", "er" or "ir".

    Returns:
        A list of six strings in person order. Compound tenses are
        "<haber form> <participle>". Persons that have no form (an empty
        ending, e.g. first-person imperative) are returned as "". A tense
        with no ending table yields six empty strings.

    Raises:
        KeyError: if ``ending_type`` is not a key of ENDINGS and the tense is
            not a compound tense.
    """
    if tense_id in HABER:
        participle = get_participle(infinitive, ending_type)
        return [f"{aux} {participle}" for aux in HABER[tense_id]]

    endings = ENDINGS[ending_type].get(tense_id)
    if endings is None:
        return [""] * 6

    # The ENDINGS entries for the future and conditional already contain the
    # infinitive vowel + "r" ("aré", "ería", ...), so they attach to the stem
    # like every other simple tense. The former special case misused
    # str.rstrip/lstrip (which strip character sets, not affixes) and produced
    # forms such as "hablarré" and "hablararía".
    stem = get_stem(infinitive, ending_type)
    # An empty ending marks a missing form; do not emit the bare stem for it.
    return [stem + ending if ending else "" for ending in endings]
def conjugate_future_cond(infinitive, tense_id, ending_type):
    """Return the six future or conditional forms built on the full infinitive.

    Args:
        infinitive: Verb infinitive, optionally with a reflexive "se" suffix.
        tense_id: "ind_futuro" or "cond_presente".
        ending_type: Unused; kept so the signature matches the other helpers.

    Returns:
        A list of six forms in person order, or None when ``tense_id`` is
        neither the future nor the conditional.
    """
    endings_map = {
        "ind_futuro": ["é", "ás", "á", "emos", "éis", "án"],
        "cond_presente": ["ía", "ías", "ía", "íamos", "íais", "ían"],
    }
    endings = endings_map.get(tense_id)
    if endings is None:
        return None

    base = infinitive.lower()
    if base.endswith("se"):
        base = base[:-2]
    # Infinitives written with an accented "-ír" (reír, oír) lose that accent
    # once an ending is attached: "reiré", not "reíré".
    if base.endswith("ír"):
        base = base[:-2] + "ir"
    return [base + ending for ending in endings]
# ─── Step 1: Load ConjuGato verbs ───
print("Loading ConjuGato data...")

# Numeric codes found in the Verb.Ending column, mapped to the ending class.
# Built once instead of on every row; unknown codes fall back to "ar" below.
ending_map = {1: "ar", 2: "er", 4: "ir"}

conjugato_verbs = {}      # verb id -> verb record
irregular_forms = {}      # VerbFormId -> irregular form text
irregular_spans = []      # one dict per IrregularSpan row, in Id order
irregularity_data = {}    # verb id -> {Irregularity column name: value}

conn = sqlite3.connect(CONJUGATO_DB)
try:
    cursor = conn.cursor()

    # Verbs
    cursor.execute("SELECT Id, Rank, Ending, Reflexive, Spanish, English FROM Verb ORDER BY Rank")
    for vid, rank, ending, reflexive, spanish, english in cursor.fetchall():
        conjugato_verbs[vid] = {
            "id": vid,
            "rank": rank,
            "ending": ending_map.get(ending, "ar"),
            "reflexive": reflexive,
            "infinitive": spanish,
            "english": english,
        }

    # Irregular verb forms
    cursor.execute("SELECT VerbFormId, Form FROM IrregularVerbForm ORDER BY VerbFormId")
    for vfid, form in cursor.fetchall():
        irregular_forms[vfid] = form

    # Irregular spans
    cursor.execute("SELECT Id, VerbFormId, Type, Pattern, Start, End FROM IrregularSpan ORDER BY Id")
    for sid, vfid, stype, pattern, start, end in cursor.fetchall():
        irregular_spans.append({
            "verbFormId": vfid,
            "type": stype,
            "pattern": pattern,
            "start": start,
            "end": end,
        })

    # Irregularity bitmasks: the first column is used as the verb id, the
    # remaining columns are stored by name.
    cursor.execute("SELECT * FROM Irregularity ORDER BY VerbId")
    irregularity_cols = [d[0] for d in cursor.description]
    for row in cursor.fetchall():
        verb_id = row[0]
        irregularity_data[verb_id] = dict(zip(irregularity_cols[1:], row[1:]))
finally:
    # Release the database handle even when one of the queries fails.
    conn.close()

print(f" {len(conjugato_verbs)} verbs, {len(irregular_forms)} irregular forms, {len(irregular_spans)} spans")
# ─── Step 2: Load Conjuu ES conjugations ───
print("Loading Conjuu ES data...")
# Lower-cased infinitive -> {"infinitive", "english", "rank", "tenses"}, where
# "tenses" maps a tense id to {"regularity", "forms"}.
conjuu_verbs = {}
# newline='' is what the csv module asks for on files handed to csv.reader.
# The explicit encoding keeps the accented forms independent of the platform
# default. NOTE(review): assumes Vocabulary.csv is UTF-8 — confirm.
with open(CONJUU_VOCAB, 'r', encoding='utf-8', newline='') as f:
    for row in csv.reader(f):
        # Columns used: 0 verb, 1 regularity, 2 Spanish tense name,
        # 3-8 the six forms, 9 English, 13 rank. Blank or truncated rows
        # would otherwise raise IndexError, so skip them.
        if len(row) < 14:
            continue
        verb_name = row[0]
        tense_spanish = row[2]
        tense_id = TENSE_LOOKUP.get(tense_spanish)
        if not tense_id:
            # Not one of the known tenses (this also skips any header row).
            continue
        regularity = row[1]
        forms = row[3:9]  # yo, tú, él, nosotros, vosotros, ellos
        english = row[9]
        rank = int(row[13]) if row[13] else 99999
        key = verb_name.lower()
        # The first row seen for a verb fixes its infinitive/english/rank.
        if key not in conjuu_verbs:
            conjuu_verbs[key] = {
                "infinitive": verb_name,
                "english": english,
                "rank": rank,
                "tenses": {},
            }
        conjuu_verbs[key]["tenses"][tense_id] = {
            "regularity": regularity,
            "forms": forms,
        }
print(f" {len(conjuu_verbs)} verbs with conjugations")
# ─── Step 3: Load tense guides ───
print("Loading tense guides...")
# Guide.strings is converted to XML plist text by plutil and parsed from its
# stdout. check=True makes a failing plutil raise CalledProcessError here
# instead of handing empty output to plistlib and failing with an unrelated
# parse error.
result = subprocess.run(
    ['plutil', '-convert', 'xml1', '-o', '-', CONJUU_GUIDE],
    capture_output=True,
    check=True,
)
guide_data = plistlib.loads(result.stdout)

# Keys shaped like "LL<name>GuideTop" / "LL<name>GuideBottom" are grouped as
# tense_guides[<name>] = {"Top": ..., "Bottom": ...}.
tense_guides = {}
for key, value in guide_data.items():
    m = re.match(r'LL(.+)Guide(Top|Bottom)', key)
    if m:
        tense_name = m.group(1)
        part = m.group(2)
        tense_guides.setdefault(tense_name, {})[part] = value

guides_output = []
for t in TENSES:
    # Spanish tense name with the mood prefix removed.
    guide_key = t["spanish"].replace("Indicativo ", "").replace("Condicional ", "").replace("Subjuntivo ", "").replace("Imperativo ", "")
    # Try exact match first, then various key patterns
    guide = None
    for gk, gv in tense_guides.items():
        if gk == guide_key or gk == t["spanish"] or gk.replace(" ", "") == guide_key.replace(" ", ""):
            guide = gv
            break
    if not guide:
        # Try partial match: first guide whose name contains, or is contained
        # in, the key (case-insensitive).
        for gk, gv in tense_guides.items():
            if guide_key.lower() in gk.lower() or gk.lower() in guide_key.lower():
                guide = gv
                break
    # Without a guide, fall back to the English tense name and an empty body.
    guides_output.append({
        "tenseId": t["id"],
        "title": guide.get("Top", t["english"]) if guide else t["english"],
        "body": guide.get("Bottom", "") if guide else "",
    })
print(f" {len(guides_output)} tense guides")
# ─── Step 4: Load difficulty levels ───
print("Loading difficulty levels...")
# (level id, CSV file name) pairs, read in this order from CONJUU_LEVELS.
level_files = [
    ("basic", "Basic.csv"),
    ("elementary_1", "Elementary-1.csv"),
    ("elementary_2", "Elementary-2.csv"),
    ("elementary_3", "Elementary-3.csv"),
    ("intermediate_1", "Intermediate-1.csv"),
    ("intermediate_2", "Intermediate-2.csv"),
    ("intermediate_3", "Intermediate-3.csv"),
    ("intermediate_4", "Intermediate-4.csv"),
]
# Lower-cased first column of each row -> level id. A verb listed in several
# files keeps the level of the last file that mentions it.
level_verbs = {}
for level_id, filename in level_files:
    path = os.path.join(CONJUU_LEVELS, filename)
    # newline='' is what the csv module asks for; the explicit encoding avoids
    # depending on the platform default.
    # NOTE(review): assumes the level files are UTF-8 — confirm.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for a blank line; row[0] would raise.
                continue
            level_verbs[row[0].lower()] = level_id
print(f" {len(level_verbs)} verbs with curated levels")
# ─── Step 5: Merge everything ───
print("Merging data...")
# ConjuGato VerbFormId encoding:
#   VerbFormId = (1000 + VerbId) * 10000 + MTPP
#   M  - mood: 1=Indicative, 2=Subjunctive, 3=Imperative
#   T  - tense within the mood
#   PP - person (01-08)
# The table below is written per mood and flattened into
# (mood, tense) -> tense_id.
_TENSES_BY_MOOD = (
    (1, ((1, "ind_presente"), (2, "ind_preterito"), (3, "ind_imperfecto"),
         (6, "cond_presente"), (7, "ind_futuro"))),
    (2, ((1, "subj_presente"), (3, "subj_imperfecto_1"),
         (4, "subj_imperfecto_2"), (7, "subj_futuro"))),
    (3, ((0, "imp_afirmativo"),)),  # person-specific
)
CONJUGATO_TENSE_MAP = {
    (mood_digit, tense_digit): mapped_id
    for mood_digit, tense_entries in _TENSES_BY_MOOD
    for tense_digit, mapped_id in tense_entries
}
def decode_verb_form_id(vfid):
    """Decode a ConjuGato VerbFormId into (verb_id, tense_id, person_index).

    The id is read as eight decimal digits: four for 1000 + verb id, one for
    the mood, one for the tense within the mood, and two for the person.

    Returns:
        ``(verb_id, tense_id, person_index)``. All three are None when the id
        is not eight digits long. ``tense_id`` is None for a non-imperative
        (mood, tense) pair missing from CONJUGATO_TENSE_MAP. ``person_index``
        is 0-5 for persons 1-6 and None for anything else (7 and 8 are the
        vos/voseo forms, which are skipped for now).
    """
    s = str(vfid)
    if len(s) != 8:
        return None, None, None
    verb_id = int(s[:4]) - 1000
    mood = int(s[4])
    tense_num = int(s[5])
    person = int(s[6:8])

    if mood == 3:
        # Negative-imperative ids are built as ...3800 + person, i.e. the
        # tense digit is 8. The two-digit person field can never reach 800,
        # so the former `person >= 800` test never fired and every negative
        # imperative was reported as affirmative.
        tense_id = "imp_negativo" if tense_num == 8 else "imp_afirmativo"
    else:
        tense_id = CONJUGATO_TENSE_MAP.get((mood, tense_num))

    person_idx = person - 1 if 1 <= person <= 6 else None
    return verb_id, tense_id, person_idx
def assign_level(rank):
    """Map a verb's rank to a difficulty level name.

    Ranks up to 25 are "basic", up to 100 "elementary", up to 300
    "intermediate", up to 700 "advanced"; anything higher is "expert".
    """
    thresholds = (
        (25, "basic"),
        (100, "elementary"),
        (300, "intermediate"),
        (700, "advanced"),
    )
    for upper_bound, label in thresholds:
        if rank <= upper_bound:
            return label
    return "expert"
# Build unified verb list
all_verbs = []      # one record per ConjuGato verb, in rank order
verb_forms = []     # one record per non-empty (verb, tense, person) form
spans_output = []   # filled by the irregular-span pass further down

# (mood digit, tense digit) used inside a ConjuGato VerbFormId for every
# simple tense that can carry irregular overrides. The id of a form is
# (1000 + verb id) * 10000 + mood * 1000 + tense * 100 + person (1-6).
# Defined once here rather than being rebuilt for every verb and tense.
_IRREGULAR_FORM_CODES = {
    "ind_presente": (1, 1), "ind_preterito": (1, 2),
    "ind_imperfecto": (1, 3),
    "cond_presente": (1, 6), "ind_futuro": (1, 7),
    "subj_presente": (2, 1), "subj_imperfecto_1": (2, 3),
    "subj_imperfecto_2": (2, 4), "subj_futuro": (2, 7),
    "imp_afirmativo": (3, 0), "imp_negativo": (3, 8),
}

# Future/conditional endings, attached to the whole infinitive.
_FUTURE_COND_ENDINGS = {
    "ind_futuro": ["é", "ás", "á", "emos", "éis", "án"],
    "cond_presente": ["ía", "ías", "ía", "íamos", "íais", "ían"],
}

for vid, cv in sorted(conjugato_verbs.items(), key=lambda x: x[1]["rank"]):
    infinitive = cv["infinitive"]
    inf_lower = infinitive.lower()
    ending = cv["ending"]
    rank = cv["rank"]

    # Check Conjuu ES for this verb
    conjuu = conjuu_verbs.get(inf_lower)

    # Curated level when the verb is in a level file, rank-based otherwise.
    level = level_verbs.get(inf_lower, assign_level(rank))

    verb_entry = {
        "id": vid,
        "infinitive": infinitive,
        "english": cv["english"],
        "rank": rank,
        "ending": ending,
        "reflexive": cv["reflexive"],
        "level": level,
        "hasConjuuData": conjuu is not None,
    }
    all_verbs.append(verb_entry)

    # Irregular overrides are only looked up for verbs that have a row in the
    # Irregularity table.
    has_irregular = vid in irregularity_data

    # Generate forms for each tense
    for tense in TENSES:
        tid = tense["id"]
        if conjuu and tid in conjuu["tenses"]:
            # Use Conjuu ES data (pre-computed)
            td = conjuu["tenses"][tid]
            forms = td["forms"]
            regularity = td["regularity"]
        else:
            # Generate from rules, then apply ConjuGato irregular forms.
            regularity = "ordinary"
            if tid in HABER:
                # Compound tense: auxiliary + regular participle. No
                # irregular-participle lookup is performed.
                participle = get_participle(infinitive, ending)
                forms = [f"{aux} {participle}" for aux in HABER[tid]]
            else:
                if tid in _FUTURE_COND_ENDINGS:
                    # Future/conditional use the full infinitive as stem.
                    base = inf_lower[:-2] if inf_lower.endswith("se") else inf_lower
                    # Accented "-ír" infinitives drop the accent before an
                    # ending: "reiré", not "reíré".
                    if base.endswith("ír"):
                        base = base[:-2] + "ir"
                    forms = [base + e for e in _FUTURE_COND_ENDINGS[tid]]
                elif tid in ENDINGS.get(ending, {}):
                    stem = get_stem(infinitive, ending)
                    # An empty ending means "no form for this person"; keep it
                    # empty so the bare stem is not emitted as a form.
                    forms = [stem + e if e else "" for e in ENDINGS[ending][tid]]
                else:
                    forms = [""] * 6

                # Override with ConjuGato irregular forms
                code = _IRREGULAR_FORM_CODES.get(tid)
                if has_irregular and code is not None:
                    vfid_base = (1000 + vid) * 10000 + code[0] * 1000 + code[1] * 100
                    for pi in range(6):
                        vfid = vfid_base + (pi + 1)
                        if vfid in irregular_forms:
                            forms[pi] = irregular_forms[vfid]
                            regularity = "irregular"

        # Emit one record per person that actually has a form.
        for pi, form in enumerate(forms):
            if form:
                verb_forms.append({
                    "verbId": vid,
                    "tenseId": tid,
                    "personIndex": pi,
                    "form": form,
                    "regularity": regularity,
                })
# Build spans referencing verb forms
print("Processing irregular spans...")
for span in irregular_spans:
    decoded = decode_verb_form_id(span["verbFormId"])
    # Skip spans whose id cannot be fully decoded (unknown tense, person
    # outside 1-6, or malformed id).
    if any(part is None for part in decoded):
        continue
    verb_id, tense_id, person_idx = decoded
    # Keep only spans that belong to a verb loaded from ConjuGato.
    if verb_id in conjugato_verbs:
        spans_output.append({
            "verbId": verb_id,
            "tenseId": tense_id,
            "personIndex": person_idx,
            "type": span["type"],
            "pattern": span["pattern"],
            "start": span["start"],
            "end": span["end"],
        })
# ─── Step 6: Output ───
print("Writing output...")
output = {
    "tenses": TENSES,
    "persons": PERSONS,
    "verbs": all_verbs,
    "verbForms": verb_forms,
    "irregularSpans": spans_output,
    "tenseGuides": guides_output,
}
# Counts reported both in the debug file and on the console.
stats = {
    "verbs": len(all_verbs),
    "verbForms": len(verb_forms),
    "irregularSpans": len(spans_output),
    "tenseGuides": len(guides_output),
}
debug_path = OUTPUT.replace('.json', '_debug.json')

# Compact file consumed by the app.
with open(OUTPUT, 'w', encoding='utf-8') as f:
    json.dump(output, f, ensure_ascii=False)

# Also write a pretty version for debugging: counts plus a small sample.
debug_payload = {
    "stats": stats,
    "sampleVerb": all_verbs[0] if all_verbs else None,
    "sampleForms": verb_forms[:20],
}
with open(debug_path, 'w', encoding='utf-8') as f:
    json.dump(debug_payload, f, ensure_ascii=False, indent=2)

file_size = os.path.getsize(OUTPUT) / (1024 * 1024)
print("\nDone!")
for label, count in (
    ("Verbs", stats["verbs"]),
    ("Verb forms", stats["verbForms"]),
    ("Irregular spans", stats["irregularSpans"]),
    ("Tense guides", stats["tenseGuides"]),
):
    print(f" {label}: {count}")
print(f" Output: {OUTPUT} ({file_size:.1f} MB)")