#!/usr/bin/env python3
"""
Merge ConjuGato + Conjuu ES data into unified JSON for Conjuga app.

Sources:
- ConjuGato: 1,750 verbs (verb.md), irregular forms, spans, irregularity bitmasks
- Conjuu ES: 621 verbs with full conjugation tables, tense guides, conjugation rules

Output: conjuga_data.json with all verbs, forms, spans, guides

All work happens in ``main()``; importing this module only defines the
tables and helper functions, so the helpers can be reused and tested without
the source applications being installed.
"""

import csv
import json
import re
import sqlite3
import os
import plistlib
import subprocess
from contextlib import closing

BASE = "/Users/treyt/Desktop/code/Spanish"
CONJUGATO_DB = "/Applications/ConjuGato.app/WrappedBundle/Verbs.sqlite"
CONJUU_VOCAB = "/Applications/Conjuu ES.app/Contents/Resources/Vocabulary.csv"
CONJUU_GUIDE = "/Applications/Conjuu ES.app/Contents/Resources/en.lproj/Guide.strings"
CONJUU_RULES = "/Applications/Conjuu ES.app/Contents/Resources/GuideTableEntries.plist"
CONJUU_LEVELS = "/Applications/Conjuu ES.app/Contents/Resources"
OUTPUT = os.path.join(BASE, "Conjuga", "Scripts", "conjuga_data.json")

# ─── Tense metadata ───

TENSES = [
    {"id": "ind_presente", "spanish": "Indicativo Presente", "english": "Present", "mood": "Indicative", "order": 0},
    {"id": "ind_preterito", "spanish": "Indicativo Pretérito", "english": "Preterite", "mood": "Indicative", "order": 1},
    {"id": "ind_imperfecto", "spanish": "Indicativo Imperfecto", "english": "Imperfect", "mood": "Indicative", "order": 2},
    {"id": "ind_futuro", "spanish": "Indicativo Futuro", "english": "Future", "mood": "Indicative", "order": 3},
    {"id": "ind_perfecto", "spanish": "Indicativo Perfecto", "english": "Present Perfect", "mood": "Indicative", "order": 4},
    {"id": "ind_pluscuamperfecto", "spanish": "Indicativo Pluscuamperfecto", "english": "Pluperfect", "mood": "Indicative", "order": 5},
    {"id": "ind_futuro_perfecto", "spanish": "Indicativo Futuro Perfecto", "english": "Future Perfect", "mood": "Indicative", "order": 6},
    {"id": "ind_preterito_anterior", "spanish": "Indicativo Pretérito Anterior", "english": "Preterite Perfect", "mood": "Indicative", "order": 7},
    {"id": "cond_presente", "spanish": "Condicional Presente", "english": "Conditional", "mood": "Conditional", "order": 8},
    {"id": "cond_perfecto", "spanish": "Condicional Perfecto", "english": "Conditional Perfect", "mood": "Conditional", "order": 9},
    {"id": "subj_presente", "spanish": "Subjuntivo Presente", "english": "Present Subjunctive", "mood": "Subjunctive", "order": 10},
    {"id": "subj_imperfecto_1", "spanish": "Subjuntivo Imperfecto I", "english": "Past Subjunctive (ra)", "mood": "Subjunctive", "order": 11},
    {"id": "subj_imperfecto_2", "spanish": "Subjuntivo Imperfecto II", "english": "Past Subjunctive (se)", "mood": "Subjunctive", "order": 12},
    {"id": "subj_perfecto", "spanish": "Subjuntivo Perfecto", "english": "Subjunctive Perfect", "mood": "Subjunctive", "order": 13},
    {"id": "subj_pluscuamperfecto_1", "spanish": "Subjuntivo Pluscuamperfecto I", "english": "Subjunctive Pluperfect (ra)", "mood": "Subjunctive", "order": 14},
    {"id": "subj_pluscuamperfecto_2", "spanish": "Subjuntivo Pluscuamperfecto II", "english": "Subjunctive Pluperfect (se)", "mood": "Subjunctive", "order": 15},
    {"id": "subj_futuro", "spanish": "Subjuntivo Futuro", "english": "Subjunctive Future", "mood": "Subjunctive", "order": 16},
    {"id": "subj_futuro_perfecto", "spanish": "Subjuntivo Futuro Perfecto", "english": "Subjunctive Future Perfect", "mood": "Subjunctive", "order": 17},
    {"id": "imp_afirmativo", "spanish": "Imperativo Afirmativo", "english": "Imperative", "mood": "Imperative", "order": 18},
    {"id": "imp_negativo", "spanish": "Imperativo Negativo", "english": "Negative Imperative", "mood": "Imperative", "order": 19},
]

# Spanish tense name (as it appears in the Conjuu ES vocabulary CSV) -> tense id.
TENSE_LOOKUP = {t["spanish"]: t["id"] for t in TENSES}

PERSONS = ["yo", "tú", "él/ella/Ud.", "nosotros", "vosotros", "ellos/ellas/Uds."]

# Regular endings per conjugation class and simple tense, in PERSONS order.
# An empty string means "no form for this person" (imperative has no "yo").
ENDINGS = {
    "ar": {
        "ind_presente": ["o", "as", "a", "amos", "áis", "an"],
        "ind_preterito": ["é", "aste", "ó", "amos", "asteis", "aron"],
        "ind_imperfecto": ["aba", "abas", "aba", "ábamos", "abais", "aban"],
        "ind_futuro": ["aré", "arás", "ará", "aremos", "aréis", "arán"],
        "cond_presente": ["aría", "arías", "aría", "aríamos", "aríais", "arían"],
        "subj_presente": ["e", "es", "e", "emos", "éis", "en"],
        "subj_imperfecto_1": ["ara", "aras", "ara", "áramos", "arais", "aran"],
        "subj_imperfecto_2": ["ase", "ases", "ase", "ásemos", "aseis", "asen"],
        "subj_futuro": ["are", "ares", "are", "áremos", "areis", "aren"],
        "imp_afirmativo": ["", "a", "e", "emos", "ad", "en"],
        "imp_negativo": ["", "es", "e", "emos", "éis", "en"],
    },
    "er": {
        "ind_presente": ["o", "es", "e", "emos", "éis", "en"],
        "ind_preterito": ["í", "iste", "ió", "imos", "isteis", "ieron"],
        "ind_imperfecto": ["ía", "ías", "ía", "íamos", "íais", "ían"],
        "ind_futuro": ["eré", "erás", "erá", "eremos", "eréis", "erán"],
        "cond_presente": ["ería", "erías", "ería", "eríamos", "eríais", "erían"],
        "subj_presente": ["a", "as", "a", "amos", "áis", "an"],
        "subj_imperfecto_1": ["iera", "ieras", "iera", "iéramos", "ierais", "ieran"],
        "subj_imperfecto_2": ["iese", "ieses", "iese", "iésemos", "ieseis", "iesen"],
        "subj_futuro": ["iere", "ieres", "iere", "iéremos", "iereis", "ieren"],
        "imp_afirmativo": ["", "e", "a", "amos", "ed", "an"],
        "imp_negativo": ["", "as", "a", "amos", "áis", "an"],
    },
    "ir": {
        "ind_presente": ["o", "es", "e", "imos", "ís", "en"],
        "ind_preterito": ["í", "iste", "ió", "imos", "isteis", "ieron"],
        "ind_imperfecto": ["ía", "ías", "ía", "íamos", "íais", "ían"],
        "ind_futuro": ["iré", "irás", "irá", "iremos", "iréis", "irán"],
        "cond_presente": ["iría", "irías", "iría", "iríamos", "iríais", "irían"],
        "subj_presente": ["a", "as", "a", "amos", "áis", "an"],
        "subj_imperfecto_1": ["iera", "ieras", "iera", "iéramos", "ierais", "ieran"],
        "subj_imperfecto_2": ["iese", "ieses", "iese", "iésemos", "ieseis", "iesen"],
        "subj_futuro": ["iere", "ieres", "iere", "iéremos", "iereis", "ieren"],
        "imp_afirmativo": ["", "e", "a", "amos", "id", "an"],
        "imp_negativo": ["", "as", "a", "amos", "áis", "an"],
    },
}

# Compound tenses: auxiliary haber forms
HABER = {
    "ind_perfecto": ["he", "has", "ha", "hemos", "habéis", "han"],
    "ind_pluscuamperfecto": ["había", "habías", "había", "habíamos", "habíais", "habían"],
    "ind_futuro_perfecto": ["habré", "habrás", "habrá", "habremos", "habréis", "habrán"],
    "ind_preterito_anterior": ["hube", "hubiste", "hubo", "hubimos", "hubisteis", "hubieron"],
    "cond_perfecto": ["habría", "habrías", "habría", "habríamos", "habríais", "habrían"],
    "subj_perfecto": ["haya", "hayas", "haya", "hayamos", "hayáis", "hayan"],
    "subj_pluscuamperfecto_1": ["hubiera", "hubieras", "hubiera", "hubiéramos", "hubierais", "hubieran"],
    "subj_pluscuamperfecto_2": ["hubiese", "hubieses", "hubiese", "hubiésemos", "hubieseis", "hubiesen"],
    "subj_futuro_perfecto": ["hubiere", "hubieres", "hubiere", "hubiéremos", "hubiereis", "hubieren"],
}

# Future and conditional attach these suffixes to the whole infinitive.
FUTURE_COND_ENDINGS = {
    "ind_futuro": ["é", "ás", "á", "emos", "éis", "án"],
    "cond_presente": ["ía", "ías", "ía", "íamos", "íais", "ían"],
}

# ConjuGato `Verb.Ending` code -> conjugation class. Unknown codes fall back
# to "ar" at the lookup site.
ENDING_CODES = {1: "ar", 2: "er", 4: "ir"}

# Conjuu ES curated difficulty lists: (level id, CSV file name).
LEVEL_FILES = [
    ("basic", "Basic.csv"),
    ("elementary_1", "Elementary-1.csv"),
    ("elementary_2", "Elementary-2.csv"),
    ("elementary_3", "Elementary-3.csv"),
    ("intermediate_1", "Intermediate-1.csv"),
    ("intermediate_2", "Intermediate-2.csv"),
    ("intermediate_3", "Intermediate-3.csv"),
    ("intermediate_4", "Intermediate-4.csv"),
]

# Map ConjuGato VerbFormId encoding
# VerbFormId = (1000 + VerbId) * 10000 + MTPP
# M: 1=Indicative, 2=Subjunctive, 3=Imperative
# T: tense within mood (for the imperative, 0=affirmative and 8=negative)
# PP: person (01-08)
CONJUGATO_TENSE_MAP = {
    # (mood, tense) -> tense_id
    (1, 1): "ind_presente",
    (1, 2): "ind_preterito",
    (1, 3): "ind_imperfecto",
    (1, 6): "cond_presente",
    (1, 7): "ind_futuro",
    (2, 1): "subj_presente",
    (2, 3): "subj_imperfecto_1",
    (2, 4): "subj_imperfecto_2",
    (2, 7): "subj_futuro",
    (3, 0): "imp_afirmativo",
    (3, 8): "imp_negativo",
}

# Inverse of CONJUGATO_TENSE_MAP, so encoding and decoding share one table.
TENSE_CODES = {tense_id: code for code, tense_id in CONJUGATO_TENSE_MAP.items()}

# Keys of Guide.strings that carry tense guide text, e.g. "LL<name>GuideTop".
GUIDE_KEY_RE = re.compile(r'LL(.+)Guide(Top|Bottom)')


# ─── Conjugation helpers ───

def _strip_reflexive(infinitive):
    """Return the lower-cased infinitive without a trailing reflexive "se"."""
    inf = infinitive.lower()
    return inf[:-2] if inf.endswith("se") else inf


def _apply_endings(stem, endings):
    """Attach each ending to `stem`; an empty ending yields an empty form.

    Without this, the imperative "yo" slot (ending "") would produce the bare
    stem, which is truthy and would be emitted as if it were a real form.
    """
    return [stem + e if e else "" for e in endings]


def get_ending_type(infinitive):
    """Return the conjugation class ("ar", "er" or "ir") of an infinitive.

    A trailing reflexive "se" is ignored, including on accented "-írse"
    infinitives such as "reírse". Anything unrecognised falls back to "ar".
    """
    core = _strip_reflexive(infinitive)
    if core.endswith("ar"):
        return "ar"
    if core.endswith("er"):
        return "er"
    if core.endswith(("ir", "ír")):
        return "ir"
    return "ar"


def get_stem(infinitive, ending_type):
    """Return the infinitive minus reflexive "se" and its two-letter ending.

    `ending_type` is accepted for interface compatibility; the result does not
    depend on it because every class ending is two characters long.
    """
    return _strip_reflexive(infinitive)[:-2]


def get_participle(infinitive, ending_type):
    """Return the regular past participle of `infinitive`.

    "-ar" verbs take "-ado"; the others take "-ido", written "-ído" when the
    stem ends in a, e or o so that the hiatus keeps its accent (leer -> leído,
    oír -> oído) while "huir" stays "huido". Irregular participles (hecho,
    dicho, ...) are not handled here.
    """
    stem = get_stem(infinitive, ending_type)
    if ending_type == "ar":
        return stem + "ado"
    if stem.endswith(("a", "e", "o")):
        return stem + "ído"
    return stem + "ido"


def conjugate_future_cond(infinitive, tense_id, ending_type):
    """Return the six regular future or conditional forms, or None.

    The forms are the infinitive (without reflexive "se") plus the suffixes in
    FUTURE_COND_ENDINGS. An accented "-ír" infinitive loses its accent first
    (oír -> oiré). Returns None when `tense_id` is neither "ind_futuro" nor
    "cond_presente". `ending_type` is unused but kept for a uniform signature.
    """
    suffixes = FUTURE_COND_ENDINGS.get(tense_id)
    if suffixes is None:
        return None
    base = _strip_reflexive(infinitive)
    if base.endswith("ír"):
        base = base[:-2] + "ir"
    return [base + suffix for suffix in suffixes]


def conjugate_regular(infinitive, tense_id, ending_type):
    """Return the six regular forms of `infinitive` in `tense_id`.

    Compound tenses are "<haber form> <participle>"; future and conditional
    are built on the infinitive; other simple tenses are stem + ENDINGS.
    Persons without a form (imperative "yo") are "", and an unknown tense
    yields six empty strings. Raises KeyError for an `ending_type` that is not
    a key of ENDINGS.
    """
    if tense_id in HABER:
        participle = get_participle(infinitive, ending_type)
        return [f"{aux} {participle}" for aux in HABER[tense_id]]
    if tense_id in FUTURE_COND_ENDINGS:
        # Previously this branch appended the full ENDINGS suffix to the full
        # infinitive ("hablar" + "ré"), producing doubled endings.
        return conjugate_future_cond(infinitive, tense_id, ending_type)
    endings = ENDINGS[ending_type].get(tense_id)
    if endings is None:
        return [""] * 6
    return _apply_endings(get_stem(infinitive, ending_type), endings)


# ─── ConjuGato VerbFormId encoding ───

def encode_verb_form_id(verb_id, tense_id, person_index):
    """Return the ConjuGato VerbFormId for a verb, simple tense and person.

    `person_index` is 0-based (PERSONS order). Raises KeyError when `tense_id`
    has no ConjuGato code (i.e. it is not a value of CONJUGATO_TENSE_MAP).
    """
    mood, tense_num = TENSE_CODES[tense_id]
    return (1000 + verb_id) * 10000 + mood * 1000 + tense_num * 100 + person_index + 1


def decode_verb_form_id(vfid):
    """Decode VerbFormId into (verb_id, tense_id, person_index).

    Returns (None, None, None) when `vfid` is not eight digits long.
    `tense_id` is None for an unmapped indicative/subjunctive code, and
    `person_index` is None for persons outside 1-6 (7 and 8 are the vos/voseo
    slots, which are skipped for now).
    """
    s = str(vfid)
    if len(s) != 8:
        return None, None, None

    verb_id = int(s[:4]) - 1000
    mood = int(s[4])
    tense_num = int(s[5])
    person = int(s[6:8])

    if mood == 3:
        # The negative imperative is tense digit 8 (MTPP = 38PP). The old
        # check `person >= 800` could never fire on a two-digit person field,
        # so every imperative was reported as affirmative.
        tense_id = "imp_negativo" if tense_num == 8 else "imp_afirmativo"
    else:
        tense_id = CONJUGATO_TENSE_MAP.get((mood, tense_num))

    person_idx = person - 1 if 1 <= person <= 6 else None
    return verb_id, tense_id, person_idx


def assign_level(rank):
    """Map a frequency rank to a coarse level for verbs without a curated one."""
    if rank <= 25:
        return "basic"
    elif rank <= 100:
        return "elementary"
    elif rank <= 300:
        return "intermediate"
    elif rank <= 700:
        return "advanced"
    else:
        return "expert"


# ─── Step 1: Load ConjuGato verbs ───

def load_conjugato(db_path=CONJUGATO_DB):
    """Read verbs, irregular forms, spans and irregularity rows from ConjuGato.

    Returns a tuple (verbs, irregular_forms, irregular_spans, irregularity):
      verbs            dict Verb.Id -> verb record
      irregular_forms  dict VerbFormId -> form text
      irregular_spans  list of span dicts in IrregularSpan.Id order
      irregularity     dict keyed by the first column of Irregularity, holding
                       the remaining columns by name
    The connection is closed even if a query fails.
    """
    with closing(sqlite3.connect(db_path)) as conn:
        cursor = conn.cursor()

        # Verbs
        cursor.execute("SELECT Id, Rank, Ending, Reflexive, Spanish, English FROM Verb ORDER BY Rank")
        verbs = {}
        for vid, rank, ending, reflexive, spanish, english in cursor.fetchall():
            verbs[vid] = {
                "id": vid,
                "rank": rank,
                "ending": ENDING_CODES.get(ending, "ar"),
                "reflexive": reflexive,
                "infinitive": spanish,
                "english": english,
            }

        # Irregular verb forms
        cursor.execute("SELECT VerbFormId, Form FROM IrregularVerbForm ORDER BY VerbFormId")
        irregular_forms = dict(cursor.fetchall())

        # Irregular spans
        cursor.execute("SELECT Id, VerbFormId, Type, Pattern, Start, End FROM IrregularSpan ORDER BY Id")
        irregular_spans = [
            {
                "verbFormId": vfid,
                "type": stype,
                "pattern": pattern,
                "start": start,
                "end": end,
            }
            for _sid, vfid, stype, pattern, start, end in cursor.fetchall()
        ]

        # Irregularity bitmasks
        cursor.execute("SELECT * FROM Irregularity ORDER BY VerbId")
        columns = [d[0] for d in cursor.description]
        irregularity = {row[0]: dict(zip(columns[1:], row[1:])) for row in cursor.fetchall()}

    return verbs, irregular_forms, irregular_spans, irregularity


# ─── Step 2: Load Conjuu ES conjugations ───

def load_conjuu(vocab_path=CONJUU_VOCAB):
    """Read the Conjuu ES vocabulary CSV into a dict keyed by lower-cased verb.

    Columns used: 0 verb, 1 regularity, 2 Spanish tense name, 3-8 the six
    person forms, 9 English gloss, 13 rank (blank -> 99999). Rows with fewer
    than 14 columns (blank or truncated lines) and rows whose tense name is
    not in TENSE_LOOKUP are skipped. The English gloss and rank are taken from
    the first row seen for each verb.
    """
    conjuu_verbs = {}
    with open(vocab_path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if len(row) < 14:
                continue
            tense_id = TENSE_LOOKUP.get(row[2])
            if not tense_id:
                continue

            verb_name = row[0]
            key = verb_name.lower()
            if key not in conjuu_verbs:
                conjuu_verbs[key] = {
                    "infinitive": verb_name,
                    "english": row[9],
                    "rank": int(row[13]) if row[13] else 99999,
                    "tenses": {},
                }
            conjuu_verbs[key]["tenses"][tense_id] = {
                "regularity": row[1],
                "forms": row[3:9],  # yo, tú, él, nosotros, vosotros, ellos
            }
    return conjuu_verbs


# ─── Step 3: Load tense guides ───

def load_tense_guides(guide_path=CONJUU_GUIDE):
    """Return {guide name: {"Top": ..., "Bottom": ...}} from Guide.strings.

    The file is converted to XML with `plutil` and parsed with plistlib.
    Raises subprocess.CalledProcessError if plutil exits non-zero, instead of
    handing empty output to the plist parser.
    """
    result = subprocess.run(
        ['plutil', '-convert', 'xml1', '-o', '-', guide_path],
        capture_output=True,
        check=True,
    )
    guide_data = plistlib.loads(result.stdout)

    tense_guides = {}
    for key, value in guide_data.items():
        m = GUIDE_KEY_RE.match(key)
        if m:
            tense_guides.setdefault(m.group(1), {})[m.group(2)] = value
    return tense_guides


def _find_guide(tense, tense_guides):
    """Return the guide entry that best matches `tense`, or None.

    The mood prefix is removed from the Spanish tense name, then guide names
    are tried for an exact (or space-insensitive) match and finally for a
    case-insensitive substring match in either direction. The first hit in
    dict order wins.
    """
    guide_key = tense["spanish"]
    for prefix in ("Indicativo ", "Condicional ", "Subjuntivo ", "Imperativo "):
        guide_key = guide_key.replace(prefix, "")

    # Try exact match first, then various key patterns
    compact_key = guide_key.replace(" ", "")
    for gk, gv in tense_guides.items():
        if gk == guide_key or gk == tense["spanish"] or gk.replace(" ", "") == compact_key:
            return gv

    # Try partial match
    lowered_key = guide_key.lower()
    for gk, gv in tense_guides.items():
        if lowered_key in gk.lower() or gk.lower() in lowered_key:
            return gv
    return None


def build_guides(tense_guides):
    """Return one {"tenseId", "title", "body"} record per entry of TENSES.

    A tense without a matching guide gets its English name as the title and an
    empty body.
    """
    guides_output = []
    for t in TENSES:
        guide = _find_guide(t, tense_guides)
        guides_output.append({
            "tenseId": t["id"],
            "title": guide.get("Top", t["english"]) if guide else t["english"],
            "body": guide.get("Bottom", "") if guide else "",
        })
    return guides_output


# ─── Step 4: Load difficulty levels ───

def load_levels(directory=CONJUU_LEVELS):
    """Return {lower-cased verb: level id} from the Conjuu ES level CSVs.

    Files are read in LEVEL_FILES order, so a verb listed in several files
    keeps the level of the last one. Blank rows are ignored.
    """
    level_verbs = {}
    for level_id, filename in LEVEL_FILES:
        path = os.path.join(directory, filename)
        with open(path, 'r', encoding='utf-8', newline='') as f:
            for row in csv.reader(f):
                if row:
                    level_verbs[row[0].lower()] = level_id
    return level_verbs


# ─── Step 5: Merge everything ───

def build_tense_forms(vid, infinitive, ending, tid, irregular_forms, has_irregular):
    """Generate the six forms of one tense for a verb lacking Conjuu ES data.

    Returns (forms, regularity). Compound tenses are always built from the
    regular participle and reported as "ordinary". Simple tenses start from
    the regular forms; when `has_irregular` is true, any person that has an
    entry in `irregular_forms` is replaced by it and the whole tense is
    reported as "irregular".
    """
    if tid in HABER:
        participle = get_participle(infinitive, ending)
        return [f"{aux} {participle}" for aux in HABER[tid]], "ordinary"

    if tid in FUTURE_COND_ENDINGS:
        forms = conjugate_future_cond(infinitive, tid, ending)
    else:
        endings = ENDINGS.get(ending, {}).get(tid)
        if endings is None:
            forms = [""] * 6
        else:
            forms = _apply_endings(get_stem(infinitive, ending), endings)

    regularity = "ordinary"
    if has_irregular and tid in TENSE_CODES:
        for pi in range(6):
            vfid = encode_verb_form_id(vid, tid, pi)
            if vfid in irregular_forms:
                forms[pi] = irregular_forms[vfid]
                regularity = "irregular"
    return forms, regularity


def merge_verbs(conjugato_verbs, conjuu_verbs, level_verbs, irregular_forms, irregularity_data):
    """Build the unified verb list and the flat list of verb forms.

    Verbs are emitted in ConjuGato rank order. For each tense, Conjuu ES forms
    are used when that verb and tense are present there; otherwise forms are
    generated by build_tense_forms. Empty forms are not emitted.
    Returns (all_verbs, verb_forms).
    """
    all_verbs = []
    verb_forms = []

    for vid, cv in sorted(conjugato_verbs.items(), key=lambda item: item[1]["rank"]):
        infinitive = cv["infinitive"]
        inf_lower = infinitive.lower()
        ending = cv["ending"]
        rank = cv["rank"]

        # Check Conjuu ES for this verb
        conjuu = conjuu_verbs.get(inf_lower)

        all_verbs.append({
            "id": vid,
            "infinitive": infinitive,
            "english": cv["english"],
            "rank": rank,
            "ending": ending,
            "reflexive": cv["reflexive"],
            # Curated level when available, rank-derived level otherwise.
            "level": level_verbs.get(inf_lower, assign_level(rank)),
            "hasConjuuData": conjuu is not None,
        })

        has_irregular = vid in irregularity_data
        for tense in TENSES:
            tid = tense["id"]
            if conjuu and tid in conjuu["tenses"]:
                # Use Conjuu ES data (pre-computed)
                td = conjuu["tenses"][tid]
                forms, regularity = td["forms"], td["regularity"]
            else:
                # Generate from rules or ConjuGato irregular forms
                forms, regularity = build_tense_forms(
                    vid, infinitive, ending, tid, irregular_forms, has_irregular
                )

            for pi, form in enumerate(forms):
                if form:
                    verb_forms.append({
                        "verbId": vid,
                        "tenseId": tid,
                        "personIndex": pi,
                        "form": form,
                        "regularity": regularity,
                    })

    return all_verbs, verb_forms


def build_spans(irregular_spans, conjugato_verbs):
    """Return the spans whose VerbFormId decodes to a known verb/tense/person.

    Spans that fail to decode, use a skipped person slot, or reference a verb
    id missing from `conjugato_verbs` are dropped.
    """
    spans_output = []
    for span in irregular_spans:
        verb_id, tense_id, person_idx = decode_verb_form_id(span["verbFormId"])
        if verb_id is None or tense_id is None or person_idx is None:
            continue
        if verb_id not in conjugato_verbs:
            continue
        spans_output.append({
            "verbId": verb_id,
            "tenseId": tense_id,
            "personIndex": person_idx,
            "type": span["type"],
            "pattern": span["pattern"],
            "start": span["start"],
            "end": span["end"],
        })
    return spans_output


# ─── Step 6: Output ───

def write_output(output, output_path=OUTPUT):
    """Write the compact JSON plus a small pretty-printed "_debug" companion.

    The output directory is created if missing. Returns the size of the
    compact file in megabytes.
    """
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=None)

    # Also write a pretty version for debugging
    root, ext = os.path.splitext(output_path)
    all_verbs = output["verbs"]
    verb_forms = output["verbForms"]
    with open(root + '_debug' + ext, 'w', encoding='utf-8') as f:
        json.dump({
            "stats": {
                "verbs": len(all_verbs),
                "verbForms": len(verb_forms),
                "irregularSpans": len(output["irregularSpans"]),
                "tenseGuides": len(output["tenseGuides"]),
            },
            "sampleVerb": all_verbs[0] if all_verbs else None,
            "sampleForms": verb_forms[:20],
        }, f, ensure_ascii=False, indent=2)

    return os.path.getsize(output_path) / (1024 * 1024)


def main():
    """Run the full merge: load both sources, merge, and write the JSON."""
    print("Loading ConjuGato data...")
    conjugato_verbs, irregular_forms, irregular_spans, irregularity_data = load_conjugato()
    print(f" {len(conjugato_verbs)} verbs, {len(irregular_forms)} irregular forms, {len(irregular_spans)} spans")

    print("Loading Conjuu ES data...")
    conjuu_verbs = load_conjuu()
    print(f" {len(conjuu_verbs)} verbs with conjugations")

    print("Loading tense guides...")
    guides_output = build_guides(load_tense_guides())
    print(f" {len(guides_output)} tense guides")

    print("Loading difficulty levels...")
    level_verbs = load_levels()
    print(f" {len(level_verbs)} verbs with curated levels")

    print("Merging data...")
    all_verbs, verb_forms = merge_verbs(
        conjugato_verbs, conjuu_verbs, level_verbs, irregular_forms, irregularity_data
    )

    # Build spans referencing verb forms
    print("Processing irregular spans...")
    spans_output = build_spans(irregular_spans, conjugato_verbs)

    print("Writing output...")
    output = {
        "tenses": TENSES,
        "persons": PERSONS,
        "verbs": all_verbs,
        "verbForms": verb_forms,
        "irregularSpans": spans_output,
        "tenseGuides": guides_output,
    }
    file_size = write_output(output)

    print("\nDone!")
    print(f" Verbs: {len(all_verbs)}")
    print(f" Verb forms: {len(verb_forms)}")
    print(f" Irregular spans: {len(spans_output)}")
    print(f" Tense guides: {len(guides_output)}")
    print(f" Output: {OUTPUT} ({file_size:.1f} MB)")


if __name__ == "__main__":
    main()