# Spanish/Conjuga/Scripts/merge_data.py

#!/usr/bin/env python3
"""
Merge ConjuGato + Conjuu ES data into unified JSON for Conjuga app.

Sources:
- ConjuGato: 1,750 verbs (verb.md), irregular forms, spans, irregularity bitmasks
- Conjuu ES: 621 verbs with full conjugation tables, tense guides, conjugation rules

Output: conjuga_data.json with all verbs, forms, spans, guides
"""

import csv
import json
import re
import sqlite3
import os
import plistlib
import subprocess

# Root of the Spanish project checkout; used below only to build OUTPUT.
BASE = "/Users/treyt/Desktop/code/Spanish"
# SQLite database bundled inside the installed ConjuGato app (read in Step 1).
CONJUGATO_DB = "/Applications/ConjuGato.app/WrappedBundle/Verbs.sqlite"
# Conjuu ES resources: conjugation CSV (Step 2), tense guide strings (Step 3).
CONJUU_VOCAB = "/Applications/Conjuu ES.app/Contents/Resources/Vocabulary.csv"
CONJUU_GUIDE = "/Applications/Conjuu ES.app/Contents/Resources/en.lproj/Guide.strings"
# NOTE(review): CONJUU_RULES is not read anywhere in the visible script.
CONJUU_RULES = "/Applications/Conjuu ES.app/Contents/Resources/GuideTableEntries.plist"
# Directory holding the per-level CSV files joined with file names in Step 4.
CONJUU_LEVELS = "/Applications/Conjuu ES.app/Contents/Resources"
# Destination of the merged JSON; a "_debug" sibling is written next to it.
OUTPUT = os.path.join(BASE, "Conjuga", "Scripts", "conjuga_data.json")

# ─── Tense metadata ───
# One row per tense: (id, Spanish name, English name, mood).
# A row's position in this tuple becomes the tense's "order" value.
_TENSE_ROWS = (
    ("ind_presente", "Indicativo Presente", "Present", "Indicative"),
    ("ind_preterito", "Indicativo Pretérito", "Preterite", "Indicative"),
    ("ind_imperfecto", "Indicativo Imperfecto", "Imperfect", "Indicative"),
    ("ind_futuro", "Indicativo Futuro", "Future", "Indicative"),
    ("ind_perfecto", "Indicativo Perfecto", "Present Perfect", "Indicative"),
    ("ind_pluscuamperfecto", "Indicativo Pluscuamperfecto", "Pluperfect", "Indicative"),
    ("ind_futuro_perfecto", "Indicativo Futuro Perfecto", "Future Perfect", "Indicative"),
    ("ind_preterito_anterior", "Indicativo Pretérito Anterior", "Preterite Perfect", "Indicative"),
    ("cond_presente", "Condicional Presente", "Conditional", "Conditional"),
    ("cond_perfecto", "Condicional Perfecto", "Conditional Perfect", "Conditional"),
    ("subj_presente", "Subjuntivo Presente", "Present Subjunctive", "Subjunctive"),
    ("subj_imperfecto_1", "Subjuntivo Imperfecto I", "Past Subjunctive (ra)", "Subjunctive"),
    ("subj_imperfecto_2", "Subjuntivo Imperfecto II", "Past Subjunctive (se)", "Subjunctive"),
    ("subj_perfecto", "Subjuntivo Perfecto", "Subjunctive Perfect", "Subjunctive"),
    ("subj_pluscuamperfecto_1", "Subjuntivo Pluscuamperfecto I", "Subjunctive Pluperfect (ra)", "Subjunctive"),
    ("subj_pluscuamperfecto_2", "Subjuntivo Pluscuamperfecto II", "Subjunctive Pluperfect (se)", "Subjunctive"),
    ("subj_futuro", "Subjuntivo Futuro", "Subjunctive Future", "Subjunctive"),
    ("subj_futuro_perfecto", "Subjuntivo Futuro Perfecto", "Subjunctive Future Perfect", "Subjunctive"),
    ("imp_afirmativo", "Imperativo Afirmativo", "Imperative", "Imperative"),
    ("imp_negativo", "Imperativo Negativo", "Negative Imperative", "Imperative"),
)

# Tense records in display order; this list is written verbatim to the output.
TENSES = [
    {"id": tense_id, "spanish": spanish, "english": english, "mood": mood, "order": position}
    for position, (tense_id, spanish, english, mood) in enumerate(_TENSE_ROWS)
]

# Spanish tense name -> tense id.
TENSE_LOOKUP = {entry["spanish"]: entry["id"] for entry in TENSES}

# Grammatical person labels; the position in this list is the 0-5 person
# index that every six-element form list in this script follows.
PERSONS = ["yo", "tú", "él/ella/Ud.", "nosotros", "vosotros", "ellos/ellas/Uds."]

# Regular person endings for the simple tenses, keyed first by ending type
# ("ar" / "er" / "ir") and then by tense id. Each list has six entries in
# PERSONS order. The imperative rows hold an empty string in the first (yo)
# slot. The future and conditional rows include the infinitive vowel
# (e.g. "aré"), so they attach to the stem rather than to the full infinitive.
ENDINGS = {
    "ar": {
        "ind_presente": ["o", "as", "a", "amos", "áis", "an"],
        "ind_preterito": ["é", "aste", "ó", "amos", "asteis", "aron"],
        "ind_imperfecto": ["aba", "abas", "aba", "ábamos", "abais", "aban"],
        "ind_futuro": ["aré", "arás", "ará", "aremos", "aréis", "arán"],
        "cond_presente": ["aría", "arías", "aría", "aríamos", "aríais", "arían"],
        "subj_presente": ["e", "es", "e", "emos", "éis", "en"],
        "subj_imperfecto_1": ["ara", "aras", "ara", "áramos", "arais", "aran"],
        "subj_imperfecto_2": ["ase", "ases", "ase", "ásemos", "aseis", "asen"],
        "subj_futuro": ["are", "ares", "are", "áremos", "areis", "aren"],
        "imp_afirmativo": ["", "a", "e", "emos", "ad", "en"],
        "imp_negativo": ["", "es", "e", "emos", "éis", "en"],
    },
    "er": {
        "ind_presente": ["o", "es", "e", "emos", "éis", "en"],
        "ind_preterito": ["í", "iste", "ió", "imos", "isteis", "ieron"],
        "ind_imperfecto": ["ía", "ías", "ía", "íamos", "íais", "ían"],
        "ind_futuro": ["eré", "erás", "erá", "eremos", "eréis", "erán"],
        "cond_presente": ["ería", "erías", "ería", "eríamos", "eríais", "erían"],
        "subj_presente": ["a", "as", "a", "amos", "áis", "an"],
        "subj_imperfecto_1": ["iera", "ieras", "iera", "iéramos", "ierais", "ieran"],
        "subj_imperfecto_2": ["iese", "ieses", "iese", "iésemos", "ieseis", "iesen"],
        "subj_futuro": ["iere", "ieres", "iere", "iéremos", "iereis", "ieren"],
        "imp_afirmativo": ["", "e", "a", "amos", "ed", "an"],
        "imp_negativo": ["", "as", "a", "amos", "áis", "an"],
    },
    "ir": {
        "ind_presente": ["o", "es", "e", "imos", "ís", "en"],
        "ind_preterito": ["í", "iste", "ió", "imos", "isteis", "ieron"],
        "ind_imperfecto": ["ía", "ías", "ía", "íamos", "íais", "ían"],
        "ind_futuro": ["iré", "irás", "irá", "iremos", "iréis", "irán"],
        "cond_presente": ["iría", "irías", "iría", "iríamos", "iríais", "irían"],
        "subj_presente": ["a", "as", "a", "amos", "áis", "an"],
        "subj_imperfecto_1": ["iera", "ieras", "iera", "iéramos", "ierais", "ieran"],
        "subj_imperfecto_2": ["iese", "ieses", "iese", "iésemos", "ieseis", "iesen"],
        "subj_futuro": ["iere", "ieres", "iere", "iéremos", "iereis", "ieren"],
        "imp_afirmativo": ["", "e", "a", "amos", "id", "an"],
        "imp_negativo": ["", "as", "a", "amos", "áis", "an"],
    },
}

# Compound tenses: conjugated forms of the auxiliary "haber", keyed by tense
# id, six entries each in PERSONS order. A compound form is built as
# "<auxiliary> <participle>"; membership in this dict is also how the script
# tells compound tenses apart from simple ones.
HABER = {
    "ind_perfecto": ["he", "has", "ha", "hemos", "habéis", "han"],
    "ind_pluscuamperfecto": ["había", "habías", "había", "habíamos", "habíais", "habían"],
    "ind_futuro_perfecto": ["habré", "habrás", "habrá", "habremos", "habréis", "habrán"],
    "ind_preterito_anterior": ["hube", "hubiste", "hubo", "hubimos", "hubisteis", "hubieron"],
    "cond_perfecto": ["habría", "habrías", "habría", "habríamos", "habríais", "habrían"],
    "subj_perfecto": ["haya", "hayas", "haya", "hayamos", "hayáis", "hayan"],
    "subj_pluscuamperfecto_1": ["hubiera", "hubieras", "hubiera", "hubiéramos", "hubierais", "hubieran"],
    "subj_pluscuamperfecto_2": ["hubiese", "hubieses", "hubiese", "hubiésemos", "hubieseis", "hubiesen"],
    "subj_futuro_perfecto": ["hubiere", "hubieres", "hubiere", "hubiéremos", "hubiereis", "hubieren"],
}

def get_ending_type(infinitive):
    """Classify an infinitive as an "ar", "er" or "ir" verb.

    The check is case-insensitive. A trailing reflexive "se" is ignored, and
    infinitives written with an accented "í" ("oír", "reírse") count as "ir".

    Args:
        infinitive: Spanish infinitive, optionally reflexive.

    Returns:
        "ar", "er" or "ir". Anything unrecognised falls back to "ar".
    """
    core = infinitive.lower()
    # Strip the reflexive pronoun. "írse" must be listed explicitly: without it
    # "reírse" kept its "se" and fell through to the "ar" default.
    if core.endswith(("arse", "erse", "irse", "írse")):
        core = core[:-2]
    if core.endswith("ar"):
        return "ar"
    if core.endswith("er"):
        return "er"
    if core.endswith(("ir", "ír")):
        return "ir"
    return "ar"

def get_stem(infinitive, ending_type):
    """Return the lower-cased infinitive without its two-letter ending.

    A trailing "se" is removed first, then the last two characters are
    dropped. ``ending_type`` is accepted for interface compatibility but does
    not influence the result: every ending type removes the same two
    characters.
    """
    lowered = infinitive.lower()
    core = lowered[:-2] if lowered.endswith("se") else lowered
    return core[:-2]

def get_participle(infinitive, ending_type):
    """Build the regular past participle of ``infinitive``.

    -ar verbs take "-ado". -er/-ir verbs take "-ido", written "-ído" when the
    stem ends in a, e or o, because the stressed "i" then forms a hiatus that
    needs a written accent (leer -> leído, oír -> oído, caer -> caído).
    Stems ending in "u" keep the plain "-ido" (construir -> construido).

    Irregular participles (hecho, dicho, ...) are not handled here.

    Args:
        infinitive: Spanish infinitive, optionally reflexive.
        ending_type: "ar", "er" or "ir".

    Returns:
        The participle as a lower-case string.
    """
    stem = get_stem(infinitive, ending_type)
    if ending_type == "ar":
        return stem + "ado"
    # endswith() with a tuple is False for an empty stem ("ir" -> "ido").
    if stem.endswith(("a", "e", "o")):
        return stem + "ído"
    return stem + "ido"

def conjugate_regular(infinitive, tense_id, ending_type):
    """Conjugate ``infinitive`` as a fully regular verb in one tense.

    Compound tenses (keys of HABER) are "<haber form> <participle>". Simple
    tenses are stem + ENDINGS[ending_type][tense_id]. This includes future and
    conditional, whose ENDINGS rows already carry the infinitive vowel
    ("habl" + "aré"). The previous special case for those two tenses doubled
    letters ("hablarré", "hablararía") and left unreachable code after its
    return.

    Args:
        infinitive: Spanish infinitive, optionally reflexive.
        tense_id: One of the tense ids from TENSES.
        ending_type: "ar", "er" or "ir"; must be a key of ENDINGS.

    Returns:
        A list of six forms in PERSONS order. A slot is "" when the tense has
        no ending for that person (imperative "yo"). All six are "" when the
        tense id is unknown.
    """
    if tense_id in HABER:
        participle = get_participle(infinitive, ending_type)
        return [f"{aux} {participle}" for aux in HABER[tense_id]]

    endings = ENDINGS[ending_type].get(tense_id)
    if endings is None:
        return [""] * 6

    stem = get_stem(infinitive, ending_type)
    # An empty ending marks a person with no form; don't emit the bare stem.
    return [stem + ending if ending else "" for ending in endings]

def conjugate_future_cond(infinitive, tense_id, ending_type):
    """Conjugate the regular future or conditional of ``infinitive``.

    Both tenses append their endings to the whole infinitive (minus a
    reflexive "se"). Infinitives written with an accented "í" lose that
    accent, because the stress moves onto the ending: oír -> oiré,
    reír -> reiría.

    Args:
        infinitive: Spanish infinitive, optionally reflexive.
        tense_id: "ind_futuro" or "cond_presente".
        ending_type: Unused; kept so the signature matches the other helpers.

    Returns:
        A list of six forms in person order, or None for any other tense id.
    """
    endings_map = {
        "ind_futuro": ["é", "ás", "á", "emos", "éis", "án"],
        "cond_presente": ["ía", "ías", "ía", "íamos", "íais", "ían"],
    }
    endings = endings_map.get(tense_id)
    if endings is None:
        return None

    base = infinitive.lower()
    if base.endswith("se"):
        base = base[:-2]
    if base.endswith("ír"):
        # "oír" + "é" would otherwise give the misspelling "oíré".
        base = base[:-2] + "ir"
    return [base + ending for ending in endings]


# ─── Step 1: Load ConjuGato verbs ───
print("Loading ConjuGato data...")
# Numeric Verb.Ending code -> ending type; unknown codes fall back to "ar".
ending_map = {1: "ar", 2: "er", 4: "ir"}

conn = sqlite3.connect(CONJUGATO_DB)
try:
    cursor = conn.cursor()

    # Verbs, keyed by their ConjuGato id.
    cursor.execute("SELECT Id, Rank, Ending, Reflexive, Spanish, English FROM Verb ORDER BY Rank")
    conjugato_verbs = {
        vid: {
            "id": vid,
            "rank": rank,
            "ending": ending_map.get(ending, "ar"),
            "reflexive": reflexive,
            "infinitive": spanish,
            "english": english,
        }
        for vid, rank, ending, reflexive, spanish, english in cursor.fetchall()
    }

    # Irregular verb forms: VerbFormId -> form text.
    cursor.execute("SELECT VerbFormId, Form FROM IrregularVerbForm ORDER BY VerbFormId")
    irregular_forms = dict(cursor.fetchall())

    # Irregular spans, kept in Id order; the span's own Id is not carried over.
    cursor.execute("SELECT Id, VerbFormId, Type, Pattern, Start, End FROM IrregularSpan ORDER BY Id")
    irregular_spans = [
        {
            "verbFormId": vfid,
            "type": stype,
            "pattern": pattern,
            "start": start,
            "end": end,
        }
        for _sid, vfid, stype, pattern, start, end in cursor.fetchall()
    ]

    # Irregularity bitmasks: first column is the key, the remaining columns
    # become a {column name: value} dict.
    # NOTE(review): relies on VerbId being the first column of SELECT *.
    cursor.execute("SELECT * FROM Irregularity ORDER BY VerbId")
    irregularity_cols = [d[0] for d in cursor.description]
    irregularity_data = {
        row[0]: dict(zip(irregularity_cols[1:], row[1:]))
        for row in cursor.fetchall()
    }
finally:
    # Release the database handle even when one of the queries raises.
    conn.close()

print(f"  {len(conjugato_verbs)} verbs, {len(irregular_forms)} irregular forms, {len(irregular_spans)} spans")

# ─── Step 2: Load Conjuu ES conjugations ───
print("Loading Conjuu ES data...")
# lower-cased infinitive -> {"infinitive", "english", "rank", "tenses"}, where
# "tenses" maps tense id -> {"regularity", "forms"}.
conjuu_verbs = {}
# The data contains accented characters, so the encoding is stated explicitly
# rather than taken from the locale; newline='' is what the csv module asks for.
# NOTE(review): assumes Vocabulary.csv is UTF-8 — confirm against the file.
with open(CONJUU_VOCAB, 'r', encoding='utf-8', newline='') as f:
    for row in csv.reader(f):
        # Columns read below: 0 verb, 1 regularity, 2 Spanish tense name,
        # 3-8 six person forms, 9 English gloss, 13 rank.
        # Blank or truncated rows lack the columns up to the gloss: skip them
        # instead of failing with IndexError.
        if len(row) < 10:
            continue
        verb_name = row[0]
        tense_id = TENSE_LOOKUP.get(row[2])
        if not tense_id:
            continue  # header line or a tense this script does not model
        regularity = row[1]
        forms = row[3:9]  # yo, tú, él, nosotros, vosotros, ellos
        english = row[9]
        # Missing or empty rank sorts after every ranked verb.
        rank = int(row[13]) if len(row) > 13 and row[13] else 99999

        key = verb_name.lower()
        if key not in conjuu_verbs:
            # Verb-level fields come from the first row seen for that verb.
            conjuu_verbs[key] = {
                "infinitive": verb_name,
                "english": english,
                "rank": rank,
                "tenses": {},
            }
        conjuu_verbs[key]["tenses"][tense_id] = {
            "regularity": regularity,
            "forms": forms,
        }

print(f"  {len(conjuu_verbs)} verbs with conjugations")

# ─── Step 3: Load tense guides ───
print("Loading tense guides...")
# plutil (a macOS tool) rewrites the .strings file as an XML plist on stdout.
# check=True makes a failed conversion raise CalledProcessError here instead
# of surfacing later as a confusing plist parse error on empty output.
result = subprocess.run(
    ['plutil', '-convert', 'xml1', '-o', '-', CONJUU_GUIDE],
    capture_output=True,
    check=True,
)
guide_data = plistlib.loads(result.stdout)

# Group "LL<name>Guide(Top|Bottom)" entries as {<name>: {"Top": ..., "Bottom": ...}}.
tense_guides = {}
for key, value in guide_data.items():
    m = re.match(r'LL(.+)Guide(Top|Bottom)', key)
    if m:
        tense_guides.setdefault(m.group(1), {})[m.group(2)] = value


def _find_guide(tense):
    """Return the guide dict that best matches ``tense``, or None.

    Match order, strictest first:
      1. key equals the full Spanish name, ignoring spaces and case;
      2. key equals the name with the mood word removed (spaces optional);
      3. key and mood-less name contain one another, ignoring case.

    Step 1 runs first because the mood-less name is shared between moods
    ("Presente" is indicative, conditional and subjunctive), so steps 2-3
    return whichever matching key comes first in the file.
    NOTE(review): the key format of Guide.strings is not visible here —
    verify which step actually matches.
    """
    full_name = tense["spanish"]
    short_name = full_name
    for mood_prefix in ("Indicativo ", "Condicional ", "Subjuntivo ", "Imperativo "):
        short_name = short_name.replace(mood_prefix, "")

    full_compact = full_name.replace(" ", "").lower()
    for gk, gv in tense_guides.items():
        if gk.replace(" ", "").lower() == full_compact:
            return gv

    short_compact = short_name.replace(" ", "")
    for gk, gv in tense_guides.items():
        if gk == short_name or gk.replace(" ", "") == short_compact:
            return gv

    short_lower = short_name.lower()
    for gk, gv in tense_guides.items():
        gk_lower = gk.lower()
        if short_lower in gk_lower or gk_lower in short_lower:
            return gv
    return None


guides_output = []
for t in TENSES:
    guide = _find_guide(t)
    guides_output.append({
        "tenseId": t["id"],
        # Fall back to the English tense name / an empty body when no guide
        # (or no Top/Bottom part) was found.
        "title": guide.get("Top", t["english"]) if guide else t["english"],
        "body": guide.get("Bottom", "") if guide else "",
    })

print(f"  {len(guides_output)} tense guides")

# ─── Step 4: Load difficulty levels ───
print("Loading difficulty levels...")
# (level id, CSV file name inside CONJUU_LEVELS), read in this order.
level_files = [
    ("basic", "Basic.csv"),
    ("elementary_1", "Elementary-1.csv"),
    ("elementary_2", "Elementary-2.csv"),
    ("elementary_3", "Elementary-3.csv"),
    ("intermediate_1", "Intermediate-1.csv"),
    ("intermediate_2", "Intermediate-2.csv"),
    ("intermediate_3", "Intermediate-3.csv"),
    ("intermediate_4", "Intermediate-4.csv"),
]

# lower-cased verb (first CSV column) -> level id. A verb listed in several
# files keeps the level of the last file that mentions it.
level_verbs = {}
for level_id, filename in level_files:
    path = os.path.join(CONJUU_LEVELS, filename)
    # Explicit encoding because verb names may carry accents; newline='' is
    # what the csv module asks for.
    # NOTE(review): assumes the level files are UTF-8 — confirm.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            # csv.reader yields [] for blank lines; row[0] would raise.
            if not row or not row[0]:
                continue
            level_verbs[row[0].lower()] = level_id

print(f"  {len(level_verbs)} verbs with curated levels")

# ─── Step 5: Merge everything ───
print("Merging data...")

# ConjuGato packs verb, mood, tense and person into one 8-digit VerbFormId:
#   VerbFormId = (1000 + VerbId) * 10000 + MTPP
#   M  - mood digit: 1=Indicative, 2=Subjunctive, 3=Imperative
#   T  - tense digit within the mood
#   PP - two-digit person slot (01-08)
# Only the simple tenses listed here are mapped; compound tenses have no entry.
CONJUGATO_TENSE_MAP = {
    # (mood, tense) -> tense_id
    (1, 1): "ind_presente",
    (1, 2): "ind_preterito",
    (1, 3): "ind_imperfecto",
    (1, 6): "cond_presente",
    (1, 7): "ind_futuro",
    (2, 1): "subj_presente",
    (2, 3): "subj_imperfecto_1",
    (2, 4): "subj_imperfecto_2",
    (2, 7): "subj_futuro",
    (3, 0): "imp_afirmativo",  # person-specific
}

def decode_verb_form_id(vfid):
    """Decode a ConjuGato VerbFormId into (verb_id, tense_id, person_index).

    The id is read as eight decimal digits ``VVVVMTPP``: ``VVVV`` is
    1000 + verb id, ``M`` the mood, ``T`` the tense within the mood and ``PP``
    the person slot.

    For the imperative (M == 3) the tense digit separates the two forms:
    T == 8 is the negative imperative, anything else the affirmative. This
    matches the 3000 / 3800 offsets used when irregular imperative forms are
    looked up elsewhere in this script. The earlier test ``person >= 800``
    could never be true for a two-digit person, so negative-imperative ids
    were reported as affirmative.

    Args:
        vfid: The VerbFormId (int, or anything whose str() is the digits).

    Returns:
        (verb_id, tense_id, person_index). All three are None when ``vfid``
        is not exactly eight digits. ``tense_id`` is None for an unmapped
        mood/tense pair. ``person_index`` is 0-5, or None for person slots
        outside 1-6 (7 and 8 are the voseo slots, not exported).
    """
    s = str(vfid)
    if len(s) != 8 or not s.isdigit():
        return None, None, None

    verb_id = int(s[:4]) - 1000
    mood = int(s[4])
    tense_num = int(s[5])
    person = int(s[6:8])

    if mood == 3:
        tense_id = "imp_negativo" if tense_num == 8 else "imp_afirmativo"
    else:
        tense_id = CONJUGATO_TENSE_MAP.get((mood, tense_num))

    person_idx = person - 1 if 1 <= person <= 6 else None
    return verb_id, tense_id, person_idx


def assign_level(rank):
    """Map a frequency rank to a coarse difficulty label.

    Ranks up to 25 are "basic", up to 100 "elementary", up to 300
    "intermediate", up to 700 "advanced"; everything above is "expert".
    """
    brackets = (
        (25, "basic"),
        (100, "elementary"),
        (300, "intermediate"),
        (700, "advanced"),
    )
    for ceiling, label in brackets:
        if rank <= ceiling:
            return label
    return "expert"


# Build unified verb list
all_verbs = []
verb_forms = []
spans_output = []

# Person endings appended to the whole infinitive for future and conditional.
_FUTURE_COND_ENDINGS = {
    "ind_futuro": ["é", "ás", "á", "emos", "éis", "án"],
    "cond_presente": ["ía", "ías", "ía", "íamos", "íais", "ían"],
}

# (mood digit, tense digit) inside a ConjuGato VerbFormId for every simple
# tense whose irregular forms are looked up.
_CONJUGATO_FORM_CODES = {
    "ind_presente": (1, 1),
    "ind_preterito": (1, 2),
    "ind_imperfecto": (1, 3),
    "cond_presente": (1, 6),
    "ind_futuro": (1, 7),
    "subj_presente": (2, 1),
    "subj_imperfecto_1": (2, 3),
    "subj_imperfecto_2": (2, 4),
    "subj_futuro": (2, 7),
    "imp_afirmativo": (3, 0),
    "imp_negativo": (3, 8),
}


def _generate_forms(vid, infinitive, ending, tid):
    """Build one tense of a verb from rules plus ConjuGato overrides.

    Returns (forms, regularity): six forms in person order and "ordinary" or
    "irregular". A form is "" when the tense has no form for that person.
    """
    if tid in HABER:
        # Compound tense: auxiliary + participle.
        # NOTE: irregular participles are not looked up here.
        participle = get_participle(infinitive, ending)
        return [f"{aux} {participle}" for aux in HABER[tid]], "ordinary"

    if tid in _FUTURE_COND_ENDINGS:
        # Future/conditional use the full infinitive as stem.
        base = infinitive.lower()
        if base.endswith("se"):
            base = base[:-2]
        if base.endswith("ír"):
            # The stress moves to the ending, so the accent goes: oír -> oiré.
            base = base[:-2] + "ir"
        forms = [base + e for e in _FUTURE_COND_ENDINGS[tid]]
    else:
        stem = get_stem(infinitive, ending)
        endings = ENDINGS.get(ending, {}).get(tid)
        if endings is None:
            forms = [""] * 6
        else:
            # An empty ending (imperative "yo") means "no such form"; emitting
            # the bare stem would export a bogus word such as "habl".
            forms = [stem + e if e else "" for e in endings]

    regularity = "ordinary"
    codes = _CONJUGATO_FORM_CODES.get(tid)
    # Overrides apply only to verbs that have a row in the Irregularity table.
    if codes is not None and vid in irregularity_data:
        mood_digit, tense_digit = codes
        tense_base = (1000 + vid) * 10000 + mood_digit * 1000 + tense_digit * 100
        for pi in range(6):
            vfid = tense_base + (pi + 1)
            if vfid in irregular_forms:
                forms[pi] = irregular_forms[vfid]
                regularity = "irregular"
    return forms, regularity


for vid, cv in sorted(conjugato_verbs.items(), key=lambda x: x[1]["rank"]):
    infinitive = cv["infinitive"]
    inf_lower = infinitive.lower()
    ending = cv["ending"]
    rank = cv["rank"]

    # Check Conjuu ES for this verb
    conjuu = conjuu_verbs.get(inf_lower)

    # Curated level when one exists, otherwise a level derived from the rank.
    level = level_verbs.get(inf_lower, assign_level(rank))

    all_verbs.append({
        "id": vid,
        "infinitive": infinitive,
        "english": cv["english"],
        "rank": rank,
        "ending": ending,
        "reflexive": cv["reflexive"],
        "level": level,
        "hasConjuuData": conjuu is not None,
    })

    for tense in TENSES:
        tid = tense["id"]

        if conjuu and tid in conjuu["tenses"]:
            # Conjuu ES ships pre-computed forms; they take precedence.
            td = conjuu["tenses"][tid]
            forms = td["forms"]
            regularity = td["regularity"]
        else:
            forms, regularity = _generate_forms(vid, infinitive, ending, tid)

        # Empty slots are not exported.
        for pi, form in enumerate(forms):
            if form:
                verb_forms.append({
                    "verbId": vid,
                    "tenseId": tid,
                    "personIndex": pi,
                    "form": form,
                    "regularity": regularity,
                })

# Translate each ConjuGato span's VerbFormId into (verb, tense, person) and
# keep only spans that decode completely and belong to a loaded verb.
print("Processing irregular spans...")
for raw_span in irregular_spans:
    decoded = decode_verb_form_id(raw_span["verbFormId"])
    span_verb, span_tense, span_person = decoded
    if any(part is None for part in decoded) or span_verb not in conjugato_verbs:
        continue
    exported_span = {
        "verbId": span_verb,
        "tenseId": span_tense,
        "personIndex": span_person,
    }
    for field in ("type", "pattern", "start", "end"):
        exported_span[field] = raw_span[field]
    spans_output.append(exported_span)

# ─── Step 6: Output ───
print("Writing output...")
output = dict(
    tenses=TENSES,
    persons=PERSONS,
    verbs=all_verbs,
    verbForms=verb_forms,
    irregularSpans=spans_output,
    tenseGuides=guides_output,
)

# Main payload: compact, single-line JSON with non-ASCII characters kept as-is.
with open(OUTPUT, 'w', encoding='utf-8') as compact_file:
    json.dump(output, compact_file, ensure_ascii=False, indent=None)

# Also write a pretty version for debugging: counts plus a small sample.
debug_path = OUTPUT.replace('.json', '_debug.json')
debug_payload = {
    "stats": {
        "verbs": len(all_verbs),
        "verbForms": len(verb_forms),
        "irregularSpans": len(spans_output),
        "tenseGuides": len(guides_output),
    },
    "sampleVerb": all_verbs[0] if all_verbs else None,
    "sampleForms": verb_forms[:20],
}
with open(debug_path, 'w', encoding='utf-8') as debug_file:
    json.dump(debug_payload, debug_file, ensure_ascii=False, indent=2)

# Size of the main payload in MiB, for the summary below.
file_size = os.path.getsize(OUTPUT) / (1024 * 1024)
summary_lines = (
    "\nDone!",
    f"  Verbs: {len(all_verbs)}",
    f"  Verb forms: {len(verb_forms)}",
    f"  Irregular spans: {len(spans_output)}",
    f"  Tense guides: {len(guides_output)}",
    f"  Output: {OUTPUT} ({file_size:.1f} MB)",
)
for summary_line in summary_lines:
    print(summary_line)