Includes SwiftData dual-store architecture (local reference + CloudKit user data), JSON-based data seeding, 20 tense guides, 20 grammar notes, SRS review system, course vocabulary, and widget support. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
551 lines
22 KiB
Python
551 lines
22 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Merge ConjuGato + Conjuu ES data into unified JSON for Conjuga app.
|
|
|
|
Sources:
|
|
- ConjuGato: 1,750 verbs (verb.md), irregular forms, spans, irregularity bitmasks
|
|
- Conjuu ES: 621 verbs with full conjugation tables, tense guides, conjugation rules
|
|
|
|
Output: conjuga_data.json with all verbs, forms, spans, guides
|
|
"""
|
|
|
|
import csv
|
|
import json
|
|
import re
|
|
import sqlite3
|
|
import os
|
|
import plistlib
|
|
import subprocess
|
|
|
|
# Root of the project checkout; the generated JSON is written beneath it.
BASE = "/Users/treyt/Desktop/code/Spanish"

# SQLite database read in Step 1.
CONJUGATO_DB = "/Applications/ConjuGato.app/WrappedBundle/Verbs.sqlite"

# All Conjuu ES inputs live under one Resources directory; the per-level CSV
# files are looked up directly inside it.
CONJUU_LEVELS = "/Applications/Conjuu ES.app/Contents/Resources"
CONJUU_VOCAB = f"{CONJUU_LEVELS}/Vocabulary.csv"
CONJUU_GUIDE = f"{CONJUU_LEVELS}/en.lproj/Guide.strings"
CONJUU_RULES = f"{CONJUU_LEVELS}/GuideTableEntries.plist"

# Destination of the merged data set.
OUTPUT = os.path.join(BASE, "Conjuga", "Scripts", "conjuga_data.json")
|
|
|
|
# ─── Tense metadata ───
# One row per tense: (id, Spanish name, English name, mood).  The position of
# a row in this table becomes the tense's "order" value.
_TENSE_ROWS = [
    ("ind_presente", "Indicativo Presente", "Present", "Indicative"),
    ("ind_preterito", "Indicativo Pretérito", "Preterite", "Indicative"),
    ("ind_imperfecto", "Indicativo Imperfecto", "Imperfect", "Indicative"),
    ("ind_futuro", "Indicativo Futuro", "Future", "Indicative"),
    ("ind_perfecto", "Indicativo Perfecto", "Present Perfect", "Indicative"),
    ("ind_pluscuamperfecto", "Indicativo Pluscuamperfecto", "Pluperfect", "Indicative"),
    ("ind_futuro_perfecto", "Indicativo Futuro Perfecto", "Future Perfect", "Indicative"),
    ("ind_preterito_anterior", "Indicativo Pretérito Anterior", "Preterite Perfect", "Indicative"),
    ("cond_presente", "Condicional Presente", "Conditional", "Conditional"),
    ("cond_perfecto", "Condicional Perfecto", "Conditional Perfect", "Conditional"),
    ("subj_presente", "Subjuntivo Presente", "Present Subjunctive", "Subjunctive"),
    ("subj_imperfecto_1", "Subjuntivo Imperfecto I", "Past Subjunctive (ra)", "Subjunctive"),
    ("subj_imperfecto_2", "Subjuntivo Imperfecto II", "Past Subjunctive (se)", "Subjunctive"),
    ("subj_perfecto", "Subjuntivo Perfecto", "Subjunctive Perfect", "Subjunctive"),
    ("subj_pluscuamperfecto_1", "Subjuntivo Pluscuamperfecto I", "Subjunctive Pluperfect (ra)", "Subjunctive"),
    ("subj_pluscuamperfecto_2", "Subjuntivo Pluscuamperfecto II", "Subjunctive Pluperfect (se)", "Subjunctive"),
    ("subj_futuro", "Subjuntivo Futuro", "Subjunctive Future", "Subjunctive"),
    ("subj_futuro_perfecto", "Subjuntivo Futuro Perfecto", "Subjunctive Future Perfect", "Subjunctive"),
    ("imp_afirmativo", "Imperativo Afirmativo", "Imperative", "Imperative"),
    ("imp_negativo", "Imperativo Negativo", "Negative Imperative", "Imperative"),
]

# Expanded into the dict form that is written to the output JSON.
TENSES = [
    {"id": tense_id, "spanish": spanish, "english": english, "mood": mood, "order": order}
    for order, (tense_id, spanish, english, mood) in enumerate(_TENSE_ROWS)
]
|
|
|
|
# Spanish tense name -> tense id; used to translate the tense column of the
# Conjuu ES vocabulary CSV.
TENSE_LOOKUP = {tense["spanish"]: tense["id"] for tense in TENSES}

# Person labels, in the same order as the six forms stored for every tense.
PERSONS = ["yo", "tú", "él/ella/Ud.", "nosotros", "vosotros", "ellos/ellas/Uds."]
|
|
|
|
# Regular endings for the simple tenses, one table per conjugation class.
# Each list holds six endings in PERSONS order; they are appended to the stem
# (infinitive minus -ar/-er/-ir).  The imperatives have no "yo" form, hence
# the empty first entry.
_AR_ENDINGS = {
    "ind_presente": ["o", "as", "a", "amos", "áis", "an"],
    "ind_preterito": ["é", "aste", "ó", "amos", "asteis", "aron"],
    "ind_imperfecto": ["aba", "abas", "aba", "ábamos", "abais", "aban"],
    "ind_futuro": ["aré", "arás", "ará", "aremos", "aréis", "arán"],
    "cond_presente": ["aría", "arías", "aría", "aríamos", "aríais", "arían"],
    "subj_presente": ["e", "es", "e", "emos", "éis", "en"],
    "subj_imperfecto_1": ["ara", "aras", "ara", "áramos", "arais", "aran"],
    "subj_imperfecto_2": ["ase", "ases", "ase", "ásemos", "aseis", "asen"],
    "subj_futuro": ["are", "ares", "are", "áremos", "areis", "aren"],
    "imp_afirmativo": ["", "a", "e", "emos", "ad", "en"],
    "imp_negativo": ["", "es", "e", "emos", "éis", "en"],
}

_ER_ENDINGS = {
    "ind_presente": ["o", "es", "e", "emos", "éis", "en"],
    "ind_preterito": ["í", "iste", "ió", "imos", "isteis", "ieron"],
    "ind_imperfecto": ["ía", "ías", "ía", "íamos", "íais", "ían"],
    "ind_futuro": ["eré", "erás", "erá", "eremos", "eréis", "erán"],
    "cond_presente": ["ería", "erías", "ería", "eríamos", "eríais", "erían"],
    "subj_presente": ["a", "as", "a", "amos", "áis", "an"],
    "subj_imperfecto_1": ["iera", "ieras", "iera", "iéramos", "ierais", "ieran"],
    "subj_imperfecto_2": ["iese", "ieses", "iese", "iésemos", "ieseis", "iesen"],
    "subj_futuro": ["iere", "ieres", "iere", "iéremos", "iereis", "ieren"],
    "imp_afirmativo": ["", "e", "a", "amos", "ed", "an"],
    "imp_negativo": ["", "as", "a", "amos", "áis", "an"],
}

_IR_ENDINGS = {
    "ind_presente": ["o", "es", "e", "imos", "ís", "en"],
    "ind_preterito": ["í", "iste", "ió", "imos", "isteis", "ieron"],
    "ind_imperfecto": ["ía", "ías", "ía", "íamos", "íais", "ían"],
    "ind_futuro": ["iré", "irás", "irá", "iremos", "iréis", "irán"],
    "cond_presente": ["iría", "irías", "iría", "iríamos", "iríais", "irían"],
    "subj_presente": ["a", "as", "a", "amos", "áis", "an"],
    "subj_imperfecto_1": ["iera", "ieras", "iera", "iéramos", "ierais", "ieran"],
    "subj_imperfecto_2": ["iese", "ieses", "iese", "iésemos", "ieseis", "iesen"],
    "subj_futuro": ["iere", "ieres", "iere", "iéremos", "iereis", "ieren"],
    "imp_afirmativo": ["", "e", "a", "amos", "id", "an"],
    "imp_negativo": ["", "as", "a", "amos", "áis", "an"],
}

# Conjugation class -> tense id -> six endings.
ENDINGS = {
    "ar": _AR_ENDINGS,
    "er": _ER_ENDINGS,
    "ir": _IR_ENDINGS,
}
|
|
|
|
# Compound tenses: forms of the auxiliary "haber", keyed by tense id.
# Each list is in PERSONS order; a compound form is "<auxiliary> <participle>".
HABER = {
    # Indicative
    "ind_perfecto": ["he", "has", "ha", "hemos", "habéis", "han"],
    "ind_pluscuamperfecto": ["había", "habías", "había", "habíamos", "habíais", "habían"],
    "ind_futuro_perfecto": ["habré", "habrás", "habrá", "habremos", "habréis", "habrán"],
    "ind_preterito_anterior": ["hube", "hubiste", "hubo", "hubimos", "hubisteis", "hubieron"],
    # Conditional
    "cond_perfecto": ["habría", "habrías", "habría", "habríamos", "habríais", "habrían"],
    # Subjunctive
    "subj_perfecto": ["haya", "hayas", "haya", "hayamos", "hayáis", "hayan"],
    "subj_pluscuamperfecto_1": ["hubiera", "hubieras", "hubiera", "hubiéramos", "hubierais", "hubieran"],
    "subj_pluscuamperfecto_2": ["hubiese", "hubieses", "hubiese", "hubiésemos", "hubieseis", "hubiesen"],
    "subj_futuro_perfecto": ["hubiere", "hubieres", "hubiere", "hubiéremos", "hubiereis", "hubieren"],
}
|
|
|
|
def get_ending_type(infinitive):
    """Return the conjugation class ("ar", "er" or "ir") of an infinitive.

    The check is case-insensitive.  Reflexive infinitives are classified by
    the verb underneath the trailing "se"; this includes accented "-írse"
    verbs such as "reírse", which previously fell through to the default.
    Accented "-ír" infinitives count as "ir".

    Args:
        infinitive: Spanish infinitive, optionally reflexive.

    Returns:
        "ar", "er" or "ir".  Anything that matches none of the known endings
        is reported as "ar".
    """
    core = infinitive.lower()
    # Strip the reflexive pronoun so the class is read from the verb itself.
    if core.endswith(("arse", "erse", "irse", "írse")):
        core = core[:-2]

    if core.endswith("ar"):
        return "ar"
    if core.endswith("er"):
        return "er"
    if core.endswith(("ir", "ír")):
        return "ir"
    return "ar"
|
|
|
|
def get_stem(infinitive, ending_type):
    """Return the stem of an infinitive: lower-cased, without "se" and ending.

    Args:
        infinitive: Spanish infinitive, optionally reflexive.
        ending_type: Conjugation class of the verb.  It does not influence
            the result: the last two characters are removed whatever the
            class is.

    Returns:
        The lower-cased infinitive with a trailing "se" (if present) and then
        its final two characters removed.
    """
    core = infinitive.lower()
    if core.endswith("se"):
        core = core[: -len("se")]
    # -ar, -er, -ir and -ír are all two characters long.
    return core[:-2]
|
|
|
|
def get_participle(infinitive, ending_type):
    """Return the regular past participle of an infinitive.

    -ar verbs take "-ado"; -er/-ir verbs take "-ido".  When an -er/-ir stem
    ends in a, e or o the "i" carries a written accent (leer -> leído,
    oír -> oído, reír -> reído); stems ending in "u" keep plain "-ido"
    (construir -> construido).  Irregular participles such as "abierto" or
    "escrito" are not handled.

    Args:
        infinitive: Spanish infinitive, optionally reflexive.
        ending_type: "ar", "er" or "ir".

    Returns:
        The participle built from the stem returned by get_stem().
    """
    stem = get_stem(infinitive, ending_type)
    if ending_type == "ar":
        return stem + "ado"
    # A strong vowel before the stressed "i" forms a hiatus, which is marked
    # with an accent.
    if stem.endswith(("a", "e", "o")):
        return stem + "ído"
    return stem + "ido"
|
|
|
|
def conjugate_regular(infinitive, tense_id, ending_type):
    """Return the six regular forms of a verb for one tense.

    Compound tenses (the keys of HABER) are built as "<haber form>
    <participle>".  Every other tense is the stem plus the endings from
    ENDINGS.  The future and conditional endings in ENDINGS already include
    the theme vowel ("aré", "ería", ...), so they go through the same path;
    the earlier special case for these two tenses produced forms such as
    "hablarré" and "hablararía".

    Args:
        infinitive: Spanish infinitive, optionally reflexive.
        tense_id: Tense id as used in TENSES.
        ending_type: "ar", "er" or "ir".

    Returns:
        A list of six strings in PERSONS order; six empty strings when the
        tense has no endings for this class.

    Raises:
        KeyError: if a simple tense is requested for an ending_type that is
            not a key of ENDINGS.
    """
    if tense_id in HABER:
        participle = get_participle(infinitive, ending_type)
        return [f"{aux} {participle}" for aux in HABER[tense_id]]

    endings = ENDINGS[ending_type].get(tense_id)
    if endings is None:
        return [""] * 6

    stem = get_stem(infinitive, ending_type)
    return [stem + ending for ending in endings]
|
|
|
|
def conjugate_future_cond(infinitive, tense_id, ending_type):
    """Build the regular future or conditional forms of a verb.

    Both tenses attach their endings to the whole infinitive (lower-cased,
    with a trailing "se" removed) rather than to the stem.

    Args:
        infinitive: Spanish infinitive, optionally reflexive.
        tense_id: "ind_futuro" or "cond_presente".
        ending_type: Not used; the endings are the same for every class.

    Returns:
        A list of six forms in PERSONS order, or None for any other tense id.
    """
    root = infinitive.lower()
    if root.endswith("se"):
        root = root[:-2]

    if tense_id == "ind_futuro":
        suffixes = ("é", "ás", "á", "emos", "éis", "án")
    elif tense_id == "cond_presente":
        suffixes = ("ía", "ías", "ía", "íamos", "íais", "ían")
    else:
        return None

    return [root + suffix for suffix in suffixes]
|
|
|
|
|
|
# ─── Step 1: Load ConjuGato verbs ───
print("Loading ConjuGato data...")

# Numeric ending code stored in Verb.Ending -> conjugation class.  Codes that
# are not listed fall back to "ar" when the verbs are read below.
ending_map = {1: "ar", 2: "er", 4: "ir"}

conn = sqlite3.connect(CONJUGATO_DB)
try:
    cursor = conn.cursor()

    # Verbs, keyed by their ConjuGato id.
    cursor.execute("SELECT Id, Rank, Ending, Reflexive, Spanish, English FROM Verb ORDER BY Rank")
    conjugato_verbs = {}
    for vid, rank, ending, reflexive, spanish, english in cursor.fetchall():
        conjugato_verbs[vid] = {
            "id": vid,
            "rank": rank,
            "ending": ending_map.get(ending, "ar"),
            "reflexive": reflexive,
            "infinitive": spanish,
            "english": english,
        }

    # Irregular verb forms: VerbFormId -> form text.
    cursor.execute("SELECT VerbFormId, Form FROM IrregularVerbForm ORDER BY VerbFormId")
    irregular_forms = dict(cursor.fetchall())

    # Irregular spans, kept in Id order; the Id itself is not exported.
    cursor.execute("SELECT Id, VerbFormId, Type, Pattern, Start, End FROM IrregularSpan ORDER BY Id")
    irregular_spans = [
        {
            "verbFormId": vfid,
            "type": stype,
            "pattern": pattern,
            "start": start,
            "end": end,
        }
        for _sid, vfid, stype, pattern, start, end in cursor.fetchall()
    ]

    # Irregularity bitmasks: verb id -> {column name: value}.
    # NOTE(review): takes the first column of Irregularity to be the verb id
    # (the query orders by VerbId) — confirm against the schema.
    cursor.execute("SELECT * FROM Irregularity ORDER BY VerbId")
    irregularity_cols = [d[0] for d in cursor.description]
    irregularity_data = {
        row[0]: dict(zip(irregularity_cols[1:], row[1:]))
        for row in cursor.fetchall()
    }
finally:
    # Release the database handle even when one of the queries fails.
    conn.close()

print(f" {len(conjugato_verbs)} verbs, {len(irregular_forms)} irregular forms, {len(irregular_spans)} spans")
|
|
|
|
# ─── Step 2: Load Conjuu ES conjugations ───
print("Loading Conjuu ES data...")

# Lower-cased infinitive -> {"infinitive", "english", "rank", "tenses"}, where
# "tenses" maps a tense id to {"regularity", "forms"}.
conjuu_verbs = {}

# The file holds accented Spanish text, so the encoding is pinned instead of
# being left to the locale (assumes the CSV is UTF-8 — TODO confirm).
# newline="" is what the csv module asks for when reading.
with open(CONJUU_VOCAB, 'r', encoding='utf-8', newline='') as f:
    for row in csv.reader(f):
        if not row:
            # csv.reader yields an empty list for a blank line.
            continue
        verb_name = row[0]
        tense_spanish = row[2]
        tense_id = TENSE_LOOKUP.get(tense_spanish)
        if not tense_id:
            # Not one of the known tenses (this also skips any header row).
            continue
        regularity = row[1]
        forms = row[3:9]  # yo, tú, él, nosotros, vosotros, ellos
        english = row[9]
        rank = int(row[13]) if row[13] else 99999

        key = verb_name.lower()
        # Verb-level fields come from the first row seen for that verb.
        entry = conjuu_verbs.setdefault(key, {
            "infinitive": verb_name,
            "english": english,
            "rank": rank,
            "tenses": {},
        })
        entry["tenses"][tense_id] = {
            "regularity": regularity,
            "forms": forms,
        }

print(f" {len(conjuu_verbs)} verbs with conjugations")
|
|
|
|
# ─── Step 3: Load tense guides ───
print("Loading tense guides...")

# Guide.strings is converted to an XML plist by plutil and parsed from its
# stdout.  check=True makes a failed conversion raise CalledProcessError
# instead of passing empty output on to plistlib.
result = subprocess.run(
    ['plutil', '-convert', 'xml1', '-o', '-', CONJUU_GUIDE],
    capture_output=True,
    check=True,
)
guide_data = plistlib.loads(result.stdout)

# Keys of the form "LL<name>GuideTop" / "LL<name>GuideBottom" are grouped into
# {name: {"Top": ..., "Bottom": ...}}.
tense_guides = {}
for key, value in guide_data.items():
    m = re.match(r'LL(.+)Guide(Top|Bottom)', key)
    if m:
        tense_guides.setdefault(m.group(1), {})[m.group(2)] = value

_MOOD_PREFIXES = ("Indicativo ", "Condicional ", "Subjuntivo ", "Imperativo ")


def _find_guide(tense):
    """Return the guide dict matching *tense*, or None when nothing matches."""
    guide_key = tense["spanish"]
    for prefix in _MOOD_PREFIXES:
        guide_key = guide_key.replace(prefix, "")

    # First pass: exact name, full Spanish name, or equal ignoring spaces.
    for gk, gv in tense_guides.items():
        if gk == guide_key or gk == tense["spanish"] or gk.replace(" ", "") == guide_key.replace(" ", ""):
            return gv

    # Second pass: substring match in either direction, case-insensitive.
    # NOTE(review): the first hit wins, so a short key such as "Perfecto" can
    # match a longer guide name like "Pluscuamperfecto"; check the generated
    # guides against the real Guide.strings keys.
    for gk, gv in tense_guides.items():
        if guide_key.lower() in gk.lower() or gk.lower() in guide_key.lower():
            return gv
    return None


guides_output = []
for t in TENSES:
    guide = _find_guide(t)
    guides_output.append({
        "tenseId": t["id"],
        # Fall back to the English tense name / an empty body when there is
        # no guide or the guide lacks that half.
        "title": guide.get("Top", t["english"]) if guide else t["english"],
        "body": guide.get("Bottom", "") if guide else "",
    })

print(f" {len(guides_output)} tense guides")
|
|
|
|
# ─── Step 4: Load difficulty levels ───
print("Loading difficulty levels...")

# (level id, CSV file name inside CONJUU_LEVELS), read in this order.
level_files = [
    ("basic", "Basic.csv"),
    ("elementary_1", "Elementary-1.csv"),
    ("elementary_2", "Elementary-2.csv"),
    ("elementary_3", "Elementary-3.csv"),
    ("intermediate_1", "Intermediate-1.csv"),
    ("intermediate_2", "Intermediate-2.csv"),
    ("intermediate_3", "Intermediate-3.csv"),
    ("intermediate_4", "Intermediate-4.csv"),
]

# Lower-cased value of the first CSV column -> level id.  A verb listed in
# several files keeps the level of the last file that mentions it.
level_verbs = {}
for level_id, filename in level_files:
    path = os.path.join(CONJUU_LEVELS, filename)
    # Explicit encoding for accented verbs (assumes UTF-8 — TODO confirm);
    # newline="" is what the csv module asks for when reading.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if row:  # a blank line comes back as an empty list
                level_verbs[row[0].lower()] = level_id

print(f" {len(level_verbs)} verbs with curated levels")
|
|
|
|
# ─── Step 5: Merge everything ───
print("Merging data...")

# ConjuGato VerbFormId encoding:
#   VerbFormId = (1000 + VerbId) * 10000 + MTPP
#   M  = mood (1 = Indicative, 2 = Subjunctive, 3 = Imperative)
#   T  = tense within the mood
#   PP = person (01-08)
# The table below maps (M, T) to this script's tense ids.
CONJUGATO_TENSE_MAP = {
    # Indicative (the conditional is stored under mood 1)
    (1, 1): "ind_presente",
    (1, 2): "ind_preterito",
    (1, 3): "ind_imperfecto",
    (1, 6): "cond_presente",
    (1, 7): "ind_futuro",
    # Subjunctive
    (2, 1): "subj_presente",
    (2, 3): "subj_imperfecto_1",
    (2, 4): "subj_imperfecto_2",
    (2, 7): "subj_futuro",
    # Imperative
    (3, 0): "imp_afirmativo",  # person-specific
}
|
|
|
|
def decode_verb_form_id(vfid):
    """Decode a ConjuGato VerbFormId into (verb_id, tense_id, person_index).

    The id is read as eight decimal digits VVVVMTPP, where VVVV is
    1000 + verb id, M the mood, T the tense within the mood and PP the
    person.

    Imperative ids (M == 3) do not go through CONJUGATO_TENSE_MAP.  Negative
    imperatives sit at offset 3800, i.e. T >= 8, and everything else is
    affirmative.  The earlier check compared the two-digit person field with
    800, which can never be true, so negative imperatives were reported as
    affirmative.

    Args:
        vfid: Integer VerbFormId.

    Returns:
        (verb_id, tense_id, person_index).  All three are None when the id
        does not have eight digits.  tense_id is None for an unmapped
        mood/tense pair.  person_index is 0-5 for persons 1-6 and None for
        anything else (persons 7 and 8 are the vos/voseo forms, which are
        skipped for now).
    """
    digits = str(vfid)
    if len(digits) != 8:
        return None, None, None

    verb_id = int(digits[:4]) - 1000
    mood = int(digits[4])
    tense_num = int(digits[5])
    person = int(digits[6:8])

    if mood == 3:
        tense_id = "imp_negativo" if tense_num >= 8 else "imp_afirmativo"
    else:
        tense_id = CONJUGATO_TENSE_MAP.get((mood, tense_num))

    person_idx = person - 1 if 1 <= person <= 6 else None
    return verb_id, tense_id, person_idx
|
|
|
|
|
|
def assign_level(rank):
    """Map a frequency rank to a difficulty level name.

    Args:
        rank: Verb rank; lower numbers get easier levels.

    Returns:
        "basic" (rank <= 25), "elementary" (<= 100), "intermediate" (<= 300),
        "advanced" (<= 700) or "expert" (anything above).
    """
    # Inclusive upper bound of each level, checked in ascending order.
    for ceiling, label in (
        (25, "basic"),
        (100, "elementary"),
        (300, "intermediate"),
        (700, "advanced"),
    ):
        if rank <= ceiling:
            return label
    return "expert"
|
|
|
|
|
|
# Build unified verb list
all_verbs = []
verb_forms = []
spans_output = []

# Where each simple tense sits inside a ConjuGato VerbFormId: the
# M * 1000 + T * 100 part of (1000 + VerbId) * 10000 + MTPP.  Compound tenses
# have no entry, so they never receive irregular overrides.
_IRREGULAR_TENSE_OFFSETS = {
    "ind_presente": 1100,
    "ind_preterito": 1200,
    "ind_imperfecto": 1300,
    "cond_presente": 1600,
    "ind_futuro": 1700,
    "subj_presente": 2100,
    "subj_imperfecto_1": 2300,
    "subj_imperfecto_2": 2400,
    "subj_futuro": 2700,
    "imp_afirmativo": 3000,
    "imp_negativo": 3800,
}


def _apply_irregular_forms(verb_id, tense_id, forms):
    """Overwrite entries of *forms* in place with ConjuGato irregular forms.

    Returns True when at least one of the six persons was replaced.
    """
    offset = _IRREGULAR_TENSE_OFFSETS.get(tense_id)
    if offset is None:
        return False
    first_person_vfid = (1000 + verb_id) * 10000 + offset + 1
    replaced = False
    for pi in range(6):
        vfid = first_person_vfid + pi
        if vfid in irregular_forms:
            forms[pi] = irregular_forms[vfid]
            replaced = True
    return replaced


for vid, cv in sorted(conjugato_verbs.items(), key=lambda x: x[1]["rank"]):
    infinitive = cv["infinitive"]
    inf_lower = infinitive.lower()
    ending = cv["ending"]
    rank = cv["rank"]

    # Conjuu ES entry for this verb, if it has one.
    conjuu = conjuu_verbs.get(inf_lower)

    # A curated level wins; otherwise the level is derived from the rank.
    level = level_verbs.get(inf_lower, assign_level(rank))

    all_verbs.append({
        "id": vid,
        "infinitive": infinitive,
        "english": cv["english"],
        "rank": rank,
        "ending": ending,
        "reflexive": cv["reflexive"],
        "level": level,
        "hasConjuuData": conjuu is not None,
    })

    # Only verbs with a row in the Irregularity table get overrides.
    has_irregular = vid in irregularity_data

    # Generate forms for each tense
    for tense in TENSES:
        tid = tense["id"]

        if conjuu and tid in conjuu["tenses"]:
            # Use Conjuu ES data (pre-computed)
            td = conjuu["tenses"][tid]
            forms = td["forms"]
            regularity = td["regularity"]
        else:
            # Generate from rules, then patch in ConjuGato irregular forms.
            regularity = "ordinary"

            if tid in HABER:
                # Compound tense: auxiliary + regular participle.  Irregular
                # participles are not looked up.
                participle = get_participle(infinitive, ending)
                forms = [f"{aux} {participle}" for aux in HABER[tid]]
            else:
                # Future/conditional build on the full infinitive; the helper
                # returns None for every other tense.
                forms = conjugate_future_cond(infinitive, tid, ending)
                if forms is None:
                    if tid in ENDINGS.get(ending, {}):
                        stem = get_stem(infinitive, ending)
                        forms = [stem + e for e in ENDINGS[ending][tid]]
                    else:
                        forms = [""] * 6

                # One irregular person marks the whole tense as irregular.
                if has_irregular and _apply_irregular_forms(vid, tid, forms):
                    regularity = "irregular"

        # Empty forms (e.g. the missing "yo" imperative) are not exported.
        for pi, form in enumerate(forms):
            if form:
                verb_forms.append({
                    "verbId": vid,
                    "tenseId": tid,
                    "personIndex": pi,
                    "form": form,
                    "regularity": regularity,
                })
|
|
|
|
# Build spans referencing verb forms
print("Processing irregular spans...")

# Span fields that are copied to the output unchanged.
_SPAN_FIELDS = ("type", "pattern", "start", "end")

for span in irregular_spans:
    verb_id, tense_id, person_idx = decode_verb_form_id(span["verbFormId"])

    # Skip spans whose id cannot be fully decoded or whose verb is unknown.
    if None in (verb_id, tense_id, person_idx) or verb_id not in conjugato_verbs:
        continue

    record = {
        "verbId": verb_id,
        "tenseId": tense_id,
        "personIndex": person_idx,
    }
    for field in _SPAN_FIELDS:
        record[field] = span[field]
    spans_output.append(record)
|
|
|
|
# ─── Step 6: Output ───
print("Writing output...")

output = {
    "tenses": TENSES,
    "persons": PERSONS,
    "verbs": all_verbs,
    "verbForms": verb_forms,
    "irregularSpans": spans_output,
    "tenseGuides": guides_output,
}

# Main data file: compact JSON, non-ASCII characters written as-is.
with open(OUTPUT, 'w', encoding='utf-8') as f:
    json.dump(output, f, ensure_ascii=False)

# Also write a pretty version for debugging: counts plus a small sample.
stats = {
    "verbs": len(all_verbs),
    "verbForms": len(verb_forms),
    "irregularSpans": len(spans_output),
    "tenseGuides": len(guides_output),
}
debug_path = OUTPUT.replace('.json', '_debug.json')
with open(debug_path, 'w', encoding='utf-8') as f:
    json.dump(
        {
            "stats": stats,
            "sampleVerb": all_verbs[0] if all_verbs else None,
            "sampleForms": verb_forms[:20],
        },
        f,
        ensure_ascii=False,
        indent=2,
    )

# Size of the main output file in MiB, for the summary below.
file_size = os.path.getsize(OUTPUT) / (1024 * 1024)

print("\nDone!")
print(f" Verbs: {stats['verbs']}")
print(f" Verb forms: {stats['verbForms']}")
print(f" Irregular spans: {stats['irregularSpans']}")
print(f" Tense guides: {stats['tenseGuides']}")
print(f" Output: {OUTPUT} ({file_size:.1f} MB)")
|