Files
Spanish/Conjuga/Scripts/vocab/build_lexemes.py
T
Trey T 7da98d786c Vocab study — noun & adjective flashcards with CEFR level toggles
Add SRS-driven noun and adjective flashcards modeled on the existing verb
flashcard flow:

- SharedModels/Lexeme — catalog of non-verb vocab, frequency-ranked, with
  gender for nouns and optional example sentences. Seeded from a bundled
  vocab_lexemes.json built by Scripts/vocab/build_lexemes.py, which joins
  frequency.csv + es-en.data from a pinned doozan/spanish_data commit
  (CC-BY-SA: hermitdave/FrequencyWords + Wiktionary). 1,449 nouns and 600
  adjectives, each with Wiktionary-sourced gender and (where available)
  an example sentence with English translation.
- LexemeReviewCard + LexemeReviewStore — cloud-synced SM-2 SRS, keyed by
  partOfSpeech + lexemeId + drillMode so future drill modes can coexist.
- LexemeSessionQueue + LexemePool — parallel to VocabSessionQueue; fresh
  cards sort by frequency rank.
- LexemeStudyGroup — cloud-synced resumable session per
  (partOfSpeech, drillMode).
- NounFlashcardPracticeView + AdjectiveFlashcardPracticeView — same flow
  as VocabFlashcardPracticeView: English prompt → tap to reveal Spanish
  → Again/Hard/Good/Easy. Nouns reveal with their article (la taza, el
  problema) so gender is taught alongside meaning, not as a separate
  quiz. Example sentence shown when present.

CEFR-style level toggles:
- LexemeLevel enum (A1/A2/B1/B2/C1+) derived from frequencyRank with
  standard Spanish-frequency-dictionary cutoffs (250/500/1000/2000).
- UserProgress.selectedLexemeLevels — cloud-synced multi-select, defaults
  to A1+A2 on first launch.
- SettingsView gains a "Vocabulary Levels" section with five toggles; the
  existing "Levels" section is renamed "Verb Levels" for clarity.
- Due SRS cards always surface regardless of toggles. Disabling a level
  only stops new cards from that band entering the pool.

PracticeView gets "Nouns" and "Adjectives" rows under "Books".

DataLoader: new lexemeDataVersion gate that re-seeds the Lexeme table
from vocab_lexemes.json independently of book seeding. project.yml lists
the new JSON resource and the existing book_olly-vol2.json (which the
previous build was silently excluding because xcodegen rewrote the
project from project.yml).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 20:16:55 -05:00

251 lines
9.0 KiB
Python

#!/usr/bin/env python3
"""Build Conjuga/vocab_lexemes.json from doozan/spanish_data.

Joins doozan's frequency.csv (CC-BY-SA 3.0, OpenSubtitles via FrequencyWords)
with es-en.data (CC-BY-SA, Wiktionary) into a single bundled JSON catalog of
the highest-frequency Spanish nouns and adjectives — each row carries the
lemma, English gloss, gender (for nouns), frequency rank, and an example
sentence with translation when Wiktionary has one.

The app's DataLoader.seedLexemesFromCatalog reads this file at startup to
populate the Lexeme table that powers Noun / Adjective flashcard study.

Usage:
    python3 build_lexemes.py [--max-nouns N] [--max-adjectives N]
                             [--output PATH] [--cache-dir PATH]

Pinned doozan commit: aeac698949e7b27112056ee8d72f70f853cd1ef9 (2026-05-01)
"""
from __future__ import annotations
import argparse
import csv
import json
import sys
import urllib.request
from pathlib import Path
# Commit of doozan/spanish_data that both input files are downloaded from.
DOOZAN_COMMIT = "aeac698949e7b27112056ee8d72f70f853cd1ef9"
BASE_URL = "https://raw.githubusercontent.com/doozan/spanish_data/" + DOOZAN_COMMIT

# Local cache file name -> raw download URL, in download order.
FILES = {
    name: f"{BASE_URL}/{name}"
    for name in ("frequency.csv", "es-en.data")
}

# frequency.csv and es-en.data both label parts of speech with short codes
# (`n`, `adj`), so the join is done on those codes. The emitted JSON uses the
# long names the app's Lexeme model expects; the set of codes we join on is
# exactly the set we know how to rename.
OUTPUT_POS = {"n": "noun", "adj": "adjective"}
JOIN_POS = set(OUTPUT_POS)
def fetch(name: str, url: str, cache_dir: Path, *, timeout: float = 60.0) -> Path:
    """Return the cached copy of ``name``, downloading it from ``url`` if needed.

    A file already present in ``cache_dir`` with a non-zero size is reused
    without touching the network. Otherwise the body of ``url`` is downloaded
    to a temporary ``<name>.part`` sibling and only renamed to its final name
    once it has been written completely, so an interrupted run cannot leave a
    truncated file that a later run would mistake for a valid cache hit.

    Args:
        name: File name to use inside ``cache_dir``.
        url: Location to download from on a cache miss.
        cache_dir: Directory holding cached downloads; created if missing.
        timeout: Seconds passed to ``urllib.request.urlopen`` as its timeout.

    Returns:
        Path of the cached file.

    Raises:
        OSError: If the download or a filesystem operation fails
            (``urllib.error.URLError`` is an ``OSError`` subclass).
    """
    cache_dir.mkdir(parents=True, exist_ok=True)
    out = cache_dir / name
    if out.exists() and out.stat().st_size > 0:
        return out
    print(f" downloading {name} ({url}) ...", file=sys.stderr)
    partial = out.with_name(out.name + ".part")
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp, open(partial, "wb") as fh:
            fh.write(resp.read())
        # Publish the finished download under its real name in one step.
        partial.replace(out)
    finally:
        # After a successful rename the .part file no longer exists; on any
        # failure this removes the incomplete download.
        if partial.exists():
            partial.unlink()
    return out
def load_frequency(path: Path, *, keep_pos: set[str]) -> list[dict]:
    """Read frequency.csv and return the rows for the requested parts of speech.

    Each returned dict has ``lemma``, ``pos`` and ``rank``. ``rank`` is the
    1-based index of the data row within the whole file (rows of every part of
    speech are counted, not just the kept ones), so it reflects the source
    file's ordering.

    Rows are dropped when their ``pos`` is not in ``keep_pos``, when their
    ``flags`` column contains ``DUPLICATE`` or ``NOUSAGE``, or when the
    ``spanish`` column is empty.

    Args:
        path: Location of frequency.csv.
        keep_pos: Short POS codes to keep.

    Returns:
        Kept rows in file order.
    """
    rows: list[dict] = []
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires for file objects passed to a reader.
    with open(path, encoding="utf-8", newline="") as fh:
        for rank, row in enumerate(csv.DictReader(fh), start=1):
            pos = (row.get("pos") or "").strip()
            if pos not in keep_pos:
                continue
            flags = (row.get("flags") or "").strip()
            if "DUPLICATE" in flags or "NOUSAGE" in flags:
                continue
            lemma = (row.get("spanish") or "").strip()
            if not lemma:
                continue
            rows.append({"lemma": lemma, "pos": pos, "rank": rank})
    return rows
def load_es_en(path: Path) -> dict[tuple[str, str], dict]:
    """Index es-en.data as ``{(lemma, pos): {gender, english, exampleES, exampleEN}}``.

    The file is read line by line. A line consisting of ``_____`` closes the
    current block, and the line right after it is taken as the next lemma.
    Inside a block every ``pos:`` line opens a new sub-entry, so one lemma can
    yield several keys (e.g. an adjective reading and a noun reading). The
    pending sub-entry is stored whenever the next ``pos:`` line, the next
    ``_____`` line, or the end of the file is reached.

    Within a sub-entry only the first ``gloss:``, ``ex:`` and ``eng:`` values
    are kept, while a later ``g:`` line overwrites an earlier one. A sub-entry
    is stored only if it has a lemma, a pos and a non-empty gloss, and only if
    its ``(lemma, pos)`` key has not been stored before (first one wins).
    """
    # Line prefixes whose first occurrence per sub-entry wins, paired with
    # the output field each one fills.
    first_wins = (
        ("gloss: ", "english"),
        ("ex: ", "exampleES"),
        ("eng: ", "exampleEN"),
    )
    catalog: dict[tuple[str, str], dict] = {}
    fields = dict.fromkeys(("gender", "english", "exampleES", "exampleEN"))
    headword = None
    current_pos = None
    awaiting_headword = False

    def flush() -> None:
        """Store the pending sub-entry if complete and unseen, then clear it."""
        nonlocal current_pos
        if headword and current_pos and fields["english"]:
            catalog.setdefault((headword, current_pos), dict(fields))
        current_pos = None
        for field in fields:
            fields[field] = None

    with open(path, encoding="utf-8") as fh:
        for raw in fh:
            text = raw.rstrip("\n").lstrip()
            if text == "_____":
                # Block separator: finish the old lemma, expect a new one.
                flush()
                headword = None
                awaiting_headword = True
            elif awaiting_headword:
                headword = text
                awaiting_headword = False
            elif text.startswith("pos: "):
                # A new sub-entry for the same lemma: store the previous one
                # before its fields are overwritten.
                flush()
                current_pos = text[5:].strip()
            elif text.startswith("g: "):
                fields["gender"] = text[3:].strip()
            else:
                for prefix, field in first_wins:
                    if text.startswith(prefix):
                        if fields[field] is None:
                            fields[field] = text[len(prefix):].strip()
                        break

    # The last block has no trailing separator to trigger a store.
    flush()
    return catalog
def normalize_gender(g: str | None) -> str | None:
    """Map a raw gender code onto ``"m"``, ``"f"``, ``"m/f"`` or ``None``.

    Surrounding whitespace is ignored. ``m`` and ``f`` pass through, and the
    spellings ``mf``, ``m/f``, ``m, f`` and ``f, m`` all become ``"m/f"``.
    Anything else, including missing or empty input and plural-only codes
    such as ``mp`` / ``fp``, yields ``None``.
    """
    if not g:
        return None
    canonical = {
        "m": "m",
        "f": "f",
        "mf": "m/f",
        "m/f": "m/f",
        "m, f": "m/f",
        "f, m": "m/f",
    }
    return canonical.get(g.strip())
def build(args) -> None:
    """Build the lexeme catalog JSON described by ``args`` and write it to disk.

    Steps:
      1. Fetch (or reuse from cache) every file listed in ``FILES``.
      2. Take the first ``args.max_nouns`` nouns and the first
         ``args.max_adjectives`` adjectives from the frequency list.
      3. Join each candidate against es-en.data on ``(lemma, short pos)``.
         Candidates with no matching entry, no English gloss, or (nouns only)
         no usable gender are skipped and counted.
      4. Write the surviving rows, sorted by ``frequencyRank``, as compact
         UTF-8 JSON followed by a newline.
      5. Print a summary of written and skipped rows to stderr.

    Args:
        args: Parsed CLI namespace providing ``cache_dir``, ``max_nouns``,
            ``max_adjectives`` and ``output``.
    """
    cache = Path(args.cache_dir).expanduser()
    paths = {name: fetch(name, url, cache) for name, url in FILES.items()}

    print(
        f"Reading frequency.csv (top {args.max_nouns} nouns, "
        f"top {args.max_adjectives} adjectives) ...",
        file=sys.stderr,
    )
    rows = load_frequency(paths["frequency.csv"], keep_pos=JOIN_POS)
    # The caps apply to candidates, before the join; rows skipped below are
    # not replaced by lower-frequency ones.
    nouns = [r for r in rows if r["pos"] == "n"][: args.max_nouns]
    adjs = [r for r in rows if r["pos"] == "adj"][: args.max_adjectives]
    print(f" candidates: {len(nouns)} nouns, {len(adjs)} adjectives", file=sys.stderr)

    print("Parsing es-en.data ...", file=sys.stderr)
    es_en = load_es_en(paths["es-en.data"])
    print(f" {len(es_en)} (lemma, pos) entries", file=sys.stderr)

    out: list[dict] = []
    skipped_no_entry = 0
    skipped_no_english = 0
    skipped_no_gender = 0
    for source_rows in (nouns, adjs):
        for r in source_rows:
            short_pos = r["pos"]
            entry = es_en.get((r["lemma"], short_pos))
            if not entry:
                skipped_no_entry += 1
                continue
            english = entry.get("english")
            if not english:
                # Defensive only: load_es_en stores an entry only when it has
                # a non-empty gloss, so this is not expected to trigger.
                skipped_no_english += 1
                continue
            is_noun = short_pos == "n"
            gender = normalize_gender(entry.get("gender")) if is_noun else None
            if is_noun and gender is None:
                # Drill needs gender; if Wiktionary doesn't have it, skip.
                skipped_no_gender += 1
                continue
            out.append({
                "baseForm": r["lemma"],
                "english": english,
                "partOfSpeech": OUTPUT_POS[short_pos],
                "gender": gender,
                "frequencyRank": r["rank"],
                "exampleES": entry.get("exampleES"),
                "exampleEN": entry.get("exampleEN"),
            })
    # Nouns and adjectives were appended as two runs; interleave by rank.
    out.sort(key=lambda e: e["frequencyRank"])

    out_path = Path(args.output).expanduser()
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(out, fh, ensure_ascii=False, separators=(",", ":"))
        fh.write("\n")

    noun_count = sum(1 for e in out if e["partOfSpeech"] == "noun")
    adj_count = sum(1 for e in out if e["partOfSpeech"] == "adjective")
    # Fix: the path and the noun count used to be printed with nothing
    # between them, fusing them into one token.
    print(
        f"Wrote {out_path}: {noun_count} nouns, {adj_count} adjectives "
        f"({len(out)} total, {out_path.stat().st_size:,} bytes)",
        file=sys.stderr,
    )
    print(
        f" skipped: no es-en entry={skipped_no_entry}, "
        f"no english={skipped_no_english}, "
        f"no gender={skipped_no_gender}",
        file=sys.stderr,
    )
def main() -> None:
    """Parse the command line and run the build.

    Options: ``--max-nouns`` (default 1500), ``--max-adjectives`` (default
    600), ``--output`` (default: ``../../Conjuga/vocab_lexemes.json`` relative
    to this script's directory) and ``--cache-dir`` (default: a ``.cache``
    subdirectory next to this script, named after the first 8 characters of
    the pinned commit).
    """
    script_dir = Path(__file__).resolve().parent
    default_output = script_dir.joinpath("..", "..", "Conjuga", "vocab_lexemes.json")
    default_cache = script_dir.joinpath(".cache", DOOZAN_COMMIT[:8])

    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    for flag, cap in (("--max-nouns", 1500), ("--max-adjectives", 600)):
        cli.add_argument(flag, type=int, default=cap)
    cli.add_argument("--output", default=str(default_output))
    cli.add_argument("--cache-dir", default=str(default_cache))

    build(cli.parse_args())


if __name__ == "__main__":
    main()