Vocab study — noun & adjective flashcards with CEFR level toggles
Add SRS-driven noun and adjective flashcards modeled on the existing verb flashcard flow:

- SharedModels/Lexeme — catalog of non-verb vocab, frequency-ranked, with gender for nouns and optional example sentences. Seeded from a bundled vocab_lexemes.json built by Scripts/vocab/build_lexemes.py, which joins frequency.csv + es-en.data from a pinned doozan/spanish_data commit (CC-BY-SA: hermitdave/FrequencyWords + Wiktionary). 1,449 nouns (each with Wiktionary-sourced gender) and 600 adjectives, each with an example sentence and English translation where available.
- LexemeReviewCard + LexemeReviewStore — cloud-synced SM-2 SRS, keyed by partOfSpeech + lexemeId + drillMode so future drill modes can coexist.
- LexemeSessionQueue + LexemePool — parallel to VocabSessionQueue; fresh cards sort by frequency rank.
- LexemeStudyGroup — cloud-synced resumable session per (partOfSpeech, drillMode).
- NounFlashcardPracticeView + AdjectiveFlashcardPracticeView — same flow as VocabFlashcardPracticeView: English prompt → tap to reveal Spanish → Again/Hard/Good/Easy. Nouns reveal with their article (la taza, el problema) so gender is taught alongside meaning, not as a separate quiz. Example sentence shown when present.

CEFR-style level toggles:

- LexemeLevel enum (A1/A2/B1/B2/C1+) derived from frequencyRank with standard Spanish-frequency-dictionary cutoffs (250/500/1000/2000).
- UserProgress.selectedLexemeLevels — cloud-synced multi-select, defaults to A1+A2 on first launch.
- SettingsView gains a "Vocabulary Levels" section with five toggles; the existing "Levels" section is renamed "Verb Levels" for clarity.
- Due SRS cards always surface regardless of toggles. Disabling a level only stops new cards from that band entering the pool.

PracticeView gets "Nouns" and "Adjectives" rows under "Books".

DataLoader: new lexemeDataVersion gate that re-seeds the Lexeme table from vocab_lexemes.json independent of book seeding.
project.yml lists the new JSON resource and the existing book_olly-vol2.json (which the previous build was silently excluding because xcodegen rewrote the project from project.yml). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,250 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build Conjuga/vocab_lexemes.json from doozan/spanish_data.
|
||||
|
||||
Joins doozan's frequency.csv (CC-BY-SA 3.0, OpenSubtitles via FrequencyWords)
|
||||
with es-en.data (CC-BY-SA, Wiktionary) into a single bundled JSON catalog of
|
||||
the highest-frequency Spanish nouns and adjectives — each row carries the
|
||||
lemma, English gloss, gender (for nouns), frequency rank, and an example
|
||||
sentence with translation when Wiktionary has one.
|
||||
|
||||
The app's DataLoader.seedLexemesFromCatalog reads this file at startup to
|
||||
populate the Lexeme table that powers Noun / Adjective flashcard study.
|
||||
|
||||
Usage:
|
||||
python3 build_lexemes.py [--max-nouns N] [--max-adjectives N]
|
||||
[--output PATH] [--cache-dir PATH]
|
||||
|
||||
Pinned doozan commit: aeac698949e7b27112056ee8d72f70f853cd1ef9 (2026-05-01)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
# Upstream data is pinned to a single doozan/spanish_data commit so that
# rebuilding the catalog always sees the same inputs.
DOOZAN_COMMIT = "aeac698949e7b27112056ee8d72f70f853cd1ef9"
BASE_URL = "https://raw.githubusercontent.com/doozan/spanish_data/" + DOOZAN_COMMIT

# Cache file name -> raw download URL for each upstream input.
FILES = {name: f"{BASE_URL}/{name}" for name in ("frequency.csv", "es-en.data")}

# frequency.csv and es-en.data both label rows with short POS codes (`n`,
# `adj`), so the join is performed on those codes. The emitted JSON carries
# the longer names the app's Lexeme model expects.
OUTPUT_POS = {"n": "noun", "adj": "adjective"}
JOIN_POS = set(OUTPUT_POS)
|
||||
|
||||
|
||||
def fetch(name: str, url: str, cache_dir: Path) -> Path:
    """Return the cached copy of an upstream file, downloading it if needed.

    Args:
        name: File name to store the download under inside ``cache_dir``.
        url: Location to download from when no usable cached copy exists.
        cache_dir: Directory holding cached downloads; created if missing.

    Returns:
        Path of the cached file (``cache_dir / name``).

    Raises:
        OSError: The download or the write to disk failed (``urllib`` errors
            such as ``URLError`` are ``OSError`` subclasses).
    """
    cache_dir.mkdir(parents=True, exist_ok=True)
    out = cache_dir / name
    # Any non-empty file at the final path counts as a finished download.
    if out.exists() and out.stat().st_size > 0:
        return out
    print(f" downloading {name} ({url}) ...", file=sys.stderr)
    # Stream into a sibling ".part" file and rename only once the transfer
    # has completed. Writing straight to `out` would let an interrupted run
    # leave a truncated, non-empty file that the cache check above would
    # accept as valid on every later run.
    partial = out.with_name(out.name + ".part")
    try:
        # The timeout bounds each blocking socket operation so a stalled
        # connection cannot hang the build forever.
        with urllib.request.urlopen(url, timeout=60) as resp, open(partial, "wb") as fh:
            for chunk in iter(lambda: resp.read(1 << 16), b""):
                fh.write(chunk)
        partial.replace(out)
    except BaseException:
        # Covers Ctrl-C as well: never leave a half-written temp file behind.
        if partial.exists():
            partial.unlink()
        raise
    return out
|
||||
|
||||
|
||||
def load_frequency(path: Path, *, keep_pos: set[str]) -> list[dict]:
    """Read frequency.csv and keep the usable rows for the requested POS codes.

    Rows are read by header name (``pos``, ``flags``, ``spanish``). A row is
    dropped when its POS is not in ``keep_pos``, when its flags mention
    ``DUPLICATE`` or ``NOUSAGE``, or when it has no lemma.

    Args:
        path: Location of the CSV file.
        keep_pos: Short POS codes to retain (compared after stripping
            whitespace).

    Returns:
        ``{"lemma", "pos", "rank"}`` dicts in file order. ``rank`` is the
        1-based position of the data row in the file, counted across every
        POS (not only the kept ones); the file is expected to be sorted by
        descending frequency.
    """
    rows: list[dict] = []
    # newline="" is what the csv module documents for files handed to a
    # reader; without it, newlines inside quoted fields are mis-parsed.
    with open(path, encoding="utf-8", newline="") as fh:
        for rank, row in enumerate(csv.DictReader(fh), start=1):
            pos = (row.get("pos") or "").strip()
            if pos not in keep_pos:
                continue
            flags = (row.get("flags") or "").strip()
            if "DUPLICATE" in flags or "NOUSAGE" in flags:
                continue
            lemma = (row.get("spanish") or "").strip()
            if not lemma:
                continue
            rows.append({"lemma": lemma, "pos": pos, "rank": rank})
    return rows
|
||||
|
||||
|
||||
def load_es_en(path: Path) -> dict[tuple[str, str], dict]:
    """Parse es-en.data into ``{(lemma, pos): {gender, english, exampleES, exampleEN}}``.

    The file is read line by line with leading whitespace ignored:

    * a ``_____`` line closes the current block; the line after it is taken
      as the lemma of the next block;
    * each ``pos: `` line starts a new sub-entry for that lemma, so one block
      can contribute several ``(lemma, pos)`` keys (for instance a word that
      is both an adjective and a noun);
    * within a sub-entry the last ``g: `` line wins, while only the first
      ``gloss: ``, ``ex: `` and ``eng: `` lines are kept.

    A sub-entry is stored only when it has a lemma, a POS and a gloss, and
    only if its key has not been stored already, so the first sense listed
    for a ``(lemma, pos)`` pair is the one that survives.

    Args:
        path: Location of the es-en.data file.

    Returns:
        Mapping from ``(lemma, pos)`` to the collected fields; fields that
        never appeared in the sub-entry are ``None``.
    """
    catalog: dict[tuple[str, str], dict] = {}
    headword = None
    sense: dict[str, str] = {}  # fields of the sub-entry being accumulated
    awaiting_headword = False

    # Line prefix -> field it fills; first occurrence per sub-entry is kept.
    # NOTE(review): `ex: ` and `eng: ` are captured independently, so if the
    # first example has no translation the stored pair could come from two
    # different sentences — worth confirming against the upstream format.
    first_wins = (
        ("gloss: ", "english"),
        ("ex: ", "exampleES"),
        ("eng: ", "exampleEN"),
    )

    def flush_sense() -> None:
        """Store the accumulated sub-entry if complete, then start afresh."""
        nonlocal sense
        tag = sense.get("pos")
        gloss = sense.get("english")
        if headword and tag and gloss:
            # setdefault keeps whichever sense reached this key first.
            catalog.setdefault(
                (headword, tag),
                {
                    "gender": sense.get("gender"),
                    "english": gloss,
                    "exampleES": sense.get("exampleES"),
                    "exampleEN": sense.get("exampleEN"),
                },
            )
        sense = {}

    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.rstrip("\n").lstrip()
            if text == "_____":
                flush_sense()
                headword = None
                awaiting_headword = True
            elif awaiting_headword:
                headword = text
                awaiting_headword = False
            elif text.startswith("pos: "):
                # A new sub-entry begins: bank the previous one first.
                flush_sense()
                sense["pos"] = text[len("pos: "):].strip()
            elif text.startswith("g: "):
                sense["gender"] = text[len("g: "):].strip()
            else:
                for prefix, field in first_wins:
                    if text.startswith(prefix):
                        sense.setdefault(field, text[len(prefix):].strip())
                        break

    # The file does not have to end with a delimiter; bank the final sub-entry.
    flush_sense()
    return catalog
|
||||
|
||||
|
||||
def normalize_gender(g: str | None) -> str | None:
    """Map a raw gender code onto ``"m"``, ``"f"``, ``"m/f"`` or ``None``.

    Surrounding whitespace is ignored. ``m`` and ``f`` pass through, and the
    spellings ``mf``, ``m/f``, ``m, f`` and ``f, m`` collapse to ``"m/f"``.
    Every other value — a missing or empty code, or markers such as the
    plural-only ``mp`` / ``fp`` — yields ``None``; ``build()`` skips nouns
    whose gender normalizes to ``None``, which keeps those out of the
    singular el/la drill.
    """
    canonical = {
        "m": "m",
        "f": "f",
        "mf": "m/f",
        "m/f": "m/f",
        "m, f": "m/f",
        "f, m": "m/f",
    }
    return canonical.get(g.strip()) if g else None
|
||||
|
||||
|
||||
def build(args) -> None:
    """Fetch the upstream files, join them, and write the lexeme catalog.

    Steps, in order:

    1. download (or reuse from cache) every file listed in ``FILES``;
    2. take the first ``args.max_nouns`` noun rows and the first
       ``args.max_adjectives`` adjective rows from frequency.csv;
    3. look each candidate up in es-en.data, skipping candidates with no
       entry, no English gloss, or (nouns only) no usable gender;
    4. write the surviving rows, sorted by frequency rank, as compact JSON
       to ``args.output`` and report counts on stderr.

    Args:
        args: Parsed command-line namespace providing ``cache_dir``,
            ``max_nouns``, ``max_adjectives`` and ``output``.
    """
    cache_root = Path(args.cache_dir).expanduser()
    local: dict = {}
    for filename, source_url in FILES.items():
        local[filename] = fetch(filename, source_url, cache_root)

    banner = (
        f"Reading frequency.csv (top {args.max_nouns} nouns, "
        f"top {args.max_adjectives} adjectives) ..."
    )
    print(banner, file=sys.stderr)
    ranked = load_frequency(local["frequency.csv"], keep_pos=JOIN_POS)
    # The caps apply to candidates, before the dictionary join filters
    # some of them out, so the final counts can be lower than the caps.
    nouns = [row for row in ranked if row["pos"] == "n"][: args.max_nouns]
    adjs = [row for row in ranked if row["pos"] == "adj"][: args.max_adjectives]
    print(f" candidates: {len(nouns)} nouns, {len(adjs)} adjectives", file=sys.stderr)

    print("Parsing es-en.data ...", file=sys.stderr)
    dictionary = load_es_en(local["es-en.data"])
    print(f" {len(dictionary)} (lemma, pos) entries", file=sys.stderr)

    lexemes: list[dict] = []
    missing = {"entry": 0, "english": 0, "gender": 0}
    for candidate in nouns + adjs:
        code = candidate["pos"]
        long_pos = OUTPUT_POS[code]
        hit = dictionary.get((candidate["lemma"], code))
        if not hit:
            missing["entry"] += 1
            continue
        gloss = hit.get("english")
        if not gloss:
            missing["english"] += 1
            continue
        gender = None
        if code == "n":
            # The noun drill reveals the article, so a noun without a
            # usable gender cannot be shown and is left out.
            gender = normalize_gender(hit.get("gender"))
            if gender is None:
                missing["gender"] += 1
                continue
        lexemes.append(
            {
                "baseForm": candidate["lemma"],
                "english": gloss,
                "partOfSpeech": long_pos,
                "gender": gender,
                "frequencyRank": candidate["rank"],
                "exampleES": hit.get("exampleES"),
                "exampleEN": hit.get("exampleEN"),
            }
        )

    # Nouns and adjectives were appended in two runs; interleave them by rank.
    lexemes.sort(key=lambda item: item["frequencyRank"])

    destination = Path(args.output).expanduser()
    destination.parent.mkdir(parents=True, exist_ok=True)
    with open(destination, "w", encoding="utf-8") as sink:
        json.dump(lexemes, sink, ensure_ascii=False, separators=(",", ":"))
        sink.write("\n")

    noun_total = len([item for item in lexemes if item["partOfSpeech"] == "noun"])
    adj_total = len([item for item in lexemes if item["partOfSpeech"] == "adjective"])
    summary = (
        f"Wrote {destination} — {noun_total} nouns, {adj_total} adjectives "
        f"({len(lexemes)} total, {destination.stat().st_size:,} bytes)"
    )
    print(summary, file=sys.stderr)
    skipped = (
        f" skipped: no es-en entry={missing['entry']}, "
        f"no english={missing['english']}, "
        f"no gender={missing['gender']}"
    )
    print(skipped, file=sys.stderr)
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and run the build.

    Options: ``--max-nouns`` (default 1500), ``--max-adjectives`` (default
    600), ``--output`` (default ``../../Conjuga/vocab_lexemes.json`` relative
    to this script's directory) and ``--cache-dir`` (default a ``.cache``
    subdirectory next to this script, named after the first eight characters
    of the pinned commit). The module docstring doubles as the help text.
    """
    script_dir = Path(__file__).resolve().parent
    default_output = script_dir / ".." / ".." / "Conjuga" / "vocab_lexemes.json"
    default_cache = script_dir / ".cache" / DOOZAN_COMMIT[:8]

    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    for flag, cap in (("--max-nouns", 1500), ("--max-adjectives", 600)):
        cli.add_argument(flag, type=int, default=cap)
    for flag, location in (("--output", default_output), ("--cache-dir", default_cache)):
        cli.add_argument(flag, default=str(location))

    build(cli.parse_args())
|
||||
|
||||
|
||||
# Run the build only when this file is executed as a script, so importing the
# module (e.g. to reuse its helpers) has no side effects.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user