#!/usr/bin/env python3
"""Build Conjuga/vocab_lexemes.json from doozan/spanish_data.

Joins doozan's frequency.csv (CC-BY-SA 3.0, OpenSubtitles via FrequencyWords)
with es-en.data (CC-BY-SA, Wiktionary) into a single bundled JSON catalog of
the highest-frequency Spanish nouns and adjectives — each row carries the
lemma, English gloss, gender (for nouns), frequency rank, and an example
sentence with translation when Wiktionary has one.

The app's DataLoader.seedLexemesFromCatalog reads this file at startup to
populate the Lexeme table that powers Noun / Adjective flashcard study.

Usage:
    python3 build_lexemes.py [--max-nouns N] [--max-adjectives N]
                             [--output PATH] [--cache-dir PATH]

Pinned doozan commit: aeac698949e7b27112056ee8d72f70f853cd1ef9 (2026-05-01)
"""

from __future__ import annotations

import argparse
import csv
import json
import shutil
import sys
import urllib.request
from pathlib import Path

DOOZAN_COMMIT = "aeac698949e7b27112056ee8d72f70f853cd1ef9"
BASE_URL = f"https://raw.githubusercontent.com/doozan/spanish_data/{DOOZAN_COMMIT}"
FILES = {
    "frequency.csv": f"{BASE_URL}/frequency.csv",
    "es-en.data": f"{BASE_URL}/es-en.data",
}

# Both frequency.csv and es-en.data use short POS codes (`n`, `adj`); we keep
# the same codes for the join. The output JSON uses the longer names the
# app's Lexeme model expects.
JOIN_POS = {"n", "adj"}
OUTPUT_POS = {"n": "noun", "adj": "adjective"}


def fetch(name: str, url: str, cache_dir: Path) -> Path:
    """Download `url` into `cache_dir / name` once; reuse it on later runs.

    A cached file is reused only when it exists and is non-empty. The
    download is streamed into a sibling `<name>.part` file and renamed into
    place only after it completes, so an interrupted run can never leave a
    truncated file that a later run would mistake for a valid cache entry.

    Returns the path of the cached file. Errors raised by `urlopen` or by
    the file system propagate to the caller.
    """
    cache_dir.mkdir(parents=True, exist_ok=True)
    out = cache_dir / name
    if out.exists() and out.stat().st_size > 0:
        return out

    print(f" downloading {name} ({url}) ...", file=sys.stderr)
    partial = out.with_name(out.name + ".part")
    try:
        with urllib.request.urlopen(url) as resp, open(partial, "wb") as fh:
            # Stream in chunks instead of buffering the whole body in memory.
            shutil.copyfileobj(resp, fh)
        partial.replace(out)
    except BaseException:
        # Also covers Ctrl-C: drop the incomplete file, then re-raise.
        partial.unlink(missing_ok=True)
        raise
    return out


def load_frequency(path: Path, *, keep_pos: set[str]) -> list[dict]:
    """Read frequency.csv → list of {lemma, pos, rank} for the POSes we care about.

    Rank is the 1-based index of the data row in the file (every row counts,
    including rows that are filtered out here), which matches
    frequency-descending order in the source file.

    Rows are dropped when their `pos` is not in `keep_pos`, when `flags`
    mentions DUPLICATE or NOUSAGE, or when the `spanish` column is empty.
    """
    rows: list[dict] = []
    # newline="" lets the csv module do its own line-ending handling.
    with open(path, encoding="utf-8", newline="") as fh:
        for rank, row in enumerate(csv.DictReader(fh), start=1):
            pos = (row.get("pos") or "").strip()
            if pos not in keep_pos:
                continue
            flags = (row.get("flags") or "").strip()
            if "DUPLICATE" in flags or "NOUSAGE" in flags:
                continue
            lemma = (row.get("spanish") or "").strip()
            if not lemma:
                continue
            rows.append({"lemma": lemma, "pos": pos, "rank": rank})
    return rows


def load_es_en(path: Path) -> dict[tuple[str, str], dict]:
    """Parse es-en.data → {(lemma, pos): {gender, english, exampleES, exampleEN}}.

    A single `_____`-delimited block can hold multiple `pos:` sub-entries for
    the same lemma (e.g. `rojo` is both an adjective ("red") and a masculine
    noun ("a red one"); `mano` has two noun senses with different genders).
    We commit each sub-entry when we see the next `pos:` line, so
    `(lemma, pos)` pairs don't get clobbered by later same-block sub-entries.
    First-sense-wins on duplicate keys, which aligns with Wiktionary listing
    the most-common meaning first.

    Sub-entries without a `gloss:` line are not recorded at all.
    """
    entries: dict[tuple[str, str], dict] = {}
    lemma = pos = gender = english = ex_es = ex_en = None
    next_is_lemma = False

    def commit_subentry() -> None:
        """Store the pending sub-entry (if complete and new), then clear it."""
        nonlocal pos, gender, english, ex_es, ex_en
        if lemma and pos and english:
            key = (lemma, pos)
            if key not in entries:
                entries[key] = {
                    "gender": gender,
                    "english": english,
                    "exampleES": ex_es,
                    "exampleEN": ex_en,
                }
        pos = gender = english = ex_es = ex_en = None

    def reset_entry() -> None:
        """Finish the current lemma block: commit what is pending, drop the lemma."""
        nonlocal lemma
        commit_subentry()
        lemma = None

    with open(path, encoding="utf-8") as fh:
        for raw in fh:
            line = raw.rstrip("\n")
            stripped = line.lstrip()
            if stripped == "_____":
                reset_entry()
                next_is_lemma = True
                continue
            if next_is_lemma:
                # The line right after the separator is the headword.
                lemma = stripped
                next_is_lemma = False
                continue
            if stripped.startswith("pos: "):
                # Starting a new sub-entry for the current lemma; commit the
                # previous sub-entry's state before resetting.
                commit_subentry()
                pos = stripped[5:].strip()
            elif stripped.startswith("g: "):
                gender = stripped[3:].strip()
            elif stripped.startswith("gloss: "):
                # Only the first gloss of a sub-entry is kept.
                if english is None:
                    english = stripped[7:].strip()
            elif stripped.startswith("ex: "):
                if ex_es is None:
                    ex_es = stripped[4:].strip()
            elif stripped.startswith("eng: "):
                # NOTE(review): the first `ex:` and the first `eng:` are
                # captured independently, so if the first example has no
                # translation the pair may come from different examples —
                # confirm against the upstream format.
                if ex_en is None:
                    ex_en = stripped[5:].strip()

    # Flush the final block (the file does not end with a separator).
    reset_entry()
    return entries


def normalize_gender(g: str | None) -> str | None:
    """Reduce Wiktionary gender codes to {m, f, m/f, None}.

    `mp` (masculine plural) / `fp` (feminine plural) are inherently-plural
    nouns (gafas, pantalones); they don't fit the singular el/la drill
    cleanly in v1, so they (like any other unrecognized code) map to None
    here and the caller skips nouns whose gender is None.
    """
    if not g:
        return None
    g = g.strip()
    if g in ("m", "f"):
        return g
    if g in ("mf", "m/f", "m, f", "f, m"):
        return "m/f"
    return None


def build(args: argparse.Namespace) -> None:
    """Fetch the source data, join it, and write the lexeme catalog JSON.

    `args` must provide `cache_dir`, `output`, `max_nouns` and
    `max_adjectives`. The top-N nouns / adjectives are chosen from
    frequency.csv *before* the join, so the output can contain fewer rows
    than requested: candidates with no es-en entry, no English gloss, or
    (nouns only) no usable gender are skipped and counted in the summary
    printed to stderr.
    """
    cache = Path(args.cache_dir).expanduser()
    paths = {name: fetch(name, url, cache) for name, url in FILES.items()}

    print(
        f"Reading frequency.csv (top {args.max_nouns} nouns, "
        f"top {args.max_adjectives} adjectives) ...",
        file=sys.stderr,
    )
    rows = load_frequency(paths["frequency.csv"], keep_pos=JOIN_POS)
    nouns = [r for r in rows if r["pos"] == "n"][: args.max_nouns]
    adjs = [r for r in rows if r["pos"] == "adj"][: args.max_adjectives]
    print(f" candidates: {len(nouns)} nouns, {len(adjs)} adjectives", file=sys.stderr)

    print("Parsing es-en.data ...", file=sys.stderr)
    es_en = load_es_en(paths["es-en.data"])
    print(f" {len(es_en)} (lemma, pos) entries", file=sys.stderr)

    out: list[dict] = []
    skipped_no_entry = 0
    skipped_no_english = 0
    skipped_no_gender = 0

    for source_rows in (nouns, adjs):
        for r in source_rows:
            short_pos = r["pos"]
            output_pos = OUTPUT_POS[short_pos]
            entry = es_en.get((r["lemma"], short_pos))
            if not entry:
                skipped_no_entry += 1
                continue
            english = entry.get("english")
            if not english:
                skipped_no_english += 1
                continue
            gender = normalize_gender(entry.get("gender")) if short_pos == "n" else None
            if short_pos == "n" and gender is None:
                # Drill needs gender; if Wiktionary doesn't have it, skip.
                skipped_no_gender += 1
                continue
            out.append({
                "baseForm": r["lemma"],
                "english": english,
                "partOfSpeech": output_pos,
                "gender": gender,
                "frequencyRank": r["rank"],
                "exampleES": entry.get("exampleES"),
                "exampleEN": entry.get("exampleEN"),
            })

    # Interleave nouns and adjectives back into global frequency order.
    out.sort(key=lambda e: e["frequencyRank"])

    out_path = Path(args.output).expanduser()
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as fh:
        # Compact separators keep the bundled file small.
        json.dump(out, fh, ensure_ascii=False, separators=(",", ":"))
        fh.write("\n")

    noun_count = sum(1 for e in out if e["partOfSpeech"] == "noun")
    adj_count = sum(1 for e in out if e["partOfSpeech"] == "adjective")
    print(
        f"Wrote {out_path} — {noun_count} nouns, {adj_count} adjectives "
        f"({len(out)} total, {out_path.stat().st_size:,} bytes)",
        file=sys.stderr,
    )
    print(
        f" skipped: no es-en entry={skipped_no_entry}, "
        f"no english={skipped_no_english}, "
        f"no gender={skipped_no_gender}",
        file=sys.stderr,
    )


def main() -> None:
    """Parse command-line options and run the build.

    Defaults are resolved relative to this script's directory: the output
    goes to `../../Conjuga/vocab_lexemes.json` and downloads are cached in
    `.cache/<first 8 chars of the pinned commit>`.
    """
    here = Path(__file__).resolve().parent
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("--max-nouns", type=int, default=1500)
    parser.add_argument("--max-adjectives", type=int, default=600)
    parser.add_argument(
        "--output",
        default=str(here / ".." / ".." / "Conjuga" / "vocab_lexemes.json"),
    )
    parser.add_argument(
        "--cache-dir",
        default=str(here / ".cache" / DOOZAN_COMMIT[:8]),
    )
    build(parser.parse_args())


if __name__ == "__main__":
    main()