# Spanish/Conjuga/Scripts/vocab/build_lexemes.py

#!/usr/bin/env python3
"""Build Conjuga/vocab_lexemes.json from doozan/spanish_data.

Joins doozan's frequency.csv (CC-BY-SA 3.0, OpenSubtitles via FrequencyWords)
with es-en.data (CC-BY-SA, Wiktionary) into a single bundled JSON catalog of
the highest-frequency Spanish nouns and adjectives — each row carries the
lemma, English gloss, gender (for nouns), frequency rank, and an example
sentence with translation when Wiktionary has one.

The app's DataLoader.seedLexemesFromCatalog reads this file at startup to
populate the Lexeme table that powers Noun / Adjective flashcard study.

Usage:
    python3 build_lexemes.py [--max-nouns N] [--max-adjectives N]
                             [--output PATH] [--cache-dir PATH]

Pinned doozan commit:  aeac698949e7b27112056ee8d72f70f853cd1ef9  (2026-05-01)
"""

from __future__ import annotations

import argparse
import csv
import json
import sys
import urllib.request
from pathlib import Path

# Commit of doozan/spanish_data that every download URL below is pinned to
# (the same hash as the "Pinned doozan commit" line in the module docstring).
DOOZAN_COMMIT = "aeac698949e7b27112056ee8d72f70f853cd1ef9"
# Raw-content URL prefix for files at that commit.
BASE_URL = f"https://raw.githubusercontent.com/doozan/spanish_data/{DOOZAN_COMMIT}"

# Source files the build needs: file name used in the local cache → URL.
FILES = {
    "frequency.csv": f"{BASE_URL}/frequency.csv",
    "es-en.data":    f"{BASE_URL}/es-en.data",
}

# Both frequency.csv and es-en.data use short POS codes (`n`, `adj`); we keep
# the same codes for the join. The output JSON uses the longer names the
# app's Lexeme model expects.
JOIN_POS = {"n", "adj"}
# Short POS code → part-of-speech name written to the output JSON.
OUTPUT_POS = {"n": "noun", "adj": "adjective"}


def fetch(name: str, url: str, cache_dir: Path) -> Path:
    """Download ``url`` into ``cache_dir / name`` once; reuse it afterwards.

    A non-empty file already present at the destination counts as a cache
    hit and is returned without touching the network. Otherwise the response
    is streamed into a sibling ``<name>.part`` file and renamed onto the
    final path only after the transfer has completed, so an interrupted
    download can never be mistaken for a valid cached copy on the next run.

    Args:
        name: File name to use inside the cache directory.
        url: Location to download from.
        cache_dir: Directory holding cached downloads; created if missing.

    Returns:
        Path of the cached file.

    Raises:
        Whatever ``urllib.request.urlopen`` or the file operations raise
        (e.g. ``urllib.error.URLError``, ``OSError``). Any partial file is
        removed before the exception propagates.
    """
    import shutil  # function-scope import: nothing else in the file needs it

    cache_dir.mkdir(parents=True, exist_ok=True)
    out = cache_dir / name
    if out.exists() and out.stat().st_size > 0:
        return out
    print(f"  downloading {name} ({url}) ...", file=sys.stderr)
    partial = out.with_name(out.name + ".part")
    try:
        with urllib.request.urlopen(url) as resp, open(partial, "wb") as fh:
            # Stream in chunks instead of buffering the whole body in memory.
            shutil.copyfileobj(resp, fh)
        # Publish the file only once it is complete.
        partial.replace(out)
    except BaseException:
        # Also covers Ctrl-C mid-transfer; never leave a half-written file.
        try:
            partial.unlink()
        except FileNotFoundError:
            pass
        raise
    return out


def load_frequency(path: Path, *, keep_pos: set[str]) -> list[dict]:
    """Read frequency.csv and return the rows for the requested POS codes.

    The file is read as a CSV with a header row; the ``pos``, ``flags`` and
    ``spanish`` columns are used. A row is skipped when its POS is not in
    ``keep_pos``, when its flags contain ``DUPLICATE`` or ``NOUSAGE``, or
    when the lemma is empty after stripping whitespace.

    Args:
        path: Location of the CSV file (UTF-8).
        keep_pos: POS codes to keep (compared after stripping whitespace).

    Returns:
        A list of ``{"lemma", "pos", "rank"}`` dicts in file order. ``rank``
        is the 1-based index of the data row counted over *all* rows, so
        skipped rows still consume a rank; this matches frequency-descending
        order in the source file.
    """
    rows: list[dict] = []
    # The csv module asks for newline="" so it can do its own newline
    # handling (otherwise newlines inside quoted fields are misread).
    with open(path, encoding="utf-8", newline="") as fh:
        for rank, row in enumerate(csv.DictReader(fh), start=1):
            # `or ""` guards against short rows, where DictReader fills
            # missing columns with None.
            pos = (row.get("pos") or "").strip()
            if pos not in keep_pos:
                continue
            flags = (row.get("flags") or "").strip()
            if "DUPLICATE" in flags or "NOUSAGE" in flags:
                continue
            lemma = (row.get("spanish") or "").strip()
            if not lemma:
                continue
            rows.append({"lemma": lemma, "pos": pos, "rank": rank})
    return rows


def load_es_en(path: Path) -> dict[tuple[str, str], dict]:
    """Parse es-en.data into a ``(lemma, pos)`` → details mapping.

    Each value is ``{"gender", "english", "exampleES", "exampleEN"}``.

    The file is scanned line by line (leading whitespace ignored):

    * ``_____`` closes the current block; the line right after it is the
      lemma of the next block.
    * ``pos: X`` opens a new sub-entry for the current lemma. One block can
      contain several of them (e.g. `rojo` as adjective and as noun), so the
      sub-entry collected so far is stored before the new one starts.
    * ``g: X`` sets the gender (a later ``g:`` line in the same sub-entry
      overwrites an earlier one).
    * ``gloss: ``, ``ex: `` and ``eng: `` fill english / exampleES /
      exampleEN; for each of them only the first occurrence in a sub-entry
      is kept, and the three are captured independently of one another.

    A sub-entry is stored only if it has a lemma, a POS and a non-empty
    gloss. When the same ``(lemma, pos)`` shows up again the first one wins,
    which aligns with Wiktionary listing the most-common meaning first.
    """
    catalog: dict[tuple[str, str], dict] = {}
    # Line prefix → field that keeps only the first value seen per sub-entry.
    keep_first = {"gloss: ": "english", "ex: ": "exampleES", "eng: ": "exampleEN"}

    def blank() -> dict:
        return {"gender": None, "english": None, "exampleES": None, "exampleEN": None}

    def store(headword, part, fields) -> None:
        # Incomplete sub-entries are dropped; earlier duplicates are kept.
        if headword and part and fields["english"]:
            catalog.setdefault((headword, part), dict(fields))

    headword = None
    part = None
    fields = blank()
    expect_headword = False

    with open(path, encoding="utf-8") as fh:
        for raw in fh:
            text = raw.rstrip("\n").lstrip()
            if text == "_____":
                # Block separator: flush what we have and start from scratch.
                store(headword, part, fields)
                headword, part, fields = None, None, blank()
                expect_headword = True
            elif expect_headword:
                headword = text
                expect_headword = False
            elif text.startswith("pos: "):
                # New sub-entry for the same lemma: flush the previous one.
                store(headword, part, fields)
                part = text[5:].strip()
                fields = blank()
            elif text.startswith("g: "):
                fields["gender"] = text[3:].strip()
            else:
                for prefix, slot in keep_first.items():
                    if text.startswith(prefix):
                        if fields[slot] is None:
                            fields[slot] = text[len(prefix):].strip()
                        break

    # The last sub-entry has no trailing separator to trigger a flush.
    store(headword, part, fields)
    return catalog


def normalize_gender(g: str | None) -> str | None:
    """Map a Wiktionary gender code onto ``"m"``, ``"f"``, ``"m/f"`` or None.

    Surrounding whitespace is ignored. Any code outside the table below
    yields None; that includes `mp` / `fp` (masculine / feminine plural),
    the inherently-plural nouns such as gafas or pantalones, which don't fit
    the singular el/la drill cleanly in v1. The caller (`build`) skips nouns
    whose gender comes back as None.
    """
    if not g:
        return None
    canonical = {
        "m": "m",
        "f": "f",
        "mf": "m/f",
        "m/f": "m/f",
        "m, f": "m/f",
        "f, m": "m/f",
    }
    return canonical.get(g.strip())


def build(args) -> None:
    """Fetch the source files, join them and write the lexeme catalog JSON.

    Reads ``args.cache_dir``, ``args.max_nouns``, ``args.max_adjectives`` and
    ``args.output``. The noun and adjective candidates are cut to their
    limits *before* the dictionary join, so the written catalog can hold
    fewer rows than the limits. A candidate is skipped when es-en.data has
    no matching ``(lemma, pos)`` entry, when that entry has no English
    gloss, or (nouns only) when its gender does not normalize to a usable
    value. Progress and a summary are printed to stderr.
    """
    cache_root = Path(args.cache_dir).expanduser()
    paths = {}
    for name, url in FILES.items():
        paths[name] = fetch(name, url, cache_root)

    print(
        f"Reading frequency.csv (top {args.max_nouns} nouns, "
        f"top {args.max_adjectives} adjectives) ...",
        file=sys.stderr,
    )
    frequency_rows = load_frequency(paths["frequency.csv"], keep_pos=JOIN_POS)
    noun_rows = [row for row in frequency_rows if row["pos"] == "n"]
    noun_rows = noun_rows[: args.max_nouns]
    adj_rows = [row for row in frequency_rows if row["pos"] == "adj"]
    adj_rows = adj_rows[: args.max_adjectives]
    print(
        f"  candidates: {len(noun_rows)} nouns, {len(adj_rows)} adjectives",
        file=sys.stderr,
    )

    print("Parsing es-en.data ...", file=sys.stderr)
    dictionary = load_es_en(paths["es-en.data"])
    print(f"  {len(dictionary)} (lemma, pos) entries", file=sys.stderr)

    catalog: list[dict] = []
    skipped = {"entry": 0, "english": 0, "gender": 0}
    for row in noun_rows + adj_rows:
        pos_code = row["pos"]
        match = dictionary.get((row["lemma"], pos_code))
        if not match:
            skipped["entry"] += 1
            continue
        gloss = match.get("english")
        if not gloss:
            skipped["english"] += 1
            continue
        gender = None
        if pos_code == "n":
            gender = normalize_gender(match.get("gender"))
            if gender is None:
                # Drill needs gender; if Wiktionary doesn't have it, skip.
                skipped["gender"] += 1
                continue
        catalog.append({
            "baseForm":      row["lemma"],
            "english":       gloss,
            "partOfSpeech":  OUTPUT_POS[pos_code],
            "gender":        gender,
            "frequencyRank": row["rank"],
            "exampleES":     match.get("exampleES"),
            "exampleEN":     match.get("exampleEN"),
        })

    # Interleave nouns and adjectives by overall frequency rank.
    catalog = sorted(catalog, key=lambda item: item["frequencyRank"])

    out_path = Path(args.output).expanduser()
    out_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(catalog, ensure_ascii=False, separators=(",", ":"))
    with open(out_path, "w", encoding="utf-8") as fh:
        fh.write(payload)
        fh.write("\n")

    noun_count = len([item for item in catalog if item["partOfSpeech"] == "noun"])
    adj_count = len([item for item in catalog if item["partOfSpeech"] == "adjective"])
    size_bytes = out_path.stat().st_size
    print(
        f"Wrote {out_path}  —  {noun_count} nouns, {adj_count} adjectives "
        f"({len(catalog)} total, {size_bytes:,} bytes)",
        file=sys.stderr,
    )
    print(
        f"  skipped: no es-en entry={skipped['entry']}, "
        f"no english={skipped['english']}, "
        f"no gender={skipped['gender']}",
        file=sys.stderr,
    )


def main() -> None:
    """Parse the command-line options and run the build.

    Defaults: 1500 nouns, 600 adjectives, output at
    ``../../Conjuga/vocab_lexemes.json`` relative to this script, and a
    download cache under ``.cache/<first 8 chars of the pinned commit>``
    next to this script.
    """
    script_dir = Path(__file__).resolve().parent
    default_output = script_dir.joinpath("..", "..", "Conjuga", "vocab_lexemes.json")
    default_cache = script_dir.joinpath(".cache", DOOZAN_COMMIT[:8])

    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument("--max-nouns", type=int, default=1500)
    cli.add_argument("--max-adjectives", type=int, default=600)
    cli.add_argument("--output", default=str(default_output))
    cli.add_argument("--cache-dir", default=str(default_cache))

    options = cli.parse_args()
    build(options)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()