7da98d786c
Add SRS-driven noun and adjective flashcards modeled on the existing verb flashcard flow: - SharedModels/Lexeme — catalog of non-verb vocab, frequency-ranked, with gender for nouns and optional example sentences. Seeded from a bundled vocab_lexemes.json built by Scripts/vocab/build_lexemes.py, which joins frequency.csv + es-en.data from a pinned doozan/spanish_data commit (CC-BY-SA: hermitdave/FrequencyWords + Wiktionary). 1,449 nouns and 600 adjectives, each with Wiktionary-sourced gender and (where available) an example sentence with English translation. - LexemeReviewCard + LexemeReviewStore — cloud-synced SM-2 SRS, keyed by partOfSpeech + lexemeId + drillMode so future drill modes can coexist. - LexemeSessionQueue + LexemePool — parallel to VocabSessionQueue; fresh cards sort by frequency rank. - LexemeStudyGroup — cloud-synced resumable session per (partOfSpeech, drillMode). - NounFlashcardPracticeView + AdjectiveFlashcardPracticeView — same flow as VocabFlashcardPracticeView: English prompt → tap to reveal Spanish → Again/Hard/Good/Easy. Nouns reveal with their article (la taza, el problema) so gender is taught alongside meaning, not as a separate quiz. Example sentence shown when present. CEFR-style level toggles: - LexemeLevel enum (A1/A2/B1/B2/C1+) derived from frequencyRank with standard Spanish-frequency-dictionary cutoffs (250/500/1000/2000). - UserProgress.selectedLexemeLevels — cloud-synced multi-select, defaults to A1+A2 on first launch. - SettingsView gains a "Vocabulary Levels" section with five toggles; the existing "Levels" section is renamed "Verb Levels" for clarity. - Due SRS cards always surface regardless of toggles. Disabling a level only stops new cards from that band entering the pool. PracticeView gets "Nouns" and "Adjectives" rows under "Books". DataLoader: new lexemeDataVersion gate that re-seeds the Lexeme table from vocab_lexemes.json independent of book seeding. 
project.yml lists the new JSON resource and the existing book_olly-vol2.json (which the previous build was silently excluding because xcodegen rewrote the project from project.yml). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
206 lines
7.7 KiB
Python
206 lines
7.7 KiB
Python
#!/usr/bin/env python3
|
|
"""Phase 2b — build a per-book glossary job manifest.
|
|
|
|
Scans chapters.json, tokenizes every Spanish paragraph the SAME way the iOS app
|
|
does (whitespace split, lowercase, strip leading/trailing punctuation), collects
|
|
the distinct words with a few example sentences each, and writes batched
|
|
glossary jobs that Claude Code subagents can translate in parallel. Resumable:
|
|
jobs whose output file already exists are skipped.
|
|
|
|
Usage:
|
|
python3 build_glossary.py <slug> [--batch-size N] [--max-examples N]
|
|
[--build BUILD_DIR]
|
|
|
|
Inputs:
|
|
BUILD_DIR/<slug>/chapters.json (from extract_epub.py)
|
|
|
|
Outputs:
|
|
BUILD_DIR/<slug>/glossary/<jobid>.input.json (one per batch — read by subagents)
|
|
BUILD_DIR/<slug>/glossary/_pending.txt (job IDs still missing output)
|
|
BUILD_DIR/<slug>/glossary/_prompt_template.md (prompt for each subagent)
|
|
|
|
Job input shape (.input.json):
|
|
{"jobId": "gloss_b00",
|
|
"words": [{"word": "taza", "examples": ["...", "..."]}, ...]}
|
|
|
|
Subagents must write <jobid>.output.json with shape:
  {"jobId": "gloss_b00",
   "entries": [{"word": "taza", "baseForm": "taza",
                "english": "cup", "partOfSpeech": "noun",
                "gender": "f"}, ...]}
  (`gender` is written for nouns only -- "m", "f" or "m/f" -- and is omitted
  or null otherwise.)
|
|
|
|
`entries` must contain exactly one object per input word.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import unicodedata
|
|
from pathlib import Path
|
|
|
|
|
|
PROMPT_TEMPLATE = """\
|
|
You are building a Spanish->English glossary for a language-learning app.
|
|
|
|
Input file: {input_path}
|
|
Output file: {output_path}
|
|
|
|
Read the input file. It contains a JSON object with a `words` array; each item
|
|
has a `word` (a lowercase Spanish word exactly as it appears in a book) and
|
|
`examples` (sentences from the book that use that word).
|
|
|
|
For EACH word, produce one entry:
|
|
- baseForm: the dictionary base form -- infinitive for verbs, masculine
|
|
singular for nouns/adjectives, the word itself for invariant words.
|
|
- english: a concise English translation (1-4 words). Use the sense the word
|
|
carries in the example sentences. Many Spanish words are both a verb form
|
|
AND a function word -- e.g. "como" is "I eat" (verb) and "as/like"
|
|
(conjunction). Choose the meaning shown in the examples, not the most common
|
|
dictionary sense.
|
|
- partOfSpeech: one of verb, noun, adjective, adverb, pronoun, preposition,
|
|
conjunction, article, interjection, numeral, proper noun, other.
|
|
- gender: ONLY for `partOfSpeech == "noun"`. "m" for masculine, "f" for
|
|
feminine, "m/f" for nouns that take either article (estudiante, artista).
|
|
OMIT the field entirely (or use null) for non-nouns and for cases where the
|
|
gender is genuinely unknowable from context. Don't guess for non-nouns.
|
|
|
|
Write the output file as JSON with this exact shape:
|
|
{{"jobId": "<the jobId from the input>", "entries": [
|
|
{{"word": "...", "baseForm": "...", "english": "...",
|
|
"partOfSpeech": "...", "gender": "m"}}
|
|
]}}
|
|
|
|
`entries` MUST contain exactly one object per input word, cover every word, and
|
|
echo each `word` back verbatim. Write nothing else to disk and produce no other
|
|
output.
|
|
"""
|
|
|
|
# Sentence boundary: a run of whitespace that directly follows ".", "!", "?"
# or "…". The lookbehind is zero-width, so the terminator stays attached to the
# sentence it ends and only the whitespace is consumed by the split.
SENTENCE_SPLIT = re.compile(r"(?<=[.!?…])\s+")
|
|
|
|
|
|
def is_punct(ch: str) -> bool:
    """Report whether ``ch`` is a Unicode punctuation character.

    Every general category in the "P*" family (Pc, Pd, Ps, Pe, Pi, Pf, Po)
    counts; this is intended to line up with Swift's ``.punctuationCharacters``.
    """
    general_category = unicodedata.category(ch)
    return general_category.startswith("P")
|
|
|
|
|
|
def clean_word(token: str) -> str:
    """Normalize one whitespace-delimited token into a glossary key.

    Follows the same steps as BookReaderView.cleanWord in the app: lowercase
    the token, drop punctuation from both ends, then trim whitespace.
    Punctuation inside the word and accents are left untouched.
    """
    lowered = token.lower()
    # First index that is not punctuation; len(lowered) when every char is.
    head = next(
        (i for i, ch in enumerate(lowered) if not is_punct(ch)),
        len(lowered),
    )
    # Walk the tail back over trailing punctuation, never crossing `head`.
    tail = len(lowered)
    while tail > head and is_punct(lowered[tail - 1]):
        tail -= 1
    return lowered[head:tail].strip()
|
|
|
|
|
|
def has_letter(s: str) -> bool:
    """Return True when ``s`` contains at least one alphabetic character."""
    for ch in s:
        if ch.isalpha():
            return True
    return False
|
|
|
|
|
|
def split_sentences(paragraph: str) -> list[str]:
    """Break a paragraph into trimmed, non-empty sentences.

    The paragraph is trimmed first and then cut wherever SENTENCE_SPLIT
    matches; pieces that are empty after trimming are discarded.
    """
    sentences = []
    for piece in SENTENCE_SPLIT.split(paragraph.strip()):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
|
|
|
|
|
|
def is_english_front_matter(chapter: dict, threshold: float = 0.5) -> bool:
    """Decide whether a chapter is untranslated English front matter.

    A chapter qualifies when the fraction of paragraphs whose trimmed
    ``paragraphsES`` and ``paragraphsEN`` texts are identical is strictly
    greater than ``threshold``. Per the original notes, front matter (preface,
    reading guide, ...) runs roughly 70-100% identical while story chapters sit
    around 25-35% because of verbatim `word = meaning` vocab lines, so a
    majority cutoff separates the two.

    Returns False whenever the comparison cannot be made: either list is
    missing or empty, or the two lists differ in length. Freshly extracted
    chapters carry no ``paragraphsEN``, so nothing is flagged on a first pass.
    """
    spanish = [text.strip() for text in chapter.get("paragraphsES", [])]
    english = [text.strip() for text in chapter.get("paragraphsEN", [])]

    if not spanish or not english or len(spanish) != len(english):
        return False

    matches = 0
    for es_text, en_text in zip(spanish, english):
        if es_text == en_text:
            matches += 1
    return matches / len(spanish) > threshold
|
|
|
|
|
|
def main() -> None:
    """Build (or refresh) the glossary job manifest for one book.

    Command line: ``<slug> [--batch-size N] [--max-examples N] [--build DIR]``.

    Reads ``<build>/<slug>/chapters.json``, skips chapters detected as English
    front matter, and collects every distinct cleaned word together with up to
    ``--max-examples`` sentences that use it. Words are ordered by first
    appearance and cut into batches of ``--batch-size``; each batch is written
    to ``glossary/<jobid>.input.json``. Job IDs whose ``.output.json`` does not
    exist yet are listed in ``glossary/_pending.txt``, and the subagent prompt
    is written to ``glossary/_prompt_template.md``. A short summary is printed.

    Exits with an argparse usage error when ``--batch-size`` is below 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("slug")
    parser.add_argument("--batch-size", type=int, default=150)
    parser.add_argument("--max-examples", type=int, default=3)
    parser.add_argument("--build", type=Path, default=Path("build"))
    args = parser.parse_args()
    if args.batch_size < 1:
        # range() below raises an opaque ValueError for 0 and silently yields
        # no jobs at all for negative steps; fail with a usage message instead.
        parser.error("--batch-size must be a positive integer")

    base = args.build / args.slug
    chapters = json.loads((base / "chapters.json").read_text(encoding="utf-8"))
    gloss_dir = base / "glossary"
    gloss_dir.mkdir(parents=True, exist_ok=True)

    # word -> example sentences. Dict insertion order doubles as the
    # first-appearance order used for batching.
    examples: dict[str, list[str]] = {}

    skipped_front_matter = 0
    for ch in chapters["chapters"]:
        if is_english_front_matter(ch):
            skipped_front_matter += 1
            continue
        for paragraph in ch.get("paragraphsES", []):
            for sentence in split_sentences(paragraph):
                # De-duplicate per sentence while keeping token order. A set
                # would iterate in hash order, which changes between runs
                # (str hash randomization); words could then move across
                # batch boundaries on a re-run and no longer match the
                # .output.json files already written for those job IDs.
                cleaned = dict.fromkeys(
                    clean_word(tok) for tok in sentence.split()
                )
                for w in cleaned:
                    if not w or not has_letter(w):
                        continue
                    bucket = examples.setdefault(w, [])
                    if len(bucket) < args.max_examples and sentence not in bucket:
                        bucket.append(sentence)

    words = list(examples)

    pending: list[str] = []
    completed: list[str] = []
    total_jobs = 0

    for offset in range(0, len(words), args.batch_size):
        chunk = words[offset : offset + args.batch_size]
        job_id = f"gloss_b{offset // args.batch_size:02d}"
        input_path = gloss_dir / f"{job_id}.input.json"
        output_path = gloss_dir / f"{job_id}.output.json"
        input_path.write_text(
            json.dumps(
                {
                    "jobId": job_id,
                    "words": [{"word": w, "examples": examples[w]} for w in chunk],
                },
                ensure_ascii=False,
                indent=2,
            ),
            encoding="utf-8",
        )
        total_jobs += 1
        # A job counts as done as soon as its output file exists (resumable).
        (completed if output_path.exists() else pending).append(job_id)

    (gloss_dir / "_pending.txt").write_text(
        "\n".join(pending) + ("\n" if pending else ""), encoding="utf-8"
    )
    (gloss_dir / "_prompt_template.md").write_text(
        PROMPT_TEMPLATE.format(
            input_path="<JOB_INPUT_PATH>", output_path="<JOB_OUTPUT_PATH>"
        ),
        encoding="utf-8",
    )

    print(f"Skipped front matter: {skipped_front_matter} chapter(s)")
    print(f"Distinct words: {len(words)}")
    print(f"Total glossary jobs: {total_jobs}")
    print(f"  Completed: {len(completed)}")
    print(f"  Pending: {len(pending)}")
    print(f"Manifest at: {gloss_dir / '_pending.txt'}")
    print(f"Prompt template at: {gloss_dir / '_prompt_template.md'}")
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|