Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
55 lines
2.0 KiB
Bash
Executable File
55 lines
2.0 KiB
Bash
Executable File
#!/usr/bin/env bash
# End-to-end textbook extraction pipeline.
#
# Requires: Python 3 + lxml/beautifulsoup4/pypdf installed.
#           macOS for Vision + NSSpellChecker (Swift).
#
# Inputs:  EPUB extracted to epub_extract/OEBPS/ and the PDF at project root.
# Outputs: book.json, vocab_cards.json, manual_review.json, quarantined_cards.json
#
# The OCR phases (1c, 1d) are skipped when ocr.json / pdf_ocr.json already
# exist next to this script; delete those files to force a fresh OCR run.
# NOTE(review): the cache check is existence-only, so a partially written
# JSON left behind by an interrupted run would be treated as a valid cache.

# Strict mode: abort on command failure, unset variables, and failures in
# any stage of a pipeline.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
readonly SCRIPT_DIR ROOT
cd "$ROOT"

# Invokes validate_vocab.swift with vocab_cards.json and
# vocab_validation.json (both in SCRIPT_DIR) as its two arguments.
validate_vocab() {
  swift "$SCRIPT_DIR/validate_vocab.swift" \
    "$SCRIPT_DIR/vocab_cards.json" \
    "$SCRIPT_DIR/vocab_validation.json"
}

echo "=== Phase 1a: parse XHTML chapters ==="
python3 "$SCRIPT_DIR/extract_chapters.py"

echo "=== Phase 1b: parse answer key ==="
python3 "$SCRIPT_DIR/extract_answers.py"

if [[ ! -f "$SCRIPT_DIR/ocr.json" ]]; then
  echo "=== Phase 1c: OCR EPUB images (first-time only) ==="
  swift "$SCRIPT_DIR/ocr_images.swift" "$ROOT/epub_extract/OEBPS" "$SCRIPT_DIR/ocr.json"
else
  echo "=== Phase 1c: EPUB OCR already cached ==="
fi

# Locate the first matching PDF at the project root. A glob is used instead
# of parsing `ls` output; when nothing matches, the unexpanded pattern fails
# the -f test and PDF_FILE stays empty.
PDF_FILE=""
for candidate in "$ROOT"/Complete\ Spanish\ Step-By-Step*.pdf; do
  if [[ -f "$candidate" ]]; then
    PDF_FILE="$candidate"
    break
  fi
done

if [[ -f "$SCRIPT_DIR/pdf_ocr.json" ]]; then
  echo "=== Phase 1d: PDF OCR already cached ==="
elif [[ -n "$PDF_FILE" ]]; then
  echo "=== Phase 1d: OCR PDF pages (first-time only) ==="
  # NOTE(review): 240 is passed through as the third argument to
  # ocr_pdf.swift; its meaning is defined there — confirm before changing.
  swift "$SCRIPT_DIR/ocr_pdf.swift" "$PDF_FILE" "$SCRIPT_DIR/pdf_ocr.json" 240
else
  # The pipeline still continues to the merge step, as it did before.
  echo "=== Phase 1d: no PDF found in $ROOT; skipping PDF OCR ===" >&2
fi

echo "=== Phase 1e: merge into book.json ==="
python3 "$SCRIPT_DIR/merge_pdf_into_book.py"

echo "=== Phase 2: spell-check validation ==="
validate_vocab

echo "=== Phase 3: auto-fix + quarantine pass 1 ==="
python3 "$SCRIPT_DIR/fix_vocab.py"

# Second validate + fix round, run against the output of pass 1.
echo "=== Phase 3: auto-fix + quarantine pass 2 (convergence) ==="
validate_vocab
python3 "$SCRIPT_DIR/fix_vocab.py"

echo ""
echo "=== Copy to app bundle ==="
cp "$SCRIPT_DIR/book.json" "$ROOT/Conjuga/Conjuga/textbook_data.json"
cp "$SCRIPT_DIR/vocab_cards.json" "$ROOT/Conjuga/Conjuga/textbook_vocab.json"
ls -lh "$ROOT/Conjuga/Conjuga/textbook_"*.json
echo ""
echo "Done. Bump textbookDataVersion in DataLoader.swift to trigger re-seed."