#!/usr/bin/env bash
# End-to-end textbook extraction pipeline.
#
# Requires: Python 3 + lxml/beautifulsoup4/pypdf installed.
#           macOS for Vision + NSSpellChecker (Swift).
#
# Inputs:   EPUB extracted to epub_extract/OEBPS/ and the PDF at project root.
# Outputs:  book.json, vocab_cards.json, manual_review.json,
#           quarantined_cards.json
#
# Usage: run with no arguments from any directory; every path is resolved
# relative to this script's own location.

# Abort on any failing command, on use of an unset variable, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# BASH_SOURCE locates this file even when the script is sourced, unlike $0.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# The project root sits three directory levels above this script.
ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
cd "$ROOT"

echo "=== Phase 1a: parse XHTML chapters ==="
python3 "$SCRIPT_DIR/extract_chapters.py"

echo "=== Phase 1b: parse answer key ==="
python3 "$SCRIPT_DIR/extract_answers.py"

# The EPUB image OCR result is cached in ocr.json; delete that file to force
# a fresh OCR run.
if [[ ! -f "$SCRIPT_DIR/ocr.json" ]]; then
  echo "=== Phase 1c: OCR EPUB images (first-time only) ==="
  swift "$SCRIPT_DIR/ocr_images.swift" "$ROOT/epub_extract/OEBPS" "$SCRIPT_DIR/ocr.json"
else
  echo "=== Phase 1c: EPUB OCR already cached ==="
fi

# Pick the first matching PDF at the project root. A glob is used instead of
# parsing `ls` output; when nothing matches, the pattern stays literal and is
# rejected by the -f test, leaving PDF_FILE empty.
PDF_FILE=""
for pdf_candidate in "$ROOT"/Complete\ Spanish\ Step-By-Step*.pdf; do
  if [[ -f "$pdf_candidate" ]]; then
    PDF_FILE="$pdf_candidate"
    break
  fi
done

# The PDF OCR runs only when a PDF exists and no cached pdf_ocr.json is present.
if [[ -f "$SCRIPT_DIR/pdf_ocr.json" ]]; then
  echo "=== Phase 1d: PDF OCR already cached ==="
elif [[ -n "$PDF_FILE" ]]; then
  echo "=== Phase 1d: OCR PDF pages (first-time only) ==="
  # NOTE(review): the meaning of the trailing 240 is defined by ocr_pdf.swift
  # and is not visible from this script - confirm before changing it.
  swift "$SCRIPT_DIR/ocr_pdf.swift" "$PDF_FILE" "$SCRIPT_DIR/pdf_ocr.json" 240
else
  # Report the skip so a missing input is visible; the pipeline continues.
  echo "=== Phase 1d: no PDF found at project root; skipping PDF OCR ===" >&2
fi

echo "=== Phase 1e: merge into book.json ==="
python3 "$SCRIPT_DIR/merge_pdf_into_book.py"

echo "=== Phase 2: spell-check validation ==="
swift "$SCRIPT_DIR/validate_vocab.swift" "$SCRIPT_DIR/vocab_cards.json" "$SCRIPT_DIR/vocab_validation.json"

echo "=== Phase 3: auto-fix + quarantine pass 1 ==="
python3 "$SCRIPT_DIR/fix_vocab.py"

# Pass 2 re-validates the cards produced by pass 1 and runs the fixer again.
echo "=== Phase 3: auto-fix + quarantine pass 2 (convergence) ==="
swift "$SCRIPT_DIR/validate_vocab.swift" "$SCRIPT_DIR/vocab_cards.json" "$SCRIPT_DIR/vocab_validation.json"
python3 "$SCRIPT_DIR/fix_vocab.py"

echo ""
echo "=== Copy to app bundle ==="
cp "$SCRIPT_DIR/book.json" "$ROOT/Conjuga/Conjuga/textbook_data.json"
cp "$SCRIPT_DIR/vocab_cards.json" "$ROOT/Conjuga/Conjuga/textbook_vocab.json"
ls -lh "$ROOT/Conjuga/Conjuga/textbook_"*.json

echo ""
# Two echo calls emit the same bytes as the original two-line string literal.
echo "Done. "
echo "Bump textbookDataVersion in DataLoader.swift to trigger re-seed."