Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present-tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
133
Conjuga/Scripts/textbook/ocr_pdf.swift
Normal file
133
Conjuga/Scripts/textbook/ocr_pdf.swift
Normal file
@@ -0,0 +1,133 @@
|
||||
#!/usr/bin/env swift
|
||||
// Rasterize each page of a PDF at high DPI and OCR it with Vision.
|
||||
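// Output: { "<pdfIndex>": { "lines": [...], "confidence": Double, "bookPage": Int? } }
// where <pdfIndex> is the 0-based page index within the PDF and "bookPage" is
// omitted when no printed page number could be detected.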
|
||||
//
|
||||
// Usage: swift ocr_pdf.swift <pdf_path> <output_json> [dpi]
|
||||
// Example: swift ocr_pdf.swift "book.pdf" pdf_ocr.json 240
|
||||
|
||||
import Foundation
|
||||
import Vision
|
||||
import AppKit
|
||||
import Quartz
|
||||
|
||||
// MARK: - Arguments and input document

// Two positional arguments are required; the third (dpi) is optional.
guard CommandLine.arguments.count >= 3 else {
    print("Usage: swift ocr_pdf.swift <pdf_path> <output_json> [dpi]")
    exit(1)
}

let pdfURL = URL(fileURLWithPath: CommandLine.arguments[1])
let outputURL = URL(fileURLWithPath: CommandLine.arguments[2])

// Rasterization resolution. Anything that is not a finite, positive number
// (missing, unparsable, "0", "-72", "nan", "inf") falls back to the default:
// a non-finite value would trap when the bitmap size is later converted to
// Int, and a non-positive one can never produce a drawable bitmap.
let dpi: CGFloat = {
    let fallback: CGFloat = 240.0
    guard CommandLine.arguments.count >= 4 else { return fallback }
    let raw = CommandLine.arguments[3]
    guard let value = Double(raw), value.isFinite, value > 0 else {
        print("Ignoring invalid dpi \"\(raw)\"; using \(fallback).")
        return fallback
    }
    return CGFloat(value)
}()

// Bail out early when the file is missing or is not a readable PDF.
guard let pdfDoc = PDFDocument(url: pdfURL) else {
    print("Could not open PDF at \(pdfURL.path)")
    exit(1)
}

let pageCount = pdfDoc.pageCount
print("PDF has \(pageCount) pages. Rendering at \(dpi) DPI.")
|
||||
|
||||
/// OCR output for one PDF page; one value of the top-level JSON object.
struct PageResult: Encodable {
    /// Recognized text lines for the page.
    var lines: [String]
    /// Aggregate recognition confidence for the page.
    var confidence: Double
    /// Printed page number, when one was detected; left out of the JSON when `nil`.
    var bookPage: Int?

    /// JSON keys, spelled out so the output schema is explicit. They match
    /// the property names, i.e. what the compiler would synthesize.
    private enum CodingKeys: String, CodingKey {
        case lines
        case confidence
        case bookPage
    }
}
|
||||
|
||||
// MARK: - OCR helpers

/// Rasterizes `page` onto an opaque white RGB bitmap whose size is the page's
/// media box multiplied by `scale`.
///
/// - Parameters:
///   - page: The PDF page to draw.
///   - index: Page index, used only to prefix diagnostics.
///   - scale: Pixels per PDF point.
/// - Returns: The rendered image, or `nil` (after printing a diagnostic) when
///   no bitmap could be created for the page.
func renderPage(_ page: PDFPage, index: Int, scale: CGFloat) -> CGImage? {
    let pageBounds = page.bounds(for: .mediaBox)
    let scaledSize = CGSize(width: pageBounds.width * scale, height: pageBounds.height * scale)

    // Int(exactly:) returns nil for NaN, infinite, or out-of-range sizes that
    // a plain Int(...) conversion would trap on; truncation matches Int(...).
    guard
        let pixelWidth = Int(exactly: Double(scaledSize.width).rounded(.towardZero)),
        let pixelHeight = Int(exactly: Double(scaledSize.height).rounded(.towardZero)),
        pixelWidth > 0, pixelHeight > 0,
        let context = CGContext(
            data: nil,
            width: pixelWidth,
            height: pixelHeight,
            bitsPerComponent: 8,
            bytesPerRow: 0,
            space: CGColorSpaceCreateDeviceRGB(),
            bitmapInfo: CGImageAlphaInfo.noneSkipLast.rawValue
        )
    else {
        print("\(index): could not create CGContext")
        return nil
    }

    // Paint a white background first, then draw the page scaled up to the
    // target resolution.
    context.setFillColor(CGColor(gray: 1.0, alpha: 1.0))
    context.fill(CGRect(origin: .zero, size: scaledSize))
    context.scaleBy(x: scale, y: scale)
    page.draw(with: .mediaBox, to: context)

    guard let image = context.makeImage() else {
        print("\(index): could not create CGImage")
        return nil
    }
    return image
}

/// Runs Vision text recognition on `image`.
///
/// - Returns: The top candidate of every observation, whitespace-trimmed with
///   empty strings dropped, plus the mean confidence of those candidates
///   (0 when nothing was recognized).
/// - Throws: Any error thrown by `VNImageRequestHandler.perform(_:)`.
func recognizeText(in image: CGImage) throws -> (lines: [String], confidence: Double) {
    let request = VNRecognizeTextRequest()
    request.recognitionLevel = .accurate
    request.recognitionLanguages = ["es-ES", "es", "en-US"]
    request.usesLanguageCorrection = true
    if #available(macOS 13.0, *) {
        request.automaticallyDetectsLanguage = true
    }

    try VNImageRequestHandler(cgImage: image, options: [:]).perform([request])

    var lines: [String] = []
    var totalConfidence: Float = 0
    for observation in request.results ?? [] {
        guard let top = observation.topCandidates(1).first else { continue }
        let text = top.string.trimmingCharacters(in: .whitespaces)
        guard !text.isEmpty else { continue }
        lines.append(text)
        totalConfidence += top.confidence
    }

    let average = lines.isEmpty ? 0.0 : Double(totalConfidence) / Double(lines.count)
    return (lines, average)
}

/// Guesses the printed page number of a page from its OCR lines.
///
/// Only the first three and last three lines are considered (typical
/// page-number placement); the first one that is a bare integer in 1...1000
/// wins. `lines` are expected to be whitespace-trimmed already.
func detectBookPage(in lines: [String]) -> Int? {
    let candidates = lines.prefix(3) + lines.suffix(3)
    return candidates.lazy
        .compactMap { Int($0) }
        .first { (1...1000).contains($0) }
}

// MARK: - Main loop

var results: [String: PageResult] = [:]
/// Indices of pages that produced no entry in `results`.
var failedPages: [Int] = []
let startTime = Date()

// Render at scale = dpi / 72 (PDF user space is 72 points per inch).
let scale: CGFloat = dpi / 72.0

for i in 0..<pageCount {
    // Drain the pool after every page so that any objects the frameworks
    // autorelease (and the large page bitmap they may retain) do not pile up
    // for the whole run; a command-line script has no run loop to do it.
    autoreleasepool {
        guard
            let page = pdfDoc.page(at: i),
            let image = renderPage(page, index: i, scale: scale)
        else {
            failedPages.append(i)
            return
        }

        do {
            let ocr = try recognizeText(in: image)
            results[String(i)] = PageResult(
                lines: ocr.lines,
                confidence: ocr.confidence,
                bookPage: detectBookPage(in: ocr.lines)
            )
        } catch {
            // Best effort: report the page and keep going with the rest.
            print("\(i): \(error)")
            failedPages.append(i)
        }
    }

    // Progress every 25 pages and on the last page, even if that page failed.
    let done = i + 1
    if done % 25 == 0 || done == pageCount {
        let elapsed = Date().timeIntervalSince(startTime)
        let rate = Double(done) / max(elapsed, 0.001)
        let remaining = Double(pageCount - done) / max(rate, 0.001)
        print(String(format: "%d/%d %.1f pg/s eta %.0fs", done, pageCount, rate, remaining))
    }
}

// MARK: - Output

let encoder = JSONEncoder()
encoder.outputFormatting = [.sortedKeys]
do {
    let data = try encoder.encode(results)
    // Atomic write: an interrupted run never leaves a truncated JSON file.
    try data.write(to: outputURL, options: .atomic)
    print("Wrote \(results.count) pages to \(outputURL.path)")
    if !failedPages.isEmpty {
        print("Skipped \(failedPages.count) page(s) that could not be rendered or recognized: \(failedPages)")
    }
} catch {
    print("Error writing output: \(error)")
    exit(1)
}
|
||||
Reference in New Issue
Block a user