Render textbook vocab as paired Spanish→English grid
Previously the chapter reader showed vocab tables as a flat list of OCR lines — because Vision reads columns top-to-bottom, the Spanish column appeared as one block followed by the English column, making pairings illegible. Now every vocab table renders as a 2-column grid with Spanish on the left and English on the right. Supporting changes: - New ocr_all_vocab.swift: bounding-box OCR over all 931 vocab images, cluster lines into rows by Y-coordinate, split rows by largest X-gap, detect 2- / 3- / 4-column layouts automatically. ~2800 pairs extracted this pass vs ~1100 from the old block-alternation heuristic. - merge_pdf_into_book.py now prefers bounding-box pairs when present, falls back to the heuristic, embeds the resulting pairs as vocab_table.cards in book.json. - DataLoader passes cards through to TextbookBlock on seed. - TextbookChapterView renders cards via SwiftUI Grid (2 cols). - fix_vocab.py quarantine rule relaxed — only mis-pairs where both sides are clearly the same language are removed. "unknown" sides stay (bbox pipeline already oriented them correctly). Textbook card count jumps from 1044 → 3118 active pairs. textbookDataVersion bumped to 9. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -76,6 +76,7 @@
|
||||
B4603AA6EFB134794AA39BF4 /* LyricsLibraryView.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2B1F646394D7C03493F1BF /* LyricsLibraryView.swift */; };
|
||||
B73F6EED00304B718C6FEFFA /* GrammarExerciseView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1F71CA5CD67342F18319DB9A /* GrammarExerciseView.swift */; };
|
||||
BB48230C3B26EA6E84D2D823 /* DailyProgressRing.swift in Sources */ = {isa = PBXBuildFile; fileRef = 180F9D59828C36B44A5E384F /* DailyProgressRing.swift */; };
|
||||
BC662C36AC503E00A977CEC1 /* VocabGridTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6584E0FDA939E3B82EECA4B5 /* VocabGridTests.swift */; };
|
||||
BF0832865857EFDA1D1CDEAD /* SharedModels in Frameworks */ = {isa = PBXBuildFile; productRef = BCCBABD74CADDB118179D8E9 /* SharedModels */; };
|
||||
C0BAEF49A6270D8F64CF13D6 /* PracticeViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = C359C051FB157EF447561405 /* PracticeViewModel.swift */; };
|
||||
C1F84182F12EB5CFF32768B6 /* MainTabView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5983A534E4836F30B5281ACB /* MainTabView.swift */; };
|
||||
@@ -184,6 +185,7 @@
|
||||
5E7EF4161C73AAC67B3A0004 /* WeekTestView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WeekTestView.swift; sourceTree = "<group>"; };
|
||||
626873572466403C0288090D /* QuizType.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = QuizType.swift; sourceTree = "<group>"; };
|
||||
631DC0A942DD57C81DECE083 /* DeckStudyView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DeckStudyView.swift; sourceTree = "<group>"; };
|
||||
6584E0FDA939E3B82EECA4B5 /* VocabGridTests.swift */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = sourcecode.swift; path = VocabGridTests.swift; sourceTree = "<group>"; };
|
||||
69D98E1564C6538056D81200 /* TenseEndingTable.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TenseEndingTable.swift; sourceTree = "<group>"; };
|
||||
6B9A9F2AB21895E06989A4D5 /* FlashcardView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FlashcardView.swift; sourceTree = "<group>"; };
|
||||
70960F0FD7509310B3F61C48 /* LyricsSearchView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LyricsSearchView.swift; sourceTree = "<group>"; };
|
||||
@@ -526,6 +528,7 @@
|
||||
CEEA84E15880A9D56DE18F33 /* TextbookFlowUITests.swift */,
|
||||
8A630C74D28CE1B280C9F296 /* AllChaptersScreenshotTests.swift */,
|
||||
8F08E1DC6932D9EA1D380913 /* StemChangeToggleTests.swift */,
|
||||
6584E0FDA939E3B82EECA4B5 /* VocabGridTests.swift */,
|
||||
);
|
||||
name = ConjugaUITests;
|
||||
path = ConjugaUITests;
|
||||
@@ -802,6 +805,7 @@
|
||||
96A3E5FA8EC63123D97365E1 /* TextbookFlowUITests.swift in Sources */,
|
||||
F7E459C46F25A8A45D7E0DFB /* AllChaptersScreenshotTests.swift in Sources */,
|
||||
1B0B3B2C771AD72E25B3493C /* StemChangeToggleTests.swift in Sources */,
|
||||
BC662C36AC503E00A977CEC1 /* VocabGridTests.swift in Sources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
|
||||
@@ -6,7 +6,7 @@ actor DataLoader {
|
||||
static let courseDataVersion = 7
|
||||
static let courseDataKey = "courseDataVersion"
|
||||
|
||||
static let textbookDataVersion = 8
|
||||
static let textbookDataVersion = 9
|
||||
static let textbookDataKey = "textbookDataVersion"
|
||||
|
||||
/// Quick check: does the DB need seeding or course data refresh?
|
||||
@@ -426,6 +426,17 @@ actor DataLoader {
|
||||
if let src = b["sourceImage"] { out["sourceImage"] = src }
|
||||
if let lines = b["ocrLines"] { out["ocrLines"] = lines }
|
||||
if let conf = b["ocrConfidence"] { out["ocrConfidence"] = conf }
|
||||
// Paired Spanish→English cards from the bounding-box extractor.
|
||||
if let cards = b["cards"] as? [[String: Any]], !cards.isEmpty {
|
||||
let normalized: [[String: Any]] = cards.compactMap { c in
|
||||
guard let front = c["front"] as? String,
|
||||
let back = c["back"] as? String else { return nil }
|
||||
return ["front": front, "back": back]
|
||||
}
|
||||
if !normalized.isEmpty {
|
||||
out["cards"] = normalized
|
||||
}
|
||||
}
|
||||
case "exercise":
|
||||
exerciseCount += 1
|
||||
if let exId = b["id"] { out["exerciseId"] = exId }
|
||||
|
||||
@@ -84,7 +84,9 @@ struct TextbookChapterView: View {
|
||||
|
||||
private func vocabTableView(_ block: TextbookBlock) -> some View {
|
||||
let expanded = expandedVocab.contains(block.index)
|
||||
let cards = block.cards ?? []
|
||||
let lines = block.ocrLines ?? []
|
||||
let itemCount = cards.isEmpty ? lines.count : cards.count
|
||||
return VStack(alignment: .leading, spacing: 4) {
|
||||
Button {
|
||||
if expanded { expandedVocab.remove(block.index) } else { expandedVocab.insert(block.index) }
|
||||
@@ -92,7 +94,7 @@ struct TextbookChapterView: View {
|
||||
HStack {
|
||||
Image(systemName: expanded ? "chevron.down" : "chevron.right")
|
||||
.font(.caption)
|
||||
Text("Vocabulary (\(lines.count) items)")
|
||||
Text("Vocabulary (\(itemCount) items)")
|
||||
.font(.subheadline.weight(.medium))
|
||||
.foregroundStyle(.primary)
|
||||
Spacer()
|
||||
@@ -102,14 +104,20 @@ struct TextbookChapterView: View {
|
||||
.buttonStyle(.plain)
|
||||
|
||||
if expanded {
|
||||
VStack(alignment: .leading, spacing: 2) {
|
||||
ForEach(Array(lines.enumerated()), id: \.offset) { _, line in
|
||||
Text(line)
|
||||
.font(.callout.monospaced())
|
||||
.foregroundStyle(.secondary)
|
||||
if cards.isEmpty {
|
||||
// Fallback: no paired cards available — show raw OCR lines.
|
||||
VStack(alignment: .leading, spacing: 2) {
|
||||
ForEach(Array(lines.enumerated()), id: \.offset) { _, line in
|
||||
Text(line)
|
||||
.font(.callout.monospaced())
|
||||
.foregroundStyle(.secondary)
|
||||
}
|
||||
}
|
||||
.padding(.leading, 14)
|
||||
} else {
|
||||
vocabGrid(cards: cards)
|
||||
.padding(.leading, 14)
|
||||
}
|
||||
.padding(.leading, 14)
|
||||
}
|
||||
}
|
||||
.padding(10)
|
||||
@@ -117,6 +125,22 @@ struct TextbookChapterView: View {
|
||||
.background(Color.orange.opacity(0.08), in: RoundedRectangle(cornerRadius: 10))
|
||||
}
|
||||
|
||||
@ViewBuilder
|
||||
private func vocabGrid(cards: [TextbookVocabPair]) -> some View {
|
||||
Grid(alignment: .leading, horizontalSpacing: 16, verticalSpacing: 6) {
|
||||
ForEach(Array(cards.enumerated()), id: \.offset) { _, card in
|
||||
GridRow {
|
||||
Text(card.front)
|
||||
.font(.callout)
|
||||
.foregroundStyle(.primary)
|
||||
Text(card.back)
|
||||
.font(.callout)
|
||||
.foregroundStyle(.secondary)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private func exerciseLinkView(_ block: TextbookBlock) -> some View {
|
||||
NavigationLink(value: TextbookExerciseDestination(
|
||||
chapterId: chapter.id,
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
53
Conjuga/ConjugaUITests/VocabGridTests.swift
Normal file
53
Conjuga/ConjugaUITests/VocabGridTests.swift
Normal file
@@ -0,0 +1,53 @@
|
||||
import XCTest
|
||||
|
||||
final class VocabGridTests: XCTestCase {

    override func setUpWithError() throws {
        // A broken navigation step invalidates every later assertion; stop early.
        continueAfterFailure = false
    }

    /// Verifies the chapter reader renders vocab tables as a paired Spanish↔English grid.
    func testChapter4VocabGrid() throws {
        let app = XCUIApplication()
        app.launchArguments += ["-onboardingComplete", "YES"]
        app.launch()

        // Navigate: Course tab → textbook row → chapter 4.
        app.tabBars.buttons["Course"].tap()

        let textbookPredicate = NSPredicate(format: "label CONTAINS[c] 'Complete Spanish'")
        let textbookRow = app.buttons.containing(textbookPredicate).firstMatch
        XCTAssertTrue(textbookRow.waitForExistence(timeout: 5))
        textbookRow.tap()

        let chapterFourRow = app.buttons["textbook-chapter-row-4"]
        XCTAssertTrue(chapterFourRow.waitForExistence(timeout: 3))
        chapterFourRow.tap()

        attach(app, name: "01-ch4-top")

        // Tap the first vocab disclosure — "Vocabulary (N items)"
        let vocabPredicate = NSPredicate(format: "label BEGINSWITH 'Vocabulary ('")
        let vocabDisclosure = app.buttons.matching(vocabPredicate).firstMatch
        XCTAssertTrue(vocabDisclosure.waitForExistence(timeout: 3))
        vocabDisclosure.tap()
        // Brief pause so the expansion animation settles before the screenshot.
        Thread.sleep(forTimeInterval: 0.4)

        attach(app, name: "02-ch4-vocab-open")

        // Scroll a little and screenshot a deeper vocab — numbers table is
        // typically a few screens down in chapter 4.
        app.swipeUp(velocity: .fast)
        app.swipeUp(velocity: .fast)
        attach(app, name: "03-ch4-deeper")
    }

    /// Captures a full-app screenshot and keeps it in the test report under `name`.
    private func attach(_ app: XCUIApplication, name: String) {
        let attachment = XCTAttachment(screenshot: app.screenshot())
        attachment.name = name
        attachment.lifetime = .keepAlways
        add(attachment)
    }
}
|
||||
@@ -173,14 +173,17 @@ def main() -> None:
|
||||
kept_cards.append(card)
|
||||
continue
|
||||
|
||||
# Quarantine obvious mis-pairs: both sides same language OR language mismatch
|
||||
# Quarantine only clear mis-pairs: both sides EXPLICITLY the wrong
|
||||
# language (both Spanish or both English). "unknown" sides stay —
|
||||
# the bounding-box pipeline already handled orientation correctly
|
||||
# and many valid pairs lack the article/accent markers we classify on.
|
||||
fes, fen = language_score(card["front"])
|
||||
bes, ben = language_score(card["back"])
|
||||
front_lang = "es" if fes > fen else ("en" if fen > fes else "unknown")
|
||||
back_lang = "es" if bes > ben else ("en" if ben > bes else "unknown")
|
||||
# A good card has front=es, back=en. Anything else when the card is
|
||||
# flagged is almost always a column-pairing error.
|
||||
if front_lang != "es" or back_lang != "en":
|
||||
bothSameLang = (front_lang == "es" and back_lang == "es") or (front_lang == "en" and back_lang == "en")
|
||||
reversed_pair = front_lang == "en" and back_lang == "es"
|
||||
if bothSameLang or reversed_pair:
|
||||
quarantined_cards.append({
|
||||
"chapter": ch["chapter"],
|
||||
"front": card["front"],
|
||||
|
||||
@@ -33,6 +33,7 @@ CHAPTERS_JSON = HERE / "chapters.json"
|
||||
ANSWERS_JSON = HERE / "answers.json"
|
||||
OCR_JSON = HERE / "ocr.json"
|
||||
PDF_OCR_JSON = HERE / "pdf_ocr.json"
|
||||
PAIRED_VOCAB_JSON = HERE / "paired_vocab.json" # bounding-box pairs (preferred)
|
||||
OUT_BOOK = HERE / "book.json"
|
||||
OUT_VOCAB = HERE / "vocab_cards.json"
|
||||
|
||||
@@ -222,7 +223,9 @@ def main() -> None:
|
||||
epub_ocr = load(OCR_JSON)
|
||||
pdf_ocr_raw = load(PDF_OCR_JSON) if PDF_OCR_JSON.exists() else {}
|
||||
pdf_pages = build_pdf_page_index(pdf_ocr_raw) if pdf_ocr_raw else {}
|
||||
paired_vocab = load(PAIRED_VOCAB_JSON) if PAIRED_VOCAB_JSON.exists() else {}
|
||||
print(f"Mapped {len(pdf_pages)} PDF pages to book page numbers")
|
||||
print(f"Loaded bounding-box pairs for {len(paired_vocab)} vocab images")
|
||||
|
||||
# Build a global set of EPUB narrative lines (for subtraction when pulling vocab)
|
||||
narrative_set = set()
|
||||
@@ -279,19 +282,48 @@ def main() -> None:
|
||||
if repairs > 0:
|
||||
merged_pages += 1
|
||||
|
||||
derived = build_vocab_cards_for_block(
|
||||
# Prefer bounding-box pairs (from paired_vocab.json) when
|
||||
# present. Fall back to the block-alternation heuristic.
|
||||
bbox = paired_vocab.get(src, {})
|
||||
bbox_pairs = bbox.get("pairs", []) if isinstance(bbox, dict) else []
|
||||
heuristic = build_vocab_cards_for_block(
|
||||
{"src": src},
|
||||
{"lines": merged_lines, "confidence": merged_conf},
|
||||
ch, current_section_title, bi
|
||||
)
|
||||
all_vocab_cards.extend(derived)
|
||||
|
||||
if bbox_pairs:
|
||||
cards_for_block = [
|
||||
{"front": p["es"], "back": p["en"]}
|
||||
for p in bbox_pairs
|
||||
if p.get("es") and p.get("en")
|
||||
]
|
||||
# Also feed the flashcard deck
|
||||
for p in bbox_pairs:
|
||||
if p.get("es") and p.get("en"):
|
||||
all_vocab_cards.append({
|
||||
"front": p["es"],
|
||||
"back": p["en"],
|
||||
"chapter": ch["number"],
|
||||
"chapterTitle": ch["title"],
|
||||
"section": current_section_title,
|
||||
"sourceImage": src,
|
||||
})
|
||||
pair_source = "bbox"
|
||||
else:
|
||||
cards_for_block = [{"front": c["front"], "back": c["back"]} for c in heuristic]
|
||||
all_vocab_cards.extend(heuristic)
|
||||
pair_source = "heuristic"
|
||||
|
||||
out_blocks.append({
|
||||
"kind": "vocab_table",
|
||||
"sourceImage": src,
|
||||
"ocrLines": merged_lines,
|
||||
"ocrConfidence": merged_conf,
|
||||
"cardCount": len(derived),
|
||||
"source": "pdf-repaired" if repairs > 0 else ("epub" if epub_lines else "pdf"),
|
||||
"cardCount": len(cards_for_block),
|
||||
"cards": cards_for_block,
|
||||
"columnCount": bbox.get("columnCount", 2) if isinstance(bbox, dict) else 2,
|
||||
"source": pair_source,
|
||||
"bookPage": book_page,
|
||||
"repairs": repairs,
|
||||
})
|
||||
|
||||
232
Conjuga/Scripts/textbook/ocr_all_vocab.swift
Normal file
232
Conjuga/Scripts/textbook/ocr_all_vocab.swift
Normal file
@@ -0,0 +1,232 @@
|
||||
#!/usr/bin/env swift
|
||||
// Bounding-box OCR over every vocab image, producing Spanish→English pairs.
|
||||
// Much higher accuracy than the flat-OCR block-alternation heuristic because
|
||||
// we use each recognized line's position on the page: rows are clustered by
|
||||
// Y-coordinate and cells within a row are split by the biggest X gap.
|
||||
//
|
||||
// Usage: swift ocr_all_vocab.swift <image_list.json> <oebps_dir> <output.json>
|
||||
|
||||
import Foundation
|
||||
import Vision
|
||||
import AppKit
|
||||
|
||||
// MARK: - Argument parsing & input loading

let argv = CommandLine.arguments
guard argv.count >= 4 else {
    print("Usage: swift ocr_all_vocab.swift <image_list.json> <oebps_dir> <output.json>")
    exit(1)
}

let imageListURL = URL(fileURLWithPath: argv[1])
let oebpsDir = URL(fileURLWithPath: argv[2])
let outputURL = URL(fileURLWithPath: argv[3])

// The image list is a plain JSON array of file names relative to the OEBPS dir.
guard let listData = try? Data(contentsOf: imageListURL),
      let imageNames = try? JSONDecoder().decode([String].self, from: listData) else {
    print("Could not load image list at \(imageListURL.path)")
    exit(1)
}
print("Processing \(imageNames.count) images...")
|
||||
|
||||
/// One recognized text line plus the center of its bounding box.
/// Coordinates are normalized 0…1; `cy` is flipped at construction
/// (see `recognize`) so that smaller values are higher on the page.
struct RecognizedLine {
    let text: String
    // Horizontal center of the bounding box.
    let cx: Double
    // Vertical center, top-down (1.0 - Vision's bottom-up center).
    let cy: Double
    // Confidence of the top OCR candidate for this line.
    let confidence: Double
}

/// A Spanish→English vocab pairing; `confidence` is the mean OCR
/// confidence of the source row's lines.
struct Pair: Encodable {
    var es: String
    var en: String
    var confidence: Double
}

/// Per-image output record written to the results JSON.
struct ImageResult: Encodable {
    var pairs: [Pair]
    // Detected table layout (2, 3, or 4 columns).
    var columnCount: Int
    // How pairs were derived: "bbox-row-split" or "empty" (no OCR lines).
    var strategy: String
    // Raw recognized-line count, useful for diagnosing unpaired tables.
    var lineCount: Int
}
|
||||
|
||||
// Character and leading-word cues used to guess which language a cell is in.
let spanishAccents = Set<Character>(["á","é","í","ó","ú","ñ","ü","Á","É","Í","Ó","Ú","Ñ","Ü","¿","¡"])
let spanishArticles: Set<String> = ["el","la","los","las","un","una","unos","unas"]
let englishStarters: Set<String> = ["the","a","an","to","my","his","her","our","their","your"]
let englishOnly: Set<String> = ["the","he","she","it","we","they","is","are","was","were","been","have","has","had","will","would"]

/// Best-effort language guess for a single cell.
/// Returns "es", "en", or "?" when neither heuristic fires.
func classify(_ s: String) -> String {
    let lowered = s.lowercased()
    // Any Spanish-only character (accent, ñ, inverted punctuation) is decisive.
    if lowered.contains(where: { spanishAccents.contains($0) }) {
        return "es"
    }
    // Otherwise inspect the first word, stripped of surrounding punctuation.
    let firstWord = lowered.split(separator: " ").first.map(String.init)?
        .trimmingCharacters(in: .punctuationCharacters) ?? ""
    if spanishArticles.contains(firstWord) {
        return "es"
    }
    if englishStarters.contains(firstWord) || englishOnly.contains(firstWord) {
        return "en"
    }
    return "?"
}
|
||||
|
||||
/// Runs accurate-mode Vision text recognition on `cgImage` and returns each
/// non-empty recognized line with the normalized center of its bounding box.
/// `cy` is flipped (1.0 - center) so rows can be sorted top-to-bottom.
func recognize(_ cgImage: CGImage) -> [RecognizedLine] {
    let request = VNRecognizeTextRequest()
    request.recognitionLevel = .accurate
    request.recognitionLanguages = ["es-ES", "es", "en-US"]
    request.usesLanguageCorrection = true
    if #available(macOS 13.0, *) { request.automaticallyDetectsLanguage = true }
    // Errors are swallowed deliberately: a failed image simply yields no lines.
    try? VNImageRequestHandler(cgImage: cgImage, options: [:]).perform([request])

    return (request.results ?? []).compactMap { observation in
        guard let candidate = observation.topCandidates(1).first else { return nil }
        let text = candidate.string.trimmingCharacters(in: .whitespaces)
        guard !text.isEmpty else { return nil }
        let box = observation.boundingBox
        return RecognizedLine(
            text: text,
            cx: Double(box.origin.x + box.width / 2),
            // Vision's origin is bottom-left; flip so smaller cy = higher row.
            cy: Double(1.0 - (box.origin.y + box.height / 2)),
            confidence: Double(candidate.confidence)
        )
    }
}
|
||||
|
||||
/// Split a row's lines into cells by cutting at the largest X gap(s).
/// `desiredCells` = 2 for 2-col, 4 for 2-pair, etc.
///
/// Fixes vs. the first version:
/// - A row with fewer lines than the table has columns (e.g. an empty cell
///   in a 4-column layout) used to collapse into ONE cell, which
///   `cellsToPairs` then dropped entirely. Now we split into
///   `min(desiredCells, lines.count)` cells so the row stays pair-able.
/// - The fallback join now sorts left-to-right first; previously it joined
///   in Vision recognition order, contradicting its own comment.
func splitRow(_ lines: [RecognizedLine], into desiredCells: Int) -> [String] {
    let sorted = lines.sorted { $0.cx < $1.cx }
    // Never ask for more cells than there are lines to fill them.
    let cellCount = min(desiredCells, sorted.count)
    guard cellCount >= 2 else {
        // 0 or 1 lines: the whole row is a single cell.
        return [sorted.map(\.text).joined(separator: " ")]
    }
    // Gap between each adjacent pair of line centers, left to right.
    var gaps: [(idx: Int, gap: Double)] = []
    for i in 1..<sorted.count {
        gaps.append((i, sorted[i].cx - sorted[i - 1].cx))
    }
    // Cut at the (cellCount - 1) widest gaps, applied in left-to-right order.
    let splitAt = gaps.sorted { $0.gap > $1.gap }
        .prefix(cellCount - 1)
        .map(\.idx)
        .sorted()
    var cells: [[RecognizedLine]] = []
    var start = 0
    for s in splitAt {
        cells.append(Array(sorted[start..<s]))
        start = s
    }
    cells.append(Array(sorted[start..<sorted.count]))
    return cells.map { $0.map(\.text).joined(separator: " ").trimmingCharacters(in: .whitespaces) }
}
|
||||
|
||||
/// Cluster lines into rows by Y proximity. Returns rows in top-to-bottom order.
///
/// Lines are ordered by vertical center first; a new row starts whenever the
/// vertical jump from the previously appended line exceeds `tol` (a fraction
/// of the normalized page height).
func groupRows(_ lines: [RecognizedLine], tol: Double = 0.025) -> [[RecognizedLine]] {
    var rows: [[RecognizedLine]] = []
    var currentRow: [RecognizedLine] = []
    for line in lines.sorted(by: { $0.cy < $1.cy }) {
        // Same row while the gap to the last appended line stays within tol.
        guard let anchor = currentRow.last, abs(line.cy - anchor.cy) > tol else {
            currentRow.append(line)
            continue
        }
        rows.append(currentRow)
        currentRow = [line]
    }
    if !currentRow.isEmpty { rows.append(currentRow) }
    return rows
}
|
||||
|
||||
/// Detect likely column count: look at how many x-cluster peaks exist across all rows.
/// Clusters X-coords from all lines into buckets of 10% width.
func detectColumnCount(_ lines: [RecognizedLine]) -> Int {
    guard !lines.isEmpty else { return 2 }
    // Histogram of horizontal centers in 10%-wide buckets across [0, 1].
    let bucketWidth = 0.10
    let bucketCount = Int(1.0 / bucketWidth) + 1
    var histogram = [Int](repeating: 0, count: bucketCount)
    for line in lines {
        let bucket = min(max(0, Int(line.cx / bucketWidth)), bucketCount - 1)
        histogram[bucket] += 1
    }
    // A peak = a bucket holding at least 10% of all lines (minimum 2).
    let threshold = max(2, lines.count / 10)
    let peaks = histogram.filter { $0 >= threshold }.count
    // Most tables are 2-col (peaks = 2). Some 4-col (2 ES/EN pairs side by
    // side → peaks = 4). Roman/decorative layouts may show 0-1 peaks; treat as 2.
    if peaks <= 2 { return 2 }
    return peaks == 3 ? 3 : 4
}
|
||||
|
||||
/// Merge label-less cells into Spanish→English pairs.
/// `cells` is a row's cells (length = columnCount). For N=2, [es, en]. For N=4,
/// [es1, en1, es2, en2] (two pairs). For N=3, [es, en_short, en_long] (rare, merge).
func cellsToPairs(_ cells: [String], columnCount: Int) -> [(String, String)] {
    if columnCount == 2, cells.count >= 2 {
        return [(cells[0], cells[1])]
    }
    if columnCount == 3, cells.count >= 3 {
        // 3-col source: es | en | en-alternate. Keep both EN readings.
        return [(cells[0], "\(cells[1]) / \(cells[2])")]
    }
    if columnCount == 4, cells.count >= 4 {
        return [(cells[0], cells[1]), (cells[2], cells[3])]
    }
    // Fallback for under-populated rows: first cell vs. everything else.
    guard cells.count >= 2 else { return [] }
    return [(cells[0], cells.dropFirst().joined(separator: " "))]
}
|
||||
|
||||
/// Swap pair if orientation is backwards (English on left, Spanish on right).
/// Ambiguous classifications ("?" on either side) are left untouched.
func orientPair(_ pair: (String, String)) -> (String, String) {
    let (left, right) = pair
    guard classify(left) == "en", classify(right) == "es" else { return pair }
    return (right, left)
}
|
||||
|
||||
// MARK: - Driver: OCR every image, extract pairs, write results JSON.

var results: [String: ImageResult] = [:]
var processed = 0
let startTime = Date()

for name in imageNames {
    processed += 1
    let url = oebpsDir.appendingPathComponent(name)
    // Decode to CGImage via NSBitmapImageRep; skip unreadable images.
    guard let image = NSImage(contentsOf: url),
          let tiffData = image.tiffRepresentation,
          let bitmap = NSBitmapImageRep(data: tiffData),
          let cgImage = bitmap.cgImage else {
        continue
    }

    let lines = recognize(cgImage)
    guard !lines.isEmpty else {
        results[name] = ImageResult(pairs: [], columnCount: 2, strategy: "empty", lineCount: 0)
        continue
    }

    let columns = detectColumnCount(lines)
    var pairs: [Pair] = []
    // Rows need at least two lines to ever produce a left/right pairing.
    for row in groupRows(lines, tol: 0.025) where row.count >= 2 {
        let cells = splitRow(row, into: columns)
        // All pairs from one row share the row's mean line confidence.
        let meanConfidence = row.reduce(0.0) { $0 + $1.confidence } / Double(row.count)
        for rawPair in cellsToPairs(cells, columnCount: columns) {
            let (es, en) = orientPair(rawPair)
            guard !es.isEmpty, !en.isEmpty else { continue }
            pairs.append(Pair(es: es, en: en, confidence: meanConfidence))
        }
    }
    results[name] = ImageResult(
        pairs: pairs,
        columnCount: columns,
        strategy: "bbox-row-split",
        lineCount: lines.count
    )

    // Progress/ETA line every 50 images and at completion.
    if processed % 50 == 0 || processed == imageNames.count {
        let elapsed = Date().timeIntervalSince(startTime)
        let rate = Double(processed) / max(elapsed, 0.001)
        let eta = Double(imageNames.count - processed) / max(rate, 0.001)
        print(String(format: "%d/%d %.1f img/s eta %.0fs", processed, imageNames.count, rate, eta))
    }
}

// Stable key order keeps the output diff-friendly across runs.
let encoder = JSONEncoder()
encoder.outputFormatting = [.sortedKeys]
try encoder.encode(results).write(to: outputURL)
let totalPairs = results.values.reduce(0) { $0 + $1.pairs.count }
let emptyTables = results.values.filter { $0.pairs.isEmpty }.count
print("Wrote \(results.count) results, \(totalPairs) total pairs, \(emptyTables) unpaired")
|
||||
Reference in New Issue
Block a user