Issue #32 cleanup — drop the last 5 mis-oriented vocab pairs
Two small fixes after the LLM-vision pass:
1. merge_pdf_into_book.py — when the LLM classifies an image as 'hybrid'
but extracts zero pairs (e.g., a conjugation table whose only English
text is on the section header that was excluded by the prompt rules),
respect that decision instead of falling through to the bbox/heuristic
pipeline. Previously: 1 chapter-2 estar conjugation table generated
4 bad pairs from the heuristic fallback.
2. fix_vocab.py language_score — recognize Spanish present-perfect
('he tenido', 'He andado por este pueblo') as Spanish. The classifier
was mistaking the haber auxiliaries 'he'/'has'/'ha' for English words
(e.g. the subject pronoun 'he'),
producing false-positive mis-orientation flags on 4 chapter-15/20/23
present-perfect example tables.
Result: mis-oriented vocab pairs across the book go from 5 → 0.
textbookDataVersion bumped to 14.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -6,7 +6,7 @@ actor DataLoader {
|
|||||||
static let courseDataVersion = 7
|
static let courseDataVersion = 7
|
||||||
static let courseDataKey = "courseDataVersion"
|
static let courseDataKey = "courseDataVersion"
|
||||||
|
|
||||||
static let textbookDataVersion = 13
|
static let textbookDataVersion = 14
|
||||||
static let textbookDataKey = "textbookDataVersion"
|
static let textbookDataKey = "textbookDataVersion"
|
||||||
|
|
||||||
/// Quick check: does the DB need seeding or course data refresh?
|
/// Quick check: does the DB need seeding or course data refresh?
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
@@ -1297,78 +1297,6 @@
|
|||||||
"section": "Subject Pronouns",
|
"section": "Subject Pronouns",
|
||||||
"sourceImage": "f0014-01.jpg"
|
"sourceImage": "f0014-01.jpg"
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"front": "vosotros estáis",
|
|
||||||
"back": "he is",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Estar (to be)",
|
|
||||||
"sourceImage": "f0015-01.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "ellos están",
|
|
||||||
"back": "she is",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Estar (to be)",
|
|
||||||
"sourceImage": "f0015-01.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "ellas están",
|
|
||||||
"back": "you are",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Estar (to be)",
|
|
||||||
"sourceImage": "f0015-01.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "Uds. están",
|
|
||||||
"back": "nosotros estamos",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Estar (to be)",
|
|
||||||
"sourceImage": "f0015-01.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "La comida está buena.",
|
|
||||||
"back": "The meal is (tastes) good.",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Estar (to be)",
|
|
||||||
"sourceImage": "f0016-02.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "El pescado está delicioso.",
|
|
||||||
"back": "The fish is (tastes) delicious.",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Estar (to be)",
|
|
||||||
"sourceImage": "f0016-02.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "La sopa está sabrosa.",
|
|
||||||
"back": "The soup is (tastes) delicious",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Estar (to be)",
|
|
||||||
"sourceImage": "f0016-02.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "Ella está hermosa hoy.",
|
|
||||||
"back": "She is (looks) pretty today.",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Estar (to be)",
|
|
||||||
"sourceImage": "f0016-02.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "Él está guapo.",
|
|
||||||
"back": "He is (looks) handsome.",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Estar (to be)",
|
|
||||||
"sourceImage": "f0016-02.jpg"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"front": "¿cómo?",
|
"front": "¿cómo?",
|
||||||
"back": "how?",
|
"back": "how?",
|
||||||
@@ -1513,150 +1441,6 @@
|
|||||||
"section": "Adjectives",
|
"section": "Adjectives",
|
||||||
"sourceImage": "f0017-03.jpg"
|
"sourceImage": "f0017-03.jpg"
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"front": "La casa es roja.",
|
|
||||||
"back": "The house is red.",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0019-02.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "El libro es azul.",
|
|
||||||
"back": "The book is blue.",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0019-02.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "Los carros son viejos.",
|
|
||||||
"back": "The cars are old.",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0019-02.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "Somos simpáticos.",
|
|
||||||
"back": "We are nice.",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0019-02.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "¿ Es la flor amarilla?",
|
|
||||||
"back": "Is the flower yellow?",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0019-02.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "El vino es de Portugal.",
|
|
||||||
"back": "The wine is from Portugal",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0020-01.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "La cerveza es de México.",
|
|
||||||
"back": "The beer is from Mexico.",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0020-01.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "El café es de Brazil.",
|
|
||||||
"back": "The coffee is from Brazil.",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0020-01.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "Somos amigos.",
|
|
||||||
"back": "We are friends.",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0020-02.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "José y Eduardo son hermanos.",
|
|
||||||
"back": "Joe and Ed are brothers",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0020-02.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "Pablo es español.",
|
|
||||||
"back": "Paul is Spanish.",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0020-02.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "¿ Eres tú cubano?",
|
|
||||||
"back": "Are you Cuban?",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0020-02.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "Fila es católica.",
|
|
||||||
"back": "She is Catholic",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0020-02.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "La mesa es de madera.",
|
|
||||||
"back": "The table is of wood.",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0020-03.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "La bolsa es de plástico.",
|
|
||||||
"back": "The bag is of plastic.",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0020-03.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "Los zapatos son de cuero.",
|
|
||||||
"back": "The shoes are of leather:",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0020-03.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "Las ventanas son de vidrio.",
|
|
||||||
"back": "The windows are of glass",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0020-03.jpg"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"front": "La casa es de piedra.",
|
|
||||||
"back": "The house is of stone.",
|
|
||||||
"chapter": 2,
|
|
||||||
"chapterTitle": "Estar, Ser, and Subject Pronouns",
|
|
||||||
"section": "Ser (to be)",
|
|
||||||
"sourceImage": "f0020-03.jpg"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"front": "la alcoba",
|
"front": "la alcoba",
|
||||||
"back": "the bedroom",
|
"back": "the bedroom",
|
||||||
|
|||||||
@@ -46,6 +46,9 @@ SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
|
|||||||
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their"}
|
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their"}
|
||||||
|
|
||||||
|
|
||||||
|
HABER_FORMS = {"he", "has", "ha", "hemos", "habéis", "han"}
|
||||||
|
|
||||||
|
|
||||||
def language_score(s: str) -> "tuple[int, int]":
|
def language_score(s: str) -> "tuple[int, int]":
|
||||||
"""Return (es_score, en_score) for a string."""
|
"""Return (es_score, en_score) for a string."""
|
||||||
es = 0
|
es = 0
|
||||||
@@ -56,9 +59,17 @@ def language_score(s: str) -> "tuple[int, int]":
|
|||||||
if not words:
|
if not words:
|
||||||
return (es, en)
|
return (es, en)
|
||||||
first = words[0].strip(",.;:")
|
first = words[0].strip(",.;:")
|
||||||
if first in SPANISH_ARTICLES:
|
second = words[1].strip(",.;:") if len(words) > 1 else ""
|
||||||
|
# Spanish present-perfect ("he tenido", "Ha andado") starts with a haber
|
||||||
|
# form followed by a past participle (regular -ado/-ido, or an irregular one ending in -to/-cho). Recognise this pattern
|
||||||
|
# before the bare-pronoun check so "he" isn't mistaken for English "he".
|
||||||
|
if first in HABER_FORMS and (
|
||||||
|
second.endswith(("ado", "ido", "to", "cho", "sto", "esto"))
|
||||||
|
):
|
||||||
|
es += 3
|
||||||
|
elif first in SPANISH_ARTICLES:
|
||||||
es += 2
|
es += 2
|
||||||
if first in ENGLISH_STARTERS:
|
elif first in ENGLISH_STARTERS:
|
||||||
en += 2
|
en += 2
|
||||||
# Spanish-likely endings on later words
|
# Spanish-likely endings on later words
|
||||||
for w in words:
|
for w in words:
|
||||||
|
|||||||
@@ -307,10 +307,13 @@ def main() -> None:
|
|||||||
|
|
||||||
# Choose pair source. For reference_only (Spanish-only tables)
|
# Choose pair source. For reference_only (Spanish-only tables)
|
||||||
# we deliberately produce no cards — the UI will fall back to
|
# we deliberately produce no cards — the UI will fall back to
|
||||||
# rendering the flat OCR lines as a reference list.
|
# rendering the flat OCR lines as a reference list. Same for
|
||||||
if llm_kind == "reference_only":
|
# hybrid images where the LLM determined no genuine pair rows
|
||||||
|
# exist (e.g. estar conjugations with English glosses on the
|
||||||
|
# header row only).
|
||||||
|
if llm_kind == "reference_only" or (llm_kind == "hybrid" and not llm_pairs):
|
||||||
cards_for_block = []
|
cards_for_block = []
|
||||||
pair_source = "llm-reference"
|
pair_source = "llm-no-pairs"
|
||||||
elif llm_pairs:
|
elif llm_pairs:
|
||||||
cards_for_block = [
|
cards_for_block = [
|
||||||
{"front": p["es"], "back": p["en"]}
|
{"front": p["es"], "back": p["en"]}
|
||||||
|
|||||||
Reference in New Issue
Block a user