Initial commit: Conjuga Spanish conjugation app
Includes SwiftData dual-store architecture (local reference + CloudKit user data), JSON-based data seeding, 20 tense guides, 20 grammar notes, SRS review system, course vocabulary, and widget support. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
47501
Conjuga/Scripts/all_courses_data.json
Normal file
47501
Conjuga/Scripts/all_courses_data.json
Normal file
File diff suppressed because it is too large
Load Diff
14
Conjuga/Scripts/build_store.swift
Normal file
14
Conjuga/Scripts/build_store.swift
Normal file
@@ -0,0 +1,14 @@
|
||||
#!/usr/bin/env swift

// Run this script to generate a pre-built SwiftData store (default.store)
// that ships with the app bundle. No first-launch seeding needed.

import Foundation
import SwiftData

// We can't easily run this as a standalone script because it needs
// the @Model types compiled. Instead, we'll build it as part of the app.
// See DataLoader.buildPreloadedStore() below.
// NOTE(review): "below" presumably refers to DataLoader in the app target,
// not to this file — confirm against the app sources.

print("Use DataLoader.buildPreloadedStore() from within the app to generate the store.")
print("Then copy the .store file to the bundle.")
|
||||
1
Conjuga/Scripts/conjuga_data.json
Normal file
1
Conjuga/Scripts/conjuga_data.json
Normal file
File diff suppressed because one or more lines are too long
160
Conjuga/Scripts/conjuga_data_debug.json
Normal file
160
Conjuga/Scripts/conjuga_data_debug.json
Normal file
@@ -0,0 +1,160 @@
|
||||
{
|
||||
"stats": {
|
||||
"verbs": 1750,
|
||||
"verbForms": 209014,
|
||||
"irregularSpans": 14078,
|
||||
"tenseGuides": 20
|
||||
},
|
||||
"sampleVerb": {
|
||||
"id": 1,
|
||||
"infinitive": "ser",
|
||||
"english": "to be",
|
||||
"rank": 1,
|
||||
"ending": "er",
|
||||
"reflexive": 0,
|
||||
"level": "basic",
|
||||
"hasConjuuData": true
|
||||
},
|
||||
"sampleForms": [
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_presente",
|
||||
"personIndex": 0,
|
||||
"form": "soy",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_presente",
|
||||
"personIndex": 1,
|
||||
"form": "eres",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_presente",
|
||||
"personIndex": 2,
|
||||
"form": "es",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_presente",
|
||||
"personIndex": 3,
|
||||
"form": "somos",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_presente",
|
||||
"personIndex": 4,
|
||||
"form": "sois",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_presente",
|
||||
"personIndex": 5,
|
||||
"form": "son",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_preterito",
|
||||
"personIndex": 0,
|
||||
"form": "fui",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_preterito",
|
||||
"personIndex": 1,
|
||||
"form": "fuiste",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_preterito",
|
||||
"personIndex": 2,
|
||||
"form": "fue",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_preterito",
|
||||
"personIndex": 3,
|
||||
"form": "fuimos",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_preterito",
|
||||
"personIndex": 4,
|
||||
"form": "fuisteis",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_preterito",
|
||||
"personIndex": 5,
|
||||
"form": "fueron",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_imperfecto",
|
||||
"personIndex": 0,
|
||||
"form": "era",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_imperfecto",
|
||||
"personIndex": 1,
|
||||
"form": "eras",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_imperfecto",
|
||||
"personIndex": 2,
|
||||
"form": "era",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_imperfecto",
|
||||
"personIndex": 3,
|
||||
"form": "éramos",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_imperfecto",
|
||||
"personIndex": 4,
|
||||
"form": "erais",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_imperfecto",
|
||||
"personIndex": 5,
|
||||
"form": "eran",
|
||||
"regularity": "irregular"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_futuro",
|
||||
"personIndex": 0,
|
||||
"form": "seré",
|
||||
"regularity": "ordinary"
|
||||
},
|
||||
{
|
||||
"verbId": 1,
|
||||
"tenseId": "ind_futuro",
|
||||
"personIndex": 1,
|
||||
"form": "serás",
|
||||
"regularity": "ordinary"
|
||||
}
|
||||
]
|
||||
}
|
||||
14420
Conjuga/Scripts/course_data.json
Normal file
14420
Conjuga/Scripts/course_data.json
Normal file
File diff suppressed because it is too large
Load Diff
7276
Conjuga/Scripts/course_examples.json
Normal file
7276
Conjuga/Scripts/course_examples.json
Normal file
File diff suppressed because it is too large
Load Diff
550
Conjuga/Scripts/merge_data.py
Normal file
550
Conjuga/Scripts/merge_data.py
Normal file
@@ -0,0 +1,550 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Merge ConjuGato + Conjuu ES data into unified JSON for Conjuga app.
|
||||
|
||||
Sources:
|
||||
- ConjuGato: 1,750 verbs (verb.md), irregular forms, spans, irregularity bitmasks
|
||||
- Conjuu ES: 621 verbs with full conjugation tables, tense guides, conjugation rules
|
||||
|
||||
Output: conjuga_data.json with all verbs, forms, spans, guides
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
import os
|
||||
import plistlib
|
||||
import subprocess
|
||||
|
||||
BASE = "/Users/treyt/Desktop/code/Spanish"
|
||||
CONJUGATO_DB = "/Applications/ConjuGato.app/WrappedBundle/Verbs.sqlite"
|
||||
CONJUU_VOCAB = "/Applications/Conjuu ES.app/Contents/Resources/Vocabulary.csv"
|
||||
CONJUU_GUIDE = "/Applications/Conjuu ES.app/Contents/Resources/en.lproj/Guide.strings"
|
||||
CONJUU_RULES = "/Applications/Conjuu ES.app/Contents/Resources/GuideTableEntries.plist"
|
||||
CONJUU_LEVELS = "/Applications/Conjuu ES.app/Contents/Resources"
|
||||
OUTPUT = os.path.join(BASE, "Conjuga", "Scripts", "conjuga_data.json")
|
||||
|
||||
# ─── Tense metadata ───
|
||||
TENSES = [
|
||||
{"id": "ind_presente", "spanish": "Indicativo Presente", "english": "Present", "mood": "Indicative", "order": 0},
|
||||
{"id": "ind_preterito", "spanish": "Indicativo Pretérito", "english": "Preterite", "mood": "Indicative", "order": 1},
|
||||
{"id": "ind_imperfecto", "spanish": "Indicativo Imperfecto", "english": "Imperfect", "mood": "Indicative", "order": 2},
|
||||
{"id": "ind_futuro", "spanish": "Indicativo Futuro", "english": "Future", "mood": "Indicative", "order": 3},
|
||||
{"id": "ind_perfecto", "spanish": "Indicativo Perfecto", "english": "Present Perfect", "mood": "Indicative", "order": 4},
|
||||
{"id": "ind_pluscuamperfecto", "spanish": "Indicativo Pluscuamperfecto", "english": "Pluperfect", "mood": "Indicative", "order": 5},
|
||||
{"id": "ind_futuro_perfecto", "spanish": "Indicativo Futuro Perfecto", "english": "Future Perfect", "mood": "Indicative", "order": 6},
|
||||
{"id": "ind_preterito_anterior", "spanish": "Indicativo Pretérito Anterior", "english": "Preterite Perfect", "mood": "Indicative", "order": 7},
|
||||
{"id": "cond_presente", "spanish": "Condicional Presente", "english": "Conditional", "mood": "Conditional", "order": 8},
|
||||
{"id": "cond_perfecto", "spanish": "Condicional Perfecto", "english": "Conditional Perfect", "mood": "Conditional", "order": 9},
|
||||
{"id": "subj_presente", "spanish": "Subjuntivo Presente", "english": "Present Subjunctive", "mood": "Subjunctive", "order": 10},
|
||||
{"id": "subj_imperfecto_1", "spanish": "Subjuntivo Imperfecto I", "english": "Past Subjunctive (ra)", "mood": "Subjunctive", "order": 11},
|
||||
{"id": "subj_imperfecto_2", "spanish": "Subjuntivo Imperfecto II", "english": "Past Subjunctive (se)", "mood": "Subjunctive", "order": 12},
|
||||
{"id": "subj_perfecto", "spanish": "Subjuntivo Perfecto", "english": "Subjunctive Perfect", "mood": "Subjunctive", "order": 13},
|
||||
{"id": "subj_pluscuamperfecto_1", "spanish": "Subjuntivo Pluscuamperfecto I", "english": "Subjunctive Pluperfect (ra)", "mood": "Subjunctive", "order": 14},
|
||||
{"id": "subj_pluscuamperfecto_2", "spanish": "Subjuntivo Pluscuamperfecto II", "english": "Subjunctive Pluperfect (se)", "mood": "Subjunctive", "order": 15},
|
||||
{"id": "subj_futuro", "spanish": "Subjuntivo Futuro", "english": "Subjunctive Future", "mood": "Subjunctive", "order": 16},
|
||||
{"id": "subj_futuro_perfecto", "spanish": "Subjuntivo Futuro Perfecto", "english": "Subjunctive Future Perfect", "mood": "Subjunctive", "order": 17},
|
||||
{"id": "imp_afirmativo", "spanish": "Imperativo Afirmativo", "english": "Imperative", "mood": "Imperative", "order": 18},
|
||||
{"id": "imp_negativo", "spanish": "Imperativo Negativo", "english": "Negative Imperative", "mood": "Imperative", "order": 19},
|
||||
]
|
||||
|
||||
TENSE_LOOKUP = {}
|
||||
for t in TENSES:
|
||||
TENSE_LOOKUP[t["spanish"]] = t["id"]
|
||||
|
||||
PERSONS = ["yo", "tú", "él/ella/Ud.", "nosotros", "vosotros", "ellos/ellas/Uds."]
|
||||
|
||||
ENDINGS = {
|
||||
"ar": {
|
||||
"ind_presente": ["o", "as", "a", "amos", "áis", "an"],
|
||||
"ind_preterito": ["é", "aste", "ó", "amos", "asteis", "aron"],
|
||||
"ind_imperfecto": ["aba", "abas", "aba", "ábamos", "abais", "aban"],
|
||||
"ind_futuro": ["aré", "arás", "ará", "aremos", "aréis", "arán"],
|
||||
"cond_presente": ["aría", "arías", "aría", "aríamos", "aríais", "arían"],
|
||||
"subj_presente": ["e", "es", "e", "emos", "éis", "en"],
|
||||
"subj_imperfecto_1": ["ara", "aras", "ara", "áramos", "arais", "aran"],
|
||||
"subj_imperfecto_2": ["ase", "ases", "ase", "ásemos", "aseis", "asen"],
|
||||
"subj_futuro": ["are", "ares", "are", "áremos", "areis", "aren"],
|
||||
"imp_afirmativo": ["", "a", "e", "emos", "ad", "en"],
|
||||
"imp_negativo": ["", "es", "e", "emos", "éis", "en"],
|
||||
},
|
||||
"er": {
|
||||
"ind_presente": ["o", "es", "e", "emos", "éis", "en"],
|
||||
"ind_preterito": ["í", "iste", "ió", "imos", "isteis", "ieron"],
|
||||
"ind_imperfecto": ["ía", "ías", "ía", "íamos", "íais", "ían"],
|
||||
"ind_futuro": ["eré", "erás", "erá", "eremos", "eréis", "erán"],
|
||||
"cond_presente": ["ería", "erías", "ería", "eríamos", "eríais", "erían"],
|
||||
"subj_presente": ["a", "as", "a", "amos", "áis", "an"],
|
||||
"subj_imperfecto_1": ["iera", "ieras", "iera", "iéramos", "ierais", "ieran"],
|
||||
"subj_imperfecto_2": ["iese", "ieses", "iese", "iésemos", "ieseis", "iesen"],
|
||||
"subj_futuro": ["iere", "ieres", "iere", "iéremos", "iereis", "ieren"],
|
||||
"imp_afirmativo": ["", "e", "a", "amos", "ed", "an"],
|
||||
"imp_negativo": ["", "as", "a", "amos", "áis", "an"],
|
||||
},
|
||||
"ir": {
|
||||
"ind_presente": ["o", "es", "e", "imos", "ís", "en"],
|
||||
"ind_preterito": ["í", "iste", "ió", "imos", "isteis", "ieron"],
|
||||
"ind_imperfecto": ["ía", "ías", "ía", "íamos", "íais", "ían"],
|
||||
"ind_futuro": ["iré", "irás", "irá", "iremos", "iréis", "irán"],
|
||||
"cond_presente": ["iría", "irías", "iría", "iríamos", "iríais", "irían"],
|
||||
"subj_presente": ["a", "as", "a", "amos", "áis", "an"],
|
||||
"subj_imperfecto_1": ["iera", "ieras", "iera", "iéramos", "ierais", "ieran"],
|
||||
"subj_imperfecto_2": ["iese", "ieses", "iese", "iésemos", "ieseis", "iesen"],
|
||||
"subj_futuro": ["iere", "ieres", "iere", "iéremos", "iereis", "ieren"],
|
||||
"imp_afirmativo": ["", "e", "a", "amos", "id", "an"],
|
||||
"imp_negativo": ["", "as", "a", "amos", "áis", "an"],
|
||||
},
|
||||
}
|
||||
|
||||
# Compound tenses: auxiliary haber forms
|
||||
HABER = {
|
||||
"ind_perfecto": ["he", "has", "ha", "hemos", "habéis", "han"],
|
||||
"ind_pluscuamperfecto": ["había", "habías", "había", "habíamos", "habíais", "habían"],
|
||||
"ind_futuro_perfecto": ["habré", "habrás", "habrá", "habremos", "habréis", "habrán"],
|
||||
"ind_preterito_anterior": ["hube", "hubiste", "hubo", "hubimos", "hubisteis", "hubieron"],
|
||||
"cond_perfecto": ["habría", "habrías", "habría", "habríamos", "habríais", "habrían"],
|
||||
"subj_perfecto": ["haya", "hayas", "haya", "hayamos", "hayáis", "hayan"],
|
||||
"subj_pluscuamperfecto_1": ["hubiera", "hubieras", "hubiera", "hubiéramos", "hubierais", "hubieran"],
|
||||
"subj_pluscuamperfecto_2": ["hubiese", "hubieses", "hubiese", "hubiésemos", "hubieseis", "hubiesen"],
|
||||
"subj_futuro_perfecto": ["hubiere", "hubieres", "hubiere", "hubiéremos", "hubiereis", "hubieren"],
|
||||
}
|
||||
|
||||
def get_ending_type(infinitive):
    """Classify *infinitive* as an "ar", "er", or "ir" verb.

    Reflexive forms ("-arse"/"-erse"/"-irse") are reduced to their plain
    infinitive first; anything unrecognized falls back to "ar".
    """
    lowered = infinitive.lower()
    core = lowered[:-2] if lowered.endswith(("arse", "erse", "irse")) else lowered
    for suffixes, kind in ((("ar",), "ar"), (("er",), "er"), (("ir", "ír"), "ir")):
        if core.endswith(suffixes):
            return kind
    return "ar"
|
||||
|
||||
def get_stem(infinitive, ending_type):
    """Return the verb stem: the lowercased infinitive minus its 2-letter ending.

    A trailing reflexive "se" is removed first.  Note that every branch of
    the original ending_type classification dropped the final two
    characters, so ending_type does not affect the result; it is kept for
    signature compatibility with the other conjugation helpers.
    """
    base = infinitive.lower()
    if base.endswith("se"):
        base = base[:-2]
    return base[:-2]
|
||||
|
||||
def get_participle(infinitive, ending_type):
    """Return the regular past participle: stem + "ado" for -ar verbs, "ido" otherwise."""
    suffix = "ado" if ending_type == "ar" else "ido"
    return get_stem(infinitive, ending_type) + suffix
|
||||
|
||||
def conjugate_regular(infinitive, tense_id, ending_type):
    """Conjugate *infinitive* regularly for *tense_id*.

    Returns six forms (yo, tú, él/ella, nosotros, vosotros, ellos), or six
    empty strings for tenses with no regular pattern in ENDINGS.

    Bug fix: the old future/conditional branch returned
    ``infinitive.rstrip("se") + ending.lstrip(...)``, which yields forms
    like "hablarré" (future) and "hablararía" (conditional); the corrected
    stem-based code placed after that return was unreachable.  The ENDINGS
    table already stores full stem-based endings for these tenses
    ("aré", "ería", ...), so no special case is needed at all — the
    generic stem + ending path produces e.g. "habl" + "aré" == "hablaré".
    """
    # Compound tenses: conjugated "haber" + invariant past participle.
    if tense_id in HABER:
        participle = get_participle(infinitive, ending_type)
        return [f"{aux} {participle}" for aux in HABER[tense_id]]

    # Simple tenses (including future/conditional): stem + table ending.
    if tense_id in ENDINGS[ending_type]:
        stem = get_stem(infinitive, ending_type)
        return [stem + ending for ending in ENDINGS[ending_type][tense_id]]

    # Tense not covered by the regular tables for this conjugation class.
    return [""] * 6
|
||||
|
||||
def conjugate_future_cond(infinitive, tense_id, ending_type):
    """Conjugate a regular future or conditional form.

    Future/conditional attach person endings to the full infinitive
    (minus any reflexive "se").  Returns a six-element list for
    "ind_futuro" / "cond_presente", or None for any other tense.
    The ending_type parameter is unused; it is kept for signature parity
    with conjugate_regular().
    """
    person_endings = {
        "ind_futuro": ["é", "ás", "á", "emos", "éis", "án"],
        "cond_presente": ["ía", "ías", "ía", "íamos", "íais", "ían"],
    }
    endings = person_endings.get(tense_id)
    if endings is None:
        return None
    stem = infinitive.lower()
    if stem.endswith("se"):
        stem = stem[:-2]
    return [stem + suffix for suffix in endings]
|
||||
|
||||
|
||||
# ─── Step 1: Load ConjuGato verbs ───
|
||||
print("Loading ConjuGato data...")
|
||||
conn = sqlite3.connect(CONJUGATO_DB)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Verbs
|
||||
cursor.execute("SELECT Id, Rank, Ending, Reflexive, Spanish, English FROM Verb ORDER BY Rank")
|
||||
conjugato_verbs = {}
|
||||
for row in cursor.fetchall():
|
||||
vid, rank, ending, reflexive, spanish, english = row
|
||||
ending_map = {1: "ar", 2: "er", 4: "ir"}
|
||||
conjugato_verbs[vid] = {
|
||||
"id": vid,
|
||||
"rank": rank,
|
||||
"ending": ending_map.get(ending, "ar"),
|
||||
"reflexive": reflexive,
|
||||
"infinitive": spanish,
|
||||
"english": english,
|
||||
}
|
||||
|
||||
# Irregular verb forms
|
||||
cursor.execute("SELECT VerbFormId, Form FROM IrregularVerbForm ORDER BY VerbFormId")
|
||||
irregular_forms = {}
|
||||
for vfid, form in cursor.fetchall():
|
||||
irregular_forms[vfid] = form
|
||||
|
||||
# Irregular spans
|
||||
cursor.execute("SELECT Id, VerbFormId, Type, Pattern, Start, End FROM IrregularSpan ORDER BY Id")
|
||||
irregular_spans = []
|
||||
for sid, vfid, stype, pattern, start, end in cursor.fetchall():
|
||||
irregular_spans.append({
|
||||
"verbFormId": vfid,
|
||||
"type": stype,
|
||||
"pattern": pattern,
|
||||
"start": start,
|
||||
"end": end,
|
||||
})
|
||||
|
||||
# Irregularity bitmasks
|
||||
cursor.execute("SELECT * FROM Irregularity ORDER BY VerbId")
|
||||
irregularity_cols = [d[0] for d in cursor.description]
|
||||
irregularity_data = {}
|
||||
for row in cursor.fetchall():
|
||||
verb_id = row[0]
|
||||
irregularity_data[verb_id] = dict(zip(irregularity_cols[1:], row[1:]))
|
||||
|
||||
conn.close()
|
||||
print(f" {len(conjugato_verbs)} verbs, {len(irregular_forms)} irregular forms, {len(irregular_spans)} spans")
|
||||
|
||||
# ─── Step 2: Load Conjuu ES conjugations ───
|
||||
print("Loading Conjuu ES data...")
|
||||
conjuu_verbs = {}
|
||||
with open(CONJUU_VOCAB, 'r') as f:
|
||||
for row in csv.reader(f):
|
||||
verb_name = row[0]
|
||||
tense_spanish = row[2]
|
||||
tense_id = TENSE_LOOKUP.get(tense_spanish)
|
||||
if not tense_id:
|
||||
continue
|
||||
regularity = row[1]
|
||||
forms = row[3:9] # yo, tú, él, nosotros, vosotros, ellos
|
||||
english = row[9]
|
||||
rank = int(row[13]) if row[13] else 99999
|
||||
|
||||
key = verb_name.lower()
|
||||
if key not in conjuu_verbs:
|
||||
conjuu_verbs[key] = {
|
||||
"infinitive": verb_name,
|
||||
"english": english,
|
||||
"rank": rank,
|
||||
"tenses": {},
|
||||
}
|
||||
conjuu_verbs[key]["tenses"][tense_id] = {
|
||||
"regularity": regularity,
|
||||
"forms": forms,
|
||||
}
|
||||
|
||||
print(f" {len(conjuu_verbs)} verbs with conjugations")
|
||||
|
||||
# ─── Step 3: Load tense guides ───
|
||||
print("Loading tense guides...")
|
||||
result = subprocess.run(['plutil', '-convert', 'xml1', '-o', '-', CONJUU_GUIDE], capture_output=True)
|
||||
guide_data = plistlib.loads(result.stdout)
|
||||
|
||||
tense_guides = {}
|
||||
for key, value in guide_data.items():
|
||||
m = re.match(r'LL(.+)Guide(Top|Bottom)', key)
|
||||
if m:
|
||||
tense_name = m.group(1)
|
||||
part = m.group(2)
|
||||
if tense_name not in tense_guides:
|
||||
tense_guides[tense_name] = {}
|
||||
tense_guides[tense_name][part] = value
|
||||
|
||||
guides_output = []
|
||||
for t in TENSES:
|
||||
guide_key = t["spanish"].replace("Indicativo ", "").replace("Condicional ", "").replace("Subjuntivo ", "").replace("Imperativo ", "")
|
||||
# Try exact match first, then various key patterns
|
||||
guide = None
|
||||
for gk, gv in tense_guides.items():
|
||||
if gk == guide_key or gk == t["spanish"] or gk.replace(" ", "") == guide_key.replace(" ", ""):
|
||||
guide = gv
|
||||
break
|
||||
if not guide:
|
||||
# Try partial match
|
||||
for gk, gv in tense_guides.items():
|
||||
if guide_key.lower() in gk.lower() or gk.lower() in guide_key.lower():
|
||||
guide = gv
|
||||
break
|
||||
|
||||
guides_output.append({
|
||||
"tenseId": t["id"],
|
||||
"title": guide.get("Top", t["english"]) if guide else t["english"],
|
||||
"body": guide.get("Bottom", "") if guide else "",
|
||||
})
|
||||
|
||||
print(f" {len(guides_output)} tense guides")
|
||||
|
||||
# ─── Step 4: Load difficulty levels ───
|
||||
print("Loading difficulty levels...")
|
||||
level_files = [
|
||||
("basic", "Basic.csv"),
|
||||
("elementary_1", "Elementary-1.csv"),
|
||||
("elementary_2", "Elementary-2.csv"),
|
||||
("elementary_3", "Elementary-3.csv"),
|
||||
("intermediate_1", "Intermediate-1.csv"),
|
||||
("intermediate_2", "Intermediate-2.csv"),
|
||||
("intermediate_3", "Intermediate-3.csv"),
|
||||
("intermediate_4", "Intermediate-4.csv"),
|
||||
]
|
||||
|
||||
level_verbs = {}
|
||||
for level_id, filename in level_files:
|
||||
path = os.path.join(CONJUU_LEVELS, filename)
|
||||
with open(path, 'r') as f:
|
||||
for row in csv.reader(f):
|
||||
level_verbs[row[0].lower()] = level_id
|
||||
|
||||
print(f" {len(level_verbs)} verbs with curated levels")
|
||||
|
||||
# ─── Step 5: Merge everything ───
|
||||
print("Merging data...")
|
||||
|
||||
# Map ConjuGato VerbFormId encoding
# VerbFormId = (1000 + VerbId) * 10000 + MTPP
# M: 1=Indicative, 2=Subjunctive, 3=Imperative
# T: tense within mood
# PP: person (01-08)
#
# Maps the (mood digit, tense digit) pair to our canonical tense id.
# NOTE(review): decode_verb_form_id handles mood 3 (imperative) directly
# without consulting this table, so the (3, 0) entry appears unused there.
CONJUGATO_TENSE_MAP = {
    # (mood, tense) -> tense_id
    (1, 1): "ind_presente",
    (1, 2): "ind_preterito",
    (1, 3): "ind_imperfecto",
    (1, 6): "cond_presente",
    (1, 7): "ind_futuro",
    (2, 1): "subj_presente",
    (2, 3): "subj_imperfecto_1",
    (2, 4): "subj_imperfecto_2",
    (2, 7): "subj_futuro",
    (3, 0): "imp_afirmativo",  # person-specific
}
|
||||
|
||||
def decode_verb_form_id(vfid):
    """Decode a ConjuGato VerbFormId into (verb_id, tense_id, person_index).

    Encoding (see CONJUGATO_TENSE_MAP comment and the vfid builders in the
    merge loop): VerbFormId = (1000 + VerbId) * 10000 + M*1000 + T*100 + PP,
    where M is the mood digit (1=Indicative, 2=Subjunctive, 3=Imperative),
    T the tense within the mood, and PP the 1-based person (01-08).

    Bug fix: the two person digits can only hold 00-99, so the previous
    ``person >= 800`` test for the negative imperative could never fire and
    every imperative decoded as affirmative.  Negative imperatives are
    encoded with tense digit 8 (vfid = base + 3800 + person, see the
    imp_negativo builder), so mood 3 now distinguishes on the tense digit.

    Returns (None, None, None) for ids that are not 8 digits long.
    person_index is None for the vos/voseo persons (7-8) and anything out
    of the 1-6 range; tense_id is None for unmapped (mood, tense) pairs.
    """
    s = str(vfid)
    if len(s) != 8:
        return None, None, None
    verb_id = int(s[:4]) - 1000
    mood = int(s[4])
    tense_num = int(s[5])
    person = int(s[6:8])

    # Handle imperative: tense digit 8 marks the negative form (…+3800+PP).
    if mood == 3:
        tense_id = "imp_negativo" if tense_num == 8 else "imp_afirmativo"
    else:
        tense_id = CONJUGATO_TENSE_MAP.get((mood, tense_num))

    if 1 <= person <= 6:
        person_idx = person - 1
    else:
        # Persons 7-8 are vos/voseo variants — skipped for now.
        person_idx = None

    return verb_id, tense_id, person_idx
|
||||
|
||||
|
||||
def assign_level(rank):
    """Map a frequency rank to a difficulty tier.

    Used as the fallback when a verb has no curated level from the
    Conjuu ES level CSVs.  Lower rank == more frequent == easier tier.
    """
    tiers = (
        (25, "basic"),
        (100, "elementary"),
        (300, "intermediate"),
        (700, "advanced"),
    )
    for upper_bound, level in tiers:
        if rank <= upper_bound:
            return level
    return "expert"
|
||||
|
||||
|
||||
# Build unified verb list
|
||||
all_verbs = []
|
||||
verb_forms = []
|
||||
spans_output = []
|
||||
|
||||
for vid, cv in sorted(conjugato_verbs.items(), key=lambda x: x[1]["rank"]):
|
||||
infinitive = cv["infinitive"]
|
||||
inf_lower = infinitive.lower()
|
||||
ending = cv["ending"]
|
||||
rank = cv["rank"]
|
||||
|
||||
# Check Conjuu ES for this verb
|
||||
conjuu = conjuu_verbs.get(inf_lower)
|
||||
|
||||
# Determine level
|
||||
level = level_verbs.get(inf_lower, assign_level(rank))
|
||||
|
||||
verb_entry = {
|
||||
"id": vid,
|
||||
"infinitive": infinitive,
|
||||
"english": cv["english"],
|
||||
"rank": rank,
|
||||
"ending": ending,
|
||||
"reflexive": cv["reflexive"],
|
||||
"level": level,
|
||||
"hasConjuuData": conjuu is not None,
|
||||
}
|
||||
all_verbs.append(verb_entry)
|
||||
|
||||
# Generate forms for each tense
|
||||
for tense in TENSES:
|
||||
tid = tense["id"]
|
||||
|
||||
if conjuu and tid in conjuu["tenses"]:
|
||||
# Use Conjuu ES data (pre-computed)
|
||||
td = conjuu["tenses"][tid]
|
||||
forms = td["forms"]
|
||||
regularity = td["regularity"]
|
||||
else:
|
||||
# Generate from rules or ConjuGato irregular forms
|
||||
regularity = "ordinary"
|
||||
|
||||
# Check if we have irregular forms from ConjuGato
|
||||
has_irregular = vid in irregularity_data
|
||||
|
||||
if tid in HABER:
|
||||
# Compound tense
|
||||
participle = get_participle(infinitive, ending)
|
||||
# Check for irregular participle from ConjuGato
|
||||
forms = [f"{aux} {participle}" for aux in HABER[tid]]
|
||||
regularity = "ordinary"
|
||||
elif tid in ("ind_futuro", "cond_presente"):
|
||||
# Future/conditional use full infinitive as stem
|
||||
base = infinitive.lower()
|
||||
if base.endswith("se"):
|
||||
base = base[:-2]
|
||||
endings_map = {
|
||||
"ind_futuro": ["é", "ás", "á", "emos", "éis", "án"],
|
||||
"cond_presente": ["ía", "ías", "ía", "íamos", "íais", "ían"],
|
||||
}
|
||||
forms = [base + e for e in endings_map[tid]]
|
||||
# Check for irregular future/conditional stems from ConjuGato
|
||||
if has_irregular:
|
||||
# Try to find irregular forms
|
||||
for pi in range(6):
|
||||
mood_tense = (1, 7) if tid == "ind_futuro" else (1, 6)
|
||||
vfid = (1000 + vid) * 10000 + mood_tense[0] * 1000 + mood_tense[1] * 100 + (pi + 1)
|
||||
if vfid in irregular_forms:
|
||||
forms[pi] = irregular_forms[vfid]
|
||||
regularity = "irregular"
|
||||
else:
|
||||
# Simple tense
|
||||
stem = get_stem(infinitive, ending)
|
||||
if tid in ENDINGS.get(ending, {}):
|
||||
forms = [stem + e for e in ENDINGS[ending][tid]]
|
||||
else:
|
||||
forms = [""] * 6
|
||||
|
||||
# Override with ConjuGato irregular forms
|
||||
if has_irregular:
|
||||
mood_map = {
|
||||
"ind_presente": (1, 1), "ind_preterito": (1, 2),
|
||||
"ind_imperfecto": (1, 3),
|
||||
"subj_presente": (2, 1), "subj_imperfecto_1": (2, 3),
|
||||
"subj_imperfecto_2": (2, 4), "subj_futuro": (2, 7),
|
||||
}
|
||||
if tid in mood_map:
|
||||
mt = mood_map[tid]
|
||||
for pi in range(6):
|
||||
vfid = (1000 + vid) * 10000 + mt[0] * 1000 + mt[1] * 100 + (pi + 1)
|
||||
if vfid in irregular_forms:
|
||||
forms[pi] = irregular_forms[vfid]
|
||||
regularity = "irregular"
|
||||
elif tid == "imp_afirmativo":
|
||||
for pi in range(6):
|
||||
vfid = (1000 + vid) * 10000 + 3000 + (pi + 1)
|
||||
if vfid in irregular_forms:
|
||||
forms[pi] = irregular_forms[vfid]
|
||||
regularity = "irregular"
|
||||
elif tid == "imp_negativo":
|
||||
for pi in range(6):
|
||||
vfid = (1000 + vid) * 10000 + 3800 + (pi + 1)
|
||||
if vfid in irregular_forms:
|
||||
forms[pi] = irregular_forms[vfid]
|
||||
regularity = "irregular"
|
||||
|
||||
for pi, form in enumerate(forms):
|
||||
if form:
|
||||
verb_forms.append({
|
||||
"verbId": vid,
|
||||
"tenseId": tid,
|
||||
"personIndex": pi,
|
||||
"form": form,
|
||||
"regularity": regularity,
|
||||
})
|
||||
|
||||
# Build spans referencing verb forms
|
||||
print("Processing irregular spans...")
|
||||
for span in irregular_spans:
|
||||
vfid = span["verbFormId"]
|
||||
verb_id, tense_id, person_idx = decode_verb_form_id(vfid)
|
||||
if verb_id is None or tense_id is None or person_idx is None:
|
||||
continue
|
||||
if verb_id not in conjugato_verbs:
|
||||
continue
|
||||
spans_output.append({
|
||||
"verbId": verb_id,
|
||||
"tenseId": tense_id,
|
||||
"personIndex": person_idx,
|
||||
"type": span["type"],
|
||||
"pattern": span["pattern"],
|
||||
"start": span["start"],
|
||||
"end": span["end"],
|
||||
})
|
||||
|
||||
# ─── Step 6: Output ───
|
||||
print("Writing output...")
|
||||
output = {
|
||||
"tenses": TENSES,
|
||||
"persons": PERSONS,
|
||||
"verbs": all_verbs,
|
||||
"verbForms": verb_forms,
|
||||
"irregularSpans": spans_output,
|
||||
"tenseGuides": guides_output,
|
||||
}
|
||||
|
||||
with open(OUTPUT, 'w', encoding='utf-8') as f:
|
||||
json.dump(output, f, ensure_ascii=False, indent=None)
|
||||
|
||||
# Also write a pretty version for debugging
|
||||
with open(OUTPUT.replace('.json', '_debug.json'), 'w', encoding='utf-8') as f:
|
||||
json.dump({
|
||||
"stats": {
|
||||
"verbs": len(all_verbs),
|
||||
"verbForms": len(verb_forms),
|
||||
"irregularSpans": len(spans_output),
|
||||
"tenseGuides": len(guides_output),
|
||||
},
|
||||
"sampleVerb": all_verbs[0] if all_verbs else None,
|
||||
"sampleForms": verb_forms[:20],
|
||||
}, f, ensure_ascii=False, indent=2)
|
||||
|
||||
file_size = os.path.getsize(OUTPUT) / (1024 * 1024)
|
||||
print(f"\nDone!")
|
||||
print(f" Verbs: {len(all_verbs)}")
|
||||
print(f" Verb forms: {len(verb_forms)}")
|
||||
print(f" Irregular spans: {len(spans_output)}")
|
||||
print(f" Tense guides: {len(guides_output)}")
|
||||
print(f" Output: {OUTPUT} ({file_size:.1f} MB)")
|
||||
453
Conjuga/Scripts/scrape_all_courses.py
Normal file
453
Conjuga/Scripts/scrape_all_courses.py
Normal file
@@ -0,0 +1,453 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Scrape 7 LanGo Spanish course packs from Brainscape, plus example sentences
|
||||
from SpanishDict. Outputs all_courses_data.json with all courses, decks, cards,
|
||||
and examples organized by week.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
BASE_URL = "https://www.brainscape.com"
|
||||
OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/all_courses_data.json"
|
||||
MAX_EXAMPLES = 3
|
||||
|
||||
PACK_URLS = [
|
||||
"https://www.brainscape.com/packs/lango-spanish-beginner-ii-16514996",
|
||||
"https://www.brainscape.com/packs/lango-spanish-beginner-iii-conversation-18477688",
|
||||
"https://www.brainscape.com/packs/lango-spanish-intermediate-i-21508666",
|
||||
"https://www.brainscape.com/packs/lango-spanish-intermediate-ii-21906841",
|
||||
"https://www.brainscape.com/packs/lango-spanish-intermediate-iii-spanish-through-stories-20677744",
|
||||
"https://www.brainscape.com/packs/lango-spanish-advanced-i-21511244",
|
||||
"https://www.brainscape.com/packs/lango-spanish-advanced-ii-21649461",
|
||||
]
|
||||
|
||||
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parsing helpers (copied from scrape_brainscape.py and scrape_examples.py)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def parse_title_and_week(text):
    """Split a deck heading like "Week 3: Greetings" into (week, title).

    Accepts English "Week" or Spanish "Semana" (case-insensitive), with a
    colon or whitespace after the number.  Text without a recognizable
    week prefix comes back as (0, stripped text).
    """
    match = re.match(r'(?:Week|Semana)\s+(\d+)[:\s]+(.+)', text, re.IGNORECASE)
    if match is None:
        return 0, text.strip()
    week_number, title = match.groups()
    return int(week_number), title.strip()
|
||||
|
||||
|
||||
def parse_cards(text):
    """Parse flashcard Q/A pairs from page text.

    Brainscape renders each card as a running card number followed by the
    question and answer text.  The parser walks the dump line by line: a
    digits-only line starts a card; subsequent non-boilerplate lines are
    collected until the next sequential card number or a section boundary,
    and the first two collected lines become the card's front/back.
    """
    cards = []
    lines = text.split('\n')

    # Site chrome / boilerplate lines that never belong to card content.
    skip = {'Q', 'A', 'Study These Flashcards', '', 'Brainscape', 'Find Flashcards',
            'Make Flashcards', 'How It Works', 'Educators', 'Businesses', 'Academy',
            'Log in', 'Get Started'}

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        if re.match(r'^\d+$', line):
            # Card number found: gather up to 6 content lines for this card.
            num = int(line)
            parts = []
            j = i + 1
            while j < len(lines) and len(parts) < 6:
                nextline = lines[j].strip()

                # The next sequential card number ends this card.
                if re.match(r'^\d+$', nextline) and int(nextline) == num + 1:
                    break
                # Deck listings / section headings also end the card.
                if nextline.startswith('LanGo Spanish') or nextline.startswith('Decks in class'):
                    break
                if re.match(r'^(?:Week|Semana) \d+', nextline):
                    break
                if nextline in skip:
                    j += 1
                    continue

                parts.append(nextline)
                j += 1

            # First collected line is the front (question), second the back.
            if len(parts) >= 2:
                cards.append({
                    "front": parts[0],
                    "back": parts[1],
                })
            i = j
        else:
            i += 1

    # Drop pseudo-cards picked up from page navigation / deck headings.
    cards = [c for c in cards if not re.match(r'^(?:Week|Semana) \d+', c['front'])
             and c['front'] not in ('Decks in class (39)', '# Cards')
             and not c['front'].startswith('LanGo Spanish')
             and not c['front'].startswith('You may prefer')]
    return cards
|
||||
|
||||
|
||||
def extract_word_for_lookup(front):
    """Reduce a flashcard front to a single lowercase word for dictionary lookup.

    Strips leading Spanish articles (including "el/la"-style pairs) and
    keeps only the text before the first comma and before the first slash.
    """
    word = front.strip()
    for article_pattern in (r'^(el|la|los|las|un|una)\s+', r'^(el/la|los/las)\s+'):
        word = re.sub(article_pattern, '', word, flags=re.IGNORECASE)
    for separator in (',', '/'):
        if separator in word:
            word = word.split(separator, 1)[0].strip()
    return word.lower().strip()
|
||||
|
||||
|
||||
def parse_examples(text, lookup_word):
    """Parse example sentences from SpanishDict page text.

    Returns up to MAX_EXAMPLES dicts of the form {"es": ..., "en": ...}.
    Two page layouts are recognized:
      1. Spanish and English fused on one line with no space between the
         Spanish sentence's final period and the English capital
         (e.g. "Esta tienda es barata.This store is cheap.").
      2. A standalone Spanish line containing the word, followed within
         two lines by an English-looking line.
    """
    examples = []
    lines = text.split('\n')

    for i, line in enumerate(lines):
        l = line.strip()
        # Very short lines cannot hold a full sentence pair.
        if not l or len(l) < 15:
            continue

        # Layout 1: "Spanish sentence.English sentence" fused on one line.
        inline_match = re.match(r'^(.+?[.!?])([A-Z].+)$', l)
        if inline_match:
            es = inline_match.group(1).strip()
            en = inline_match.group(2).strip()
            # Only keep pairs whose Spanish half actually contains the word.
            if lookup_word.lower() in es.lower() and len(es) > 10 and len(en) > 5:
                examples.append({"es": es, "en": en})
                if len(examples) >= MAX_EXAMPLES:
                    break
            continue

        # Layout 2: Spanish line with the word; English on a following line.
        if lookup_word.lower() in l.lower() and len(l) > 15 and len(l) < 300:
            for j in range(i + 1, min(i + 3, len(lines))):
                next_l = lines[j].strip()
                if not next_l:
                    continue
                # Heuristic: an English line starts with a capital and has no
                # Spanish-only characters — TODO confirm against live pages.
                if (next_l[0].isupper() and
                        not any(c in next_l for c in ['á', 'é', 'í', 'ó', 'ú', 'ñ', '¿', '¡'])):
                    examples.append({"es": l, "en": next_l})
                    if len(examples) >= MAX_EXAMPLES:
                        break
                    break

        if len(examples) >= MAX_EXAMPLES:
            break

    return examples
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scraping logic
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def discover_deck_urls(page, pack_url):
    """Visit a pack page and discover all deck URLs within it.

    Returns a (course_name, deck_urls) tuple. Deck URLs are normalized to
    site-relative paths; the course name is parsed from the page body,
    falling back to a title derived from the URL slug.
    (Fix: removed an unused `pack_id` local and a dead `course_name = None`
    initialization.)
    """
    print(f"\nDiscovering decks in {pack_url}...")
    await page.goto(pack_url, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)

    # Scroll to load all content
    for _ in range(10):
        await page.evaluate("window.scrollBy(0, 1000)")
        await page.wait_for_timeout(300)

    # Find all deck links matching /flashcards/*/packs/*
    links = await page.eval_on_selector_all(
        'a[href*="/flashcards/"]',
        'els => els.map(e => e.getAttribute("href"))'
    )

    deck_urls = []
    seen = set()
    for href in links:
        if href and '/flashcards/' in href and '/packs/' in href:
            # Normalize absolute URLs down to site-relative paths.
            if href.startswith('http'):
                href = href.replace(BASE_URL, '')
            if href not in seen:
                seen.add(href)
                deck_urls.append(href)

    # Extract course name from the page.
    text = await page.inner_text("body")
    # Try to find "LanGo Spanish | ..." pattern
    m = re.search(r'(LanGo Spanish\s*\|\s*[^>\n]+)', text)
    if m:
        course_name = m.group(1).strip()
        # Clean trailing breadcrumb noise and a "Flashcards" suffix.
        course_name = re.sub(r'\s*>\s*$', '', course_name).strip()
        course_name = re.sub(r'\s*Flashcards\s*$', '', course_name).strip()
    else:
        # Fallback: derive a readable name from the URL slug.
        slug = pack_url.rstrip('/').split('/')[-1]
        slug = re.sub(r'-\d+$', '', slug)  # drop trailing numeric pack id
        course_name = slug.replace('-', ' ').title()

    print(f" Course: {course_name}")
    print(f" Found {len(deck_urls)} deck URLs")
    return course_name, deck_urls
|
||||
|
||||
|
||||
async def scrape_deck(page, url):
    """Scrape a single deck page for flashcard data.

    Returns a dict with the week number, cleaned title, reversed-deck
    flag, parsed cards, card count, and the source URL.
    """
    full_url = BASE_URL + url if url.startswith('/') else url
    await page.goto(full_url, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)

    # Scroll so lazily-loaded cards render into the DOM.
    for _ in range(5):
        await page.evaluate("window.scrollBy(0, 1000)")
        await page.wait_for_timeout(300)

    text = await page.inner_text("body")

    # Extract title — handle both "Week N:" and "Semana N" patterns.
    # Breadcrumb form: "... > Week N: Title > Flashcards"
    title_match = re.search(r'>\s*((?:Week|Semana)\s+\d+[:\s].+?)\s*>\s*Flashcards', text)
    if title_match:
        raw_title = title_match.group(1).strip()
    else:
        # Heading form without breadcrumbs: "Week N: Title ... Flashcards"
        heading_match = re.search(r'((?:Week|Semana)\s+\d+[:\s].+?)\s*Flashcards', text)
        if heading_match:
            raw_title = heading_match.group(1).strip()
        else:
            # Last resort: reconstruct a title from the URL slug.
            slug = url.split('/')[2] if len(url.split('/')) > 2 else url
            slug_clean = re.sub(r'-\d+$', '', slug)  # drop trailing numeric deck id
            slug_clean = re.sub(r'-al-rev(e|é)s$', ' AL REVÉS', slug_clean)
            raw_title = slug_clean.replace('-', ' ').title()
            # Slugs with no week number get a placeholder week 0 prefix.
            wm = re.match(r'Week\s+(\d+)', raw_title, re.IGNORECASE)
            if not wm:
                raw_title = "Week 0: " + raw_title

    week, title = parse_title_and_week(raw_title)
    cards = parse_cards(text)
    # "AL REVÉS" decks are the reversed (answer -> question) variants.
    is_reversed = "al rev" in url.lower() or "AL REVÉS" in raw_title.upper()

    return {
        "week": week,
        "title": title,
        "isReversed": is_reversed,
        "cardCount": len(cards),
        "cards": cards,
        "url": url,
    }
|
||||
|
||||
|
||||
async def scrape_examples_for_word(page, lookup):
    """Fetch the SpanishDict translate page for *lookup* and parse examples.

    Best-effort: any navigation or parsing failure yields an empty list so
    a single bad word never aborts the whole scrape.
    """
    translate_url = f"https://www.spanishdict.com/translate/{lookup}"
    try:
        await page.goto(translate_url, wait_until="domcontentloaded", timeout=15000)
        await page.wait_for_timeout(2000)
        body_text = await page.inner_text("body")
        return parse_examples(body_text, lookup)
    except Exception:
        return []
|
||||
|
||||
|
||||
def save_progress(data):
    """Serialize *data* to OUTPUT as human-readable UTF-8 JSON."""
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    with open(OUTPUT, 'w', encoding='utf-8') as fh:
        fh.write(serialized)
|
||||
|
||||
|
||||
def load_progress():
    """Return previously saved progress from OUTPUT, or None.

    An unparseable progress file is treated the same as a missing one.
    """
    if not os.path.exists(OUTPUT):
        return None
    try:
        with open(OUTPUT) as fh:
            saved = json.load(fh)
    except (json.JSONDecodeError, KeyError):
        return None
    return saved
|
||||
|
||||
|
||||
async def main():
    """Two-phase, resumable scraper.

    Phase 1 discovers and scrapes all decks/cards for each course pack in
    PACK_URLS; Phase 2 scrapes SpanishDict example sentences for every
    unique word. Progress is written to OUTPUT after each course and every
    20 words so an interrupted run can resume.
    """
    # Check for existing progress
    existing = load_progress()
    completed_courses = set()
    examples_done = {}  # lookup -> examples list

    if existing and 'courses' in existing:
        for course in existing['courses']:
            if course.get('_examples_done'):
                completed_courses.add(course['course'])
            # Collect already-scraped examples
            for week in course.get('weeks', []):
                for deck in week.get('decks', []):
                    for card in deck.get('cards', []):
                        if card.get('examples'):
                            lookup = extract_word_for_lookup(card['front'])
                            examples_done[lookup] = card['examples']
        print(f"Loaded progress: {len(completed_courses)} completed courses, {len(examples_done)} words with examples")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent=USER_AGENT)
        page = await context.new_page()

        all_courses = []

        # If we have existing data for completed courses, keep them
        if existing and 'courses' in existing:
            for course in existing['courses']:
                if course['course'] in completed_courses:
                    all_courses.append(course)

        # ---------------------------------------------------------------
        # Phase 1: Discover decks and scrape cards for each course pack
        # ---------------------------------------------------------------
        for pack_url in PACK_URLS:
            course_name, deck_urls = await discover_deck_urls(page, pack_url)

            # Skip if already completed
            if course_name in completed_courses:
                print(f" Skipping {course_name} (already completed)")
                continue

            await page.wait_for_timeout(300)

            all_decks = []
            total_cards = 0

            for i, deck_url in enumerate(deck_urls):
                slug = deck_url.split('/')[2] if len(deck_url.split('/')) > 2 else deck_url
                print(f" [{i+1}/{len(deck_urls)}] Scraping {slug[:60]}...")
                try:
                    deck = await scrape_deck(page, deck_url)
                    all_decks.append(deck)
                    total_cards += deck["cardCount"]
                    print(f" -> Week {deck['week']}: {deck['title']} ({deck['cardCount']} cards)")
                except Exception as e:
                    # One failed deck is logged and skipped; the rest still scrape.
                    print(f" ERROR: {e}")

                await page.wait_for_timeout(300)

            # Organize by week
            weeks = {}
            for deck in all_decks:
                w = deck["week"]
                if w not in weeks:
                    weeks[w] = []
                weeks[w].append({
                    "title": deck["title"],
                    "isReversed": deck["isReversed"],
                    "cardCount": deck["cardCount"],
                    "cards": deck["cards"],
                })

            course_data = {
                "course": course_name,
                "totalDecks": len(all_decks),
                "totalCards": total_cards,
                "_examples_done": False,  # internal resume flag; stripped before final save
                "weeks": [
                    {"week": w, "decks": weeks[w]}
                    for w in sorted(weeks.keys())
                ],
            }
            all_courses.append(course_data)

            # Save after each course
            save_progress({"courses": all_courses})
            print(f" Saved {course_name}: {len(all_decks)} decks, {total_cards} cards")

        # ---------------------------------------------------------------
        # Phase 2: Scrape example sentences from SpanishDict
        # ---------------------------------------------------------------
        print("\n" + "=" * 60)
        print("Phase 2: Scraping example sentences from SpanishDict")
        print("=" * 60)

        # Collect all unique words across all courses (non-reversed decks)
        unique_words = {}  # lookup -> original front
        for course in all_courses:
            for week in course['weeks']:
                for deck in week['decks']:
                    if deck.get('isReversed'):
                        continue
                    for card in deck['cards']:
                        front = card['front']
                        lookup = extract_word_for_lookup(front)
                        if lookup and lookup not in unique_words:
                            unique_words[lookup] = front

        print(f"Found {len(unique_words)} unique words to look up")
        print(f"Already have examples for {len(examples_done)} words")

        words_scraped = 0
        total_words = len(unique_words)

        for i, (lookup, original) in enumerate(unique_words.items()):
            # Words already scraped in a previous (or this) run are skipped.
            if lookup in examples_done:
                continue

            print(f"[{i+1}/{total_words}] {lookup}...", end=" ", flush=True)
            try:
                examples = await scrape_examples_for_word(page, lookup)
                examples_done[lookup] = examples
                if examples:
                    print(f"{len(examples)} examples")
                else:
                    print("no examples")
            except Exception as e:
                print(f"error: {e}")
                examples_done[lookup] = []

            words_scraped += 1

            # Save progress every 20 words
            if words_scraped % 20 == 0:
                # Attach examples to cards before saving
                _attach_examples(all_courses, examples_done)
                save_progress({"courses": all_courses})
                print(f" [saved progress - {len(examples_done)} words done]")

            await page.wait_for_timeout(300)

        await browser.close()

    # ---------------------------------------------------------------
    # Final: attach all examples to cards and save
    # ---------------------------------------------------------------
    _attach_examples(all_courses, examples_done)

    # Mark all courses as examples_done and remove internal flag
    for course in all_courses:
        course['_examples_done'] = True

    # Clean up internal flags before final save
    # NOTE(review): this pop undoes the flag set just above, so a re-run
    # against the final output file will NOT detect completed courses —
    # confirm whether the flag should survive in the saved file.
    for course in all_courses:
        course.pop('_examples_done', None)

    save_progress({"courses": all_courses})

    total_decks = sum(c['totalDecks'] for c in all_courses)
    total_cards = sum(c['totalCards'] for c in all_courses)
    print(f"\nDone! {len(all_courses)} courses, {total_decks} decks, {total_cards} cards")
    print(f"Examples scraped for {len(examples_done)} unique words")
    print(f"Output: {OUTPUT}")
|
||||
|
||||
|
||||
def _attach_examples(courses, examples_done):
    """Attach scraped example sentences to every card, mutating in place.

    Cards whose lookup word has non-empty examples get them assigned;
    otherwise the card receives an empty list unless it already has one.
    """
    every_card = (
        card
        for course in courses
        for week in course['weeks']
        for deck in week['decks']
        for card in deck['cards']
    )
    for card in every_card:
        found = examples_done.get(extract_word_for_lookup(card['front']))
        if found:
            card['examples'] = found
        else:
            card.setdefault('examples', [])
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the full two-phase async scrape.
    asyncio.run(main())
|
||||
238
Conjuga/Scripts/scrape_brainscape.py
Normal file
238
Conjuga/Scripts/scrape_brainscape.py
Normal file
@@ -0,0 +1,238 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Scrape all 39 LanGo Spanish Beginner I decks from Brainscape using Playwright.
|
||||
Outputs course_data.json with all decks and cards organized by week.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
BASE_URL = "https://www.brainscape.com"
|
||||
PACK_ID = "18164266"
|
||||
OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_data.json"
|
||||
|
||||
DECK_URLS = [
|
||||
"/flashcards/week-1-greetings-los-saludos-10176532/packs/18164266",
|
||||
"/flashcards/week-1-greetings-los-saludos-al-reves-12745728/packs/18164266",
|
||||
"/flashcards/week-2-adjectives-los-adjetivos-12745741/packs/18164266",
|
||||
"/flashcards/week-2-adjectives-los-adjetivos-al-reves-12745829/packs/18164266",
|
||||
"/flashcards/week-2-numbers-los-numeros-12797877/packs/18164266",
|
||||
"/flashcards/week-2-numbers-los-numeros-al-reves-13698219/packs/18164266",
|
||||
"/flashcards/week-2-professions-las-profesiones-12740531/packs/18164266",
|
||||
"/flashcards/week-2-professions-las-profesiones-al-re-12745832/packs/18164266",
|
||||
"/flashcards/week-3-house-la-casa-10216249/packs/18164266",
|
||||
"/flashcards/week-3-house-la-casa-al-reves-12745837/packs/18164266",
|
||||
"/flashcards/week-3-ar-verbs-10207117/packs/18164266",
|
||||
"/flashcards/week-3-ar-verbs-al-reves-12745833/packs/18164266",
|
||||
"/flashcards/week-3-er-verbs-12745857/packs/18164266",
|
||||
"/flashcards/week-3-er-verbs-al-reves-12745888/packs/18164266",
|
||||
"/flashcards/week-3-ir-verbs-10207120/packs/18164266",
|
||||
"/flashcards/week-3-ir-verbs-al-reves-12745835/packs/18164266",
|
||||
"/flashcards/week-4-family-la-familia-10266419/packs/18164266",
|
||||
"/flashcards/week-4-family-la-familia-al-reves-12745978/packs/18164266",
|
||||
"/flashcards/week-4-e-ie-stem-changing-verbs-10270069/packs/18164266",
|
||||
"/flashcards/week-4-e-ie-stem-changing-verbs-al-reves-12749152/packs/18164266",
|
||||
"/flashcards/week-4-e-i-stem-changing-verbs-10270070/packs/18164266",
|
||||
"/flashcards/week-4-e-i-stem-changing-verbs-al-reves-12749160/packs/18164266",
|
||||
"/flashcards/week-4-o-ue-stem-changing-verbs-10270071/packs/18164266",
|
||||
"/flashcards/week-4-o-ue-stem-changing-verbs-al-reves-12749172/packs/18164266",
|
||||
"/flashcards/week-4-exceptional-yo-forms-10286213/packs/18164266",
|
||||
"/flashcards/week-4-exceptional-yo-forms-al-reves-12749234/packs/18164266",
|
||||
"/flashcards/week-5-reflexive-verbs-los-verbos-reflex-10270072/packs/18164266",
|
||||
"/flashcards/week-5-reflexive-verbs-los-verbos-reflex-12745842/packs/18164266",
|
||||
"/flashcards/week-5-daily-routine-la-rutina-cotidiana-11869082/packs/18164266",
|
||||
"/flashcards/week-5-daily-routine-la-rutina-cotidiana-12745840/packs/18164266",
|
||||
"/flashcards/week-6-city-la-ciudad-10232784/packs/18164266",
|
||||
"/flashcards/week-6-city-la-ciudad-al-reves-12745942/packs/18164266",
|
||||
"/flashcards/week-6-time-expressions-las-expresiones-12797878/packs/18164266",
|
||||
"/flashcards/week-6-time-expressions-las-expresiones-13698220/packs/18164266",
|
||||
"/flashcards/week-7-idioms-with-the-verb-tener-los-mo-11951594/packs/18164266",
|
||||
"/flashcards/week-8-prepositions-and-negation-las-pre-11951441/packs/18164266",
|
||||
"/flashcards/week-8-prepositions-and-negation-las-pre-16094943/packs/18164266",
|
||||
"/flashcards/week-8-hobbies-los-pasatiempos-10232782/packs/18164266",
|
||||
"/flashcards/week-8-hobbies-los-pasatiempos-al-reves-12745838/packs/18164266",
|
||||
]
|
||||
|
||||
|
||||
def parse_title_and_week(text):
    """Split 'Week N: Title' text into (week_number, title).

    Text without a recognizable week prefix yields week 0 and the
    stripped text unchanged.
    """
    match = re.match(r'Week\s+(\d+):\s*(.+)', text, re.IGNORECASE)
    if match is None:
        return 0, text.strip()
    week_number, title = match.groups()
    return int(week_number), title.strip()
|
||||
|
||||
|
||||
def parse_cards(text):
    """Parse flashcard Q/A pairs from page text.

    Cards appear in the page body as a bare number line followed by
    content lines; the first two surviving content lines become the
    card's front and back. Returns a list of {"front", "back"} dicts.
    """
    cards = []
    lines = text.split('\n')

    # Filter out noise lines
    skip = {'Q', 'A', 'Study These Flashcards', '', 'Brainscape', 'Find Flashcards',
            'Make Flashcards', 'How It Works', 'Educators', 'Businesses', 'Academy',
            'Log in', 'Get Started'}

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # Look for a card number
        if re.match(r'^\d+$', line):
            num = int(line)
            # Collect content lines until the next card number or deck list
            parts = []
            j = i + 1
            while j < len(lines) and len(parts) < 6:
                nextline = lines[j].strip()

                # Stop at next card number (must be exactly num + 1)
                if re.match(r'^\d+$', nextline) and int(nextline) == num + 1:
                    break

                # Stop at deck list / footer
                if nextline.startswith('LanGo Spanish') or nextline.startswith('Decks in class'):
                    break

                # Stop at other deck titles leaking in
                if re.match(r'^Week \d+:', nextline):
                    break

                # Skip noise
                if nextline in skip:
                    j += 1
                    continue

                parts.append(nextline)
                j += 1

            # Need at least a front and a back to make a card.
            if len(parts) >= 2:
                cards.append({
                    "front": parts[0],
                    "back": parts[1],
                })
            i = j
        else:
            i += 1

    # Post-filter: remove any cards that are actually deck titles
    cards = [c for c in cards if not re.match(r'^Week \d+:', c['front'])
             and c['front'] not in ('Decks in class (39)', '# Cards')
             and not c['front'].startswith('LanGo Spanish')
             and not c['front'].startswith('You may prefer')]
    return cards
|
||||
|
||||
|
||||
async def scrape_deck(page, url):
    """Scrape a single deck page.

    Returns a dict with the week number, cleaned title, reversed-deck
    flag, parsed cards, card count, and the source URL.
    (Fix: removed a no-op `if wm: raw_title = raw_title` branch.)
    """
    full_url = BASE_URL + url
    await page.goto(full_url, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)
    # Scroll to load lazy content
    for _ in range(5):
        await page.evaluate("window.scrollBy(0, 1000)")
        await page.wait_for_timeout(300)

    text = await page.inner_text("body")

    # Extract title — try multiple patterns
    # Format: "LanGo Spanish | Beginner I > Week N: Title > Flashcards"
    title_match = re.search(r'>\s*(Week\s+\d+:.+?)\s*>\s*Flashcards', text)
    if title_match:
        raw_title = title_match.group(1).strip()
    else:
        # Try: "Week N: Title (Subtitle) Flashcards"
        heading_match = re.search(r'(Week\s+\d+:.+?)\s*Flashcards', text)
        if heading_match:
            raw_title = heading_match.group(1).strip()
        else:
            # Last resort: extract from URL slug
            slug = url.split('/')[2]
            # Convert "week-5-reflexive-verbs-los-verbos-reflex-10270072" to title
            slug_clean = re.sub(r'-\d+$', '', slug)  # remove trailing ID
            slug_clean = re.sub(r'-al-rev(e|é)s$', ' AL REVÉS', slug_clean)
            raw_title = slug_clean.replace('-', ' ').title()
            # Prefix a placeholder week when the slug carries no week number.
            if not re.match(r'Week\s+(\d+)', raw_title, re.IGNORECASE):
                raw_title = "Week 0: " + raw_title

    week, title = parse_title_and_week(raw_title)
    cards = parse_cards(text)

    # "AL REVÉS" decks are the reversed (answer -> question) variants.
    is_reversed = "al rev" in url.lower() or "AL REVÉS" in raw_title.upper()

    return {
        "week": week,
        "title": title,
        "isReversed": is_reversed,
        "cardCount": len(cards),
        "cards": cards,
        "url": url,
    }
|
||||
|
||||
|
||||
async def main():
    """Scrape every deck in DECK_URLS, group them by week, and write the
    aggregated course JSON to OUTPUT."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        )
        page = await context.new_page()

        all_decks = []
        total_cards = 0

        for i, url in enumerate(DECK_URLS):
            print(f"[{i+1}/{len(DECK_URLS)}] Scraping {url.split('/')[2][:50]}...")
            try:
                deck = await scrape_deck(page, url)
                all_decks.append(deck)
                total_cards += deck["cardCount"]
                print(f" → Week {deck['week']}: {deck['title']} ({deck['cardCount']} cards)")
            except Exception as e:
                # A failed deck is logged and skipped; the rest still scrape.
                print(f" ERROR: {e}")

            # Be polite
            await page.wait_for_timeout(500)

        await browser.close()

    # Organize by week
    weeks = {}
    for deck in all_decks:
        w = deck["week"]
        if w not in weeks:
            weeks[w] = []
        weeks[w].append({
            "title": deck["title"],
            "isReversed": deck["isReversed"],
            "cardCount": deck["cardCount"],
            "cards": deck["cards"],
        })

    output = {
        "course": "LanGo Spanish | Beginner I",
        "totalDecks": len(all_decks),
        "totalCards": total_cards,
        "weeks": [
            {
                "week": w,
                "decks": weeks[w],
            }
            for w in sorted(weeks.keys())
        ],
    }

    with open(OUTPUT, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print(f"\nDone! {len(all_decks)} decks, {total_cards} cards → {OUTPUT}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: scrape all decks and write the course JSON.
    asyncio.run(main())
|
||||
166
Conjuga/Scripts/scrape_examples.py
Normal file
166
Conjuga/Scripts/scrape_examples.py
Normal file
@@ -0,0 +1,166 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Scrape 2-3 example sentences per vocab word from SpanishDict.
|
||||
Reads words from course_data.json, outputs examples to course_examples.json.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
INPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_data.json"
|
||||
OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_examples.json"
|
||||
MAX_EXAMPLES = 3
|
||||
|
||||
def extract_word_for_lookup(front):
    """Extract the best lookup word from a card front.

    e.g. 'barato, barata' -> 'barato'
    e.g. 'el/la periodista' -> 'periodista'

    Returns the normalized lowercase word used to build the SpanishDict
    translate URL.
    """
    word = front.strip()
    # Remove articles (plain form first, then the combined 'el/la' form)
    word = re.sub(r'^(el|la|los|las|un|una)\s+', '', word, flags=re.IGNORECASE)
    word = re.sub(r'^(el/la|los/las)\s+', '', word, flags=re.IGNORECASE)
    # Take first word if comma-separated (barato, barata -> barato)
    if ',' in word:
        word = word.split(',')[0].strip()
    # Take first word if slash-separated
    if '/' in word:
        word = word.split('/')[0].strip()
    return word.lower().strip()
|
||||
|
||||
|
||||
def parse_examples(text, lookup_word):
    """Parse example sentences from SpanishDict page text.

    Returns up to MAX_EXAMPLES dicts of the form {"es": ..., "en": ...}.
    """
    examples = []
    lines = text.split('\n')

    for i, line in enumerate(lines):
        l = line.strip()
        # Very short lines cannot hold a full sentence pair.
        if not l or len(l) < 15:
            continue

        # Pattern: "Spanish sentence.English sentence." (inline on one line)
        # SpanishDict puts them together with no space between period and capital
        # e.g. "Esta tienda es muy barata.This store is really cheap."
        inline_match = re.match(r'^(.+?[.!?])([A-Z].+)$', l)
        if inline_match:
            es = inline_match.group(1).strip()
            en = inline_match.group(2).strip()
            # Verify it contains our word (case-insensitive)
            if lookup_word.lower() in es.lower() and len(es) > 10 and len(en) > 5:
                examples.append({"es": es, "en": en})
                if len(examples) >= MAX_EXAMPLES:
                    break
            continue

        # Pattern: standalone Spanish sentence with word, followed by English on next line
        if lookup_word.lower() in l.lower() and len(l) > 15 and len(l) < 300:
            # Check if next non-empty line is English
            for j in range(i + 1, min(i + 3, len(lines))):
                next_l = lines[j].strip()
                if not next_l:
                    continue
                # Check if it looks English (starts with capital, no
                # Spanish-only characters) — heuristic, TODO confirm
                if (next_l[0].isupper() and
                        not any(c in next_l for c in ['á', 'é', 'í', 'ó', 'ú', 'ñ', '¿', '¡'])):
                    examples.append({"es": l, "en": next_l})
                    if len(examples) >= MAX_EXAMPLES:
                        break
                    break

        if len(examples) >= MAX_EXAMPLES:
            break

    return examples
|
||||
|
||||
|
||||
async def scrape_word(page, word, lookup):
    """Scrape example sentences for a single word from SpanishDict.

    Args:
        page: Playwright page used for navigation.
        word: Original card front (kept for interface compatibility; not
            used in the query itself).
        lookup: Normalized word placed in the SpanishDict translate URL.

    Returns a list of {"es", "en"} dicts (up to MAX_EXAMPLES); empty on
    any failure so one bad word never aborts the run.
    (Fix: removed an unused exception binding and a needless
    intermediate variable.)
    """
    url = f"https://www.spanishdict.com/translate/{lookup}"
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=15000)
        await page.wait_for_timeout(2000)
        text = await page.inner_text("body")
        return parse_examples(text, lookup)
    except Exception:
        # Best-effort: swallow navigation/parsing errors and report no examples.
        return []
|
||||
|
||||
|
||||
async def main():
    """Read course_data.json, scrape SpanishDict examples for each unique
    vocab word, and write the results to OUTPUT (resumable; progress is
    saved every 20 words)."""
    # Load course data
    with open(INPUT) as f:
        data = json.load(f)

    # Collect unique words (front values from non-reversed decks)
    words = {}  # lookup -> original front
    for week in data['weeks']:
        for deck in week['decks']:
            if deck.get('isReversed'):
                continue
            for card in deck['cards']:
                front = card['front']
                lookup = extract_word_for_lookup(front)
                if lookup and lookup not in words:
                    words[lookup] = front

    print(f"Found {len(words)} unique words to look up")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        ctx = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        )
        page = await ctx.new_page()

        # Load existing progress if any
        results = {}
        if os.path.exists(OUTPUT):
            with open(OUTPUT) as f:
                results = json.load(f)
            print(f"Loaded {len(results)} existing results")

        # NOTE(review): `found` starts at len(results), which also counts
        # words previously saved with zero examples — confirm whether the
        # final "found" tally should exclude empty entries.
        found = len(results)
        total = len(words)

        for i, (lookup, original) in enumerate(words.items()):
            # Skip already scraped
            if original in results:
                continue

            print(f"[{i+1}/{total}] {lookup}...", end=" ", flush=True)
            try:
                examples = await scrape_word(page, original, lookup)
                if examples:
                    results[original] = examples
                    found += 1
                    print(f"{len(examples)} examples")
                else:
                    results[original] = []
                    print("no examples")
            except Exception as e:
                print(f"error: {e}")
                results[original] = []

            # Save progress every 20 words
            if (i + 1) % 20 == 0:
                with open(OUTPUT, 'w', encoding='utf-8') as f:
                    json.dump(results, f, ensure_ascii=False, indent=2)
                print(f" [saved {len(results)} results]")

            await page.wait_for_timeout(300)

        await browser.close()

    # Save results
    with open(OUTPUT, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\nDone! {found}/{total} words with examples → {OUTPUT}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: scrape example sentences for all vocab words.
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user