Includes SwiftData dual-store architecture (local reference + CloudKit user data), JSON-based data seeding, 20 tense guides, 20 grammar notes, SRS review system, course vocabulary, and widget support. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
167 lines
5.8 KiB
Python
167 lines
5.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Scrape 2-3 example sentences per vocab word from SpanishDict.
|
|
Reads words from course_data.json, outputs examples to course_examples.json.
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import re
|
|
import os
|
|
from playwright.async_api import async_playwright
|
|
|
|
INPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_data.json"
|
|
OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_examples.json"
|
|
MAX_EXAMPLES = 3
|
|
|
|
def extract_word_for_lookup(front):
    """Reduce a flashcard front to a single lowercase lookup word.

    e.g. 'barato, barata' -> 'barato'
    e.g. 'el/la periodista' -> 'periodista'
    """
    candidate = front.strip()
    # Strip a leading article, including the combined 'el/la' / 'los/las' forms.
    for article_pattern in (r'^(el|la|los|las|un|una)\s+', r'^(el/la|los/las)\s+'):
        candidate = re.sub(article_pattern, '', candidate, flags=re.IGNORECASE)
    # When alternatives are listed ('barato, barata' or 'esposo/esposa'),
    # keep only the first variant.
    for separator in (',', '/'):
        if separator in candidate:
            candidate = candidate.split(separator)[0].strip()
    return candidate.lower().strip()
|
|
|
|
|
|
def parse_examples(text, lookup_word):
    """Parse example sentences from SpanishDict page text."""
    pairs = []
    rows = text.split('\n')
    target = lookup_word.lower()
    # SpanishDict renders "Spanish.English." fused on one line with no space
    # between the closing punctuation and the English capital letter,
    # e.g. "Esta tienda es muy barata.This store is really cheap."
    inline_pattern = re.compile(r'^(.+?[.!?])([A-Z].+)$')

    for idx, raw in enumerate(rows):
        row = raw.strip()
        if not row or len(row) < 15:
            continue

        fused = inline_pattern.match(row)
        if fused:
            es = fused.group(1).strip()
            en = fused.group(2).strip()
            # Only keep pairs that actually mention the lookup word and
            # are long enough to be real sentences.
            if target in es.lower() and len(es) > 10 and len(en) > 5:
                pairs.append({"es": es, "en": en})
                if len(pairs) >= MAX_EXAMPLES:
                    break
            continue

        # Fallback layout: a standalone Spanish sentence containing the word,
        # with its English translation on one of the next couple of lines.
        if target in row.lower() and 15 < len(row) < 300:
            for follower_idx in range(idx + 1, min(idx + 3, len(rows))):
                follower = rows[follower_idx].strip()
                if not follower:
                    continue
                # Heuristic for "looks English": capitalized start and no
                # Spanish-specific characters anywhere in the line.
                if follower[0].isupper() and not any(
                        ch in follower for ch in ['á', 'é', 'í', 'ó', 'ú', 'ñ', '¿', '¡']):
                    pairs.append({"es": row, "en": follower})
                    break

        if len(pairs) >= MAX_EXAMPLES:
            break

    return pairs
|
|
|
|
|
|
async def scrape_word(page, word, lookup):
    """Scrape example sentences for a single word.

    Args:
        page: Playwright page used for navigation.
        word: Original card front (unused here, kept for call-site symmetry).
        lookup: Normalized lookup word used in the SpanishDict URL.

    Returns:
        List of {"es": ..., "en": ...} dicts (possibly empty). Any navigation
        or parsing failure is treated as "no examples" rather than an error,
        so one bad page cannot abort the whole scraping run.
    """
    url = f"https://www.spanishdict.com/translate/{lookup}"
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=15000)
        # Give client-side rendering time to populate the examples section.
        await page.wait_for_timeout(2000)
        text = await page.inner_text("body")
        return parse_examples(text, lookup)
    except Exception:
        # Deliberate best-effort: swallow the failure and yield no examples.
        # (Previously bound the exception to an unused variable `e`.)
        return []
|
|
|
|
|
|
async def main():
    """Scrape SpanishDict examples for every unique course word.

    Reads course_data.json, dedupes card fronts into lookup words, scrapes
    each word with a headless Chromium page, and writes results (resumable)
    to course_examples.json.
    """
    # Load course data
    with open(INPUT) as f:
        data = json.load(f)

    # Collect unique words (front values from non-reversed decks)
    words = {}  # lookup -> original front
    for week in data['weeks']:
        for deck in week['decks']:
            # Reversed decks repeat the same vocabulary; skip them.
            if deck.get('isReversed'):
                continue
            for card in deck['cards']:
                front = card['front']
                lookup = extract_word_for_lookup(front)
                # First card to produce a lookup wins; later duplicates ignored.
                if lookup and lookup not in words:
                    words[lookup] = front

    print(f"Found {len(words)} unique words to look up")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # Desktop Chrome UA so SpanishDict serves the normal page layout.
        ctx = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        )
        page = await ctx.new_page()

        # Load existing progress if any (makes the run resumable after a crash)
        results = {}
        if os.path.exists(OUTPUT):
            with open(OUTPUT) as f:
                results = json.load(f)
            print(f"Loaded {len(results)} existing results")

        # NOTE(review): `found` starts at len(results), which also counts
        # previously-saved words with zero examples — the final "found/total"
        # line may overstate words that actually have examples. Confirm intent.
        found = len(results)
        total = len(words)

        for i, (lookup, original) in enumerate(words.items()):
            # Skip already scraped
            if original in results:
                continue

            print(f"[{i+1}/{total}] {lookup}...", end=" ", flush=True)
            try:
                examples = await scrape_word(page, original, lookup)
                if examples:
                    results[original] = examples
                    found += 1
                    print(f"{len(examples)} examples")
                else:
                    # Record the empty result so we don't re-scrape next run.
                    results[original] = []
                    print("no examples")
            except Exception as e:
                print(f"error: {e}")
                results[original] = []

            # Save progress every 20 words
            if (i + 1) % 20 == 0:
                with open(OUTPUT, 'w', encoding='utf-8') as f:
                    json.dump(results, f, ensure_ascii=False, indent=2)
                print(f" [saved {len(results)} results]")

            # Small delay between requests to be polite to the server.
            await page.wait_for_timeout(300)

        await browser.close()

    # Save results
    with open(OUTPUT, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\nDone! {found}/{total} words with examples → {OUTPUT}")
|
|
|
|
|
|
# Script entry point: run the async scraper on the default event loop.
if __name__ == "__main__":
    asyncio.run(main())
|