Initial commit: Conjuga Spanish conjugation app
Includes SwiftData dual-store architecture (local reference + CloudKit user data), JSON-based data seeding, 20 tense guides, 20 grammar notes, SRS review system, course vocabulary, and widget support. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
166
Conjuga/Scripts/scrape_examples.py
Normal file
166
Conjuga/Scripts/scrape_examples.py
Normal file
@@ -0,0 +1,166 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Scrape 2-3 example sentences per vocab word from SpanishDict.
|
||||
Reads words from course_data.json, outputs examples to course_examples.json.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
# Data files are resolved relative to this script's own directory rather than
# a user-specific absolute path, so the scraper runs from any checkout and
# from any working directory.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Course deck data read by main().
INPUT = os.path.join(_SCRIPT_DIR, "course_data.json")

# Scraped examples written by main(); also read back to resume earlier runs.
OUTPUT = os.path.join(_SCRIPT_DIR, "course_examples.json")

# Maximum number of example sentence pairs kept per vocabulary word.
MAX_EXAMPLES = 3
|
||||
|
||||
def extract_word_for_lookup(front):
    """Extract the best lookup word from a card front.

    A leading article, or a slash-joined pair of articles, is dropped. Then
    only the first of any comma- or slash-separated alternatives is kept.

    e.g. 'barato, barata' -> 'barato'
    e.g. 'el/la periodista' -> 'periodista'
    e.g. 'un/una artista' -> 'artista'

    Args:
        front: Text shown on the front of a vocabulary card.

    Returns:
        The lower-cased lookup word, or '' when nothing usable remains.
    """
    article = r'(?:el|la|los|las|un|una|unos|unas)'
    # One pattern covers both "la casa" and "el/la periodista". The trailing
    # \s+ guarantees that words merely starting with an article's letters
    # (e.g. "lago", "unir") are left untouched.
    leading_articles = rf'^{article}(?:\s*/\s*{article})?\s+'
    word = re.sub(leading_articles, '', front.strip(), flags=re.IGNORECASE)

    # Keep only the first alternative: "barato, barata" -> "barato",
    # "guapo/a" -> "guapo".
    for separator in (',', '/'):
        word = word.split(separator)[0].strip()

    return word.lower()
|
||||
|
||||
|
||||
def parse_examples(text, lookup_word, max_examples=None):
    """Parse example sentence pairs from SpanishDict page text.

    Two line layouts are recognised:

    * Inline: Spanish and English fused on one line with no space between
      the Spanish end punctuation and the English capital, e.g.
      "Esta tienda es muy barata.This store is really cheap."
    * Stacked: a Spanish line containing the word, whose next non-empty line
      (within the following two lines) looks English: it starts with an
      upper-case letter and has no Spanish-specific characters.

    Args:
        text: Page text; each line is examined separately.
        lookup_word: Word the Spanish sentence must contain. Matching is a
            case-insensitive substring test, so inflected forms that extend
            the word (e.g. plurals) also match.
        max_examples: Maximum number of pairs to return. Defaults to the
            module-level MAX_EXAMPLES.

    Returns:
        A list of {"es": ..., "en": ...} dicts in page order, with repeated
        Spanish sentences collapsed to their first occurrence.
    """
    limit = MAX_EXAMPLES if max_examples is None else max_examples
    needle = lookup_word.lower() if lookup_word else ""
    # An empty needle is a substring of every line and would accept
    # arbitrary page text as "examples".
    if not text or not needle or limit <= 0:
        return []

    inline_pattern = re.compile(r'^(.+?[.!?])([A-Z].+)$')
    spanish_marks = ('á', 'é', 'í', 'ó', 'ú', 'ñ', '¿', '¡')

    lines = [raw.strip() for raw in text.split('\n')]
    examples = []
    seen_spanish = set()

    def add_example(es, en):
        # The same sentence can appear more than once on a page.
        if es not in seen_spanish:
            seen_spanish.add(es)
            examples.append({"es": es, "en": en})

    for index, line in enumerate(lines):
        if len(examples) >= limit:
            break
        if len(line) < 15:
            continue

        # Inline layout: split at the first end punctuation that is
        # immediately followed by an ASCII capital letter.
        inline = inline_pattern.match(line)
        if inline:
            es = inline.group(1).strip()
            en = inline.group(2).strip()
            if needle in es.lower() and len(es) > 10 and len(en) > 5:
                add_example(es, en)
            # A fused line is never retried as a stacked Spanish sentence.
            continue

        # Stacked layout.
        if needle not in line.lower() or not 15 < len(line) < 300:
            continue
        # Only the first non-empty line among the next two is considered.
        following = next((c for c in lines[index + 1:index + 3] if c), None)
        if (following
                and following[0].isupper()
                and not any(mark in following for mark in spanish_marks)):
            add_example(line, following)

    return examples
|
||||
|
||||
|
||||
async def scrape_word(page, word, lookup):
    """Scrape example sentences for a single word from SpanishDict.

    Args:
        page: Playwright page used to load the translation page.
        word: Original card front. Not used here; kept so existing callers
            keep working.
        lookup: Normalised lookup word, placed in the URL and used to filter
            the sentences that are returned.

    Returns:
        The list of {"es": ..., "en": ...} dicts produced by parse_examples,
        or [] when the page could not be loaded or parsed.
    """
    import logging
    from urllib.parse import quote

    # Percent-encode the lookup so multi-word entries and characters such as
    # "?", "#" or "/" cannot change the URL's path, query or fragment.
    url = f"https://www.spanishdict.com/translate/{quote(lookup, safe='')}"
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=15000)
        # Fixed pause before reading the body text — presumably to let
        # client-rendered content appear; TODO confirm it is still needed.
        await page.wait_for_timeout(2000)
        text = await page.inner_text("body")
        return parse_examples(text, lookup)
    except Exception as exc:
        # Best-effort: one failing word must not abort the whole run, but
        # the failure is reported instead of being silently discarded.
        logging.getLogger(__name__).warning(
            "Could not scrape examples for %r: %s", lookup, exc
        )
        return []
|
||||
|
||||
|
||||
def _collect_lookup_words(data):
    """Map each unique lookup word to the first card front that yields it.

    Decks flagged 'isReversed' are skipped.
    """
    words = {}  # lookup -> original front
    for week in data['weeks']:
        for deck in week['decks']:
            if deck.get('isReversed'):
                continue
            for card in deck['cards']:
                front = card['front']
                lookup = extract_word_for_lookup(front)
                if lookup and lookup not in words:
                    words[lookup] = front
    return words


def _save_results(results):
    """Write the results mapping to OUTPUT as indented UTF-8 JSON."""
    with open(OUTPUT, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


async def main():
    """Scrape example sentences for every course word and save them as JSON.

    Reads the course decks from INPUT, looks each unique word up with
    scrape_word, and writes a mapping of original card front -> list of
    examples to OUTPUT. Fronts already present in OUTPUT are skipped, so an
    interrupted run can be resumed. Note that fronts stored with an empty
    list (no examples, or a failed lookup) are skipped on later runs too.
    """
    # Read explicitly as UTF-8: the files hold accented Spanish text and are
    # written as UTF-8 below, while open() otherwise uses the platform's
    # default encoding.
    with open(INPUT, encoding='utf-8') as f:
        data = json.load(f)

    words = _collect_lookup_words(data)
    print(f"Found {len(words)} unique words to look up")

    # Load existing progress if any
    results = {}
    if os.path.exists(OUTPUT):
        with open(OUTPUT, encoding='utf-8') as f:
            results = json.load(f)
        print(f"Loaded {len(results)} existing results")

    total = len(words)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            ctx = await browser.new_context(
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
            )
            page = await ctx.new_page()

            for i, (lookup, original) in enumerate(words.items()):
                # Skip already scraped
                if original in results:
                    continue

                print(f"[{i+1}/{total}] {lookup}...", end=" ", flush=True)
                try:
                    examples = await scrape_word(page, original, lookup)
                except Exception as e:
                    print(f"error: {e}")
                    examples = []
                else:
                    if examples:
                        print(f"{len(examples)} examples")
                    else:
                        print("no examples")
                results[original] = examples or []

                # Save progress every 20 words
                if (i + 1) % 20 == 0:
                    _save_results(results)
                    print(f" [saved {len(results)} results]")

                # Short pause between lookups.
                await page.wait_for_timeout(300)
        finally:
            # Persist whatever was collected and release the browser even if
            # the run is interrupted part-way through.
            _save_results(results)
            await browser.close()

    # Count only current words that actually have examples; cached empty
    # entries and stale fronts from earlier runs are not successes.
    found = sum(1 for original in words.values() if results.get(original))
    print(f"\nDone! {found}/{total} words with examples → {OUTPUT}")
|
||||
|
||||
|
||||
# Script entry point: run the async scraper to completion when this file is
# executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user