Files
Spanish/Conjuga/Scripts/scrape_examples.py
Trey t 4b467ec136 Initial commit: Conjuga Spanish conjugation app
Includes SwiftData dual-store architecture (local reference + CloudKit user data),
JSON-based data seeding, 20 tense guides, 20 grammar notes, SRS review system,
course vocabulary, and widget support.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 20:58:33 -05:00

167 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
Scrape 2-3 example sentences per vocab word from SpanishDict.
Reads words from course_data.json, outputs examples to course_examples.json.
"""
import asyncio
import json
import re
import os
from playwright.async_api import async_playwright
# Absolute paths: course vocabulary input and scraped-examples output.
INPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_data.json"
OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_examples.json"
# Maximum number of example sentences to keep per vocab word.
MAX_EXAMPLES = 3
def extract_word_for_lookup(front):
    """Pick the single dictionary headword out of a card front.

    e.g. 'barato, barata' -> 'barato'
    e.g. 'el/la periodista' -> 'periodista'
    """
    word = front.strip()
    # Strip a leading article, first the plain forms then the gendered-pair forms.
    for article_pattern in (r'^(el|la|los|las|un|una)\s+', r'^(el/la|los/las)\s+'):
        word = re.sub(article_pattern, '', word, flags=re.IGNORECASE)
    # When alternatives are listed ('barato, barata' or 'barato/a'),
    # keep only the first variant; comma takes precedence over slash.
    for separator in (',', '/'):
        if separator in word:
            word = word.split(separator)[0].strip()
    return word.lower().strip()
def parse_examples(text, lookup_word):
    """Parse example sentences from SpanishDict page text.

    Returns a list of at most MAX_EXAMPLES dicts: {"es": ..., "en": ...}.
    Only sentences that contain lookup_word (case-insensitive) are kept.
    """
    examples = []
    lines = text.split('\n')
    for i, line in enumerate(lines):
        l = line.strip()
        # Skip blanks and fragments too short to be a real sentence.
        if not l or len(l) < 15:
            continue
        # Pattern: "Spanish sentence.English sentence." (inline on one line)
        # SpanishDict puts them together with no space between period and capital
        # e.g. "Esta tienda es muy barata.This store is really cheap."
        inline_match = re.match(r'^(.+?[.!?])([A-Z].+)$', l)
        if inline_match:
            es = inline_match.group(1).strip()
            en = inline_match.group(2).strip()
            # Verify it contains our word (case-insensitive)
            if lookup_word.lower() in es.lower() and len(es) > 10 and len(en) > 5:
                examples.append({"es": es, "en": en})
                if len(examples) >= MAX_EXAMPLES:
                    break
            # Inline-matched lines never fall through to the two-line pattern.
            continue
        # Pattern: standalone Spanish sentence with word, followed by English on next line
        if lookup_word.lower() in l.lower() and len(l) > 15 and len(l) < 300:
            # Check if next non-empty line is English
            for j in range(i + 1, min(i + 3, len(lines))):
                next_l = lines[j].strip()
                if not next_l:
                    continue
                # Check if it looks English (starts with capital, has common English words)
                # NOTE(review): heuristic actually tests "no Spanish-only characters",
                # not English vocabulary — confirm it is good enough in practice.
                if (next_l[0].isupper() and
                        not any(c in next_l for c in ['á', 'é', 'í', 'ó', 'ú', 'ñ', '¿', '¡'])):
                    examples.append({"es": l, "en": next_l})
                    if len(examples) >= MAX_EXAMPLES:
                        break
                # Only the first non-empty follower is considered either way.
                break
            if len(examples) >= MAX_EXAMPLES:
                break
    return examples
async def scrape_word(page, word, lookup):
    """Scrape up to MAX_EXAMPLES example sentences for one vocab word.

    Args:
        page: an open Playwright page to navigate with.
        word: the original card front (kept for interface compatibility;
              not used for the lookup itself).
        lookup: normalized headword used to build the SpanishDict URL.

    Returns:
        A list of {"es": ..., "en": ...} dicts, or [] on any failure —
        a timed-out or broken page load is treated as "no examples".
    """
    url = f"https://www.spanishdict.com/translate/{lookup}"
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=15000)
        # Give client-side rendering a moment to populate the example section.
        await page.wait_for_timeout(2000)
        text = await page.inner_text("body")
        return parse_examples(text, lookup)
    except Exception:
        # Best-effort scrape: swallow navigation/timeout errors deliberately
        # so one bad word cannot abort the whole run.
        return []
async def main():
    """Scrape SpanishDict examples for every unique course word.

    Reads course_data.json, resumes from an existing OUTPUT file if present,
    checkpoints every 20 words, and writes a {front: [{"es", "en"}, ...]}
    mapping to OUTPUT.
    """
    # Load course data
    with open(INPUT) as f:
        data = json.load(f)

    # Collect unique words (front values from non-reversed decks)
    words = {}  # lookup -> original front
    for week in data['weeks']:
        for deck in week['decks']:
            if deck.get('isReversed'):
                continue
            for card in deck['cards']:
                front = card['front']
                lookup = extract_word_for_lookup(front)
                if lookup and lookup not in words:
                    words[lookup] = front
    print(f"Found {len(words)} unique words to look up")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        ctx = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        )
        page = await ctx.new_page()

        # Load existing progress if any, so re-runs skip finished words.
        results = {}
        if os.path.exists(OUTPUT):
            with open(OUTPUT) as f:
                results = json.load(f)
            print(f"Loaded {len(results)} existing results")

        # Fix: count only entries that actually have examples. Previously
        # `found = len(results)` also counted resumed empty entries, so the
        # final "{found}/{total} words with examples" summary over-reported.
        found = sum(1 for examples in results.values() if examples)
        total = len(words)
        for i, (lookup, original) in enumerate(words.items()):
            # Skip already scraped
            if original in results:
                continue
            print(f"[{i+1}/{total}] {lookup}...", end=" ", flush=True)
            try:
                examples = await scrape_word(page, original, lookup)
                if examples:
                    results[original] = examples
                    found += 1
                    print(f"{len(examples)} examples")
                else:
                    results[original] = []
                    print("no examples")
            except Exception as e:
                # Record the failure as "no examples" and keep going.
                print(f"error: {e}")
                results[original] = []
            # Save progress every 20 words
            if (i + 1) % 20 == 0:
                with open(OUTPUT, 'w', encoding='utf-8') as f:
                    json.dump(results, f, ensure_ascii=False, indent=2)
                print(f" [saved {len(results)} results]")
            # Brief pause between requests to avoid hammering the site.
            await page.wait_for_timeout(300)
        await browser.close()

    # Save results
    with open(OUTPUT, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nDone! {found}/{total} words with examples → {OUTPUT}")
# Entry point: run the scraper only when executed directly as a script.
if __name__ == "__main__":
    asyncio.run(main())