#!/usr/bin/env python3
"""
Scrape 2-3 example sentences per vocab word from SpanishDict.
Reads words from course_data.json, outputs examples to course_examples.json.
"""

import asyncio
import json
import os
import re
import sys
from urllib.parse import quote

try:
    from playwright.async_api import async_playwright
except ImportError:
    # Playwright is only needed for the actual scraping in main(); keeping the
    # import optional lets the pure parsing helpers be imported without it.
    async_playwright = None

INPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_data.json"
OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_examples.json"
MAX_EXAMPLES = 3

USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)

# Leading articles stripped from a card front before lookup.
_SINGLE_ARTICLE_RE = re.compile(
    r'^(el|la|los|las|un|una|unos|unas)\s+', flags=re.IGNORECASE
)
_PAIRED_ARTICLE_RE = re.compile(
    r'^(el/la|los/las|un/una|unos/unas)\s+', flags=re.IGNORECASE
)

# "Spanish sentence.English sentence." fused on one line: sentence-ending
# punctuation immediately followed by a capital letter.
_INLINE_PAIR_RE = re.compile(r'^(.+?[.!?])([A-Z].+)$')

# Characters whose presence marks a line as Spanish rather than English.
_SPANISH_ONLY_CHARS = 'áéíóúñ¿¡'


def extract_word_for_lookup(front):
    """Extract the best lookup word from a card front.

    e.g. 'barato, barata' -> 'barato'
    e.g. 'el/la periodista' -> 'periodista'
    e.g. 'un/una estudiante' -> 'estudiante'

    Returns the lowercased word; may be an empty string for an empty front.
    """
    word = front.strip()

    # Remove a leading article, either single ("la casa") or paired
    # ("el/la periodista").
    word = _SINGLE_ARTICLE_RE.sub('', word)
    word = _PAIRED_ARTICLE_RE.sub('', word)

    # Take the first form if comma-separated (barato, barata -> barato).
    if ',' in word:
        word = word.split(',')[0].strip()

    # Take the first form if slash-separated.
    if '/' in word:
        word = word.split('/')[0].strip()

    return word.lower().strip()


def _looks_english(line):
    """Return True if a non-empty line starts uppercase and has no Spanish-only characters."""
    return line[0].isupper() and not any(c in line for c in _SPANISH_ONLY_CHARS)


def parse_examples(text, lookup_word):
    """Parse example sentences from SpanishDict page text.

    Two layouts are recognised:
      1. Spanish and English fused on one line, e.g.
         "Esta tienda es muy barata.This store is really cheap."
      2. A Spanish line containing the word, with its English translation as
         the first non-empty line among the next two lines.

    Returns a list of at most MAX_EXAMPLES dicts of the form
    {"es": <spanish>, "en": <english>}.
    """
    examples = []
    needle = lookup_word.lower()
    lines = text.split('\n')

    for i, raw_line in enumerate(lines):
        line = raw_line.strip()
        if len(line) < 15:
            continue

        inline_match = _INLINE_PAIR_RE.match(line)
        if inline_match:
            es = inline_match.group(1).strip()
            en = inline_match.group(2).strip()
            # Keep only pairs whose Spanish half actually contains the word.
            if needle in es.lower() and len(es) > 10 and len(en) > 5:
                examples.append({"es": es, "en": en})
                if len(examples) >= MAX_EXAMPLES:
                    break
            # A fused line is never also treated as a standalone sentence.
            continue

        if needle in line.lower() and 15 < len(line) < 300:
            # Only the first non-empty line within the next two is considered.
            for j in range(i + 1, min(i + 3, len(lines))):
                candidate = lines[j].strip()
                if not candidate:
                    continue
                if _looks_english(candidate):
                    examples.append({"es": line, "en": candidate})
                break
            if len(examples) >= MAX_EXAMPLES:
                break

    return examples


def _build_lookup_url(lookup):
    """Return the SpanishDict translate URL for a lookup word, percent-encoded."""
    # safe="" so that '/', '?' and '#' inside the word cannot alter the URL.
    return f"https://www.spanishdict.com/translate/{quote(lookup, safe='')}"


async def scrape_word(page, word, lookup):
    """Scrape examples for a single word.

    Args:
        page: Playwright page used for navigation.
        word: Original card front; used only in the error message.
        lookup: Normalised word used to build the URL and match sentences.

    Returns:
        A list of example dicts (see parse_examples). Returns an empty list
        if navigation or text extraction fails; the failure is reported on
        stderr rather than raised, so one bad page does not stop the run.
    """
    url = _build_lookup_url(lookup)
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=15000)
        # Give client-side rendering a moment before reading the body text.
        await page.wait_for_timeout(2000)
        text = await page.inner_text("body")
    except Exception as e:
        print(f"error scraping {word!r}: {e}", file=sys.stderr)
        return []
    return parse_examples(text, lookup)


def _collect_words(data):
    """Map each unique lookup word to the first card front that produced it.

    Cards in decks flagged 'isReversed' are skipped.
    """
    words = {}
    for week in data['weeks']:
        for deck in week['decks']:
            if deck.get('isReversed'):
                continue
            for card in deck['cards']:
                front = card['front']
                lookup = extract_word_for_lookup(front)
                if lookup and lookup not in words:
                    words[lookup] = front
    return words


def _save_results(results):
    """Write the results mapping to OUTPUT as UTF-8 JSON."""
    with open(OUTPUT, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


async def main():
    """Scrape examples for every course word, resuming from any saved progress."""
    if async_playwright is None:
        raise SystemExit(
            "Playwright is required: pip install playwright && playwright install chromium"
        )

    # Load course data (explicit UTF-8: the data contains accented characters).
    with open(INPUT, encoding='utf-8') as f:
        data = json.load(f)

    words = _collect_words(data)
    print(f"Found {len(words)} unique words to look up")

    # Load existing progress if any.
    results = {}
    if os.path.exists(OUTPUT):
        with open(OUTPUT, encoding='utf-8') as f:
            results = json.load(f)
        print(f"Loaded {len(results)} existing results")

    # Count only entries that actually have examples; empty lists are stored
    # for words that were tried without success.
    found = sum(1 for examples in results.values() if examples)
    total = len(words)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            ctx = await browser.new_context(user_agent=USER_AGENT)
            page = await ctx.new_page()

            for i, (lookup, original) in enumerate(words.items()):
                # Skip already scraped.
                if original in results:
                    continue

                print(f"[{i+1}/{total}] {lookup}...", end=" ", flush=True)
                examples = await scrape_word(page, original, lookup)
                results[original] = examples
                if examples:
                    found += 1
                    print(f"{len(examples)} examples")
                else:
                    print("no examples")

                # Save progress every 20 words.
                if (i + 1) % 20 == 0:
                    _save_results(results)
                    print(f"  [saved {len(results)} results]")

                # Small pause between requests.
                await page.wait_for_timeout(300)
        finally:
            # Persist whatever was gathered even if the run is interrupted,
            # then release the browser.
            _save_results(results)
            await browser.close()

    print(f"\nDone! {found}/{total} words with examples → {OUTPUT}")


if __name__ == "__main__":
    asyncio.run(main())