Files
Spanish/Conjuga/Scripts/scrape_examples.py
Trey t 4b467ec136 Initial commit: Conjuga Spanish conjugation app
Includes SwiftData dual-store architecture (local reference + CloudKit user data),
JSON-based data seeding, 20 tense guides, 20 grammar notes, SRS review system,
course vocabulary, and widget support.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 20:58:33 -05:00

167 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
Scrape 2-3 example sentences per vocab word from SpanishDict.
Reads words from course_data.json, outputs examples to course_examples.json.
"""
import asyncio
import json
import re
import os
from playwright.async_api import async_playwright
# Absolute paths: course vocabulary input and scraped-examples output.
INPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_data.json"
OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_examples.json"
# Maximum number of example sentences to keep per vocab word.
MAX_EXAMPLES = 3
def extract_word_for_lookup(front):
    """Pick the single dictionary headword out of a card front.

    e.g. 'barato, barata' -> 'barato'
    e.g. 'el/la periodista' -> 'periodista'
    """
    word = front.strip()
    # Strip a leading article, first the plain forms then the gendered-pair forms.
    for article_pattern in (r'^(el|la|los|las|un|una)\s+', r'^(el/la|los/las)\s+'):
        word = re.sub(article_pattern, '', word, flags=re.IGNORECASE)
    # When alternatives are listed ('barato, barata' or 'barato/a'),
    # keep only the first variant; comma takes precedence over slash.
    for separator in (',', '/'):
        if separator in word:
            word = word.split(separator)[0].strip()
    return word.lower().strip()
def parse_examples(text, lookup_word):
    """Parse example sentences from SpanishDict page text.

    Returns a list of at most MAX_EXAMPLES dicts: {"es": ..., "en": ...}.
    Only sentences that contain lookup_word (case-insensitive) are kept.
    """
    examples = []
    lines = text.split('\n')
    for i, line in enumerate(lines):
        l = line.strip()
        # Skip blanks and fragments too short to be a real sentence.
        if not l or len(l) < 15:
            continue
        # Pattern: "Spanish sentence.English sentence." (inline on one line)
        # SpanishDict puts them together with no space between period and capital
        # e.g. "Esta tienda es muy barata.This store is really cheap."
        inline_match = re.match(r'^(.+?[.!?])([A-Z].+)$', l)
        if inline_match:
            es = inline_match.group(1).strip()
            en = inline_match.group(2).strip()
            # Verify it contains our word (case-insensitive)
            if lookup_word.lower() in es.lower() and len(es) > 10 and len(en) > 5:
                examples.append({"es": es, "en": en})
                if len(examples) >= MAX_EXAMPLES:
                    break
            # Inline-matched lines never fall through to the two-line pattern.
            continue
        # Pattern: standalone Spanish sentence with word, followed by English on next line
        if lookup_word.lower() in l.lower() and len(l) > 15 and len(l) < 300:
            # Check if next non-empty line is English
            for j in range(i + 1, min(i + 3, len(lines))):
                next_l = lines[j].strip()
                if not next_l:
                    continue
                # Check if it looks English (starts with capital, has common English words)
                # NOTE(review): heuristic actually tests "no Spanish-only characters",
                # not English vocabulary — confirm it is good enough in practice.
                if (next_l[0].isupper() and
                        not any(c in next_l for c in ['á', 'é', 'í', 'ó', 'ú', 'ñ', '¿', '¡'])):
                    examples.append({"es": l, "en": next_l})
                    if len(examples) >= MAX_EXAMPLES:
                        break
                # Only the first non-empty follower is considered either way.
                break
            if len(examples) >= MAX_EXAMPLES:
                break
    return examples
async def scrape_word(page, word, lookup):
    """Scrape up to MAX_EXAMPLES example sentences for one vocab word.

    Args:
        page: an open Playwright page to navigate with.
        word: the original card front (kept for interface compatibility;
              not used for the lookup itself).
        lookup: normalized headword used to build the SpanishDict URL.

    Returns:
        A list of {"es": ..., "en": ...} dicts, or [] on any failure —
        a timed-out or broken page load is treated as "no examples".
    """
    url = f"https://www.spanishdict.com/translate/{lookup}"
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=15000)
        # Give client-side rendering a moment to populate the example section.
        await page.wait_for_timeout(2000)
        text = await page.inner_text("body")
        return parse_examples(text, lookup)
    except Exception:
        # Best-effort scrape: swallow navigation/timeout errors deliberately
        # so one bad word cannot abort the whole run.
        return []
async def main():
    """Scrape SpanishDict examples for every unique course word.

    Reads course_data.json, resumes from an existing OUTPUT file if present,
    checkpoints every 20 words, and writes a {front: [{"es", "en"}, ...]}
    mapping to OUTPUT.
    """
    # Load course data
    with open(INPUT) as f:
        data = json.load(f)

    # Collect unique words (front values from non-reversed decks)
    words = {}  # lookup -> original front
    for week in data['weeks']:
        for deck in week['decks']:
            if deck.get('isReversed'):
                continue
            for card in deck['cards']:
                front = card['front']
                lookup = extract_word_for_lookup(front)
                if lookup and lookup not in words:
                    words[lookup] = front
    print(f"Found {len(words)} unique words to look up")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        ctx = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        )
        page = await ctx.new_page()

        # Load existing progress if any, so re-runs skip finished words.
        results = {}
        if os.path.exists(OUTPUT):
            with open(OUTPUT) as f:
                results = json.load(f)
            print(f"Loaded {len(results)} existing results")

        # Fix: count only entries that actually have examples. Previously
        # `found = len(results)` also counted resumed empty entries, so the
        # final "{found}/{total} words with examples" summary over-reported.
        found = sum(1 for examples in results.values() if examples)
        total = len(words)
        for i, (lookup, original) in enumerate(words.items()):
            # Skip already scraped
            if original in results:
                continue
            print(f"[{i+1}/{total}] {lookup}...", end=" ", flush=True)
            try:
                examples = await scrape_word(page, original, lookup)
                if examples:
                    results[original] = examples
                    found += 1
                    print(f"{len(examples)} examples")
                else:
                    results[original] = []
                    print("no examples")
            except Exception as e:
                # Record the failure as "no examples" and keep going.
                print(f"error: {e}")
                results[original] = []
            # Save progress every 20 words
            if (i + 1) % 20 == 0:
                with open(OUTPUT, 'w', encoding='utf-8') as f:
                    json.dump(results, f, ensure_ascii=False, indent=2)
                print(f" [saved {len(results)} results]")
            # Brief pause between requests to avoid hammering the site.
            await page.wait_for_timeout(300)
        await browser.close()

    # Save results
    with open(OUTPUT, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nDone! {found}/{total} words with examples → {OUTPUT}")
# Entry point: run the scraper only when executed directly as a script.
if __name__ == "__main__":
    asyncio.run(main())