#!/usr/bin/env python3
"""
Scrape 7 LanGo Spanish course packs from Brainscape, plus example sentences
from SpanishDict.

Outputs all_courses_data.json with all courses, decks, cards, and examples
organized by week.
"""
import asyncio
import json
import re
import os

from playwright.async_api import async_playwright

BASE_URL = "https://www.brainscape.com"
OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/all_courses_data.json"
MAX_EXAMPLES = 3

PACK_URLS = [
    "https://www.brainscape.com/packs/lango-spanish-beginner-ii-16514996",
    "https://www.brainscape.com/packs/lango-spanish-beginner-iii-conversation-18477688",
    "https://www.brainscape.com/packs/lango-spanish-intermediate-i-21508666",
    "https://www.brainscape.com/packs/lango-spanish-intermediate-ii-21906841",
    "https://www.brainscape.com/packs/lango-spanish-intermediate-iii-spanish-through-stories-20677744",
    "https://www.brainscape.com/packs/lango-spanish-advanced-i-21511244",
    "https://www.brainscape.com/packs/lango-spanish-advanced-ii-21649461",
]

USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)


# ---------------------------------------------------------------------------
# Parsing helpers (copied from scrape_brainscape.py and scrape_examples.py)
# ---------------------------------------------------------------------------

def parse_title_and_week(text):
    """Extract (week_number, clean_title) from a deck title string.

    Matches "Week N: Title", "Semana N: Title", or "Semana N Title"
    (case-insensitive). Returns (0, stripped_text) when no week prefix
    is present.
    """
    m = re.match(r'(?:Week|Semana)\s+(\d+)[:\s]+(.+)', text, re.IGNORECASE)
    if m:
        return int(m.group(1)), m.group(2).strip()
    return 0, text.strip()


def parse_cards(text):
    """Parse flashcard Q/A pairs out of a Brainscape deck page's body text.

    The page renders cards as a numbered list: a line containing only the
    card number, followed by front/back text lines interleaved with UI
    chrome ("Q", "A", nav links...). We collect up to 6 content lines per
    card number and keep the first two as front/back.
    """
    cards = []
    lines = text.split('\n')
    # UI chrome lines that must never be treated as card content.
    skip = {'Q', 'A', 'Study These Flashcards', '', 'Brainscape',
            'Find Flashcards', 'Make Flashcards', 'How It Works',
            'Educators', 'Businesses', 'Academy', 'Log in', 'Get Started'}
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if re.match(r'^\d+$', line):
            num = int(line)
            parts = []
            j = i + 1
            while j < len(lines) and len(parts) < 6:
                nextline = lines[j].strip()
                # The next card starts at the line holding num + 1.
                if re.match(r'^\d+$', nextline) and int(nextline) == num + 1:
                    break
                # Footer / sidebar content marks the end of the card list.
                if nextline.startswith('LanGo Spanish') or nextline.startswith('Decks in class'):
                    break
                if re.match(r'^(?:Week|Semana) \d+', nextline):
                    break
                if nextline in skip:
                    j += 1
                    continue
                parts.append(nextline)
                j += 1
            if len(parts) >= 2:
                cards.append({
                    "front": parts[0],
                    "back": parts[1],
                })
            i = j
        else:
            i += 1
    # Drop false positives picked up from navigation / deck listings.
    cards = [c for c in cards
             if not re.match(r'^(?:Week|Semana) \d+', c['front'])
             and c['front'] not in ('Decks in class (39)', '# Cards')
             and not c['front'].startswith('LanGo Spanish')
             and not c['front'].startswith('You may prefer')]
    return cards


def extract_word_for_lookup(front):
    """Extract the best SpanishDict lookup word from a card front.

    Strips leading articles ("el", "la", "el/la", ...), then keeps only the
    first alternative before any comma or slash, lowercased.
    """
    word = front.strip()
    word = re.sub(r'^(el|la|los|las|un|una)\s+', '', word, flags=re.IGNORECASE)
    word = re.sub(r'^(el/la|los/las)\s+', '', word, flags=re.IGNORECASE)
    if ',' in word:
        word = word.split(',')[0].strip()
    if '/' in word:
        word = word.split('/')[0].strip()
    return word.lower().strip()


def parse_examples(text, lookup_word):
    """Parse up to MAX_EXAMPLES example sentence pairs from SpanishDict text.

    Two layouts are handled:
    * Spanish and English fused on one line ("Frase española.English phrase"),
      split at sentence-ending punctuation followed by a capital letter.
    * Spanish line followed within two lines by an English line (heuristic:
      starts uppercase and contains no Spanish-specific characters).

    Returns a list of {"es": ..., "en": ...} dicts.
    """
    examples = []
    lines = text.split('\n')
    for i, line in enumerate(lines):
        l = line.strip()
        if not l or len(l) < 15:
            continue
        inline_match = re.match(r'^(.+?[.!?])([A-Z].+)$', l)
        if inline_match:
            es = inline_match.group(1).strip()
            en = inline_match.group(2).strip()
            if lookup_word.lower() in es.lower() and len(es) > 10 and len(en) > 5:
                examples.append({"es": es, "en": en})
                if len(examples) >= MAX_EXAMPLES:
                    break
            continue
        if lookup_word.lower() in l.lower() and len(l) > 15 and len(l) < 300:
            for j in range(i + 1, min(i + 3, len(lines))):
                next_l = lines[j].strip()
                if not next_l:
                    continue
                # Likely an English translation: uppercase start, no
                # Spanish accents or inverted punctuation.
                if (next_l[0].isupper()
                        and not any(c in next_l
                                    for c in ['á', 'é', 'í', 'ó', 'ú', 'ñ', '¿', '¡'])):
                    examples.append({"es": l, "en": next_l})
                    break
            if len(examples) >= MAX_EXAMPLES:
                break
    return examples


# ---------------------------------------------------------------------------
# Scraping logic
# ---------------------------------------------------------------------------

async def discover_deck_urls(page, pack_url):
    """Visit a pack page and discover all deck URLs within it.

    Returns (course_name, deck_urls) where deck_urls are site-relative paths.
    """
    print(f"\nDiscovering decks in {pack_url}...")
    await page.goto(pack_url, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)

    # Scroll to trigger lazy-loading of the full deck list.
    for _ in range(10):
        await page.evaluate("window.scrollBy(0, 1000)")
        await page.wait_for_timeout(300)

    # Find all deck links matching /flashcards/*/packs/*
    links = await page.eval_on_selector_all(
        'a[href*="/flashcards/"]',
        'els => els.map(e => e.getAttribute("href"))'
    )
    deck_urls = []
    seen = set()
    for href in links:
        if href and '/flashcards/' in href and '/packs/' in href:
            # Normalize absolute URLs to site-relative paths.
            if href.startswith('http'):
                href = href.replace(BASE_URL, '')
            if href not in seen:
                seen.add(href)
                deck_urls.append(href)

    # Extract course name from the page: try "LanGo Spanish | ..." pattern.
    text = await page.inner_text("body")
    m = re.search(r'(LanGo Spanish\s*\|\s*[^>\n]+)', text)
    if m:
        course_name = m.group(1).strip()
        # Clean trailing breadcrumb/noise.
        course_name = re.sub(r'\s*>\s*$', '', course_name).strip()
        course_name = re.sub(r'\s*Flashcards\s*$', '', course_name).strip()
    else:
        # Fallback: derive from URL slug (strip trailing numeric id).
        slug = pack_url.rstrip('/').split('/')[-1]
        slug = re.sub(r'-\d+$', '', slug)
        course_name = slug.replace('-', ' ').title()

    print(f"  Course: {course_name}")
    print(f"  Found {len(deck_urls)} deck URLs")
    return course_name, deck_urls


async def scrape_deck(page, url):
    """Scrape a single deck page and return its metadata and cards.

    Returns a dict with week, title, isReversed, cardCount, cards, url.
    """
    full_url = BASE_URL + url if url.startswith('/') else url
    await page.goto(full_url, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)
    for _ in range(5):
        await page.evaluate("window.scrollBy(0, 1000)")
        await page.wait_for_timeout(300)

    text = await page.inner_text("body")

    # Extract title — handle both "Week N:" and "Semana N" patterns,
    # first from the breadcrumb ("> Week N: ... > Flashcards"), then from
    # any heading, finally falling back to the URL slug.
    title_match = re.search(r'>\s*((?:Week|Semana)\s+\d+[:\s].+?)\s*>\s*Flashcards', text)
    if title_match:
        raw_title = title_match.group(1).strip()
    else:
        heading_match = re.search(r'((?:Week|Semana)\s+\d+[:\s].+?)\s*Flashcards', text)
        if heading_match:
            raw_title = heading_match.group(1).strip()
        else:
            slug = url.split('/')[2] if len(url.split('/')) > 2 else url
            slug_clean = re.sub(r'-\d+$', '', slug)
            slug_clean = re.sub(r'-al-rev(e|é)s$', ' AL REVÉS', slug_clean)
            raw_title = slug_clean.replace('-', ' ').title()

    # FIX: the guard previously matched only "Week N", so Spanish
    # "Semana N: ..." titles were wrapped in "Week 0: " and their week
    # number was lost (parse_title_and_week then returned week 0).
    wm = re.match(r'(?:Week|Semana)\s+(\d+)', raw_title, re.IGNORECASE)
    if not wm:
        raw_title = "Week 0: " + raw_title

    week, title = parse_title_and_week(raw_title)
    cards = parse_cards(text)
    is_reversed = "al rev" in url.lower() or "AL REVÉS" in raw_title.upper()
    return {
        "week": week,
        "title": title,
        "isReversed": is_reversed,
        "cardCount": len(cards),
        "cards": cards,
        "url": url,
    }


async def scrape_examples_for_word(page, lookup):
    """Scrape example sentences from SpanishDict for a single word.

    Best-effort: returns [] on any navigation/parsing failure.
    """
    url = f"https://www.spanishdict.com/translate/{lookup}"
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=15000)
        await page.wait_for_timeout(2000)
        text = await page.inner_text("body")
        return parse_examples(text, lookup)
    except Exception:
        return []


def save_progress(data):
    """Save current data to the output file as pretty-printed UTF-8 JSON."""
    with open(OUTPUT, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def load_progress():
    """Load existing progress if available; None on missing/corrupt file.

    FIX: catches OSError (file unreadable/race) instead of KeyError, which
    json.load never raises.
    """
    if os.path.exists(OUTPUT):
        try:
            with open(OUTPUT) as f:
                return json.load(f)
        except (json.JSONDecodeError, OSError):
            pass
    return None


async def main():
    # ---------------------------------------------------------------
    # Resume support.
    # FIX: the previous version only honored courses whose
    # '_examples_done' flag was True — but that flag was set True and
    # immediately popped before the final save, so resume never fired
    # and mid-run saves were never reused. A course is saved right
    # after its cards are scraped, so its presence in the file means
    # Phase 1 is done for it; saved examples are harvested from any
    # course.
    # ---------------------------------------------------------------
    existing = load_progress()
    all_courses = []
    scraped_courses = set()  # course names whose cards are already scraped
    examples_done = {}       # lookup word -> list of {"es", "en"} examples

    if existing and 'courses' in existing:
        for course in existing['courses']:
            scraped_courses.add(course['course'])
            all_courses.append(course)
            for week in course.get('weeks', []):
                for deck in week.get('decks', []):
                    for card in deck.get('cards', []):
                        if card.get('examples'):
                            lookup = extract_word_for_lookup(card['front'])
                            examples_done[lookup] = card['examples']

    print(f"Loaded progress: {len(scraped_courses)} completed courses, "
          f"{len(examples_done)} words with examples")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent=USER_AGENT)
        page = await context.new_page()

        # ---------------------------------------------------------------
        # Phase 1: Discover decks and scrape cards for each course pack
        # ---------------------------------------------------------------
        for pack_url in PACK_URLS:
            course_name, deck_urls = await discover_deck_urls(page, pack_url)

            if course_name in scraped_courses:
                print(f"  Skipping {course_name} (already completed)")
                continue
            await page.wait_for_timeout(300)

            all_decks = []
            total_cards = 0
            for i, deck_url in enumerate(deck_urls):
                slug = deck_url.split('/')[2] if len(deck_url.split('/')) > 2 else deck_url
                print(f"  [{i+1}/{len(deck_urls)}] Scraping {slug[:60]}...")
                try:
                    deck = await scrape_deck(page, deck_url)
                    all_decks.append(deck)
                    total_cards += deck["cardCount"]
                    print(f"    -> Week {deck['week']}: {deck['title']} "
                          f"({deck['cardCount']} cards)")
                except Exception as e:
                    print(f"    ERROR: {e}")
                await page.wait_for_timeout(300)

            # Organize decks by week number.
            weeks = {}
            for deck in all_decks:
                weeks.setdefault(deck["week"], []).append({
                    "title": deck["title"],
                    "isReversed": deck["isReversed"],
                    "cardCount": deck["cardCount"],
                    "cards": deck["cards"],
                })

            course_data = {
                "course": course_name,
                "totalDecks": len(all_decks),
                "totalCards": total_cards,
                "_examples_done": False,  # internal flag, removed in final save
                "weeks": [
                    {"week": w, "decks": weeks[w]} for w in sorted(weeks.keys())
                ],
            }
            all_courses.append(course_data)

            # Save after each course so an interrupted run can resume here.
            save_progress({"courses": all_courses})
            print(f"  Saved {course_name}: {len(all_decks)} decks, {total_cards} cards")

        # ---------------------------------------------------------------
        # Phase 2: Scrape example sentences from SpanishDict
        # ---------------------------------------------------------------
        print("\n" + "=" * 60)
        print("Phase 2: Scraping example sentences from SpanishDict")
        print("=" * 60)

        # Collect all unique lookup words across all courses
        # (non-reversed decks only — reversed decks repeat the same words).
        unique_words = {}  # lookup -> original card front
        for course in all_courses:
            for week in course['weeks']:
                for deck in week['decks']:
                    if deck.get('isReversed'):
                        continue
                    for card in deck['cards']:
                        lookup = extract_word_for_lookup(card['front'])
                        if lookup and lookup not in unique_words:
                            unique_words[lookup] = card['front']

        print(f"Found {len(unique_words)} unique words to look up")
        print(f"Already have examples for {len(examples_done)} words")

        words_scraped = 0
        total_words = len(unique_words)
        for i, lookup in enumerate(unique_words):
            if lookup in examples_done:
                continue
            print(f"[{i+1}/{total_words}] {lookup}...", end=" ", flush=True)
            try:
                examples = await scrape_examples_for_word(page, lookup)
                examples_done[lookup] = examples
                if examples:
                    print(f"{len(examples)} examples")
                else:
                    print("no examples")
            except Exception as e:
                print(f"error: {e}")
                examples_done[lookup] = []
            words_scraped += 1

            # Save progress every 20 words so example work survives restarts.
            if words_scraped % 20 == 0:
                _attach_examples(all_courses, examples_done)
                save_progress({"courses": all_courses})
                print(f"  [saved progress - {len(examples_done)} words done]")
            await page.wait_for_timeout(300)

        await browser.close()

    # ---------------------------------------------------------------
    # Final: attach all examples to cards and save.
    # (FIX: removed dead code that set '_examples_done' True on every
    # course and then immediately popped it.)
    # ---------------------------------------------------------------
    _attach_examples(all_courses, examples_done)
    for course in all_courses:
        course.pop('_examples_done', None)  # strip internal flag from output
    save_progress({"courses": all_courses})

    total_decks = sum(c['totalDecks'] for c in all_courses)
    total_cards = sum(c['totalCards'] for c in all_courses)
    print(f"\nDone! {len(all_courses)} courses, {total_decks} decks, {total_cards} cards")
    print(f"Examples scraped for {len(examples_done)} unique words")
    print(f"Output: {OUTPUT}")


def _attach_examples(courses, examples_done):
    """Attach scraped examples to card objects in place.

    Cards whose lookup word has examples get them; all other cards get an
    empty 'examples' list if they don't already have one.
    """
    for course in courses:
        for week in course['weeks']:
            for deck in week['decks']:
                for card in deck['cards']:
                    lookup = extract_word_for_lookup(card['front'])
                    if lookup in examples_done and examples_done[lookup]:
                        card['examples'] = examples_done[lookup]
                    elif 'examples' not in card:
                        card['examples'] = []


if __name__ == "__main__":
    asyncio.run(main())