#!/usr/bin/env python3
"""
Scrape all 39 LanGo Spanish Beginner I decks from Brainscape using Playwright.

Outputs course_data.json with all decks and cards organized by week.
"""
import asyncio
import json
import re

from playwright.async_api import async_playwright

BASE_URL = "https://www.brainscape.com"
PACK_ID = "18164266"
OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_data.json"

DECK_URLS = [
    "/flashcards/week-1-greetings-los-saludos-10176532/packs/18164266",
    "/flashcards/week-1-greetings-los-saludos-al-reves-12745728/packs/18164266",
    "/flashcards/week-2-adjectives-los-adjetivos-12745741/packs/18164266",
    "/flashcards/week-2-adjectives-los-adjetivos-al-reves-12745829/packs/18164266",
    "/flashcards/week-2-numbers-los-numeros-12797877/packs/18164266",
    "/flashcards/week-2-numbers-los-numeros-al-reves-13698219/packs/18164266",
    "/flashcards/week-2-professions-las-profesiones-12740531/packs/18164266",
    "/flashcards/week-2-professions-las-profesiones-al-re-12745832/packs/18164266",
    "/flashcards/week-3-house-la-casa-10216249/packs/18164266",
    "/flashcards/week-3-house-la-casa-al-reves-12745837/packs/18164266",
    "/flashcards/week-3-ar-verbs-10207117/packs/18164266",
    "/flashcards/week-3-ar-verbs-al-reves-12745833/packs/18164266",
    "/flashcards/week-3-er-verbs-12745857/packs/18164266",
    "/flashcards/week-3-er-verbs-al-reves-12745888/packs/18164266",
    "/flashcards/week-3-ir-verbs-10207120/packs/18164266",
    "/flashcards/week-3-ir-verbs-al-reves-12745835/packs/18164266",
    "/flashcards/week-4-family-la-familia-10266419/packs/18164266",
    "/flashcards/week-4-family-la-familia-al-reves-12745978/packs/18164266",
    "/flashcards/week-4-e-ie-stem-changing-verbs-10270069/packs/18164266",
    "/flashcards/week-4-e-ie-stem-changing-verbs-al-reves-12749152/packs/18164266",
    "/flashcards/week-4-e-i-stem-changing-verbs-10270070/packs/18164266",
    "/flashcards/week-4-e-i-stem-changing-verbs-al-reves-12749160/packs/18164266",
    "/flashcards/week-4-o-ue-stem-changing-verbs-10270071/packs/18164266",
    "/flashcards/week-4-o-ue-stem-changing-verbs-al-reves-12749172/packs/18164266",
    "/flashcards/week-4-exceptional-yo-forms-10286213/packs/18164266",
    "/flashcards/week-4-exceptional-yo-forms-al-reves-12749234/packs/18164266",
    "/flashcards/week-5-reflexive-verbs-los-verbos-reflex-10270072/packs/18164266",
    "/flashcards/week-5-reflexive-verbs-los-verbos-reflex-12745842/packs/18164266",
    "/flashcards/week-5-daily-routine-la-rutina-cotidiana-11869082/packs/18164266",
    "/flashcards/week-5-daily-routine-la-rutina-cotidiana-12745840/packs/18164266",
    "/flashcards/week-6-city-la-ciudad-10232784/packs/18164266",
    "/flashcards/week-6-city-la-ciudad-al-reves-12745942/packs/18164266",
    "/flashcards/week-6-time-expressions-las-expresiones-12797878/packs/18164266",
    "/flashcards/week-6-time-expressions-las-expresiones-13698220/packs/18164266",
    "/flashcards/week-7-idioms-with-the-verb-tener-los-mo-11951594/packs/18164266",
    "/flashcards/week-8-prepositions-and-negation-las-pre-11951441/packs/18164266",
    "/flashcards/week-8-prepositions-and-negation-las-pre-16094943/packs/18164266",
    "/flashcards/week-8-hobbies-los-pasatiempos-10232782/packs/18164266",
    "/flashcards/week-8-hobbies-los-pasatiempos-al-reves-12745838/packs/18164266",
]

# Compiled once at import time; these patterns run once per text line in
# parse_cards' inner loop, so hoisting avoids re-cache lookups per line.
_WEEK_TITLE_RE = re.compile(r'Week\s+(\d+):\s*(.+)', re.IGNORECASE)
_CARD_NUM_RE = re.compile(r'^\d+$')
_DECK_TITLE_RE = re.compile(r'^Week \d+:')
_AL_REVES_SUFFIX_RE = re.compile(r'-al-rev(e|é)s$')


def parse_title_and_week(text):
    """Extract (week_number, clean_title) from a "Week N: Title" string.

    Returns (0, text.strip()) when the "Week N:" prefix is absent.
    """
    m = _WEEK_TITLE_RE.match(text)
    if m:
        return int(m.group(1)), m.group(2).strip()
    return 0, text.strip()


def parse_cards(text):
    """Parse flashcard Q/A pairs from the full body text of a deck page.

    Brainscape renders each card as a numbered block: a line holding the
    card number, followed by the front and back text on subsequent lines,
    interleaved with site-chrome noise ("Q", "A", nav links, ...).

    Returns a list of {"front": ..., "back": ...} dicts.
    """
    cards = []
    lines = text.split('\n')
    # Site-chrome lines that may appear between a card number and its content.
    skip = {'Q', 'A', 'Study These Flashcards', '', 'Brainscape',
            'Find Flashcards', 'Make Flashcards', 'How It Works',
            'Educators', 'Businesses', 'Academy', 'Log in', 'Get Started'}
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if _CARD_NUM_RE.match(line):
            num = int(line)
            # Collect up to 6 content lines until the next card number
            # or the start of the deck-list / footer section.
            parts = []
            j = i + 1
            while j < len(lines) and len(parts) < 6:
                nextline = lines[j].strip()
                # Stop at the *next* sequential card number only; other bare
                # integers may be legitimate card content (e.g. numbers deck).
                if _CARD_NUM_RE.match(nextline) and int(nextline) == num + 1:
                    break
                # Stop at deck list / footer.
                if nextline.startswith('LanGo Spanish') or nextline.startswith('Decks in class'):
                    break
                # Stop at other deck titles leaking into the body text.
                if _DECK_TITLE_RE.match(nextline):
                    break
                if nextline in skip:
                    j += 1
                    continue
                parts.append(nextline)
                j += 1
            # First content line is the front, second the back; anything
            # after that is hint/extra text we deliberately drop.
            if len(parts) >= 2:
                cards.append({
                    "front": parts[0],
                    "back": parts[1],
                })
            i = j
        else:
            i += 1
    # Post-filter: remove any "cards" that are actually deck titles or
    # chrome that slipped past the inner-loop guards.
    cards = [c for c in cards
             if not _DECK_TITLE_RE.match(c['front'])
             and c['front'] not in ('Decks in class (39)', '# Cards')
             and not c['front'].startswith('LanGo Spanish')
             and not c['front'].startswith('You may prefer')]
    return cards


async def scrape_deck(page, url):
    """Scrape a single deck page and return its metadata plus parsed cards.

    Returns a dict with keys: week, title, isReversed, cardCount, cards, url.
    """
    full_url = BASE_URL + url
    await page.goto(full_url, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)
    # Scroll to load lazy content.
    for _ in range(5):
        await page.evaluate("window.scrollBy(0, 1000)")
        await page.wait_for_timeout(300)
    text = await page.inner_text("body")

    # Extract title — try multiple patterns.
    # Breadcrumb format: "LanGo Spanish | Beginner I > Week N: Title > Flashcards"
    title_match = re.search(r'>\s*(Week\s+\d+:.+?)\s*>\s*Flashcards', text)
    if title_match:
        raw_title = title_match.group(1).strip()
    else:
        # Heading format: "Week N: Title (Subtitle) Flashcards"
        heading_match = re.search(r'(Week\s+\d+:.+?)\s*Flashcards', text)
        if heading_match:
            raw_title = heading_match.group(1).strip()
        else:
            # Last resort: reconstruct a title from the URL slug, e.g.
            # "week-5-reflexive-verbs-los-verbos-reflex-10270072".
            slug = url.split('/')[2]
            slug_clean = re.sub(r'-\d+$', '', slug)  # drop trailing deck ID
            is_reversed_slug = _AL_REVES_SUFFIX_RE.search(slug_clean) is not None
            slug_clean = _AL_REVES_SUFFIX_RE.sub('', slug_clean)
            raw_title = slug_clean.replace('-', ' ').title()
            # BUGFIX: append the suffix *after* .title(), which would
            # otherwise downcase it to "Al Revés".
            if is_reversed_slug:
                raw_title += ' AL REVÉS'
            # BUGFIX: slug titles read "Week N Foo" (no colon), which
            # parse_title_and_week cannot match; normalize to "Week N: Foo".
            wm = re.match(r'Week\s+(\d+):?\s*(.*)', raw_title, re.IGNORECASE)
            if wm:
                raw_title = f"Week {wm.group(1)}: {wm.group(2).strip()}"
            else:
                raw_title = "Week 0: " + raw_title

    week, title = parse_title_and_week(raw_title)
    cards = parse_cards(text)
    is_reversed = "al rev" in url.lower() or "AL REVÉS" in raw_title.upper()
    return {
        "week": week,
        "title": title,
        "isReversed": is_reversed,
        "cardCount": len(cards),
        "cards": cards,
        "url": url,
    }


async def main():
    """Scrape every deck in DECK_URLS and write course_data.json."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        )
        page = await context.new_page()
        all_decks = []
        total_cards = 0
        for i, url in enumerate(DECK_URLS):
            print(f"[{i+1}/{len(DECK_URLS)}] Scraping {url.split('/')[2][:50]}...")
            try:
                deck = await scrape_deck(page, url)
                all_decks.append(deck)
                total_cards += deck["cardCount"]
                print(f"  → Week {deck['week']}: {deck['title']} ({deck['cardCount']} cards)")
            except Exception as e:
                # Boundary handler: log and keep going so one bad deck
                # doesn't abort the whole run.
                print(f"  ERROR: {e}")
            # Be polite to the server between requests.
            await page.wait_for_timeout(500)
        await browser.close()

    # Organize decks by week number.
    weeks = {}
    for deck in all_decks:
        weeks.setdefault(deck["week"], []).append({
            "title": deck["title"],
            "isReversed": deck["isReversed"],
            "cardCount": deck["cardCount"],
            "cards": deck["cards"],
        })

    output = {
        "course": "LanGo Spanish | Beginner I",
        "totalDecks": len(all_decks),
        "totalCards": total_cards,
        "weeks": [
            {
                "week": w,
                "decks": weeks[w],
            }
            for w in sorted(weeks.keys())
        ],
    }
    with open(OUTPUT, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"\nDone! {len(all_decks)} decks, {total_cards} cards → {OUTPUT}")


if __name__ == "__main__":
    asyncio.run(main())