Initial commit: Conjuga Spanish conjugation app
Includes SwiftData dual-store architecture (local reference + CloudKit user data), JSON-based data seeding, 20 tense guides, 20 grammar notes, SRS review system, course vocabulary, and widget support. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
238
Conjuga/Scripts/scrape_brainscape.py
Normal file
238
Conjuga/Scripts/scrape_brainscape.py
Normal file
@@ -0,0 +1,238 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Scrape all 39 LanGo Spanish Beginner I decks from Brainscape using Playwright.
|
||||
Outputs course_data.json with all decks and cards organized by week.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
# Brainscape site root; each deck path below is appended to this.
BASE_URL = "https://www.brainscape.com"
# Brainscape pack (class) ID embedded in every deck URL below.
PACK_ID = "18164266"
# Absolute destination path for the scraped JSON course data.
OUTPUT = "/Users/treyt/Desktop/code/Spanish/Conjuga/Scripts/course_data.json"

# Deck paths (relative to BASE_URL) for the 39 decks, listed in week order.
# "al-reves" slugs are the reversed-direction variants; scrape_deck flags
# them via its isReversed field.
DECK_URLS = [
    "/flashcards/week-1-greetings-los-saludos-10176532/packs/18164266",
    "/flashcards/week-1-greetings-los-saludos-al-reves-12745728/packs/18164266",
    "/flashcards/week-2-adjectives-los-adjetivos-12745741/packs/18164266",
    "/flashcards/week-2-adjectives-los-adjetivos-al-reves-12745829/packs/18164266",
    "/flashcards/week-2-numbers-los-numeros-12797877/packs/18164266",
    "/flashcards/week-2-numbers-los-numeros-al-reves-13698219/packs/18164266",
    "/flashcards/week-2-professions-las-profesiones-12740531/packs/18164266",
    "/flashcards/week-2-professions-las-profesiones-al-re-12745832/packs/18164266",
    "/flashcards/week-3-house-la-casa-10216249/packs/18164266",
    "/flashcards/week-3-house-la-casa-al-reves-12745837/packs/18164266",
    "/flashcards/week-3-ar-verbs-10207117/packs/18164266",
    "/flashcards/week-3-ar-verbs-al-reves-12745833/packs/18164266",
    "/flashcards/week-3-er-verbs-12745857/packs/18164266",
    "/flashcards/week-3-er-verbs-al-reves-12745888/packs/18164266",
    "/flashcards/week-3-ir-verbs-10207120/packs/18164266",
    "/flashcards/week-3-ir-verbs-al-reves-12745835/packs/18164266",
    "/flashcards/week-4-family-la-familia-10266419/packs/18164266",
    "/flashcards/week-4-family-la-familia-al-reves-12745978/packs/18164266",
    "/flashcards/week-4-e-ie-stem-changing-verbs-10270069/packs/18164266",
    "/flashcards/week-4-e-ie-stem-changing-verbs-al-reves-12749152/packs/18164266",
    "/flashcards/week-4-e-i-stem-changing-verbs-10270070/packs/18164266",
    "/flashcards/week-4-e-i-stem-changing-verbs-al-reves-12749160/packs/18164266",
    "/flashcards/week-4-o-ue-stem-changing-verbs-10270071/packs/18164266",
    "/flashcards/week-4-o-ue-stem-changing-verbs-al-reves-12749172/packs/18164266",
    "/flashcards/week-4-exceptional-yo-forms-10286213/packs/18164266",
    "/flashcards/week-4-exceptional-yo-forms-al-reves-12749234/packs/18164266",
    "/flashcards/week-5-reflexive-verbs-los-verbos-reflex-10270072/packs/18164266",
    "/flashcards/week-5-reflexive-verbs-los-verbos-reflex-12745842/packs/18164266",
    "/flashcards/week-5-daily-routine-la-rutina-cotidiana-11869082/packs/18164266",
    "/flashcards/week-5-daily-routine-la-rutina-cotidiana-12745840/packs/18164266",
    "/flashcards/week-6-city-la-ciudad-10232784/packs/18164266",
    "/flashcards/week-6-city-la-ciudad-al-reves-12745942/packs/18164266",
    "/flashcards/week-6-time-expressions-las-expresiones-12797878/packs/18164266",
    "/flashcards/week-6-time-expressions-las-expresiones-13698220/packs/18164266",
    "/flashcards/week-7-idioms-with-the-verb-tener-los-mo-11951594/packs/18164266",
    "/flashcards/week-8-prepositions-and-negation-las-pre-11951441/packs/18164266",
    "/flashcards/week-8-prepositions-and-negation-las-pre-16094943/packs/18164266",
    "/flashcards/week-8-hobbies-los-pasatiempos-10232782/packs/18164266",
    "/flashcards/week-8-hobbies-los-pasatiempos-al-reves-12745838/packs/18164266",
]
|
||||
|
||||
|
||||
def parse_title_and_week(text):
    """Extract week number and clean title from page text.

    Returns (week, title); week is 0 when no "Week N:" prefix is found.
    """
    match = re.match(r'Week\s+(\d+):\s*(.+)', text, re.IGNORECASE)
    if match is None:
        # No recognizable week prefix — keep the whole text as the title.
        return 0, text.strip()
    week_number, title = match.groups()
    return int(week_number), title.strip()
|
||||
|
||||
|
||||
def parse_cards(text):
    """Parse flashcard Q/A pairs from page text.

    Cards are detected as a bare number line followed by content lines;
    the first two content lines become the front/back of the card.
    """
    # Boilerplate lines that never belong to card content.
    NOISE = {'Q', 'A', 'Study These Flashcards', '', 'Brainscape', 'Find Flashcards',
             'Make Flashcards', 'How It Works', 'Educators', 'Businesses', 'Academy',
             'Log in', 'Get Started'}

    rows = text.split('\n')
    cards = []

    idx = 0
    while idx < len(rows):
        current = rows[idx].strip()

        # A bare integer marks the start of a card.
        if not re.match(r'^\d+$', current):
            idx += 1
            continue

        card_no = int(current)
        # Gather up to 6 content lines until the next card or page footer.
        body = []
        cursor = idx + 1
        while cursor < len(rows) and len(body) < 6:
            candidate = rows[cursor].strip()

            # The next sequential number begins the following card.
            if re.match(r'^\d+$', candidate) and int(candidate) == card_no + 1:
                break

            # Deck list / footer content ends the card region.
            if candidate.startswith('LanGo Spanish') or candidate.startswith('Decks in class'):
                break

            # Another deck's title leaking into this page also ends it.
            if re.match(r'^Week \d+:', candidate):
                break

            if candidate not in NOISE:
                body.append(candidate)
            cursor += 1

        if len(body) >= 2:
            cards.append({
                "front": body[0],
                "back": body[1],
            })
            idx = cursor
        else:
            idx += 1

    # Post-filter: drop entries that are really deck titles or page chrome.
    cards = [card for card in cards
             if not re.match(r'^Week \d+:', card['front'])
             and card['front'] not in ('Decks in class (39)', '# Cards')
             and not card['front'].startswith('LanGo Spanish')
             and not card['front'].startswith('You may prefer')]
    return cards
|
||||
|
||||
|
||||
async def scrape_deck(page, url):
    """Scrape a single deck page.

    Args:
        page: Playwright async page used for navigation and text extraction.
        url: Deck path relative to BASE_URL.

    Returns:
        dict with keys week, title, isReversed, cardCount, cards, url.
    """
    full_url = BASE_URL + url
    await page.goto(full_url, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)
    # Scroll to load lazy content
    for _ in range(5):
        await page.evaluate("window.scrollBy(0, 1000)")
        await page.wait_for_timeout(300)

    text = await page.inner_text("body")

    # Extract title — try multiple patterns.
    # Format: "LanGo Spanish | Beginner I > Week N: Title > Flashcards"
    title_match = re.search(r'>\s*(Week\s+\d+:.+?)\s*>\s*Flashcards', text)
    if title_match:
        raw_title = title_match.group(1).strip()
    else:
        # Try: "Week N: Title (Subtitle) Flashcards"
        heading_match = re.search(r'(Week\s+\d+:.+?)\s*Flashcards', text)
        if heading_match:
            raw_title = heading_match.group(1).strip()
        else:
            # Last resort: derive the title from the URL slug, e.g.
            # "week-5-reflexive-verbs-los-verbos-reflex-10270072".
            slug = url.split('/')[2]
            slug_clean = re.sub(r'-\d+$', '', slug)  # remove trailing ID
            slug_clean = re.sub(r'-al-rev(e|é)s$', ' AL REVÉS', slug_clean)
            raw_title = slug_clean.replace('-', ' ').title()
            # BUGFIX: the slug-derived title reads "Week N Title" with no
            # colon, which parse_title_and_week cannot parse (it requires
            # "Week N:"), so the week was always reported as 0 on this
            # path. Re-insert the colon, or prefix a placeholder week when
            # the slug has none.
            wm = re.match(r'(Week\s+\d+)\s+(.+)', raw_title, re.IGNORECASE)
            if wm:
                raw_title = f"{wm.group(1)}: {wm.group(2)}"
            else:
                raw_title = "Week 0: " + raw_title

    week, title = parse_title_and_week(raw_title)
    cards = parse_cards(text)

    # Reversed ("al revés") decks swap the prompt/answer direction.
    is_reversed = "al rev" in url.lower() or "AL REVÉS" in raw_title.upper()

    return {
        "week": week,
        "title": title,
        "isReversed": is_reversed,
        "cardCount": len(cards),
        "cards": cards,
        "url": url,
    }
|
||||
|
||||
|
||||
async def main():
    """Scrape all decks in DECK_URLS and write the combined JSON to OUTPUT.

    Launches headless Chromium, scrapes decks sequentially (a failure on
    one deck is logged and skipped, not fatal), groups decks by week, and
    dumps the result as UTF-8 JSON.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # Desktop UA so the site serves the full (non-mobile) page.
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        )
        page = await context.new_page()

        all_decks = []
        total_cards = 0

        for i, url in enumerate(DECK_URLS):
            print(f"[{i+1}/{len(DECK_URLS)}] Scraping {url.split('/')[2][:50]}...")
            try:
                deck = await scrape_deck(page, url)
                all_decks.append(deck)
                total_cards += deck["cardCount"]
                print(f" → Week {deck['week']}: {deck['title']} ({deck['cardCount']} cards)")
            except Exception as e:
                # One failed deck shouldn't abort the whole run.
                print(f" ERROR: {e}")

            # Be polite
            await page.wait_for_timeout(500)

        await browser.close()

        # Organize by week
        weeks = {}
        for deck in all_decks:
            weeks.setdefault(deck["week"], []).append({
                "title": deck["title"],
                "isReversed": deck["isReversed"],
                "cardCount": deck["cardCount"],
                "cards": deck["cards"],
            })

        output = {
            "course": "LanGo Spanish | Beginner I",
            "totalDecks": len(all_decks),
            "totalCards": total_cards,
            "weeks": [
                {
                    "week": w,
                    "decks": weeks[w],
                }
                for w in sorted(weeks.keys())
            ],
        }

        with open(OUTPUT, 'w', encoding='utf-8') as f:
            json.dump(output, f, ensure_ascii=False, indent=2)

        print(f"\nDone! {len(all_decks)} decks, {total_cards} cards → {OUTPUT}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user