#!/usr/bin/env python3
"""Generate a markdown report of every curated YouTube video referenced by the app.

Reads Conjuga/youtube_videos.json, queries yt-dlp for metadata on each video,
and emits Conjuga/youtube_videos.md with tables for tense guides and grammar
notes plus a list of topics with no curated video.

Usage:
    python3 Scripts/generate_videos_markdown.py

Requires `yt-dlp` on PATH. Videos that have been taken down or made private
appear in the tables with an "(unavailable)" marker in the title column.
"""

from __future__ import annotations

import json
import re
import shutil
import subprocess
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import date
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parents[1]
VIDEOS_JSON = REPO_ROOT / "Conjuga" / "youtube_videos.json"
OUTPUT_MD = REPO_ROOT / "Conjuga" / "youtube_videos.md"

# The curated ids we expect — any id listed here that is missing from the JSON
# shows up in the "Topics without a curated video" section at the bottom.
EXPECTED_TENSE_IDS = [
    "ind_presente",
    "ind_preterito",
    "ind_imperfecto",
    "ind_futuro",
    "ind_perfecto",
    "ind_pluscuamperfecto",
    "ind_futuro_perfecto",
    "ind_preterito_anterior",
    "cond_presente",
    "cond_perfecto",
    "subj_presente",
    "subj_imperfecto_1",
    "subj_imperfecto_2",
    "subj_perfecto",
    "subj_pluscuamperfecto_1",
    "subj_pluscuamperfecto_2",
    "subj_futuro",
    "subj_futuro_perfecto",
    "imp_afirmativo",
    "imp_negativo",
]

EXPECTED_GRAMMAR_IDS = [
    "ser-vs-estar",
    "por-vs-para",
    "preterite-vs-imperfect",
    "subjunctive-triggers",
    "reflexive-verbs",
    "object-pronouns",
    "gustar-like-verbs",
    "comparatives-superlatives",
    "conditional-if-clauses",
    "commands-imperative",
    "saber-vs-conocer",
    "double-negatives",
    "adjective-placement",
    "tener-expressions",
    "personal-a",
    "relative-pronouns",
    "future-vs-ir-a",
    "accent-marks-stress",
    "se-constructions",
    "estar-gerund-progressive",
    "spanish-suffixes",
    "common-irregular-verbs",
    "types-of-irregular-verbs",
    "present-indicative-conjugation",
    "articles-and-gender",
    "possessive-adjectives",
    "demonstrative-adjectives",
    "greetings-farewells",
    "poder-infinitive",
    "al-del-contractions",
    "prepositional-pronouns",
    "irregular-yo-verbs",
    "stem-changing-verbs",
    "stressed-possessives",
    "present-perfect-tense",
    "future-perfect-tense",
]

# yt-dlp errors look like:
#   "ERROR: [youtube] ID: <reason>. <help text>"
# The capture group grabs everything after "ID:"; the caller then keeps only
# the first sentence so the markdown table stays readable. Help URLs contain
# colons, so a naive split-on-colon would grab the wrong chunk.
_YT_DLP_ERROR_RE = re.compile(r"ERROR:\s*\[[^\]]+\]\s*[^:]+:\s*(.+)")


def _failure_reason(stderr: str) -> str:
    """Return the first sentence of the first yt-dlp ERROR line in *stderr*."""
    for line in stderr.strip().splitlines():
        match = _YT_DLP_ERROR_RE.search(line)
        if match:
            return match.group(1).split(". ")[0].rstrip(".")
    return "yt-dlp failed"


def fetch_metadata(video_id: str) -> dict:
    """Return a dict of useful metadata fields for a single video.

    On any yt-dlp failure (video removed, network issue, extraction break,
    yt-dlp not launchable) returns a dict with `unavailable=True` and a short
    `reason` so the caller can mark the row instead of crashing.
    """
    try:
        result = subprocess.run(
            # "--" keeps ids that start with "-" from being parsed as options.
            ["yt-dlp", "--skip-download", "--dump-json", "--no-warnings", "--", video_id],
            capture_output=True,
            text=True,
            # Undecodable bytes in yt-dlp output must not abort the whole run.
            errors="replace",
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        return {"unavailable": True, "reason": "timeout"}
    except OSError:
        # Binary missing or not executable; raised before yt-dlp even starts.
        return {"unavailable": True, "reason": "yt-dlp could not be started"}

    if result.returncode != 0:
        return {"unavailable": True, "reason": _failure_reason(result.stderr)}

    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError:
        return {"unavailable": True, "reason": "invalid json"}
    if not isinstance(data, dict):
        # Valid JSON but not an object: nothing to read fields from.
        return {"unavailable": True, "reason": "invalid json"}

    return {
        "unavailable": False,
        "title": data.get("title") or "",
        "uploader": data.get("uploader") or data.get("channel") or "",
        "upload_date": data.get("upload_date") or "",  # YYYYMMDD
        "duration": data.get("duration"),  # seconds
        "view_count": data.get("view_count"),
        "like_count": data.get("like_count"),
    }


def fmt_duration(seconds: int | None) -> str:
    """Format a duration in seconds as `M:SS` or `H:MM:SS`; "—" when missing or zero."""
    if not seconds:
        return "—"
    h, rem = divmod(int(seconds), 3600)
    m, s = divmod(rem, 60)
    if h:
        return f"{h}:{m:02d}:{s:02d}"
    return f"{m}:{s:02d}"


def fmt_date(raw: str) -> str:
    """Format a `YYYYMMDD` string as `YYYY-MM-DD`; "—" for anything else."""
    # Require exactly eight digits so malformed values never leak into the table.
    if not raw or len(raw) != 8 or not raw.isdigit():
        return "—"
    return f"{raw[0:4]}-{raw[4:6]}-{raw[6:8]}"


def fmt_int(n: int | None) -> str:
    """Format a count with thousands separators; "—" when the count is unknown."""
    if n is None:
        return "—"
    return f"{n:,}"


def _escape_cell(text: str) -> str:
    """Escape pipes so the text cannot break out of a markdown table cell."""
    return text.replace("|", "\\|")


def render_row(key: str, curated: dict, meta: dict) -> str:
    """Render one markdown table row for a curated video.

    `key` is the tense/grammar id, `curated` is the entry from the JSON file
    (must contain `videoId`; `title` is used as a fallback), and `meta` is the
    dict produced by `fetch_metadata` (an empty dict is treated as "no live
    metadata", falling back to the curated title).
    """
    video_id = curated["videoId"]
    url = f"https://www.youtube.com/watch?v={video_id}"
    if meta.get("unavailable"):
        reason = _escape_cell(str(meta.get("reason", "unknown")))
        title = f"_(unavailable — {reason})_"
        channel = uploaded = duration = views = likes = "—"
    else:
        title = _escape_cell(meta.get("title") or curated.get("title") or "")
        channel = _escape_cell(meta.get("uploader") or "—")
        uploaded = fmt_date(meta.get("upload_date", ""))
        duration = fmt_duration(meta.get("duration"))
        views = fmt_int(meta.get("view_count"))
        likes = fmt_int(meta.get("like_count"))
    return (
        f"| `{key}` | {title} | {channel} | {uploaded} | {duration} "
        f"| {views} | {likes} | [watch]({url}) |"
    )


def _ordered_ids(expected: list[str], entries: dict) -> list[str]:
    """Return ids present in *entries*: expected order first, then extras sorted.

    Ids found in the JSON but absent from the expected list are still listed so
    the table agrees with the entry count in the report header.
    """
    expected_set = set(expected)
    present = [key for key in expected if key in entries]
    extras = sorted(key for key in entries if key not in expected_set)
    return present + extras


def _fetch_all(video_ids: set[str]) -> dict[str, dict]:
    """Fetch metadata for every id concurrently, logging progress to stderr."""
    metadata: dict[str, dict] = {}
    with ThreadPoolExecutor(max_workers=8) as pool:
        future_to_id = {pool.submit(fetch_metadata, vid): vid for vid in video_ids}
        for future in as_completed(future_to_id):
            vid = future_to_id[future]
            metadata[vid] = future.result()
            status = "✗" if metadata[vid].get("unavailable") else "✓"
            print(f"  {status} {vid}", file=sys.stderr)
    return metadata


def _render_table(
    id_label: str,
    ids: list[str],
    entries: dict,
    metadata: dict[str, dict],
) -> list[str]:
    """Return the header, separator, and one row per id for a markdown table."""
    rows = [
        f"| {id_label} | Title | Channel | Uploaded | Duration | Views | Likes | URL |",
        "|---|---|---|---|---|---|---|---|",
    ]
    for key in ids:
        entry = entries[key]
        rows.append(render_row(key, entry, metadata.get(entry["videoId"], {})))
    return rows


def _build_report(
    tense_entries: dict,
    grammar_entries: dict,
    metadata: dict[str, dict],
    today: str,
) -> list[str]:
    """Assemble the full markdown report as a list of lines."""
    missing_tenses = [tid for tid in EXPECTED_TENSE_IDS if tid not in tense_entries]
    missing_grammar = [gid for gid in EXPECTED_GRAMMAR_IDS if gid not in grammar_entries]

    lines: list[str] = [
        "# Curated YouTube Videos",
        "",
        "Every tense guide and grammar note in the app can be tied to a single "
        "curated YouTube video. This file is generated from "
        "`Conjuga/youtube_videos.json` by `Scripts/generate_videos_markdown.py` "
        "— regenerate when you add or change entries.",
        "",
        f"- Total tense-guide entries: **{len(tense_entries)}** of {len(EXPECTED_TENSE_IDS)}",
        f"- Total grammar-note entries: **{len(grammar_entries)}** of {len(EXPECTED_GRAMMAR_IDS)}",
        f"- Last verified: **{today}** (run `python3 Scripts/generate_videos_markdown.py` to refresh)",
        "",
        "Like counts are often blank because YouTube hides the public count on "
        "most videos for signed-out requests. Titles and durations are pulled "
        "live from YouTube; unavailable entries mean the video has been taken "
        "down, made private, or region-locked. A few entries marked "
        "\"not available on this app\" are a transient yt-dlp extraction limit "
        "— the video itself still plays fine when tapping Stream in the app.",
        "",
    ]

    # Tense guides section
    lines += ["## Tense guides", "", "Tied to `TenseGuide.tenseId` in the Guide tab.", ""]
    lines += _render_table(
        "Tense ID",
        _ordered_ids(EXPECTED_TENSE_IDS, tense_entries),
        tense_entries,
        metadata,
    )
    lines.append("")

    # Grammar notes section
    lines += [
        "## Grammar notes",
        "",
        "Tied to `GrammarNote.id` (hand-authored + generated) in the Guide → Grammar tab.",
        "",
    ]
    lines += _render_table(
        "Grammar ID",
        _ordered_ids(EXPECTED_GRAMMAR_IDS, grammar_entries),
        grammar_entries,
        metadata,
    )
    lines.append("")

    # Missing section
    if missing_tenses or missing_grammar:
        lines += [
            "## Topics without a curated video",
            "",
            "These show a \"No video yet\" label in the app. Add entries to "
            "`Conjuga/youtube_videos.json` to fill them in.",
            "",
        ]
        if missing_tenses:
            lines += ["**Tense guides:**", ""]
            lines += [f"- `{tid}`" for tid in missing_tenses]
            lines.append("")
        if missing_grammar:
            lines += ["**Grammar notes:**", ""]
            lines += [f"- `{gid}`" for gid in missing_grammar]
            lines.append("")

    return lines


def main() -> int:
    """Regenerate Conjuga/youtube_videos.md; return a process exit code."""
    # Fail fast: without yt-dlp every row would be written out as unavailable.
    if shutil.which("yt-dlp") is None:
        print("error: `yt-dlp` was not found on PATH", file=sys.stderr)
        return 1

    # Explicit UTF-8 so the result does not depend on the platform locale.
    with VIDEOS_JSON.open(encoding="utf-8") as f:
        data = json.load(f)

    tense_entries = data.get("tenseGuides", {})
    grammar_entries = data.get("grammarNotes", {})

    # Collect all unique videoIds so we only call yt-dlp once per video
    # (several grammar notes reuse tense-guide videos).
    video_ids = {e["videoId"] for e in tense_entries.values()} | {
        e["videoId"] for e in grammar_entries.values()
    }

    print(f"Fetching metadata for {len(video_ids)} unique videos…", file=sys.stderr)
    metadata = _fetch_all(video_ids)

    lines = _build_report(
        tense_entries, grammar_entries, metadata, date.today().isoformat()
    )

    # Video titles can contain any Unicode, so never rely on the locale encoding.
    OUTPUT_MD.write_text("\n".join(lines), encoding="utf-8")
    print(f"\nWrote {OUTPUT_MD.relative_to(REPO_ROOT)}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())