Add curated-videos markdown report + generator script

youtube_videos.md lists every entry in youtube_videos.json with its tense-guide / grammar-note id, title, channel, upload date, duration, views, and likes (where public). Also flags the two topics with no curated video so the gap is auditable in one place. generate_videos_markdown.py queries yt-dlp in parallel for each unique videoId and writes the markdown. Rerun when curation changes. One current entry (saber-vs-conocer → j87i7MVCvIE) is now marked Private Video — needs re-curation as a follow-up. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 07:07:41 -05:00
parent 0a099c3fc9
commit 9c7033d1b4
2 changed files with 348 additions and 0 deletions
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""Generate a markdown report of every curated YouTube video referenced by the app.
+
+Reads Conjuga/youtube_videos.json, queries yt-dlp for metadata on each video,
+and emits Conjuga/youtube_videos.md with tables for tense guides and grammar
+notes plus a list of topics with no curated video.
+
+Usage:
+    python3 Scripts/generate_videos_markdown.py
+
+Requires `yt-dlp` on PATH. Videos that have been taken down or made private
+appear in the tables with an "(unavailable)" marker in the title column.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import date
+from pathlib import Path
+
# This script sits one directory below the repository root; the input JSON and
# the generated report both live in the Conjuga/ directory.
REPO_ROOT = Path(__file__).resolve().parents[1]
VIDEOS_JSON = REPO_ROOT / "Conjuga" / "youtube_videos.json"
OUTPUT_MD = REPO_ROOT / "Conjuga" / "youtube_videos.md"

# Ids that are expected to have a curated video. The order here is the row
# order of the generated tables, and any id listed here that has no entry in
# the JSON is reported in the "Topics without a curated video" section.
EXPECTED_TENSE_IDS = [
    # Indicative
    "ind_presente",
    "ind_preterito",
    "ind_imperfecto",
    "ind_futuro",
    "ind_perfecto",
    "ind_pluscuamperfecto",
    "ind_futuro_perfecto",
    "ind_preterito_anterior",
    # Conditional
    "cond_presente",
    "cond_perfecto",
    # Subjunctive
    "subj_presente",
    "subj_imperfecto_1",
    "subj_imperfecto_2",
    "subj_perfecto",
    "subj_pluscuamperfecto_1",
    "subj_pluscuamperfecto_2",
    "subj_futuro",
    "subj_futuro_perfecto",
    # Imperative
    "imp_afirmativo",
    "imp_negativo",
]

EXPECTED_GRAMMAR_IDS = [
    "ser-vs-estar",
    "por-vs-para",
    "preterite-vs-imperfect",
    "subjunctive-triggers",
    "reflexive-verbs",
    "object-pronouns",
    "gustar-like-verbs",
    "comparatives-superlatives",
    "conditional-if-clauses",
    "commands-imperative",
    "saber-vs-conocer",
    "double-negatives",
    "adjective-placement",
    "tener-expressions",
    "personal-a",
    "relative-pronouns",
    "future-vs-ir-a",
    "accent-marks-stress",
    "se-constructions",
    "estar-gerund-progressive",
    "spanish-suffixes",
    "common-irregular-verbs",
    "types-of-irregular-verbs",
    "present-indicative-conjugation",
    "articles-and-gender",
    "possessive-adjectives",
    "demonstrative-adjectives",
    "greetings-farewells",
    "poder-infinitive",
    "al-del-contractions",
    "prepositional-pronouns",
    "irregular-yo-verbs",
    "stem-changing-verbs",
    "stressed-possessives",
    "present-perfect-tense",
    "future-perfect-tense",
]
+
+
def fetch_metadata(video_id: str) -> dict:
    """Look up display metadata for one video by running ``yt-dlp``.

    Args:
        video_id: Video id handed to yt-dlp after a ``--`` separator, so an id
            that begins with ``-`` is not parsed as a command-line option.

    Returns:
        On success, a dict with ``unavailable=False`` plus ``title``,
        ``uploader``, ``upload_date`` (YYYYMMDD), ``duration`` (seconds),
        ``view_count`` and ``like_count``; the last three are ``None`` when
        absent from yt-dlp's JSON. On a timeout, a non-zero exit status, or
        output that is not a JSON object, a dict with ``unavailable=True`` and
        a short ``reason`` so the caller can mark the row.

    Raises:
        OSError: If the ``yt-dlp`` executable cannot be started (for example
            ``FileNotFoundError`` when it is not on PATH). That is a problem
            with the environment, not with the video, so it is deliberately
            not reported as an unavailable row.
    """
    try:
        result = subprocess.run(
            ["yt-dlp", "--skip-download", "--dump-json", "--no-warnings", "--", video_id],
            capture_output=True,
            text=True,
            # A stray undecodable byte in yt-dlp's output must not abort the
            # whole report; replace it instead of raising UnicodeDecodeError.
            errors="replace",
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        return {"unavailable": True, "reason": "timeout"}

    if result.returncode != 0:
        # yt-dlp errors look like:
        #   "ERROR: [youtube] ID: <reason>. <cookie/help nag with URLs…>"
        # Extract just <reason> and drop everything after the first ". " so
        # the markdown table stays readable. Help URLs contain colons, so a
        # naive split-on-colon grabs the wrong chunk.
        reason = "yt-dlp failed"
        pattern = re.compile(r"ERROR:\s*\[[^\]]+\]\s*[^:]+:\s*(.+)")
        for line in result.stderr.strip().splitlines():
            m = pattern.search(line)
            if m:
                # Keep the generic fallback if trimming leaves nothing.
                reason = m.group(1).split(". ")[0].rstrip(".") or reason
                break
        return {"unavailable": True, "reason": reason}

    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError:
        return {"unavailable": True, "reason": "invalid json"}
    if not isinstance(data, dict):
        # Valid JSON that is not an object (e.g. a bare `null`) has no fields
        # to read; treat it like any other unusable output.
        return {"unavailable": True, "reason": "invalid json"}

    return {
        "unavailable": False,
        "title": data.get("title") or "",
        "uploader": data.get("uploader") or data.get("channel") or "",
        "upload_date": data.get("upload_date") or "",  # YYYYMMDD
        "duration": data.get("duration"),  # seconds
        "view_count": data.get("view_count"),
        "like_count": data.get("like_count"),
    }
+
+
def fmt_duration(seconds: int | None) -> str:
    """Render a length in seconds as ``M:SS``, or ``H:MM:SS`` from one hour up.

    A missing or zero duration is shown as an em dash. Fractional seconds are
    truncated.
    """
    if not seconds:
        return "—"

    minutes, secs = divmod(int(seconds), 60)
    hours, minutes = divmod(minutes, 60)
    clock = f"{minutes}:{secs:02d}" if not hours else f"{hours}:{minutes:02d}:{secs:02d}"
    return clock
+
+
def fmt_date(raw: str) -> str:
    """Format a ``YYYYMMDD`` string as ``YYYY-MM-DD``.

    Args:
        raw: Upload date as an eight-digit string; may be empty.

    Returns:
        The dashed date, or an em dash when ``raw`` is empty or is not exactly
        eight ASCII digits. Checking the digits keeps an unexpected value such
        as ``"2024-1-2"`` (also eight characters) from being sliced into a
        garbled "date" like ``2024--1--2``.
    """
    if not raw or len(raw) != 8 or not (raw.isascii() and raw.isdigit()):
        return "—"
    return f"{raw[:4]}-{raw[4:6]}-{raw[6:]}"
+
+
def fmt_int(n: int | None) -> str:
    """Render a count with thousands separators; ``None`` becomes an em dash."""
    return "—" if n is None else f"{n:,}"
+
+
def render_row(key: str, curated: dict, meta: dict) -> str:
    """Build one markdown table row for a curated video.

    Args:
        key: Tense-guide or grammar-note id, shown in backticks in the first
            column.
        curated: The entry from the videos JSON. Must contain ``videoId``; its
            ``title`` is used only when the fetched metadata has none.
        meta: Metadata dict for the video. When ``meta["unavailable"]`` is
            truthy only ``reason`` is read; otherwise ``title``, ``uploader``,
            ``upload_date``, ``duration``, ``view_count`` and ``like_count``.

    Returns:
        A row with eight cells: id, title, channel, upload date, duration,
        views, likes and a watch link. Unavailable videos carry an
        "(unavailable — reason)" marker in the title cell and em dashes in the
        other metadata cells.

    Raises:
        KeyError: If ``curated`` has no ``videoId``.
    """

    def cell(text: str) -> str:
        # A raw "|" would end the table cell early, so escape it.
        return text.replace("|", "\\|")

    video_id = curated["videoId"]
    url = f"https://www.youtube.com/watch?v={video_id}"

    if meta.get("unavailable"):
        # The reason is lifted from yt-dlp's error output, so it needs the
        # same escaping as any other free text placed in the table.
        reason = cell(str(meta.get("reason", "unknown")))
        title = f"_(unavailable — {reason})_"
        channel = uploaded = duration = views = likes = "—"
    else:
        title = cell(meta.get("title") or curated.get("title") or "")
        channel = cell(meta.get("uploader") or "—")
        uploaded = fmt_date(meta.get("upload_date", ""))
        duration = fmt_duration(meta.get("duration"))
        views = fmt_int(meta.get("view_count"))
        likes = fmt_int(meta.get("like_count"))

    return f"| `{key}` | {title} | {channel} | {uploaded} | {duration} | {views} | {likes} | [watch]({url}) |"
+
+
def main() -> int:
    """Regenerate the curated-videos markdown report.

    Loads ``VIDEOS_JSON``, fetches metadata for every distinct ``videoId``
    with ``fetch_metadata`` on a pool of 8 worker threads, and writes the
    report to ``OUTPUT_MD``. Progress is printed to stderr.

    Each table lists the ids from the matching ``EXPECTED_*`` list first, in
    that order, followed by any ids present in the JSON but absent from the
    list, so every entry counted in the summary also gets a row.

    Returns:
        0, used as the process exit status.

    Raises:
        OSError: If the JSON cannot be read or the report cannot be written.
        json.JSONDecodeError: If the JSON file is malformed.
        KeyError: If an entry has no ``videoId``.
        Exception: Anything raised by ``fetch_metadata`` in a worker thread
            is re-raised here by ``future.result()``.
    """
    # Read and write as UTF-8 explicitly: the report contains non-ASCII text
    # (em dashes, arrows, video titles), and the platform default encoding is
    # not UTF-8 everywhere.
    with VIDEOS_JSON.open(encoding="utf-8") as f:
        data = json.load(f)

    tense_entries = data.get("tenseGuides", {})
    grammar_entries = data.get("grammarNotes", {})

    # Collect all unique videoIds so we only call yt-dlp once per video
    # (several grammar notes reuse tense-guide videos).
    video_ids = {e["videoId"] for e in tense_entries.values()} | {
        e["videoId"] for e in grammar_entries.values()
    }

    print(f"Fetching metadata for {len(video_ids)} unique videos…", file=sys.stderr)

    metadata: dict[str, dict] = {}
    with ThreadPoolExecutor(max_workers=8) as pool:
        future_to_id = {pool.submit(fetch_metadata, vid): vid for vid in video_ids}
        for future in as_completed(future_to_id):
            vid = future_to_id[future]
            metadata[vid] = future.result()
            status = "✗" if metadata[vid].get("unavailable") else "✓"
            print(f"  {status} {vid}", file=sys.stderr)

    missing_tenses = [tid for tid in EXPECTED_TENSE_IDS if tid not in tense_entries]
    missing_grammar = [gid for gid in EXPECTED_GRAMMAR_IDS if gid not in grammar_entries]

    # Row order: expected ids first (in list order), then any curated ids the
    # expected lists do not know about, so no JSON entry is silently dropped.
    tense_order = [tid for tid in EXPECTED_TENSE_IDS if tid in tense_entries]
    tense_order += [tid for tid in tense_entries if tid not in EXPECTED_TENSE_IDS]
    grammar_order = [gid for gid in EXPECTED_GRAMMAR_IDS if gid in grammar_entries]
    grammar_order += [gid for gid in grammar_entries if gid not in EXPECTED_GRAMMAR_IDS]

    today = date.today().isoformat()

    lines: list[str] = []
    lines.append("# Curated YouTube Videos")
    lines.append("")
    lines.append(
        "Every tense guide and grammar note in the app can be tied to a single "
        "curated YouTube video. This file is generated from "
        "`Conjuga/youtube_videos.json` by `Scripts/generate_videos_markdown.py` "
        "— regenerate when you add or change entries."
    )
    lines.append("")
    lines.append(f"- Total tense-guide entries: **{len(tense_entries)}** of {len(EXPECTED_TENSE_IDS)}")
    lines.append(f"- Total grammar-note entries: **{len(grammar_entries)}** of {len(EXPECTED_GRAMMAR_IDS)}")
    lines.append(f"- Last verified: **{today}** (run `python3 Scripts/generate_videos_markdown.py` to refresh)")
    lines.append("")
    lines.append(
        "Like counts are often blank because YouTube hides the public count on "
        "most videos for signed-out requests. Titles and durations are pulled "
        "live from YouTube; unavailable entries mean the video has been taken "
        "down, made private, or region-locked."
    )
    lines.append("")

    # Tense guides section
    lines.append("## Tense guides")
    lines.append("")
    lines.append("Tied to `TenseGuide.tenseId` in the Guide tab.")
    lines.append("")
    lines.append("| Tense ID | Title | Channel | Uploaded | Duration | Views | Likes | URL |")
    lines.append("|---|---|---|---|---|---|---|---|")
    for tid in tense_order:
        entry = tense_entries[tid]
        lines.append(render_row(tid, entry, metadata.get(entry["videoId"], {})))
    lines.append("")

    # Grammar notes section
    lines.append("## Grammar notes")
    lines.append("")
    lines.append("Tied to `GrammarNote.id` (hand-authored + generated) in the Guide → Grammar tab.")
    lines.append("")
    lines.append("| Grammar ID | Title | Channel | Uploaded | Duration | Views | Likes | URL |")
    lines.append("|---|---|---|---|---|---|---|---|")
    for gid in grammar_order:
        entry = grammar_entries[gid]
        lines.append(render_row(gid, entry, metadata.get(entry["videoId"], {})))
    lines.append("")

    # Missing section
    if missing_tenses or missing_grammar:
        lines.append("## Topics without a curated video")
        lines.append("")
        lines.append(
            "These show a \"No video yet\" label in the app. Add entries to "
            "`Conjuga/youtube_videos.json` to fill them in."
        )
        lines.append("")
        if missing_tenses:
            lines.append("**Tense guides:**")
            lines.append("")
            lines.extend(f"- `{tid}`" for tid in missing_tenses)
            lines.append("")
        if missing_grammar:
            lines.append("**Grammar notes:**")
            lines.append("")
            lines.extend(f"- `{gid}`" for gid in missing_grammar)
            lines.append("")

    # `lines` always ends with an empty string, so the file ends in a newline.
    OUTPUT_MD.write_text("\n".join(lines), encoding="utf-8")
    print(f"\nWrote {OUTPUT_MD.relative_to(REPO_ROOT)}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())