fcb907718a
Swap 24 tense-guide / grammar-note videos to The Language Tutor's numbered lesson series where a matching lesson exists, filling the two remaining gaps (ind_preterito_anterior → Lesson 65, estar-gerund-progressive → Lesson 113). All 32 TLT picks preserved on this pass. For the non-TLT slots, prefer BaseLang's beginner lesson series where a topic-specific video exists: ser-vs-estar, preterite-vs-imperfect, subjunctive-triggers, object-pronouns, conditional-if-clauses, tener-expressions, future-vs-ir-a, possessive-adjectives, irregular-yo-verbs, and stem-changing-verbs. Retire both Tell Me In Spanish videos (personal-a → castellano4U, types-of-irregular-verbs → Master IRREGULAR VERBS Complete Lesson). Generator header note clarifies that "not available on this app" rows are a transient yt-dlp extraction limit — videos still play when tapped in the app via the Stream button, which opens youtube.com externally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
262 lines
9.9 KiB
Python
262 lines
9.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Generate a markdown report of every curated YouTube video referenced by the app.
|
|
|
|
Reads Conjuga/youtube_videos.json, queries yt-dlp for metadata on each video,
|
|
and emits Conjuga/youtube_videos.md with tables for tense guides and grammar
|
|
notes plus a list of topics with no curated video.
|
|
|
|
Usage:
|
|
python3 Scripts/generate_videos_markdown.py
|
|
|
|
Requires `yt-dlp` on PATH. Videos that have been taken down or made private
|
|
appear in the tables with an "(unavailable)" marker in the title column.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from datetime import date
|
|
from pathlib import Path
|
|
|
|
# Two levels up from this file: <repo>/Scripts/<this script> -> <repo>.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Input: the curated video list consumed by the app.
VIDEOS_JSON = REPO_ROOT.joinpath("Conjuga", "youtube_videos.json")
# Output: the generated markdown report, written next to the JSON.
OUTPUT_MD = REPO_ROOT.joinpath("Conjuga", "youtube_videos.md")
|
|
|
|
# Tense-guide ids the report expects to find under "tenseGuides" in the JSON.
# The list order is the row order of the generated table; any id listed here
# that has no JSON entry is reported in the "Topics without a curated video"
# section at the bottom of the report.
EXPECTED_TENSE_IDS = [
    "ind_presente",
    "ind_preterito",
    "ind_imperfecto",
    "ind_futuro",
    "ind_perfecto",
    "ind_pluscuamperfecto",
    "ind_futuro_perfecto",
    "ind_preterito_anterior",
    "cond_presente",
    "cond_perfecto",
    "subj_presente",
    "subj_imperfecto_1",
    "subj_imperfecto_2",
    "subj_perfecto",
    "subj_pluscuamperfecto_1",
    "subj_pluscuamperfecto_2",
    "subj_futuro",
    "subj_futuro_perfecto",
    "imp_afirmativo",
    "imp_negativo",
]
|
|
|
|
# Grammar-note ids the report expects to find under "grammarNotes" in the
# JSON. The list order is the row order of the generated table.
EXPECTED_GRAMMAR_IDS = [
    "ser-vs-estar",
    "por-vs-para",
    "preterite-vs-imperfect",
    "subjunctive-triggers",
    "reflexive-verbs",
    "object-pronouns",
    "gustar-like-verbs",
    "comparatives-superlatives",
    "conditional-if-clauses",
    "commands-imperative",
    "saber-vs-conocer",
    "double-negatives",
    "adjective-placement",
    "tener-expressions",
    "personal-a",
    "relative-pronouns",
    "future-vs-ir-a",
    "accent-marks-stress",
    "se-constructions",
    "estar-gerund-progressive",
    "spanish-suffixes",
    "common-irregular-verbs",
    "types-of-irregular-verbs",
    "present-indicative-conjugation",
    "articles-and-gender",
    "possessive-adjectives",
    "demonstrative-adjectives",
    "greetings-farewells",
    "poder-infinitive",
    "al-del-contractions",
    "prepositional-pronouns",
    "irregular-yo-verbs",
    "stem-changing-verbs",
    "stressed-possessives",
    "present-perfect-tense",
    "future-perfect-tense",
]
|
|
|
|
|
|
def fetch_metadata(video_id: str) -> dict:
    """Look up one video with yt-dlp and return the fields the report needs.

    Runs ``yt-dlp --skip-download --dump-json`` with a 30-second timeout.

    Returns:
        On success, a dict with ``unavailable=False`` plus ``title``,
        ``uploader``, ``upload_date`` (YYYYMMDD), ``duration`` (seconds),
        ``view_count`` and ``like_count``. If the process times out, exits
        with a non-zero status, or prints something that is not valid JSON,
        a dict of the form ``{"unavailable": True, "reason": <short text>}``
        so the caller can still render a row for the video.
    """
    command = ["yt-dlp", "--skip-download", "--dump-json", "--no-warnings", "--", video_id]
    try:
        proc = subprocess.run(command, capture_output=True, text=True, timeout=30)
    except subprocess.TimeoutExpired:
        return {"unavailable": True, "reason": "timeout"}

    if proc.returncode != 0:
        # yt-dlp reports failures as
        #   "ERROR: [youtube] ID: <reason>. <cookie/help nag with URLs…>"
        # Only <reason> is wanted, cut at the first ". " so the markdown
        # table stays readable. The help URLs contain colons, which is why a
        # plain split on ":" would pick the wrong piece.
        error_line = re.compile(r"ERROR:\s*\[[^\]]+\]\s*[^:]+:\s*(.+)")
        candidates = (error_line.search(text) for text in proc.stderr.strip().splitlines())
        first_hit = next((hit for hit in candidates if hit), None)
        if first_hit is None:
            reason = "yt-dlp failed"
        else:
            reason = first_hit.group(1).split(". ")[0].rstrip(".")
        return {"unavailable": True, "reason": reason}

    try:
        info = json.loads(proc.stdout)
    except json.JSONDecodeError:
        return {"unavailable": True, "reason": "invalid json"}

    meta = {"unavailable": False}
    meta["title"] = info.get("title") or ""
    meta["uploader"] = info.get("uploader") or info.get("channel") or ""
    meta["upload_date"] = info.get("upload_date") or ""  # YYYYMMDD
    meta["duration"] = info.get("duration")  # seconds
    meta["view_count"] = info.get("view_count")
    meta["like_count"] = info.get("like_count")
    return meta
|
|
|
|
|
|
def fmt_duration(seconds: int | None) -> str:
    """Render a length in seconds as ``M:SS``, or ``H:MM:SS`` from one hour up.

    A falsy value (``None`` or ``0``) is rendered as an em dash.
    """
    if seconds:
        minutes, secs = divmod(int(seconds), 60)
        hours, minutes = divmod(minutes, 60)
        if hours:
            return f"{hours}:{minutes:02d}:{secs:02d}"
        return f"{minutes}:{secs:02d}"
    return "—"
|
|
|
|
|
|
def fmt_date(raw: str) -> str:
    """Turn a ``YYYYMMDD`` string into ``YYYY-MM-DD``.

    Anything empty or not exactly eight characters long becomes an em dash.
    """
    if raw and len(raw) == 8:
        year, month, day = raw[:4], raw[4:6], raw[6:]
        return f"{year}-{month}-{day}"
    return "—"
|
|
|
|
|
|
def fmt_int(n: int | None) -> str:
    """Format a count with thousands separators; ``None`` becomes an em dash."""
    return "—" if n is None else format(n, ",")
|
|
|
|
|
|
def _escape_cell(text: str) -> str:
    """Backslash-escape pipes so *text* cannot split a markdown table cell."""
    return text.replace("|", "\\|")


def render_row(key: str, curated: dict, meta: dict) -> str:
    """Build one markdown table row for a curated video.

    Args:
        key: Tense or grammar id, shown in the first column as inline code.
        curated: The JSON entry for this id. Must contain ``"videoId"``; its
            optional ``"title"`` is used when the live metadata has none.
        meta: Metadata dict for the video, in the shape produced by
            ``fetch_metadata``. An empty dict is treated as "available, but
            no live data".

    Returns:
        A single ``| … |`` row with eight cells: id, title, channel, upload
        date, duration, views, likes, and a watch link.

    Raises:
        KeyError: If *curated* has no ``"videoId"``.
    """
    video_id = curated["videoId"]
    url = f"https://www.youtube.com/watch?v={video_id}"

    if meta.get("unavailable"):
        # The reason text comes from yt-dlp's stderr, so it is escaped like
        # every other free-text cell; a stray "|" would otherwise break the row.
        reason = _escape_cell(str(meta.get("reason", "unknown")))
        title = f"_(unavailable — {reason})_"
        channel = uploaded = duration = views = likes = "—"
    else:
        # Prefer the live title, fall back to the curated one.
        title = _escape_cell(meta.get("title") or curated.get("title") or "")
        channel = _escape_cell(meta.get("uploader") or "—")
        uploaded = fmt_date(meta.get("upload_date", ""))
        duration = fmt_duration(meta.get("duration"))
        views = fmt_int(meta.get("view_count"))
        likes = fmt_int(meta.get("like_count"))

    return f"| `{key}` | {title} | {channel} | {uploaded} | {duration} | {views} | {likes} | [watch]({url}) |"
|
|
|
|
|
|
def _video_table(
    heading: str,
    blurb: str,
    id_header: str,
    expected_ids: list[str],
    entries: dict,
    metadata: dict[str, dict],
) -> list[str]:
    """Return the markdown lines for one table section of the report.

    Rows follow the order of *expected_ids*; ids that have no entry in
    *entries* are skipped here.
    """
    section = [
        f"## {heading}",
        "",
        blurb,
        "",
        f"| {id_header} | Title | Channel | Uploaded | Duration | Views | Likes | URL |",
        "|---|---|---|---|---|---|---|---|",
    ]
    for key in expected_ids:
        if key not in entries:
            continue
        entry = entries[key]
        section.append(render_row(key, entry, metadata.get(entry["videoId"], {})))
    section.append("")
    return section


def _missing_list(label: str, missing: list[str]) -> list[str]:
    """Return a bold label plus one bullet per id, or nothing if *missing* is empty."""
    if not missing:
        return []
    return [f"**{label}:**", "", *(f"- `{item}`" for item in missing), ""]


def main() -> int:
    """Regenerate the markdown report from the curated-video JSON.

    Steps: load ``VIDEOS_JSON``, fetch metadata for every distinct video id
    through ``fetch_metadata`` (up to 8 lookups in parallel, progress on
    stderr), then write ``OUTPUT_MD`` with a summary, one table per section
    and a list of expected ids that have no entry.

    Note: the tables are driven by ``EXPECTED_TENSE_IDS`` and
    ``EXPECTED_GRAMMAR_IDS``. An id that exists in the JSON but is absent
    from those lists is counted in the totals yet gets no table row.

    Returns:
        ``0``, used as the process exit status.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with VIDEOS_JSON.open(encoding="utf-8") as f:
        data = json.load(f)

    tense_entries = data.get("tenseGuides", {})
    grammar_entries = data.get("grammarNotes", {})

    # Collect all unique videoIds so we only call yt-dlp once per video
    # (several grammar notes reuse tense-guide videos).
    video_ids = {
        entry["videoId"]
        for entries in (tense_entries, grammar_entries)
        for entry in entries.values()
    }

    print(f"Fetching metadata for {len(video_ids)} unique videos…", file=sys.stderr)

    metadata: dict[str, dict] = {}
    with ThreadPoolExecutor(max_workers=8) as pool:
        future_to_id = {pool.submit(fetch_metadata, vid): vid for vid in video_ids}
        for future in as_completed(future_to_id):
            vid = future_to_id[future]
            metadata[vid] = future.result()
            status = "✗" if metadata[vid].get("unavailable") else "✓"
            print(f" {status} {vid}", file=sys.stderr)

    missing_tenses = [tid for tid in EXPECTED_TENSE_IDS if tid not in tense_entries]
    missing_grammar = [gid for gid in EXPECTED_GRAMMAR_IDS if gid not in grammar_entries]

    today = date.today().isoformat()

    lines: list[str] = [
        "# Curated YouTube Videos",
        "",
        (
            "Every tense guide and grammar note in the app can be tied to a single "
            "curated YouTube video. This file is generated from "
            "`Conjuga/youtube_videos.json` by `Scripts/generate_videos_markdown.py` "
            "— regenerate when you add or change entries."
        ),
        "",
        f"- Total tense-guide entries: **{len(tense_entries)}** of {len(EXPECTED_TENSE_IDS)}",
        f"- Total grammar-note entries: **{len(grammar_entries)}** of {len(EXPECTED_GRAMMAR_IDS)}",
        f"- Last verified: **{today}** (run `python3 Scripts/generate_videos_markdown.py` to refresh)",
        "",
        (
            "Like counts are often blank because YouTube hides the public count on "
            "most videos for signed-out requests. Titles and durations are pulled "
            "live from YouTube; unavailable entries mean the video has been taken "
            "down, made private, or region-locked. A few entries marked "
            "\"not available on this app\" are a transient yt-dlp extraction limit "
            "— the video itself still plays fine when tapping Stream in the app."
        ),
        "",
    ]

    # Tense guides section
    lines.extend(
        _video_table(
            "Tense guides",
            "Tied to `TenseGuide.tenseId` in the Guide tab.",
            "Tense ID",
            EXPECTED_TENSE_IDS,
            tense_entries,
            metadata,
        )
    )

    # Grammar notes section
    lines.extend(
        _video_table(
            "Grammar notes",
            "Tied to `GrammarNote.id` (hand-authored + generated) in the Guide → Grammar tab.",
            "Grammar ID",
            EXPECTED_GRAMMAR_IDS,
            grammar_entries,
            metadata,
        )
    )

    # Missing section
    if missing_tenses or missing_grammar:
        lines.append("## Topics without a curated video")
        lines.append("")
        lines.append(
            "These show a \"No video yet\" label in the app. Add entries to "
            "`Conjuga/youtube_videos.json` to fill them in."
        )
        lines.append("")
        lines.extend(_missing_list("Tense guides", missing_tenses))
        lines.extend(_missing_list("Grammar notes", missing_grammar))

    # The report always contains non-ASCII text (em dashes, arrows, video
    # titles), so the encoding is fixed to UTF-8 rather than left to the locale.
    OUTPUT_MD.write_text("\n".join(lines), encoding="utf-8")
    print(f"\nWrote {OUTPUT_MD.relative_to(REPO_ROOT)}", file=sys.stderr)
    return 0
|
|
|
|
|
|
# Script entry point: run the generator and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|