Add DRM downloads, scrapers, gallery index, and UI improvements
- DRM video download pipeline with pywidevine subprocess for Widevine key acquisition
- Scraper system: forum threads, Coomer/Kemono API, and MediaLink (Fapello) scrapers
- SQLite-backed media index for instant gallery loads with startup scan
- Duplicate detection and gallery filtering/sorting
- HLS video component, log viewer, and scrape management UI
- Dockerfile updated for Python/pywidevine, docker-compose volume for CDM

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
373
server/drm-download.js
Normal file
373
server/drm-download.js
Normal file
@@ -0,0 +1,373 @@
|
||||
import { mkdirSync, createWriteStream, existsSync, rmSync } from 'node:fs';
import { execSync, execFileSync, exec as execCb, execFile as execFileCb } from 'node:child_process';
import { promisify } from 'node:util';
import { dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import fetch from 'node-fetch';
import { getAuthConfig } from './db.js';

const execAsync = promisify(execCb);
const execFileAsync = promisify(execFileCb);
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));

// OnlyFans origin (license requests ultimately target this service).
const OF_BASE = 'https://onlyfans.com';
// Widevine device file (.wvd); NOTE(review): presumably mounted via the
// docker-compose CDM volume mentioned in the commit message — confirm.
const WVD_PATH = process.env.WVD_PATH || '/data/cdm/device.wvd';
// Python helper that performs the pywidevine license exchange (ships next to this module).
const HELPER_PATH = `${__dirname}/pywidevine_helper.py`;
|
||||
|
||||
/**
 * Report whether a Widevine CDM device file is available.
 * @returns {boolean} true when a .wvd file exists at WVD_PATH.
 */
export function hasCDM() {
  const deviceFilePresent = existsSync(WVD_PATH);
  return deviceFilePresent;
}
|
||||
|
||||
// ==================== MPD Parser ====================

/**
 * Parse a DASH MPD manifest into the pieces the download pipeline needs.
 *
 * @param {string} mpdText - raw MPD XML.
 * @param {string} baseUrl - URL prefix (ending in '/') used to resolve relative segment URLs.
 * @returns {{pssh: string|null, video: object|null, audio: object|null}}
 *   pssh is the base64 Widevine init data; video/audio are the segment-info
 *   objects produced by parseSegmentTemplate / parseSegmentList / parseSegmentBase
 *   for the highest-bandwidth Representation of each kind.
 */
function parseMpd(mpdText, baseUrl) {
  const result = { pssh: null, video: null, audio: null };

  // Extract Widevine PSSH (system ID edef8ba9-79d6-4ace-a3c8-27dcd51d21ed).
  // Must match the Widevine ContentProtection block specifically, not PlayReady.
  const cpRegex = /<ContentProtection[^>]*schemeIdUri="urn:uuid:edef8ba9[^"]*"[^>]*>([\s\S]*?)<\/ContentProtection>/gi;
  let cpMatch;
  while ((cpMatch = cpRegex.exec(mpdText)) !== null) {
    const psshInner = cpMatch[1].match(/cenc:pssh[^>]*>([^<]+)</i);
    if (psshInner) {
      result.pssh = psshInner[1].trim();
      break;
    }
  }
  // Fallback: if no Widevine-specific block was found, accept any cenc:pssh box.
  if (!result.pssh) {
    const psshMatch = mpdText.match(/cenc:pssh[^>]*>([^<]+)</i);
    if (psshMatch) result.pssh = psshMatch[1].trim();
  }

  // Walk every AdaptationSet; keep only video/audio ones.
  const asRegex = /<AdaptationSet([^>]*)>([\s\S]*?)<\/AdaptationSet>/gi;
  let match;
  while ((match = asRegex.exec(mpdText)) !== null) {
    const asAttrs = match[1];
    const asBody = match[2];

    const mimeMatch = asAttrs.match(/mimeType="([^"]+)"/);
    const mime = mimeMatch ? mimeMatch[1] : '';
    const isVideo = mime.includes('video');
    const isAudio = mime.includes('audio');
    if (!isVideo && !isAudio) continue;

    // Collect all Representations and pick the highest bandwidth.
    // (Fixed: the original matched the bandwidth attribute twice into an
    // unused duplicate `bwAttr` local.)
    const reps = [];
    const repRegex = /<Representation([^>]*)(?:\/>|>([\s\S]*?)<\/Representation>)/gi;
    let repMatch;
    while ((repMatch = repRegex.exec(asBody)) !== null) {
      const bwMatch = repMatch[1].match(/bandwidth="(\d+)"/);
      const idMatch = repMatch[1].match(/id="([^"]+)"/);
      reps.push({
        id: idMatch ? idMatch[1] : '1',
        bandwidth: bwMatch ? parseInt(bwMatch[1], 10) : 0,
        body: repMatch[2] || '',
      });
    }
    reps.sort((a, b) => b.bandwidth - a.bandwidth);
    const best = reps[0];
    if (!best) continue;

    // Try SegmentTemplate from the Representation first, then the AdaptationSet.
    let segInfo = parseSegmentTemplate(best.body, best.id, best.bandwidth, baseUrl);
    if (!segInfo) segInfo = parseSegmentTemplate(asBody, best.id, best.bandwidth, baseUrl);

    // Try SegmentList as fallback.
    if (!segInfo) segInfo = parseSegmentList(best.body || asBody, baseUrl);

    // Try SegmentBase (on-demand profile) as final fallback.
    if (!segInfo) segInfo = parseSegmentBase(best.body || asBody, baseUrl);

    if (segInfo) {
      if (isVideo) result.video = segInfo;
      else result.audio = segInfo;
    }
  }

  return result;
}
|
||||
|
||||
/**
 * Parse a <SegmentTemplate> element into an init URL plus the full list of
 * media segment URLs.
 *
 * @param {string} text - XML fragment to search (Representation or AdaptationSet body).
 * @param {string} repId - Representation id, substituted for $RepresentationID$.
 * @param {number} bandwidth - Representation bandwidth, substituted for $Bandwidth$.
 * @param {string} baseUrl - prefix for resolving relative URLs.
 * @returns {{initUrl: string, segmentUrls: string[]}|null} null when no usable template.
 */
function parseSegmentTemplate(text, repId, bandwidth, baseUrl) {
  // Matches both self-closing (<SegmentTemplate ... />) and paired forms.
  const tmplMatch = text.match(/<SegmentTemplate([^>]*)(?:\/>|>([\s\S]*?)<\/SegmentTemplate>)/i);
  if (!tmplMatch) return null;

  const attrs = tmplMatch[1];
  const body = tmplMatch[2] || '';

  const initMatch = attrs.match(/initialization="([^"]+)"/);
  const mediaMatch = attrs.match(/media="([^"]+)"/);
  const startNumMatch = attrs.match(/startNumber="(\d+)"/);

  // Both an initialization and a media template are required to build URLs.
  if (!initMatch || !mediaMatch) return null;

  const initTmpl = initMatch[1];
  const mediaTmpl = mediaMatch[1];
  const startNumber = startNumMatch ? parseInt(startNumMatch[1]) : 1;
  // Time-based addressing ($Time$) vs number-based ($Number$) templates.
  const usesTime = mediaTmpl.includes('$Time$');

  const initUrl = resolveUrl(
    replaceTemplateVars(initTmpl, repId, bandwidth),
    baseUrl,
  );

  const segmentUrls = [];
  const timelineMatch = body.match(/<SegmentTimeline>([\s\S]*?)<\/SegmentTimeline>/i);

  if (timelineMatch) {
    // Walk <S> entries: t (explicit start time) resets the clock, d is the
    // duration added per segment, r repeats the entry r extra times.
    let currentTime = 0;
    let segNum = startNumber;
    const sElements = [...timelineMatch[1].matchAll(/<S\s+([^/]*?)\/?\s*>/gi)];

    for (const s of sElements) {
      const tMatch = s[1].match(/t="(\d+)"/);
      const dMatch = s[1].match(/d="(\d+)"/);
      const rMatch = s[1].match(/r="(-?\d+)"/);

      if (tMatch) currentTime = parseInt(tMatch[1]);
      const duration = dMatch ? parseInt(dMatch[1]) : 0;
      let repeat = rMatch ? parseInt(rMatch[1]) : 0;
      if (repeat < 0) repeat = 9999; // r=-1 means repeat until end; bounded by 404 in download

      for (let i = 0; i <= repeat; i++) {
        let url;
        if (usesTime) {
          url = replaceTemplateVars(mediaTmpl, repId, bandwidth)
            .replace(/\$Time\$/g, String(currentTime));
        } else {
          // Handles both plain $Number$ and width-formatted $Number%0Nd$.
          url = replaceTemplateVars(mediaTmpl, repId, bandwidth)
            .replace(/\$Number\$/g, String(segNum))
            .replace(/\$Number%(\d+)d\$/g, (_, w) => String(segNum).padStart(parseInt(w), '0'));
        }
        segmentUrls.push(resolveUrl(url, baseUrl));
        currentTime += duration;
        segNum++;
      }
    }
  } else {
    // No timeline — use a large count, download will stop on 404
    const startNum = startNumber;
    for (let i = 0; i < 10000; i++) {
      const url = replaceTemplateVars(mediaTmpl, repId, bandwidth)
        .replace(/\$Number\$/g, String(startNum + i))
        .replace(/\$Number%(\d+)d\$/g, (_, w) => String(startNum + i).padStart(parseInt(w), '0'));
      segmentUrls.push(resolveUrl(url, baseUrl));
    }
  }

  return { initUrl, segmentUrls };
}
|
||||
|
||||
/**
 * Parse a <SegmentList> element: explicit <Initialization sourceURL> plus one
 * <SegmentURL media> per segment.
 * @returns {{initUrl: string, segmentUrls: string[]}|null} null when no
 *   Initialization element is present.
 */
function parseSegmentList(text, baseUrl) {
  const initMatch = text.match(/<Initialization\s+sourceURL="([^"]+)"/i);
  if (initMatch === null) return null;

  const segmentUrls = [...text.matchAll(/<SegmentURL\s+media="([^"]+)"/gi)]
    .map((m) => resolveUrl(m[1], baseUrl));

  return {
    initUrl: resolveUrl(initMatch[1], baseUrl),
    segmentUrls,
  };
}
|
||||
|
||||
/**
 * Parse an on-demand (SegmentBase) representation, where the whole track
 * lives in a single <BaseURL> file rather than discrete segments.
 * @returns {{onDemand: true, fileUrl: string}|null} null when no <BaseURL>.
 */
function parseSegmentBase(text, baseUrl) {
  const baseMatch = text.match(/<BaseURL>([^<]+)<\/BaseURL>/i);
  if (baseMatch === null) return null;

  // Single file, no segments: flag as on-demand so the download pipeline
  // fetches the whole file instead of init + segments.
  const fileUrl = resolveUrl(baseMatch[1].trim(), baseUrl);
  return { onDemand: true, fileUrl };
}
|
||||
|
||||
/**
 * Substitute the static DASH template placeholders ($RepresentationID$,
 * $Bandwidth$) in a URL template. $Number$/$Time$ are handled by the caller.
 */
function replaceTemplateVars(template, repId, bandwidth) {
  let resolved = template;
  resolved = resolved.replace(/\$RepresentationID\$/g, repId);
  resolved = resolved.replace(/\$Bandwidth\$/g, String(bandwidth));
  return resolved;
}
|
||||
|
||||
/**
 * Resolve a possibly-relative segment URL against the MPD's base URL.
 * Absolute URLs pass through; everything else is resolved with the WHATWG
 * URL parser, which (unlike the previous string concatenation) correctly
 * handles root-relative ("/x") and protocol-relative ("//host/x") references.
 */
function resolveUrl(url, baseUrl) {
  if (url.startsWith('http')) return url;
  try {
    return new URL(url, baseUrl).toString();
  } catch {
    // Malformed base URL — fall back to the naive join.
    return baseUrl + url;
  }
}
|
||||
|
||||
// ==================== Download Pipeline ====================

/**
 * GET a URL, attaching the CloudFront signed-cookie triplet when present.
 * @param {string} url
 * @param {{cp?: string, cs?: string, ck?: string}} cfCookies - CloudFront
 *   Policy / Signature / Key-Pair-Id values; any may be absent.
 * @returns {Promise<Response>} the raw fetch response (status not checked here).
 */
async function fetchWithCookies(url, cfCookies) {
  const cookiePairs = [
    ['CloudFront-Policy', cfCookies.cp],
    ['CloudFront-Signature', cfCookies.cs],
    ['CloudFront-Key-Pair-Id', cfCookies.ck],
  ];
  const cookieHeader = cookiePairs
    .filter(([, value]) => value)
    .map(([name, value]) => `${name}=${value}`)
    .join('; ');

  const headers = {};
  if (cookieHeader) headers['Cookie'] = cookieHeader;

  return fetch(url, { headers });
}
|
||||
|
||||
/**
 * Fetch a single-file (on-demand profile) track straight to disk.
 * Honors write-stream backpressure (waits for 'drain' when the buffer is
 * full) so large files don't pile up in memory, and destroys the stream on
 * failure so the file descriptor is not leaked.
 * @throws {Error} on any non-2xx response or stream error.
 */
async function downloadWholeFile(url, cfCookies, outputPath) {
  const res = await fetchWithCookies(url, cfCookies);
  if (!res.ok) throw new Error(`Download failed: ${res.status} ${url}`);

  const ws = createWriteStream(outputPath);
  try {
    for await (const chunk of res.body) {
      // write() returning false means the internal buffer is full — wait.
      if (!ws.write(chunk)) {
        await new Promise((resolve) => ws.once('drain', resolve));
      }
    }
    ws.end();
    await new Promise((resolve, reject) => {
      ws.on('finish', resolve);
      ws.on('error', reject);
    });
  } catch (err) {
    ws.destroy(); // don't leak the fd on a failed download
    throw err;
  }
  console.log(`[drm-download] Downloaded whole file → ${outputPath}`);
}
|
||||
|
||||
/**
 * Download an init segment plus all media segments, concatenated into one
 * output file. A 404/403 on a media segment is treated as end-of-stream
 * (this bounds templates with unknown segment counts); any other failure
 * throws. Honors write-stream backpressure and closes the stream on error.
 * @param {{initUrl: string, segmentUrls: string[]}} track
 */
async function downloadSegments(track, cfCookies, outputPath) {
  const ws = createWriteStream(outputPath);

  // Stream one response body into ws, pausing on backpressure.
  const writeBody = async (body) => {
    for await (const chunk of body) {
      if (!ws.write(chunk)) {
        await new Promise((resolve) => ws.once('drain', resolve));
      }
    }
  };

  try {
    // Init segment
    const initRes = await fetchWithCookies(track.initUrl, cfCookies);
    if (!initRes.ok) throw new Error(`Init segment failed: ${initRes.status}`);
    await writeBody(initRes.body);

    // Media segments
    let downloaded = 0;
    for (const segUrl of track.segmentUrls) {
      const segRes = await fetchWithCookies(segUrl, cfCookies);
      if (segRes.status === 404 || segRes.status === 403) break; // end of segments
      if (!segRes.ok) throw new Error(`Segment failed: ${segRes.status} ${segUrl}`);
      await writeBody(segRes.body);
      downloaded++;
    }

    ws.end();
    await new Promise((resolve, reject) => {
      ws.on('finish', resolve);
      ws.on('error', reject);
    });

    console.log(`[drm-download] Downloaded ${downloaded} segments → ${outputPath}`);
  } catch (err) {
    ws.destroy(); // don't leak the fd on a failed download
    throw err;
  }
}
|
||||
|
||||
/**
 * Full DRM download pipeline for a Widevine-protected DASH stream:
 *   1. fetch + parse the MPD (PSSH, best video/audio tracks)
 *   2. obtain the content key via the pywidevine helper — the license
 *      request is routed through our local /api/drm-license proxy
 *   3. download the encrypted track(s)
 *   4. decrypt each track with ffmpeg's -decryption_key
 *   5. mux into the final +faststart mp4
 *
 * Security: the PSSH and content key originate from remote data, so the
 * python/ffmpeg subprocesses are invoked with argv arrays (execFileAsync /
 * execFileSync) instead of shell command strings, eliminating shell
 * injection through those values.
 *
 * @returns {Promise<string>} absolute path of the final mp4.
 * @throws {Error} when no CDM/auth is available, or any pipeline step fails.
 *   Temp files are always cleaned up.
 */
export async function downloadDrmMedia({
  mpdUrl,
  cfCookies,
  mediaId,
  entityType,
  entityId,
  outputDir,
  outputFilename,
}) {
  if (!existsSync(WVD_PATH)) throw new Error('No CDM available — place a .wvd file at ' + WVD_PATH);

  const authConfig = getAuthConfig();
  if (!authConfig) throw new Error('No auth config');

  console.log(`[drm-download] Starting DRM download for media ${mediaId}`);

  // 1. Fetch & parse MPD
  const mpdRes = await fetchWithCookies(mpdUrl, cfCookies);
  if (!mpdRes.ok) throw new Error(`MPD fetch failed: ${mpdRes.status}`);
  const mpdText = await mpdRes.text();
  const mpdBaseUrl = mpdUrl.substring(0, mpdUrl.lastIndexOf('/') + 1);
  const mpd = parseMpd(mpdText, mpdBaseUrl);

  if (!mpd.pssh) {
    throw new Error('No Widevine PSSH found in MPD');
  }
  if (!mpd.video) {
    throw new Error('No video track found in MPD');
  }
  const describe = (t) => (t.onDemand ? 'on-demand' : `${t.segmentUrls.length} segs`);
  const videoDesc = describe(mpd.video);
  const audioDesc = mpd.audio ? describe(mpd.audio) : 'none';
  console.log(`[drm-download] MPD parsed: video=${videoDesc}, audio=${audioDesc}`);

  // 2. Get content key via pywidevine (routed through local proxy)
  const PORT = process.env.PORT || 3001;
  const proxyParams = new URLSearchParams({ mediaId });
  if (entityType) proxyParams.set('entityType', entityType);
  if (entityId) proxyParams.set('entityId', entityId);
  const proxyUrl = `http://localhost:${PORT}/api/drm-license?${proxyParams}`;

  console.log(`[drm-download] Getting content key via pywidevine (proxy → OF)`);
  let keyResult;
  try {
    // argv array: mpd.pssh comes from a remote manifest and must never be
    // interpolated into a shell command line.
    const { stdout } = await execFileAsync(
      'python3',
      [HELPER_PATH, WVD_PATH, mpd.pssh, proxyUrl],
      { timeout: 60000, maxBuffer: 1024 * 1024 },
    );
    keyResult = JSON.parse(stdout.trim());
  } catch (err) {
    const stderr = err.stderr?.toString() || '';
    const stdout = err.stdout?.toString() || '';
    throw new Error(`pywidevine failed: ${stderr || stdout || err.message}`);
  }

  if (keyResult.error) throw new Error(`License failed: ${keyResult.error}`);
  if (!keyResult.keys?.length) throw new Error('No content keys returned');

  const contentKey = keyResult.keys.find((k) => k.type === 'CONTENT') || keyResult.keys[0];
  console.log(`[drm-download] Got ${keyResult.keys.length} key(s), KID=${contentKey.kid}`);

  // 3. Download encrypted segments into a per-media temp dir
  mkdirSync(outputDir, { recursive: true });
  const tmpDir = `${outputDir}/.drm-tmp-${mediaId}`;
  mkdirSync(tmpDir, { recursive: true });

  // ffmpeg via argv array: paths/keys are passed verbatim, no quoting needed.
  const runFfmpeg = (args) =>
    execFileSync('ffmpeg', ['-y', '-loglevel', 'error', ...args], { stdio: 'pipe', timeout: 300000 });

  try {
    console.log('[drm-download] Downloading video...');
    if (mpd.video.onDemand) {
      await downloadWholeFile(mpd.video.fileUrl, cfCookies, `${tmpDir}/video_enc.mp4`);
    } else {
      await downloadSegments(mpd.video, cfCookies, `${tmpDir}/video_enc.mp4`);
    }

    // Only mark audio present when something was actually downloaded —
    // previously hasAudio was set even when the audio set had no on-demand
    // file and zero segments, making the decrypt step reference a missing file.
    let hasAudio = false;
    if (mpd.audio) {
      console.log('[drm-download] Downloading audio...');
      if (mpd.audio.onDemand) {
        await downloadWholeFile(mpd.audio.fileUrl, cfCookies, `${tmpDir}/audio_enc.mp4`);
        hasAudio = true;
      } else if (mpd.audio.segmentUrls?.length > 0) {
        await downloadSegments(mpd.audio, cfCookies, `${tmpDir}/audio_enc.mp4`);
        hasAudio = true;
      }
    }

    // 4. Decrypt with ffmpeg
    const keyHex = contentKey.key;
    console.log('[drm-download] Decrypting...');

    runFfmpeg(['-decryption_key', keyHex, '-i', `${tmpDir}/video_enc.mp4`, '-c', 'copy', `${tmpDir}/video.mp4`]);

    if (hasAudio) {
      runFfmpeg(['-decryption_key', keyHex, '-i', `${tmpDir}/audio_enc.mp4`, '-c', 'copy', `${tmpDir}/audio.mp4`]);
    }

    // 5. Mux into final file
    const outputPath = `${outputDir}/${outputFilename}`;
    if (hasAudio) {
      console.log('[drm-download] Muxing audio + video...');
      runFfmpeg([
        '-i', `${tmpDir}/video.mp4`,
        '-i', `${tmpDir}/audio.mp4`,
        '-c', 'copy', '-movflags', '+faststart',
        outputPath,
      ]);
    } else {
      runFfmpeg(['-i', `${tmpDir}/video.mp4`, '-c', 'copy', '-movflags', '+faststart', outputPath]);
    }

    console.log(`[drm-download] Complete: ${outputPath}`);
    return outputPath;
  } finally {
    // Cleanup temp files (best-effort)
    try { rmSync(tmpDir, { recursive: true, force: true }); } catch {}
  }
}
|
||||
Reference in New Issue
Block a user