Add DRM downloads, scrapers, gallery index, and UI improvements
- DRM video download pipeline with pywidevine subprocess for Widevine key acquisition
- Scraper system: forum threads, Coomer/Kemono API, and MediaLink (Fapello) scrapers
- SQLite-backed media index for instant gallery loads with startup scan
- Duplicate detection and gallery filtering/sorting
- HLS video component, log viewer, and scrape management UI
- Dockerfile updated for Python/pywidevine, docker-compose volume for CDM

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
201
server/scrapers/coomer.js
Normal file
201
server/scrapers/coomer.js
Normal file
@@ -0,0 +1,201 @@
|
||||
import { existsSync, mkdirSync, writeFileSync } from 'fs';
|
||||
import { basename, join, extname } from 'path';
|
||||
import { upsertMediaFile } from '../db.js';
|
||||
|
||||
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
||||
|
||||
/**
 * Split a coomer/kemono profile URL into its API components.
 * Example: https://coomer.su/onlyfans/user/someone
 *   -> { base: 'https://coomer.su', service: 'onlyfans', userId: 'someone' }
 * @throws {Error} when the path does not look like /SERVICE/user/USER_ID
 */
export function parseUserUrl(url) {
  const { protocol, hostname, pathname } = new URL(url);
  const match = /^\/([^/]+)\/user\/([^/?#]+)/.exec(pathname);
  if (!match) throw new Error(`Can't parse URL. Expected: https://coomer.su/SERVICE/user/USER_ID`);
  const [, service, userId] = match;
  return { base: `${protocol}//${hostname}`, service, userId };
}
|
||||
|
||||
/**
 * GET a coomer/kemono API endpoint and parse it as JSON, with retries.
 *
 * Retry policy per attempt:
 *  - 2xx  -> parsed JSON body
 *  - 404  -> [] (treated as "no posts", not an error)
 *  - 429  -> back off 5s, 10s, 15s, ... then retry
 *  - 5xx  -> wait 2s, retry
 *  - other status -> log and return null immediately
 *  - network/timeout error -> wait 2s and retry; the last attempt rethrows
 * Returns null when all retries are exhausted without a usable response.
 */
async function fetchApi(apiUrl, logFn, retries = 3) {
  for (let attempt = 1; attempt <= retries; attempt++) {
    try {
      const resp = await fetch(apiUrl, {
        headers: { 'User-Agent': UA, 'Accept': 'application/json' },
        signal: AbortSignal.timeout(15000),
      });

      if (resp.ok) return await resp.json();
      if (resp.status === 404) return [];

      if (resp.status === 429) {
        const wait = 5 * attempt;
        logFn(`Rate limited, waiting ${wait}s...`);
        await sleep(wait * 1000);
      } else if (resp.status >= 500) {
        await sleep(2000);
      } else {
        logFn(`API error ${resp.status}: ${apiUrl}`);
        return null;
      }
    } catch (err) {
      if (attempt === retries) throw err;
      await sleep(2000);
    }
  }
  return null;
}
|
||||
|
||||
/** Resolve after `ms` milliseconds; used to pace retries and backoff. */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||
|
||||
/**
 * Flatten API post objects into a deduplicated list of downloadable files.
 *
 * Each post may carry a primary `file` plus an `attachments` array; entries
 * without a `path` are ignored. Deduplication is by full CDN URL, so the
 * same path appearing in several posts is only listed once.
 *
 * @param {Array<object>} posts - posts as returned by the kemono/coomer API
 * @param {string} cdnBase - e.g. "https://n1.coomer.su/data"
 * @returns {Array<{url: string, name: string}>}
 */
export function collectFiles(posts, cdnBase) {
  const seenUrls = new Set();
  const results = [];

  for (const post of posts) {
    const candidates = [];
    if (post.file && post.file.path) candidates.push(post.file);
    for (const attachment of post.attachments || []) {
      if (attachment.path) candidates.push(attachment);
    }

    for (const entry of candidates) {
      const url = `${cdnBase}${entry.path}`;
      if (seenUrls.has(url)) continue;
      seenUrls.add(url);
      // Prefer the API-supplied display name; fall back to the path's basename.
      results.push({ url, name: entry.name || basename(entry.path) });
    }
  }

  return results;
}
|
||||
|
||||
/**
 * Download a single file into `outputDir`, skipping names already on disk.
 *
 * @param {string} url - absolute CDN URL
 * @param {string} outputDir - destination directory (must exist)
 * @param {string} name - preferred filename
 * @param {(msg: string) => void} logFn
 * @returns {Promise<{skipped?: boolean, error?: boolean, filename?: string, sizeKb?: string}>}
 *   exactly one of: {skipped}, {error}, or {filename, sizeKb} on success.
 */
async function downloadFile(url, outputDir, name, logFn) {
  let filepath = join(outputDir, name);
  if (existsSync(filepath)) {
    // File already exists, skip
    return { skipped: true };
  }

  try {
    const resp = await fetch(url, {
      headers: { 'User-Agent': UA },
      signal: AbortSignal.timeout(60000),
    });
    if (!resp.ok) {
      logFn(`FAILED (${resp.status}): ${name}`);
      return { error: true };
    }

    const buf = Buffer.from(await resp.arrayBuffer());

    // Handle filename collision (a concurrent worker may have written the
    // same name while we were fetching): try base_1.ext, base_2.ext, ...
    if (existsSync(filepath)) {
      const ext = extname(name);
      // BUGFIX: with no extension, name.slice(0, -0) === '' — keep full name.
      const base = ext ? name.slice(0, -ext.length) : name;
      let i = 1;
      while (existsSync(filepath)) {
        filepath = join(outputDir, `${base}_${i}${ext}`);
        i++;
      }
    }

    writeFileSync(filepath, buf);

    // Register in the media index so the gallery sees the new file right
    // away; an index failure must never fail an otherwise good download.
    const savedName = basename(filepath);
    const folderName = basename(outputDir);
    const ext = extname(savedName).toLowerCase();
    const fileType = ['.mp4', '.mov', '.avi', '.webm', '.mkv', '.m4v'].includes(ext) ? 'video' : 'image';
    try { upsertMediaFile(folderName, savedName, fileType, buf.length, Date.now(), null); } catch { /* ignore */ }

    const sizeKb = (buf.length / 1024).toFixed(1);
    return { filename: savedName, sizeKb };
  } catch (err) {
    logFn(`FAILED: ${name} - ${err.message}`);
    return { error: true };
  }
}
|
||||
|
||||
/**
 * Page through the coomer/kemono posts API and collect all file references.
 *
 * @param {string} base - site base URL, e.g. "https://coomer.su"
 * @param {string} service - e.g. "onlyfans"
 * @param {string} userId
 * @param {number} maxPages - hard cap on pages fetched
 * @param {(msg: string) => void} logFn
 * @param {() => boolean} checkCancelled - polled before each page
 * @returns {Promise<Array<{url: string, name: string}>>}
 */
export async function fetchAllPosts(base, service, userId, maxPages, logFn, checkCancelled) {
  const allFiles = [];

  // The CDN base depends only on `base` — compute it once, not per page.
  const parsed = new URL(base);
  const cdnBase = `${parsed.protocol}//n1.${parsed.hostname}/data`;

  for (let page = 0; page < maxPages; page++) {
    if (checkCancelled()) break;

    const offset = page * 50; // the API pages in fixed steps of 50 posts
    const apiUrl = `${base}/api/v1/${service}/user/${userId}/posts?o=${offset}`;

    let posts;
    try {
      posts = await fetchApi(apiUrl, logFn);
    } catch (err) {
      logFn(`API failed: ${err.message}`);
      break;
    }

    if (!posts || posts.length === 0) break;

    allFiles.push(...collectFiles(posts, cdnBase));

    logFn(`Page ${page + 1}: ${posts.length} posts (${allFiles.length} files total)`);

    // A short page means this was the last one.
    if (posts.length < 50) break;
  }

  return allFiles;
}
|
||||
|
||||
/**
 * Download `files` into `outputDir` using a fixed-size pool of async workers.
 *
 * @param {Array<{url: string, name: string}>} files
 * @param {string} outputDir - created if missing
 * @param {number} concurrency - number of parallel workers
 * @param {(msg: string) => void} logFn
 * @param {(done: number, errors: number, total: number) => void} progressFn
 * @param {() => boolean} checkCancelled - polled before each download
 * @returns {Promise<{completed: number, errors: number, skipped: number, total: number}>}
 */
export async function downloadFiles(files, outputDir, concurrency, logFn, progressFn, checkCancelled) {
  mkdirSync(outputDir, { recursive: true });

  // Pre-filter files already on disk so worker slots aren't wasted.
  const toDownload = [];
  let skipped = 0;
  for (const f of files) {
    if (existsSync(join(outputDir, f.name))) {
      skipped++;
    } else {
      toDownload.push(f);
    }
  }

  if (skipped > 0) logFn(`Skipping ${skipped} already downloaded files`);
  logFn(`Downloading ${toDownload.length} files with ${concurrency} workers...`);

  let completed = 0;
  let errors = 0;
  let index = 0;

  // Each worker claims the next index. `index++` is race-free here because
  // workers only interleave at await points (single-threaded event loop).
  async function processNext() {
    while (index < toDownload.length) {
      if (checkCancelled()) return;

      const current = index++;
      const file = toDownload[current];

      const result = await downloadFile(file.url, outputDir, file.name, logFn);
      if (result.error) {
        errors++;
      } else if (result.skipped) {
        // BUGFIX: a file created after the pre-filter ran was previously
        // counted nowhere, so progress never reached the total.
        skipped++;
      } else {
        completed++;
        logFn(`[${completed}/${toDownload.length}] ${result.filename} (${result.sizeKb} KB)`);
      }
      progressFn(completed + skipped, errors, files.length);
    }
  }

  const workers = [];
  for (let i = 0; i < Math.min(concurrency, toDownload.length); i++) {
    workers.push(processNext());
  }
  await Promise.all(workers);

  return { completed, errors, skipped, total: files.length };
}
|
||||
230
server/scrapers/forum.js
Normal file
230
server/scrapers/forum.js
Normal file
@@ -0,0 +1,230 @@
|
||||
import * as cheerio from 'cheerio';
import { createWriteStream, existsSync, mkdirSync, statSync, writeFileSync } from 'fs';
import { basename, join, extname } from 'path';
import { pipeline } from 'stream/promises';
import { upsertMediaFile } from '../db.js';
|
||||
|
||||
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
||||
|
||||
// Extensions treated as downloadable images.
const IMAGE_EXTS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff']);
// URL substrings that indicate forum chrome (avatars, smilies, ...) rather than content.
const SKIP_PATTERNS = ['avatar', 'smilie', 'emoji', 'icon', 'logo', 'button', 'sprite', 'badge', 'rank', 'star'];

/** True when the URL parses and its path ends in a known image extension. */
function isImageUrl(url) {
  let pathname;
  try {
    pathname = new URL(url).pathname.toLowerCase();
  } catch {
    return false;
  }
  for (const ext of IMAGE_EXTS) {
    if (pathname.endsWith(ext)) return true;
  }
  return false;
}
|
||||
|
||||
/**
 * Build the URL for page `pageNum` of a thread, stripping any #fragment.
 *
 * Replaces an existing `page-N` segment when present. BUGFIX: a URL without
 * one (the canonical first-page URL) was previously returned unchanged for
 * every pageNum, so only page 1 ever got scraped; now `page-N` is appended
 * (XenForo-style) for pageNum > 1.
 */
export function getPageUrl(baseUrl, pageNum) {
  const url = baseUrl.split('#')[0];
  if (/page-\d+/.test(url)) {
    return url.replace(/page-\d+/, `page-${pageNum}`);
  }
  if (pageNum <= 1) return url;
  return `${url.replace(/\/+$/, '')}/page-${pageNum}`;
}
|
||||
|
||||
/**
 * Fetch the first page of a thread and infer the total page count from its
 * pagination links. Returns the count when more than one page is detected,
 * otherwise null (also null on fetch failure).
 */
export async function detectMaxPage(baseUrl, logFn) {
  try {
    const resp = await fetch(baseUrl, { headers: { 'User-Agent': UA }, signal: AbortSignal.timeout(15000) });
    if (!resp.ok) return null;
    const $ = cheerio.load(await resp.text());

    let maxPage = 1;

    // XenForo-style pagers: hrefs containing "page-N".
    $('a.pageNav-page, .pageNav a[href*="page-"], .pagination a[href*="page-"]').each((_, el) => {
      const match = /page-(\d+)/.exec($(el).attr('href') || '');
      if (match) {
        const n = parseInt(match[1], 10);
        if (n > maxPage) maxPage = n;
      }
    });

    // Fallback heuristic: any link whose text is purely numeric, capped at
    // 10000 to filter out post counts and similar noise.
    $('a').each((_, el) => {
      const text = $(el).text().trim();
      if (!/^\d+$/.test(text)) return;
      const n = parseInt(text, 10);
      if (n > maxPage && n < 10000) maxPage = n;
    });

    if (maxPage > 1) {
      logFn(`Detected ${maxPage} pages`);
      return maxPage;
    }
    return null;
  } catch (err) {
    logFn(`Page detection failed: ${err.message}`);
    return null;
  }
}
|
||||
|
||||
/**
 * Given a thumbnail URL, produce candidate URLs that may point at the
 * full-size original, based on common image-host naming conventions
 * (".th." infix, "_thumb" suffix, /thumbs/ directory, "thumb_" prefix,
 * rendition-selecting query strings). Order is the order candidates are tried.
 */
function tryFullSizeUrl(thumbUrl) {
  const candidates = [];

  if (thumbUrl.includes('.th.')) {
    candidates.push(thumbUrl.replace('.th.', '.'));
  }
  if (/_thumb\./i.test(thumbUrl)) {
    candidates.push(thumbUrl.replace(/_thumb\./i, '.'));
  }
  if (thumbUrl.includes('/thumbs/')) {
    candidates.push(
      thumbUrl.replace('/thumbs/', '/images/'),
      thumbUrl.replace('/thumbs/', '/full/'),
    );
  }

  try {
    const parsed = new URL(thumbUrl);
    const file = basename(parsed.pathname);
    if (file.startsWith('thumb_')) {
      candidates.push(thumbUrl.replace(`/${file}`, `/${file.slice(6)}`));
    }
    // A query string often selects the thumbnail rendition; try without it.
    if (parsed.search) candidates.push(thumbUrl.split('?')[0]);
  } catch {
    // Not an absolute URL — keep whatever candidates we already collected.
  }

  return candidates;
}
|
||||
|
||||
/**
 * Download one candidate image URL into `outputDir`.
 *
 * Skips URLs already attempted (tracked in `downloadedSet`, which this
 * function mutates), non-image URLs, and forum chrome (SKIP_PATTERNS).
 * Bodies under 1000 bytes are treated as placeholders and discarded — the
 * URL is removed from `downloadedSet` so a later candidate can retry it.
 *
 * @returns {Promise<boolean>} true when a file was actually saved
 */
async function downloadImage(url, outputDir, downloadedSet, logFn) {
  if (downloadedSet.has(url)) return false;
  if (!isImageUrl(url)) return false;
  const lower = url.toLowerCase();
  if (SKIP_PATTERNS.some(p => lower.includes(p))) return false;

  downloadedSet.add(url);

  let filename;
  try {
    filename = basename(new URL(url).pathname);
  } catch { return false; }
  if (!filename) return false;

  // Normalize ".th." thumbnail infixes out of the saved name.
  filename = filename.replace('.th.', '.');

  // Resolve filename collisions: name_1.ext, name_2.ext, ...
  let filepath = join(outputDir, filename);
  if (existsSync(filepath)) {
    const ext = extname(filename);
    // BUGFIX: with no extension, filename.slice(0, -0) === '' — keep full name.
    const name = ext ? filename.slice(0, -ext.length) : filename;
    let i = 1;
    while (existsSync(filepath)) {
      filepath = join(outputDir, `${name}_${i}${ext}`);
      i++;
    }
  }

  try {
    const resp = await fetch(url, {
      headers: { 'User-Agent': UA },
      signal: AbortSignal.timeout(30000),
    });
    if (!resp.ok) {
      logFn(`FAILED (${resp.status}): ${url}`);
      return false;
    }

    // Read full body to check size
    const buf = Buffer.from(await resp.arrayBuffer());
    if (buf.length < 1000) {
      downloadedSet.delete(url);
      return false;
    }

    // writeFileSync is now imported statically at the top of the file
    // (previously fetched via a redundant dynamic `await import('fs')`).
    writeFileSync(filepath, buf);

    // Index the saved file for the gallery; index failures are non-fatal.
    const savedName = basename(filepath);
    const folderName = basename(outputDir);
    try { upsertMediaFile(folderName, savedName, 'image', buf.length, Date.now(), null); } catch { /* ignore */ }

    const sizeKb = (buf.length / 1024).toFixed(1);
    logFn(`Downloaded: ${savedName} (${sizeKb} KB)`);
    return true;
  } catch (err) {
    logFn(`FAILED: ${basename(filepath)} - ${err.message}`);
    return false;
  }
}
|
||||
|
||||
/**
 * Scrape one forum page: fetch its HTML, harvest candidate image URLs from
 * post content, and download each through downloadImage (which dedupes via
 * the shared `downloadedSet` and filters non-images / forum chrome).
 *
 * @param {string} pageUrl - absolute page URL
 * @param {string} outputDir - destination directory (assumed to exist — TODO confirm caller creates it)
 * @param {Set<string>} downloadedSet - mutated; URLs attempted across pages
 * @param {(msg: string) => void} logFn
 * @returns {Promise<number>} count of images actually saved from this page
 */
export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn) {
  logFn(`Fetching page: ${pageUrl}`);

  let html;
  try {
    const resp = await fetch(pageUrl, {
      headers: { 'User-Agent': UA },
      signal: AbortSignal.timeout(15000),
    });
    if (!resp.ok) {
      logFn(`Failed to fetch page (${resp.status})`);
      return 0;
    }
    html = await resp.text();
  } catch (err) {
    logFn(`Failed to fetch page: ${err.message}`);
    return 0;
  }

  const $ = cheerio.load(html);

  // Try known content selectors, fall back to whole page
  // (selectors cover XenForo, vBulletin, phpBB and generic blog markup).
  const selectors = '.message-body, .post-body, .post_body, .postcontent, .messageContent, .bbWrapper, article, .entry-content, .post_message, .post-content, #posts, .threadBody';
  let contentAreas = $(selectors).toArray();
  if (contentAreas.length === 0) {
    contentAreas = [$.root().get(0)];
  }

  // Candidate URLs in discovery order; duplicates are fine — downloadImage
  // dedupes against downloadedSet.
  const imageUrls = [];

  for (const area of contentAreas) {
    const $area = $(area);

    // Pass 1: <img> tags
    $area.find('img').each((_, el) => {
      const $img = $(el);
      const src = $img.attr('src') || $img.attr('data-src') || $img.attr('data-url') || '';
      if (!src) return;

      // Resolve relative src against the page URL; skip unparsable ones.
      let absSrc;
      try { absSrc = new URL(src, pageUrl).href; } catch { return; }

      // Check parent <a> for direct image link — a thumbnail wrapped in a
      // link to the full image; prefer the link target and stop here.
      const $parentA = $img.closest('a');
      if ($parentA.length && $parentA.attr('href')) {
        try {
          const aHref = new URL($parentA.attr('href'), pageUrl).href;
          if (isImageUrl(aHref)) {
            imageUrls.push(aHref);
            return;
          }
        } catch {}
      }

      // Try to derive full-size from thumbnail URL
      // NOTE(review): when candidates exist, absSrc itself is NOT queued,
      // so if every candidate 404s the thumbnail is not downloaded either —
      // presumably intentional (avoid saving thumbs); confirm.
      const fullCandidates = tryFullSizeUrl(absSrc);
      if (fullCandidates.length > 0) {
        imageUrls.push(...fullCandidates);
      } else {
        imageUrls.push(absSrc);
      }

      // Also check data attributes
      // (lazy-load / lightbox attributes that often hold the full-size URL).
      for (const attr of ['data-src', 'data-url', 'data-orig', 'data-original', 'data-full-url', 'data-zoom-src']) {
        const val = $img.attr(attr);
        if (val && val !== src) {
          try { imageUrls.push(new URL(val, pageUrl).href); } catch {}
        }
      }
    });

    // Pass 2: <a href> pointing directly to images (no child <img>)
    $area.find('a[href]').each((_, el) => {
      const $a = $(el);
      if ($a.find('img').length) return;
      try {
        const href = new URL($a.attr('href'), pageUrl).href;
        if (isImageUrl(href)) imageUrls.push(href);
      } catch {}
    });
  }

  logFn(`Found ${imageUrls.length} candidate URLs`);

  // Sequential downloads: candidate order matters (full-size guesses are
  // tried before fallbacks, and downloadedSet must be updated in order).
  let count = 0;
  for (const imgUrl of imageUrls) {
    if (await downloadImage(imgUrl, outputDir, downloadedSet, logFn)) {
      count++;
    }
  }

  logFn(`${count} images from this page`);
  return count;
}
|
||||
187
server/scrapers/medialink.js
Normal file
187
server/scrapers/medialink.js
Normal file
@@ -0,0 +1,187 @@
|
||||
import { existsSync, writeFileSync, mkdirSync } from 'fs';
|
||||
import { basename, join, extname } from 'path';
|
||||
import { upsertMediaFile } from '../db.js';
|
||||
|
||||
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
||||
|
||||
const VIDEO_EXTS = new Set(['.mp4', '.mov', '.avi', '.webm', '.mkv', '.m4v']);
|
||||
|
||||
/**
 * Extract the site base and numeric model id from a fapello-style URL.
 * Accepts both /model/{id} and /media/{id} paths.
 * @throws {Error} when no numeric id segment is found
 */
export function parseMediaUrl(url) {
  const { protocol, hostname, pathname } = new URL(url);
  // Support /model/{id} or /media/{id}
  const match = /\/(?:model|media)\/(\d+)/.exec(pathname);
  if (!match) throw new Error(`Can't parse URL. Expected: https://fapello.to/model/12345`);
  return { base: `${protocol}//${hostname}`, userId: match[1] };
}
|
||||
|
||||
// Fetch JSON from the API endpoint
|
||||
// API: GET /api/media/{userId}/{page}/{order}
|
||||
// Requires X-Requested-With and Referer headers to avoid 403
|
||||
/**
 * GET /api/media/{userId}/{page}/{order} and parse the JSON response.
 * Returns the parsed array, or null on 404 / error (404 is silent — it just
 * marks the end of pagination; other failures are logged).
 */
async function fetchApiPage(base, userId, page, order, logFn) {
  const apiUrl = `${base}/api/media/${userId}/${page}/${order}`;
  try {
    const resp = await fetch(apiUrl, {
      headers: {
        'User-Agent': UA,
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        // Without X-Requested-With and Referer the endpoint responds 403.
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': `${base}/model/${userId}`,
      },
      signal: AbortSignal.timeout(15000),
    });
    if (resp.ok) return await resp.json();
    if (resp.status !== 404) logFn(`API error (${resp.status}): ${apiUrl}`);
    return null;
  } catch (err) {
    logFn(`API fetch error: ${err.message}`);
    return null;
  }
}
|
||||
|
||||
// Collect all media items by paginating through the API
|
||||
/**
 * Collect all media items for a model by paginating the API until an empty
 * page, a page of only duplicates, cancellation, or the `maxPages` cap.
 *
 * @returns {Promise<Array<{id: *, url: string, type: 'video'|'image'}>>}
 */
export async function fetchAllMedia(base, userId, maxPages, delay, logFn, checkCancelled) {
  const collected = [];
  const seenIds = new Set();

  for (let page = 1; page <= maxPages; page++) {
    if (checkCancelled()) break;

    logFn(`Fetching page ${page}...`);
    const data = await fetchApiPage(base, userId, page, 1, logFn);

    if (!data || data.length === 0) {
      logFn(`Page ${page}: no more items — done`);
      break;
    }

    let newCount = 0;
    for (const item of data) {
      if (seenIds.has(item.id)) continue;
      seenIds.add(item.id);
      newCount++;

      // type "2" = video (newUrl is mp4), type "1" = image (newUrl is full-size jpg)
      if (!item.newUrl) continue;
      collected.push({
        id: item.id,
        url: item.newUrl,
        type: item.type === '2' || item.type === 2 ? 'video' : 'image',
      });
    }

    if (newCount === 0) {
      logFn(`Page ${page}: all duplicates — stopping`);
      break;
    }

    logFn(`Page ${page}: ${data.length} items (${newCount} new, ${collected.length} total)`);

    // Throttle between pages to stay polite to the API.
    if (page < maxPages && !checkCancelled()) {
      await new Promise(r => setTimeout(r, delay));
    }
  }

  return collected;
}
|
||||
|
||||
// Download all collected media items with concurrency
|
||||
/**
 * Download collected media items into `outputDir` using a pool of `workers`
 * concurrent fetchers.
 *
 * @param {Array<{id: *, url: string, type: 'video'|'image'}>} items
 * @param {string} outputDir - created if missing
 * @param {number} workers - number of parallel workers
 * @param {(msg: string) => void} logFn
 * @param {(done: number, errors: number, total: number) => void} progressFn
 * @param {() => boolean} checkCancelled - polled before each download
 * @returns {Promise<{completed: number, errors: number, skipped: number, total: number}>}
 */
export async function downloadMedia(items, outputDir, workers, logFn, progressFn, checkCancelled) {
  mkdirSync(outputDir, { recursive: true });

  let completed = 0;
  let errors = 0;
  let skipped = 0;
  let index = 0;

  // Each worker claims the next index; `index++` is race-free because
  // workers only interleave at await points.
  async function processNext() {
    while (index < items.length) {
      if (checkCancelled()) return;

      const current = index++;
      const item = items[current];

      // Derive a filename from the URL path, falling back to "{id}.{ext}".
      let filename;
      try {
        filename = basename(new URL(item.url).pathname);
        if (!filename || filename === '/') {
          filename = `${item.id}.${item.type === 'video' ? 'mp4' : 'jpg'}`;
        }
      } catch {
        filename = `${item.id}.${item.type === 'video' ? 'mp4' : 'jpg'}`;
      }

      let filepath = join(outputDir, filename);
      if (existsSync(filepath)) {
        skipped++;
        progressFn(completed + skipped, errors, items.length);
        continue;
      }

      try {
        const resp = await fetch(item.url, {
          headers: {
            'User-Agent': UA,
            'Referer': 'https://fapello.to/',
          },
          signal: AbortSignal.timeout(60000),
        });
        if (!resp.ok) {
          // BUGFIX: previously logged the literal text "$(unknown)".
          logFn(`FAILED (${resp.status}): ${filename}`);
          errors++;
          progressFn(completed + skipped, errors, items.length);
          continue;
        }

        const buf = Buffer.from(await resp.arrayBuffer());
        // Tiny responses are placeholder/error bodies, not real media.
        if (buf.length < 500) {
          skipped++;
          progressFn(completed + skipped, errors, items.length);
          continue;
        }

        // Handle filename collision (a concurrent worker may have saved the
        // same name while we were fetching): name_1.ext, name_2.ext, ...
        if (existsSync(filepath)) {
          const ext = extname(filename);
          // BUGFIX: with no extension, slice(0, -0) === '' — keep full name.
          const name = ext ? filename.slice(0, -ext.length) : filename;
          let i = 1;
          while (existsSync(filepath)) {
            filepath = join(outputDir, `${name}_${i}${ext}`);
            i++;
          }
        }

        writeFileSync(filepath, buf);

        // Register in the media index; index failures are non-fatal.
        const savedName = basename(filepath);
        const folderName = basename(outputDir);
        const fileExt = extname(savedName).toLowerCase();
        const fileType = VIDEO_EXTS.has(fileExt) ? 'video' : 'image';
        try { upsertMediaFile(folderName, savedName, fileType, buf.length, Date.now(), null); } catch {}

        completed++;
        const sizeKb = (buf.length / 1024).toFixed(1);
        logFn(`[${completed}/${items.length}] ${savedName} (${sizeKb} KB)`);
        progressFn(completed + skipped, errors, items.length);
      } catch (err) {
        // BUGFIX: previously logged the literal text "$(unknown)".
        logFn(`FAILED: ${filename} - ${err.message}`);
        errors++;
        progressFn(completed + skipped, errors, items.length);
      }
    }
  }

  const workerPromises = [];
  for (let i = 0; i < Math.min(workers, items.length); i++) {
    workerPromises.push(processNext());
  }
  await Promise.all(workerPromises);

  return { completed, errors, skipped, total: items.length };
}
|
||||
Reference in New Issue
Block a user